To use A2C with Keras, a new loss function has to be defined. Normally fit() takes a single input and a single output; to get the advantage into the loss, it has to be passed along with the input or the output. In TensorFlow 1.x this worked as a well-known trick, but it no longer does in 2.x. As shown further below, the actor loss stays pinned at 0.
Input parameters can be passed as a list of two, in the form input = [input, advantage]. However, if the symbolic tensor is handed to the loss as-is, its value cannot be evaluated and an error is raised.
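For reference, this is roughly what that pattern looks like. It is only a minimal sketch with illustrative names (state_in, advantage_in), not the full agent code below: the advantage is fed in as a second, otherwise unused input, and the custom loss closes over it.

import numpy as np
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

state_size, action_size = 40, 7

state_in = Input(shape=(state_size,))
advantage_in = Input(shape=(1,))   # extra "fake" input, only used inside the loss
hidden = Dense(state_size * 2, activation='relu')(state_in)
policy = Dense(action_size, activation='softmax')(hidden)
model = Model(inputs=[state_in, advantage_in], outputs=policy)

def policy_gradient_loss(y_true, y_pred):
    out = K.clip(y_pred, 1e-8, 1 - 1e-8)
    log_lik = y_true * K.log(out)
    return K.sum(-log_lik * advantage_in)   # closes over the symbolic advantage tensor

model.compile(optimizer='adam', loss=policy_gradient_loss)
# fit() would then be called with both arrays:
# model.fit(x=[states, advantages], y=one_hot_actions, ...)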
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py", line 60, in quick_execute
    inputs, attrs, num_outputs)
TypeError: An op outside of the function building code is being passed a "Graph" tensor.
It is possible to have Graph tensors leak out of the function building context by
including a tf.init_scope in your function building code.
For example, the following function will fail:
  @tf.function
  def has_init_scope():
    my_constant = tf.constant(1.)
    with tf.init_scope():
      added = my_constant * 2
The graph tensor has name: input_2:0

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "agent_a2c.py", line 153, in <module>
    agent.train_model(state, action, reward, next_state, done )
  File "agent_a2c.py", line 110, in train_model
    self.actor.fit(x=[state, advantageTmp], y=actions, epochs = 1, verbose =0)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 108, in _method_wrapper
    return method(self, *args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 1098, in fit
    tmp_logs = train_function(iterator)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py", line 780, in __call__
    result = self._call(*args, **kwds)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py", line 840, in _call
    return self._stateless_fn(*args, **kwds)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py", line 2829, in __call__
    return graph_function._filtered_call(args, kwargs)  # pylint: disable=protected-access
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py", line 1848, in _filtered_call
    cancellation_manager=cancellation_manager)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py", line 1924, in _call_flat
    ctx, args, cancellation_manager=cancellation_manager))
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py", line 550, in call
    ctx=ctx)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py", line 74, in quick_execute
    "tensors, but found {}".format(keras_symbolic_tensors))
tensorflow.python.eager.core._SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'input_2:0' shape=(None, 1) dtype=float32>]
In cases like this, turning on eager execution makes the error go away. The option was added in TensorFlow 2.x.
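The switch itself is a single call. Note that the experimental name used in the code below was later renamed, so depending on the 2.x version one of the two spellings applies:

import tensorflow as tf

# Run tf.functions eagerly so tensors can be inspected inside a custom loss.
tf.config.experimental_run_functions_eagerly(True)   # older 2.x releases
# tf.config.run_functions_eagerly(True)              # newer 2.x releases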
from env_reinforce import CarrierStorage
from env_reinforce import Action
import random
from collections import defaultdict
import numpy as np
from termcolor import colored
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
import copy
from keras.models import model_from_json
from collections import deque
from keras import backend as K
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# evaluate tensors eagerly so the custom loss can be inspected.
import tensorflow as tf
tf.config.experimental_run_functions_eagerly(True)

class A2CAgent(object):
    def __init__(self):
        # 40 when the simple state representation is used.
        self.state_size = 40
        # a single float value is used
        self.action_size = 7
        self.value_size = 1

        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005

        self.actor = self.build_actor()
        self.critic = self.build_critic()

    # actor: takes the state and outputs a probability for each action
    def build_actor(self):
        input = Input(shape = (self.state_size,))
        delta = Input(shape = [1])
        print("delta is ", delta)
        dense1 = Dense(self.state_size*2, activation='relu', kernel_initializer='he_uniform')(input)
        action = Dense(self.action_size, activation = 'softmax', kernel_initializer='he_uniform')(dense1)
        actor = Model(inputs = [input, delta], outputs = action)

        def actor_loss(y_true, y_prediction):
            out = K.clip(y_prediction, 1e-8, 1-1e-8)
            log_likily = y_true*K.log(out)
            return K.sum(-log_likily * delta)

        actor.summary()
        # the loss function is the problem..
        actor.compile(loss = actor_loss, optimizer = Adam(lr=self.actor_lr))
        return actor

    # critic: takes the state and outputs the value of the state
    def build_critic(self):
        critic = Sequential()
        critic.add(Dense(self.state_size*2, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        #critic.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear', kernel_initializer='he_uniform'))
        critic.compile(loss = 'mse', optimizer = Adam(lr=self.critic_lr))
        print("critic summary")
        critic.summary()
        return critic

    # update the policy network and the value network at every timestep
    def train_model(self, state, action, reward, next_state, done):
        value = self.critic.predict(state)[0][0]
        next_value = self.critic.predict(next_state)[0][0]

        # one-hot encode the action.
        actions = np.zeros([1, self.action_size])
        actions[np.arange(1), action] = 1.0
        # reshape
        actions = np.reshape(actions, [1, self.action_size])

        # advantage and update target from the Bellman expectation equation
        if done:
            advantage = reward - value
            target = reward
        else:
            advantage = (reward + self.discount_factor * next_value) - value
            target = reward + self.discount_factor * next_value

        # adjusted for tensorflow 2.3 / keras 2.4:
        # the target has to be wrapped in an np array.
        target = np.reshape(target, [1,self.value_size])
        #print("target shape is", target.shape)

        # update the critic with the prediction and the target
        self.critic.fit(state, target, epochs = 1, verbose = 0)

        advantageTmp = np.reshape(advantage, [1,1])
        self.actor.fit(x=[state, advantageTmp], y=actions, epochs = 1, verbose =0)

    def get_action(self, state):
        # the policy comes out as [[probabilities]]
        # take [0]
        policy = self.actor.predict(state)[0]
        #print("policy = ", policy)
        return np.random.choice(self.action_size, 1, p=policy)[0]

if __name__ == '__main__':
    # main
    env = CarrierStorage()
    agent = A2CAgent()
    state = env.reset()

    # record the state history
    #historyState = []
    scores, episodes = [], []
    EPISODES = 1000
    global_step = 0

    for e in range (EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = env.stateTo1hot(agent.state_size)
        status = env.isItEnd()
        if(status == 0 or status == 1):
            done = True
        reward = 0

        while not done:
            #env.render()
            global_step += 1
            action = agent.get_action(state)
            #print("action is", Action(action))
            next_state, reward, done, info = env.step(action)
            next_state = env.stateTo1hot(agent.state_size)
            agent.train_model(state, action, reward, next_state, done )
            score += reward
            state = copy.deepcopy(next_state)

            if done:
                print("episode:", e, " score:", score, "global_step", global_step)
                scores.append(score)
                episodes.append(e)
                plt.plot(episodes, scores, 'b')
                plt.show()
                plt.savefig("./history.png")
Something isn't quite right: after 1,000 episodes of training there is hardly any improvement. Looking at the average over each block of 100 episodes, it behaves exactly as if no experience replay were being used. On to A3C!
The traditional trick of feeding extra parameters into the loss function through a fake input does not seem to work in TensorFlow 2.0 and above.
[16:01:00]>cat testCustomLoss.py
import keras
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model
import numpy as np
from keras import backend as K

import tensorflow as tf
tf.config.run_functions_eagerly(True)
#tf.experimental_run_tf_function

main_input = Input(shape=(10,), dtype='int32', name='main_input')
#x = Embedding(output_dim=12, input_dim=100, input_length=100)(main_input)
#lstm_out = LSTM(4)(x)
#auxiliary_output = Dense(1, activation='sigmoid', name='aux_output')(lstm_out)

auxiliary_input = Input(shape=[1], name='aux_input')
#x = keras.layers.concatenate([lstm_out, auxiliary_input])
x = Dense(4, activation='relu')(main_input)
main_output = Dense(1, activation='sigmoid', name='main_output')(x)

def actor_loss_threeParameter(y_true, y_prediction, auxiliary_input):
    #aux_in = tf.keras.backend.cast(auxiliary_input, dtype='float64')
    out = K.clip(y_prediction, 1e-8, 1-1e-8)
    log_likily = y_true*K.log(out)
    return K.sum(-log_likily * auxiliary_input)

def actor_loss(delta):
    def actor_loss_fit(y_true, y_prediction):
        return actor_loss_threeParameter(y_true, y_prediction, delta)
    return actor_loss_fit

def test_loss(y_true, y_prediction):
    return (y_true - y_prediction)

model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output])
#model.compile(optimizer='rmsprop', loss='binary_crossentropy',loss_weights=[1., 0.2])
model.compile(optimizer='rmsprop', loss=actor_loss(delta = auxiliary_input))
#model.compile(optimizer='rmsprop', loss=test_loss)
model.summary()

main_in = np.arange(0,10)
main_in = np.reshape(main_in,[1,10])
copied_main_in = main_in.astype(np.float32)

main_out = 10
main_out = np.reshape(main_out, [1,1])
copied_out = main_out.astype(np.float32)

aux_in = 20
aux_in = np.reshape(aux_in, [1,1])

print("main in", main_in.shape)
print("main out", main_out.shape)
print("aux input", aux_in.shape)

model.fit(x= [copied_main_in, aux_in], y=copied_out, epochs = 10, verbose = 1)
Running this test, the loss sits at 0 and never decreases.
tf-docker /home/mnt/myStorage/test_gradientTape > python testCustomLoss.py
(CUDA / GPU initialization messages omitted)
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
main_input (InputLayer)         [(None, 10)]         0
__________________________________________________________________________________________________
dense (Dense)                   (None, 4)            44          main_input[0][0]
__________________________________________________________________________________________________
aux_input (InputLayer)          [(None, 1)]          0
__________________________________________________________________________________________________
main_output (Dense)             (None, 1)            5           dense[0][0]
==================================================================================================
Total params: 49
Trainable params: 49
Non-trainable params: 0
__________________________________________________________________________________________________
main in (1, 10)
main out (1, 1)
aux input (1, 1)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/dataset_ops.py:3350: UserWarning: Even though the tf.config.experimental_run_functions_eagerly option is set, this option does not apply to tf.data functions. tf.data functions are still traced and executed as graphs.
  "Even though the tf.config.experimental_run_functions_eagerly "
Epoch 1/10
1/1 [==============================] - 0s 428us/step - loss: 0.0000e+00
Epoch 2/10
1/1 [==============================] - 0s 331us/step - loss: 0.0000e+00
Epoch 3/10
1/1 [==============================] - 0s 333us/step - loss: 0.0000e+00
Epoch 4/10
1/1 [==============================] - 0s 303us/step - loss: 0.0000e+00
Epoch 5/10
1/1 [==============================] - 0s 302us/step - loss: 0.0000e+00
Epoch 6/10
1/1 [==============================] - 0s 277us/step - loss: 0.0000e+00
Epoch 7/10
1/1 [==============================] - 0s 348us/step - loss: 0.0000e+00
Epoch 8/10
1/1 [==============================] - 0s 289us/step - loss: 0.0000e+00
Epoch 9/10
1/1 [==============================] - 0s 275us/step - loss: 0.0000e+00
Epoch 10/10
1/1 [==============================] - 0s 269us/step - loss: 0.0000e+00
tf-docker /home/mnt/myStorage/test_gradientTape >
Just to check, I replaced the variable inside the loss function with a literal number, and then the loss did change. So I am giving up on the idea of doing this through fit() in TensorFlow 2.x; it looks like it has to be done with the GradientTape that 2.x provides. Fortunately someone has already implemented it.
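Roughly what that check looked like, reusing the names from testCustomLoss.py above; the constant 20.0 only stands in for the aux_input tensor and is purely illustrative:

# Same loss as before, but with a hard-coded constant instead of the aux_input tensor.
# Compiled this way, the reported loss actually changes from epoch to epoch.
def actor_loss_constant(y_true, y_prediction):
    out = K.clip(y_prediction, 1e-8, 1 - 1e-8)
    log_likily = y_true * K.log(out)
    return K.sum(-log_likily * 20.0)

model.compile(optimizer='rmsprop', loss=actor_loss_constant)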
2020-11-21. After a lot of flailing I finally figured out why the GradientTape update wasn't working: the input has to be fed through the model after GradientTape() is opened, i.e. inside the tape context. If the forward pass sits outside it, the tape has nothing to trace and it errors out. The code is turning into a patchwork of rags, and once you fall into one of these traps it is still hard to climb back out. It also looks like I should collect several samples and train on them in one go.
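The point in isolation, as a minimal sketch (model, state, action, reward and so on are placeholders, and the structure mirrors the single-step update used in the script below rather than a polished A2C step): everything the gradient has to flow through, the forward pass and the loss, lives inside the with tf.GradientTape() block.

import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

def train_step(model, state, action, reward, next_state, done, discount_factor=0.99):
    with tf.GradientTape() as tape:
        action_prob, value = model(state)            # forward pass inside the tape
        _, next_value = model(next_state)
        advantage = reward + (1.0 - done) * discount_factor * next_value - value
        actor_loss = -tf.math.log(action_prob[0, action]) * advantage
        critic_loss = advantage ** 2
        total_loss = actor_loss + critic_loss        # the loss is built inside the tape as well
    grads = tape.gradient(total_loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))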
from env_reinforce import CarrierStorage
from env_reinforce import Action
import random
from collections import defaultdict
import numpy as np
from termcolor import colored
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
import copy
from keras.models import model_from_json
from collections import deque
from keras import backend as K
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# evaluate tensors eagerly so the custom loss can be inspected.
import tensorflow as tf
tf.config.run_functions_eagerly(True)

# reference:
# https://github.com/keras-team/keras-io/blob/master/examples/rl/actor_critic_cartpole.py

class A2CAgent(object):
    def __init__(self):
        # 40 when the simple state representation is used.
        self.state_size = 40
        # a single float value is used
        self.action_size = 7

        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005

        self.DEFINE_NEW = True
        self.RENDER = True

        #self.actor = self.build_actor()
        #self.critic = self.build_critic()
        self.model = self.build_actorCritic()

    def build_actorCritic(self):
        input = Input(shape = (self.state_size,))
        common = Dense(self.state_size*2, activation='relu', kernel_initializer='he_uniform')(input)
        action_prob = Dense(self.action_size, activation = 'softmax', kernel_initializer='he_uniform')(common)
        critic = Dense(1)(common)
        model = Model(inputs = input, outputs = [action_prob, critic])
        return model

    def get_action(self, action_prob):
        # the probabilities come in as [[...]]
        # take [0]
        #print("policy = ", policy)
        return np.random.choice(self.action_size, 1, p=np.squeeze(action_prob))[0]

if __name__ == '__main__':
    # main
    env = CarrierStorage()
    agent = A2CAgent()
    state = env.reset()

    # record the state history
    #historyState = []
    scores, episodes, score_average = [], [], []
    EPISODES = 100000
    global_step = 0
    average = 0

    for e in range (EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = env.stateTo1hot(agent.state_size)
        status = env.isItEnd()
        if(status == 0 or status == 1):
            done = True
        reward = 0

        while not done:
            if(agent.RENDER == True):
                env.render()
            global_step += 1

            with tf.GradientTape() as tape:
                # the model has to be called under the tape so it can see the input/output relationship.
                # predict both actor and critic.
                action_prob, critic = agent.model(state)

                # convert the numpy state to a tensor and feed it through the overridden call.
                #state = np.reshape(state, [1, agent.state_size])
                #state = tf.convert_to_tensor(state, dtype=tf.int8, dtype_hint=None, name=None)
                #action_prob, critic = agent.model.call(state)
                print("action prob", action_prob)
                #print("critic", critic)

                # action_prob is printed as
                # tf.Tensor([[0.16487105 0.0549401 0.12524831 0.1738248 0.31119537 0.07012787 0.0997925 ]], shape=(1, 7), dtype=float32)
                # and critic as
                # tf.Tensor([[0.04798129]], shape=(1, 1), dtype=float32)

                # pick an action from action_prob.
                action = agent.get_action(action_prob[0])
                #print("Action is", Action(action))
                #print("critic", critic)
                #print("next critic", next_critic)
                if(agent.RENDER == True):
                    print("action is", Action(action))
                next_state, reward, done, info = env.step(action)
                next_state = env.stateTo1hot(agent.state_size)
                _, next_critic = agent.model(next_state)

                advantage = reward + (1.0 - done) * agent.discount_factor * next_critic - critic

                # the probabilities come in as [ [prob, prob, ... ] ]
                actor_loss = tf.math.log(action_prob[0, action]) * advantage
                critic_loss = advantage**2
                print("actor loss ", actor_loss)
                print("critic loss ", critic_loss)

                # single model, so actor_loss + critic_loss are added and trained in one step
                total_loss = actor_loss + critic_loss

            grads = tape.gradient(total_loss, agent.model.trainable_weights)
            #print("grad" , grads)
            optimizer = Adam(learning_rate = 0.01)
            optimizer.apply_gradients(zip(grads, agent.model.trainable_weights))

            score += reward
            average = average + score
            state = copy.deepcopy(next_state)

            if done:
                if(agent.RENDER == True):
                    print("episode:", e, " score:", score)
                if(e%1000 == 0 and e>1):
                    print("episode:", e, " score:", score, "global_step", global_step,"average", average)
                    scores.append(score)
                    score_average.append(average)
                    episodes.append(e)
                    # reset the average every 100 episodes.
                    average = 0

                    model_json_actor = agent.model.to_json()
                    model_json_critic = agent.model.to_json()
                    with open("./201027ActorA2c.json", "w") as json_file:
                        json_file.write(model_json_actor)
                    with open("./201027CriticA2c.json", "w") as json_file:
                        json_file.write(model_json_critic)
                    agent.model.save_weights("./201027weightActorA2c.h5")
                    agent.model.save_weights("./201027weightCriticA2c.h5")

                    plt.plot(episodes, score_average, 'b')
                    #plt.show()
                    plt.savefig("./history.png")
It is a bit inconvenient that this cannot be done through fit() in TensorFlow 2.x and the training has to go through GradientTape, and since the old tricks date from 1.x they can no longer be reused. After revising and revising I ended up with the code below. The score climbs more slowly than with DQN: storing samples about 100 at a time and training on them, the total score struggles to get past 1,000. Still, judging by the way the score drifts up and down, the algorithm itself seems to be correct. One thing that cost me a long time because I didn't know it: the part that computes the loss also has to sit inside the same GradientTape block.
from env_reinforce import CarrierStorage
from env_reinforce import Action
import random
from collections import defaultdict
import numpy as np
from termcolor import colored
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
import copy
from keras.models import model_from_json
from collections import deque
from keras import backend as K
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0

# evaluate tensors eagerly so the custom loss can be inspected.
import tensorflow as tf
tf.config.run_functions_eagerly(True)

# reference:
# https://github.com/keras-team/keras-io/blob/master/examples/rl/actor_critic_cartpole.py

class A2CAgent(object):
    def __init__(self):
        # 40 when the simple state representation is used.
        self.state_size = 40
        # a single float value is used
        self.action_size = 7

        self.discount_factor = 0.8

        self.DEFINE_NEW = False
        self.RENDER = False

        #self.actor = self.build_actor()
        #self.critic = self.build_critic()
        self.model = self.build_actorCritic()

    def build_actorCritic(self):
        if(self.DEFINE_NEW == True):
            input = Input(shape = (self.state_size,))
            common = Dense(self.state_size*24, activation='relu', kernel_initializer='he_uniform')(input)
            common2 = Dense(self.action_size*12, activation = 'relu',kernel_initializer='he_uniform')(common)
            action_prob = Dense(self.action_size, activation = 'softmax', kernel_initializer='he_uniform')(common2)
            critic = Dense(1)(common2)
            model = Model(inputs = input, outputs = [action_prob, critic])
        else:
            # load the existing model
            json_actor = open("./201027ActorA2c.json", "r")
            loaded_actor = json_actor.read()
            json_actor.close()
            model = model_from_json(loaded_actor)
            print("loading model %s"%json_actor)

            weight_actor = "./201027weightCriticA2c.h5"
            model.load_weights(weight_actor)
            print("loading saved weights %s"%weight_actor)
        return model

    def get_action(self, action_prob):
        # the probabilities come in as [[...]]
        # take [0]
        #print("policy = ", policy)
        return np.random.choice(self.action_size, 1, p=np.squeeze(action_prob))[0]

if __name__ == '__main__':
    # main
    env = CarrierStorage()
    agent = A2CAgent()
    state = env.reset()

    # record the state history
    #historyState = []
    scores, episodes, score_average = [], [], []
    EPISODES = 100000
    global_step = 0
    average = 0

    huber_loss = tf.losses.Huber()
    optimizer = Adam(learning_rate = 0.0001)

    # record the action, critic and reward histories as lists.
    actionprob_history, critic_history, reward_history = [], [], []

    for e in range (EPISODES):
        #print("episode check", e)
        done = False
        score = 0
        state = env.reset()
        state = env.stateTo1hot(agent.state_size)
        status = env.isItEnd()
        #print("reseted")
        if(status == 0 or status == 1):
            done = True
        reward = 0
        #print("zero rewards")

        # apply_gradients must not be applied at this point.
        while not done:
            if(agent.RENDER == True):
                env.render()
            global_step += 1

            # the model has to be called under the tape so it can see the input/output relationship.
            # predict both actor and critic.
            #with tf.GradientTape(persistent=True) as tape:
            with tf.GradientTape() as tape:
                action_prob, critic = agent.model(state)

                # action_prob is printed as
                # tf.Tensor([[0.16487105 0.0549401 0.12524831 0.1738248 0.31119537 0.07012787 0.0997925 ]], shape=(1, 7), dtype=float32)
                # and critic as
                # tf.Tensor([[0.04798129]], shape=(1, 1), dtype=float32)

                # pick an action from action_prob.
                action = agent.get_action(action_prob[0])
                #print("actionprob history",actionprob_history)
                if(agent.RENDER == True):
                    print("action is", Action(action))
                next_state, reward, done, info = env.step(action)

                # append to the histories
                critic_history.append(critic[0,0])
                actionprob_history.append(tf.math.log(action_prob[0, action]))
                reward_history.append(reward)

                next_state = env.stateTo1hot(agent.state_size)
                #_, next_critic = agent.model(next_state)

                score += reward
                average = average + score
                state = copy.deepcopy(next_state)

                # recompute the rewards with the discount factor.
                returns = []
                discounted_sum = 0
                for r in reward_history[::-1]:
                    discounted_sum = r + agent.discount_factor* discounted_sum
                    returns.insert(0, discounted_sum)

                # Normalize
                returns = np.array(returns)
                returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
                returns = returns.tolist()

                # Calculating loss values to update our network
                history = zip(actionprob_history, critic_history, returns)
                actor_losses = []
                critic_losses = []
                for log_prob, value, ret in history:
                    advantage = ret - value
                    #advantage = reward + (1.0 - done) * agent.discount_factor * next_critic - critic
                    # the probabilities come in as [ [prob, prob, ... ] ]
                    actor_losses.append(-log_prob*advantage)
                    #critic_losses.append(advantage**2)
                    critic_losses.append(huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))

                #print("actor loss ", actor_losses)
                #print("critic loss ", critic_losses)

                # single model, so actor_losses + critic_losses are added together and trained in one step
                #print("grad" , grads)
                #print("history", len(actionprob_history))
                total_loss = actor_losses + critic_losses

            # the loss also has to sit inside the GradientTape.
            if(len(actionprob_history) > 0 ):
                #print("actor losses", len(actor_losses))
                #print("critic losses", len(critic_losses))
                #print("check", len(total_loss))
                grads = tape.gradient(total_loss, agent.model.trainable_weights)
                #print("grads", grads)
                optimizer.apply_gradients(zip(grads, agent.model.trainable_weights))

            #print("actionprob history", actionprob_history)
            #print("cirtic,",critic_history)
            #print("rewards", reward_history)
            #print("actor losses", len(actor_losses))
            #print("critic losses", len(critic_losses))
            #print("total loss", len(total_loss))
            #print("actionprob_history", len(actionprob_history))
            #print("episodes", e)

        if(agent.RENDER == True):
            print("episode:", e, " score:", score)
        if(e%100 == 0):
            print("history length is", len(actionprob_history))
            print("episode:", e, " score:", score, "global_step", global_step,"average", average)
            scores.append(score)
            score_average.append(average)
            episodes.append(e)
            # reset the average every 1000 episodes.
            average = 0

            model_json_actor = agent.model.to_json()
            with open("./201027ActorA2c.json", "w") as json_file:
                json_file.write(model_json_actor)
            agent.model.save_weights("./201027weightCriticA2c.h5")

            plt.plot(episodes, score_average, 'b')
            #plt.show()
            plt.savefig("./history.png")

        # do not compute gradients from an empty history..
        #print("episode", e)
        actionprob_history.clear()
        critic_history.clear()
        reward_history.clear()

    plt.plot(episodes, score_average, 'b')
    #plt.show()
    plt.savefig("./history.png")
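For reference, the discounted-return calculation used in the script above, pulled out into a standalone snippet (the reward values here are made up):

import numpy as np

eps = np.finfo(np.float32).eps.item()
discount_factor = 0.8
reward_history = [0.0, 0.0, 1.0, 0.0, 5.0]   # illustrative per-step rewards

# Walk backwards through the episode, accumulating the discounted sum.
returns = []
discounted_sum = 0.0
for r in reward_history[::-1]:
    discounted_sum = r + discount_factor * discounted_sum
    returns.insert(0, discounted_sum)

# Normalize to zero mean and unit variance, as in the script above.
returns = (np.array(returns) - np.mean(returns)) / (np.std(returns) + eps)
print(returns)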
It handles the easy cases well and earns points, but when the carrier is stuck in a corner it just digs around and gives all the points back. Overall performance is lower than DQN. Now, on to the long-awaited A3C.