A2C implemented with Keras (updated, completed)

To implement A2C with Keras, a custom loss function has to be defined. Normally `fit` takes just one input and one output; to compute the policy-gradient loss, the advantage has to be handed in along with the input (or output). In TensorFlow 1.x this worked with a well-known trick, but after the move to 2.x it no longer does. As the output further below shows, the actor loss stays fixed at 0.
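The trick being described looks roughly like the sketch below: the advantage goes in as an extra "fake" model input that only the custom loss reads. Layer sizes and names here are placeholders, not the code from this post; under TensorFlow 2.x graph execution this is exactly the setup that produces the symbolic-tensor error shown below.

# Minimal sketch of the TF 1.x-era "fake input" trick described above
# (illustrative names and sizes; not the code from this post).
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

state_size, action_size = 4, 2
state_in = Input(shape=(state_size,))
advantage_in = Input(shape=(1,))          # extra input, only read inside the loss

hidden = Dense(16, activation='relu')(state_in)
policy = Dense(action_size, activation='softmax')(hidden)

def policy_gradient_loss(y_true, y_pred):
    # y_true: one-hot action, y_pred: action probabilities
    prob = tf.reduce_sum(y_true * y_pred, axis=1)
    log_prob = tf.math.log(tf.clip_by_value(prob, 1e-8, 1.0))
    # the closure captures advantage_in, a Keras symbolic tensor,
    # which graph-mode fit() cannot evaluate
    return -tf.reduce_sum(log_prob * tf.squeeze(advantage_in, axis=1))

actor = Model(inputs=[state_in, advantage_in], outputs=policy)
actor.compile(optimizer='adam', loss=policy_gradient_loss)
# actor.fit(x=[states, advantages], y=one_hot_actions, ...)  -> raises the error below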

When passing the input parameters you can hand over a list of two, in the form `input = [input, advantage]`. But if the symbolic tensor is then used directly inside the loss, its value cannot be read and the following error occurs.

  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py", line 60, in quick_execute
    inputs, attrs, num_outputs)
TypeError: An op outside of the function building code is being passed
a "Graph" tensor. It is possible to have Graph tensors
leak out of the function building context by including a
tf.init_scope in your function building code.
For example, the following function will fail:
  @tf.function
  def has_init_scope():
    my_constant = tf.constant(1.)
    with tf.init_scope():
      added = my_constant * 2
The graph tensor has name: input_2:0

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "agent_a2c.py", line 153, in <module>
    agent.train_model(state, action, reward, next_state, done )
  File "agent_a2c.py", line 110, in train_model
    self.actor.fit(x=[state, advantageTmp], y=actions, epochs = 1, verbose =0)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 108, in _method_wrapper
    return method(self, *args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 1098, in fit
    tmp_logs = train_function(iterator)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py", line 780, in __call__
    result = self._call(*args, **kwds)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py", line 840, in _call
    return self._stateless_fn(*args, **kwds)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py", line 2829, in __call__
    return graph_function._filtered_call(args, kwargs)  # pylint: disable=protected-access
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py", line 1848, in _filtered_call
    cancellation_manager=cancellation_manager)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py", line 1924, in _call_flat
    ctx, args, cancellation_manager=cancellation_manager))
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py", line 550, in call
    ctx=ctx)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py", line 74, in quick_execute
    "tensors, but found {}".format(keras_symbolic_tensors))
tensorflow.python.eager.core._SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'input_2:0' shape=(None, 1) dtype=float32>]

In that case, enabling eager execution makes the error go away; the option was added in TensorFlow 2.x.

from env_reinforce import CarrierStorage 
from env_reinforce import Action
import random
from collections import defaultdict
import numpy as np
from termcolor import colored
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.models  import Model
from keras.optimizers import Adam
import copy
from keras.models import model_from_json
from collections import deque
from keras import backend as K
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


# run tf.functions eagerly so the tensors inside the custom loss can be inspected.
import tensorflow as tf
tf.config.experimental_run_functions_eagerly(True)


class A2CAgent(object):

    def __init__(self):

        # use 40 for the simplified state representation.
        self.state_size = 40 #uses a single float value
        self.action_size = 7
        self.value_size = 1

        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005

        self.actor = self.build_actor()
        self.critic = self.build_critic()



    # actor: takes the state and returns a probability for each action
    def build_actor(self):
        input = Input(shape = (self.state_size,))
        delta = Input(shape = [1])

        print("delta is ", delta)

        dense1 = Dense(self.state_size*2, activation='relu', kernel_initializer='he_uniform')(input)
        action = Dense(self.action_size, activation = 'softmax', kernel_initializer='he_uniform')(dense1)
        actor = Model(inputs = [input, delta], outputs = action)

        def actor_loss(y_true, y_prediction):
            out = K.clip(y_prediction, 1e-8, 1-1e-8)
            log_likily = y_true*K.log(out)

            return K.sum(-log_likily * delta)

        actor.summary()
        # the loss function is the problem..
        actor.compile(loss = actor_loss, optimizer = Adam(lr=self.actor_lr))
        return actor

    # critic: takes the state and estimates its value
    def build_critic(self):
        critic = Sequential()
        critic.add(Dense(self.state_size*2, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        #critic.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear', kernel_initializer='he_uniform'))


        critic.compile(loss = 'mse', optimizer = Adam(lr=self.critic_lr))
        print("critic summary")
        critic.summary()
        return critic


    # update the policy network and the value network at every time step
    def train_model(self, state, action, reward, next_state, done):

        value = self.critic.predict(state)[0][0]
        next_value = self.critic.predict(next_state)[0][0]

        # turn the action into a one-hot vector.
        actions = np.zeros([1, self.action_size])
        actions[np.arange(1), action] = 1.0

        #reshape
        actions = np.reshape(actions, [1, self.action_size])

        # advantage and update target from the Bellman expectation equation
        if done:
            advantage = reward - value
            target = reward
        else:
            advantage = (reward + self.discount_factor * next_value) - value
            target = reward + self.discount_factor * next_value


        # adjusted for tensorflow 2.3 / keras 2.4.
        # the target must be an np.array.
        target = np.reshape(target, [1,self.value_size])
        #print("target shape is", target.shape)
        # update the critic with the prediction and the target
        self.critic.fit(state, target, epochs = 1, verbose = 0)

        advantageTmp = np.reshape(advantage, [1,1])

        self.actor.fit(x=[state, advantageTmp], y=actions, epochs = 1, verbose =0)

    def get_action(self, state):
        # the policy comes out as [[prob, prob, ...]]
        # so take index [0]
        policy = self.actor.predict(state)[0]
        #print("policy = ", policy)
        return np.random.choice(self.action_size, 1, p=policy)[0]
         

if __name__ == '__main__':

    # main
    env = CarrierStorage()
    agent = A2CAgent()
    state = env.reset()

    # record the state history
    #historyState = []

    scores, episodes = [], []
    EPISODES = 1000

    global_step = 0

    for e in range (EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = env.stateTo1hot(agent.state_size)
        status = env.isItEnd()
        if(status == 0 or status == 1):
            done = True
            reward = 0

        while not done:
            #env.render()
            global_step += 1
            action = agent.get_action(state)
            #print("action is", Action(action))
            next_state, reward, done, info = env.step(action)
            next_state = env.stateTo1hot(agent.state_size)
            agent.train_model(state, action, reward, next_state, done )
            score += reward
            state = copy.deepcopy(next_state)

        if done:
            print("episode:", e, "  score:", score, "global_step", global_step)
            scores.append(score)
            episodes.append(e)
        plt.plot(episodes, scores, 'b')
        plt.show()
        plt.savefig("./history.png")

Something doesn't quite add up: training for 1,000 episodes has little effect. The average score over each block of 100 episodes shows the same behaviour as training without experience replay. On to A3C!

Keras: Multiple Inputs and Mixed Data
https://stackoverflow.com/questions/57704771/inputs-to-eager-execution-function-cannot-be-keras-symbolic-tensors
https://stackoverflow.com/questions/45961428/make-a-custom-loss-function-in-keras

The traditional approach of passing the extra parameters a loss function needs through a fake input does not seem to work in TensorFlow 2.0 and later.

https://github.com/tensorflow/tensorflow/issues/32142
[16:01:00]>cat testCustomLoss.py 
import keras 
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model
import numpy as np
from keras import backend as K
import tensorflow as tf
tf.config.run_functions_eagerly(True)
#tf.experimental_run_tf_function
main_input = Input(shape=(10,), dtype='int32', name='main_input')
 
#x = Embedding(output_dim=12, input_dim=100, input_length=100)(main_input)
 
#lstm_out = LSTM(4)(x)
 
#auxiliary_output = Dense(1, activation='sigmoid', name='aux_output')(lstm_out)
 
auxiliary_input = Input(shape=[1], name='aux_input')

 
#x = keras.layers.concatenate([lstm_out, auxiliary_input])
x = Dense(4, activation='relu')(main_input)
main_output = Dense(1, activation='sigmoid', name='main_output')(x)

def actor_loss_threeParameter(y_true, y_prediction, auxiliary_input):
    #aux_in = tf.keras.backend.cast(auxiliary_input, dtype='float64')
    out = K.clip(y_prediction, 1e-8, 1-1e-8)
    log_likily = y_true*K.log(out)
    return K.sum(-log_likily * auxiliary_input)

def actor_loss(delta):
    def actor_loss_fit(y_true, y_prediction):
        return actor_loss_threeParameter(y_true, y_prediction, delta)
    return actor_loss_fit


def test_loss(y_true, y_prediction):
    return (y_true - y_prediction)


 
model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output])
#model.compile(optimizer='rmsprop', loss='binary_crossentropy',loss_weights=[1., 0.2])
model.compile(optimizer='rmsprop', loss=actor_loss(delta = auxiliary_input))
#model.compile(optimizer='rmsprop', loss=test_loss)


model.summary()

main_in = np.arange(0,10)
main_in = np.reshape(main_in,[1,10])
copied_main_in = main_in.astype(np.float32)

main_out = 10
main_out = np.reshape(main_out, [1,1])
copied_out = main_out.astype(np.float32)


aux_in = 20
aux_in = np.reshape(aux_in, [1,1])
print("main in", main_in.shape)
print("main out", main_out.shape)
print("aux input", aux_in.shape)
model.fit(x= [copied_main_in, aux_in], y=copied_out, epochs = 10, verbose = 1)

With this setup, the loss never decreases from 0.

tf-docker /home/mnt/myStorage/test_gradientTape > python testCustomLoss.py 
2020-11-18 16:00:53.764598: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-11-18 16:00:54.659858: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2020-11-18 16:00:54.678468: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-18 16:00:54.678865: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:26:00.0 name: GeForce GTX 1060 6GB computeCapability: 6.1
coreClock: 1.7085GHz coreCount: 10 deviceMemorySize: 5.93GiB deviceMemoryBandwidth: 178.99GiB/s
2020-11-18 16:00:54.678890: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-11-18 16:00:54.680022: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2020-11-18 16:00:54.681178: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2020-11-18 16:00:54.681346: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10
2020-11-18 16:00:54.682446: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10
2020-11-18 16:00:54.683116: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10
2020-11-18 16:00:54.685549: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7
2020-11-18 16:00:54.685688: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-18 16:00:54.686097: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-18 16:00:54.686405: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2020-11-18 16:00:54.686688: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2020-11-18 16:00:54.710941: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 3399500000 Hz
2020-11-18 16:00:54.711710: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x41b63d0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-11-18 16:00:54.711754: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2020-11-18 16:00:54.986824: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-18 16:00:54.987296: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x41b8620 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-11-18 16:00:54.987352: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): GeForce GTX 1060 6GB, Compute Capability 6.1
2020-11-18 16:00:54.987771: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-18 16:00:54.988673: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:26:00.0 name: GeForce GTX 1060 6GB computeCapability: 6.1
coreClock: 1.7085GHz coreCount: 10 deviceMemorySize: 5.93GiB deviceMemoryBandwidth: 178.99GiB/s
2020-11-18 16:00:54.988743: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-11-18 16:00:54.988804: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2020-11-18 16:00:54.988846: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2020-11-18 16:00:54.988887: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10
2020-11-18 16:00:54.988926: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10
2020-11-18 16:00:54.988972: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10
2020-11-18 16:00:54.989018: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7
2020-11-18 16:00:54.989196: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-18 16:00:54.990189: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-18 16:00:54.991068: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2020-11-18 16:00:54.991140: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-11-18 16:00:55.378048: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-11-18 16:00:55.378099: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263]      0 
2020-11-18 16:00:55.378107: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0:   N 
2020-11-18 16:00:55.378324: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-18 16:00:55.378869: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-18 16:00:55.379209: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 4990 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:26:00.0, compute capability: 6.1)
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
main_input (InputLayer)         [(None, 10)]         0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 4)            44          main_input[0][0]                 
__________________________________________________________________________________________________
aux_input (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
main_output (Dense)             (None, 1)            5           dense[0][0]                      
==================================================================================================
Total params: 49
Trainable params: 49
Non-trainable params: 0
__________________________________________________________________________________________________
main in (1, 10)
main out (1, 1)
aux input (1, 1)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/dataset_ops.py:3350: UserWarning: Even though the tf.config.experimental_run_functions_eagerly option is set, this option does not apply to tf.data functions. tf.data functions are still traced and executed as graphs.
  "Even though the tf.config.experimental_run_functions_eagerly "
Epoch 1/10
2020-11-18 16:00:55.511115: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
1/1 [==============================] - 0s 428us/step - loss: 0.0000e+00
Epoch 2/10
1/1 [==============================] - 0s 331us/step - loss: 0.0000e+00
Epoch 3/10
1/1 [==============================] - 0s 333us/step - loss: 0.0000e+00
Epoch 4/10
1/1 [==============================] - 0s 303us/step - loss: 0.0000e+00
Epoch 5/10
1/1 [==============================] - 0s 302us/step - loss: 0.0000e+00
Epoch 6/10
1/1 [==============================] - 0s 277us/step - loss: 0.0000e+00
Epoch 7/10
1/1 [==============================] - 0s 348us/step - loss: 0.0000e+00
Epoch 8/10
1/1 [==============================] - 0s 289us/step - loss: 0.0000e+00
Epoch 9/10
1/1 [==============================] - 0s 275us/step - loss: 0.0000e+00
Epoch 10/10
1/1 [==============================] - 0s 269us/step - loss: 0.0000e+00
tf-docker /home/mnt/myStorage/test_gradientTape > 

Just to check, I replaced the variable inside the loss function with a literal number, and the loss did change. So I have given up on making this work through `fit` in TensorFlow 2.x; it looks like it has to be done with the GradientTape API that 2.x provides. Fortunately, someone has already implemented this approach.

2020-11-21. After a lot of digging I finally understood why the GradientTape update was not working: the forward pass through the model has to happen inside the GradientTape() block. If the model call is done elsewhere, the tape cannot track the computation and an error is raised. The code is slowly turning into a patchwork. The agent still cannot get out once it falls into a trap, so it seems I need to collect several samples and train on them in one batch.
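Before the full code, a minimal sketch of that constraint with a toy model and made-up numbers (not the agent below): the model call and the loss built from it both sit inside the tape, so the gradient with respect to the weights can be taken afterwards.

# Hedged sketch of the GradientTape pattern (toy model, illustrative shapes).
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(40,)),
    tf.keras.layers.Dense(7, activation='softmax'),
])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

state = np.random.rand(1, 40).astype('float32')
action, advantage = 3, 1.5   # stand-ins for a sampled action and its advantage

with tf.GradientTape() as tape:
    # the forward pass must happen under the tape; calling the model outside
    # and only computing the loss here breaks the gradient chain.
    action_prob = model(state)
    actor_loss = -tf.math.log(action_prob[0, action]) * advantage

grads = tape.gradient(actor_loss, model.trainable_weights)
optimizer.apply_gradients(zip(grads, model.trainable_weights))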

from env_reinforce import CarrierStorage 
from env_reinforce import Action
import random
from collections import defaultdict
import numpy as np
from termcolor import colored
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.models  import Model
from keras.optimizers import Adam
import copy
from keras.models import model_from_json
from collections import deque
from keras import backend as K
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


# run tf.functions eagerly so the tensors inside the custom loss can be inspected.
import tensorflow as tf
tf.config.run_functions_eagerly(True)


# reference:
#https://github.com/keras-team/keras-io/blob/master/examples/rl/actor_critic_cartpole.py

class A2CAgent(object):

    def __init__(self):

        # use 40 for the simplified state representation.
        self.state_size = 40 #uses a single float value
        self.action_size = 7

        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005

        self.DEFINE_NEW = True
        self.RENDER = True

        #self.actor = self.build_actor()
        #self.critic = self.build_critic()
        self.model = self.build_actorCritic()

    def build_actorCritic(self):
        input = Input(shape = (self.state_size,))
        common = Dense(self.state_size*2, activation='relu', kernel_initializer='he_uniform')(input)
        action_prob = Dense(self.action_size, activation = 'softmax', kernel_initializer='he_uniform')(common)
        critic = Dense(1)(common)
        model = Model(inputs = input, outputs = [action_prob, critic])
        return model



    def get_action(self, action_prob):
        # the policy comes out as [[prob, prob, ...]]
        # so take index [0]
        #print("policy = ", policy)
        return np.random.choice(self.action_size, 1, p=np.squeeze(action_prob))[0]

if __name__ == '__main__':

    # main
    env = CarrierStorage()
    agent = A2CAgent()
    state = env.reset()

    # record the state history
    #historyState = []

    scores, episodes, score_average = [], [], []
    EPISODES = 100000

    global_step = 0
    average = 0

    for e in range (EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = env.stateTo1hot(agent.state_size)
        status = env.isItEnd()
        if(status == 0 or status == 1):
            done = True
            reward = 0

        while not done:
            if(agent.RENDER == True):
                env.render()
            global_step += 1
            with tf.GradientTape() as tape:
                # the model must be called under the tape so the input/output relation is tracked.
                # predict both the actor and the critic outputs.
                action_prob, critic = agent.model(state)
                # (alternative) convert the numpy state to a tensor and feed it through the overridden call().
                #state = np.reshape(state, [1, agent.state_size])
                #state = tf.convert_to_tensor(state, dtype=tf.int8, dtype_hint=None, name=None)
                #action_prob, critic = agent.model.call(state)
                print("action prob", action_prob)
                #print("critic", critic)
                # action_prob is printed as tf.Tensor(
                # [[0.16487105 0.0549401  0.12524831 0.1738248  0.31119537 0.07012787  0.0997925 ]], shape=(1, 7), dtype=float32)
                # and critic as
                # tf.Tensor([[0.04798129]], shape=(1, 1), dtype=float32).
                # pick the action from action_prob.
                action = agent.get_action(action_prob[0])
                #print("Action is", Action(action))
                #
                #print("critic", critic)
                #print("next critic", next_critic)
                if(agent.RENDER == True):
                    print("action is", Action(action))
                next_state, reward, done, info = env.step(action)
                next_state = env.stateTo1hot(agent.state_size)
                _, next_critic = agent.model(next_state)
                advantage = reward  + (1.0 - done) * agent.discount_factor * next_critic - critic
                # the input arrives in the form [[prob, prob, ...]]
                actor_loss = tf.math.log(action_prob[0, action]) * advantage
                critic_loss =  advantage**2
                print("actor loss ", actor_loss)
                print("critic loss ", critic_loss)
                # single model, so add actor_loss + critic_loss and train in one step
                total_loss = actor_loss + critic_loss
                grads = tape.gradient(total_loss, agent.model.trainable_weights)
                #print("grad" , grads)
                optimizer = Adam(learning_rate = 0.01)
                optimizer.apply_gradients(zip(grads, agent.model.trainable_weights))

            score += reward
            average = average + score
            state = copy.deepcopy(next_state)

        if done:
            if(agent.RENDER == True):
                print("episode:", e, "  score:", score)

            if(e%1000 == 0 and e>1):
                print("episode:", e, "  score:", score, "global_step", global_step,"average", average)
                scores.append(score)
                score_average.append(average)
                episodes.append(e)

                # reset the running average every 1,000 episodes.
                average = 0
                model_json_actor = agent.model.to_json()
                model_json_critic = agent.model.to_json()
                with open("./201027ActorA2c.json", "w") as json_file:
                    json_file.write(model_json_actor)
                with open("./201027CriticA2c.json", "w") as json_file:
                    json_file.write(model_json_critic)

                agent.model.save_weights("./201027weightActorA2c.h5")
                agent.model.save_weights("./201027weightCriticA2c.h5")

        plt.plot(episodes, score_average, 'b')
        #plt.show()
        plt.savefig("./history.png")

It is a bit inconvenient that this cannot be trained through `fit` in TensorFlow 2.x and has to go through GradientTape; the old tricks, written for earlier versions, no longer apply. After revising and revising I ended up with the code below. The score climbs more slowly than with DQN: I stored about 100 samples at a time and trained on them, and the total score rarely gets past 1,000. Still, judging from the way the score rises and falls, the algorithm itself seems correct. One more catch: the loss computation also has to sit inside the same GradientTape block. Not knowing that, I was stuck for quite a while.
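The core of that batched update is the return computation: walk the stored rewards backwards into discounted returns, then normalize them, as in the keras-io actor-critic example linked above. A standalone sketch with made-up reward values (names are illustrative):

# Hedged sketch of the per-batch return computation used in the code below.
import numpy as np

discount_factor = 0.8
eps = np.finfo(np.float32).eps.item()

reward_history = [0.0, 0.0, 1.0, 0.0, 5.0]   # rewards collected over one batch of steps

# walk the rewards backwards so each entry becomes r_t + gamma * G_{t+1}
returns = []
discounted_sum = 0.0
for r in reward_history[::-1]:
    discounted_sum = r + discount_factor * discounted_sum
    returns.insert(0, discounted_sum)

# normalize to zero mean / unit variance to keep the actor and critic losses stable
returns = np.array(returns)
returns = (returns - returns.mean()) / (returns.std() + eps)
print(returns)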

from env_reinforce import CarrierStorage 
from env_reinforce import Action
import random
from collections import defaultdict
import numpy as np
from termcolor import colored
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.models  import Model
from keras.optimizers import Adam
import copy
from keras.models import model_from_json
from collections import deque
from keras import backend as K
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0

# run tf.functions eagerly so the tensors inside the custom loss can be inspected.
import tensorflow as tf
tf.config.run_functions_eagerly(True)


# reference:
#https://github.com/keras-team/keras-io/blob/master/examples/rl/actor_critic_cartpole.py

class A2CAgent(object):

    def __init__(self):

        # use 40 for the simplified state representation.
        self.state_size = 40 #uses a single float value
        self.action_size = 7

        self.discount_factor = 0.8

        self.DEFINE_NEW = False
        self.RENDER = False

        #self.actor = self.build_actor()
        #self.critic = self.build_critic()
        self.model = self.build_actorCritic()

    def build_actorCritic(self):
        if(self.DEFINE_NEW == True):
            input = Input(shape = (self.state_size,))
            common = Dense(self.state_size*24, activation='relu', kernel_initializer='he_uniform')(input)
            common2 = Dense(self.action_size*12, activation = 'relu',kernel_initializer='he_uniform')(common)
            action_prob = Dense(self.action_size, activation = 'softmax', kernel_initializer='he_uniform')(common2)
            critic = Dense(1)(common2)
            model = Model(inputs = input, outputs = [action_prob, critic])

        else:
            # load the saved model
            json_actor = open("./201027ActorA2c.json", "r")
            loaded_actor = json_actor.read()
            json_actor.close()
            model= model_from_json(loaded_actor)
            print("모델 %s를 로딩"%json_actor)
            weight_actor = "./201027weightCriticA2c.h5"
            model.load_weights(weight_actor)
            print("저장된 weights %s를 로딩"%weight_actor)
        return model

    def get_action(self, action_prob):
        # the policy comes out as [[prob, prob, ...]]
        # so take index [0]
        #print("policy = ", policy)
        return np.random.choice(self.action_size, 1, p=np.squeeze(action_prob))[0]

if __name__ == '__main__':

    # main
    env = CarrierStorage()
    agent = A2CAgent()
    state = env.reset()

    # record the state history
    #historyState = []

    scores, episodes, score_average = [], [], []
    EPISODES = 100000

    global_step = 0
    average = 0
    huber_loss = tf.losses.Huber()
    optimizer = Adam(learning_rate = 0.0001)


    # record the action log-probs, critic values, and rewards as lists.
    actionprob_history, critic_history, reward_history = [], [], []
    

    for e in range (EPISODES):
        #print("episode check", e)
        done = False
        score = 0
        state = env.reset()
        state = env.stateTo1hot(agent.state_size)
        status = env.isItEnd()
        #print("reseted")
        if(status == 0 or status == 1):
            done = True
            reward = 0
            #print("zero rewards")
            # apply_gradients must not be called here.
        while not done:
            if(agent.RENDER == True):
                env.render()
            global_step += 1
            # the model must be called under the tape so the input/output relation is tracked.
            # predict both the actor and the critic outputs.

            #with tf.GradientTape(persistent=True) as tape:
            with tf.GradientTape() as tape:
                action_prob, critic = agent.model(state)

                # action_prob is printed as tf.Tensor(
                # [[0.16487105 0.0549401  0.12524831 0.1738248  0.31119537 0.07012787  0.0997925 ]], shape=(1, 7), dtype=float32)
                # and critic as
                # tf.Tensor([[0.04798129]], shape=(1, 1), dtype=float32).
                # pick the action from action_prob.
                action = agent.get_action(action_prob[0])
                #print("actionprob history",actionprob_history)
                if(agent.RENDER == True):
                    print("action is", Action(action))
                next_state, reward, done, info = env.step(action)

                # append to the histories
                critic_history.append(critic[0,0])
                actionprob_history.append(tf.math.log(action_prob[0, action]))
                reward_history.append(reward)
                next_state = env.stateTo1hot(agent.state_size)
                #_, next_critic = agent.model(next_state)
                score += reward
                average = average + score
                state = copy.deepcopy(next_state)

                # recompute the rewards as discounted returns.
                returns = []
                discounted_sum = 0
                for r in reward_history[::-1]:
                    discounted_sum = r + agent.discount_factor* discounted_sum
                    returns.insert(0, discounted_sum)

                # Normalize
                returns = np.array(returns)
                returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
                returns = returns.tolist()

                # Calculating loss values to update our network
                history = zip(actionprob_history, critic_history, returns)
                actor_losses = []
                critic_losses = []
                for log_prob, value, ret in history:
                    advantage = ret - value
                    #advantage = reward  + (1.0 - done) * agent.discount_factor * next_critic - critic
                    # the input arrives in the form [[prob, prob, ...]]
                    actor_losses.append(-log_prob*advantage)
                    #critic_losses.append(advantage**2)
                    critic_losses.append(huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))
                    #print("actor loss ", actor_losses)
                    #print("critic loss ", critic_losses)
                    # single model, so add actor_loss + critic_loss and train in one step
                    #print("grad" , grads)
                    #print("history", len(actionprob_history))
               
                total_loss = actor_losses + critic_losses
                # the loss must also be computed inside the GradientTape block.
            if(len(actionprob_history) > 0 ):
                #print("actor losses", len(actor_losses))
                #print("critic losses", len(critic_losses))
                #print("check", len(total_loss))
                grads = tape.gradient(total_loss, agent.model.trainable_weights)
                #print("grads", grads)
                optimizer.apply_gradients(zip(grads, agent.model.trainable_weights))
                #print("actionprob history", actionprob_history)
                #print("cirtic,",critic_history)
                #print("rewards", reward_history)
                #print("actor losses", len(actor_losses))
                #print("critic losses", len(critic_losses))
                #print("total loss", len(total_loss))

                #print("actionprob_history", len(actionprob_history))
                #print("episodes", e)
        if(agent.RENDER == True):
            print("episode:", e, "  score:", score)
        if(e%100 == 0):
            print("history length is", len(actionprob_history))
            print("episode:", e, "  score:", score, "global_step", global_step,"average", average)
            scores.append(score)
            score_average.append(average)
            episodes.append(e)
            # reset the running average every 100 episodes.
            average = 0
            model_json_actor = agent.model.to_json()
            with open("./201027ActorA2c.json", "w") as json_file:
                json_file.write(model_json_actor)
            agent.model.save_weights("./201027weightCriticA2c.h5")
            plt.plot(episodes, score_average, 'b')
            #plt.show()
            plt.savefig("./history.png")
        # avoid computing gradients from an empty history..
        #print("episode", e)
            actionprob_history.clear()
            critic_history.clear()
            reward_history.clear()

    plt.plot(episodes, score_average, 'b')
    #plt.show()
    plt.savefig("./history.png")

It handles the easy cases well and racks up points, but when the carrier is stuck in a corner it just flails around and gives all the points back. Performance is lower than DQN. Now on to the long-awaited A3C.

Published 2020-10-27 · Author: 이대원
Category: tensorflow · Tags: a2c, keras, reinforcementLearning, tensorflow
