See the code referenced here.
The material is from the book "파이썬과 케라스로 배우는 강화학습" (Reinforcement Learning with Python and Keras).
The Monte Carlo method code is not explained in detail there, so I stepped through it briefly with the Python debugger (pdb) to see what it actually does.
The main code:
# 메인 함수
if __name__ == "__main__":
    env = Env()
    agent = MCAgent(actions=list(range(env.n_actions)))

    for episode in range(1000):
        state = env.reset()
        action = agent.get_action(state)

        while True:
            env.render()

            # 다음 상태로 이동
            # 보상은 숫자이고, 완료 여부는 boolean
            next_state, reward, done = env.step(action)
            agent.save_sample(next_state, reward, done)
            ##테스트용.
            print(agent.samples)
            print("\n")

            # 다음 행동 받아옴
            action = agent.get_action(next_state)

            # 에피소드가 완료됐을 때, 큐 함수 업데이트
            if done:
                print("episode : ", episode)
                agent.update()
                agent.samples.clear()
                break
Line 6: the episode loop runs 1,000 times.
Line 10: an infinite loop; whether to break is decided by the check at line 25. done itself is updated at line 15.
(Pdb) l
109             # 보상은 숫자이고, 완료 여부는 boolean
110             next_state, reward, done = env.step(action)
111             agent.save_sample(next_state, reward, done)
112             ##테스트용.
113             print(agent.samples)
114  ->         print("\n")
115
116             # 다음 행동 받아옴
117             action = agent.get_action(next_state)
118
119             # 에피소드가 완료됐을 때, 큐 함수 업데이트
(Pdb) l
120             if done:
121                 print("episode : ", episode)
122                 agent.update()
123                 agent.samples.clear()
124                 break
[EOF]
(Pdb) p agent.samples
[[[0, 0], 0, False], [[0, 1], 0, False], [[1, 1], 0, False], [[1, 2], -100, True]]
(Pdb)
When an episode ends, the agent has every cell it visited stored in a list (agent.samples).
Note that in the coordinates below, row and column are transposed.
Line 22: the next action is chosen based on the state the agent has just moved into.
(Pdb) l
 27             for reward in reversed(self.samples):
 28                 state = str(reward[0])
 29                 if state not in visit_state:
 30                     visit_state.append(state)
 31                     G_t = self.discount_factor * (reward[1] + G_t)
 32  ->                 value = self.value_table[state]
 33                     ##테스트..
 34                     print("Value is ", value)
 35                     self.value_table[state] = (value +
 36                         self.learning_rate * (G_t - value))
 37
(Pdb)
 38                     print("Value state is", state)
 39                     #print("type of value_table is", type(self.value_table))
 40                     print("Value table is", self.value_table)
 41                     #print("Value table is", self.value_table[state])
 42
 43             print("\n")
 44
 45     # 큐 함수에 따라서 행동을 반환
 46     # 입실론 탐욕 정책에 따라서 행동을 반환
 47     def get_action(self, state):
 48         if np.random.rand() < self.epsilon:
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(34)update()
-> print("Value is ",value)
(Pdb) n
Value is 0.0
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(35)update()
-> self.value_table[state] = (value +
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(36)update()
-> self.learning_rate * (G_t - value))
(Pdb)
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(38)update()
-> print("Value state is", state)
(Pdb)
Value state is [1, 2]
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(40)update()
-> print("Value table is", self.value_table)
(Pdb)
Value table is defaultdict(<class 'float'>, {'[0, 0]': 0.0, '[0, 1]': 0.0, '[1, 0]': 0.0, '[1, 2]': -0.9, '[2, 1]': 0.0, '[1, 1]': 0.0, '[1, 3]': 0.0, '[0, 2]': 0.0, '[2, 2]': 0.0})
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(43)update()
-> print("\n")
(Pdb) p self.value_table
defaultdict(<class 'float'>, {'[0, 0]': 0.0, '[0, 1]': 0.0, '[1, 0]': 0.0, '[1, 2]': -0.9, '[2, 1]': 0.0, '[1, 1]': 0.0, '[1, 3]': 0.0, '[0, 2]': 0.0, '[2, 2]': 0.0})
(Pdb)
self.value_table keeps one value per state. Visited states are appended to a list and each of their values is updated. The table is what the agent later consults when it derives an action from the current state.
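The pdb listing above cuts off at line 48, so only the epsilon check of get_action is visible. Below is a minimal sketch of how an epsilon-greedy choice over value_table could look; the neighbors() helper, the action ordering, and the way candidate states are looked up are assumptions made for illustration, not the book's exact implementation.

import numpy as np
from collections import defaultdict

# Sketch only: a value table with two of the values seen in the dumps above.
value_table = defaultdict(float)
value_table['[3, 2]'] = 80.3
value_table['[2, 2]'] = 89.2

actions = [0, 1, 2, 3]   # assumed order: up, down, left, right
epsilon = 0.1            # assumed exploration rate

def neighbors(state):
    # Hypothetical helper: the state reached by each action, in the same order as `actions`.
    row, col = state
    return [[row - 1, col], [row + 1, col], [row, col - 1], [row, col + 1]]

def get_action(state):
    if np.random.rand() < epsilon:
        # explore: pick a random action
        return int(np.random.choice(actions))
    # exploit: pick the action whose successor state has the highest value
    values = [value_table[str(s)] for s in neighbors(state)]
    return int(np.argmax(values))

print(get_action([3, 2]))   # usually 0, i.e. the move toward the high-value cell [2, 2]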
Each call to update() takes the terminal state's reward, discounts it step by step with the discount factor, and writes the result into value_table[state] all the way back to the starting state.
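As a quick sanity check of that arithmetic: the first episode above ends with the sample [[1, 2], -100, True], and the value table afterwards shows '[1, 2]': -0.9. Assuming discount_factor = 0.9 and learning_rate = 0.01 (these values are not visible in the listings), the numbers reproduce exactly:

# Reproducing the very first update from the dump above.
discount_factor = 0.9   # assumed
learning_rate = 0.01    # assumed

G_t = 0.0
G_t = discount_factor * (-100 + G_t)            # return at the terminal sample: -90.0

value = 0.0                                     # value_table starts at 0.0 (defaultdict(float))
value = value + learning_rate * (G_t - value)
print(value)                                    # -0.9, matching '[1, 2]': -0.9 in the dump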
In the code below,
because of the check "if state not in visit_state:", only the first visit to each state is taken into account (first-visit Monte Carlo).
 19     # 메모리에 샘플을 추가
 20     def save_sample(self, state, reward, done):
 21         self.samples.append([state, reward, done])
 22
 23     # 모든 에피소드에서 에이전트가 방문한 상태의 큐 함수를 업데이트
 24  -> def update(self):
 25         G_t = 0
 26         visit_state = []
 27         for reward in reversed(self.samples):
 28             state = str(reward[0])
 29             if state not in visit_state:
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(25)update()
-> G_t = 0
(Pdb) p self.samples
[[[1, 0], 0, False], [[1, 0], 0, False], [[2, 0], 0, False], [[3, 0], 0, False], [[3, 1], 0, False], [[3, 2], 0, False], [[2, 2], 100, True]]
(Pdb) ...
(Pdb) l
 24     def update(self):
 25         G_t = 0
 26         visit_state = []
 27         for reward in reversed(self.samples):
 28             state = str(reward[0])
 29  ->         if state not in visit_state:
 30                 visit_state.append(state)
 31                 G_t = self.discount_factor * (reward[1] + G_t)
 32                 value = self.value_table[state]
 33                 ##테스트..
 34                 print("Value is ",value)
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(30)update()
-> visit_state.append(state)
(Pdb)
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(31)update()
-> G_t = self.discount_factor * (reward[1] + G_t)
(Pdb)
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(32)update()
-> value = self.value_table[state]
(Pdb) p self.value_table
defaultdict(<class 'float'>, {'[0, 0]': 10.523899590468769, '[0, 1]': 2.3971909843905967, '[1, 0]': 45.81574921045315, '[1, 2]': -2.67309, '[2, 1]': -19.99607765407679, '[1, 1]': 6.356969207767222, '[1, 3]': 0.0, '[0, 2]': -0.41447951099999986, '[2, 2]': 89.22430756838313, '[2, 0]': 50.92319972091863, '[3, 0]': 60.0913655106793, '[3, 1]': 66.88710703917133, '[0, 3]': -0.7290000000000001, '[0, 4]': -0.6561000000000001, '[1, 4]': 0.0, '[4, 0]': 8.302503491918047, '[4, 1]': 6.159613912118276, '[4, 2]': 6.304592658442578, '[3, 2]': 80.25132103821517, '[3, 3]': 6.970546731858152, '[2, 3]': 0.0, '[3, 4]': 0.6561000000000001, '[4, 3]': 0.0, '[2, 4]': 0.0, '[4, 4]': 0.0})
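Putting the listings together, update() is a first-visit Monte Carlo value update over the episode, walked backwards from the terminal sample. The sketch below is a standalone rewrite of exactly the lines shown above with the test prints removed; discount_factor = 0.9 and learning_rate = 0.01 are the same assumed values as before.

from collections import defaultdict

discount_factor = 0.9   # assumed
learning_rate = 0.01    # assumed
value_table = defaultdict(float)

def update(samples):
    # Walk the episode backwards and update each first-visited state.
    G_t = 0
    visit_state = []
    for sample in reversed(samples):            # sample = [state, reward, done]
        state = str(sample[0])
        if state not in visit_state:            # first visit only; repeats are skipped
            visit_state.append(state)
            G_t = discount_factor * (sample[1] + G_t)   # G_t advances inside the check, as in the listing
            value = value_table[state]
            value_table[state] = value + learning_rate * (G_t - value)

# The successful episode from the dump above.
samples = [[[1, 0], 0, False], [[1, 0], 0, False], [[2, 0], 0, False],
           [[3, 0], 0, False], [[3, 1], 0, False], [[3, 2], 0, False],
           [[2, 2], 100, True]]
update(samples)
print(value_table['[2, 2]'])   # 0.9, i.e. 0.01 * (0.9 * 100)
print(value_table['[3, 2]'])   # 0.81, discounted one step further back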
After 500 episodes, the recorded history looks like this:
(Pdb) p agent.samples
[[[0, 1], 0, False], [[0, 0], 0, False], [[0, 1], 0, False], [[0, 0], 0, False], [[1, 0], 0, False], [[1, 1], 0, False], [[1, 0], 0, False], [[2, 0], 0, False], [[3, 0], 0, False], [[3, 1], 0, False], [[3, 2], 0, False], [[2, 2], 100, True]]
(Pdb) l
116             # 다음 행동 받아옴
117             action = agent.get_action(next_state)
118
119             # 에피소드가 완료됐을 때, 큐 함수 업데이트
120             if done:
121 B->             print("episode : ", episode)
122                 agent.update()
123                 agent.samples.clear()
124                 break
[EOF]
(Pdb) n
episode : 500
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(122)<module>()
-> agent.update()
(Pdb) l
117             action = agent.get_action(next_state)
118
119             # 에피소드가 완료됐을 때, 큐 함수 업데이트
120             if done:
121 B               print("episode : ", episode)
122  ->             agent.update()
123                 agent.samples.clear()
124                 break
[EOF]
(Pdb) s
--Call--
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(24)update()
-> def update(self):
(Pdb) l
 19     # 메모리에 샘플을 추가
 20     def save_sample(self, state, reward, done):
 21         self.samples.append([state, reward, done])
 22
 23     # 모든 에피소드에서 에이전트가 방문한 상태의 큐 함수를 업데이트
 24  -> def update(self):
 25         G_t = 0
 26         visit_state = []
 27         for reward in reversed(self.samples):
 28             state = str(reward[0])
 29             if state not in visit_state:
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(25)update()
-> G_t = 0
(Pdb)
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(26)update()
-> visit_state = []
(Pdb)
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(27)update()
-> for reward in reversed(self.samples):
(Pdb) p visit_state
[]
(Pdb) l
 22
 23     # 모든 에피소드에서 에이전트가 방문한 상태의 큐 함수를 업데이트
 24     def update(self):
 25         G_t = 0
 26         visit_state = []
 27  ->     for reward in reversed(self.samples):
 28             state = str(reward[0])
 29             if state not in visit_state:
 30                 visit_state.append(state)
 31                 G_t = self.discount_factor * (reward[1] + G_t)
 32                 value = self.value_table[state]
(Pdb) p self.samples
[[[0, 1], 0, False], [[0, 0], 0, False], [[0, 1], 0, False], [[0, 0], 0, False], [[1, 0], 0, False], [[1, 1], 0, False], [[1, 0], 0, False], [[2, 0], 0, False], [[3, 0], 0, False], [[3, 1], 0, False], [[3, 2], 0, False], [[2, 2], 100, True]]
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(28)update()
-> state = str(reward[0])
(Pdb) p state
[0, 0]
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(29)update()
-> if state not in visit_state:
(Pdb) p state
'[2, 2]'
(Pdb) p self.value_table
defaultdict(<class 'float'>, {'[0, 0]': 12.843038729477103, '[0, 1]': 3.6010347623049843, '[1, 0]': 45.739579013968516, '[0, 2]': 0.38742048900000015, '[1, 1]': 4.629255057991681, '[2, 0]': 50.83890751506551, '[2, 2]': 89.2397438477723, '[3, 1]': 66.85889829029591, '[2, 1]': -19.99607765407679, '[3, 0]': 60.06357418599044, '[4, 0]': 8.024660160565901, '[3, 2]': 80.26033687696258, '[4, 1]': 5.96080478456966, '[3, 3]': 6.970546731858152, '[4, 2]': 6.029123110357315, '[3, 4]': 1.3056390000000002, '[2, 4]': 0.47829690000000014, '[4, 4]': 0.5904900000000002, '[2, 3]': 0.81, '[4, 3]': 0.5314410000000002, '[1, 3]': 0.0, '[1, 4]': 0.0, '[1, 2]': 0.0, '[0, 3]': 0.0})
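The raw defaultdict with its '[x, y]' string keys is hard to scan. The helper below is a hypothetical convenience, not part of the book's code; it lays the table out as a 5x5 grid (the size inferred from the keys, which run from [0, 0] to [4, 4]). With the values after 500 episodes, the chain [1, 0] → [2, 0] → [3, 0] → [3, 1] → [3, 2] → [2, 2] leading toward the +100 goal stands out clearly.

from collections import defaultdict

# A few of the values from the 500-episode dump above, for illustration.
value_table = defaultdict(float, {'[1, 0]': 45.74, '[2, 0]': 50.84,
                                  '[3, 0]': 60.06, '[3, 1]': 66.86,
                                  '[3, 2]': 80.26, '[2, 2]': 89.24})

def print_grid(table, size=5):
    # Print the table indexed by its first and second coordinate
    # (whether that is row/column or column/row is left open, see the note above).
    for i in range(size):
        cells = [table[str([i, j])] for j in range(size)]
        print(" ".join(f"{v:7.2f}" for v in cells))

print_grid(value_table)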