여기에 있는 코드 참조..
책은 “파이썬과 케라스로 배우는 강화학습” 내용.
Monte Carlo Method의 코드가 설명되어 있지 않아, python 디버그로 그 결과를 잠깐 살폈다.
메인 코드.
# Main entry point: run 1000 episodes of first-visit Monte Carlo control
# on the grid-world environment.
# NOTE(review): indentation was lost in the original paste; reconstructed
# here to match the pdb listing of mc_agentTest.py (lines 109-124).
if __name__ == "__main__":
    env = Env()
    agent = MCAgent(actions=list(range(env.n_actions)))

    for episode in range(1000):
        state = env.reset()
        action = agent.get_action(state)

        while True:
            env.render()

            # Advance the environment one step.
            # reward is a number, done is a boolean.
            next_state, reward, done = env.step(action)
            agent.save_sample(next_state, reward, done)

            ## Debug output only.
            print(agent.samples)
            print("\n")

            # Choose the next action (epsilon-greedy on the value table).
            action = agent.get_action(next_state)

            # When the episode terminates, update the value function
            # from the collected samples, then clear them.
            if done:
                print("episode : ", episode)
                agent.update()
                agent.samples.clear()
                break
6번행: episode를 1,000번 반복.
10번행: 무한번 반복이나, 25번 결과를 보고 break 결정. done은 15번 행에서 업데이트.
(Pdb) l
109 # 보상은 숫자이고, 완료 여부는 boolean
110 next_state, reward, done = env.step(action)
111 agent.save_sample(next_state, reward, done)
112 ##테스트용.
113 print(agent.samples)
114 -> print("\n")
115
116 # 다음 행동 받아옴
117 action = agent.get_action(next_state)
118
119 # 에피소드가 완료됐을 때, 큐 함수 업데이트
(Pdb) l
120 if done:
121 print("episode : ", episode)
122 agent.update()
123 agent.samples.clear()
124 break
[EOF]
(Pdb) p agent.samples
[[[0, 0], 0, False], [[0, 1], 0, False], [[1, 1], 0, False], [[1, 2], -100, True]]
(Pdb)
에피소드가 끝나면, 자신이 방문한 모든 셀을 list로 기억.
아래 좌표 중, 행과 열이 바뀜.

22번행: 다음 스테이트(next_state)를 보고 다음 행동(action)을 받아옴.
(Pdb) l
27 for reward in reversed(self.samples):
28 state = str(reward[0])
29 if state not in visit_state:
30 visit_state.append(state)
31 G_t = self.discount_factor * (reward[1] + G_t)
32 -> value = self.value_table[state]
33 ##테스트..
34 print("Value is ",value)
35 self.value_table[state] = (value +
36 self.learning_rate * (G_t - value))
37
(Pdb)
38 print("Value state is", state)
39 #print("type of value_table is", type(self.value_table))
40 print("Value table is", self.value_table)
41 #print("Value table is",self.value_table[state])
42
43 print("\n")
44
45 # 큐 함수에 따라서 행동을 반환
46 # 입실론 탐욕 정책에 따라서 행동을 반환
47 def get_action(self, state):
48 if np.random.rand() < self.epsilon:
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(34)update()
-> print("Value is ",value)
(Pdb) n
Value is 0.0
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(35)update()
-> self.value_table[state] = (value +
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(36)update()
-> self.learning_rate * (G_t - value))
(Pdb)
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(38)update()
-> print("Value state is", state)
(Pdb)
Value state is [1, 2]
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(40)update()
-> print("Value table is", self.value_table)
(Pdb)
Value table is defaultdict(<class 'float'>, {'[0, 0]': 0.0, '[0, 1]': 0.0, '[1, 0]': 0.0, '[1, 2]': -0.9, '[2, 1]': 0.0, '[1, 1]': 0.0, '[1, 3]': 0.0, '[0, 2]': 0.0, '[2, 2]': 0.0})
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(43)update()
-> print("\n")
(Pdb) p self.value_table
defaultdict(<class 'float'>, {'[0, 0]': 0.0, '[0, 1]': 0.0, '[1, 0]': 0.0, '[1, 2]': -0.9, '[2, 1]': 0.0, '[1, 1]': 0.0, '[1, 3]': 0.0, '[0, 2]': 0.0, '[2, 2]': 0.0})
(Pdb)
self.value_table을 각 state 관리. 방문한 state를 list로 추가하고, 각 value를 update. 나중에 현재 state를 보고 action을 산출 할 때 필요.
update 하면 terminal state의 reward를 감가율로 계속 감소시켜 시작 스테이트까지 value_table[state]로 업데이트..
아래 코드에서
if state not in visit_state 조건이 있기 때문에, 처음 방문한 state만 관심을 가짐 (first-visit Monte Carlo).
19 # 메모리에 샘플을 추가
20 def save_sample(self, state, reward, done):
21 self.samples.append([state, reward, done])
22
23 # 모든 에피소드에서 에이전트가 방문한 상태의 큐 함수를 업데이트
24 -> def update(self):
25 G_t = 0
26 visit_state = []
27 for reward in reversed(self.samples):
28 state = str(reward[0])
29 if state not in visit_state:
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(25)update()
-> G_t = 0
(Pdb) p self.samples
[[[1, 0], 0, False], [[1, 0], 0, False], [[2, 0], 0, False], [[3, 0], 0, False], [[3, 1], 0, False], [[3, 2], 0, False], [[2, 2], 100, True]]
(Pdb)
...
(Pdb) l
24 def update(self):
25 G_t = 0
26 visit_state = []
27 for reward in reversed(self.samples):
28 state = str(reward[0])
29 -> if state not in visit_state:
30 visit_state.append(state)
31 G_t = self.discount_factor * (reward[1] + G_t)
32 value = self.value_table[state]
33 ##테스트..
34 print("Value is ",value)
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(30)update()
-> visit_state.append(state)
(Pdb)
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(31)update()
-> G_t = self.discount_factor * (reward[1] + G_t)
(Pdb)
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(32)update()
-> value = self.value_table[state]
(Pdb) p self.value_table
defaultdict(<class 'float'>, {'[0, 0]': 10.523899590468769, '[0, 1]': 2.3971909843905967, '[1, 0]': 45.81574921045315, '[1, 2]': -2.67309, '[2, 1]': -19.99607765407679, '[1, 1]': 6.356969207767222, '[1, 3]': 0.0, '[0, 2]': -0.41447951099999986, '[2, 2]': 89.22430756838313, '[2, 0]': 50.92319972091863, '[3, 0]': 60.0913655106793, '[3, 1]': 66.88710703917133, '[0, 3]': -0.7290000000000001, '[0, 4]': -0.6561000000000001, '[1, 4]': 0.0, '[4, 0]': 8.302503491918047, '[4, 1]': 6.159613912118276, '[4, 2]': 6.304592658442578, '[3, 2]': 80.25132103821517, '[3, 3]': 6.970546731858152, '[2, 3]': 0.0, '[3, 4]': 0.6561000000000001, '[4, 3]': 0.0, '[2, 4]': 0.0, '[4, 4]': 0.0})
500회 시행 후, 과거 이력을 보면 아래와 같다.
(Pdb) p agent.samples
[[[0, 1], 0, False], [[0, 0], 0, False], [[0, 1], 0, False], [[0, 0], 0, False], [[1, 0], 0, False], [[1, 1], 0, False], [[1, 0], 0, False], [[2, 0], 0, False], [[3, 0], 0, False], [[3, 1], 0, False], [[3, 2], 0, False], [[2, 2], 100, True]]
(Pdb) l
116 # 다음 행동 받아옴
117 action = agent.get_action(next_state)
118
119 # 에피소드가 완료됐을 때, 큐 함수 업데이트
120 if done:
121 B-> print("episode : ", episode)
122 agent.update()
123 agent.samples.clear()
124 break
[EOF]
(Pdb) n
episode : 500
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(122)<module>()
-> agent.update()
(Pdb) l
117 action = agent.get_action(next_state)
118
119 # 에피소드가 완료됐을 때, 큐 함수 업데이트
120 if done:
121 B print("episode : ", episode)
122 -> agent.update()
123 agent.samples.clear()
124 break
[EOF]
(Pdb) s
--Call--
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(24)update()
-> def update(self):
(Pdb) l
19 # 메모리에 샘플을 추가
20 def save_sample(self, state, reward, done):
21 self.samples.append([state, reward, done])
22
23 # 모든 에피소드에서 에이전트가 방문한 상태의 큐 함수를 업데이트
24 -> def update(self):
25 G_t = 0
26 visit_state = []
27 for reward in reversed(self.samples):
28 state = str(reward[0])
29 if state not in visit_state:
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(25)update()
-> G_t = 0
(Pdb)
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(26)update()
-> visit_state = []
(Pdb)
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(27)update()
-> for reward in reversed(self.samples):
(Pdb) p visit_state
[]
(Pdb) l
22
23 # 모든 에피소드에서 에이전트가 방문한 상태의 큐 함수를 업데이트
24 def update(self):
25 G_t = 0
26 visit_state = []
27 -> for reward in reversed(self.samples):
28 state = str(reward[0])
29 if state not in visit_state:
30 visit_state.append(state)
31 G_t = self.discount_factor * (reward[1] + G_t)
32 value = self.value_table[state]
(Pdb) p self.samples
[[[0, 1], 0, False], [[0, 0], 0, False], [[0, 1], 0, False], [[0, 0], 0, False], [[1, 0], 0, False], [[1, 1], 0, False], [[1, 0], 0, False], [[2, 0], 0, False], [[3, 0], 0, False], [[3, 1], 0, False], [[3, 2], 0, False], [[2, 2], 100, True]]
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(28)update()
-> state = str(reward[0])
(Pdb) p state
[0, 0]
(Pdb) n
> /home/now0930/tensorflow/reinforceLearing/reinforcement-learning-kr-master/1-grid-world/3-monte-carlo/mc_agentTest.py(29)update()
-> if state not in visit_state:
(Pdb) p state
'[2, 2]'
(Pdb) p self.value_table
defaultdict(<class 'float'>, {'[0, 0]': 12.843038729477103, '[0, 1]': 3.6010347623049843, '[1, 0]': 45.739579013968516, '[0, 2]': 0.38742048900000015, '[1, 1]': 4.629255057991681, '[2, 0]': 50.83890751506551, '[2, 2]': 89.2397438477723, '[3, 1]': 66.85889829029591, '[2, 1]': -19.99607765407679, '[3, 0]': 60.06357418599044, '[4, 0]': 8.024660160565901, '[3, 2]': 80.26033687696258, '[4, 1]': 5.96080478456966, '[3, 3]': 6.970546731858152, '[4, 2]': 6.029123110357315, '[3, 4]': 1.3056390000000002, '[2, 4]': 0.47829690000000014, '[4, 4]': 0.5904900000000002, '[2, 3]': 0.81, '[4, 3]': 0.5314410000000002, '[1, 3]': 0.0, '[1, 4]': 0.0, '[1, 2]': 0.0, '[0, 3]': 0.0})
