defstep(self,a): x,y = self.pos x += self.move[a][0] y += self.move[a][1] if self.checkLegal(x,y): self.pos = (x,y) reward = self.stepReward if self.pos == self.goal: self.done = True reward = self.winReward return reward, self.pos, self.done
defreset(self): self.pos = (0,0) self.done = False return self.pos defcheckLegal(self,x,y): returnnot(x < 0or y < 0or x >= self.width or y >= self.height)
defrender(self): for i inrange(self.height): s = " " for j inrange(self.width): if j == self.pos[0] and i == self.pos[1]: s +='P' elif j == self.goal[0] and i == self.goal[1]: s+='G' else: s+='-' print(s) print(self.pos) print()
defprint_policy(width, height,policy,goal): dic={0:'D',1:'U',2:'R',3:'L'} for i inrange(height): s = "" for j inrange(width): if (j,i) == goal: s += "S" else: s += dic[policy[(j,i)]] print(s) print()
GAMMA = 0.95 EPS = 1e-3 grid = Grid() width = grid.width height = grid.height policy={} value ={} for i inrange(width): for j inrange(height): ifnot(i == grid.goal[0] and j == grid.goal[1]): policy[(i,j)] = np.random.randint(4) value[(i,j)] = 0 cnt = 0 whileTrue: whileTrue: delta = 0 for pos in policy: grid.pos = pos a = policy[pos] r,pos_, _ = grid.step(a) v = value[pos] v_ = r + GAMMA * value[pos_] value[pos] = v_ delta = max(np.abs(v-v_),delta) if delta < EPS: break cnt += 1 print("iteration: ",cnt) print(value) print_policy(width,height,policy,grid.goal) is_stable = True for pos in policy: max_v = float('-inf') max_a = 0 for a inrange(4): grid.pos = pos r,pos_,_ = grid.step(a) v = r + GAMMA * value[pos_] if max_v < v: max_v = v max_a = a ifnot policy[pos] == max_a: policy[pos] = max_a is_stable=False if is_stable: break
defprint_policy(width, height,policy,goal): dic={0:'D',1:'U',2:'R',3:'L'} for i inrange(height): s = "" for j inrange(width): if (j,i) == goal: s += "S" else: s += dic[policy[(j,i)]] print(s) print()
GAMMA = 0.95 EPS = 1e-3 grid = Grid() width = grid.width height = grid.height policy={} value ={} for i inrange(width): for j inrange(height): value[(i,j)] = 0 print(value) cnt = 0
whileTrue: delta = 0 for pos in value: if pos == grid.goal: continue v = value[pos] max_v = float('-inf') for a inrange(4): grid.pos = pos r,pos_, _ = grid.step(a) v_ = r + GAMMA * value[pos_] max_v = max(max_v, v_) value[pos] = max_v delta = max(np.abs(v-max_v),delta) if delta < EPS: break cnt += 1 print("iteration: ",cnt) print(value)
for pos in value: max_v = float('-inf') max_a = 0 for a inrange(4): grid.pos = pos r,pos_,_ = grid.step(a) v_ = r + GAMMA* value[pos_] if max_v < v_: max_v = v_ max_a = a policy[pos] = max_a