Coding Exercise
Please use the next concept to complete the following sections of Monte_Carlo.ipynb:
- Part 0: Explore BlackjackEnv
- Part 1: MC Prediction
To reference the pseudocode while working on the notebook, you are encouraged to look at this sheet.
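If you don't have the sheet handy: the quantity the pseudocode estimates is simply the average of the (discounted) returns observed after each visit to a state-action pair. The toy snippet below is illustrative only and is not taken from the notebook; it shows that averaging step on a hand-written episode, using hypothetical variable names.

import numpy as np
from collections import defaultdict

gamma = 1.0
returns_sum = defaultdict(float)   # running sum of returns per (state, action)
N = defaultdict(int)               # visit counts per (state, action)
Q = defaultdict(float)             # action-value estimates per (state, action)

# a hand-written toy episode: (state, action, reward) tuples
episode = [((13, 10, False), 1, 0.0), ((17, 10, False), 0, 1.0)]
states, actions, rewards = zip(*episode)
discounts = np.array([gamma**i for i in range(len(rewards))])

for i, (s, a) in enumerate(zip(states, actions)):
    G = sum(rewards[i:] * discounts[:len(rewards) - i])   # return from step i to the end
    returns_sum[(s, a)] += G
    N[(s, a)] += 1
    Q[(s, a)] = returns_sum[(s, a)] / N[(s, a)]

print(Q)  # each pair's estimate is the average return observed after visiting it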
Important Note
Please do not complete the entire notebook in the next concept – you should only complete Part 0 and Part 1. The final part of the notebook will be addressed later in the lesson.
Download the Exercise
If you would prefer to work on your own machine, you can download the exercise from the DRLND GitHub repository.
Check Your Implementation
Once you have completed the exercise, you can check your solution by looking at the corresponding sections in Monte_Carlo_Solution.ipynb. Watch the video below to see a solution walkthrough!
Note that the Jupyter interface will look slightly different, since at one point we experimented with JupyterLab. However, all of the Python code is the same as you see in the videos!
import sys
import gym
import numpy as np
from collections import defaultdict

from plot_utils import plot_blackjack_values, plot_policy

env = gym.make('Blackjack-v0')

print(env.observation_space)
print(env.action_space)
"""
Tuple(Discrete(32), Discrete(11), Discrete(2))
Discrete(2)
"""

# Part 0: interact with the environment by following a random policy
for i_episode in range(3):
    state = env.reset()
    while True:
        print(state)
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        if done:
            print('End game! Reward: ', reward)
            print('You won :)\n') if reward > 0 else print('You lost :(\n')
            break


# Part 1: a stochastic policy that mostly sticks when the player's sum exceeds 18, and mostly hits otherwise
def generate_episode_from_limit_stochastic(bj_env):
    episode = []
    state = bj_env.reset()
    while True:
        probs = [0.8, 0.2] if state[0] > 18 else [0.2, 0.8]
        action = np.random.choice(np.arange(2), p=probs)
        next_state, reward, done, info = bj_env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode

for i in range(3):
    print(generate_episode_from_limit_stochastic(env))


def mc_prediction_q(env, num_episodes, generate_episode, gamma=1.0):
    # initialize empty dictionaries of arrays
    returns_sum = defaultdict(lambda: np.zeros(env.action_space.n))
    N = defaultdict(lambda: np.zeros(env.action_space.n))
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    # loop over episodes
    for i_episode in range(1, num_episodes+1):
        # monitor progress
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        ## TODO: complete the function
        # generate an episode
        episode = generate_episode(env)
        # obtain the states, actions, and rewards
        states, actions, rewards = zip(*episode)
        # prepare for discounting
        discounts = np.array([gamma**i for i in range(len(rewards)+1)])
        # update the sum of the returns, number of visits, and action-value
        # function estimates for each state-action pair in the episode
        for i, state in enumerate(states):
            returns_sum[state][actions[i]] += sum(rewards[i:]*discounts[:-(1+i)])
            N[state][actions[i]] += 1.0
            Q[state][actions[i]] = returns_sum[state][actions[i]] / N[state][actions[i]]
    return Q

# obtain the action-value function
Q = mc_prediction_q(env, 500000, generate_episode_from_limit_stochastic)

# obtain the corresponding state-value function
V_to_plot = dict((k, (k[0] > 18)*(np.dot([0.8, 0.2], v)) + (k[0] <= 18)*(np.dot([0.2, 0.8], v)))
                 for k, v in Q.items())

# plot the state-value function
plot_blackjack_values(V_to_plot)


def mc_control(env, num_episodes, alpha, gamma=1.0,
               eps_start=1.0, eps_decay=.99999, eps_min=0.05):
    nA = env.action_space.n
    # initialize empty dictionary of arrays
    Q = defaultdict(lambda: np.zeros(nA))
    # initialize epsilon (the epsilon-schedule parameters above are assumed defaults;
    # the original excerpt used epsilon, eps_decay, and eps_min without defining them)
    epsilon = eps_start
    # loop over episodes
    for i_episode in range(1, num_episodes+1):
        # monitor progress
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        ## TODO: complete the function
        # set the value of epsilon
        epsilon = max(epsilon*eps_decay, eps_min)
        # generate an episode by following epsilon-greedy policy
        episode = generate_episode_from_Q(env, Q, epsilon, nA)
        # update the action-value function estimate using the episode
        Q = update_Q(env, episode, Q, alpha, gamma)
    # determine the policy corresponding to the final action-value function estimate
    policy = dict((k, np.argmax(v)) for k, v in Q.items())
    return policy, Q

# obtain the estimated optimal policy and action-value function
policy, Q = mc_control(env, 500000, 0.02)

# obtain the corresponding state-value function
V = dict((k, np.max(v)) for k, v in Q.items())

# plot the state-value function
plot_blackjack_values(V)

# plot the policy
plot_policy(policy)
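One caveat on the excerpt above: mc_control calls generate_episode_from_Q and update_Q, which are defined in the notebook but were not captured here. The sketch below shows one way to implement them that is consistent with how they are called; the get_probs helper and the constant-alpha update rule are assumptions, so treat this as a reference sketch rather than the exact solution code.

import numpy as np

def get_probs(Q_s, epsilon, nA):
    """Action probabilities for an epsilon-greedy policy with respect to Q_s."""
    policy_s = np.ones(nA) * epsilon / nA
    best_a = np.argmax(Q_s)
    policy_s[best_a] = 1 - epsilon + (epsilon / nA)
    return policy_s

def generate_episode_from_Q(env, Q, epsilon, nA):
    """Generate an episode by following the epsilon-greedy policy derived from Q."""
    episode = []
    state = env.reset()
    while True:
        # act epsilon-greedily if the state has been seen, otherwise act randomly
        action = np.random.choice(np.arange(nA), p=get_probs(Q[state], epsilon, nA)) \
            if state in Q else env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode

def update_Q(env, episode, Q, alpha, gamma):
    """Nudge each visited (state, action) estimate toward its observed return (constant-alpha MC)."""
    states, actions, rewards = zip(*episode)
    discounts = np.array([gamma**i for i in range(len(rewards)+1)])
    for i, state in enumerate(states):
        old_Q = Q[state][actions[i]]
        Q[state][actions[i]] = old_Q + alpha * (sum(rewards[i:]*discounts[:-(1+i)]) - old_Q)
    return Q

With helpers like these in place, the epsilon schedule in mc_control decays epsilon from eps_start toward eps_min, so the agent explores heavily at first and relies more on its action-value estimates as training progresses.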