Sep 13, 2020

Solution: Reinforcement Learning for Taxi-v3

Updated: Mar 25, 2021

There are 4 locations (labeled by different letters), and our job is to pick up the passenger at one location and drop them off at another. We receive +20 points for a successful drop-off and lose 1 point for every time step it takes. There is also a 10-point penalty for illegal pick-up and drop-off actions.
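Before building anything, here is a minimal sketch (not part of the original solution) showing these reward signals coming back from the environment; the exact values depend on the random start state:

import gym

env = gym.make("Taxi-v3")          # the environment used throughout this post
state = env.reset()

# Moving (actions 0-3) always costs 1 point per time step
_, reward, _, _ = env.step(0)      # action 0 = move south
print(reward)                      # -1

# Picking up (action 4) on the wrong square is an illegal action
_, reward, _, _ = env.step(4)      # usually -10, unless the taxi happens
print(reward)                      # to be standing on the passenger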

First, import all the required libraries:

import numpy as np
import gym
import random
import pandas as pd
import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

env = gym.make("Taxi-v3")
env.render()

Output:

Fetching the Origin, Destination, and Time of Pickup from the SMS data:

def fetch_pickup_drop(text_file_path):

    # Append all the texts to a list
    texts_list = []

    # Read the SMS text file
    df = pd.read_csv(text_file_path, header=None, names=['Sms'])
    #print(df.shape[0])

    for i in range(0, df.shape[0]):
        s = df.iloc[i, 0]
        texts_list.append(s)

    # Register the known locations with a PhraseMatcher and add it as a pipeline component
    l1 = []
    LOCATIONS = ["dwarka sector 23", "dwarka sector 21", "hauz khaas", "airport"]

    nlp = spacy.load('en')  # spaCy v2 'en' shortcut link ('en_core_web_sm' in newer installs)
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("LOCATIONS", None, *list(nlp.pipe(LOCATIONS)))  # spaCy v2 API

    def places_component(doc):
        doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matcher(doc)]
        return doc

    nlp.add_pipe(places_component)  # last=True (spaCy v2 API: pass the function directly)

    # Fetch the locations from each text and append them to the l1 list
    for doc in nlp.pipe(texts_list):
        l1.append([(ent.text, ent.label_) for ent in doc.ents])

    dest = []
    pickup = []
    timing = []
    for i in range(0, len(texts_list)):
        str_text = texts_list[i].lower()
        str1 = 'for ' + l1[i][1][0]
        str2 = 'to ' + l1[i][1][0]
        str3 = 'from ' + l1[i][1][0]
        # Fetch the pickup and drop-off locations from each SMS and append them
        # to the pickup and destination lists
        if str1 in str_text or str2 in str_text:
            dest.append(l1[i][1][0])
            pickup.append(l1[i][0][0])
        elif str3 in str_text:
            dest.append(l1[i][0][0])
            pickup.append(l1[i][1][0])
        # Fetch the pickup time from each SMS and append it to the timing list
        if 'am' in str_text:
            new_str = str_text[0:str_text.index('am') - 1]
            n = new_str.rindex(' ')
            timing.append(new_str[n + 1:] + ' AM')
        elif 'pm' in str_text:
            new_str = str_text[0:str_text.index('pm') - 1]
            n = new_str.rindex(' ')
            timing.append(new_str[n + 1:] + ' PM')

    # Create dataframes for the pickup, destination and time of pickup
    df1 = pd.DataFrame(pickup, columns=['origin'])
    df2 = pd.DataFrame(dest, columns=['destination'])
    df3 = pd.DataFrame(timing, columns=['time of pickup'])
    # Concatenate the three dataframes into the final dataframe for the SMS text file
    df_table_final = pd.concat([df1, df2, df3], axis=1)
    return df_table_final
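If you want to see the PhraseMatcher part of this function in isolation, a small sketch like the following prints the matched location spans. The sample SMS text is made up, and it assumes the same spaCy v2 API used above:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en')                     # spaCy v2 English model
matcher = PhraseMatcher(nlp.vocab)
matcher.add("LOCATIONS", None, *nlp.pipe(["airport", "hauz khaas"]))

doc = nlp("please book a cab from airport to hauz khaas at 5 pm")
print([doc[start:end].text for _, start, end in matcher(doc)])
# expected: ['airport', 'hauz khaas']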
 

env.reset()  # reset environment to a new, random state
env.render()

action_size = env.action_space.n
print("Action size ", action_size)

state_size = env.observation_space.n
print("State size ", state_size)

q_table = np.zeros((state_size, action_size))
print(q_table)

Output:
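For Taxi-v3 the action space has 6 discrete actions and the observation space has 500 discrete states, so the printed Q-table should be a 500 x 6 array of zeros. A quick sanity check (assuming the block above has just run):

print(q_table.shape)   # expected: (500, 6)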

Training the Agent:

%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1    # learning rate
gamma = 0.6    # discount factor
epsilon = 0.1  # exploration rate

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore the action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values

        next_state, reward, done, info = env.step(action)

        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        # Q-learning update
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Output:

Episode: 100000
Training finished.

CPU times: user 1min 10s, sys: 13.3 s, total: 1min 24s
Wall time: 1min 12s
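To make the update rule inside the loop concrete, here is a tiny worked example with the same alpha and gamma as above; the reward and next-state value are made-up numbers, not taken from a real run:

alpha, gamma = 0.1, 0.6

old_value = 0.0     # current Q(state, action)
reward = -1         # the usual per-step reward in Taxi
next_max = 2.5      # assumed best Q-value of the next state (hypothetical)

new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
print(new_value)    # 0.9 * 0.0 + 0.1 * (-1 + 0.6 * 2.5) = 0.05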

Fetch the pickup and drop-off locations from the SMS data and map them to numeric codes:

text_file_path = 'drive/My Drive/project_2_dataset/sms.txt'

df_original = fetch_pickup_drop(text_file_path)
print(df_original)

# Create a local dictionary of the cities
city = pd.read_csv('drive/My Drive/project_2_dataset/city.csv')
city['mapping'] = city['mapping'].map({0: 0., 1: 1., 2: 2., 3: 3.})

loc_dict = {city.iloc[0, 0]: city.iloc[0, 1],
            city.iloc[1, 0]: city.iloc[1, 1],
            city.iloc[2, 0]: city.iloc[2, 1],
            city.iloc[3, 0]: city.iloc[3, 1]}

# Replace each location in df_original with its numeric city code
df_original['origin'] = df_original['origin'].map(loc_dict)
df_original['destination'] = df_original['destination'].map(loc_dict)

Output:

               origin       destination time of pickup
0             airport        hauz khaas           3 PM
1             airport        hauz khaas           6 PM
2          hauz khaas  dwarka sector 23           1 PM
3             airport        hauz khaas           1 AM
4             airport  dwarka sector 21          10 PM
..                ...               ...            ...
995           airport  dwarka sector 23           2 AM
996  dwarka sector 21  dwarka sector 23           2 PM
997        hauz khaas  dwarka sector 21           5 AM
998           airport  dwarka sector 23           6 PM
999           airport        hauz khaas           1 AM

[1000 rows x 3 columns]
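The contents of city.csv are not shown here, but assuming it maps the four locations to the codes 0-3, the dictionary and the .map() step behave roughly like this (the concrete codes below are illustrative):

import pandas as pd

loc_dict = {"dwarka sector 23": 0.0, "dwarka sector 21": 1.0,
            "hauz khaas": 2.0, "airport": 3.0}

demo = pd.DataFrame({"origin": ["airport", "hauz khaas"]})
print(demo["origin"].map(loc_dict).tolist())   # [3.0, 2.0]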

Check whether the pickup and drop-off locations are correct:

#### Check whether the pickup and drop-off locations are correct

def check_pick_up_drop_correction(pick_up, drop, line_num):
    original_origin = int(df_original.iloc[line_num, 0])
    original_destination = int(df_original.iloc[line_num, 1])
    if original_origin == pick_up and original_destination == drop:
        return True
    else:
        return False
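Assuming df_original has already been mapped to numeric codes as above, the helper can be exercised directly; the codes below are hypothetical:

# Hypothetical call: does a predicted pickup 3 -> drop-off 2 match row 0 of df_original?
print(check_pick_up_drop_correction(3, 2, 0))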

Evaluate the agent's performance after Q-learning

"""Evaluate the agent's performance after Q-learning"""
 

 
total_epochs, total_penalties, wrong_predictions, total_reward = 0, 0, 0, 0
 
episodes = 1000
 

 
for i in range(episodes):
 

 
epochs, penalties, reward = 0, 0, 0
 

 
#Generate the random state from an enviroment and change the pick up and drop as the fetched one
 
state = env.reset()
 
q_table[state][4]=df_original.iloc[i,0]
 
q_table[state][5]=df_original.iloc[i,1]
 

 
done = False
 

 
while not done:
 
action = np.argmax(q_table[state,:])
 
state, reward, done, info = env.step(action)
 

 

 

 
epochs += 1
 
checking = check_pick_up_drop_correction(int(q_table[state][4]), int(q_table[state][5]), i)
 
if checking == False:
 
wrong_predictions += 1
 
reward=-10
 
penalties += 1
 
else:
 
reward=20
 

 

 

 
total_penalties += penalties
 
total_epochs += epochs
 
total_reward += reward
 

 
print(f"Results after {episodes} episodes:")
 
print(f"Average timesteps per episode: {total_epochs / episodes}")
 
print(f"Average penalties per episode: {total_penalties / episodes}")
 
print(f"Total number of wrong predictions", wrong_predictions)
 
print("Total Reward is", total_reward)

Output:

Results after 1000 episodes:
Average timesteps per episode: 196.365
Average penalties per episode: 0.019
Total number of wrong predictions 19
Total Reward is 19430
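As a quick cross-check of these numbers: with 19 wrong predictions out of 1000 episodes, the reward bookkeeping in the loop above gives exactly the printed totals:

correct, wrong = 1000 - 19, 19
print(correct * 20 + wrong * (-10))   # 19430, matching "Total Reward is 19430"
print(wrong / 1000)                   # 0.019, matching the average penalties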

Contact us to get instant help with Reinforcement Learning projects at: contact@codersarts.com