Sep 13, 2020

Solution: Reinforcement Learning for Taxi-v3

Updated: Mar 25, 2021

There are 4 locations (labeled by different letters), and our job is to pick up the passenger at one location and drop them off at another. We receive +20 points for a successful drop-off and lose 1 point for every time step it takes. There is also a 10-point penalty for illegal pick-up and drop-off actions.
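Before building anything, here is a minimal sketch (not part of the original solution) showing these reward signals coming back from the environment; the exact values depend on the random start state:

import gym

env = gym.make("Taxi-v3")          # the environment used throughout this post
state = env.reset()

# Moving (actions 0-3) always costs 1 point per time step
_, reward, _, _ = env.step(0)      # action 0 = move south
print(reward)                      # -1

# Picking up (action 4) on the wrong square is an illegal action
_, reward, _, _ = env.step(4)      # usually -10, unless the taxi happens
print(reward)                      # to be standing on the passenger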

First, import all the required libraries:

import numpy as np
import gym
import random
import pandas as pd
import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

env = gym.make("Taxi-v3")
env.render()

Output:

Fetching the Origin, Destination, and Time of Pickup from the SMS data:

def fetch_pickup_drop(text_file_path):

    # Append all the texts to a list
    texts_list = []

    # Read the SMS text file
    df = pd.read_csv(text_file_path, header=None, names=['Sms'])
    #print(df.shape[0])

    for i in range(0, df.shape[0]):
        s = df.iloc[i, 0]
        texts_list.append(s)

    # Register the known locations with a PhraseMatcher and add it as a pipeline component
    l1 = []
    LOCATIONS = ["dwarka sector 23", "dwarka sector 21", "hauz khaas", "airport"]

    nlp = spacy.load('en')  # spaCy v2 'en' shortcut link ('en_core_web_sm' in newer installs)
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("LOCATIONS", None, *list(nlp.pipe(LOCATIONS)))  # spaCy v2 API

    def places_component(doc):
        doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matcher(doc)]
        return doc

    nlp.add_pipe(places_component)  # last=True (spaCy v2 API: pass the function directly)

    # Fetch the locations from each text and append them to the l1 list
    for doc in nlp.pipe(texts_list):
        l1.append([(ent.text, ent.label_) for ent in doc.ents])

    dest = []
    pickup = []
    timing = []
    for i in range(0, len(texts_list)):
        str_text = texts_list[i].lower()
        str1 = 'for ' + l1[i][1][0]
        str2 = 'to ' + l1[i][1][0]
        str3 = 'from ' + l1[i][1][0]
        # Fetch the pickup and drop-off locations from each SMS and append them
        # to the pickup and destination lists
        if str1 in str_text or str2 in str_text:
            dest.append(l1[i][1][0])
            pickup.append(l1[i][0][0])
        elif str3 in str_text:
            dest.append(l1[i][0][0])
            pickup.append(l1[i][1][0])
        # Fetch the pickup time from each SMS and append it to the timing list
        if 'am' in str_text:
            new_str = str_text[0:str_text.index('am') - 1]
            n = new_str.rindex(' ')
            timing.append(new_str[n + 1:] + ' AM')
        elif 'pm' in str_text:
            new_str = str_text[0:str_text.index('pm') - 1]
            n = new_str.rindex(' ')
            timing.append(new_str[n + 1:] + ' PM')

    # Create dataframes for the pickup, destination and time of pickup
    df1 = pd.DataFrame(pickup, columns=['origin'])
    df2 = pd.DataFrame(dest, columns=['destination'])
    df3 = pd.DataFrame(timing, columns=['time of pickup'])
    # Concatenate the three dataframes into the final dataframe for the SMS text file
    df_table_final = pd.concat([df1, df2, df3], axis=1)
    return df_table_final
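If you want to see the PhraseMatcher part of this function in isolation, a small sketch like the following prints the matched location spans. The sample SMS text is made up, and it assumes the same spaCy v2 API used above:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en')                     # spaCy v2 English model
matcher = PhraseMatcher(nlp.vocab)
matcher.add("LOCATIONS", None, *nlp.pipe(["airport", "hauz khaas"]))

doc = nlp("please book a cab from airport to hauz khaas at 5 pm")
print([doc[start:end].text for _, start, end in matcher(doc)])
# expected: ['airport', 'hauz khaas']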
 

env.reset()  # reset environment to a new, random state
env.render()

action_size = env.action_space.n
print("Action size ", action_size)

state_size = env.observation_space.n
print("State size ", state_size)

q_table = np.zeros((state_size, action_size))
print(q_table)

Output:
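For Taxi-v3 the action space has 6 discrete actions and the observation space has 500 discrete states, so the printed Q-table should be a 500 x 6 array of zeros. A quick sanity check (assuming the block above has just run):

print(q_table.shape)   # expected: (500, 6)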

Training the Agent:

%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1    # learning rate
gamma = 0.6    # discount factor
epsilon = 0.1  # exploration rate

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore the action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values

        next_state, reward, done, info = env.step(action)

        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        # Q-learning update
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Output:

Episode: 100000
Training finished.

CPU times: user 1min 10s, sys: 13.3 s, total: 1min 24s
Wall time: 1min 12s
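To make the update rule inside the loop concrete, here is a tiny worked example with the same alpha and gamma as above; the reward and next-state value are made-up numbers, not taken from a real run:

alpha, gamma = 0.1, 0.6

old_value = 0.0     # current Q(state, action)
reward = -1         # the usual per-step reward in Taxi
next_max = 2.5      # assumed best Q-value of the next state (hypothetical)

new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
print(new_value)    # 0.9 * 0.0 + 0.1 * (-1 + 0.6 * 2.5) = 0.05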

Fetch the pickup and drop-off locations from the SMS data and map them to numeric codes:

text_file_path = 'drive/My Drive/project_2_dataset/sms.txt'

df_original = fetch_pickup_drop(text_file_path)
print(df_original)

# Create a local dictionary of the cities
city = pd.read_csv('drive/My Drive/project_2_dataset/city.csv')
city['mapping'] = city['mapping'].map({0: 0., 1: 1., 2: 2., 3: 3.})

loc_dict = {city.iloc[0, 0]: city.iloc[0, 1],
            city.iloc[1, 0]: city.iloc[1, 1],
            city.iloc[2, 0]: city.iloc[2, 1],
            city.iloc[3, 0]: city.iloc[3, 1]}

# Replace each location in df_original with its numeric city code
df_original['origin'] = df_original['origin'].map(loc_dict)
df_original['destination'] = df_original['destination'].map(loc_dict)

Output:

               origin       destination time of pickup
0             airport        hauz khaas           3 PM
1             airport        hauz khaas           6 PM
2          hauz khaas  dwarka sector 23           1 PM
3             airport        hauz khaas           1 AM
4             airport  dwarka sector 21          10 PM
..                ...               ...            ...
995           airport  dwarka sector 23           2 AM
996  dwarka sector 21  dwarka sector 23           2 PM
997        hauz khaas  dwarka sector 21           5 AM
998           airport  dwarka sector 23           6 PM
999           airport        hauz khaas           1 AM

[1000 rows x 3 columns]
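The contents of city.csv are not shown here, but assuming it maps the four locations to the codes 0-3, the dictionary and the .map() step behave roughly like this (the concrete codes below are illustrative):

import pandas as pd

loc_dict = {"dwarka sector 23": 0.0, "dwarka sector 21": 1.0,
            "hauz khaas": 2.0, "airport": 3.0}

demo = pd.DataFrame({"origin": ["airport", "hauz khaas"]})
print(demo["origin"].map(loc_dict).tolist())   # [3.0, 2.0]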

Check whether the pickup and drop-off locations are correct:

#### Check whether the pickup and drop-off locations are correct

def check_pick_up_drop_correction(pick_up, drop, line_num):
    original_origin = int(df_original.iloc[line_num, 0])
    original_destination = int(df_original.iloc[line_num, 1])
    if original_origin == pick_up and original_destination == drop:
        return True
    else:
        return False
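Assuming df_original has already been mapped to numeric codes as above, the helper can be exercised directly; the codes below are hypothetical:

# Hypothetical call: does a predicted pickup 3 -> drop-off 2 match row 0 of df_original?
print(check_pick_up_drop_correction(3, 2, 0))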

Evaluate the agent's performance after Q-learning

"""Evaluate the agent's performance after Q-learning"""
 

 
total_epochs, total_penalties, wrong_predictions, total_reward = 0, 0, 0, 0
 
episodes = 1000
 

 
for i in range(episodes):
 

 
epochs, penalties, reward = 0, 0, 0
 

 
#Generate the random state from an enviroment and change the pick up and drop as the fetched one
 
state = env.reset()
 
q_table[state][4]=df_original.iloc[i,0]
 
q_table[state][5]=df_original.iloc[i,1]
 

 
done = False
 

 
while not done:
 
action = np.argmax(q_table[state,:])
 
state, reward, done, info = env.step(action)
 

 

 

 
epochs += 1
 
checking = check_pick_up_drop_correction(int(q_table[state][4]), int(q_table[state][5]), i)
 
if checking == False:
 
wrong_predictions += 1
 
reward=-10
 
penalties += 1
 
else:
 
reward=20
 

 

 

 
total_penalties += penalties
 
total_epochs += epochs
 
total_reward += reward
 

 
print(f"Results after {episodes} episodes:")
 
print(f"Average timesteps per episode: {total_epochs / episodes}")
 
print(f"Average penalties per episode: {total_penalties / episodes}")
 
print(f"Total number of wrong predictions", wrong_predictions)
 
print("Total Reward is", total_reward)

Output:

Results after 1000 episodes:
Average timesteps per episode: 196.365
Average penalties per episode: 0.019
Total number of wrong predictions 19
Total Reward is 19430
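As a quick cross-check of these numbers: with 19 wrong predictions out of 1000 episodes, the reward bookkeeping in the loop above gives exactly the printed totals:

correct, wrong = 1000 - 19, 19
print(correct * 20 + wrong * (-10))   # 19430, matching "Total Reward is 19430"
print(wrong / 1000)                   # 0.019, matching the average penalties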

Contact us to get instant help with Reinforcement Learning projects at: contact@codersarts.com