Search

# Solution: Reinforcement Learning for Taxi-v3

Updated: Mar 25

There are 4 locations (labeled by different letters), and our job is to pick up the passenger at one location and drop him off at another. We receive +20 points for a successful drop-off and lose 1 point for every time-step it takes. There is also a 10 point penalty for illegal pick-up and drop-off actions.

First import all related libraries:

```import numpy as np
import gym
import random
import pandas as pd
import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher
# Create the environment registered with gym under the id "Taxi-v3".
env = gym.make("Taxi-v3")
# Render the environment once so its initial layout is shown.
env.render()```

Output:

Fetching the origin, destination, and time of pickup from the SMS data:

```def fetch_pickup_drop(text_file_path):

# Append All the texts in the List
texts_list=[]

#print(df.shape)

for i in range(0,df.shape):
s=df.iloc[i,0]
texts_list.append(s)
l1=[]
LOCATIONS = ["dwarka sector 23", "dwarka sector 21", "hauz khaas", "airport"]

matcher = PhraseMatcher(nlp.vocab)

def places_component(doc):
doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matcher(doc)]
return doc

#fetch the locations from the texts list of each text and append in the l1 list
for doc in nlp.pipe(texts_list):
l1.append([(ent.text, ent.label_) for ent in doc.ents])

dest=[]
pickup=[]
timing=[]
for i in range(0,len(texts_list)):
str_text=texts_list[i].lower()
str1='for '+l1[i]
str2='to '+l1[i]
str3='from '+l1[i]
## fetch the pickup and drop up location from the texts list of each text sms and append in the destination and pickup list
if str1 in str_text or str2 in str_text:
dest.append(l1[i])
pickup.append(l1[i])
elif str3 in str_text:
dest.append(l1[i])
pickup.append(l1[i])
# fetch the timing from the texts list of each text and append in the timing list.
if 'am' in str_text:
new_str=str_text[0:str_text.index('am')-1]
n=new_str.rindex(' ')
timing.append(new_str[n+1:]+' AM')

elif 'pm' in str_text:
new_str=str_text[0:str_text.index('pm')-1]
n=new_str.rindex(' ')
timing.append(new_str[n+1:]+' PM')

## create the dataframe of the pickup, Destination and time of pickup
df1 = pd.DataFrame(pickup,columns=['origing'])
df2 = pd.DataFrame(dest,columns=['destination'])
df3 = pd.DataFrame(timing,columns=['time of pickup'])
# concatenate the above three dataframe to get df_final dataframe of sms text file.
df_table_final=pd.concat([df1,df2,df3], axis=1)
return df_table_final

```

```env.reset() # reset environment to a new, random state
env.render()

action_size = env.action_space.n
print("Action size ", action_size)

state_size = env.observation_space.n
print("State size ", state_size)

q_table = np.zeros((state_size, action_size))
print(q_table)```

Output: Training the Agent:

```%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
state = env.reset()

epochs, penalties, reward, = 0, 0, 0
done = False

while not done:
if random.uniform(0, 1) < epsilon:
action = env.action_space.sample() # Explore action space
else:
action = np.argmax(q_table[state]) # Exploit learned values

next_state, reward, done, info = env.step(action)

old_value = q_table[state, action]
next_max = np.max(q_table[next_state])

new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
q_table[state, action] = new_value

if reward == -10:
penalties += 1

state = next_state
epochs += 1

if i % 100 == 0:
clear_output(wait=True)
print(f"Episode: {i}")

print("Training finished.\n")```

Output:

Episode: 100000 Training finished. CPU times: user 1min 10s, sys: 13.3 s, total: 1min 24s Wall time: 1min 12s

Distance between pick-up and drop-off locations

```text_file_path='drive/My Drive/project_2_dataset/sms.txt'

df_original=fetch_pickup_drop(text_file_path)
print(df_original)

# Create a Local Dictionary of city
city['mapping']=city['mapping'].map({0:0., 1:1., 2:2., 3:3.})

loc_dict={city.iloc[0,0]:city.iloc[0,1],city.iloc[1,0]:city.iloc[1,1],city.iloc[2,0]:city.iloc[2,1],city.iloc[3,0]:city.iloc[3,1]}

# Change the location by numeric value of city in df_original dataframe
df_original['origing']=df_original['origing'].map(loc_dict)
df_original['destination']=df_original['destination'].map(loc_dict)```

Output:

origing destination time of pickup 0 airport hauz khaas 3 PM 1 airport hauz khaas 6 PM 2 hauz khaas dwarka sector 23 1 PM 3 airport hauz khaas 1 AM 4 airport dwarka sector 21 10 PM .. ... ... ... 995 airport dwarka sector 23 2 AM 996 dwarka sector 21 dwarka sector 23 2 PM 997 hauz khaas dwarka sector 21 5 AM 998 airport dwarka sector 23 6 PM 999 airport hauz khaas 1 AM [1000 rows x 3 columns]

Check that the pick-up and drop-off locations are correct:

```#### Check Pick up and Drop up correction

def check_pick_up_drop_correction(pick_up, drop, line_num):
original_origin = int(df_original.iloc[line_num,0])
original_destination = int(df_original.iloc[line_num,1])
if original_origin == pick_up and original_destination == drop:
return True
else:
return False```

Evaluate the agent's performance after Q-learning

```"""Evaluate the agent's performance after Q-learning"""

total_epochs, total_penalties, wrong_predictions, total_reward = 0, 0, 0, 0
episodes = 1000

for i in range(episodes):

epochs, penalties, reward = 0, 0, 0

#Generate the random state from an enviroment and change the pick up and drop as the fetched one
state = env.reset()
q_table[state]=df_original.iloc[i,0]
q_table[state]=df_original.iloc[i,1]

done = False

while not done:
action = np.argmax(q_table[state,:])
state, reward, done, info = env.step(action)

epochs += 1
checking = check_pick_up_drop_correction(int(q_table[state]), int(q_table[state]), i)
if checking == False:
wrong_predictions += 1
reward=-10
penalties += 1
else:
reward=20

total_penalties += penalties
total_epochs += epochs
total_reward += reward

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")
print(f"Total number of wrong predictions", wrong_predictions)
print("Total Reward is", total_reward)```

Output:

Results after 1000 episodes: Average timesteps per episode: 196.365 Average penalties per episode: 0.019 Total number of wrong predictions 19 Total Reward is 19430

Contact us to get instant help related to Reinforcement Machine Learning Projects at: contact@codersarts.com

### Recent Posts

See All

#### Machine Learning With R | Sample Assignment | Assignment Help

Tel: (+91) 0120  4118730

Time :   10 : 00  AM -  08 : 00 PM IST