
Solution: Reinforcement Learning for Taxi-v2

Updated: Mar 25, 2021

There are 4 locations (labeled by different letters), and our job is to pick up the passenger at one location and drop them off at another. We receive +20 points for a successful drop-off and lose 1 point for every time-step the trip takes. There is also a 10-point penalty for illegal pick-up and drop-off actions. (Although the title says Taxi-v2, the code below uses Taxi-v3, the current version of this environment in OpenAI Gym.)


First, import all the required libraries:

import numpy as np
import gym
import random
import pandas as pd
import spacy                      # the SMS-parsing code below uses the spaCy 2.x API
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

env = gym.make("Taxi-v3")
env.render()

Output: the rendered Taxi grid (a 5x5 map showing the four lettered pick-up/drop-off locations, the walls, and the taxi's current position).
Fetching the origin, destination, and time of pickup from the SMS data:


def fetch_pickup_drop(text_file_path):

    # Read the SMS text file and collect every message in a list
    df = pd.read_csv(text_file_path, header=None, names=['Sms'])
    texts_list = []
    for i in range(0, df.shape[0]):
        texts_list.append(df.iloc[i, 0])

    # Register the known locations with a PhraseMatcher
    # (spaCy 2.x API: the 'en' shortcut and the matcher.add(name, None, *patterns)
    # signature both changed in spaCy 3.x)
    LOCATIONS = ["dwarka sector 23", "dwarka sector 21", "hauz khaas", "airport"]

    nlp = spacy.load('en')
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("LOCATIONS", None, *list(nlp.pipe(LOCATIONS)))

    # Custom pipeline component: tag every matched location as a GPE entity
    def places_component(doc):
        doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matcher(doc)]
        return doc

    nlp.add_pipe(places_component)  # appended last in the pipeline

    # Extract the matched locations from each SMS
    l1 = []
    for doc in nlp.pipe(texts_list):
        l1.append([(ent.text, ent.label_) for ent in doc.ents])

    dest = []
    pickup = []
    timing = []
    for i in range(0, len(texts_list)):
        str_text = texts_list[i].lower()
        str1 = 'for ' + l1[i][1][0]
        str2 = 'to ' + l1[i][1][0]
        str3 = 'from ' + l1[i][1][0]
        # The preposition before the second matched location tells us which
        # location is the destination and which is the pick-up point
        if str1 in str_text or str2 in str_text:
            dest.append(l1[i][1][0])
            pickup.append(l1[i][0][0])
        elif str3 in str_text:
            dest.append(l1[i][0][0])
            pickup.append(l1[i][1][0])
        # Fetch the pick-up time: the token just before 'am'/'pm'
        if 'am' in str_text:
            new_str = str_text[0:str_text.index('am') - 1]
            n = new_str.rindex(' ')
            timing.append(new_str[n + 1:] + ' AM')
        elif 'pm' in str_text:
            new_str = str_text[0:str_text.index('pm') - 1]
            n = new_str.rindex(' ')
            timing.append(new_str[n + 1:] + ' PM')

    # Build one dataframe holding the pick-up, destination and time of pickup
    df1 = pd.DataFrame(pickup, columns=['origin'])
    df2 = pd.DataFrame(dest, columns=['destination'])
    df3 = pd.DataFrame(timing, columns=['time of pickup'])
    df_table_final = pd.concat([df1, df2, df3], axis=1)
    return df_table_final
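Before wiring this into the environment, here is a minimal sanity check of the matcher logic on a single hypothetical SMS (it rebuilds the same spaCy 2.x pipeline the function uses; the message text is made up for illustration):

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en')
matcher = PhraseMatcher(nlp.vocab)
matcher.add("LOCATIONS", None,
            *nlp.pipe(["dwarka sector 23", "dwarka sector 21",
                       "hauz khaas", "airport"]))

doc = nlp("please book a cab from airport to hauz khaas at 3 pm")
print([doc[start:end].text for _, start, end in matcher(doc)])
# Expected: ['airport', 'hauz khaas'] -- 'from' marks the pick-up,
# 'to' marks the destination, and '3 pm' yields the time '3 PM'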


env.reset() # reset environment to a new, random state
env.render()

action_size = env.action_space.n
print("Action size ", action_size)

state_size = env.observation_space.n
print("State size ", state_size)

# One row per state, one column per action, all values initialised to zero
q_table = np.zeros((state_size, action_size))
print(q_table)

Output (the rendered grid again, followed by):

Action size  6
State size  500
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
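The 500 states come from 25 taxi positions x 5 passenger locations (the four letters plus "in the taxi") x 4 destinations, and the 6 actions are the four moves plus pickup and dropoff. As a quick illustration, Gym's TaxiEnv exposes an encode() helper that builds a state index from those components (this is a sketch, not part of the original post; on older Gym releases, up to about 0.25, the wrapper forwards the call):

# (taxi_row, taxi_col, passenger_location, destination) -> state index
state = env.encode(3, 1, 2, 0)  # taxi at (3, 1), passenger at loc 2, dest 0
print("State:", state)

env.s = state   # force the environment into that state
env.render()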
Training the Agent:

%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

# For plotting metrics (declared here but not populated in this loop)
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info = env.step(action) 
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Output:

Episode: 100000
Training finished.

CPU times: user 1min 10s, sys: 13.3 s, total: 1min 24s
Wall time: 1min 12s
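For reference, the line new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max) is the standard Q-learning update: Q(state, action) <- (1 - alpha) * Q(state, action) + alpha * (reward + gamma * max over actions of Q(next_state, action)). Here alpha (0.1) controls how quickly new experience overwrites old estimates, gamma (0.6) discounts future rewards, and epsilon (0.1) is the probability of exploring a random action instead of exploiting the current table.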



Map the pick-up and drop locations to numeric codes:

text_file_path = 'drive/My Drive/project_2_dataset/sms.txt'

df_original = fetch_pickup_drop(text_file_path)
print(df_original)

# Create a local dictionary mapping each city name to its numeric code
city = pd.read_csv('drive/My Drive/project_2_dataset/city.csv')
city['mapping'] = city['mapping'].map({0: 0., 1: 1., 2: 2., 3: 3.})  # cast the codes to float

loc_dict = {city.iloc[i, 0]: city.iloc[i, 1] for i in range(4)}

# Replace the location names with their numeric codes in df_original
df_original['origin'] = df_original['origin'].map(loc_dict)
df_original['destination'] = df_original['destination'].map(loc_dict)

Output:


               origin      destination time of pickup
0             airport       hauz khaas           3 PM
1             airport       hauz khaas           6 PM
2          hauz khaas dwarka sector 23           1 PM
3             airport       hauz khaas           1 AM
4             airport dwarka sector 21          10 PM
..                ...              ...            ...
995           airport dwarka sector 23           2 AM
996  dwarka sector 21 dwarka sector 23           2 PM
997        hauz khaas dwarka sector 21           5 AM
998           airport dwarka sector 23           6 PM
999           airport       hauz khaas           1 AM

[1000 rows x 3 columns]
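For reference, the loc_dict construction above assumes city.csv pairs each location name with an integer code, with the name in column 0 and the code in column 1 (that layout is inferred from how the code indexes the file). A more idiomatic way to build the same lookup:

# Equivalent to the four-entry dictionary above
loc_dict = dict(zip(city.iloc[:, 0], city.iloc[:, 1]))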



Check pick-up and drop-off correctness:


#### Check pick-up and drop-off correctness

def check_pick_up_drop_correction(pick_up, drop, line_num):
    # Compare the agent's pick-up/drop pair against the SMS ground truth
    original_origin = int(df_original.iloc[line_num, 0])
    original_destination = int(df_original.iloc[line_num, 1])
    return original_origin == pick_up and original_destination == drop
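For example, with hypothetical numeric codes (whether this prints True depends on the actual contents of df_original and city.csv):

# Does SMS row 0 record pick-up code 3 and drop code 2?
print(check_pick_up_drop_correction(3, 2, 0))  # True if they match, else False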


Evaluate the agent's performance after Q-learning


"""Evaluate the agent's performance after Q-learning"""
           
total_epochs, total_penalties, wrong_predictions, total_reward = 0, 0, 0, 0
episodes = 1000

for i in range(episodes):
    
    epochs, penalties, reward = 0, 0, 0

    #Generate the random state from an enviroment and change the pick up and drop as the fetched one
    state = env.reset()
    q_table[state][4]=df_original.iloc[i,0]
    q_table[state][5]=df_original.iloc[i,1]
    
    done = False
    
    while not done:
        action = np.argmax(q_table[state,:])
        state, reward, done, info = env.step(action)
        

        
        epochs += 1
    checking = check_pick_up_drop_correction(int(q_table[state][4]), int(q_table[state][5]), i)
    if checking == False:
        wrong_predictions += 1
        reward=-10
        penalties += 1
    else:
        reward=20
     
        

    total_penalties += penalties
    total_epochs += epochs
    total_reward += reward

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")
print(f"Total number of wrong predictions", wrong_predictions)
print("Total Reward is", total_reward)


Output:

Results after 1000 episodes:
Average timesteps per episode: 196.365
Average penalties per episode: 0.019
Total number of wrong predictions 19
Total Reward is 19430



Contact us to get instant help with Reinforcement Learning projects at: contact@codersarts.com