Visualization Techniques Used in Machine Learning

Sep 9, 2020
3 min read

Data visualization is the discipline of trying to understand data by placing it in a visual context so that patterns, trends, and correlations that might not otherwise be detected can be exposed.

To get a quick overview, here are a few popular plotting libraries:

Matplotlib: low level, provides lots of freedom
Pandas Visualization: easy to use interface, built on Matplotlib
Seaborn: high-level interface, great default styles
Plotly: can create interactive plots

To understand all of these data visualization tools, let's dive deep into the code.

For this, the Air Quality dataset covering 2015 to 2020 is used.

To use the following libraries, we have to import them first. (Make sure they are installed on your system.)

Import the libraries

The libraries to be imported are:

NumPy
pandas
plotly.express
plotly.offline
cufflinks
matplotlib
seaborn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
%matplotlib inline
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
#plt.rcParams['figure.figsize']=17,8
import cufflinks as cf
import plotly.offline as pyo
from plotly.offline import init_notebook_mode,plot,iplot

Read the dataset through pandas

# Load the air-quality records into a DataFrame; the relative path means the
# CSV is looked up in the current working directory.
df = pd.read_csv('city_day.csv')

Here is what the dataset looks like

# Preview the first few rows of the DataFrame (a notebook renders the result as a table).
df.head()

Now let's plot the yearly changes of SO2 using the Plotly library.

Before that, we have to group each pollutant column with respect to the date (here, by year).

The code below shows that:

# Yearly totals for each pollutant, sorted with the latest year first.
# Each result is a two-column frame: 'year' plus the summed pollutant column.
# NOTE(review): assumes `df` already has 'year' and 'BTX' columns — neither is
# created in the code shown here; confirm against the preprocessing step.
SO2, NO2, BTX, CO, PM, O = (
    df.groupby('year')[pollutant]
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    for pollutant in ('SO2', 'NO2', 'BTX', 'CO', 'PM2.5', 'O3')
)

Now let's plot for SO2:-

Line plot (mode='lines+markers'):

# Interactive line chart (lines plus point markers) of the yearly SO2 totals.
# NOTE(review): `.iplot` on a DataFrame is presumably provided by the
# cufflinks import — confirm it is set up for offline use in this notebook.
SO2.iplot(
    kind='line',
    mode='lines+markers',
    x='year',
    y='SO2',
    title='AMOUNT OF SO2 IN DIFFERENT YEARS ',
)

Let's check for the Table + Bar plot in plotly library.

# NOTE(review): `S` is not defined in the code shown here — presumably a
# per-city SO2 table sorted from highest to lowest; confirm before running.
top_cities = S['City'].head(10)
top_so2 = S['SO2'].head(10)

# Left part of the figure (x domain 0-0.52): a two-column table of the rows.
trace = go.Table(
    domain=dict(x=[0, 0.52], y=[0, 1.0]),
    header=dict(
        values=["City", "SO2"],
        fill=dict(color='#119DFF'),
        font=dict(color='white', size=14),
        align=['center'],
        height=30,
    ),
    cells=dict(
        values=[top_cities, top_so2],
        fill=dict(color=['lightgreen', 'white']),
        align=['center'],
    ),
)

# Right part of the figure: the same ten rows as a bar chart on axes x1/y1.
trace1 = go.Bar(
    x=top_cities,
    y=top_so2,
    xaxis='x1',
    yaxis='y1',
    marker=dict(color='lime'),
    opacity=0.60,
)

layout = dict(
    width=830,
    height=420,
    autosize=False,
    title='TOP 10 Cities with Max SO2',
    showlegend=False,
    # The bar chart's axes occupy the right-hand share of the width (0.58-1).
    xaxis1=dict(domain=[0.58, 1], anchor='y1', showticklabels=True),
    yaxis1=dict(domain=[0, 1.0], anchor='x1', hoverformat='.2f'),
)

fig1 = dict(data=[trace, trace1], layout=layout)
iplot(fig1)

So here we have made a table of the top 10 cities with the highest amount of SO2, and the bar graph is drawn right next to it.

Point plot using Seaborn:

Let's see how to draw a Point plot using seaborn.

# Point plot of SO2 against month, drawn on an explicit 15x8 inch figure.
# NOTE(review): assumes `df` has a 'month' column — it is not created in the
# code shown here; confirm against the preprocessing step.
so2_fig, so2_ax = plt.subplots(figsize=(15, 8))
sns.pointplot(x='month', y='SO2', data=df, color='Orange', ax=so2_ax)
so2_ax.set_xlabel('MONTHS', fontsize=16, color='blue')
so2_ax.set_ylabel('SO2', fontsize=16, color='blue')
so2_ax.set_title('SO2 in Different Months', fontsize=20, color='blue')
# Written relative to the working directory; the 'loc' folder must already exist.
so2_fig.savefig('loc\\SO2_monthly')

So here we can plot the amount of SO2 for different months.

Subplots:

We can draw several plots together, like a gallery view.

Let's see the example below, where we have plotted the changes in the pollutants with respect to different years.

# 2x2 grid of yearly trends, one pollutant per cell.
# `plotly.tools.make_subplots` is deprecated; `plotly.subplots` is its home now.
from plotly.subplots import make_subplots

# Each trace is named after the pollutant it actually shows (the SO2 trace was
# previously labelled 'NO2'); the legend is hidden below, so the name matters
# mainly for hover labels.
trace1 = go.Scatter(x=SO2['year'], y=SO2['SO2'], mode='lines+markers', name='SO2')
trace2 = go.Scatter(x=NO2['year'], y=NO2['NO2'], mode='lines+markers', name='NO2')
trace3 = go.Scatter(x=CO['year'], y=CO['CO'], mode='lines+markers', name='CO')
trace4 = go.Scatter(x=PM['year'], y=PM['PM2.5'], mode='lines+markers', name='PM2.5')

fig = make_subplots(
    rows=2,
    cols=2,
    print_grid=False,
    subplot_titles=(
        'SO2 in diff. years',
        'NO2 in diff. years',
        'CO in diff. years',
        'PM2.5 in diff. years',
    ),
)

# Subplot titles are listed row by row, so the traces are placed in that order.
fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)
fig.add_trace(trace3, row=2, col=1)
fig.add_trace(trace4, row=2, col=2)

fig.update_layout(
    height=550,
    width=850,
    title='AIR Pollutants In different Years',
    showlegend=False,
)
iplot(fig)

In the above lines of code, we have drawn the changes of 4 pollutants with respect to the year.

We can also draw them all in a single graph.

Let's have a look at that.

# All six yearly series overlaid on a single figure.
fig = go.Figure()

# (yearly frame, value column, legend label, line colour) for every trace,
# listed in the order the traces are added.
yearly_series = [
    (SO2, 'SO2', 'SO2', 'Blue'),
    (NO2, 'NO2', 'NO2', 'Red'),
    (BTX, 'BTX', 'BTX', 'Green'),
    (CO, 'CO', 'CO', 'orange'),
    (PM, 'PM2.5', 'PM2.5', 'Magenta'),
    (O, 'O3', 'Ozone', 'royalblue'),
]
for yearly_frame, value_column, legend_label, line_colour in yearly_series:
    fig.add_trace(
        go.Scatter(
            x=yearly_frame['year'],
            y=yearly_frame[value_column],
            mode='lines+markers',
            name=legend_label,
            line=dict(color=line_colour, width=2),
        )
    )

fig.update_layout(
    title='AIR POLLUTANTS PARTICLES IN DIFFERENT YEARS',
    xaxis_tickfont_size=14,
    yaxis=dict(title='TOTAL AMOUNT IN YEARS'),
)
fig.show()

RADAR (POLAR) PLOT:

Let's have a look at the code below, which uses go.Scatterpolar to draw a radar (polar) chart for each city:

# Radar (polar) comparison of five pollutants for three cities, each city on
# its own polar subplot placed side by side.
# NOTE(review): df_Ahmedabad_2019 / df_Bengaluru_2019 / df_Hyderabad_2019 are
# not defined in the code shown here — presumably per-city 2019 subsets; confirm.
x = df_Ahmedabad_2019
y = df_Bengaluru_2019
z = df_Hyderabad_2019

# Angular categories shared by all three traces.
pollutants = ['SO2', 'NO2', 'CO', 'BTX', 'PM2.5']


def _first_row_values(city_frame):
    """Return the first-row value of every pollutant column in *city_frame*."""
    return [city_frame[pollutant].values[0] for pollutant in pollutants]


data = [
    # No `subplot` given, so this trace lands on the default `polar` subplot.
    go.Scatterpolar(
        r=_first_row_values(x),
        theta=pollutants,
        fill='toself',
        opacity=0.8,
        name="Ahmedabad",
    ),
    go.Scatterpolar(
        r=_first_row_values(y),
        theta=pollutants,
        fill='toself',
        subplot="polar2",
        name="Bengaluru",
    ),
    go.Scatterpolar(
        r=_first_row_values(z),
        theta=pollutants,
        fill='toself',
        subplot="polar3",
        name="Hyderabad",  # spelling fixed to match the figure title
    ),
]

# Each polar subplot gets its own radial range and a horizontal slice of the
# figure width.
layout = go.Layout(
    title="Comparison Between Ahmedabad,Bengaluru,Hyderabad in the year 2019",
    polar=dict(
        radialaxis=dict(visible=True, range=[0, 120]),
        domain=dict(x=[0, 0.27], y=[0, 1]),
    ),
    polar2=dict(
        radialaxis=dict(visible=True, range=[0, 60]),
        domain=dict(x=[0.35, 0.65], y=[0, 1]),
    ),
    polar3=dict(
        radialaxis=dict(visible=True, range=[0, 70]),
        domain=dict(x=[0.75, 1.0], y=[0, 1]),
    ),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

Distribution plot:

Let's check the AQI distribution of 5 major cities

# Overlaid AQI distributions for five cities on one 20x10 inch axes.
# NOTE(review): df_Delhi, df_Ahmedabad, df_Hyderabad, df_Bengaluru and
# df_Kolkata are not defined in the code shown here — presumably per-city
# subsets of `df`; confirm.
fig, ax = plt.subplots(figsize=(20, 10))
sns.despine(fig, left=True, bottom=True)
sns.set_context("notebook", font_scale=2, rc={"lines.linewidth": 2})

# (city frame, colour, legend label); only every 30th AQI row of each city is
# plotted (iloc[::30]).
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 — consider
# histplot/kdeplot when upgrading.
city_series = [
    (df_Delhi, "y", 'Delhi'),
    (df_Ahmedabad, "b", 'Ahmedabad'),
    (df_Hyderabad, "black", 'Hyderabad'),
    (df_Bengaluru, "g", 'Bengaluru'),
    (df_Kolkata, "r", 'Kolkata'),
]
for city_frame, colour, city_name in city_series:
    sns.distplot(city_frame['AQI'].iloc[::30], color=colour, label=city_name, ax=ax)

# Rotate the existing x tick labels in place. The previous code passed a list
# of label strings to get_xticklabels(), where it was taken as the `minor`
# flag, so the minor tick labels were fetched and written back over the
# visible ones.
plt.setp(ax.get_xticklabels(), rotation=30, ha="left")
plt.rcParams["xtick.labelsize"] = 15
ax.set_title('AQI DISTRIBUTIONS FROM DIFFERENT CITIES')
# The trailing semicolon keeps the notebook from echoing the Legend object.
ax.legend(fontsize=14);

Go Scatter Plot:

Let's trace a line plot (built with go.Scatter) for the city of Kolkata.

# Line traces of six pollutants against date for Kolkata.
# NOTE(review): `df_Kolkata_2020` is not defined in the code shown here —
# presumably the Kolkata rows for 2020; confirm.
fig = go.Figure()

# (value column, legend label, line colour) for every trace, in drawing order.
kolkata_series = [
    ('SO2', 'SO2', 'Blue'),
    ('NO2', 'NO2', 'Red'),
    ('BTX', 'BTX', 'Green'),
    ('CO', 'CO', 'orange'),
    ('PM2.5', 'PM2.5', 'Magenta'),
    ('O3', 'Ozone', 'royalblue'),
]
for value_column, legend_label, line_colour in kolkata_series:
    fig.add_trace(
        go.Scatter(
            x=df_Kolkata_2020['Date'],
            y=df_Kolkata_2020[value_column],
            mode='lines',
            name=legend_label,
            line=dict(color=line_colour, width=2),
        )
    )

fig.update_layout(
    title='AIR POLLUTANTS PARTICLES ON 2020 Kolkata',
    xaxis_tickfont_size=14,
    yaxis=dict(title='AIR POLLUTANTS'),
)
fig.show()