Data visualization: matplotlib & seaborn

Sources:

Parameter

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Halve the default figure height; the default width is kept unchanged
fig_w, fig_h = plt.rcParams['figure.figsize']
plt.rcParams['figure.figsize'] = (fig_w, fig_h / 2)

# inline plot (for jupyter)
%matplotlib inline

Set style

print(plt.style.available)
plt.style.use('seaborn-v0_8-whitegrid')
['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']

Basic plots

# Sample the sine function at 50 evenly spaced points on [0, 10]
x = np.linspace(0, 10, num=50)
sinus = np.sin(x)

# Draw the samples with plot()'s default style on a wide, short figure
plt.figure(figsize=(9, 3))
plt.plot(x, sinus)
plt.show()
../_images/data_visualization_5_0.png
plt.figure(figsize=(9, 3))

plt.plot(x, sinus, "o")
plt.show()
# See the plt.plot documentation (e.g. help(plt.plot)) for the color / marker abbreviations
../_images/data_visualization_6_0.png
# Rapid multiplot: several (x, y, format) triplets in a single plot() call

cosinus = np.cos(x)

plt.figure(figsize=(9, 3))
plt.plot(
    x, sinus, "-b",    # sine as a solid blue line
    x, sinus, "ob",    # sine as blue circle markers
    x, cosinus, "-r",  # cosine as a solid red line
    x, cosinus, "or",  # cosine as red circle markers
)
plt.title('My First Plot')
plt.xlabel('this is x!')
plt.ylabel('this is y!')
plt.show()
../_images/data_visualization_7_0.png
# Step by step: one plot() call per curve, styled through keyword arguments

plt.figure(figsize=(9, 3))
# Each entry is (y values, legend label, line color, line style)
for values, name, colour, dash in (
    (sinus, 'sinus', 'blue', '--'),
    (cosinus, 'cosinus', 'red', '-'),
):
    plt.plot(x, values, label=name, color=colour, linestyle=dash, linewidth=2)
plt.legend()
plt.show()
../_images/data_visualization_8_0.png

Scatter (2D) plots

Load dataset

import pandas as pd
# Load the salary table from the local copy; if that file cannot be opened
# (missing or unreadable), fall back to downloading it from GitHub.
# Only OSError is caught, so KeyboardInterrupt and genuine parsing errors
# are no longer silently swallowed as they were with a bare `except:`.
try:
    salary = pd.read_csv("../datasets/salary_table.csv")
except OSError:
    url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/salary_table.csv'
    salary = pd.read_csv(url)

df = salary
print(df.head())
   salary  experience education management
0   13876           1  Bachelor          Y
1   11608           1      Ph.D          N
2   18701           1      Ph.D          Y
3   11283           1    Master          N
4   11767           1      Ph.D          N

Simple scatter with colors

plt.figure(figsize=(3, 3), dpi=100)
_ = sns.scatterplot(x="experience", y="salary", hue="education", data=salary)
../_images/data_visualization_12_0.png

Legend outside

ax = sns.relplot(x="experience", y="salary", hue="education", data=salary)
../_images/data_visualization_14_0.png

Linear model

ax = sns.lmplot(x="experience", y="salary", hue="education", data=salary)
../_images/data_visualization_16_0.png

Scatter plot with colors and symbols

ax = sns.relplot(x="experience", y="salary", hue="education", style='management', data=salary)
../_images/data_visualization_18_0.png

Saving Figures

# Save the same curve three times, once per output format:
#   - "sinus.png": bitmap format
#   - "sinus.svg": vector format (SVG: Scalable Vector Graphics), to be
#     preferred; it can be edited with Inkscape, Adobe Illustrator,
#     Blender, etc.
#   - "sinus.pdf": PDF, as an alternative
# The figure is closed after each save so that every file starts from a
# fresh figure.
for filename in ("sinus.png", "sinus.svg", "sinus.pdf"):
    plt.plot(x, sinus)
    plt.savefig(filename)
    plt.close()

Boxplot and violin plot: one factor

Box plots are non-parametric: they display variation in samples of a statistical population without making any assumptions about the underlying statistical distribution.

ax = sns.boxplot(x="management", y="salary", data=salary)
ax = sns.stripplot(x="management", y="salary", data=salary, jitter=True, color="black")
../_images/data_visualization_22_0.png
ax = sns.violinplot(x="management", y="salary", data=salary)
ax = sns.stripplot(x="management", y="salary", data=salary, jitter=True, color="white")
../_images/data_visualization_23_0.png

Boxplot and violin plot: two factors

ax = sns.boxplot(x="management", y="salary", hue="education", data=salary)
ax = sns.stripplot(x="management", y="salary", hue="education", data=salary, jitter=True, dodge=True, linewidth=1)
../_images/data_visualization_25_0.png
ax = sns.violinplot(x="management", y="salary", hue="education", data=salary)
ax = sns.stripplot(x="management", y="salary", hue="education", data=salary, jitter=True, dodge=True, linewidth=1)
../_images/data_visualization_26_0.png

Distributions and density plot

Histogram as probability density function estimator

numpy.histogram can be used to estimate the probability density function at each histogram bin by setting the density=True parameter.

Warning: the histogram values themselves do not sum to 1. To use the histogram as a PDF estimator, multiply each value by its bin width dx; the resulting products sum to 1.

Sum(Hist)= 5.06334585386319 Sum(Hist * dx)= 1.0
../_images/data_visualization_28_1.png

Distributions with seaborn

ax = sns.displot(x="salary", hue="management", kind="kde", data=salary, fill=True)
../_images/data_visualization_30_0.png

Multiple axis

# One KDE panel per education level, stacked vertically with a shared x axis
fig, axes = plt.subplots(3, 1, figsize=(9, 9), sharex=True)

# Group by the column name as a plain string, not a one-element list: with a
# list key, pandas >= 2.0 yields 1-tuples as group keys and the panel titles
# would read "('Bachelor',)" instead of "Bachelor".
for i, (edu, d) in enumerate(salary.groupby('education')):
    # Salary density of this education level, split by management status
    sns.kdeplot(x="salary", hue="management", data=d, fill=True, ax=axes[i], palette="muted")
    axes[i].set_title(edu)
../_images/data_visualization_32_0.png

Pairwise scatter plots

ax = sns.pairplot(salary, hue="management")
../_images/data_visualization_34_0.png

Time series

import seaborn as sns
# Switch seaborn to its "darkgrid" style
sns.set(style="darkgrid")

# Load seaborn's "fmri" example dataset (long-form data)
fmri = sns.load_dataset("fmri")

# Plot the signal against the timepoint, one series per region
ax = sns.pointplot(data=fmri, x="timepoint", y="signal", hue="region")
../_images/data_visualization_36_0.png