Plot a PDF on a Histogram
2023-12-31
It is often useful to plot data as a histogram with a representative probability density function (PDF). In python, this can be done in just a few lines of code:
Import relevant libraries
# Third party libraries
import numpy as np # for creating arrays of numbers
import plotly.express as px # for making plots
import plotly.graph_objects as go # for adding data to existing plots
import plotly.io as pio # allows me to use `plotly` templates
from scipy import stats # for various probability distributions functions
# Local library
# where I stored my `plotly` theme
from utilities.plotting_template import local_theme
# Sets my plotly theme
pio.templates.default = local_theme
Define the population statistics and the sample
# Define the population statistics and sample size
pop_mean = 10
pop_std = 3
samp_N = 1000
# Create random variates following a normal distribution
## in the `stats.norm` library, `loc` and `scale` are mean and standard
## deviation respectively
sample = stats.norm.rvs(loc=pop_mean, scale=pop_std, size=samp_N)
# Get the sample statistics
samp_mean = np.mean(sample)
samp_std = np.std(sample)
# Compare the population and sample mean and standard deviation
print(f"Population mean: {pop_mean}; Sample mean: {samp_mean}")
print(f"Population std: {pop_std}; Sample std {samp_std}")
Output:
Population mean: 10; Sample mean: 10.11404341677092
Population std: 3; Sample std 3.10050438539277
Plot the histogram
# Plot the newly created random variates (`rvs`)
fig = px.histogram(x=sample, histnorm="probability density").update(
layout=dict(height=500, width=800)
)
fig.show()
Generate plotting points for the pdf curve
# Define the lower and upper bounds of the pdf to plot
## `stats.norm.ppf` returns the `x` value for the pdf at the probability `q`
pdf_x_lower = stats.norm.ppf(q=0.01, loc=samp_mean, scale=samp_std)
pdf_x_upper = stats.norm.ppf(q=0.99, loc=samp_mean, scale=samp_std)
print(f"Lower bound: {pdf_x_lower}; Upper bound: {pdf_x_upper}")
Output:
Lower bound: 2.901191631358146; Upper bound: 17.326895202183696
# Create a 100 element array to represent the x-values for the pdf
pdf_x = np.linspace(start=pdf_x_lower, stop=pdf_x_upper, num=100)
pdf_x
Output:
array([ 2.90119163, 3.04690581, 3.19261999, 3.33833416, 3.48404834,
3.62976252, 3.7754767 , 3.92119087, 4.06690505, 4.21261923,
4.35833341, 4.50404758, 4.64976176, 4.79547594, 4.94119012,
5.08690429, 5.23261847, 5.37833265, 5.52404683, 5.669761 ,
5.81547518, 5.96118936, 6.10690354, 6.25261771, 6.39833189,
6.54404607, 6.68976025, 6.83547442, 6.9811886 , 7.12690278,
7.27261696, 7.41833113, 7.56404531, 7.70975949, 7.85547367,
8.00118784, 8.14690202, 8.2926162 , 8.43833038, 8.58404455,
8.72975873, 8.87547291, 9.02118709, 9.16690126, 9.31261544,
9.45832962, 9.6040438 , 9.74975797, 9.89547215, 10.04118633,
10.18690051, 10.33261468, 10.47832886, 10.62404304, 10.76975722,
10.91547139, 11.06118557, 11.20689975, 11.35261393, 11.4983281 ,
11.64404228, 11.78975646, 11.93547064, 12.08118481, 12.22689899,
12.37261317, 12.51832735, 12.66404152, 12.8097557 , 12.95546988,
13.10118406, 13.24689823, 13.39261241, 13.53832659, 13.68404077,
13.82975494, 13.97546912, 14.1211833 , 14.26689748, 14.41261165,
14.55832583, 14.70404001, 14.84975418, 14.99546836, 15.14118254,
15.28689672, 15.43261089, 15.57832507, 15.72403925, 15.86975343,
16.0154676 , 16.16118178, 16.30689596, 16.45261014, 16.59832431,
16.74403849, 16.88975267, 17.03546685, 17.18118102, 17.3268952 ])
Add the plotting points to the histogram
# Create the y-values for the pdf
pdf_y = stats.norm.pdf(x=pdf_x, loc=samp_mean, scale=samp_std)
fig.add_trace(go.Scatter(x=pdf_x, y=pdf_y, showlegend=False))
fig.show()
Turn it into a a function
The above code can be turned into a function so the process can be replicated:
def plot_hist_and_norm_pdf(x: np.ndarray) -> go.Figure:
"""
Plots a histogram with an associated pdf (assuming the data is normally distributed).
Parameters
----------
x : np.ndarray
Sample data (list of observed values)
Returns
-------
fig : go.Figure
Plotly figure object with a histogram and pdf representing sample data `x`
"""
mean = np.mean(sample)
std = np.std(sample)
pdf_x_lower = stats.norm.ppf(q=0.01, loc=mean, scale=std)
pdf_x_upper = stats.norm.ppf(q=0.99, loc=mean, scale=std)
pdf_x = np.linspace(start=pdf_x_lower, stop=pdf_x_upper, num=100)
pdf_y = stats.norm.pdf(x=pdf_x, loc=mean, scale=std)
fig = px.histogram(x=sample, histnorm="probability density").update(
layout=dict(height=500, width=800)
)
fig.add_trace(go.Scatter(x=pdf_x, y=pdf_y, showlegend=False))
return fig
plot_hist_and_norm_pdf(sample)