© 2021 Rebecca Rousseau. This work is licensed under a Creative Commons Attribution License CC-BY 4.0. All code contained herein is licensed under an MIT license.
Objective: In this notebook, we will analyze mRNA count distributions calculated from yeast gene data (adapted from this paper) and fit appropriate probability distributions to the resulting plots. The chosen genes highlight constitutive and "bursty" gene transcription.
# Import modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import panel as pn
pn.extension()
import scipy.special as spec
import scipy.stats as stats
import seaborn as sns
sns.set()
# Enable high resolution graphics inline
%config InlineBackend.figure_format = 'retina'
data = pd.read_csv('data/mRNA_MDN1.csv')
df = pd.DataFrame(data, columns= ['mRNA_count','probability','error'])
print(df)
plt.bar(df.mRNA_count, df.probability, width=0.9)
plotline, caplines, barlinecols = plt.errorbar(df.mRNA_count, df.probability,
yerr= df.error,
ecolor='black',
capsize=5,
lolims=True,
fmt='none',)
caplines[0].set_marker('_')
plt.xlabel('mRNA count')
_ = plt.ylabel('probability')
mRNA_count probability error 0 0 0.003460 0.006055 1 1 0.027682 0.018166 2 2 0.040657 0.025087 3 3 0.092561 0.038062 4 4 0.124567 0.032872 5 5 0.172145 0.043253 6 6 0.149654 0.025952 7 7 0.109862 0.048443 8 8 0.116782 0.044118 9 9 0.074394 0.038062 10 10 0.035467 0.035467 11 11 0.021626 0.008651 12 12 0.015571 0.009516 13 13 0.014706 0.012976 14 14 0.004325 0.008651 15 15 0.000865 0.004325
Poisson distribution for probability of $k$ counts: $$ P(X=k) = \frac{\lambda^k e^{-\lambda}}{k!}$$
# Define chi-squared error function
def err(dat, count, rate):
"""
Returns the sum of the errors for the theory curve (given the provided rate parameter)
versus the provided data (dat). Note that the rate parameter lambda is equivalent to the mean and variance.
"""
theory = ((rate**count) * np.exp((-1)*rate))/spec.factorial(count)
return np.sum((theory - dat)**2)
# Determine error for a range of possible lambdas
# Number of points to plot
n_points = 100
# Range of lambdas to test
lambdas = np.linspace(0, 30, n_points)
# Initialize an array of error
errors = np.zeros(n_points)
# For each lambda, call our err funtion to determine the amount of error
for i in range(n_points):
errors[i] = err(df.probability, df.mRNA_count, lambdas[i])
# Plot the errors over the values of lambda
plt.plot(lambdas, errors)
plt.ylabel('Error')
plt.xlabel('Average mRNA count $\lambda$')
plt.show()
To find the exact value of $\lambda$ with minimum error,
# Index of the optimal lambda
ind_optimal = np.where(errors == np.min(errors))
# Optimal lambda
lambda_fit = lambdas[ind_optimal]
# Show the optimal fit
print(lambda_fit)
# Plot histogram with optimal Poisson fit
plt.bar(df.mRNA_count, df.probability, width=0.9)
plotline, caplines, barlinecols = plt.errorbar(df.mRNA_count, df.probability,
yerr= df.error,
ecolor='black',
capsize=5,
lolims=True,
fmt='none',)
caplines[0].set_marker('_')
plt.plot(df.mRNA_count, ((lambda_fit**df.mRNA_count) * np.exp((-1)*lambda_fit))/spec.factorial(df.mRNA_count), \
color = 'black')
plt.xlabel('mRNA count')
_ = plt.ylabel('probability')
[6.06060606]
data_P = pd.read_csv('data/mRNA_PDR5.csv')
df_P = pd.DataFrame(data_P, columns= ['mRNA_count','probability','error'])
print(df_P)
plt.bar(df_P.mRNA_count, df_P.probability, width=1)
plotline, caplines, barlinecols = plt.errorbar(df_P.mRNA_count, df_P.probability,
yerr= df_P.error,
ecolor='black',
capsize=5,
lolims=True,
fmt='none',)
caplines[0].set_marker('_')
plt.xlabel('mRNA count')
_ = plt.ylabel('probability')
mRNA_count probability error 0 0 0.007862 0.007448 1 1 0.007034 0.007448 2 2 0.013655 0.013241 3 3 0.033517 0.031862 4 4 0.035172 0.010759 5 5 0.052966 0.024000 6 6 0.053793 0.008276 7 7 0.067034 0.036828 8 8 0.064552 0.033517 9 9 0.052552 -0.000414 10 10 0.052138 0.019034 11 11 0.059586 0.034345 12 12 0.060828 0.015310 13 13 0.050483 0.028966 14 14 0.026483 0.021517 15 15 0.042207 0.014069 16 16 0.038897 0.024000 17 17 0.016966 0.010345 18 18 0.019448 0.014897 19 19 0.025241 0.026069 20 20 0.028138 0.025655 21 21 0.020690 0.007448 22 22 0.009931 0.010759 23 23 0.019034 0.011172 24 24 0.021931 0.004138 25 25 0.020276 0.013655 26 26 0.009517 0.006621 27 27 0.005793 0.006621 28 28 0.014069 0.008276 29 29 0.012414 0.008690 30 30 0.002483 0.003724 31 31 0.011586 0.009517 32 32 0.008690 0.011172 33 33 0.007034 0.006207 34 34 0.002897 0.003724 35 35 0.007448 0.004966 36 36 0.010345 0.010345 37 37 0.000414 0.005379 38 38 0.001241 0.008690 39 39 0.001241 0.009103 40 40 0.006207 0.005379 41 41 0.009103 0.011172 42 42 0.000828 0.009103 43 43 0.000828 0.009103 44 44 0.004552 0.005379
Let's try to fit a Poisson to this skewed distribution.
# Determine error for a range of possible lambdas
# Number of points to plot
n_points = 100
# Range of lambdas to test
lambdas_P = np.linspace(0, 30, n_points)
# Initialize an array of error
errors_P = np.zeros(n_points)
# For each lambda, call our err funtion to determine the amount of error
for i in range(n_points):
errors_P[i] = err(df_P.probability, df_P.mRNA_count, lambdas_P[i])
# Plot the errors over the values of lambda
plt.plot(lambdas_P, errors_P)
plt.ylabel('Error')
plt.xlabel('Average mRNA count $\lambda$')
plt.show()
# Index of the optimal lambda
ind_optimal_P = np.where(errors_P == np.min(errors_P))
# Optimal lambda
lambda_fit_P = lambdas_P[ind_optimal_P]
# Show the optimal fit
print(lambda_fit_P)
# Plot histogram with optimal Poisson fit
plt.bar(df_P.mRNA_count, df_P.probability, width=0.9)
plotline, caplines, barlinecols = plt.errorbar(df_P.mRNA_count, df_P.probability,
yerr= df_P.error,
ecolor='black',
capsize=5,
lolims=True,
fmt='none',)
caplines[0].set_marker('_')
plt.plot(df_P.mRNA_count, ((lambda_fit_P**df_P.mRNA_count) * np.exp((-1)*lambda_fit_P)) / \
spec.factorial(df_P.mRNA_count), color = 'black')
plt.xlabel('mRNA count')
_ = plt.ylabel('probability')
[10.60606061]
The probability distribution as defined by a Poisson curve is too densely centered around the mean, whereas the PDR5 gene expression distribution varies considerably across different possible mRNA counts. This is a strong indicator that, in the steady state, this gene exhibits "bursty" transcription. Such a phenomenon is better modeled by a negative binomial distribution (the discrete analog to the continuous Gamma distribution), defined as
$$ P(k) = {k+n-1 \choose n-1} p^n(1-p)^k, $$where $k$ is the number of mRNA transcripts made in the characteristic lifetime of an mRNA, $n$ is a measure of the frequency of bursts, and $p$ is the probability that a burst in transcription stops.
We now determine the combination of parameters $(n,p)$ for which the negative binomial distribution best fits PDR5 gene expression.
# Define chi-squared error function for negative binomial distribution
def err_nb(dat, count, success, successprob):
"""
Returns the sum of the errors for the theory curve (given the provided count, success, and success probability parameters)
versus the provided data (dat).
"""
theory = spec.comb(count + success - 1, success - 1) * (successprob**success) * (1 - successprob)**count
return np.sum((theory - dat)**2)
# Determine error for possible combinations of (n,p)
# Number of points to plot
n_points = 100
# Range of n (number of successes) to test
n_succ = np.linspace(0, 30, n_points)
# Range of p (success probabilities) to test
p_succ = np.linspace(0, 1, n_points)
# Initialize an array of error
errors_Pnb = np.zeros((n_points, n_points))
# For each lambda, call our err funtion to determine the amount of error
for i in range(n_points):
for j in range(n_points):
errors_Pnb[i,j] = err_nb(df_P.probability, df_P.mRNA_count, n_succ[i], p_succ[j])
# Index of the optimal
ind_optimal_Pnb = np.where(errors_Pnb == np.min(errors_Pnb))
# Optimal (n,p)
n_fit = n_succ[ind_optimal_Pnb[0]]
p_fit = p_succ[ind_optimal_Pnb[1]]
# Show the optimal fit
print(n_fit)
print(p_fit)
# Plot histogram with optimal poisson vs negative binomial fit
plt.bar(df_P.mRNA_count, df_P.probability, width=0.9)
plotline, caplines, barlinecols = plt.errorbar(df_P.mRNA_count, df_P.probability,
yerr= df_P.error,
ecolor='black',
capsize=5,
lolims=True,
fmt='none',)
caplines[0].set_marker('_')
# Poisson
plt.plot(df_P.mRNA_count, ((lambda_fit_P**df_P.mRNA_count) * np.exp((-1)*lambda_fit_P))/ \
spec.factorial(df_P.mRNA_count), color='black', alpha=0.7)
# Negative binomial
plt.plot(df_P.mRNA_count, spec.comb(df_P.mRNA_count + n_fit - 1, n_fit - 1) * (p_fit)**n_fit * \
(1-p_fit)**df_P.mRNA_count, color='red', linewidth=2)
plt.xlabel('mRNA count')
_ = plt.ylabel('probability')
[3.33333333] [0.2020202]