# Import modules

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import panel as pn
pn.extension()

import scipy.special as spec

import scipy.stats as stats

import seaborn as sns
sns.set()
# Enable high resolution graphics inline
%config InlineBackend.figure_format = 'retina'


data = pd.read_csv('data/mRNA_MDN1.csv')
df = pd.DataFrame(data, columns= ['mRNA_count','probability','error'])
print(df)

plt.bar(df.mRNA_count, df.probability, width=0.9)
plotline, caplines, barlinecols = plt.errorbar(df.mRNA_count, df.probability,
             yerr= df.error,
             ecolor='black',
             capsize=5,
             lolims=True,
             fmt='none',)
caplines[0].set_marker('_')
plt.xlabel('mRNA count')
_ = plt.ylabel('probability')

    mRNA_count  probability     error
0            0     0.003460  0.006055
1            1     0.027682  0.018166
2            2     0.040657  0.025087
3            3     0.092561  0.038062
4            4     0.124567  0.032872
5            5     0.172145  0.043253
6            6     0.149654  0.025952
7            7     0.109862  0.048443
8            8     0.116782  0.044118
9            9     0.074394  0.038062
10          10     0.035467  0.035467
11          11     0.021626  0.008651
12          12     0.015571  0.009516
13          13     0.014706  0.012976
14          14     0.004325  0.008651
15          15     0.000865  0.004325


# Define chi-squared error function

def err(dat, count, rate):
    """
    Returns the sum of the errors for the theory curve (given the provided rate parameter)
    versus the provided data (dat). Note that the rate parameter lambda is equivalent to the mean and variance.
    """
    theory = ((rate**count) * np.exp((-1)*rate))/spec.factorial(count)
    return np.sum((theory - dat)**2)


# Determine error for a range of possible lambdas

# Number of points to plot
n_points = 100

# Range of lambdas to test
lambdas = np.linspace(0, 30, n_points)

# Initialize an array of error
errors = np.zeros(n_points)

# For each lambda, call our err funtion to determine the amount of error
for i in range(n_points):
    errors[i] = err(df.probability, df.mRNA_count, lambdas[i])
    
# Plot the errors over the values of lambda
plt.plot(lambdas, errors)
plt.ylabel('Error')
plt.xlabel('Average mRNA count $\lambda$')
plt.show()


# Index of the optimal lambda
ind_optimal = np.where(errors == np.min(errors))

# Optimal lambda
lambda_fit = lambdas[ind_optimal]

# Show the optimal fit
print(lambda_fit)

# Plot histogram with optimal Poisson fit
plt.bar(df.mRNA_count, df.probability, width=0.9)
plotline, caplines, barlinecols = plt.errorbar(df.mRNA_count, df.probability,
             yerr= df.error,
             ecolor='black',
             capsize=5,
             lolims=True,
             fmt='none',)
caplines[0].set_marker('_')
plt.plot(df.mRNA_count, ((lambda_fit**df.mRNA_count) * np.exp((-1)*lambda_fit))/spec.factorial(df.mRNA_count), \
            color = 'black')
plt.xlabel('mRNA count')
_ = plt.ylabel('probability')

[6.06060606]


data_P = pd.read_csv('data/mRNA_PDR5.csv')
df_P = pd.DataFrame(data_P, columns= ['mRNA_count','probability','error'])
print(df_P)

plt.bar(df_P.mRNA_count, df_P.probability, width=1)
plotline, caplines, barlinecols = plt.errorbar(df_P.mRNA_count, df_P.probability,
             yerr= df_P.error,
             ecolor='black',
             capsize=5,
             lolims=True,
             fmt='none',)
caplines[0].set_marker('_')
plt.xlabel('mRNA count')
_ = plt.ylabel('probability')

    mRNA_count  probability     error
0            0     0.007862  0.007448
1            1     0.007034  0.007448
2            2     0.013655  0.013241
3            3     0.033517  0.031862
4            4     0.035172  0.010759
5            5     0.052966  0.024000
6            6     0.053793  0.008276
7            7     0.067034  0.036828
8            8     0.064552  0.033517
9            9     0.052552 -0.000414
10          10     0.052138  0.019034
11          11     0.059586  0.034345
12          12     0.060828  0.015310
13          13     0.050483  0.028966
14          14     0.026483  0.021517
15          15     0.042207  0.014069
16          16     0.038897  0.024000
17          17     0.016966  0.010345
18          18     0.019448  0.014897
19          19     0.025241  0.026069
20          20     0.028138  0.025655
21          21     0.020690  0.007448
22          22     0.009931  0.010759
23          23     0.019034  0.011172
24          24     0.021931  0.004138
25          25     0.020276  0.013655
26          26     0.009517  0.006621
27          27     0.005793  0.006621
28          28     0.014069  0.008276
29          29     0.012414  0.008690
30          30     0.002483  0.003724
31          31     0.011586  0.009517
32          32     0.008690  0.011172
33          33     0.007034  0.006207
34          34     0.002897  0.003724
35          35     0.007448  0.004966
36          36     0.010345  0.010345
37          37     0.000414  0.005379
38          38     0.001241  0.008690
39          39     0.001241  0.009103
40          40     0.006207  0.005379
41          41     0.009103  0.011172
42          42     0.000828  0.009103
43          43     0.000828  0.009103
44          44     0.004552  0.005379


# Determine error for a range of possible lambdas

# Number of points to plot
n_points = 100

# Range of lambdas to test
lambdas_P = np.linspace(0, 30, n_points)

# Initialize an array of error
errors_P = np.zeros(n_points)

# For each lambda, call our err funtion to determine the amount of error
for i in range(n_points):
    errors_P[i] = err(df_P.probability, df_P.mRNA_count, lambdas_P[i])
    
# Plot the errors over the values of lambda
plt.plot(lambdas_P, errors_P)
plt.ylabel('Error')
plt.xlabel('Average mRNA count $\lambda$')
plt.show()


# Index of the optimal lambda
ind_optimal_P = np.where(errors_P == np.min(errors_P))

# Optimal lambda
lambda_fit_P = lambdas_P[ind_optimal_P]

# Show the optimal fit
print(lambda_fit_P)

# Plot histogram with optimal Poisson fit
plt.bar(df_P.mRNA_count, df_P.probability, width=0.9)
plotline, caplines, barlinecols = plt.errorbar(df_P.mRNA_count, df_P.probability,
             yerr= df_P.error,
             ecolor='black',
             capsize=5,
             lolims=True,
             fmt='none',)
caplines[0].set_marker('_')
plt.plot(df_P.mRNA_count, ((lambda_fit_P**df_P.mRNA_count) * np.exp((-1)*lambda_fit_P)) / \
         spec.factorial(df_P.mRNA_count), color = 'black')
plt.xlabel('mRNA count')
_ = plt.ylabel('probability')

[10.60606061]


# Define chi-squared error function for negative binomial distribution

def err_nb(dat, count, success, successprob):
    """
    Returns the sum of the errors for the theory curve (given the provided count, success, and success probability parameters)
    versus the provided data (dat).
    """
    theory = spec.comb(count + success - 1, success - 1) * (successprob**success) * (1 - successprob)**count
    return np.sum((theory - dat)**2)


# Determine error for possible combinations of (n,p)

# Number of points to plot
n_points = 100

# Range of n (number of successes) to test
n_succ = np.linspace(0, 30, n_points)

# Range of p (success probabilities) to test
p_succ = np.linspace(0, 1, n_points)

# Initialize an array of error
errors_Pnb = np.zeros((n_points, n_points))

# For each lambda, call our err funtion to determine the amount of error
for i in range(n_points):
    for j in range(n_points):
        errors_Pnb[i,j] = err_nb(df_P.probability, df_P.mRNA_count, n_succ[i], p_succ[j])


# Index of the optimal 
ind_optimal_Pnb = np.where(errors_Pnb == np.min(errors_Pnb))

# Optimal (n,p)
n_fit = n_succ[ind_optimal_Pnb[0]]
p_fit = p_succ[ind_optimal_Pnb[1]]

# Show the optimal fit
print(n_fit)
print(p_fit)

# Plot histogram with optimal poisson vs negative binomial fit
plt.bar(df_P.mRNA_count, df_P.probability, width=0.9)
plotline, caplines, barlinecols = plt.errorbar(df_P.mRNA_count, df_P.probability,
             yerr= df_P.error,
             ecolor='black',
             capsize=5,
             lolims=True,
             fmt='none',)
caplines[0].set_marker('_')

# Poisson
plt.plot(df_P.mRNA_count, ((lambda_fit_P**df_P.mRNA_count) * np.exp((-1)*lambda_fit_P))/ \
         spec.factorial(df_P.mRNA_count), color='black', alpha=0.7)

# Negative binomial
plt.plot(df_P.mRNA_count, spec.comb(df_P.mRNA_count + n_fit - 1, n_fit - 1) * (p_fit)**n_fit * \
         (1-p_fit)**df_P.mRNA_count, color='red', linewidth=2)
plt.xlabel('mRNA count')
_ = plt.ylabel('probability')

[3.33333333]
[0.2020202]

mRNA distributions in yeast for constitutive and bursty genes¶

MDN1 mRNA count distribution (constitutive)¶

PDR5 mRNA count distribution (bursty transcription)¶