Spaces:
Sleeping
Sleeping
Commit
·
bb8952c
1
Parent(s):
f88cb30
Upload 3 files
Browse files
functions/functions/__pycache__/pathway_analyses.cpython-39.pyc
ADDED
|
Binary file (26.4 kB). View file
|
|
|
functions/functions/pathway_analyses.py
ADDED
|
@@ -0,0 +1,1015 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import scipy
|
| 3 |
+
import warnings
|
| 4 |
+
#import anndata2ri
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import scanpy as sc
|
| 7 |
+
import numpy as np
|
| 8 |
+
import seaborn as sb
|
| 9 |
+
import decoupler as dc
|
| 10 |
+
from scipy import sparse
|
| 11 |
+
from anndata import AnnData
|
| 12 |
+
#from tabnanny import verbose
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
#from gsva_prep import prep_gsva
|
| 15 |
+
from typing import Optional, Union
|
| 16 |
+
from matplotlib.pyplot import rcParams
|
| 17 |
+
#from statsmodels.stats.multitest import multipletests
|
| 18 |
+
#from sklearn.model_selection import train_test_split
|
| 19 |
+
#from rpy2.robjects.conversion import localconverter
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def rescale_matrix(S, log_scale=False):
    """
    Rescale every column of a count matrix to the median column total.

    Each column is divided by its own sum and then multiplied by the median
    of all column sums. Columns whose sum is zero are divided by 1 instead,
    so they stay all-zero rather than producing NaN.

    Parameters
    ----------
    S : np.ndarray, scipy.sparse matrix or pandas.DataFrame
        Matrix with read counts. Sparse input (any format) is densified.
        NOTE(review): the previous docstring described the layout as
        gene x cell, i.e. columns are cells -- confirm against the callers.
    log_scale : bool, optional (default: False)
        Whether to apply ``log1p`` to the rescaled matrix.

    Returns
    -------
    B : np.ndarray
        Dense rescaled (and optionally log-transformed) matrix.

    Raises
    ------
    ValueError
        If ``S`` is not one of the supported container types.
    """
    if isinstance(S, pd.DataFrame):
        S = S.values
    elif scipy.sparse.issparse(S):
        # Accept every sparse format, not only CSR.
        S = S.toarray()
    elif not isinstance(S, np.ndarray):
        raise ValueError('Input S must be a pandas.DataFrame, numpy.ndarray or scipy.sparse.csr_matrix')

    cs = np.sum(S, axis=0)
    # Avoid 0/0 for empty columns; the median below is taken after this
    # substitution, exactly as before.
    cs[cs == 0] = 1
    B = np.median(cs) * (S / cs)
    if log_scale:
        B = np.log1p(B)
    return B
|
| 55 |
+
|
| 56 |
+
def normalize_default(adata, log_scale=True):
    """
    Median-rescale the expression matrix of an AnnData object in place.

    Counts are read from ``adata.layers['counts']`` when that layer exists
    and from ``adata.X`` otherwise, passed through ``rescale_matrix`` and
    written back to ``adata.X``.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix. Its ``X`` attribute is overwritten.
    log_scale : bool, optional (default: True)
        Whether to log-transform the rescaled matrix.

    Returns
    -------
    adata : AnnData
        The same object that was passed in, with ``X`` replaced by the
        rescaled matrix.
    """
    if 'counts' in adata.layers:
        print('normalizing data using count data in .layers["counts"]')
        S = adata.layers['counts']
    else:
        print('normalizing data using count data in .X')
        S = adata.X

    # NOTE(review): rescale_matrix normalizes per column (axis 0). AnnData
    # conventionally stores cells as rows, so this may be rescaling per gene
    # rather than per cell -- confirm the intended orientation.
    adata.X = rescale_matrix(S, log_scale=log_scale)
    return adata
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def normalize_matrix(
    X: Union[np.ndarray, sparse.spmatrix],
    top_features_frac: float = 1.0,
    scale_factor: Union[str, float, int, np.ndarray, None] = "median",
    transformation: Union[str, None] = "log",
    anchor_features: Union[np.ndarray, None] = None,
) -> Union[np.ndarray, sparse.spmatrix]:
    """
    Library-size normalize the rows of ``X``, rescale, and transform.

    Parameters
    ----------
    X : np.ndarray or scipy.sparse matrix
        Input matrix; every row is divided by its own library size. The
        input is copied (cast to float64) and never modified.
    top_features_frac : float, optional (default: 1.0)
        When below 1.0 (and ``anchor_features`` is None) only columns that
        are non-zero in more than ``1 - top_features_frac`` of the rows are
        used to compute library sizes.
    scale_factor : "median", number, np.ndarray or None
        ``"median"`` multiplies by the median ratio between the original and
        the normalized row sums; a number multiplies by that number; ``None``
        applies no rescaling. For an array, sparse input is *multiplied* and
        dense input is *divided* by it.
        NOTE(review): that multiply/divide asymmetry is kept from the
        original code -- confirm which one is intended.
    transformation : "log", "tukey" or None
        ``"log"`` applies ``log1p``; ``"tukey"`` maps every non-zero value
        ``v`` to ``sqrt(v) + sqrt(1 + v)`` (negative values are clipped to 0
        first); anything else returns the rescaled matrix unchanged.
    anchor_features : np.ndarray or None
        Column selector used to compute library sizes; takes precedence over
        ``top_features_frac``.

    Returns
    -------
    np.ndarray or scipy.sparse.csc_matrix
        Normalized matrix; sparse input yields a CSC matrix.

    Raises
    ------
    ValueError
        If ``scale_factor`` is of an unsupported kind.
    """
    X = X.astype(dtype=np.float64)

    # Which features (i.e. genes) should we use to compute library sizes?
    if anchor_features is not None:
        lib_features = X[:, anchor_features]
    elif top_features_frac < 1.0:
        universality = np.asarray(np.mean(X > 0, axis=0)).ravel()
        selected_features = np.flatnonzero(universality > (1 - top_features_frac))
        lib_features = X[:, selected_features]
    else:
        lib_features = X

    # Note: mean as opposed to sum. Always shape the sizes as a column so the
    # division is per row; relying on a broadcasting error to detect the axis
    # silently divided per column whenever X happened to be square.
    lib_sizes = np.asarray(np.mean(lib_features, axis=1), dtype=np.float64).reshape(-1, 1)

    # Normalize library sizes
    if sparse.issparse(X):
        X_scaled = X.multiply(1 / lib_sizes)
    else:
        X_scaled = X / lib_sizes

    # Scale normalized rows. The isinstance guard keeps an array-valued
    # scale_factor from being compared element-wise against the string.
    if isinstance(scale_factor, str) and scale_factor == "median":
        kappa = np.median(np.array(np.sum(X, axis=1) / np.sum(X_scaled, axis=1)))
        X_scaled_norm = X_scaled * kappa
    elif isinstance(scale_factor, (int, float, np.integer, np.floating)):
        X_scaled_norm = X_scaled * scale_factor
    elif isinstance(scale_factor, np.ndarray):
        if sparse.issparse(X_scaled):
            X_scaled_norm = X_scaled.multiply(scale_factor)
        else:
            X_scaled_norm = X_scaled / scale_factor
    elif scale_factor is None:
        X_scaled_norm = X_scaled
    else:
        raise ValueError(
            "scale_factor must be 'median', a number, a numpy array or None; got %r" % (scale_factor,)
        )

    # For compatibility with C
    if sparse.issparse(X_scaled_norm):
        X_scaled_norm = sparse.csc_matrix(X_scaled_norm)

    # Post-transformation
    if transformation == "log":
        if sparse.issparse(X_scaled_norm):
            return X_scaled_norm.log1p()
        return np.log1p(X_scaled_norm)

    if transformation == "tukey":
        if sparse.issparse(X_scaled_norm):
            # Only stored entries can be non-zero, so work on .data directly.
            data = X_scaled_norm.data
            data[data < 0] = 0
            nonzero = data != 0
            data[nonzero] = np.sqrt(data[nonzero]) + np.sqrt(1 + data[nonzero])
        else:
            X_scaled_norm[X_scaled_norm < 0] = 0
            nonzero = X_scaled_norm != 0
            vv = X_scaled_norm[nonzero]
            X_scaled_norm[nonzero] = np.sqrt(vv) + np.sqrt(1 + vv)
        return X_scaled_norm

    return X_scaled_norm
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def normalize_actionet(
    adata: AnnData,
    layer_key: Optional[str] = None,
    layer_key_out: Optional[str] = None,
    top_features_frac: float = 1.0,
    scale_factor: Union[str, float, int, np.ndarray, None] = "median",
    transformation: Union[str, None] = "log",
    anchor_features: Union[np.ndarray, None] = None,
    copy: Optional[bool] = False,
) -> Optional[AnnData]:
    """
    Normalize an AnnData matrix with ``normalize_matrix`` and record how.

    Parameters
    ----------
    adata : AnnData
        Object to normalize; modified in place unless ``copy`` is true.
    layer_key : str, optional
        Layer to read counts from. When None, ``adata.uns["metadata"]
        ["input_assay"]`` is used if present, otherwise ``adata.X``.
    layer_key_out : str, optional
        Layer to write the result to; ``adata.X`` is overwritten when None.
    top_features_frac, scale_factor, transformation, anchor_features
        Forwarded unchanged to ``normalize_matrix``.
    copy : bool, optional (default: False)
        Work on (and return) a copy instead of modifying ``adata``.

    Returns
    -------
    AnnData or None
        The normalized copy when ``copy`` is true, otherwise None.

    Raises
    ------
    ValueError
        If ``layer_key`` does not name an existing layer.

    Notes
    -----
    ``adata.uns["metadata"]`` is replaced by a fresh dict holding only
    ``norm_method`` and ``default_assay``; earlier entries are dropped.
    """
    adata = adata.copy() if copy else adata

    # Tolerate objects without a metadata record. The earlier "metadta" key
    # check could never match, and the input_assay lookup raised KeyError
    # whenever the record was absent.
    metadata = adata.uns.get("metadata", {})

    if "norm_method" in metadata:  # Already normalized? warn, but carry on
        warnings.warn("AnnData object is prenormalized. Please make sure to use the right assay.")

    if layer_key is None and "input_assay" in metadata:
        layer_key = metadata["input_assay"]

    if layer_key is not None:
        if layer_key not in adata.layers.keys():
            raise ValueError("Did not find adata.layers['" + layer_key + "']. ")
        S = adata.layers[layer_key]
    else:
        S = adata.X

    # Counts should be non-negative integers. Checked vectorised so that
    # large matrices are not turned into Python sets and an empty matrix
    # does not crash on max().
    values = np.asarray(S.data if sparse.issparse(S) else S, dtype=np.float64).ravel()
    if values.size and np.any((values < 0) | (values != np.floor(values))):
        warnings.warn("Input [count] assay has non-integer values, which looks like a normalized matrix. Please make sure to use the right assay.")

    S = normalize_matrix(
        S,
        anchor_features=anchor_features,
        top_features_frac=top_features_frac,
        scale_factor=scale_factor,
        transformation=transformation,
    )

    adata.uns["metadata"] = {}
    adata.uns["metadata"]["norm_method"] = "default_top%.2f_%s" % (
        top_features_frac,
        transformation,
    )

    if layer_key_out is not None:
        adata.uns["metadata"]["default_assay"] = layer_key_out
        adata.layers[layer_key_out] = S
    else:
        adata.uns["metadata"]["default_assay"] = None
        adata.X = S

    return adata if copy else None
|
| 219 |
+
|
| 220 |
+
def read_pathways(filename):
    """
    Read a ragged, tab-delimited gene-set file into a DataFrame.

    Rows may have different numbers of fields, so the file is scanned once to
    find the widest row and then parsed with that many integer-named columns
    (0, 1, 2, ...); shorter rows are padded with NaN by pandas.

    Parameters
    ----------
    filename : str or path-like
        Path to the tab-delimited file (no header row).

    Returns
    -------
    pandas.DataFrame
        One row per line of the file. An empty file yields an empty frame.
    """
    # Stream the file instead of materialising every line just to count tabs.
    with open(filename, 'r') as handle:
        n_columns = max((line.count("\t") + 1 for line in handle), default=0)

    if n_columns == 0:
        return pd.DataFrame()

    return pd.read_csv(filename, header=None, delimiter="\t", names=list(range(n_columns)))
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def filter_expressed_genes_by_celltype(adata: AnnData,
                                       threshold: float=0.05,
                                       filter_genes_from: str='singlecell',
                                       subject_id: str='Subject'):
    """
    Select, for every cell type, the genes that pass an expression threshold.

    Parameters:
    -----------
    adata : AnnData object
        Annotated data matrix with a ``cell_type`` column in ``.obs`` and a
        ``counts`` layer.
        NOTE(review): the previous docstring said rows are genes and columns
        are cells; AnnData normally stores cells as rows -- confirm.
    threshold : float, optional (default=0.05)
        For ``'singlecell'``: fraction of the cell type's cells a gene must be
        detected in (converted to a cell count with ``round``). For
        ``'pseudobulk'``: passed as ``min_prop`` to ``dc.filter_by_prop``.
    filter_genes_from: str, optional (default=`singlecell`)
        ``'singlecell'`` filters on the single-cell counts; ``'pseudobulk'``
        first sums counts per subject and cell type and filters on that.
    subject_id (str): column of ``.obs`` holding the individual identifiers
        (only used for ``'pseudobulk'``).

    Returns:
    --------
    expressed_genes_per_celltype : pandas DataFrame
        One column per cell type listing the retained gene names; shorter
        columns are padded with missing values.

    Raises:
    -------
    ValueError
        If ``filter_genes_from`` is neither ``'singlecell'`` nor
        ``'pseudobulk'`` (previously this silently returned an empty frame).
    """
    if filter_genes_from not in ('pseudobulk', 'singlecell'):
        raise ValueError(
            "filter_genes_from must be 'singlecell' or 'pseudobulk', got %r" % (filter_genes_from,)
        )

    expressed_genes_per_celltype = {}

    if filter_genes_from == 'pseudobulk':
        # Get pseudo-bulk profile
        adata = dc.get_pseudobulk(adata,
                                  sample_col=subject_id,
                                  groups_col='cell_type',
                                  layer='counts',
                                  mode='sum',
                                  min_cells=0,
                                  min_counts=0
                                  )
        for cell_type in adata.obs.cell_type.unique():
            expressed_genes_per_celltype[cell_type] = dc.filter_by_prop(
                adata[adata.obs['cell_type'] == cell_type],
                min_prop=threshold)
    else:
        for cell_type in adata.obs.cell_type.unique():
            # Slice once and reuse the view for both the size and the counts.
            subset = adata[adata.obs['cell_type'] == cell_type]

            # Minimum number of cells a gene must be detected in.
            num_cells = round(threshold * len(subset))

            gene_mask, _ = sc.pp.filter_genes(subset.layers['counts'],
                                              min_cells=num_cells, inplace=False)
            expressed_genes_per_celltype[cell_type] = list(adata.var_names[gene_mask])

    # Columns of unequal length are padded when building the frame.
    return pd.DataFrame.from_dict(expressed_genes_per_celltype, orient='index').transpose()
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def filter_lowly_exp_genes(expressed: pd.DataFrame,
                           all_paths: pd.DataFrame,
                           threshold: float = 0.33):
    """
    Keep, per cell type, the pathways with enough expressed member genes.

    Parameters:
    -----------
    expressed: pandas.DataFrame
        Expressed genes with cell types as columns and gene IDs as values;
        missing-value padding is ignored.
    all_paths: pandas.DataFrame
        Gene sets with pathways as columns and gene IDs as values;
        missing-value padding and repeated genes are ignored.
    threshold: float, optional (default=0.33)
        A pathway is kept for a cell type when the fraction of its genes that
        are expressed in that cell type is strictly greater than this value.

    Returns:
    --------
    gene_set_per_celltype: dict of pandas.DataFrame
        Cell type -> long-format frame with columns 'description', 'member'
        and 'name': one row per expressed member gene of every retained
        pathway. 'name' is the pathway column label and 'description' is the
        last space-separated token of that label. Cell types without any
        retained pathway map to an empty frame with the same columns.

    Notes:
    ------
    The proportion is computed over the real genes of a pathway only.
    Previously the padding of the ragged ``all_paths`` table was counted in
    the denominator (and could match padding in ``expressed``), so the result
    depended on the size of the largest pathway. Members are emitted in
    pathway order, which makes the output deterministic.
    """
    columns = ['description', 'member', 'name']
    gene_set_per_celltype = {}

    for cell_type in expressed.columns:
        expressed_genes = set(expressed[cell_type].dropna())

        frames = []
        for pathway in all_paths.columns:
            members = all_paths[pathway].dropna().drop_duplicates()
            if members.empty:
                continue

            hits = members[members.isin(expressed_genes)]
            if hits.empty or len(hits) / len(members) <= threshold:
                continue

            frames.append(pd.DataFrame({
                'description': str(pathway).split(" ")[-1],
                'member': hits.to_numpy(),
                'name': pathway,
            }))

        # Concatenate once per cell type instead of growing a frame in a loop.
        if frames:
            gene_set_per_celltype[cell_type] = pd.concat(frames, ignore_index=True)
        else:
            gene_set_per_celltype[cell_type] = pd.DataFrame(columns=columns)

    return gene_set_per_celltype
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def get_ind_level_ave(adata: AnnData, subject_id: str = 'Subject', method: str = "agg_x_num",
                      expressed_genes_per_celltype: Optional[dict] = None, filter_genes_at_threshold: bool = True):
    """
    Get per-individual expression profiles for every cell type.

    Args:

        adata (AnnData): object with a ``cell_type`` column and the
            ``subject_id`` column in ``.obs``. ``'agg_x_norm'`` reads the
            ``counts`` layer; ``'norm_x_agg'`` reads ``adata.X``
            (presumably already normalized/log-transformed -- TODO confirm).
        subject_id (str): column of ``.obs`` holding individual identifiers.
        method (str): ``'agg_x_norm'`` sums counts per individual and then
            normalizes with ``normalize_actionet``; ``'norm_x_agg'`` averages
            ``adata.X`` over the cells of each individual. The historical
            default spelling ``'agg_x_num'`` is accepted as an alias of
            ``'agg_x_norm'`` (NOTE(review): assumed to be a typo -- confirm).
        expressed_genes_per_celltype: mapping (dict or DataFrame) from cell
            type to the genes to keep; missing-value padding is ignored.
            Required when ``filter_genes_at_threshold`` is true.
        filter_genes_at_threshold (bool): restrict every cell type to the
            genes listed in ``expressed_genes_per_celltype``.

    Returns:

        Dictionary: cell type -> DataFrame (genes x individuals).

    Raises:

        ValueError: if ``method`` is not a supported value.
        KeyError: if filtering is requested for a cell type that is missing
            from ``expressed_genes_per_celltype``.
    """
    # No branch ever matched the default value, so a default call used to die
    # with UnboundLocalError at the return statement.
    if method == "agg_x_num":
        method = "agg_x_norm"
    if method not in ("agg_x_norm", "norm_x_agg"):
        raise ValueError("method must be 'agg_x_norm' or 'norm_x_agg', got %r" % (method,))

    if expressed_genes_per_celltype is None:
        expressed_genes_per_celltype = {}

    def _genes_to_keep(cell_type):
        # Gene list for one cell type without the padding of ragged tables.
        return pd.Series(expressed_genes_per_celltype[cell_type]).dropna()

    avs_logcounts_cellxind = {}

    if method == "agg_x_norm":
        for cell_type in adata.obs.cell_type.unique():
            # Always restrict to the current cell type. Without gene filtering
            # the whole object used to be pseudobulked here, so every entry
            # mixed the profiles of all cell types.
            adata_temp = adata[adata.obs.cell_type == cell_type].copy()
            if filter_genes_at_threshold:
                adata_temp = adata_temp[:, adata_temp.var_names.isin(_genes_to_keep(cell_type))]

            # Get pseudo-bulk profile
            pdata = dc.get_pseudobulk(adata_temp, sample_col=subject_id, groups_col='cell_type',
                                      layer='counts', mode='sum', min_cells=0, min_counts=0)

            # Normalize and log transform
            pdata.layers['counts'] = pdata.X
            pdata = normalize_actionet(pdata, layer_key='counts', layer_key_out=None,
                                       top_features_frac=1.0, scale_factor="median",
                                       transformation="log", anchor_features=None, copy=True)

            avs_logcounts_cellxind[cell_type] = pd.DataFrame(pdata.X.T, columns=pdata.obs[subject_id],
                                                             index=pdata.var_names)
            del adata_temp, pdata

        return avs_logcounts_cellxind

    # method == 'norm_x_agg'
    # Cell type and individual are kept as separate vectors: joining them into
    # "celltype_individual" strings and splitting on "_" corrupts any label
    # that itself contains an underscore.
    cell_types = adata.obs['cell_type'].astype(str).to_numpy()
    individuals = adata.obs[subject_id].astype(str).to_numpy()
    X = adata.X

    for cell_type in np.unique(cell_types):
        rows = np.flatnonzero(cell_types == cell_type)

        # Cell x individual indicator matrix whose rows are in the same order
        # as X[rows]. (A pivot_table keyed by cell name sorts its rows, which
        # misaligned the indicator matrix with X.)
        membership = pd.get_dummies(pd.Series(individuals[rows])).astype(float)
        indicator = membership.to_numpy()

        summed = np.asarray(X[rows].T @ indicator)
        ncells = indicator.sum(axis=0)
        df = pd.DataFrame(summed / ncells, index=adata.var_names,
                          columns=membership.columns.to_numpy())

        if filter_genes_at_threshold:
            df = df.loc[df.index.isin(_genes_to_keep(cell_type))]

        avs_logcounts_cellxind[cell_type] = df

    return avs_logcounts_cellxind
|
| 501 |
+
|
| 502 |
+
|
| 503 |
+
def plot_and_select_top_deps(all_pathways: pd.DataFrame(),
|
| 504 |
+
list_of_paths_to_annotate: list = [],
|
| 505 |
+
save_name='cell_type_specific',
|
| 506 |
+
save_prefix: str = 'mathys_pfc',
|
| 507 |
+
filter: bool=False,
|
| 508 |
+
cell_type_specific: bool = True,
|
| 509 |
+
test_name: str = ''):
|
| 510 |
+
|
| 511 |
+
if cell_type_specific:
|
| 512 |
+
# Plot certain cell_type specific pathways
|
| 513 |
+
collated_df = pd.DataFrame(all_pathways.groupby(all_pathways.index).agg({'score_adj': list, 'celltype': list,
|
| 514 |
+
'logFC': list, 'P.Value': list, 'shortened': list, 'highlight': list}))
|
| 515 |
+
# filter pathways only expressed in one cell type
|
| 516 |
+
mask = collated_df["celltype"].apply(len) == 1
|
| 517 |
+
df = collated_df[mask]
|
| 518 |
+
|
| 519 |
+
# create pathway by cell type pivot table
|
| 520 |
+
scores_table = pd.pivot_table(all_pathways, values='score_adj', index='pathway', columns='celltype')
|
| 521 |
+
scores_table = scores_table.loc[df.index]
|
| 522 |
+
scores_table['shortened'] = df.shortened.apply(lambda x: x[0])
|
| 523 |
+
scores_table['highlight'] = df.highlight.apply(lambda x: x[0])
|
| 524 |
+
scores_table.sort_values(by=[cell_type for cell_type in all_pathways.celltype.unique()], inplace=True)
|
| 525 |
+
|
| 526 |
+
# drop pathways with same shortened names ??
|
| 527 |
+
scores_table = scores_table.drop_duplicates(subset='shortened', keep='first')
|
| 528 |
+
|
| 529 |
+
###### Plot Cell type specific data
|
| 530 |
+
|
| 531 |
+
if filter:
|
| 532 |
+
xticks = ['Excitatory', 'Inhibitory', 'Astrocyte', 'Oligodendrocyte', 'OPC', 'Microglia', 'Endothelial']
|
| 533 |
+
|
| 534 |
+
# select only pathways that should be visualized
|
| 535 |
+
shortened_names = scores_table[scores_table.shortened.isin(list_of_paths_to_annotate)]['shortened']
|
| 536 |
+
scores_table = scores_table[scores_table.shortened.isin(list_of_paths_to_annotate)]
|
| 537 |
+
|
| 538 |
+
n_rows = len(scores_table)
|
| 539 |
+
|
| 540 |
+
fig, ax1 = plt.subplots(1, 1, figsize=(0.5, n_rows*0.095), sharex=False, layout='constrained')
|
| 541 |
+
fig.tight_layout()
|
| 542 |
+
|
| 543 |
+
# order table by cell type name
|
| 544 |
+
# scores_table = scores_table.reindex(columns=['Excitatory', 'Inhibitory', 'Astrocyte', 'Oligodendrocyte',
|
| 545 |
+
# 'OPC', 'Microglia'])
|
| 546 |
+
scores_table = scores_table[xticks]
|
| 547 |
+
|
| 548 |
+
g1 = sb.heatmap(scores_table, cmap='bwr', center=0, vmin=-2.5, vmax=2.5, robust=False, annot=None, fmt='.1g',
|
| 549 |
+
linewidths=0.15, linecolor='black', annot_kws=None, cbar_kws={'shrink': 0.2},
|
| 550 |
+
cbar_ax=None, square=False,ax=ax1, xticklabels=xticks, yticklabels=shortened_names, mask=None,)
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
cax = g1.figure.axes[-1]
|
| 554 |
+
|
| 555 |
+
g1.set_title(f'Select Cell-type-specific Pathways in {test_name.split("_")[0]}- vs {test_name.split("_")[-1]}-pathology',
|
| 556 |
+
fontsize=3)
|
| 557 |
+
g1.set_ylabel('')
|
| 558 |
+
g1.set_xlabel('')
|
| 559 |
+
|
| 560 |
+
ax1.tick_params(axis='both', which='major', labelsize=4, length=1.5, width=0.5)
|
| 561 |
+
cax.tick_params(labelsize=4, length=1.5, width=0.5, which="major")
|
| 562 |
+
|
| 563 |
+
plt.tight_layout()
|
| 564 |
+
plt.savefig(f'results/{test_name}/{save_prefix}_filtered_{save_name}_diff_exp_paths.pdf', bbox_inches='tight')
|
| 565 |
+
plt.show(block=False)
|
| 566 |
+
|
| 567 |
+
else:
|
| 568 |
+
xticks = ['Excitatory', 'Inhibitory', 'Astrocyte', 'Oligodendrocyte', 'OPC', 'Microglia', 'Endothelial']
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
scores_table = scores_table[scores_table.shortened!='None']
|
| 572 |
+
yticklabels = scores_table['shortened']
|
| 573 |
+
# order table by cell type name
|
| 574 |
+
|
| 575 |
+
scores_table = scores_table[xticks]
|
| 576 |
+
|
| 577 |
+
n_rows = len(scores_table)
|
| 578 |
+
|
| 579 |
+
fig, ax1 = plt.subplots(1, 1, figsize=(0.5, n_rows*0.095), sharex=False, layout='constrained')
|
| 580 |
+
fig.tight_layout()
|
| 581 |
+
|
| 582 |
+
g1 = sb.heatmap(scores_table, cmap='bwr', center=0, vmin=-2.5, vmax=2.5, robust=False, annot=None, fmt='.1g',
|
| 583 |
+
linewidths=0.07, linecolor='black', annot_kws=None, cbar_kws={'shrink': 0.1},
|
| 584 |
+
cbar_ax=None, square=False, ax=ax1, xticklabels=xticks, yticklabels=yticklabels, mask=None,)
|
| 585 |
+
|
| 586 |
+
|
| 587 |
+
cax = g1.figure.axes[-1]
|
| 588 |
+
|
| 589 |
+
g1.set_title(f'All Cell-type-specific Pathways in {test_name.split("_")[0]}- vs {test_name.split("_")[-1]}-pathology',
|
| 590 |
+
fontsize=3)
|
| 591 |
+
g1.set_ylabel('')
|
| 592 |
+
g1.set_xlabel('')
|
| 593 |
+
|
| 594 |
+
ax1.tick_params(axis='both', which='major', labelsize=2, length=1.5, width=0.25)
|
| 595 |
+
cax.tick_params(labelsize=4, length=1.5, width=0.25, which="major")
|
| 596 |
+
|
| 597 |
+
plt.tight_layout()
|
| 598 |
+
#plt.savefig(f'../results/{test_name}/{save_prefix}_all_{save_name}_diff_exp_paths.pdf', bbox_inches='tight')
|
| 599 |
+
plt.savefig(f'results/{test_name}/{save_prefix}_all_{save_name}_diff_exp_paths.pdf', bbox_inches='tight')
|
| 600 |
+
plt.show(block=False)
|
| 601 |
+
|
| 602 |
+
|
| 603 |
+
else:
|
| 604 |
+
# Plot certain cell_type specific pathways
|
| 605 |
+
collated_df = pd.DataFrame(all_pathways.groupby(all_pathways.index).agg({'score_adj': list, 'celltype': list,
|
| 606 |
+
'logFC': list, 'P.Value': list, 'shortened': list, 'highlight': list}))
|
| 607 |
+
# filte pathways only expressed in one cell type
|
| 608 |
+
mask = collated_df["celltype"].apply(len) > 1
|
| 609 |
+
df = collated_df[mask]
|
| 610 |
+
|
| 611 |
+
# create pathway by cell type pivot table
|
| 612 |
+
scores_table = pd.pivot_table(all_pathways, values='score_adj', index='pathway', columns='celltype')
|
| 613 |
+
scores_table = scores_table.loc[df.index]
|
| 614 |
+
scores_table['shortened'] = df.shortened.apply(lambda x: x[0])
|
| 615 |
+
scores_table['highlight'] = df.highlight.apply(lambda x: x[0])
|
| 616 |
+
scores_table.sort_values(by=[cell_type for cell_type in all_pathways.celltype.unique()], inplace=True)
|
| 617 |
+
|
| 618 |
+
# drop pathways with same shortened names ??
|
| 619 |
+
scores_table = scores_table.drop_duplicates(subset='shortened', keep='first')
|
| 620 |
+
|
| 621 |
+
###### Plot Cell type specific data
|
| 622 |
+
|
| 623 |
+
if filter:
|
| 624 |
+
xticks = ['Excitatory', 'Inhibitory', 'Astrocyte', 'Oligodendrocyte', 'OPC', 'Microglia', 'Endothelial']
|
| 625 |
+
|
| 626 |
+
# select only pathways that should be visualized
|
| 627 |
+
shortened_names = scores_table[scores_table.shortened.isin(list_of_paths_to_annotate)]['shortened']
|
| 628 |
+
scores_table = scores_table[scores_table.shortened.isin(list_of_paths_to_annotate)]
|
| 629 |
+
|
| 630 |
+
# order table by cell type name
|
| 631 |
+
scores_table = scores_table[xticks]
|
| 632 |
+
|
| 633 |
+
n_rows = len(scores_table)
|
| 634 |
+
|
| 635 |
+
fig, ax1 = plt.subplots(1, 1, figsize=(0.5, n_rows*0.095), sharex=False, layout='constrained')
|
| 636 |
+
fig.tight_layout()
|
| 637 |
+
|
| 638 |
+
g1 = sb.heatmap(scores_table, cmap='bwr', center=0, vmin=-2.5, vmax=2.5, robust=False, annot=None, fmt='.1g',
|
| 639 |
+
linewidths=0.15, linecolor='black', annot_kws=None, cbar_kws={'shrink': 0.2},
|
| 640 |
+
cbar_ax=None, square=False,ax=ax1, xticklabels=xticks, yticklabels=shortened_names, mask=None,)
|
| 641 |
+
|
| 642 |
+
cax = g1.figure.axes[-1]
|
| 643 |
+
|
| 644 |
+
g1.set_title(f'Select Shared Pathways in {test_name.split("_")[0]}- vs {test_name.split("_")[-1]}-pathology', fontsize=3)
|
| 645 |
+
g1.set_ylabel('')
|
| 646 |
+
g1.set_xlabel('')
|
| 647 |
+
|
| 648 |
+
ax1.tick_params(axis='both', which='major', labelsize=4, length=1.5, width=0.5)
|
| 649 |
+
cax.tick_params(labelsize=4, length=1.5, width=0.5, which="major")
|
| 650 |
+
|
| 651 |
+
plt.tight_layout()
|
| 652 |
+
plt.savefig(f'results/{test_name}/{save_prefix}_filtered_{save_name}_diff_exp_paths.pdf', bbox_inches='tight')
|
| 653 |
+
plt.show(block=False)
|
| 654 |
+
|
| 655 |
+
else:
|
| 656 |
+
xticks = ['Excitatory', 'Inhibitory', 'Astrocyte', 'Oligodendrocyte', 'OPC', 'Microglia', 'Endothelial']
|
| 657 |
+
|
| 658 |
+
scores_table = scores_table[scores_table.shortened!='None']
|
| 659 |
+
yticklabels = scores_table['shortened']
|
| 660 |
+
# order table by cell type name
|
| 661 |
+
|
| 662 |
+
scores_table = scores_table[xticks]
|
| 663 |
+
|
| 664 |
+
n_rows = len(scores_table)
|
| 665 |
+
|
| 666 |
+
fig, ax1 = plt.subplots(1, 1, figsize=(0.5, n_rows*0.095), sharex=False, layout='constrained')
|
| 667 |
+
fig.tight_layout()
|
| 668 |
+
|
| 669 |
+
g1 = sb.heatmap(scores_table, cmap='bwr', center=0, vmin=-2.5, vmax=2.5, robust=False, annot=None, fmt='.1g',
|
| 670 |
+
linewidths=0.07, linecolor='black', annot_kws=None, cbar_kws={'shrink': 0.1},
|
| 671 |
+
cbar_ax=None, square=False, ax=ax1, xticklabels=xticks, yticklabels=yticklabels, mask=None,)
|
| 672 |
+
|
| 673 |
+
cax = g1.figure.axes[-1]
|
| 674 |
+
|
| 675 |
+
g1.set_title(f'All Broad Pathways in {test_name.split("_")[0]}- vs {test_name.split("_")[-1]}-pathology', fontsize=3)
|
| 676 |
+
g1.set_ylabel('')
|
| 677 |
+
g1.set_xlabel('')
|
| 678 |
+
|
| 679 |
+
ax1.tick_params(axis='both', which='major', labelsize=2, length=1.5, width=0.25)
|
| 680 |
+
cax.tick_params(labelsize=4, length=1.5, width=0.25, which="major")
|
| 681 |
+
|
| 682 |
+
plt.tight_layout()
|
| 683 |
+
plt.savefig(f'results/{test_name}/{save_prefix}_all_{save_name}_diff_exp_paths.pdf', bbox_inches='tight')
|
| 684 |
+
plt.show(block=False)
|
| 685 |
+
|
| 686 |
+
return
|
| 687 |
+
|
| 688 |
+
|
| 689 |
+
def multi_study_pathway_overlap(pathway_scores: "dict | None" = None,
                                filtered_pathways: "list | None" = None,
                                cell_types: "list | None" = None,
                                test_name: str = 'ad_vs_no',
                                top_n: int = 10,
                                pathways: "list | None" = None,
                                filter: bool = False,
                                save_suffix: str = 'ad_vs_no',
                                method: str = 'cell_type_overlap'):
    """
    Plot one heatmap panel per study of the pathways shared by all studies.

    Each panel shows the 'score_adj' values (rows: pathways, columns: cell types) of the
    pathways that overlap across every study in `pathway_scores`. The figure is written to
    'results/{test_name}/multi_study_pathway_overlap_{filtered|all}.pdf' and shown without
    blocking.

    Parameters:
    -----------
    pathway_scores : dict
        Mapping study name -> mapping test name -> DataFrame. Each DataFrame is read through
        its 'pathway', 'celltype' and 'score_adj' columns. The mapping and its frames are
        not modified.
    filtered_pathways : list, optional
        Pathway names to keep when `filter` is True. Default is no pathways.
    cell_types : list, optional
        Cell types (heatmap columns, in this order). Default is ["Excitatory", "Inhibitory",
        "Astrocyte", "Microglia", "Oligodendrocyte", "OPC", "Endothelial"].
    test_name : str, optional
        Key selecting the comparison inside each study and the output sub-directory.
        Default is 'ad_vs_no'.
    top_n : int, optional
        Currently unused; kept so existing calls keep working.
    pathways : list, optional
        Currently unused; kept so existing calls keep working.
    filter : bool, optional
        If True, only overlapping pathways that are also in `filtered_pathways` are plotted
        and returned. Default is False.
    save_suffix : str, optional
        Currently unused; kept so existing calls keep working.
    method : str, optional
        'cell_type_overlap': a pathway is kept when, for at least one cell type, it is present
        for that cell type in every study. 'global_overlap': a pathway is kept when it is
        present in every study, regardless of cell type. Default is 'cell_type_overlap'.

    Returns:
    --------
    filtered_scores : dict
        Mapping study name -> pathway-by-cell-type DataFrame of 'score_adj' values restricted
        to the plotted pathways.

    Raises:
    -------
    ValueError
        If `method` is not one of the two supported values, if `pathway_scores` is empty, or
        if no pathway is left to plot.

    Examples:
    ---------
    >>> multi_study_pathway_overlap(pathway_scores, filtered_pathways=['pathway1', 'pathway2'],
                                    cell_types=['Excitatory', 'Astrocyte'], test_name='ad_vs_no', filter=True)
    """
    import os  # local import: only needed to create the output directory

    if method not in ('cell_type_overlap', 'global_overlap'):
        raise ValueError(f"Unknown method {method!r}; expected 'cell_type_overlap' or 'global_overlap'.")
    if not pathway_scores:
        raise ValueError("pathway_scores is empty; at least one study is required.")

    if cell_types is None:
        cell_types = ["Excitatory", "Inhibitory", "Astrocyte",
                      "Microglia", "Oligodendrocyte", "OPC", "Endothelial"]
    else:
        cell_types = list(cell_types)
    if filtered_pathways is None:
        filtered_pathways = []

    studies = list(pathway_scores)

    # Restrict every study to the requested cell types using local frames, so the
    # caller's dictionary is left untouched.
    study_tables = {}
    for study in studies:
        table = pathway_scores[study][test_name]
        study_tables[study] = table[table.celltype.isin(cell_types)]

    # Plain set intersections replace the previous eval()-built expression. A set also
    # removes the duplicates that arise when a pathway overlaps in several cell types.
    if method == 'cell_type_overlap':
        overlap = set()
        for cell_type in cell_types:
            per_study = [set(table[table.celltype == cell_type].pathway)
                         for table in study_tables.values()]
            overlap |= set.intersection(*per_study)
    else:
        overlap = set.intersection(*(set(table.pathway) for table in study_tables.values()))

    if filter:
        n_rows = len(set(filtered_pathways) & overlap)
    else:
        n_rows = len(overlap)
    if n_rows == 0:
        raise ValueError("No overlapping pathways to plot for the given studies and settings.")

    # One panel per study; the last panel is slightly wider because it hosts the colorbar.
    # For three studies this reproduces the former fixed layout (width 3.5, ratios 0.85/0.85/1).
    n_studies = len(studies)
    fig, axs = plt.subplots(1, n_studies, figsize=(3.5 * n_studies / 3, n_rows*0.095),
                            gridspec_kw={'width_ratios': [0.85] * (n_studies - 1) + [1]},
                            sharex=False, sharey=True, layout='constrained', squeeze=False)
    axs = axs[0]
    # NOTE(review): tight_layout() on a figure created with layout='constrained' appears to
    # replace the constrained engine; the call is kept to preserve the existing output.
    fig.tight_layout()

    filtered_scores = {}

    for i, study in enumerate(studies):
        table = study_tables[study]
        scores = pd.pivot_table(table[table.pathway.isin(overlap)], values='score_adj',
                                index='pathway', columns='celltype')
        # reindex keeps the requested column order and yields an empty (NaN) column instead
        # of a KeyError when a cell type has no score in this study.
        scores = scores.reindex(columns=cell_types)

        if filter:
            scores = scores.loc[scores.index.isin(filtered_pathways)]

        filtered_scores[study] = scores

        # Row labels: pathway name with its last space-separated token removed.
        row_labels = [' '.join(name.split(" ")[:-1]) for name in scores.index]

        # Only the last panel carries the colorbar.
        cbar = i == n_studies - 1
        g1 = sb.heatmap(scores, cmap='bwr', center=0, vmin=-2.5, vmax=2.5, robust=False, annot=None, fmt='.1g',
                        linewidths=0.015, linecolor='black', annot_kws=None, cbar_kws={'shrink': 0.2}, cbar=cbar,
                        cbar_ax=None, square=False, ax=axs[i], xticklabels=cell_types, yticklabels=row_labels, mask=None,)

        axs[i].tick_params(axis='both', which='major', labelsize=2.5, length=1.5, width=0.5)

        g1.set_title(study.split('_')[-1].upper(), fontsize=3)
        g1.set_ylabel('', fontsize=4)
        g1.set_xlabel('')

        if cbar:
            # The colorbar axes is the one most recently added to the figure.
            cax = g1.figure.axes[-1]
            cax.tick_params(labelsize=4, length=1.5, width=0.5, which="major")

    plt.suptitle(f"{test_name.split('_')[0].capitalize()}- vs {test_name.split('_')[-1]}-pathology", fontsize=4)

    # Make sure the target directory exists before writing the PDF.
    os.makedirs(f'results/{test_name}', exist_ok=True)
    if filter:
        plt.savefig(f'results/{test_name}/multi_study_pathway_overlap_filtered.pdf', bbox_inches='tight')
    else:
        plt.savefig(f'results/{test_name}/multi_study_pathway_overlap_all.pdf', bbox_inches='tight')
    plt.show(block=False)

    return filtered_scores
|
| 816 |
+
|
| 817 |
+
|
| 818 |
+
def save_plot(fig, ax, save):
    """
    Write `fig` to the path `save`, if a path was given.

    Parameters
    ----------
    fig : Figure, None
        Object whose ``savefig`` method is called.
    ax : Axes, None
        Axes belonging to the plot; only checked for being present.
    save : str, None
        Destination path. If None, nothing happens.

    Raises
    ------
    ValueError
        If `save` is given but `ax` is None, or `ax` is given but `fig` is None.
    """
    # Nothing requested: leave silently.
    if save is None:
        return

    # The axes check comes first, then the figure check.
    if ax is None:
        raise ValueError("ax is None, cannot save figure.")
    if fig is None:
        raise ValueError("fig is None, cannot save figure.")

    fig.savefig(save, bbox_inches='tight')
|
| 827 |
+
|
| 828 |
+
|
| 829 |
+
def check_if_matplotlib(return_mpl=False):
    """
    Import matplotlib lazily and return the requested module.

    Parameters
    ----------
    return_mpl : bool
        If False (default), return ``matplotlib.pyplot``; if True, return the top-level
        ``matplotlib`` package.

    Returns
    -------
    module
        ``matplotlib.pyplot`` or ``matplotlib``.

    Raises
    ------
    ImportError
        If the import fails, with an installation hint; the original error is chained.
    """
    try:
        if return_mpl:
            import matplotlib as module
        else:
            import matplotlib.pyplot as module
    except ImportError as err:
        # Only import failures are translated; any other error propagates unchanged
        # instead of being mislabelled as "not installed".
        raise ImportError('matplotlib is not installed. Please install it with: pip install matplotlib') from err
    return module
|
| 842 |
+
|
| 843 |
+
|
| 844 |
+
def check_if_seaborn():
    """
    Import seaborn lazily and return the module.

    Returns
    -------
    module
        The imported ``seaborn`` package.

    Raises
    ------
    ImportError
        If the import fails, with an installation hint; the original error is chained.
    """
    try:
        import seaborn as sns
    except ImportError as err:
        # Only import failures are translated; other errors propagate unchanged.
        raise ImportError('seaborn is not installed. Please install it with: pip install seaborn') from err
    return sns
|
| 850 |
+
|
| 851 |
+
|
| 852 |
+
def check_if_adjustText():
    """
    Import adjustText lazily and return the module.

    Returns
    -------
    module
        The imported ``adjustText`` package.

    Raises
    ------
    ImportError
        If the import fails, with an installation hint; the original error is chained.
    """
    try:
        import adjustText as at
    except ImportError as err:
        # Only import failures are translated; other errors propagate unchanged.
        raise ImportError('adjustText is not installed. Please install it with: pip install adjustText') from err
    return at
|
| 858 |
+
|
| 859 |
+
|
| 860 |
+
def filter_limits(df, sign_limit=None, lFCs_limit=None):
    """
    Keep only the rows of `df` that lie strictly inside the given limits.

    A row is kept when its 'pvals' value is below ``abs(sign_limit)`` and the absolute
    value of its 'logFCs' value is below ``abs(lFCs_limit)``.

    Parameters
    ----------
    df : pd.DataFrame
        Table with the columns 'pvals' and 'logFCs'.
    sign_limit : float, None
        Upper bound for 'pvals'. None means infinity.
    lFCs_limit : float, None
        Upper bound for the absolute 'logFCs'. None means infinity.

    Returns
    -------
    pd.DataFrame
        The rows of `df` that satisfy both bounds.
    """
    # A missing limit is treated as an infinite one.
    max_sign = np.inf if sign_limit is None else sign_limit
    max_lfcs = np.inf if lFCs_limit is None else lFCs_limit

    # Both comparisons are strict, so values equal to a limit are excluded.
    keep = (df['pvals'] < np.abs(max_sign)) & (np.abs(df['logFCs']) < np.abs(max_lfcs))

    return df.loc[keep]
|
| 892 |
+
|
| 893 |
+
|
| 894 |
+
def plot_volcano(data, x, y, x_label=None, y_label='-log10(pvals)', annotate=True,
                 annot_by='top', names=None,
                 top=5, sign_thr=0.05, lFCs_thr=0.5, sign_limit=None, lFCs_limit=None,
                 figsize=(7, 5), dpi=100, ax=None, return_fig=False, save=None,
                 fontsizes=None):
    """
    Plot logFC and p-values from a long formatted data-frame.

    Parameters
    ----------
    data : pd.DataFrame
        Results of DEA in long format. Its index values are used as annotation labels.
    x : str
        Column name of data storing the logFCs.
    y : str
        Column name of data storing the p-values.
    x_label : str, None
        Alternate name for logFC to be included in plot. If None, defaults to x.
    y_label : str, None
        Alternate name for p-values to be included in plot. If None, defaults to y.
    annotate : bool
        Whether to annotate labels.
    annot_by : str
        Determines which significant features are annotated. If 'top', the `top` features
        with the largest -log10(p-value) are annotated. If 'name', only the significant
        features whose index is in `names` are annotated. Any other value annotates every
        significant feature.
    names : list, None
        Feature names to be annotated. Only used if annot_by is 'name'. None means empty.
    top : int
        Number of top differentially expressed features to show.
    sign_thr : float
        Significance threshold for p-values (transformed internally to -log10).
    lFCs_thr : float
        Significance threshold for logFCs.
    sign_limit : float
        Limit of p-values to plot in -log10.
    lFCs_limit : float
        Limit of logFCs to plot in absolute value.
    figsize : tuple
        Figure size.
    dpi : int
        DPI resolution of figure.
    ax : Axes, None
        A matplotlib axes object. If None, a new figure is created.
    return_fig : bool
        Whether to return a Figure object or not.
    save : str, None
        Path to where to save the plot. Only a figure created by this function (ax=None)
        is saved; with a user supplied `ax`, save_plot raises ValueError.
    fontsizes : dict, None
        Font sizes; the key 'on_plot' sets the size of the annotation text. None, or a
        missing key, means 4.

    Returns
    -------
    fig : Figure, None
        If return_fig, returns the Figure created by this function (None when `ax` was
        supplied by the caller).
    """

    if x_label is None:
        x_label = x

    if y_label is None:
        y_label = y

    # Avoid mutable default arguments.
    if names is None:
        names = []
    if fontsizes is None:
        fontsizes = {"on_plot": 4}
    label_size = fontsizes.get('on_plot', 4)

    # Load plotting packages
    plt = check_if_matplotlib()
    at = check_if_adjustText()

    # Transform sign_thr
    sign_thr = -np.log10(sign_thr)

    # Extract df
    df = data.copy()
    df['logFCs'] = df[x]
    df['pvals'] = -np.log10(df[y])

    # Filter by limits; copy so the column assignments below act on an independent frame
    df = filter_limits(df, sign_limit=sign_limit, lFCs_limit=lFCs_limit).copy()

    # Define color by up or down regulation and significance
    df['weight'] = 'gray'
    up_msk = (df['logFCs'] >= lFCs_thr) & (df['pvals'] >= sign_thr)
    dw_msk = (df['logFCs'] <= -lFCs_thr) & (df['pvals'] >= sign_thr)
    df.loc[up_msk, 'weight'] = '#D62728'
    df.loc[dw_msk, 'weight'] = '#1F77B4'

    # Plot
    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)

    n = df.shape[0]
    if n > 0:
        # Marker size shrinks with the number of points; skipped for an empty table,
        # which previously raised ZeroDivisionError.
        size = 120000 / (100*n)
        df.plot.scatter(x='logFCs', y='pvals', c='weight', sharex=False, ax=ax, s=size)

    # Draw sign lines
    ax.axhline(y=sign_thr, linestyle='--', color="black")
    ax.axvline(x=lFCs_thr, linestyle='--', color="black")
    ax.axvline(x=-lFCs_thr, linestyle='--', color="black")

    # Significant features, most significant first
    signs = df[up_msk | dw_msk].sort_values('pvals', ascending=False)

    # Add labels
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)

    if annotate:
        if annot_by == 'top':
            signs = signs.iloc[:top]
        elif annot_by == 'name':
            signs = signs.loc[signs.index.isin(names)]

        texts = []
        # Distinct loop names so the `x` / `y` parameters are not shadowed.
        for lfc, pval, label in zip(signs['logFCs'], signs['pvals'], signs.index):
            texts.append(ax.text(lfc, pval, label, fontsize=label_size))
        if len(texts) > 0:
            at.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'), ax=ax)

    save_plot(fig, ax, save)

    if return_fig:
        return fig
|
functions/requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
scanpy
|
| 5 |
+
matplotlib
|
| 6 |
+
decoupler
|
| 7 |
+
scipy
|
| 8 |
+
anndata
|
| 9 |
+
typing
|