File size: 3,958 Bytes
11767f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
"""
Method ULM.
Code to run the Univariate Linear Model (ULM) method.
"""
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.stats import t
from pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data
from tqdm.auto import tqdm
def mat_cov(A, b):
return np.dot(b.T - b.mean(), A - A.mean(axis=0)) / (b.shape[0]-1)
def mat_cor(A, b):
cov = mat_cov(A, b)
ssd = np.std(A, axis=0, ddof=1) * np.std(b, axis=0, ddof=1).reshape(-1, 1)
return cov / ssd
def t_val(r, df):
return r * np.sqrt(df / ((1.0 - r + 1.0e-16)*(1.0 + r + 1.0e-16)))
def ulm(mat, net, batch_size=10000, verbose=False):
# Get dims
n_samples = mat.shape[0]
n_features, n_fsets = net.shape
df = n_features - 2
if isinstance(mat, csr_matrix):
n_batches = int(np.ceil(n_samples / batch_size))
es = np.zeros((n_samples, n_fsets), dtype=np.float32)
for i in tqdm(range(n_batches), disable=not verbose):
# Subset batch
srt, end = i * batch_size, i * batch_size + batch_size
batch = mat[srt:end].toarray().T
# Compute R for batch
r = mat_cor(net, batch)
# Compute t-value
es[srt:end] = t_val(r, df)
else:
# Compute R value for all
r = mat_cor(net, mat.T)
# Compute t-value
es = t_val(r, df)
# Compute p-value
pv = t.sf(abs(es), df) * 2
return es, pv
def run_ulm(mat, net, source='source', target='target', weight='weight', batch_size=10000,
min_n=5, verbose=False, use_raw=True):
"""
Univariate Linear Model (ULM).
ULM fits a linear model for each sample and regulator, where the observed molecular readouts in `mat` are the response
variable and the regulator weights in `net` are the explanatory one. Target features with no associated weight are set to
zero. The obtained t-value from the fitted model is the activity (`ulm_estimate`) of a given regulator.
Parameters
----------
mat : list, DataFrame or AnnData
List of [features, matrix], dataframe (samples x features) or an AnnData instance.
net : DataFrame
Network in long format.
source : str
Column name in net with source nodes.
target : str
Column name in net with target nodes.
weight : str
Column name in net with weights.
batch_size : int
Size of the samples to use for each batch. Increasing this will consume more memmory but it will run faster.
min_n : int
Minimum of targets per source. If less, sources are removed.
verbose : bool
Whether to show progress.
use_raw : bool
Use raw attribute of mat if present.
Returns
-------
estimate : DataFrame
ULM scores. Stored in `.obsm['ulm_estimate']` if `mat` is AnnData.
pvals : DataFrame
Obtained p-values. Stored in `.obsm['ulm_pvals']` if `mat` is AnnData.
"""
# Extract sparse matrix and array of genes
m, r, c = extract(mat, use_raw=use_raw, verbose=verbose)
# Transform net
net = rename_net(net, source=source, target=target, weight=weight)
net = filt_min_n(c, net, min_n=min_n)
sources, targets, net = get_net_mat(net)
# Match arrays
net = match(c, targets, net)
if verbose:
print('Running ulm on mat with {0} samples and {1} targets for {2} sources.'.format(m.shape[0], len(c), net.shape[1]))
# Run ULM
estimate, pvals = ulm(m, net, batch_size=batch_size, verbose=verbose)
# Transform to df
estimate = pd.DataFrame(estimate, index=r, columns=sources)
estimate.name = 'ulm_estimate'
pvals = pd.DataFrame(pvals, index=r, columns=sources)
pvals.name = 'ulm_pvals'
return return_data(mat=mat, results=(estimate, pvals))
|