Spaces:

oncomark
/

ai

Running

App Files Files Community

ai / ulm.py

iamspriyadarshi

Upload 13 files

11767f5 verified about 1 year ago

raw

history blame contribute delete

3.96 kB

	"""
	Method ULM.
	Code to run the Univariate Linear Model (ULM) method.
	"""

	import numpy as np
	import pandas as pd
	from scipy.sparse import csr_matrix

	from scipy.stats import t

	from pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data

	from tqdm.auto import tqdm


	def mat_cov(A, b):
	    """
	    Sample covariance between every column of `b` and every column of `A`.

	    Parameters
	    ----------
	    A : ndarray
	        2-D array whose columns are the variables to compare against
	        (`ulm` passes the network matrix, features x feature sets).
	    b : ndarray
	        Array sharing its first axis with `A` (`ulm` passes the transposed
	        data, features x samples).

	    Returns
	    -------
	    ndarray
	        Covariances with one row per column of `b` and one column per
	        column of `A`, normalised by `b.shape[0] - 1`.
	    """
	    # Centre each column of A on its own mean.
	    centered_A = A - A.mean(axis=0)

	    # b is shifted by its global mean only; because every column of
	    # centered_A sums to zero, any constant shift of b cancels in the product.
	    shifted_b = b.T - b.mean()

	    return np.dot(shifted_b, centered_A) / (b.shape[0] - 1)


	def mat_cor(A, b):
	    """
	    Pearson correlation between every column of `b` and every column of `A`.

	    Parameters
	    ----------
	    A : ndarray
	        2-D array whose columns are the variables to compare against.
	    b : ndarray
	        Array sharing its first axis with `A`.

	    Returns
	    -------
	    ndarray
	        Covariance from `mat_cov` divided by the product of the sample
	        standard deviations (ddof=1); one row per column of `b`, one column
	        per column of `A`.
	    """
	    # Sample standard deviations along the shared first axis.
	    sd_A = np.std(A, axis=0, ddof=1)
	    # Column vector so the product broadcasts to (columns of b, columns of A).
	    sd_b = np.std(b, axis=0, ddof=1).reshape(-1, 1)

	    return mat_cov(A, b) / (sd_A * sd_b)


	def t_val(r, df):
	    """
	    Convert correlation coefficients into t-statistics.

	    Computes `r * sqrt(df / ((1 - r) * (1 + r)))`, with a 1e-16 offset added
	    to each factor so that `r == 1` or `r == -1` does not divide by zero.

	    Parameters
	    ----------
	    r : float or ndarray
	        Correlation coefficient(s).
	    df : int or float
	        Degrees of freedom.

	    Returns
	    -------
	    float or ndarray
	        t-statistic(s), same shape as `r`.
	    """
	    # Floating-point rounding can push a correlation a few ulps beyond +/-1,
	    # which would make one factor negative and the square root NaN.
	    # Clamping to the valid range keeps the result finite; NaNs in `r`
	    # still propagate unchanged.
	    r = np.clip(r, -1.0, 1.0)

	    return r * np.sqrt(df / ((1.0 - r + 1.0e-16) * (1.0 + r + 1.0e-16)))


	def ulm(mat, net, batch_size=10000, verbose=False):
	    """
	    Compute ULM t-values and two-sided p-values for every sample and feature set.

	    Parameters
	    ----------
	    mat : ndarray or csr_matrix
	        Data with one row per sample. A `csr_matrix` is densified and
	        processed in row batches; anything else is processed in one pass.
	    net : ndarray
	        2-D array of shape (n_features, n_fsets).
	    batch_size : int
	        Number of rows of `mat` per batch (used for `csr_matrix` input only).
	    verbose : bool
	        Whether to show a progress bar over the batches.

	    Returns
	    -------
	    es : ndarray
	        t-values. For `csr_matrix` input this is a float32 array of shape
	        (n_samples, n_fsets).
	    pv : ndarray
	        Two-sided p-values from a t distribution with `n_features - 2`
	        degrees of freedom.
	    """
	    n_obs = mat.shape[0]
	    n_feats, n_srcs = net.shape
	    dof = n_feats - 2

	    if not isinstance(mat, csr_matrix):
	        # Dense input: correlate all samples at once.
	        scores = t_val(mat_cor(net, mat.T), dof)
	    else:
	        # Sparse input: densify a slice of rows at a time to bound memory.
	        scores = np.zeros((n_obs, n_srcs), dtype=np.float32)
	        total_batches = int(np.ceil(n_obs / batch_size))
	        for k in tqdm(range(total_batches), disable=not verbose):
	            lo = k * batch_size
	            hi = lo + batch_size

	            # Features x samples layout expected by mat_cor.
	            dense_chunk = mat[lo:hi].toarray().T

	            scores[lo:hi] = t_val(mat_cor(net, dense_chunk), dof)

	    # Two-sided p-value: twice the upper-tail probability of |t|.
	    pvalues = t.sf(abs(scores), dof) * 2

	    return scores, pvalues


	def run_ulm(mat, net, source='source', target='target', weight='weight', batch_size=10000,
	            min_n=5, verbose=False, use_raw=True):
	    """
	    Univariate Linear Model (ULM).

	    For each sample and each regulator, ULM fits a linear model in which the
	    observed molecular readouts in `mat` are the response variable and the
	    regulator weights in `net` are the explanatory variable. Target features
	    with no associated weight are set to zero. The t-value of the fitted
	    model is the activity (`ulm_estimate`) of the regulator.

	    Parameters
	    ----------
	    mat : list, DataFrame or AnnData
	        List of [features, matrix], dataframe (samples x features) or an
	        AnnData instance.
	    net : DataFrame
	        Network in long format.
	    source : str
	        Column name in net with source nodes.
	    target : str
	        Column name in net with target nodes.
	    weight : str
	        Column name in net with weights.
	    batch_size : int
	        Number of samples per batch. Larger values consume more memory but
	        run faster.
	    min_n : int
	        Minimum number of targets per source. Sources with fewer targets are
	        removed.
	    verbose : bool
	        Whether to show progress.
	    use_raw : bool
	        Use the raw attribute of mat if present.

	    Returns
	    -------
	    estimate : DataFrame
	        ULM scores. Stored in `.obsm['ulm_estimate']` if `mat` is AnnData.
	    pvals : DataFrame
	        Obtained p-values. Stored in `.obsm['ulm_pvals']` if `mat` is AnnData.
	    """
	    # Unpack the input into the data matrix, its row labels and its column labels.
	    data, row_labels, col_labels = extract(mat, use_raw=use_raw, verbose=verbose)

	    # Prepare the network with the helpers from `pre` (rename columns, apply
	    # the min_n filter, build the matrix form), then align it to the columns
	    # of the data. NOTE(review): helper behaviour is inferred from their
	    # names; their implementations are not visible in this file.
	    renamed = rename_net(net, source=source, target=target, weight=weight)
	    filtered = filt_min_n(col_labels, renamed, min_n=min_n)
	    sources, targets, net_mat = get_net_mat(filtered)
	    net_mat = match(col_labels, targets, net_mat)

	    if verbose:
	        print('Running ulm on mat with {0} samples and {1} targets for {2} sources.'.format(data.shape[0], len(col_labels), net_mat.shape[1]))

	    # Score every sample against every source.
	    scores, pvalues = ulm(data, net_mat, batch_size=batch_size, verbose=verbose)

	    # Wrap the raw arrays in labelled frames; the `name` attribute tags each
	    # result before it is handed to `return_data`.
	    estimate = pd.DataFrame(scores, index=row_labels, columns=sources)
	    pvals = pd.DataFrame(pvalues, index=row_labels, columns=sources)
	    estimate.name = 'ulm_estimate'
	    pvals.name = 'ulm_pvals'

	    return return_data(mat=mat, results=(estimate, pvals))