Spaces:

oncomark
/

ai

Running

File size: 4,546 Bytes

11767f5

"""

Method MLM.

Code to run the Multivariate Linear Model (MLM) method.

"""

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data

from scipy import stats

from tqdm.auto import tqdm

import numba as nb


@nb.njit(nb.f4[:, :](nb.f4[:, :], nb.f4[:, :], nb.f4[:, :], nb.i8), parallel=True, cache=True)
def fit_mlm(X, y, inv, df):
    """
    Fit one least-squares model per column of `y` and return the coefficient t-values.

    Compiled by numba with an explicit signature: all three arrays must be 2-D float32 and `df` an int64.

    Parameters
    ----------
    X : ndarray of float32
        Design matrix; one row per observation, one column per covariate. The caller `mlm` prepends an intercept
        column before passing it in.
    y : ndarray of float32
        Response matrix; each column is fitted independently (the caller `mlm` passes one column per sample).
    inv : ndarray of float32
        Square matrix multiplied by the residual variance to obtain the coefficient variances. The caller `mlm` passes
        the inverse of ``X.T @ X``.
    df : int
        Divisor applied to the residual sums of squares (residual degrees of freedom).

    Returns
    -------
    t : ndarray of float32
        Array of shape ``(y.shape[1], X.shape[1])`` holding ``coef / se`` for every column of `y` and every column
        of `X`.

    Raises
    ------
    ValueError
        If the least-squares solver returns an empty residual array.
    """
    # NOTE(review): presumably a contiguous copy is made for the benefit of the lstsq call below - confirm.
    X = np.ascontiguousarray(X)
    n_samples = y.shape[1]
    n_fsets = X.shape[1]
    # Only the coefficients and the residual sums of squares are used; the remaining two outputs are discarded.
    coef, sse, _, _ = np.linalg.lstsq(X, y)
    # An empty residual array is treated as a failed fit (see the error message for the likely causes).
    if len(sse) == 0:
        raise ValueError("""Couldn\'t fit a multivariate linear model. This can happen because there are more sources

        (covariates) than unique targets (samples), or because the network\'s matrix rank is smaller than the number of

        sources.""")
    # Residual sum of squares -> residual variance estimate, one value per column of y.
    sse = sse / df
    se = np.zeros((n_samples, n_fsets), dtype=nb.f4)
    # Iterations are independent, so the loop is written with prange (the decorator sets parallel=True).
    for i in nb.prange(n_samples):
        # Standard errors: square root of the diagonal of (residual variance * inv).
        se[i] = np.sqrt(np.diag(sse[i] * inv))
    # coef is transposed so that its rows line up with the rows of se (one row per column of y).
    t = coef.T/se
    return t.astype(nb.f4)


def mlm(mat, net, batch_size=10000, verbose=False):
    """
    Compute MLM t-values and two-sided p-values for every sample in `mat`.

    An intercept column is prepended to `net`, a linear model is fitted per sample with `fit_mlm`, and the intercept's
    t-value is dropped from the result.

    Parameters
    ----------
    mat : csr_matrix or ndarray
        Samples x features matrix. A `csr_matrix` is densified and processed in row batches; anything else is
        processed in a single call. `fit_mlm` is compiled for float32 arrays, so float32 data is expected.
    net : ndarray
        Features x sources weight matrix (float32 expected, see above).
    batch_size : int
        Number of samples densified and fitted at once when `mat` is a `csr_matrix`.
    verbose : bool
        Whether to show a progress bar over the batches.

    Returns
    -------
    es : ndarray of float32
        t-values, shape (n_samples, n_sources).
    pvals : ndarray
        Two-sided p-values from a Student's t distribution with ``n_features - n_sources - 1`` degrees of freedom,
        same shape as `es`.

    Raises
    ------
    ValueError
        Propagated from `fit_mlm` when the model cannot be fitted.
    """

    # Get dims
    n_samples = mat.shape[0]
    n_features, n_fsets = net.shape

    # Add intercept to network
    net = np.column_stack((np.ones((n_features, ), dtype=np.float32), net))

    # Compute inv and df for lm
    inv = np.linalg.inv(np.dot(net.T, net))
    df = n_features - n_fsets - 1

    if isinstance(mat, csr_matrix):
        # Init empty acts
        n_batches = int(np.ceil(n_samples / batch_size))
        es = np.zeros((n_samples, n_fsets), dtype=np.float32)
        for i in tqdm(range(n_batches), disable=not verbose):

            # Subset batch (slicing past the last row is safe, the final batch is simply shorter)
            srt, end = i * batch_size, i * batch_size + batch_size
            y = mat[srt:end].toarray().T

            # Compute MLM for batch, dropping the intercept column
            es[srt:end] = fit_mlm(net, y, inv, df)[:, 1:]
    else:
        # Compute MLM for all, dropping the intercept column
        es = fit_mlm(net, mat.T, inv, df)[:, 1:]

    # Get two-sided p-values. The survival function is used instead of `1 - cdf` because the subtraction cancels
    # catastrophically for large |t| and would report exactly 0 for strongly significant activities.
    pvals = 2 * stats.t.sf(np.abs(es), df)

    return es, pvals


def run_mlm(mat, net, source='source', target='target', weight='weight', batch_size=10000,
            min_n=5, verbose=False, use_raw=True):
    """
    Multivariate Linear Model (MLM).

    For every sample, a single linear model is fitted in which the molecular readouts in `mat` are the response and
    the regulator weights in `net` are the covariates. Targets without a weight for a given regulator contribute a
    zero. The t-value of each regulator's coefficient is reported as its activity (`mlm_estimate`).

    Parameters
    ----------
    mat : list, DataFrame or AnnData
        List of [features, matrix], dataframe (samples x features) or an AnnData instance.
    net : DataFrame
        Network in long format.
    source : str
        Column name in net with source nodes.
    target : str
        Column name in net with target nodes.
    weight : str
        Column name in net with weights.
    batch_size : int
        Number of samples processed per batch. Larger values use more memory but run faster.
    min_n : int
        Minimum number of targets per source. Sources with fewer targets are removed.
    verbose : bool
        Whether to show progress.
    use_raw : bool
        Use raw attribute of mat if present.

    Returns
    -------
    estimate : DataFrame
        MLM scores. Stored in `.obsm['mlm_estimate']` if `mat` is AnnData.
    pvals : DataFrame
        Obtained p-values. Stored in `.obsm['mlm_pvals']` if `mat` is AnnData.
    """

    # Split the input into its data matrix, sample (row) labels and feature (column) labels
    data, sample_names, feature_names = extract(mat, use_raw=use_raw, verbose=verbose)

    # Standardise the network columns, drop sources with fewer than `min_n` targets, then convert to matrix form
    long_net = rename_net(net, source=source, target=target, weight=weight)
    long_net = filt_min_n(feature_names, long_net, min_n=min_n)
    sources, targets, adjacency = get_net_mat(long_net)

    # Align the network's targets with the features of the data
    adjacency = match(feature_names, targets, adjacency)

    if verbose:
        message = 'Running mlm on mat with {0} samples and {1} targets for {2} sources.'
        print(message.format(data.shape[0], len(feature_names), adjacency.shape[1]))

    # Fit the models
    scores, pvalues = mlm(data, adjacency, batch_size=batch_size, verbose=verbose)

    # Wrap both result arrays as labelled, named dataframes (estimate first, p-values second)
    frames = []
    for values, label in ((scores, 'mlm_estimate'), (pvalues, 'mlm_pvals')):
        frame = pd.DataFrame(values, index=sample_names, columns=sources)
        frame.name = label
        frames.append(frame)

    return return_data(mat=mat, results=tuple(frames))