File size: 4,546 Bytes
11767f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""

Method MLM.

Code to run the Multivariate Linear Model (MLM) method.

"""

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data

from scipy import stats

from tqdm.auto import tqdm

import numba as nb


@nb.njit(nb.f4[:, :](nb.f4[:, :], nb.f4[:, :], nb.f4[:, :], nb.i8), parallel=True, cache=True)
def fit_mlm(X, y, inv, df):
    X = np.ascontiguousarray(X)
    n_samples = y.shape[1]
    n_fsets = X.shape[1]
    coef, sse, _, _ = np.linalg.lstsq(X, y)
    if len(sse) == 0:
        raise ValueError("""Couldn\'t fit a multivariate linear model. This can happen because there are more sources

        (covariates) than unique targets (samples), or because the network\'s matrix rank is smaller than the number of

        sources.""")
    sse = sse / df
    se = np.zeros((n_samples, n_fsets), dtype=nb.f4)
    for i in nb.prange(n_samples):
        se[i] = np.sqrt(np.diag(sse[i] * inv))
    t = coef.T/se
    return t.astype(nb.f4)


def mlm(mat, net, batch_size=10000, verbose=False):

    # Get dims
    n_samples = mat.shape[0]
    n_features, n_fsets = net.shape

    # Add intercept to network
    net = np.column_stack((np.ones((n_features, ), dtype=np.float32), net))

    # Compute inv and df for lm
    inv = np.linalg.inv(np.dot(net.T, net))
    df = n_features - n_fsets - 1

    if isinstance(mat, csr_matrix):
        # Init empty acts
        n_batches = int(np.ceil(n_samples / batch_size))
        es = np.zeros((n_samples, n_fsets), dtype=np.float32)
        for i in tqdm(range(n_batches), disable=not verbose):

            # Subset batch
            srt, end = i * batch_size, i * batch_size + batch_size
            y = mat[srt:end].toarray().T

            # Compute MLM for batch
            es[srt:end] = fit_mlm(net, y, inv, df)[:, 1:]
    else:
        # Compute MLM for all
        es = fit_mlm(net, mat.T, inv, df)[:, 1:]

    # Get p-values
    pvals = 2 * (1 - stats.t.cdf(np.abs(es), df))

    return es, pvals


def run_mlm(mat, net, source='source', target='target', weight='weight', batch_size=10000,

            min_n=5, verbose=False, use_raw=True):
    """

    Multivariate Linear Model (MLM).



    MLM fits a multivariate linear model for each sample, where the observed molecular readouts in `mat` are the response

    variable and the regulator weights in `net` are the covariates. Target features with no associated weight are set to

    zero. The obtained t-values from the fitted model are the activities (`mlm_estimate`) of the regulators in `net`.



    Parameters

    ----------

    mat : list, DataFrame or AnnData

        List of [features, matrix], dataframe (samples x features) or an AnnData instance.

    net : DataFrame

        Network in long format.

    source : str

        Column name in net with source nodes.

    target : str

        Column name in net with target nodes.

    weight : str

        Column name in net with weights.

    batch_size : int

        Size of the samples to use for each batch. Increasing this will consume more memmory but it will run faster.

    min_n : int

        Minimum of targets per source. If less, sources are removed.

    verbose : bool

        Whether to show progress.

    use_raw : bool

        Use raw attribute of mat if present.



    Returns

    -------

    estimate : DataFrame

        MLM scores. Stored in `.obsm['mlm_estimate']` if `mat` is AnnData.

    pvals : DataFrame

        Obtained p-values. Stored in `.obsm['mlm_pvals']` if `mat` is AnnData.

    """

    # Extract sparse matrix and array of genes
    m, r, c = extract(mat, use_raw=use_raw, verbose=verbose)

    # Transform net
    net = rename_net(net, source=source, target=target, weight=weight)
    net = filt_min_n(c, net, min_n=min_n)
    sources, targets, net = get_net_mat(net)

    # Match arrays
    net = match(c, targets, net)

    if verbose:
        print('Running mlm on mat with {0} samples and {1} targets for {2} sources.'.format(m.shape[0], len(c), net.shape[1]))

    # Run MLM
    estimate, pvals = mlm(m, net, batch_size=batch_size, verbose=verbose)

    # Transform to df
    estimate = pd.DataFrame(estimate, index=r, columns=sources)
    estimate.name = 'mlm_estimate'
    pvals = pd.DataFrame(pvals, index=r, columns=sources)
    pvals.name = 'mlm_pvals'

    return return_data(mat=mat, results=(estimate, pvals))