# File size: 3,958 Bytes
# 11767f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""

Method ULM.

Code to run the Univariate Linear Model (ULM) method.

"""

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from scipy.stats import t

from pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data

from tqdm.auto import tqdm


def mat_cov(A, b):
    """
    Sample covariance between the columns of `b` and the columns of `A`.

    Both inputs are expected to share their first axis (the observations).
    `A` is centred column-wise, whereas `b` is shifted by its overall mean
    only: because every column of `A` sums to zero after centring, any
    constant offset left in `b` cancels out of the dot product.

    Parameters
    ----------
    A : ndarray
        Array whose columns are centred with their own mean (`axis=0`).
    b : ndarray
        Array that is transposed and shifted by its global mean.

    Returns
    -------
    cov : ndarray
        Dot product of the shifted `b.T` with the centred `A`, divided by
        `b.shape[0] - 1`.
    """
    shifted_b = b.T - b.mean()
    centred_A = A - A.mean(axis=0)
    n_obs = b.shape[0]
    return np.dot(shifted_b, centred_A) / (n_obs - 1)


def mat_cor(A, b):
    """
    Correlation between the columns of `b` and the columns of `A`.

    The covariance returned by `mat_cov` is divided by the product of the
    sample standard deviations (`ddof=1`) of the corresponding columns.

    Parameters
    ----------
    A : ndarray
        Array whose column-wise standard deviations form the columns of the
        scaling matrix.
    b : ndarray
        Array whose column-wise standard deviations form the rows of the
        scaling matrix.

    Returns
    -------
    cor : ndarray
        `mat_cov(A, b)` scaled element-wise by the outer product of the
        standard deviations.
    """
    covariance = mat_cov(A, b)
    sd_A = np.std(A, axis=0, ddof=1)
    # Column vector so that broadcasting yields one row per column of b.
    sd_b = np.std(b, axis=0, ddof=1).reshape(-1, 1)
    scale = sd_A * sd_b
    return covariance / scale


def t_val(r, df):
    """
    Convert correlation coefficients into t-values.

    Computes `r * sqrt(df / ((1 - r) * (1 + r)))`, adding a tiny constant to
    each factor of the denominator so that it is not exactly zero when `r`
    is exactly 1 or -1.

    Parameters
    ----------
    r : float or ndarray
        Correlation coefficient(s).
    df : int or float
        Degrees of freedom.

    Returns
    -------
    t : float or ndarray
        t-value(s), same shape as `r`.
    """
    eps = 1.0e-16
    one_minus_r = 1.0 - r + eps
    one_plus_r = 1.0 + r + eps
    return r * np.sqrt(df / (one_minus_r * one_plus_r))


def ulm(mat, net, batch_size=10000, verbose=False):
    """
    Compute ULM t-values and p-values for every sample and feature set.

    Each row of `mat` (a sample) is correlated with each column of `net`
    (a feature set); the correlation is turned into a t-value with
    `net.shape[0] - 2` degrees of freedom and a two-sided p-value is taken
    from the Student's t distribution.

    Parameters
    ----------
    mat : csr_matrix or ndarray
        Samples in rows. A `csr_matrix` is densified and processed in row
        batches; any other input is processed in a single pass.
    net : ndarray
        Two-dimensional array with features in rows and feature sets in
        columns.
    batch_size : int
        Number of rows of `mat` densified per batch (sparse input only).
    verbose : bool
        Whether to show a progress bar over the batches (sparse input only).

    Returns
    -------
    es : ndarray
        t-values. For sparse input this is a float32 array of shape
        (rows of `mat`, columns of `net`).
    pv : ndarray
        Two-sided p-values matching `es`.
    """
    # Dimensions and degrees of freedom of the univariate fit
    n_obs = mat.shape[0]
    n_feats, n_sets = net.shape
    dof = n_feats - 2

    if not isinstance(mat, csr_matrix):
        # Dense input: correlate everything at once
        scores = t_val(mat_cor(net, mat.T), dof)
    else:
        # Sparse input: densify a slice of rows at a time
        total = int(np.ceil(n_obs / batch_size))
        scores = np.zeros((n_obs, n_sets), dtype=np.float32)
        for idx in tqdm(range(total), disable=not verbose):
            lo = idx * batch_size
            hi = lo + batch_size

            # Features end up in rows, as mat_cor expects
            dense = mat[lo:hi].toarray().T

            scores[lo:hi] = t_val(mat_cor(net, dense), dof)

    # Two-sided p-value: twice the upper tail beyond |t|
    pvalues = t.sf(abs(scores), dof) * 2

    return scores, pvalues


def run_ulm(mat, net, source='source', target='target', weight='weight', batch_size=10000,
            min_n=5, verbose=False, use_raw=True):
    """
    Univariate Linear Model (ULM).

    ULM fits a linear model for each sample and regulator, where the observed molecular readouts in `mat` are the response
    variable and the regulator weights in `net` are the explanatory one. Target features with no associated weight are set to
    zero. The obtained t-value from the fitted model is the activity (`ulm_estimate`) of a given regulator.

    Parameters
    ----------
    mat : list, DataFrame or AnnData
        List of [features, matrix], dataframe (samples x features) or an AnnData instance.
    net : DataFrame
        Network in long format.
    source : str
        Column name in net with source nodes.
    target : str
        Column name in net with target nodes.
    weight : str
        Column name in net with weights.
    batch_size : int
        Size of the samples to use for each batch. Increasing this will consume more memory but it will run faster.
    min_n : int
        Minimum of targets per source. If less, sources are removed.
    verbose : bool
        Whether to show progress.
    use_raw : bool
        Use raw attribute of mat if present.

    Returns
    -------
    estimate : DataFrame
        ULM scores. Stored in `.obsm['ulm_estimate']` if `mat` is AnnData.
    pvals : DataFrame
        Obtained p-values. Stored in `.obsm['ulm_pvals']` if `mat` is AnnData.
    """

    # Pull the data matrix out of `mat`, together with the labels later used
    # as the result index (rows) and for matching against the network (columns).
    data, row_names, col_names = extract(mat, use_raw=use_raw, verbose=verbose)

    # Standardise the network, drop under-sized sources and pivot it to a matrix
    long_net = rename_net(net, source=source, target=target, weight=weight)
    long_net = filt_min_n(col_names, long_net, min_n=min_n)
    sources, targets, net_mat = get_net_mat(long_net)

    # Align the network rows with the features of the data matrix
    net_mat = match(col_names, targets, net_mat)

    if verbose:
        template = 'Running ulm on mat with {0} samples and {1} targets for {2} sources.'
        print(template.format(data.shape[0], len(col_names), net_mat.shape[1]))

    # Score every sample against every source
    scores, pvalues = ulm(data, net_mat, batch_size=batch_size, verbose=verbose)

    # Wrap both arrays as labelled frames; `name` is set on each frame before
    # the pair is handed to return_data.
    frames = []
    for values, label in ((scores, 'ulm_estimate'), (pvalues, 'ulm_pvals')):
        frame = pd.DataFrame(values, index=row_names, columns=sources)
        frame.name = label
        frames.append(frame)

    return return_data(mat=mat, results=tuple(frames))