# Spaces: funlab / DeepLoop — Sleeping
# File size: 10,831 Bytes

import os
import sys
import argparse
from logging import getLogger
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from tensorflow.keras.models import model_from_json
from scipy.sparse import csr_matrix, triu
import streamlit as st

logger = getLogger(__name__)


def anchor_list_to_dict(anchors):
    """Map each anchor to its position in ``anchors``.

    If an anchor occurs more than once, the index of its last occurrence wins.
    """
    return {name: position for position, name in enumerate(anchors)}


def anchor_to_locus(anchor_dict):
    """Return a one-argument function that looks an anchor up in ``anchor_dict``.

    The returned callable raises ``KeyError`` for anchors missing from the
    dictionary, exactly like plain subscripting.
    """
    return lambda anchor: anchor_dict[anchor]


def locus_to_anchor(anchor_list):
    """Return a one-argument function that fetches ``anchor_list[locus]``.

    Indexing errors (e.g. ``IndexError`` for an out-of-range locus) propagate
    unchanged from the underlying sequence.
    """
    return lambda locus: anchor_list[locus]


def predict_tile(args):
    """Denoise a single tile and accumulate the result into shared buffers.

    Args:
        args: Tuple ``(model, shared_denoised, shared_overlap, matrix,
            window_x, window_y)``, optionally followed by the tile edge length
            as a seventh element. When the seventh element is omitted the
            module-level ``small_matrix_size`` is used, which only exists when
            this file is run as a script.

            * ``model`` must expose ``predict``.
            * ``shared_denoised`` / ``shared_overlap`` are converted with
              ``np.ctypeslib.as_array`` and updated in place.
            * ``matrix`` is a sparse matrix indexed by ``window_x, window_y``.

    Returns:
        None. Tiles whose shape is not ``(tile_size, tile_size)`` (for example
        windows truncated at the matrix edge) are skipped without any update.

    Raises:
        ValueError: If ``args`` has fewer than six or more than seven elements.
    """
    model, shared_denoised, shared_overlap, matrix, window_x, window_y, *extra = args
    if len(extra) > 1:
        raise ValueError("predict_tile expects 6 or 7 arguments, got %d" % len(args))
    # Prefer an explicitly supplied tile size; fall back to the script-level
    # global so existing six-element callers keep working.
    tile_size = extra[0] if extra else small_matrix_size

    # ``toarray()`` is the portable way to densify (``.A`` is a legacy alias).
    tile = matrix[window_x, window_y].toarray()  # split matrix into tiles
    if tile.shape != (tile_size, tile_size):
        return

    tile = np.expand_dims(tile, 0)  # add leading (batch) dimension
    tile = np.expand_dims(tile, 3)  # add trailing (channel) dimension
    tmp_denoised = np.ctypeslib.as_array(shared_denoised)
    tmp_overlap = np.ctypeslib.as_array(shared_overlap)
    denoised = model.predict(tile).reshape((tile_size, tile_size))
    denoised[denoised < 0] = 0  # remove any negative values
    tmp_denoised[window_x, window_y] += denoised
    tmp_overlap[window_x, window_y] += 1


def sparse_prediction_from_file(
    model,
    matrix,
    anchor_list,
    small_matrix_size=128,
    step_size=64,
    max_dist=384,
    keep_zeros=True,
):
    """Denoise a sparse matrix tile by tile and return its upper triangle.

    The matrix is covered with ``small_matrix_size`` x ``small_matrix_size``
    windows whose origins advance by ``step_size``. Each window is passed
    through ``model.predict``, negative predictions are clipped to zero, and
    overlapping predictions are averaged. The result is symmetrised, its
    diagonal is zeroed, and the upper triangle is returned.

    Args:
        model: Object exposing ``predict``; it receives one tile at a time with
            a leading and a trailing singleton axis added.
        matrix: Sparse matrix to denoise. Only the first ``len(anchor_list)``
            rows/columns are tiled.
        anchor_list: Sized collection; only its length is used.
        small_matrix_size: Edge length of each tile.
        step_size: Distance between consecutive tile origins.
        max_dist: Tiles whose origins satisfy ``abs(i - j) > max_dist`` are
            skipped.
        keep_zeros: When false, ``eliminate_zeros()`` is called on the result.
            NOTE(review): converting a dense array to COO appears to store
            only non-zero entries already, so this flag may have no practical
            effect - confirm.

    Returns:
        A COO sparse matrix holding the upper triangle of the denoised matrix.
    """
    input_matrix_size = len(anchor_list)
    tile_shape = (small_matrix_size, small_matrix_size)

    # Allocate the accumulators from the shape alone instead of densifying the
    # input twice. Promoting to a float dtype keeps float inputs unchanged and
    # lets float predictions be accumulated for integer-typed inputs.
    accum_dtype = np.promote_types(matrix.dtype, np.float32)
    denoised_matrix = np.zeros(matrix.shape, dtype=accum_dtype)  # summed predictions
    overlap_counts = np.zeros(matrix.shape, dtype=accum_dtype)  # predictions per pixel

    def window(start):
        """Return the tile slice for ``start``, pulled back at the far edge."""
        stop = start + small_matrix_size
        if stop >= input_matrix_size:
            return slice(input_matrix_size - small_matrix_size, input_matrix_size)
        return slice(start, stop)

    starts = range(0, input_matrix_size, step_size)
    for i in starts:
        rows = window(i)
        for j in starts:
            if abs(i - j) > max_dist:  # max distance from diagonal with actual values
                continue
            cols = window(j)
            # ``toarray()`` is the portable way to densify (``.A`` is a legacy alias).
            tile = matrix[rows, cols].toarray()  # split matrix into tiles
            if tile.shape != tile_shape:
                # Windows that cannot be filled completely are not predicted.
                continue
            tile = np.expand_dims(tile, 0)  # add leading (batch) dimension
            tile = np.expand_dims(tile, 3)  # add trailing (channel) dimension
            denoised = model.predict(tile).reshape(tile_shape)
            denoised[denoised < 0] = 0  # remove any negative values
            denoised_matrix[rows, cols] += denoised  # accumulate predictions
            overlap_counts[rows, cols] += 1  # count predictions per pixel

    # Average all overlapping areas; pixels never covered by a tile stay 0.
    denoised_matrix = np.divide(
        denoised_matrix,
        overlap_counts,
        out=np.zeros_like(denoised_matrix),
        where=overlap_counts != 0,
    )

    denoised_matrix = (denoised_matrix + denoised_matrix.T) * 0.5  # force symmetry

    np.fill_diagonal(denoised_matrix, 0)  # set all diagonal values to 0

    sparse_denoised_matrix = triu(denoised_matrix, format="coo")

    if not keep_zeros:
        sparse_denoised_matrix.eliminate_zeros()

    return sparse_denoised_matrix


def predict_and_write(
    model,
    full_matrix_dir,
    input_name,
    outdir,
    anchor_dir,
    chromosome,
    small_matrix_size,
    step_size,
    dummy=5,
    max_dist=384,
    val_cols=["obs", "exp"],
    keep_zeros=True,
    matrices_per_tile=8,
):
    """Denoise one chromosome's interaction file and write the result to disk.

    The anchor reference ``<anchor_dir>/<chromosome>.bed`` and the interaction
    file ``<full_matrix_dir>/<input_name>`` are read as tab-separated tables.
    Anchors are processed in consecutive chunks of
    ``matrices_per_tile * small_matrix_size``; for each chunk only interactions
    whose two anchors both fall inside the chunk are used. Each chunk is
    denoised with ``sparse_prediction_from_file`` and the results are
    concatenated.

    Args:
        model: Object exposing ``predict``; forwarded to
            ``sparse_prediction_from_file``.
        full_matrix_dir: Directory containing the interaction file.
        input_name: File name of the interaction file.
        outdir: Directory that receives
            ``<chromosome>.denoised.anchor.to.anchor``.
        anchor_dir: Directory containing ``<chromosome>.bed``.
        chromosome: Chromosome name used to build both file names.
        small_matrix_size: Edge length of the tiles fed to the model.
        step_size: Distance between consecutive tile origins.
        dummy: Value added to both ``obs`` and ``exp`` before dividing.
        max_dist: Maximum tile-origin distance from the diagonal.
        val_cols: Names of the value columns following ``anchor1``/``anchor2``.
            Must contain either ``"ratio"`` or both ``"obs"`` and ``"exp"``.
            The default list is never mutated.
        keep_zeros: Forwarded to ``sparse_prediction_from_file``.
        matrices_per_tile: Number of tile widths per anchor chunk.

    Returns:
        DataFrame with columns ``anchor1``, ``anchor2`` and ``denoised``
        (rounded to four decimals), or an empty DataFrame when no chunk
        produced output. The same table is written as a header-less TSV.

    Raises:
        ValueError: If ``val_cols`` provides neither ``"ratio"`` nor both
            ``"obs"`` and ``"exp"``.
    """
    anchor_file = os.path.join(anchor_dir, chromosome + ".bed")
    anchor_list = pd.read_csv(
        anchor_file,
        sep="\t",
        usecols=[0, 1, 2, 3],
        names=["chr", "start", "end", "anchor"],
    )  # read anchor list file

    interaction_path = os.path.join(full_matrix_dir, input_name)
    logger.debug("anchor file")
    logger.debug(interaction_path)
    interaction_columns = ["anchor1", "anchor2"] + list(val_cols)
    chr_anchor_file = pd.read_csv(
        interaction_path,
        delimiter="\t",
        names=interaction_columns,
        usecols=interaction_columns,
    )  # read chromosome anchor to anchor file

    if "obs" in val_cols and "exp" in val_cols:
        chr_anchor_file["ratio"] = (chr_anchor_file["obs"] + dummy) / (
            chr_anchor_file["exp"] + dummy
        )  # compute matrix ratio value
    elif "ratio" not in val_cols:
        # The original check was inverted: it rejected a supplied ratio column
        # and let a missing one fail later with a KeyError.
        raise ValueError(
            "Must provide either ratio column or obs and exp columns to compute ratio"
        )

    start_time = time.time()

    anchor_step = matrices_per_tile * small_matrix_size

    # Collected per-chunk results; concatenated once after the loop so the
    # cost stays linear in the number of chunks.
    tile_frames = []

    for i in tqdm(range(0, len(anchor_list), anchor_step)):
        anchors = anchor_list[i : i + anchor_step]
        anchor_dict = anchor_list_to_dict(
            anchors["anchor"].values
        )  # convert to anchor --> index dictionary
        chr_tile = chr_anchor_file[
            (chr_anchor_file["anchor1"].isin(anchors["anchor"]))
            & (chr_anchor_file["anchor2"].isin(anchors["anchor"]))
        ]
        # ``otypes`` is required so that a chunk without any interaction
        # (size-0 input) yields an empty index array instead of raising.
        to_locus = np.vectorize(anchor_to_locus(anchor_dict), otypes=[np.int64])
        rows = to_locus(chr_tile["anchor1"].values)  # anchor names -> row indices
        cols = to_locus(chr_tile["anchor2"].values)  # anchor names -> column indices
        logger.debug(chr_tile)
        # The matrix always has the full chunk size, even for a shorter final
        # chunk; the tiling below is limited by ``len(anchors)``.
        sparse_matrix = csr_matrix(
            (chr_tile["ratio"], (rows, cols)),
            shape=(anchor_step, anchor_step),
        )  # construct sparse CSR matrix

        sparse_denoised_tile = sparse_prediction_from_file(
            model,
            sparse_matrix,
            anchors,
            small_matrix_size,
            step_size,
            max_dist,
            keep_zeros=keep_zeros,
        )
        if len(sparse_denoised_tile.row) == 0:
            continue

        anchor_name_list = anchors["anchor"].values.tolist()
        to_anchor = np.vectorize(locus_to_anchor(anchor_name_list))

        tile_anchor_to_anchor = pd.DataFrame.from_dict(
            {
                "anchor1": to_anchor(sparse_denoised_tile.row),
                "anchor2": to_anchor(sparse_denoised_tile.col),
                "denoised": sparse_denoised_tile.data,
            }
        )
        tile_frames.append(tile_anchor_to_anchor.round({"denoised": 4}))

    if tile_frames:
        denoised_anchor_to_anchor = pd.concat(tile_frames)
    else:
        denoised_anchor_to_anchor = pd.DataFrame()

    print("Denoised matrix in %d seconds" % (time.time() - start_time))

    denoised_anchor_to_anchor.to_csv(
        os.path.join(outdir, chromosome + ".denoised.anchor.to.anchor"),
        sep="\t",
        index=False,
        header=False,
    )
    return denoised_anchor_to_anchor


if __name__ == "__main__":
    # Command-line entry point: parse options, load the Keras model from its
    # JSON architecture and .h5 weights, then denoise one chromosome.
    parser = argparse.ArgumentParser()
    # Options without a usable default are marked required so that a missing
    # one produces a usage error instead of a later failure on ``None``.
    parser.add_argument(
        "--full_matrix_dir",
        type=str,
        required=True,
        help="directory containing chromosome interaction files to be used as input",
    )
    parser.add_argument(
        "--input_name",
        type=str,
        required=True,
        help="name of file in full_matrix_dir that we want to feed into model",
    )
    parser.add_argument(
        "--h5_file", type=str, required=True, help="path to model weights .h5 file"
    )
    parser.add_argument(
        "--json_file",
        type=str,
        help="path to model architecture .json file (by default it is assumed to be the same as the weights file)",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        required=True,
        help="directory where the output interaction file will be stored",
    )
    parser.add_argument(
        "--anchor_dir",
        type=str,
        required=True,
        help="directory containing anchor .bed reference files",
    )
    parser.add_argument(
        "--chromosome",
        type=str,
        required=True,
        help="chromosome string (e.g chr1, chr20, chrX)",
    )
    parser.add_argument(
        "--small_matrix_size",
        type=int,
        default=128,
        help="size of input tiles (symmetric)",
    )
    parser.add_argument(
        "--step_size",
        type=int,
        default=128,
        help="step size when tiling matrix (overlapping values will be averaged if different)",
    )
    parser.add_argument(
        "--max_dist",
        type=int,
        default=384,
        help="maximum distance from diagonal (in pixels) where we consider interactions (default to ~2Mb)",
    )
    parser.add_argument(
        "--dummy",
        type=int,
        default=5,
        help="dummy value to compute ratio (obs + dummy) / (exp + dummy)",
    )
    parser.add_argument(
        "--val_cols",
        "--list",
        nargs="+",
        help="names of value columns in interaction files (not including a1, a2)",
        default=["obs", "exp"],
    )
    parser.add_argument(
        "--keep_zeros",
        action="store_true",
        help="if provided, the output file will contain all pixels in every tile, even if no value is present",
    )
    args = parser.parse_args()

    # These names are intentionally bound at module level: ``predict_tile``
    # reads ``small_matrix_size`` as a global.
    full_matrix_dir = args.full_matrix_dir
    input_name = args.input_name
    h5_file = args.h5_file
    if args.json_file is not None:
        json_file = args.json_file
    else:
        # Swap only the file extension; a plain ``str.replace("h5", "json")``
        # would also rewrite "h5" occurring elsewhere in the path.
        json_file = os.path.splitext(h5_file)[0] + ".json"
    outdir = args.outdir
    anchor_dir = args.anchor_dir
    chromosome = args.chromosome
    small_matrix_size = args.small_matrix_size
    step_size = args.step_size
    dummy = args.dummy
    max_dist = args.max_dist
    val_cols = args.val_cols
    keep_zeros = args.keep_zeros

    os.makedirs(outdir, exist_ok=True)

    with open(json_file, "r") as f:
        model = model_from_json(f.read())  # load model
    model.load_weights(h5_file)  # load model weights
    predict_and_write(
        model,
        full_matrix_dir,
        input_name,
        outdir,
        anchor_dir,
        chromosome,
        small_matrix_size,
        step_size,
        dummy,
        max_dist,
        val_cols,
        keep_zeros,
    )