# splicing-predictor/data_preprocessing/compute_splicing_outcomes.py
# Oded Regev
# first commit
# ed17227
# Take cDNA reads, and produce splicing statistics per barcode
# Read 1: NNNNN[N][N]TTTAAACGGGCCCTATNNNNNNNNNNNNNNNNNNNNTCTAGAGCGAG[CT]
# Number of Ns (UMI) is random 5-7; barcode is 20N
# Read 2:
# Diversity: [NN] (0-2Ns)
# End of exon 1: AAGTTGGTGGTGAGGCCCTGGGCAG
# Exon 2: GTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAG
# Exon 3: CTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTCTAGA
# Plasmid library:
# ES7A_Lib1 BS06911A
# %%
import os
import pandas as pd
import numpy as np
from collections import Counter
import random
from utils import *
import argparse
from tqdm.auto import tqdm
# %%
# Command-line interface: every option is a mandatory string.
parser = argparse.ArgumentParser()
for _flag, _help_text in (
    ("--input_folder", "Input folder"),
    ("--output_folder", "Output folder"),
    (
        "--plasmid_coupling_file_name",
        "Plasmid coupling filename. Must be a csv file.",
    ),
):
    parser.add_argument(_flag, required=True, type=str, help=_help_text)
args = parser.parse_args()
# %%
# The cDNA libraries to process; the three dicts below are keyed by library
# number (0 .. NUM_LIBS - 1).
NUM_LIBS = 3
ALL_LIBRARY_NAMES = {0: "ES7_HeLa_A", 1: "ES7_HeLa_B", 2: "ES7_HeLa_C"}
ALL_FILE_NAMES = {0: "BS11504A_S1", 1: "BS11505A_S2", 2: "BS11506A_S3"}
# Sub-folder of INPUT_FOLDER holding each library's FASTQ files.
ALL_BASE_DIR_NAMES = {
    0: "Sample_BS11504A/",
    1: "Sample_BS11505A/",
    2: "Sample_BS11506A/",
}

PLASMID_COUPLING_FILE_NAME = args.plasmid_coupling_file_name
# "data/Sample_BS07028A/coupling.csv"
INPUT_FOLDER = args.input_folder
OUTPUT_FOLDER = args.output_folder

# Validate the user-supplied path with an explicit raise: an `assert` would be
# stripped when Python runs with -O, letting a bad path through.
if not os.path.isdir(INPUT_FOLDER):
    raise NotADirectoryError(f"No such folder: {INPUT_FOLDER}")
if not os.path.isdir(OUTPUT_FOLDER):
    print(f"{OUTPUT_FOLDER} not found. Creating...")
    os.makedirs(OUTPUT_FOLDER)
# %%
# Only analyze 1 in SUBSAMPLE_RATIO samples; used to get a quick sample for
# testing purposes; set to 1 in production
SUBSAMPLE_RATIO = 1
# Module-level read counters, incremented by identify_splicing_pattern.
bad_read_1_reads = unknown_barcode_reads = 0
def identify_splicing_pattern(read_1, read_2, read_1_q, read_2_q):
    """Classify one cDNA read pair and tally the outcome for its barcode.

    Read 1 is scanned for a 5-7 nt UMI followed by the constant
    TTTAAACGGGCCCTAT, a 20 nt barcode and the constant TCTAGAGCGAGCT.
    Read 2 is scanned for the end of exon 1 at offset 0-2; the sequence that
    follows it decides which counter column of the module-level
    ``barcode_statistics`` DataFrame is incremented for the barcode.

    Reads failing the Read 1 checks, or whose barcode is not in
    ``barcode_statistics.index``, only increment the module-level counters
    ``bad_read_1_reads`` / ``unknown_barcode_reads``.

    Args:
        read_1: Read 1 sequence; asserted to be 54 characters long.
        read_2: Read 2 sequence; asserted to be 106 characters long.
        read_1_q: Not used by this function.
        read_2_q: Not used by this function.

    Returns:
        None. Results are recorded purely as side effects.
    """
    global bad_read_1_reads
    global unknown_barcode_reads
    global barcode_statistics

    # Optional subsampling: keep about 1 read pair in SUBSAMPLE_RATIO.
    if SUBSAMPLE_RATIO > 1 and random.randrange(SUBSAMPLE_RATIO) != 0:
        return

    # ---- Read 1 (barcode) ----
    # This should match the check done in the DNA data analysis
    assert len(read_1) == 54
    if "N" in read_1:
        bad_read_1_reads += 1
        return

    # The UMI is 5-7 nt; accept the first length at which both constant
    # regions flanking the barcode line up (exact 2-mer anchor, then fewer
    # than 2 mismatches over the whole constant region).
    for umi_length in (5, 6, 7):
        upstream_ok = read_1[umi_length + 14 : umi_length + 16] == "AT" and (
            hamming(read_1[umi_length : umi_length + 16], "TTTAAACGGGCCCTAT") < 2
        )
        if not upstream_ok:
            continue
        # this is important to distinguish Lib1 carryover from Lib2 product;
        # Lib1 ends with TCTAGTGAGACGT
        # The slice bound 4 - umi_length is negative, trimming the padded
        # template to the 13/12/11 characters left at the end of a 54 nt read.
        downstream_ok = read_1[umi_length + 36 : umi_length + 38] == "TC" and (
            hamming(read_1[umi_length + 36 :], "TCTAGAGCGAGCT."[: 4 - umi_length])
            < 2
        )
        if downstream_ok:
            break
    else:
        # we were unable to identify a frame containing the desired sequences
        bad_read_1_reads += 1
        return

    # Barcode identified!
    barcode_start = umi_length + 16
    barcode = revcomp(read_1[barcode_start : barcode_start + 20])
    if barcode not in barcode_statistics.index:
        # Barcode not in the plasmid sequencing
        unknown_barcode_reads += 1
        return

    # ---- Read 2 (exons) ----
    # From here on the barcode is known to the coupling table, so every
    # outcome is recorded in that barcode's DataFrame row.
    assert len(read_2) == 106
    if "N" in read_2:
        barcode_statistics.at[barcode, "num_bad_reads"] += 1
        return

    EXON_1 = "AAGTTGGTGGTGAGGCCCTGGGCAG"
    # Read 2 starts with 0-2 nt of Ns before exon 1; there are often read
    # errors in the beginning of the read, so allow up to 2 mismatches.
    for read2_frame in range(3):
        if hamming(read_2[read2_frame : read2_frame + 25], EXON_1) <= 2:
            break
    else:
        # we could not identify where exon 1 is
        barcode_statistics.at[barcode, "num_bad_exon1"] += 1
        return

    after_exon_1 = read_2[read2_frame + 25 :]
    junction = after_exon_1[:10]
    if junction == "CTCCTGGGCA":
        # this is the beginning of exon 3, so we have exon skipping
        outcome = "num_exon_skipping"
    elif junction == "GTTGGTATCA":
        # this is the beginning of intron 1, so we have intron retention
        outcome = "num_intron_retention"
    else:
        exon = barcode_statistics.at[barcode, "exon"]
        # Expected remainder of the read for full inclusion; the padded
        # exon 3 prefix is trimmed according to the frame.
        expected_inclusion = (
            "GTT" + exon + "CAG" + "CTCCT."[: -1 - read2_frame]
        )
        if hamming(after_exon_1, expected_inclusion) <= 2:
            # we see the full randomized exon and exon 3
            outcome = "num_exon_inclusion"
        elif (
            after_exon_1[:6] == "GTT" + exon[:3]
            and "CTCCTGGGCAA" in after_exon_1[6:]
        ):
            # we see the beginning of exon 2, but also beginning of exon 3;
            # probably splicing in randomized exon
            outcome = "num_splicing_in_exon"
        else:
            # otherwise, we were unable to identify the splicing pattern
            outcome = "num_unknown_splicing"

    barcode_statistics.at[barcode, outcome] += 1
# %%
# Counter columns added to every per-library table, in output-column order.
SPLICING_COUNT_COLUMNS = (
    "num_intron_retention",
    "num_exon_inclusion",
    "num_exon_skipping",
    "num_bad_reads",
    "num_bad_exon1",
    "num_splicing_in_exon",
    "num_unknown_splicing",
)

# Parse the plasmid coupling table once; each library works on its own copy.
plasmid_coupling = pd.read_csv(PLASMID_COUPLING_FILE_NAME).set_index("barcode")

all_barcode_statistics = []
for lib_num in tqdm(range(NUM_LIBS), desc="Iterating libraries"):
    # Reset the module-level state that identify_splicing_pattern updates.
    bad_read_1_reads = 0
    unknown_barcode_reads = 0
    barcode_statistics = plasmid_coupling.copy()
    for column in SPLICING_COUNT_COLUMNS:
        barcode_statistics[column] = 0  # scalar is broadcast to every row

    BASE_DIR_NAME = ALL_BASE_DIR_NAMES[lib_num]
    FILE_NAME = ALL_FILE_NAMES[lib_num]
    FULL_FILE_NAME = BASE_DIR_NAME + FILE_NAME

    # identify_splicing_pattern is handed to the FASTQ reader as a callback;
    # the reader's return value is reported below as the total read count.
    num_reads = process_paired_fastq_file(
        os.path.join(INPUT_FOLDER, FULL_FILE_NAME + "_R1_001.fastq"),
        os.path.join(INPUT_FOLDER, FULL_FILE_NAME + "_R2_001.fastq"),
        identify_splicing_pattern,
    )
    print(
        "Done reading file",
        FILE_NAME,
        "(" + ALL_LIBRARY_NAMES[lib_num] + ")",
        ":",
        human_format(num_reads),
        "total reads;",
        human_format(unknown_barcode_reads),
        "reads with unknown barcode",
        human_format(bad_read_1_reads),
        "reads with bad Read 1",
    )

    # One CSV per library, named after the FASTQ file stem.
    barcode_statistics.to_csv(
        os.path.join(OUTPUT_FOLDER, FILE_NAME + "_splicing_analysis.csv")
    )
    all_barcode_statistics.append(barcode_statistics)