Spaces:
Running
Running
File size: 7,331 Bytes
ed17227 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
# Take cDNA reads, and produce splicing statistics per barcode
# Read 1: NNNNN[N][N]TTTAAACGGGCCCTATNNNNNNNNNNNNNNNNNNNNTCTAGAGCGAG[CT]
# Number of Ns (UMI) is random 5-7; barcode is 20N
# Read 2:
# Diversity: [NN] (0-2Ns)
# End of exon 1: AAGTTGGTGGTGAGGCCCTGGGCAG
# Exon 2: GTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAG
# Exon 3: CTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTCTAGA
# Plasmid library:
# ES7A_Lib1 BS06911A
# %%
import os
import pandas as pd
import numpy as np
from collections import Counter
import random
from utils import *
import argparse
from tqdm.auto import tqdm
# %%
# Command-line interface. Every option is a mandatory string; the options are
# declared as (flag, help text) pairs and registered in one pass.
parser = argparse.ArgumentParser()
for cli_flag, cli_help in (
    ("--input_folder", "Input folder"),
    ("--output_folder", "Output folder"),
    (
        "--plasmid_coupling_file_name",
        "Plasmid coupling filename. Must be a csv file.",
    ),
):
    parser.add_argument(cli_flag, required=True, type=str, help=cli_help)
args = parser.parse_args()
# %%
# Per-library lookup tables, all keyed by the same library index (0..NUM_LIBS-1).
ALL_LIBRARY_NAMES = {0: "ES7_HeLa_A", 1: "ES7_HeLa_B", 2: "ES7_HeLa_C"}
ALL_FILE_NAMES = {0: "BS11504A_S1", 1: "BS11505A_S2", 2: "BS11506A_S3"}
ALL_BASE_DIR_NAMES = {
    0: "Sample_BS11504A/",
    1: "Sample_BS11505A/",
    2: "Sample_BS11506A/",
}
# Derived from the tables above so the library count cannot drift out of sync
# with them when a library is added or removed.
NUM_LIBS = len(ALL_LIBRARY_NAMES)

PLASMID_COUPLING_FILE_NAME = args.plasmid_coupling_file_name
# "data/Sample_BS07028A/coupling.csv"
INPUT_FOLDER = args.input_folder
OUTPUT_FOLDER = args.output_folder

# Validate user input with an explicit exception: an `assert` would be
# stripped when Python runs with -O, letting a bad path through.
if not os.path.isdir(INPUT_FOLDER):
    raise NotADirectoryError(f"No such folder: {INPUT_FOLDER}")
if not os.path.isdir(OUTPUT_FOLDER):
    print(f"{OUTPUT_FOLDER} not found. Creating...")
    # exist_ok guards against the folder appearing between the check and the call.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
# %%
# Counters for reads rejected before they can be attributed to a barcode row.
# The per-library driver loop resets both to 0 before processing each library.
bad_read_1_reads = 0
unknown_barcode_reads = 0
SUBSAMPLE_RATIO = 1  # Only analyze 1 in SUBSAMPLE_RATIO samples; used to get a quick sample for testing purposes; set to 1 in production

# Expected read lengths and the constant sequences used to locate features.
_READ_1_LENGTH = 54
_READ_2_LENGTH = 106
_UMI_LENGTHS = (5, 6, 7)
_BARCODE_LENGTH = 20
_READ_1_PREFIX = "TTTAAACGGGCCCTAT"  # constant region between the UMI and the barcode
_READ_1_SUFFIX = "TCTAGAGCGAGCT"  # constant region following the barcode
_EXON_1_END = "AAGTTGGTGGTGAGGCCCTGGGCAG"
_EXON_3_START = "CTCCTGGGCA"
_INTRON_1_START = "GTTGGTATCA"
_MAX_READ_2_FRAME_SHIFT = 3  # Read 2 starts with 0-2 diversity nucleotides


def _find_umi_length(read_1):
    """Return the UMI length (5-7) that frames Read 1's constant regions, or None."""
    prefix_len = len(_READ_1_PREFIX)
    for umi_length in _UMI_LENGTHS:
        prefix = read_1[umi_length : umi_length + prefix_len]
        # Require an exact "AT" at the end of the prefix and fewer than two
        # mismatches overall (as counted by `hamming`).
        if prefix[-2:] != "AT" or hamming(prefix, _READ_1_PREFIX) >= 2:
            continue
        suffix = read_1[umi_length + prefix_len + _BARCODE_LENGTH :]
        # The read is fixed-length, so a longer UMI leaves fewer suffix bases;
        # compare against the equally truncated expected suffix.
        # this is important to distinguish Lib1 carryover from Lib2 product; Lib1 ends with TCTAGTGAGACGT
        if suffix[:2] != "TC" or hamming(suffix, _READ_1_SUFFIX[: len(suffix)]) >= 2:
            continue
        return umi_length
    return None


def _find_exon_1_frame(read_2):
    """Return the offset (0-2) at which the end of exon 1 starts in Read 2, or None."""
    exon_1_len = len(_EXON_1_END)
    for frame in range(_MAX_READ_2_FRAME_SHIFT):
        # there are often read errors in the beginning of the read. Allow up to 2.
        if hamming(read_2[frame : frame + exon_1_len], _EXON_1_END) <= 2:
            return frame
    return None


def identify_splicing_pattern(read_1, read_2, read_1_q, read_2_q):
    """Classify one read pair and tally the result.

    Used as the per-pair callback handed to ``process_paired_fastq_file``.
    Read 1 is searched for the barcode; Read 2 is then classified as exon
    skipping, intron retention, exon inclusion, splicing inside the randomized
    exon, or unknown. Exactly one counter is incremented per analyzed pair:
    a module-level counter (``bad_read_1_reads`` / ``unknown_barcode_reads``)
    when no known barcode is found, otherwise one ``num_*`` column in the
    barcode's row of the module-level ``barcode_statistics`` DataFrame.

    Args:
        read_1: Read 1 sequence; must be exactly 54 characters.
        read_2: Read 2 sequence; must be exactly 106 characters (only checked
            once a known barcode has been found).
        read_1_q: Unused (presumably the Read 1 quality string).
        read_2_q: Unused (presumably the Read 2 quality string).

    Returns:
        None. All results are recorded through side effects.

    Raises:
        AssertionError: if a read does not have the expected length.
    """
    global bad_read_1_reads, unknown_barcode_reads

    if SUBSAMPLE_RATIO > 1 and random.randrange(SUBSAMPLE_RATIO) != 0:
        return

    # Check read 1 (barcode) proper format
    # This should match the check done in the DNA data analysis
    # Raised explicitly (instead of `assert`) so the check survives `python -O`;
    # every slice below relies on these fixed lengths.
    if len(read_1) != _READ_1_LENGTH:
        raise AssertionError(
            f"Read 1 has length {len(read_1)}, expected {_READ_1_LENGTH}"
        )
    if "N" in read_1:
        bad_read_1_reads += 1
        return

    # Try to identify length of UMI (5-7nt)
    umi_length = _find_umi_length(read_1)
    if umi_length is None:
        # we were unable to identify a frame containing the desired sequences
        bad_read_1_reads += 1
        return

    # Barcode identified!
    barcode_start = umi_length + len(_READ_1_PREFIX)
    barcode = revcomp(read_1[barcode_start : barcode_start + _BARCODE_LENGTH])
    if barcode not in barcode_statistics.index:  # Barcode not in the plasmid sequencing
        unknown_barcode_reads += 1
        return

    # At this point we identified the barcode and found it in the coupling
    # database, so the output is recorded in the DataFrame row for that barcode.
    # Check read 2 (exon) proper format
    if len(read_2) != _READ_2_LENGTH:
        raise AssertionError(
            f"Read 2 has length {len(read_2)}, expected {_READ_2_LENGTH}"
        )

    if "N" in read_2:
        outcome = "num_bad_reads"
    else:
        # try to identify frame of read (0-2nt of Ns in beginning of Read 2)
        frame = _find_exon_1_frame(read_2)
        if frame is None:  # we could not identify where exon 1 is
            outcome = "num_bad_exon1"
        else:
            after_exon_1 = read_2[frame + len(_EXON_1_END) :]
            junction = after_exon_1[: len(_EXON_3_START)]
            if junction == _EXON_3_START:
                # this is the beginning of exon 3, so we have exon skipping
                outcome = "num_exon_skipping"
            elif junction == _INTRON_1_START:
                # this is the beginning of intron 1, so we have intron retention
                outcome = "num_intron_retention"
            else:
                exon = barcode_statistics.at[barcode, "exon"]
                # Each diversity nucleotide at the read start costs one base
                # of exon 3 at the read end.
                expected = "GTT" + exon + "CAG" + "CTCCT"[: 5 - frame]
                if hamming(after_exon_1, expected) <= 2:
                    # we see the full randomized exon and exon 3
                    outcome = "num_exon_inclusion"
                elif (
                    after_exon_1[:6] == "GTT" + exon[:3]
                    and "CTCCTGGGCAA" in after_exon_1[6:]
                ):
                    # we see the beginning of exon 2, but also beginning of
                    # exon 3; probably splicing in randomized exon
                    outcome = "num_splicing_in_exon"
                else:
                    # otherwise, we were unable to identify the splicing pattern
                    outcome = "num_unknown_splicing"

    barcode_statistics.at[barcode, outcome] += 1
# %%
# Process each library in turn: load a fresh copy of the plasmid coupling
# table, let identify_splicing_pattern tally every read pair into it, then
# write the per-barcode counts to "<FILE_NAME>_splicing_analysis.csv".
all_barcode_statistics = []
for lib_num in tqdm(range(NUM_LIBS), desc="Iterating libraries"):
    # Reset the module-level state that identify_splicing_pattern updates.
    bad_read_1_reads = 0
    unknown_barcode_reads = 0
    barcode_statistics = pd.read_csv(PLASMID_COUPLING_FILE_NAME).set_index("barcode")
    # One zero-initialized count column per outcome; a scalar is broadcast to
    # every row. The order here is the column order of the output CSV.
    for count_column in (
        "num_intron_retention",
        "num_exon_inclusion",
        "num_exon_skipping",
        "num_bad_reads",
        "num_bad_exon1",
        "num_splicing_in_exon",
        "num_unknown_splicing",
    ):
        barcode_statistics[count_column] = 0

    BASE_DIR_NAME = ALL_BASE_DIR_NAMES[lib_num]
    FILE_NAME = ALL_FILE_NAMES[lib_num]
    FULL_FILE_NAME = BASE_DIR_NAME + FILE_NAME
    num_reads = process_paired_fastq_file(
        os.path.join(INPUT_FOLDER, FULL_FILE_NAME + "_R1_001.fastq"),
        os.path.join(INPUT_FOLDER, FULL_FILE_NAME + "_R2_001.fastq"),
        identify_splicing_pattern,
    )
    print(
        "Done reading file",
        FILE_NAME,
        "(" + ALL_LIBRARY_NAMES[lib_num] + ")",
        ":",
        human_format(num_reads),
        "total reads;",
        human_format(unknown_barcode_reads),
        "reads with unknown barcode",
        human_format(bad_read_1_reads),
        "reads with bad Read 1",
    )
    barcode_statistics.to_csv(
        os.path.join(OUTPUT_FOLDER, FILE_NAME + "_splicing_analysis.csv")
    )
    # Keep every library's table in memory for any later combined analysis.
    all_barcode_statistics.append(barcode_statistics)
|