# File size: 7,331 Bytes
# ed17227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# Take cDNA reads, and produce splicing statistics per barcode

# Read 1: NNNNN[N][N]TTTAAACGGGCCCTATNNNNNNNNNNNNNNNNNNNNTCTAGAGCGAG[CT]
#   Number of Ns (UMI) is random 5-7; barcode is 20N
# Read 2:
#   Diversity: [NN]  (0-2Ns)
#   End of exon 1: AAGTTGGTGGTGAGGCCCTGGGCAG
#   Exon 2: GTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAG
#   Exon 3: CTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTCTAGA

# Plasmid library:
#   ES7A_Lib1	BS06911A

# %%
import os
import pandas as pd
import numpy as np
from collections import Counter
import random
from utils import *
import argparse
from tqdm.auto import tqdm

# %%
# Command-line interface: every option is a mandatory string.
parser = argparse.ArgumentParser()
for _flag, _help_text in (
    ("--input_folder", "Input folder"),
    ("--output_folder", "Output folder"),
    (
        "--plasmid_coupling_file_name",
        "Plasmid coupling filename. Must be a csv file.",
    ),
):
    parser.add_argument(_flag, required=True, type=str, help=_help_text)
args = parser.parse_args()

# %%

# Per-library lookup tables, all keyed by the library number 0..NUM_LIBS-1.
NUM_LIBS = 3
ALL_LIBRARY_NAMES = {0: "ES7_HeLa_A", 1: "ES7_HeLa_B", 2: "ES7_HeLa_C"}
ALL_FILE_NAMES = {0: "BS11504A_S1", 1: "BS11505A_S2", 2: "BS11506A_S3"}
# Sub-folders of INPUT_FOLDER. The trailing slash matters: the folder name is
# joined to the file name by plain string concatenation further down.
ALL_BASE_DIR_NAMES = {
    0: "Sample_BS11504A/",
    1: "Sample_BS11505A/",
    2: "Sample_BS11506A/",
}


PLASMID_COUPLING_FILE_NAME = args.plasmid_coupling_file_name
# e.g. "data/Sample_BS07028A/coupling.csv"
INPUT_FOLDER = args.input_folder
OUTPUT_FOLDER = args.output_folder

# Validate user input with an explicit raise: an `assert` would be stripped
# when Python runs with -O and the script would then fail much later.
if not os.path.isdir(INPUT_FOLDER):
    raise NotADirectoryError(f"No such folder: {INPUT_FOLDER}")
if not os.path.isdir(OUTPUT_FOLDER):
    print(f"{OUTPUT_FOLDER} not found. Creating...")
    # exist_ok tolerates the folder appearing between the check and the call.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# %%
# Tallies of read pairs that are rejected before they can be attributed to a
# barcode row (malformed read 1, or barcode absent from the coupling table).
bad_read_1_reads = unknown_barcode_reads = 0

# Only analyze 1 in SUBSAMPLE_RATIO samples; used to get a quick sample for
# testing purposes; set to 1 in production.
SUBSAMPLE_RATIO = 1


def identify_splicing_pattern(read_1, read_2, read_1_q, read_2_q):
    """Classify one read pair and add it to the matching tally.

    Read 1 is parsed for the barcode and read 2 for the splice junction.
    Nothing is returned: the outcome is recorded either in the module-level
    counters ``bad_read_1_reads`` / ``unknown_barcode_reads`` or in one of
    the ``num_*`` columns of the ``barcode_statistics`` row of the barcode.

    Args:
        read_1: Barcode read; must be exactly 54 nt.
        read_2: Exon read; must be exactly 106 nt (only checked once the
            barcode has been found in ``barcode_statistics``).
        read_1_q: Accepted but not used (presumably the quality string of
            read 1).
        read_2_q: Accepted but not used (presumably the quality string of
            read 2).

    Raises:
        AssertionError: If a read does not have the expected length.
    """
    global bad_read_1_reads, unknown_barcode_reads, barcode_statistics

    # Optional down-sampling: keep on average one pair in SUBSAMPLE_RATIO.
    if SUBSAMPLE_RATIO > 1 and random.randrange(SUBSAMPLE_RATIO) != 0:
        return

    # ---- Read 1: locate the barcode ----
    # This should match the check done in the DNA data analysis.
    assert len(read_1) == 54
    if "N" in read_1:
        bad_read_1_reads += 1
        return

    # The UMI is 5-7 nt long; try each length until both constant regions
    # flanking the barcode line up (fewer than 2 mismatches each, and the two
    # bases adjacent to the barcode matching exactly).
    for umi_length in (5, 6, 7):
        if read_1[umi_length + 14 : umi_length + 16] != "AT":
            continue
        if hamming(read_1[umi_length : umi_length + 16], "TTTAAACGGGCCCTAT") >= 2:
            continue
        tail = read_1[umi_length + 36 :]
        if tail[:2] != "TC":
            continue
        # The trailing "." only pads the template so that the negative slice
        # end trims it to the length of `tail`. This check is important to
        # distinguish Lib1 carryover from Lib2 product; Lib1 ends with
        # TCTAGTGAGACGT.
        if hamming(tail, "TCTAGAGCGAGCT."[: 4 - umi_length]) >= 2:
            continue
        break
    else:
        # No frame contains the desired constant sequences.
        bad_read_1_reads += 1
        return

    # Barcode identified! It is the 20 nt right after the upstream constant.
    barcode_start = umi_length + 16
    barcode = revcomp(read_1[barcode_start : barcode_start + 20])
    if barcode not in barcode_statistics.index:
        # Barcode not in the plasmid sequencing.
        unknown_barcode_reads += 1
        return

    # ---- Read 2: from here on every outcome is recorded in the barcode's
    # own row of barcode_statistics ----
    assert len(read_2) == 106
    if "N" in read_2:
        barcode_statistics.at[barcode, "num_bad_reads"] += 1
        return

    # Read 2 starts with 0-2 extra nt before the end of exon 1. There are
    # often read errors in the beginning of the read, so allow up to 2.
    exon_1_end = "AAGTTGGTGGTGAGGCCCTGGGCAG"
    for read2_frame in range(3):
        if hamming(read_2[read2_frame : read2_frame + 25], exon_1_end) <= 2:
            break
    else:
        # We could not identify where exon 1 is.
        barcode_statistics.at[barcode, "num_bad_exon1"] += 1
        return

    # Everything that follows exon 1 decides the splicing category.
    downstream = read_2[read2_frame + 25 :]
    junction = downstream[:10]
    if junction == "CTCCTGGGCA":
        # Beginning of exon 3 right after exon 1: exon skipping.
        outcome = "num_exon_skipping"
    elif junction == "GTTGGTATCA":
        # Beginning of intron 1: intron retention.
        outcome = "num_intron_retention"
    else:
        exon = barcode_statistics.at[barcode, "exon"]
        # Expected read for full inclusion; the "." pads the exon 3 prefix so
        # the negative slice end shortens it by the frame offset.
        full_inclusion = "GTT" + exon + "CAG" + "CTCCT."[: -1 - read2_frame]
        if hamming(downstream, full_inclusion) <= 2:
            # We see the full randomized exon and exon 3.
            outcome = "num_exon_inclusion"
        elif downstream[:6] == "GTT" + exon[:3] and "CTCCTGGGCAA" in downstream[6:]:
            # Beginning of exon 2 but also beginning of exon 3; probably
            # splicing inside the randomized exon.
            outcome = "num_splicing_in_exon"
        else:
            # Unable to identify the splicing pattern.
            outcome = "num_unknown_splicing"
    barcode_statistics.at[barcode, outcome] += 1


# %%

# Per-barcode tally columns filled in by identify_splicing_pattern, in the
# order in which they appear in the output CSV.
COUNTER_COLUMNS = (
    "num_intron_retention",
    "num_exon_inclusion",
    "num_exon_skipping",
    "num_bad_reads",
    "num_bad_exon1",
    "num_splicing_in_exon",
    "num_unknown_splicing",
)

# Parse the coupling table once and zero the tallies; every library then
# starts from its own deep copy instead of re-reading the CSV.
coupling_template = pd.read_csv(PLASMID_COUPLING_FILE_NAME).set_index("barcode")
for _column in COUNTER_COLUMNS:
    coupling_template[_column] = 0

all_barcode_statistics = []
for lib_num in tqdm(range(NUM_LIBS), desc="Iterating libraries"):
    # Reset the module-level state that identify_splicing_pattern updates.
    bad_read_1_reads = 0
    unknown_barcode_reads = 0
    barcode_statistics = coupling_template.copy()

    BASE_DIR_NAME = ALL_BASE_DIR_NAMES[lib_num]
    FILE_NAME = ALL_FILE_NAMES[lib_num]
    # BASE_DIR_NAME already ends with "/", so concatenation yields a path.
    FULL_FILE_NAME = BASE_DIR_NAME + FILE_NAME
    num_reads = process_paired_fastq_file(
        os.path.join(INPUT_FOLDER, FULL_FILE_NAME + "_R1_001.fastq"),
        os.path.join(INPUT_FOLDER, FULL_FILE_NAME + "_R2_001.fastq"),
        identify_splicing_pattern,
    )
    print(
        "Done reading file",
        FILE_NAME,
        "(" + ALL_LIBRARY_NAMES[lib_num] + ")",
        ":",
        human_format(num_reads),
        "total reads;",
        human_format(unknown_barcode_reads),
        "reads with unknown barcode",
        human_format(bad_read_1_reads),
        "reads with bad Read 1",
    )
    # One CSV per library, plus an in-memory copy of each result.
    barcode_statistics.to_csv(
        os.path.join(OUTPUT_FOLDER, FILE_NAME + "_splicing_analysis.csv")
    )
    all_barcode_statistics.append(barcode_statistics)