Spaces:
Sleeping
Sleeping
File size: 7,151 Bytes
6766437 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 |
# Reads the raw DNA reads, filters out those that don't look like barcode reads, and sorts by barcode
import sys
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import io
import os
from collections import Counter
from utils import *
import argparse
# Command-line interface: a single mandatory option naming the folder that
# contains the sample directory with the sequencing files.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--input_folder",
    type=str,
    required=True,
    help="Input folder",
)
args = parser.parse_args()
###############################################
## Read the plasmid sequencing file into memory
## Once a barcode is identified, the corresponding exon is stored in a Counter for later analysis.
## If the exon cannot be read, the pair is recorded as an error (it cannot be ignored, since it
## might correspond to a badly coupled barcode, like CACATACTGAGCATCTAACT in ES7A)
couplings = {}


def collect_barcodes(read_1, read_2, read_1_q, read_2_q):
    """Classify one read pair and tally it in the module-level counters.

    Args:
        read_1: barcode read; asserted to be exactly 54 characters long.
        read_2: exon read; asserted to be exactly 106 characters long, but
            only once read 1 has been accepted as a barcode read.
        read_1_q: not used by this function (presumably the quality string
            of read 1 -- confirm against process_paired_fastq_file).
        read_2_q: not used by this function (presumably the quality string
            of read 2 -- confirm against process_paired_fastq_file).

    Side effects on module globals:
        reads_with_N: incremented when read 1 contains an "N".
        unidentified_reads: incremented when either constant region flanking
            the barcode in read 1 does not match.
        good_reads: incremented for every read pair whose barcode is accepted.
        couplings: maps barcode -> [Counter of exon sequences, number of
            read pairs whose exon could not be read].

    Raises:
        AssertionError: if a read does not have the expected length.
    """
    global good_reads, reads_with_N, unidentified_reads
    global couplings, lib_num

    # Check read 1 (barcode) proper format
    # This should match the check done in the RNA data analysis
    assert len(read_1) == 54

    if "N" in read_1:
        reads_with_N += 1
        return

    # Constant region before the barcode (positions 5..20): its last two bases
    # must be exact, and the whole region may differ by at most one base.
    upstream = read_1[5 : 5 + 16]
    upstream_ok = upstream[14:] == "AT" and hamming(upstream, "TTTAAACGGGCCCTAT") < 2
    if not upstream_ok:
        unidentified_reads += 1
        return

    # Constant region after the barcode (position 41 to the end): its first two
    # bases must be exact, and the whole region may differ by at most one base.
    downstream = read_1[41:]
    downstream_ok = downstream[:2] == "TC" and hamming(downstream, "TCTAGTGAGACGT") < 2
    if not downstream_ok:
        unidentified_reads += 1
        return

    # Barcode identified! It is stored as the reverse complement of the 20 nt
    # between the two constant regions.
    barcode_start = 21
    barcode = revcomp(read_1[barcode_start : barcode_start + 20])
    good_reads += 1

    # Fetch this barcode's record, creating it on first sight:
    # [exon counter, number of bad exon reads]
    record = couplings.setdefault(barcode, [Counter(), 0])

    # Check read 2 (exon) proper format
    assert len(read_2) == 106

    # A read 2 is bad when it contains an N, does not start with the
    # library-specific 12-nt prefix, or lacks "CAGGT" right after the exon.
    # This also filters out exons with deletions, which are reasonably common; consider improving.
    exon_unreadable = (
        "N" in read_2
        or read_2[:12] != {0: "GCCATCCAGGTT"}[lib_num]
        or read_2[82:87] != "CAGGT"
    )
    if exon_unreadable:
        record[1] += 1
        return

    # All checks passed: count the 70-nt exon sequence for this barcode.
    record[0][read_2[12 : 12 + 70]] += 1
# Per-library configuration, keyed by library number.
ALL_LIBRARY_NAMES = {0: "ES7A"}
ALL_FILE_NAMES = {0: "BS06911A_S22"}
ALL_BASE_DIR_NAMES = {0: os.path.join(args.input_folder, "Sample_BS06911A/")}

# One couplings dict per library, appended in iteration order. It is later
# indexed with the library number, so library numbers must be 0, 1, 2, ...
# in the order they appear in ALL_LIBRARY_NAMES.
all_couplings = []

for lib_num in ALL_LIBRARY_NAMES:
    # Reset the module-level state that collect_barcodes updates per read pair.
    good_reads = reads_with_N = unidentified_reads = 0
    couplings = {}

    BASE_DIR_NAME = ALL_BASE_DIR_NAMES[lib_num]
    FILE_NAME = ALL_FILE_NAMES[lib_num]
    # BASE_DIR_NAME already ends with a path separator.
    FULL_FILE_NAME = f"{BASE_DIR_NAME}{FILE_NAME}"

    # collect_barcodes is handed to the reader as the per-read-pair callback.
    num_reads = process_paired_fastq_file(
        f"{FULL_FILE_NAME}_R1_001.fastq",
        f"{FULL_FILE_NAME}_R2_001.fastq",
        collect_barcodes,
    )
    all_couplings.append(couplings)

    # Read statistics for this library.
    print(
        *(
            "Done reading file",
            FILE_NAME,
            ":",
            human_format(num_reads),
            "total reads;",
            human_format(unidentified_reads),
            "unidentified reads; ",
            human_format(reads_with_N),
            "reads with N;",
            human_format(good_reads),
            "good reads",
        )
    )
##############################
# Check coupling between exons and barcodes, and write it to file
##############################
# Below this threshold, the barcode is not even written to the coupling file; this is meant to
# filter read errors in the barcode, as those are unlikely to be seen more than once.
MIN_NUMBER_OF_READS = 2

for lib_num in ALL_LIBRARY_NAMES:
    print("Processing", ALL_FILE_NAMES[lib_num])
    num_keys_with_enough_reads = 0
    num_uniquely_coupled_keys = 0
    num_too_many_errors_in_exon = 0
    num_with_no_clear_majority = 0
    barcode_coupling = []
    couplings = all_couplings[lib_num]

    # The context manager closes coupling.txt even if processing a barcode
    # raises, so lines already written are not lost in an unflushed buffer.
    with open(ALL_BASE_DIR_NAMES[lib_num] + "coupling.txt", "w") as f:
        # Each value is [Counter of exon sequences, number of unreadable exon reads].
        for barcode, (coupling_data, num_bad_reads) in tqdm(couplings.items()):
            # Total number of reads for that barcode; note that len(coupling_data)
            # gives the number of different *exons* associated with this barcode.
            reads_for_barcode = sum(coupling_data.values())
            if reads_for_barcode < MIN_NUMBER_OF_READS:  # too few reads
                continue
            num_keys_with_enough_reads += 1

            # Rank the exons once and reuse the result; reads_for_barcode >= 2
            # guarantees there is at least one entry.
            top_two_exons = coupling_data.most_common(2)
            most_common_full_exon, num_reads_most_common = top_two_exons[0]

            # Anything seen at least this often (never fewer than 2 reads, or a
            # quarter of the winning exon's reads) is too frequent to be noise.
            noise_ceiling = max(2, num_reads_most_common / 4)

            # Second most common exon should not be too common (since random
            # errors should not form clusters).
            if len(top_two_exons) > 1 and top_two_exons[1][1] >= noise_ceiling:
                badly_coupled = True
                num_with_no_clear_majority += 1
            # Number of bad reads should also not be too high.
            elif num_bad_reads >= noise_ceiling:
                badly_coupled = True
                num_too_many_errors_in_exon += 1
            else:
                badly_coupled = False
                num_uniquely_coupled_keys += 1

            # TODO: flanking should use utils file
            most_common_full_exon_with_flanking = (
                "AGGTT" + most_common_full_exon + "CAGGT"
            )
            # Look for CGTCTC or its reverse complement GAGACG, including sites
            # that span the junction between the exon and its flanks.
            most_common_full_exon_contains_restriction_site = (
                "CGTCTC" in most_common_full_exon_with_flanking
                or "GAGACG" in most_common_full_exon_with_flanking
            )

            barcode_coupling.append(
                [
                    barcode,
                    most_common_full_exon,
                    badly_coupled,
                    most_common_full_exon_contains_restriction_site,
                    reads_for_barcode,
                ]
            )

            # One line per barcode in coupling.txt, prefixed by a status marker:
            # "?" restriction site present, "*" badly coupled, "." uniquely coupled.
            if most_common_full_exon_contains_restriction_site:
                status_marker = "?"
            elif badly_coupled:
                status_marker = "*"
            else:
                status_marker = "."
            print(status_marker, barcode, top_two_exons, file=f)

    print(
        "Total number of barcodes seen:",
        human_format(len(couplings)),
        "Barcodes with enough reads:",
        human_format(num_keys_with_enough_reads),
        "Uniquely coupled barcodes:",
        human_format(num_uniquely_coupled_keys),
        "Barcodes with too many errors in exon reads:",
        human_format(num_too_many_errors_in_exon),
        "Barcodes with no clear majority exon:",
        human_format(num_with_no_clear_majority),
    )

    # Machine-readable version of the same table, indexed by barcode.
    df = pd.DataFrame(
        barcode_coupling,
        columns=[
            "barcode",
            "exon",
            "badly_coupled",
            "contains_restriction_site",
            "num_reads",
        ],
    ).set_index("barcode")
    df.to_csv(ALL_BASE_DIR_NAMES[lib_num] + "coupling.csv")
    print(f"Wrote coupling.csv to {ALL_BASE_DIR_NAMES[lib_num]}coupling.csv")
|