# Take cDNA reads, and produce splicing statistics per barcode # Read 1: NNNNN[N][N]TTTAAACGGGCCCTATNNNNNNNNNNNNNNNNNNNNTCTAGAGCGAG[CT] # Number of Ns (UMI) is random 5-7; barcode is 20N # Read 2: # Diversity: [NN] (0-2Ns) # End of exon 1: AAGTTGGTGGTGAGGCCCTGGGCAG # Exon 2: GTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAG # Exon 3: CTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTCTAGA # Plasmid library: # ES7A_Lib1 BS06911A # %% import os import pandas as pd import numpy as np from collections import Counter import random from utils import * import argparse from tqdm.auto import tqdm # %% parser = argparse.ArgumentParser() parser.add_argument("--input_folder", required=True, type=str, help="Input folder") parser.add_argument("--output_folder", required=True, type=str, help="Output folder") parser.add_argument( "--plasmid_coupling_file_name", required=True, type=str, help="Plasmid coupling filename. Must be a csv file.", ) args = parser.parse_args() # %% NUM_LIBS = 3 ALL_LIBRARY_NAMES = {0: "ES7_HeLa_A", 1: "ES7_HeLa_B", 2: "ES7_HeLa_C"} ALL_FILE_NAMES = {0: "BS11504A_S1", 1: "BS11505A_S2", 2: "BS11506A_S3"} ALL_BASE_DIR_NAMES = { 0: "Sample_BS11504A/", 1: "Sample_BS11505A/", 2: "Sample_BS11506A/", } PLASMID_COUPLING_FILE_NAME = args.plasmid_coupling_file_name # "data/Sample_BS07028A/coupling.csv" INPUT_FOLDER = args.input_folder OUTPUT_FOLDER = args.output_folder assert os.path.isdir(INPUT_FOLDER), f"No such folder: {INPUT_FOLDER}" if not os.path.isdir(OUTPUT_FOLDER): print(f"{OUTPUT_FOLDER} not found. Creating...") os.makedirs(OUTPUT_FOLDER) # %% bad_read_1_reads = 0 unknown_barcode_reads = 0 SUBSAMPLE_RATIO = 1 # Only analyze 1 in SUBSAMPLE_RATIO samples; used to get a quick sample for testing purposes; set to 1 in production def identify_splicing_pattern(read_1, read_2, read_1_q, read_2_q): global bad_read_1_reads global unknown_barcode_reads global barcode_statistics if (SUBSAMPLE_RATIO > 1) and (random.randrange(SUBSAMPLE_RATIO) != 0): return # Check read 1 (barcode) proper format # This should match the check done in the DNA data analysis assert len(read_1) == 54 if "N" in read_1: bad_read_1_reads += 1 return # Try to identify length of UMI (5-7nt) umi_length = -1 for i in ( 5, 6, 7, ): if (read_1[i + 14 : i + 16] != "AT") or ( hamming(read_1[i : i + 16], "TTTAAACGGGCCCTAT") >= 2 ): continue if (read_1[i + 36 : i + 38] != "TC") or ( hamming(read_1[i + 36 :], "TCTAGAGCGAGCT."[: 4 - i]) >= 2 ): # this is important to distinguish Lib1 carryover from Lib2 product; Lib1 ends with TCTAGTGAGACGT continue umi_length = i break if ( umi_length == -1 ): # we were unable to identify a frame containing the desired sequences bad_read_1_reads += 1 return # Barcode identified! barcode = revcomp(read_1[umi_length + 16 : umi_length + 16 + 20]) if not barcode in barcode_statistics.index: # Barcode not in the plasmid sequencing unknown_barcode_reads += 1 return # At this point we identified the barcode and found it in the coupling database, so the output should be recorded in the Dataframe row for that barcode # Check read 2 (exon) proper format assert len(read_2) == 106 if "N" in read_2: barcode_statistics.at[barcode, "num_bad_reads"] += 1 return EXON_1 = "AAGTTGGTGGTGAGGCCCTGGGCAG" read2_frame = -1 for i in range( 3 ): # try to identify frame of read (0-2nt of Ns in beginning of Read 2) if ( hamming(read_2[i : i + 25], EXON_1) > 2 ): # there are often read errors in the beginning of the read. Allow up to 2. continue read2_frame = i break if read2_frame == -1: # we could not identify where exon 1 is barcode_statistics.at[barcode, "num_bad_exon1"] += 1 return if ( read_2[read2_frame + 25 : read2_frame + 35] == "CTCCTGGGCA" ): # this is the beginning of exon 3, so we have exon skipping barcode_statistics.at[barcode, "num_exon_skipping"] += 1 return if ( read_2[read2_frame + 25 : read2_frame + 35] == "GTTGGTATCA" ): # this is the beginning of intron 1, so we have intron retention barcode_statistics.at[barcode, "num_intron_retention"] += 1 return if ( hamming( read_2[read2_frame + 25 :], "GTT" + barcode_statistics.at[barcode, "exon"] + "CAG" + "CTCCT."[: -1 - read2_frame], ) <= 2 ): # we see the full randomized exon and exon 3 barcode_statistics.at[barcode, "num_exon_inclusion"] += 1 return if ( read_2[read2_frame + 25 : read2_frame + 25 + 6] == "GTT" + barcode_statistics.at[barcode, "exon"][:3] ) and ( "CTCCTGGGCAA" in read_2[read2_frame + 25 + 6 :] ): # we see the beginning of exon 2, but also beginning of exon 3; probably splicing in randomized exon barcode_statistics.at[barcode, "num_splicing_in_exon"] += 1 return # otherwise, we were unable to identify the splicing pattern barcode_statistics.at[barcode, "num_unknown_splicing"] += 1 # %% all_barcode_statistics = [] for lib_num in tqdm(range(NUM_LIBS), desc="Iterating libraries"): bad_read_1_reads = 0 unknown_barcode_reads = 0 barcode_statistics = pd.read_csv(PLASMID_COUPLING_FILE_NAME).set_index("barcode") barcode_statistics["num_intron_retention"] = [ 0 for i in range(len(barcode_statistics)) ] barcode_statistics["num_exon_inclusion"] = [ 0 for i in range(len(barcode_statistics)) ] barcode_statistics["num_exon_skipping"] = [ 0 for i in range(len(barcode_statistics)) ] barcode_statistics["num_bad_reads"] = [0 for i in range(len(barcode_statistics))] barcode_statistics["num_bad_exon1"] = [0 for i in range(len(barcode_statistics))] barcode_statistics["num_splicing_in_exon"] = [ 0 for i in range(len(barcode_statistics)) ] barcode_statistics["num_unknown_splicing"] = [ 0 for i in range(len(barcode_statistics)) ] BASE_DIR_NAME = ALL_BASE_DIR_NAMES[lib_num] FILE_NAME = ALL_FILE_NAMES[lib_num] FULL_FILE_NAME = BASE_DIR_NAME + FILE_NAME num_reads = process_paired_fastq_file( os.path.join(INPUT_FOLDER, FULL_FILE_NAME + "_R1_001.fastq"), os.path.join(INPUT_FOLDER, FULL_FILE_NAME + "_R2_001.fastq"), identify_splicing_pattern, ) print( "Done reading file", FILE_NAME, "(" + ALL_LIBRARY_NAMES[lib_num] + ")", ":", human_format(num_reads), "total reads;", human_format(unknown_barcode_reads), "reads with unknown barcode", human_format(bad_read_1_reads), "reads with bad Read 1", ) barcode_statistics.to_csv( os.path.join(OUTPUT_FOLDER, FILE_NAME + "_splicing_analysis.csv") ) all_barcode_statistics.append(barcode_statistics)