IDPLab commited on
Commit
f8fe9da
·
verified ·
1 Parent(s): 9f0ecbd

Upload 9 files

Browse files
Tesei-trained_Model/filter_dataset/LL28k_w2_NC_T37_cs150_out_02.csv ADDED
The diff for this file is too large to render. See raw diff
 
Tesei-trained_Model/filter_dataset/LL28k_w2_NC_T37_cs150_out_02_composition.csv ADDED
The diff for this file is too large to render. See raw diff
 
Tesei-trained_Model/filter_dataset/README.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Sequence Processing and Filtering Pipeline
2
+ Last updated by Lilianna Houston, Sept. 4th 2024
3
+
4
+ This pipeline processes a dataset of protein sequences in preparation for Machine Learning. It identifies homologs using sequence length, amino-acid composition, and the NCBI BLAST tool. Follow the steps below to execute the analysis.
5
+
6
+ Workflow Overview:
7
+
8
+ --> Upload Sequence Data
9
+
10
+ Begin by uploading the data file containing your sequences. This pipeline was originally used on the Tesei 2024 human IDP proteome but can be used on any dataset. The Tesei 2024 set is used as an example here, and the original data file is named "LL28k_w2_NC_T37_cs150_out_02.csv".
11
+
12
+ --> Identify Length-Based Sequence Pairs
13
+
14
+ Run identify_length_pairs.py to identify pairs of sequences whose lengths differ by fewer than 15 residues.
15
+
16
+ --> Calculate Composition Differences
17
+
18
+ Simultaneously, run full_comp_dif.py to compute the composition differences between every possible pair of sequences. This step will generate two outputs:
19
+ full_comp_difs.npy: A square NumPy array representing the composition differences across the dataset.
20
+ LL28k_w2_NC_T37_cs150_out_02_composition.csv: A CSV file that includes the original dataset with additional columns for the counts of each amino acid.
21
+
22
+ --> Filter Pairs Based on Length and Composition
23
+
24
+ Execute test_compo.py to filter pairs that meet both the length and composition difference criteria. This will produce indice_pairs_w_compo.csv, which contains the indices of the final pairs along with their composition differences.
25
+
26
+ --> Remove Homologous and NaN Sequences
27
+
28
+ Finally, run blast_filter.py to filter out sequences with NaN w2 values and homologous sequences from the original dataset. The output will be LL28k_filtered_nanW2s_noRepeats.csv, a refined dataset excluding the undesired sequences.
Tesei-trained_Model/filter_dataset/blast_filter.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Bio.Blast.Applications import NcbiblastpCommandline
2
+ import os
3
+ import pandas as pd
4
+ import time
5
+ import networkx as nx
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+
9
# Load data:
#  - the original sequence dataset (one protein sequence per row), and
#  - the candidate homolog pairs produced by the earlier pipeline steps
#    (columns "index1"/"index2", used below).
# NOTE(review): test_compo.py writes "indice_pairs_w_compo.csv" but this
# script reads "indice_pairs_w_combos.csv" -- confirm which filename is
# the intended one before running the pipeline end-to-end.
dataset = pd.read_csv("LL28k_w2_NC_T37_cs150_out_02.csv")
seqs = dataset['fasta']  # protein sequences, indexed by row position
pairs = pd.read_csv(r"indice_pairs_w_combos.csv")
13
+
14
+ #Blast functions
15
def validate_protein_sequence(sequence):
    """Return True iff *sequence* contains only the 20 standard
    single-letter amino-acid codes (an empty sequence is vacuously
    valid)."""
    return set(sequence) <= set("ACDEFGHIKLMNPQRSTVWY")
19
+
20
def run_blast_with_biopython(query_seq, subject_seq):
    """Run a pairwise blastp between two protein sequences.

    Writes both sequences to temporary FASTA files, invokes a local
    blastp executable (tabular output, outfmt 6), and returns a tuple
    ``(percent_identity, coverage)`` for the first reported hit, where
    coverage is the alignment length as a percentage of the query
    length.  Returns ``(0, 0)`` when BLAST reports no hit or when
    either sequence contains a non-standard residue.
    """
    # Reject sequences with non-standard residues up front.  FIX:
    # returning (0, 0) instead of an implicit None keeps the caller's
    # `iden, cov = run_blast_with_biopython(...)` unpack from crashing.
    if not validate_protein_sequence(query_seq):
        print("Invalid characters found in query sequence:", query_seq)
        return 0, 0
    if not validate_protein_sequence(subject_seq):
        print("Invalid characters found in subject sequence:", subject_seq)
        return 0, 0

    # Create temporary files for query and subject sequences.
    with open("query.fasta", "w") as query_file:
        query_file.write(">query\n" + query_seq)
    with open("subject.fasta", "w") as subject_file:
        subject_file.write(">subject\n" + subject_seq)

    try:
        # Use the full path to the blastp executable.
        blastp_cline = NcbiblastpCommandline(
            cmd=r'C:\Users\lilia\Anaconda3\envs\ml135_env_sp21\lib\site-packages\biopython-1.84.dist-info\ncbi-blast-2.16.0+-x64-win64\ncbi-blast-2.16.0+\bin\blastp.exe',
            query="query.fasta",
            subject="subject.fasta",
            outfmt=6,   # tabular: qid, sid, %identity, aln length, mismatches, ...
            evalue=10,  # permissive e-value threshold to be more inclusive
        )

        # Execute BLAST and capture the output.
        stdout, stderr = blastp_cline()

        # FIX: report diagnostics before returning (this check was dead
        # code after the returns in the original flow).
        if stderr:
            print("BLAST Errors:\n", stderr)

        if stdout:
            print("BLAST found a match")
            # outfmt 6 columns: 0 qid, 1 sid, 2 %identity, 3 alignment
            # length, ... -- only the first hit line is used.
            first_hit = stdout.strip().split("\n")[0]
            columns = first_hit.split("\t")
            return float(columns[2]), int(columns[3]) / len(query_seq) * 100

        print("No significant matches found.")
        return 0, 0
    finally:
        # FIX: clean up the temporary files on every path (the removes
        # were previously unreachable, placed after the returns).
        os.remove("query.fasta")
        os.remove("subject.fasta")
67
# Boolean column: 1 if BLAST confirms the pair as homologous
# (>80% identity AND >80% query coverage), 0 otherwise.
blast_match = []

for idx1, idx2 in zip(pairs["index1"], pairs["index2"]):
    iden, cov = run_blast_with_biopython(seqs[idx1], seqs[idx2])
    blast_match.append(1 if iden > 80 and cov > 80 else 0)
    print("\n")

pairs["blast_match"] = blast_match
pairs.to_csv("blasted_pairs.csv", index=False)
79
+
80
# Build a graph whose NODES are the BLAST-confirmed pairs; two pair-nodes
# are connected when they share a first index or share a second index,
# so each connected component groups pairs belonging to one homolog
# family (from which a single sequence will be kept).
# NOTE(review): pairs are NOT linked when one pair's index1 equals the
# other pair's index2 -- confirm this asymmetry is intentional.

# Sequence indices of all BLAST-confirmed pairs.
x_list = list(pairs[pairs["blast_match"] == 1]["index1"])
y_list = list(pairs[pairs["blast_match"] == 1]["index2"])

# Initialize the graph with one node per confirmed pair, annotated
# with that pair's two sequence indices.
G = nx.Graph()
G.add_nodes_from(
    (k, {"x": x, "y": y}) for k, (x, y) in enumerate(zip(x_list, y_list))
)

# Link pair-nodes that share an index in the same position.
G.add_edges_from(
    (a, b)
    for a in range(len(x_list))
    for b in range(a + 1, len(x_list))
    if x_list[a] == x_list[b] or y_list[a] == y_list[b]
)

# Each connected component is one group of linked homolog pairs.
groups = list(nx.connected_components(G))
101
+
102
# Materialize each connected component as the list of its
# (index1, index2) sequence-index pairs.
result = [
    [(x_list[k], y_list[k]) for k in group]
    for group in groups
]
107
+
108
# Number of distinct sequences in each homolog group.
# FIX: a set comprehension replaces the original O(n^2) "not in list"
# dedup -- only the count is used downstream, so ordering is irrelevant
# and the result is identical.
true_lengths = [len({idx for pair in grp for idx in pair}) for grp in result]

# Quick sanity check: mean group size and its distribution.
print(np.mean(true_lengths))
plt.hist(true_lengths, bins=20)
plt.show()
124
+
125
def process_group(num):
    """Choose which sequences of homolog group *num* to discard.

    Collects the distinct sequence indices appearing in
    ``result[num]`` (in first-appearance order), keeps the index whose
    sequence is longest (first occurrence wins ties), and returns the
    remaining indices as the rejects.

    Relies on the module-level ``result`` (grouped pairs) and ``seqs``
    (sequence strings).  FIX: removed the unused local
    ``value_to_keep`` and the dead commented-out debug prints.
    """
    group = []  # distinct sequence indices, first-appearance order
    lens = []   # parallel list of sequence lengths
    for index1, index2 in result[num]:
        if index1 not in group:
            group.append(index1)
            lens.append(len(seqs[index1]))
        if index2 not in group:
            group.append(index2)
            lens.append(len(seqs[index2]))

    # Position of the first longest sequence; everything else in the
    # group is rejected.
    index_of_max = lens.index(max(lens))
    return [value for i, value in enumerate(group) if i != index_of_max]
151
+
152
# Pool the rejected sequence indices from every homolog group.
all_reject_vals = [
    reject
    for group_num in range(len(result))
    for reject in process_group(group_num)
]
155
+
156
# Flag each dataset row: 1 if its index was rejected as a homolog, 0
# otherwise.  A set makes the per-row membership test O(1) instead of
# O(n) against the reject list.
reject_set = set(all_reject_vals)
reject_column_for_dataset = [
    1 if i in reject_set else 0 for i in range(len(dataset))
]

# BUG FIX: this previously assigned the undefined name `reject_list`
# (NameError); use the column actually built above.
dataset["reject"] = reject_column_for_dataset
dataset.to_csv("LL28k_w_rejects.csv")

# BUG FIX: the NaN filter was previously discarded because the second
# assignment restarted from `dataset`; chain the two filters so the
# final output drops BOTH rows with NaN 'w2' values and homolog
# rejects, as the README describes.
df_filtered = dataset.dropna(subset=['w2'])   # drop rows where 'w2' is NaN
df_filtered = df_filtered[df_filtered['reject'] != 1]
df_filtered.to_csv("LL28k_filtered_nanW2s_noRepeats.csv")
Tesei-trained_Model/filter_dataset/blasted_pairs.csv ADDED
The diff for this file is too large to render. See raw diff
 
Tesei-trained_Model/filter_dataset/full_comp_dif.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import math
4
+ import matplotlib.pyplot as plt
5
+
6
# Input dataset: one protein sequence per row in the 'fasta' column.
dataset = pd.read_csv("LL28k_w2_NC_T37_cs150_out_02.csv")
seqs = dataset["fasta"]
# constants
# The 20 standard amino acids.  NOTE(review): the values look like net
# side-chain charges (R/K = +1, D/E = -1) but are unused in this
# script -- only the keys are iterated; confirm before relying on them.
amino_acid_data = {
    "A": 0,
    "R": 1,
    "N": 0,
    "D": -1,
    "C": 0,
    "E": -1,
    "Q": 0,
    "G": 0,
    "H": 0,
    "I": 0,
    "L": 0,
    "K": 1,
    "M": 0,
    "F": 0,
    "P": 0,
    "S": 0,
    "T": 0,
    "W": 0,
    "Y": 0,
    "V": 0,
}

# Iterable of the 20 amino-acid letters.
aas = amino_acid_data.keys()

# Add one zero-initialized per-amino-acid count column (e.g. "A_count").
column_names = [f'{i}_count' for i in aas]
for name in column_names:
    dataset[name] = 0
37
+
38
def count_aa(seq, index):
    """Write the per-amino-acid counts of *seq* into row *index* of the
    module-level ``dataset`` (columns "A_count" ... "V_count")."""
    for aa in aas:
        dataset.at[index, f'{aa}_count'] = seq.count(aa)
42
+
43
# Fill the count columns for every sequence, then persist the table
# augmented with the composition counts.
for row, sequence in enumerate(dataset['fasta']):
    count_aa(sequence, row)

dataset.to_csv("LL28k_w2_NC_T37_cs150_out_02_composition.csv", index=False)
47
+
48
def comp_two_seqs(i, j):
    """Squared Euclidean distance between the amino-acid composition
    fractions (count / sequence length) of dataset rows *i* and *j*.

    Reads the module-level ``dataset`` count columns filled earlier.
    """
    len_i = len(dataset['fasta'][i])
    len_j = len(dataset['fasta'][j])
    return sum(
        (dataset[f'{aa}_count'][i] / len_i - dataset[f'{aa}_count'][j] / len_j) ** 2
        for aa in aas
    )
60
+
61
# Lower-triangular matrix of pairwise composition differences; entries
# with j >= i stay zero (the metric is symmetric, so only one triangle
# is computed).
difs = np.zeros((len(dataset), len(dataset)))
for i in range(len(seqs)):
    for j in range(i):
        difs[i, j] = comp_two_seqs(i, j)

# Persist as full_comp_difs.npy for the downstream filtering step.
np.save("full_comp_difs", difs)
Tesei-trained_Model/filter_dataset/identify_length_pairs.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import math
4
+ import time
5
+
6
# load data
# NOTE(review): absolute Windows path -- update this (or run from the
# data directory) on other machines.
dataset = pd.read_csv(r"C:\Users\lilia\Documents\DU\research\Computational_IDP_Research\ML\homologs\LL28k_w2_NC_T37_cs150_out_02.csv")
seqs = dataset["fasta"]

# Compare lengths pairwise (j < i) and record pairs of sequences whose
# lengths differ by fewer than 15 residues.
# FIX: hoist the sequence lengths out of the O(n^2) loop so len() is
# not recomputed on every comparison.
lengths = [len(s) for s in seqs]

len_match1 = []
len_match2 = []

start_time = time.time()
for i in range(len(lengths)):
    for j in range(i):
        if abs(lengths[i] - lengths[j]) < 15:
            len_match1.append(i)
            len_match2.append(j)
    if i % 200 == 0:
        print(f"Elapsed time: {(time.time() - start_time):.2f} seconds, index", i)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total time: {elapsed_time:.2f} seconds")

# BUG FIX: these saves were commented out (so the script produced no
# output) and used filenames ("len_match_indices1.npy") that the next
# pipeline step never reads.  test_compo.py loads "len_match1.npy" /
# "len_match2.npy", so save under those names.
np.save("len_match1.npy", np.asarray(len_match1))
np.save("len_match2.npy", np.asarray(len_match2))
Tesei-trained_Model/filter_dataset/indice_pairs_w_combos.csv ADDED
The diff for this file is too large to render. See raw diff
 
Tesei-trained_Model/filter_dataset/test_compo.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import math
4
+ import matplotlib.pyplot as plt
5
+ import csv
6
+ import time
7
+
8
# Inputs: the full pairwise composition-difference matrix and the
# length-matched pair indices produced by the upstream steps.
full_compos = np.load("full_comp_difs.npy")
len_match1 = np.load("len_match1.npy")
len_match2 = np.load("len_match2.npy")

final_pairs1 = []
final_pairs2 = []
compos = []

# Keep only the length-matched pairs whose composition difference is
# below the 0.005 threshold.
for a, b in zip(len_match1, len_match2):
    dif = full_compos[a, b]
    if dif < .005:
        final_pairs1.append(a)
        final_pairs2.append(b)
        compos.append(dif)

# Persist the surviving pairs with their composition differences.
df = pd.DataFrame({
    'index1': final_pairs1,
    'index2': final_pairs2,
    'compo': compos
})

df.to_csv('indice_pairs_w_compo.csv', index=False)

print(len(compos))