Upload 9 files
- Tesei-trained_Model/filter_dataset/LL28k_w2_NC_T37_cs150_out_02.csv +0 -0
- Tesei-trained_Model/filter_dataset/LL28k_w2_NC_T37_cs150_out_02_composition.csv +0 -0
- Tesei-trained_Model/filter_dataset/README.txt +28 -0
- Tesei-trained_Model/filter_dataset/blast_filter.py +167 -0
- Tesei-trained_Model/filter_dataset/blasted_pairs.csv +0 -0
- Tesei-trained_Model/filter_dataset/full_comp_dif.py +68 -0
- Tesei-trained_Model/filter_dataset/identify_length_pairs.py +27 -0
- Tesei-trained_Model/filter_dataset/indice_pairs_w_combos.csv +0 -0
- Tesei-trained_Model/filter_dataset/test_compo.py +30 -0
Tesei-trained_Model/filter_dataset/LL28k_w2_NC_T37_cs150_out_02.csv
ADDED
The diff for this file is too large to render. See raw diff.
Tesei-trained_Model/filter_dataset/LL28k_w2_NC_T37_cs150_out_02_composition.csv
ADDED
The diff for this file is too large to render. See raw diff.
Tesei-trained_Model/filter_dataset/README.txt
ADDED
@@ -0,0 +1,28 @@
Sequence Processing and Filtering Pipeline
Last updated by Lilianna Houston, Sept. 4th 2024

This pipeline processes a dataset of protein sequences in preparation for machine learning. It identifies homologs using length, composition, and the NCBI BLAST tool. Follow the steps below to execute the analysis.

Workflow Overview:

--> Upload Sequence Data

Begin by uploading the data file containing your sequences. This pipeline was originally used on the Tesei 2024 human IDP proteome but can be applied to any dataset. The Tesei 2024 set is used as an example here, and the original data file is named "LL28k_w2_NC_T37_cs150_out_02.csv".
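A quick sanity check on the input file can save time later; a minimal sketch (it assumes only the two columns the scripts below rely on, 'fasta' for the sequences and 'w2' for the target values):

    import pandas as pd

    dataset = pd.read_csv("LL28k_w2_NC_T37_cs150_out_02.csv")
    print(len(dataset), "sequences loaded")
    print(dataset[["fasta", "w2"]].head())  # the columns used downstream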
--> Identify Length-Based Sequence Pairs

Run identify_length_pairs.py to identify pairs of sequences whose difference in length is less than 15 residues. A vectorized alternative is sketched below.
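The script's pairwise loop is O(N^2) in pure Python; a NumPy broadcasting sketch (not part of the pipeline; note the intermediate matrices need a few GB of RAM for 28k sequences, so chunk if memory is tight):

    import numpy as np
    import pandas as pd

    dataset = pd.read_csv("LL28k_w2_NC_T37_cs150_out_02.csv")
    lengths = dataset["fasta"].str.len().to_numpy(dtype=np.int32)

    # Pairwise absolute length differences via broadcasting
    close = np.abs(lengths[:, None] - lengths[None, :]) < 15
    len_match1, len_match2 = np.where(np.tril(close, k=-1))  # keep only j < i

    np.save("len_match1.npy", len_match1)
    np.save("len_match2.npy", len_match2)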
--> Calculate Composition Differences

Simultaneously, run full_comp_dif.py to compute the composition differences between every possible pair of sequences. This step generates two outputs:

full_comp_difs.npy: A square NumPy array holding the composition differences across the dataset (only the lower triangle, index1 > index2, is filled).
LL28k_w2_NC_T37_cs150_out_02_composition.csv: A CSV file containing the original dataset with additional columns for the counts of each amino acid.
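The composition difference implemented in full_comp_dif.py is the sum, over the 20 amino acids, of the squared difference in per-residue frequency between two sequences. A minimal standalone sketch of the metric:

    AAS = "ACDEFGHIKLMNPQRSTVWY"

    def composition_difference(seq_a, seq_b):
        # Sum of squared differences in amino acid frequency
        return sum(
            (seq_a.count(aa) / len(seq_a) - seq_b.count(aa) / len(seq_b)) ** 2
            for aa in AAS
        )

    print(composition_difference("MKKLLP", "MKKLIP"))  # squared-frequency distance

Pairs that pass the length filter and score below 0.005 on this metric are kept in the next step.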
--> Filter Pairs Based on Length and Composition

Execute test_compo.py to filter pairs that meet both the length and composition difference criteria. This produces indice_pairs_w_combos.csv, which contains the indices of the final pairs along with their composition differences.

--> Remove Homologous and NaN Sequences

Finally, run blast_filter.py to remove sequences with NaN w2 values and homologous sequences from the original dataset. The output is LL28k_filtered_nanW2s_noRepeats.csv, a refined dataset excluding the undesired sequences.
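Before running blast_filter.py, it can help to confirm that the BLAST+ blastp executable is reachable; a minimal check (assumes BLAST+ is installed; substitute the full path used in blast_filter.py if blastp is not on your PATH):

    import subprocess

    # Prints the installed BLAST+ version, e.g. "blastp: 2.16.0+"
    result = subprocess.run(["blastp", "-version"], capture_output=True, text=True)
    print(result.stdout or result.stderr)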
Tesei-trained_Model/filter_dataset/blast_filter.py
ADDED
@@ -0,0 +1,167 @@
from Bio.Blast.Applications import NcbiblastpCommandline
import os
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

# Load data
dataset = pd.read_csv("LL28k_w2_NC_T37_cs150_out_02.csv")
seqs = dataset['fasta']
pairs = pd.read_csv("indice_pairs_w_combos.csv")

# BLAST functions
def validate_protein_sequence(sequence):
    # Valid amino acid residues (single-letter codes)
    valid_residues = set("ACDEFGHIKLMNPQRSTVWY")
    return all(residue in valid_residues for residue in sequence)

def run_blast_with_biopython(query_seq, subject_seq):
    # Validate sequences; treat invalid sequences as non-matches
    if not validate_protein_sequence(query_seq):
        print("Invalid characters found in query sequence:", query_seq)
        return 0, 0
    if not validate_protein_sequence(subject_seq):
        print("Invalid characters found in subject sequence:", subject_seq)
        return 0, 0

    # Create temporary files for the query and subject sequences
    with open("query.fasta", "w") as query_file:
        query_file.write(">query\n" + query_seq)
    with open("subject.fasta", "w") as subject_file:
        subject_file.write(">subject\n" + subject_seq)

    # Use the full path to the blastp executable (adjust for your installation)
    blastp_cline = NcbiblastpCommandline(
        cmd=r'C:\Users\lilia\Anaconda3\envs\ml135_env_sp21\lib\site-packages\biopython-1.84.dist-info\ncbi-blast-2.16.0+-x64-win64\ncbi-blast-2.16.0+\bin\blastp.exe',
        query="query.fasta",
        subject="subject.fasta",
        outfmt=6,  # Output format 6 is a tabular format (can be changed if needed)
        evalue=10  # Set the e-value threshold higher to be more inclusive
    )

    # Execute BLAST and capture the output
    stdout, stderr = blastp_cline()

    if stderr:
        print("BLAST Errors:\n", stderr)

    # Parse the first hit of the tabular output; the columns are:
    # Query ID, Subject ID, % Identity, Alignment Length, Mismatches, Gap Openings,
    # Query Start, Query End, Subject Start, Subject End, E-value, Bit Score
    if stdout:
        print("BLAST found a match")
        columns = stdout.strip().split("\n")[0].split("\t")
        identity = float(columns[2])
        coverage = int(columns[3]) / len(query_seq) * 100
    else:
        print("No significant matches found.")
        identity, coverage = 0, 0

    # Clean up temporary files
    os.remove("query.fasta")
    os.remove("subject.fasta")

    return identity, coverage

# Boolean array: 1 if identified as a homolog pair by BLAST, 0 if not
blast_match = []
for i in range(len(pairs)):
    iden, cov = run_blast_with_biopython(seqs[pairs["index1"][i]], seqs[pairs["index2"][i]])
    if iden > 80 and cov > 80:
        blast_match.append(1)
    else:
        blast_match.append(0)
    print("\n")

pairs["blast_match"] = blast_match
pairs.to_csv("blasted_pairs.csv", index=False)

# Build a graph of homolog pairs to identify the single sequence to keep per group

# Designate x and y arrays (these are the indices of all BLAST-confirmed pairs)
x_list = list(pairs[pairs["blast_match"] == 1]["index1"])
y_list = list(pairs[pairs["blast_match"] == 1]["index2"])

# Initialize a graph with one node per pair
G = nx.Graph()
for i in range(len(x_list)):
    G.add_node(i, x=x_list[i], y=y_list[i])

# Add edges between nodes that share the same x or the same y index
for i in range(len(x_list)):
    for j in range(i + 1, len(x_list)):
        if x_list[i] == x_list[j] or y_list[i] == y_list[j]:
            G.add_edge(i, j)

# Find connected components (groups of mutually homologous sequences)
groups = list(nx.connected_components(G))

# Extract the groups as lists of (index1, index2) pairs
result = []
for group in groups:
    grouped_pairs = [(x_list[i], y_list[i]) for i in group]
    result.append(grouped_pairs)

# Get the number of unique sequences in each group
true_lengths = []
for num in range(len(result)):
    group = []
    for index1, index2 in result[num]:
        if index1 not in group:
            group.append(index1)
        if index2 not in group:
            group.append(index2)
    true_lengths.append(len(group))

print(np.mean(true_lengths))
plt.hist(true_lengths, bins=20)
plt.show()

def process_group(num):
    # Within one group, keep the longest sequence and reject the rest
    group = []
    lens = []
    for index1, index2 in result[num]:
        if index1 not in group:
            group.append(index1)
            lens.append(len(seqs[index1]))
        if index2 not in group:
            group.append(index2)
            lens.append(len(seqs[index2]))

    # Find the index of the first occurrence of the maximum length
    index_of_max = lens.index(max(lens))
    value_to_keep = group[index_of_max]  # the longest sequence survives
    reject_values = [value for i, value in enumerate(group) if i != index_of_max]
    return reject_values

all_reject_vals = []
for i in range(len(result)):
    all_reject_vals.extend(process_group(i))

# Create a column marking whether a sequence is rejected due to homology
reject_column_for_dataset = []
for i in range(len(dataset)):
    if i in all_reject_vals:
        reject_column_for_dataset.append(1)
    else:
        reject_column_for_dataset.append(0)

dataset["reject"] = reject_column_for_dataset
dataset.to_csv("LL28k_w_rejects.csv")
df_filtered = dataset.dropna(subset=['w2'])            # Drop rows where 'w2' is NaN
df_filtered = df_filtered[df_filtered['reject'] != 1]  # Drop homolog rejects
df_filtered.to_csv("LL28k_filtered_nanW2s_noRepeats.csv")
Tesei-trained_Model/filter_dataset/blasted_pairs.csv
ADDED
The diff for this file is too large to render. See raw diff.
Tesei-trained_Model/filter_dataset/full_comp_dif.py
ADDED
@@ -0,0 +1,68 @@
import pandas as pd
import numpy as np

dataset = pd.read_csv("LL28k_w2_NC_T37_cs150_out_02.csv")
seqs = dataset["fasta"]

# Constants: per-residue net charge (only the amino acid keys are used here)
amino_acid_data = {
    "A": 0, "R": 1, "N": 0, "D": -1, "C": 0,
    "E": -1, "Q": 0, "G": 0, "H": 0, "I": 0,
    "L": 0, "K": 1, "M": 0, "F": 0, "P": 0,
    "S": 0, "T": 0, "W": 0, "Y": 0, "V": 0,
}

aas = amino_acid_data.keys()

# Add one count column per amino acid
column_names = [f'{i}_count' for i in aas]
for name in column_names:
    dataset[name] = 0

def count_aa(seq, index):
    for i in aas:
        dataset.at[index, f'{i}_count'] = seq.count(i)

for i in range(len(dataset)):
    count_aa(dataset['fasta'][i], i)

dataset.to_csv("LL28k_w2_NC_T37_cs150_out_02_composition.csv", index=False)

def comp_two_seqs(i, j):
    # Sum of squared differences in per-residue amino acid frequency
    test_seq_N = len(dataset['fasta'][i])
    comp_seq_N = len(dataset['fasta'][j])

    total = 0
    for aa in aas:
        dif = (dataset[f'{aa}_count'][i] / test_seq_N - dataset[f'{aa}_count'][j] / comp_seq_N) ** 2
        total += dif
    return total

# Fill only the lower triangle (j < i); the metric is symmetric
difs = np.zeros((len(dataset), len(dataset)))
for i in range(len(seqs)):
    for j in range(0, i):
        difs[i, j] = comp_two_seqs(i, j)

np.save("full_comp_difs", difs)
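A quick check of the saved output (a sketch, not part of the repository): full_comp_difs.npy is filled only below the diagonal, which is why test_compo.py always indexes it with index1 > index2.

    import numpy as np

    difs = np.load("full_comp_difs.npy")
    # The upper triangle (including the diagonal) was never written, so it is all zeros
    assert np.allclose(np.triu(difs), 0.0)
    print(difs.shape)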
Tesei-trained_Model/filter_dataset/identify_length_pairs.py
ADDED
@@ -0,0 +1,27 @@
import pandas as pd
import numpy as np
import time

# Load data (adjust the path to your local copy of the dataset)
dataset = pd.read_csv(r"C:\Users\lilia\Documents\DU\research\Computational_IDP_Research\ML\homologs\LL28k_w2_NC_T37_cs150_out_02.csv")
seqs = dataset["fasta"]

# Compare lengths; save pairs of sequences with < 15 difference in length
len_match1 = []
len_match2 = []

start_time = time.time()
for i in range(len(seqs)):
    for j in range(i):
        if abs(len(seqs[i]) - len(seqs[j])) < 15:
            len_match1.append(i)
            len_match2.append(j)
    if i % 200 == 0:
        print(f"Elapsed time: {(time.time() - start_time):.2f} seconds, index", i)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total time: {elapsed_time:.2f} seconds")

# Save the index arrays under the names test_compo.py expects
np.save("len_match1.npy", np.asarray(len_match1))
np.save("len_match2.npy", np.asarray(len_match2))
Tesei-trained_Model/filter_dataset/indice_pairs_w_combos.csv
ADDED
The diff for this file is too large to render. See raw diff.
Tesei-trained_Model/filter_dataset/test_compo.py
ADDED
@@ -0,0 +1,30 @@
import pandas as pd
import numpy as np

# Load the composition-difference matrix and the length-matched index pairs
full_compos = np.load("full_comp_difs.npy")
len_match1 = np.load("len_match1.npy")
len_match2 = np.load("len_match2.npy")

final_pairs1 = []
final_pairs2 = []
compos = []

# Keep only length-matched pairs whose composition difference is below 0.005
for i in range(len(len_match1)):
    if full_compos[len_match1[i], len_match2[i]] < 0.005:
        final_pairs1.append(len_match1[i])
        final_pairs2.append(len_match2[i])
        compos.append(full_compos[len_match1[i], len_match2[i]])

df = pd.DataFrame({
    'index1': final_pairs1,
    'index2': final_pairs2,
    'compo': compos
})

# Written under the name blast_filter.py reads
df.to_csv('indice_pairs_w_combos.csv', index=False)

print(len(compos))