Add Tkinter-based GUI for Protein Location Prediction
Browse files
- Implemented a GUI in GUI.py for loading FASTA files and running protein location prediction tools.
- Added functionality to load FASTA files and display messages for user feedback.
- Integrated prediction functions for PROST and ESM models (300m and 600m).
- Created a menu with options to load files, run predictions, and exit the application.
- Included buttons for each prediction method and organized layout for user interaction.
- Data/TaxDistributionPSORT.svg +0 -0
- Data/idmapping_2025_06_24_predictions.txt +6 -6
- Data/trainingData.csv +0 -0
- cli.py → GUI.py +0 -0
- notebooks/EDA_Psort.ipynb +2 -2
- src/my_utils.py +10 -30
Data/TaxDistributionPSORT.svg
ADDED
|
|
Data/idmapping_2025_06_24_predictions.txt
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
Sequence_ID,
|
| 2 |
-
sp|P0A7V8|RS4_ECOLI,Cytoplasmic (0.
|
| 3 |
-
sp|P0A910|OMPA_ECOLI,OuterMembrane (0.
|
| 4 |
-
sp|P0A6F5|CH60_ECOLI,Cytoplasmic (0.
|
| 5 |
-
sp|P02930|TOLC_ECOLI,OuterMembrane (0.
|
| 6 |
-
tr|Q9L1T3|Q9L1T3_STRCO,CytoplasmicMembrane (0.
|
|
|
|
| 1 |
+
Sequence_ID,Prediction 1,Prediction 2,Prediction 3,Prediction 4,Prediction 5,Prediction 6
|
| 2 |
+
sp|P0A7V8|RS4_ECOLI,Cytoplasmic (0.9020),CytoplasmicMembrane (0.0480),Periplasmic (0.0247),Extracellular (0.0184),OuterMembrane (0.0061),Cellwall (0.0006)
|
| 3 |
+
sp|P0A910|OMPA_ECOLI,OuterMembrane (0.9400),Extracellular (0.0251),Periplasmic (0.0177),CytoplasmicMembrane (0.0124),Cytoplasmic (0.0028),Cellwall (0.0019)
|
| 4 |
+
sp|P0A6F5|CH60_ECOLI,Cytoplasmic (0.9935),CytoplasmicMembrane (0.0059),OuterMembrane (0.0004),Periplasmic (0.0003),Extracellular (0.0000),Cellwall (0.0000)
|
| 5 |
+
sp|P02930|TOLC_ECOLI,OuterMembrane (0.9483),CytoplasmicMembrane (0.0166),Extracellular (0.0154),Periplasmic (0.0150),Cytoplasmic (0.0031),Cellwall (0.0016)
|
| 6 |
+
tr|Q9L1T3|Q9L1T3_STRCO,CytoplasmicMembrane (0.5872),Cytoplasmic (0.1784),Extracellular (0.1599),Periplasmic (0.0445),OuterMembrane (0.0246),Cellwall (0.0053)
|
Data/trainingData.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
cli.py → GUI.py
RENAMED
|
File without changes
|
notebooks/EDA_Psort.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ef532d89c742e3dad6e1aeea89215d4e1910ec825030b7c385594e59418998f1
|
| 3 |
+
size 15116012
|
src/my_utils.py
CHANGED
|
@@ -458,7 +458,7 @@ def fetch_refseq_sequence(refseq_id : str):
|
|
| 458 |
record = SeqIO.read(handle, "fasta")
|
| 459 |
handle.close()
|
| 460 |
return str(record.seq)
|
| 461 |
-
except (HTTPError, ValueError)
|
| 462 |
|
| 463 |
url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
|
| 464 |
response = requests.get(url, timeout=10)
|
|
@@ -504,17 +504,13 @@ def _fetch_sequence_for_row(idx, row):
|
|
| 504 |
|
| 505 |
return idx, sequence
|
| 506 |
|
| 507 |
-
def fetch_sequences_for_dataframe(df: pd.DataFrame
|
| 508 |
-
batch_size: Optional[int] = None,
|
| 509 |
-
max_workers: int = 5) -> pd.DataFrame:
|
| 510 |
"""
|
| 511 |
Add a 'sequence' column to the dataframe by fetching sequences from
|
| 512 |
-
SwissProt or RefSeq based on available IDs,
|
| 513 |
|
| 514 |
Args:
|
| 515 |
df: Input DataFrame with ID columns.
|
| 516 |
-
batch_size: Optional size of row-chunks to process sequentially.
|
| 517 |
-
max_workers: Number of threads for parallel fetching.
|
| 518 |
|
| 519 |
Returns:
|
| 520 |
DataFrame with added 'sequence' column.
|
|
@@ -524,35 +520,19 @@ def fetch_sequences_for_dataframe(df: pd.DataFrame,
|
|
| 524 |
result_df['sequence'] = None
|
| 525 |
|
| 526 |
total_rows = len(result_df)
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
starts = [0]
|
| 532 |
-
batch_size = total_rows
|
| 533 |
-
|
| 534 |
-
# Overall progress bar
|
| 535 |
-
with tqdm(total=total_rows, desc="Retrieving sequences", unit="row") as pbar:
|
| 536 |
-
for start in starts:
|
| 537 |
-
end = min(start + batch_size, total_rows)
|
| 538 |
-
sub_df = result_df.iloc[start:end]
|
| 539 |
-
futures = []
|
| 540 |
-
# Launch parallel tasks
|
| 541 |
-
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 542 |
-
for idx, row in sub_df.iterrows():
|
| 543 |
-
futures.append(executor.submit(_fetch_sequence_for_row, idx, row))
|
| 544 |
-
# Collect results
|
| 545 |
-
for future in as_completed(futures):
|
| 546 |
-
idx, seq = future.result()
|
| 547 |
-
result_df.at[idx, 'sequence'] = seq
|
| 548 |
-
pbar.update(1)
|
| 549 |
|
| 550 |
print("Sequence retrieval complete")
|
| 551 |
success_count = result_df['sequence'].notna().sum()
|
| 552 |
print(f"Successfully retrieved {success_count} out of {total_rows} sequences "
|
| 553 |
-
f"({round(success_count/total_rows*100, 2)}%)")
|
| 554 |
return result_df
|
| 555 |
|
|
|
|
|
|
|
| 556 |
def esm_embed(model: ESMC,
|
| 557 |
seq : str,
|
| 558 |
acc : str,
|
|
|
|
| 458 |
record = SeqIO.read(handle, "fasta")
|
| 459 |
handle.close()
|
| 460 |
return str(record.seq)
|
| 461 |
+
except (HTTPError, ValueError):
|
| 462 |
|
| 463 |
url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
|
| 464 |
response = requests.get(url, timeout=10)
|
|
|
|
| 504 |
|
| 505 |
return idx, sequence
|
| 506 |
|
| 507 |
+
def fetch_sequences_for_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
|
|
|
|
| 508 |
"""
|
| 509 |
Add a 'sequence' column to the dataframe by fetching sequences from
|
| 510 |
+
SwissProt or RefSeq based on available IDs, processing rows sequentially.
|
| 511 |
|
| 512 |
Args:
|
| 513 |
df: Input DataFrame with ID columns.
|
|
|
|
|
|
|
| 514 |
|
| 515 |
Returns:
|
| 516 |
DataFrame with added 'sequence' column.
|
|
|
|
| 520 |
result_df['sequence'] = None
|
| 521 |
|
| 522 |
total_rows = len(result_df)
|
| 523 |
+
|
| 524 |
+
for idx, row in tqdm(result_df.iterrows(), total=total_rows, desc="Retrieving sequences", unit="row"):
|
| 525 |
+
_, seq = _fetch_sequence_for_row(idx, row)
|
| 526 |
+
result_df.at[idx, 'sequence'] = seq
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
|
| 528 |
print("Sequence retrieval complete")
|
| 529 |
success_count = result_df['sequence'].notna().sum()
|
| 530 |
print(f"Successfully retrieved {success_count} out of {total_rows} sequences "
|
| 531 |
+
f"({round(success_count / total_rows * 100, 2)}%)")
|
| 532 |
return result_df
|
| 533 |
|
| 534 |
+
|
| 535 |
+
|
| 536 |
def esm_embed(model: ESMC,
|
| 537 |
seq : str,
|
| 538 |
acc : str,
|