jpuglia committed on
Commit
3be0ce1
·
1 Parent(s): 54135dd

Add Tkinter-based GUI for Protein Location Prediction

Browse files

- Implemented a GUI in GUI.py for loading FASTA files and running protein location prediction tools.
- Added functionality to load FASTA files and display messages for user feedback.
- Integrated prediction functions for PROST and ESM models (300m and 600m).
- Created a menu with options to load files, run predictions, and exit the application.
- Included buttons for each prediction method and organized layout for user interaction.

Data/TaxDistributionPSORT.svg ADDED
Data/idmapping_2025_06_24_predictions.txt CHANGED
@@ -1,6 +1,6 @@
1
- Sequence_ID,Predictions
2
- sp|P0A7V8|RS4_ECOLI,Cytoplasmic (0.5908), CytoplasmicMembrane (0.2121), Periplasmic (0.1080), Extracellular (0.0750), OuterMembrane (0.0140), Cellwall (0.0000)
3
- sp|P0A910|OMPA_ECOLI,OuterMembrane (0.9844), CytoplasmicMembrane (0.0069), Extracellular (0.0037), Cytoplasmic (0.0028), Periplasmic (0.0021), Cellwall (0.0000)
4
- sp|P0A6F5|CH60_ECOLI,Cytoplasmic (0.7449), CytoplasmicMembrane (0.1760), Periplasmic (0.0376), Extracellular (0.0267), OuterMembrane (0.0145), Cellwall (0.0003)
5
- sp|P02930|TOLC_ECOLI,OuterMembrane (0.9672), CytoplasmicMembrane (0.0185), Extracellular (0.0059), Periplasmic (0.0048), Cytoplasmic (0.0036), Cellwall (0.0000)
6
- tr|Q9L1T3|Q9L1T3_STRCO,CytoplasmicMembrane (0.7330), Cytoplasmic (0.0996), Periplasmic (0.0820), Extracellular (0.0585), OuterMembrane (0.0260), Cellwall (0.0009)
 
1
+ Sequence_ID,Prediction 1,Prediction 2,Prediction 3,Prediction 4,Prediction 5,Prediction 6
2
+ sp|P0A7V8|RS4_ECOLI,Cytoplasmic (0.9020),CytoplasmicMembrane (0.0480),Periplasmic (0.0247),Extracellular (0.0184),OuterMembrane (0.0061),Cellwall (0.0006)
3
+ sp|P0A910|OMPA_ECOLI,OuterMembrane (0.9400),Extracellular (0.0251),Periplasmic (0.0177),CytoplasmicMembrane (0.0124),Cytoplasmic (0.0028),Cellwall (0.0019)
4
+ sp|P0A6F5|CH60_ECOLI,Cytoplasmic (0.9935),CytoplasmicMembrane (0.0059),OuterMembrane (0.0004),Periplasmic (0.0003),Extracellular (0.0000),Cellwall (0.0000)
5
+ sp|P02930|TOLC_ECOLI,OuterMembrane (0.9483),CytoplasmicMembrane (0.0166),Extracellular (0.0154),Periplasmic (0.0150),Cytoplasmic (0.0031),Cellwall (0.0016)
6
+ tr|Q9L1T3|Q9L1T3_STRCO,CytoplasmicMembrane (0.5872),Cytoplasmic (0.1784),Extracellular (0.1599),Periplasmic (0.0445),OuterMembrane (0.0246),Cellwall (0.0053)
Data/trainingData.csv CHANGED
The diff for this file is too large to render. See raw diff
 
cli.py → GUI.py RENAMED
File without changes
notebooks/EDA_Psort.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:094324c0c863d51c84a37172e9979ef82ec7889799889808cf9c167e949310de
3
- size 21075
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef532d89c742e3dad6e1aeea89215d4e1910ec825030b7c385594e59418998f1
3
+ size 15116012
src/my_utils.py CHANGED
@@ -458,7 +458,7 @@ def fetch_refseq_sequence(refseq_id : str):
458
  record = SeqIO.read(handle, "fasta")
459
  handle.close()
460
  return str(record.seq)
461
- except (HTTPError, ValueError) as e:
462
 
463
  url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
464
  response = requests.get(url, timeout=10)
@@ -504,17 +504,13 @@ def _fetch_sequence_for_row(idx, row):
504
 
505
  return idx, sequence
506
 
507
- def fetch_sequences_for_dataframe(df: pd.DataFrame,
508
- batch_size: Optional[int] = None,
509
- max_workers: int = 5) -> pd.DataFrame:
510
  """
511
  Add a 'sequence' column to the dataframe by fetching sequences from
512
- SwissProt or RefSeq based on available IDs, with parallel execution and a progress bar.
513
 
514
  Args:
515
  df: Input DataFrame with ID columns.
516
- batch_size: Optional size of row-chunks to process sequentially.
517
- max_workers: Number of threads for parallel fetching.
518
 
519
  Returns:
520
  DataFrame with added 'sequence' column.
@@ -524,35 +520,19 @@ def fetch_sequences_for_dataframe(df: pd.DataFrame,
524
  result_df['sequence'] = None
525
 
526
  total_rows = len(result_df)
527
- # Determine batch indices
528
- if batch_size and batch_size > 0:
529
- starts = list(range(0, total_rows, batch_size))
530
- else:
531
- starts = [0]
532
- batch_size = total_rows
533
-
534
- # Overall progress bar
535
- with tqdm(total=total_rows, desc="Retrieving sequences", unit="row") as pbar:
536
- for start in starts:
537
- end = min(start + batch_size, total_rows)
538
- sub_df = result_df.iloc[start:end]
539
- futures = []
540
- # Launch parallel tasks
541
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
542
- for idx, row in sub_df.iterrows():
543
- futures.append(executor.submit(_fetch_sequence_for_row, idx, row))
544
- # Collect results
545
- for future in as_completed(futures):
546
- idx, seq = future.result()
547
- result_df.at[idx, 'sequence'] = seq
548
- pbar.update(1)
549
 
550
  print("Sequence retrieval complete")
551
  success_count = result_df['sequence'].notna().sum()
552
  print(f"Successfully retrieved {success_count} out of {total_rows} sequences "
553
- f"({round(success_count/total_rows*100, 2)}%)")
554
  return result_df
555
 
 
 
556
  def esm_embed(model: ESMC,
557
  seq : str,
558
  acc : str,
 
458
  record = SeqIO.read(handle, "fasta")
459
  handle.close()
460
  return str(record.seq)
461
+ except (HTTPError, ValueError):
462
 
463
  url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
464
  response = requests.get(url, timeout=10)
 
504
 
505
  return idx, sequence
506
 
507
+ def fetch_sequences_for_dataframe(df: pd.DataFrame) -> pd.DataFrame:
 
 
508
  """
509
  Add a 'sequence' column to the dataframe by fetching sequences from
510
+ SwissProt or RefSeq based on available IDs, processing rows sequentially.
511
 
512
  Args:
513
  df: Input DataFrame with ID columns.
 
 
514
 
515
  Returns:
516
  DataFrame with added 'sequence' column.
 
520
  result_df['sequence'] = None
521
 
522
  total_rows = len(result_df)
523
+
524
+ for idx, row in tqdm(result_df.iterrows(), total=total_rows, desc="Retrieving sequences", unit="row"):
525
+ _, seq = _fetch_sequence_for_row(idx, row)
526
+ result_df.at[idx, 'sequence'] = seq
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
 
528
  print("Sequence retrieval complete")
529
  success_count = result_df['sequence'].notna().sum()
530
  print(f"Successfully retrieved {success_count} out of {total_rows} sequences "
531
+ f"({round(success_count / total_rows * 100, 2)}%)")
532
  return result_df
533
 
534
+
535
+
536
  def esm_embed(model: ESMC,
537
  seq : str,
538
  acc : str,