Add Tkinter-based GUI for Protein Location Prediction

- Implemented a GUI in GUI.py for loading FASTA files and running protein location prediction tools.
- Added functionality to load FASTA files and display messages for user feedback.
- Integrated prediction functions for PROST and ESM models (300m and 600m).
- Created a menu with options to load files, run predictions, and exit the application.
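- Included buttons for each prediction method and organized layout for user interaction.
- Simplified fetch_sequences_for_dataframe in src/my_utils.py to fetch sequences sequentially, removing the batch_size and max_workers parameters and the thread pool.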

Files changed (6)

Data/TaxDistributionPSORT.svg +0 -0
Data/idmapping_2025_06_24_predictions.txt +6 -6
Data/trainingData.csv +0 -0
cli.py → GUI.py +0 -0
notebooks/EDA_Psort.ipynb +2 -2
src/my_utils.py +10 -30

Data/TaxDistributionPSORT.svg ADDED Viewed

Data/idmapping_2025_06_24_predictions.txt CHANGED Viewed

@@ -1,6 +1,6 @@
-Sequence_ID,Predictions
-sp|P0A7V8|RS4_ECOLI,Cytoplasmic (0.5908), CytoplasmicMembrane (0.2121), Periplasmic (0.1080), Extracellular (0.0750), OuterMembrane (0.0140), Cellwall (0.0000)
-sp|P0A910|OMPA_ECOLI,OuterMembrane (0.9844), CytoplasmicMembrane (0.0069), Extracellular (0.0037), Cytoplasmic (0.0028), Periplasmic (0.0021), Cellwall (0.0000)
-sp|P0A6F5|CH60_ECOLI,Cytoplasmic (0.7449), CytoplasmicMembrane (0.1760), Periplasmic (0.0376), Extracellular (0.0267), OuterMembrane (0.0145), Cellwall (0.0003)
-sp|P02930|TOLC_ECOLI,OuterMembrane (0.9672), CytoplasmicMembrane (0.0185), Extracellular (0.0059), Periplasmic (0.0048), Cytoplasmic (0.0036), Cellwall (0.0000)
-tr|Q9L1T3|Q9L1T3_STRCO,CytoplasmicMembrane (0.7330), Cytoplasmic (0.0996), Periplasmic (0.0820), Extracellular (0.0585), OuterMembrane (0.0260), Cellwall (0.0009)

+Sequence_ID,Prediction 1,Prediction 2,Prediction 3,Prediction 4,Prediction 5,Prediction 6
+sp|P0A7V8|RS4_ECOLI,Cytoplasmic (0.9020),CytoplasmicMembrane (0.0480),Periplasmic (0.0247),Extracellular (0.0184),OuterMembrane (0.0061),Cellwall (0.0006)
+sp|P0A910|OMPA_ECOLI,OuterMembrane (0.9400),Extracellular (0.0251),Periplasmic (0.0177),CytoplasmicMembrane (0.0124),Cytoplasmic (0.0028),Cellwall (0.0019)
+sp|P0A6F5|CH60_ECOLI,Cytoplasmic (0.9935),CytoplasmicMembrane (0.0059),OuterMembrane (0.0004),Periplasmic (0.0003),Extracellular (0.0000),Cellwall (0.0000)
+sp|P02930|TOLC_ECOLI,OuterMembrane (0.9483),CytoplasmicMembrane (0.0166),Extracellular (0.0154),Periplasmic (0.0150),Cytoplasmic (0.0031),Cellwall (0.0016)
+tr|Q9L1T3|Q9L1T3_STRCO,CytoplasmicMembrane (0.5872),Cytoplasmic (0.1784),Extracellular (0.1599),Periplasmic (0.0445),OuterMembrane (0.0246),Cellwall (0.0053)

Data/trainingData.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

cli.py → GUI.py RENAMED Viewed

File without changes

notebooks/EDA_Psort.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:094324c0c863d51c84a37172e9979ef82ec7889799889808cf9c167e949310de
-size 21075

 version https://git-lfs.github.com/spec/v1
+oid sha256:ef532d89c742e3dad6e1aeea89215d4e1910ec825030b7c385594e59418998f1
+size 15116012

src/my_utils.py CHANGED Viewed

@@ -458,7 +458,7 @@ def fetch_refseq_sequence(refseq_id : str):
         record = SeqIO.read(handle, "fasta")
         handle.close()
         return str(record.seq)
-    except (HTTPError, ValueError) as e:
         url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
         response = requests.get(url, timeout=10)
@@ -504,17 +504,13 @@ def _fetch_sequence_for_row(idx, row):
     return idx, sequence
-def fetch_sequences_for_dataframe(df: pd.DataFrame,
-                                  batch_size: Optional[int] = None,
-                                    max_workers: int = 5) -> pd.DataFrame:
     """
     Add a 'sequence' column to the dataframe by fetching sequences from
-    SwissProt or RefSeq based on available IDs, with parallel execution and a progress bar.
     Args:
         df: Input DataFrame with ID columns.
-        batch_size: Optional size of row-chunks to process sequentially.
-        max_workers: Number of threads for parallel fetching.
     Returns:
         DataFrame with added 'sequence' column.
@@ -524,35 +520,19 @@ def fetch_sequences_for_dataframe(df: pd.DataFrame,
         result_df['sequence'] = None
     total_rows = len(result_df)
-    # Determine batch indices
-    if batch_size and batch_size > 0:
-        starts = list(range(0, total_rows, batch_size))
-    else:
-        starts = [0]
-        batch_size = total_rows
-    # Overall progress bar
-    with tqdm(total=total_rows, desc="Retrieving sequences", unit="row") as pbar:
-        for start in starts:
-            end = min(start + batch_size, total_rows)
-            sub_df = result_df.iloc[start:end]
-            futures = []
-            # Launch parallel tasks
-            with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                for idx, row in sub_df.iterrows():
-                    futures.append(executor.submit(_fetch_sequence_for_row, idx, row))
-                # Collect results
-                for future in as_completed(futures):
-                    idx, seq = future.result()
-                    result_df.at[idx, 'sequence'] = seq
-                    pbar.update(1)
     print("Sequence retrieval complete")
     success_count = result_df['sequence'].notna().sum()
     print(f"Successfully retrieved {success_count} out of {total_rows} sequences "
-          f"({round(success_count/total_rows*100, 2)}%)")
     return result_df
 def esm_embed(model: ESMC,
               seq : str,
               acc : str,

         record = SeqIO.read(handle, "fasta")
         handle.close()
         return str(record.seq)
+    except (HTTPError, ValueError):
         url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
         response = requests.get(url, timeout=10)
     return idx, sequence
+def fetch_sequences_for_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     Add a 'sequence' column to the dataframe by fetching sequences from
+    SwissProt or RefSeq based on available IDs, processing rows sequentially.
     Args:
         df: Input DataFrame with ID columns.
     Returns:
         DataFrame with added 'sequence' column.
         result_df['sequence'] = None
     total_rows = len(result_df)
+    for idx, row in tqdm(result_df.iterrows(), total=total_rows, desc="Retrieving sequences", unit="row"):
+        _, seq = _fetch_sequence_for_row(idx, row)
+        result_df.at[idx, 'sequence'] = seq
     print("Sequence retrieval complete")
     success_count = result_df['sequence'].notna().sum()
     print(f"Successfully retrieved {success_count} out of {total_rows} sequences "
+          f"({round(success_count / total_rows * 100, 2)}%)")
     return result_df
 def esm_embed(model: ESMC,
               seq : str,
               acc : str,