jpuglia commited on
Commit
4b71c5b
·
1 Parent(s): 6587358

Enhance CLI functionality: Implement file loading and prediction features with user feedback in Tkinter GUI

Browse files
Files changed (2) hide show
  1. cli.py +64 -25
  2. src/my_utils.py +117 -119
cli.py CHANGED
@@ -1,19 +1,69 @@
 
 
 
 
 
1
  import tkinter as tk
2
- from tkinter import Menu
3
  from src.my_utils import predict_with_prost
4
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def menu():
7
  """
8
- Creates and displays the main GUI menu for the Protein Tools application using Tkinter.
9
 
10
- The menu includes:
11
- - A "File" menu with options for creating a new file, opening, closing, preferences (with sub-menu for keyboard shortcuts and color themes), and exiting the application.
12
- - A "Help" menu with options for welcome and about dialogs.
13
- - Two buttons below the menu: one for loading a FASTA file (triggers `predict_with_prost`), and one for exiting the application.
14
 
15
- Returns:
16
- None
 
 
 
 
 
 
17
  """
18
  # root window
19
  root = tk.Tk()
@@ -26,19 +76,10 @@ def menu():
26
 
27
  # create the file_menu
28
  file_menu = Menu(menubar, tearoff=0)
29
- file_menu.add_command(label='New')
30
- file_menu.add_command(label='Open...')
31
- file_menu.add_command(label='Close')
32
- file_menu.add_separator()
33
-
34
- sub_menu = Menu(file_menu, tearoff=0)
35
- sub_menu.add_command(label='Keyboard Shortcuts')
36
- sub_menu.add_command(label='Color Themes')
37
-
38
- file_menu.add_cascade(label="Preferences", menu=sub_menu)
39
  file_menu.add_separator()
40
- file_menu.add_command(label='Exit', command=root.destroy)
41
- menubar.add_cascade(label="File", menu=file_menu, underline=0)
42
 
43
  # help menu
44
  help_menu = Menu(menubar, tearoff=0)
@@ -50,17 +91,15 @@ def menu():
50
  # Add Buttons Below Menu
51
  # =========================
52
 
53
- btn_prost = tk.Button(root, text="Predict with Prost", command=predict_with_prost)
54
  btn_prost.pack(pady=5)
55
 
56
- btn_ESM = tk.Button(root, text="Predict with ESMC", command=print(NotImplementedError("ESM functionality not implemented yet."))) #type: ignore
57
- btn_ESM.pack(pady=5)
58
 
59
  btn_exit = tk.Button(root, text="Exit", command=root.quit)
60
  btn_exit.pack(pady=5)
61
 
62
  root.mainloop()
63
 
64
-
65
-
66
  menu()
 
1
+ """
2
+ Protein Location Predictor CLI
3
+ This module provides a Tkinter-based GUI for loading FASTA files
4
+ and running protein location prediction tools.
5
+ """
6
  import tkinter as tk
7
+ from tkinter import Menu, filedialog, messagebox
8
  from src.my_utils import predict_with_prost
9
 
10
 
11
+ FASTA_FILE_PATH = None # Global or instance variable
12
+
13
def load_fasta_file():
    """
    Open a file dialog for the user to select a FASTA file and store the
    selected path in the module-level ``FASTA_FILE_PATH`` variable.

    If a file is selected, an information message with the file path is shown
    and the global path is updated. If the dialog is cancelled, a warning is
    shown and any previously loaded path is KEPT — the original implementation
    assigned the dialog's empty-string result directly to the global, so
    cancelling silently "unloaded" a file that was already selected.

    Uses:
        - filedialog.askopenfilename for file selection.
        - messagebox.showinfo and messagebox.showwarning for user feedback.

    Global Variables:
        FASTA_FILE_PATH (str | None): Path to the selected FASTA file.
    """
    global FASTA_FILE_PATH  # pylint: disable=global-statement
    selected = filedialog.askopenfilename(
        filetypes=[("FASTA files", "*.fasta *.fa")],
        title="Select a FASTA file"
    )
    if selected:
        # Only overwrite the stored path on a successful selection so that
        # cancelling the dialog does not discard a previously loaded file.
        FASTA_FILE_PATH = selected
        messagebox.showinfo("File Loaded", f"Loaded file:\n{FASTA_FILE_PATH}")
    else:
        messagebox.showwarning("No file", "No file was selected.")
36
+
37
def run_prediction():
    """
    Kick off protein location prediction for the currently loaded FASTA file.

    Pops an error dialog and does nothing when no FASTA file has been loaded
    yet (``FASTA_FILE_PATH`` is falsy); otherwise delegates to the
    PROST-based predictor with the stored path.
    """
    if FASTA_FILE_PATH:
        predict_with_prost(FASTA_FILE_PATH)
    else:
        messagebox.showerror("Error", "Please load a FASTA file first.")
51
+
52
  def menu():
53
  """
54
+ Displays the main GUI menu for the Protein Tools application.
55
 
56
+ This function creates a Tkinter window with a menu bar containing 'File' and 'Help' menus,
57
+ and buttons for running protein prediction tools and exiting the application.
 
 
58
 
59
+ Menus:
60
+ - File: Options to load a FASTA file or close the application.
61
+ - Help: Options for welcome information and about dialog.
62
+
63
+ Buttons:
64
+ - Predict with Prost: Runs the Prost prediction tool.
65
+ - Predict with ESM C: Placeholder for ESM prediction functionality (not yet implemented).
66
+ - Exit: Closes the application.
67
  """
68
  # root window
69
  root = tk.Tk()
 
76
 
77
  # create the file_menu
78
  file_menu = Menu(menubar, tearoff=0)
79
+ file_menu.add_command(label='Load FASTA', command=load_fasta_file)
80
+ file_menu.add_command(label='Close', command=root.quit)
 
 
 
 
 
 
 
 
81
  file_menu.add_separator()
82
+ menubar.add_cascade(label="File", menu=file_menu, underline=0)
 
83
 
84
  # help menu
85
  help_menu = Menu(menubar, tearoff=0)
 
91
  # Add Buttons Below Menu
92
  # =========================
93
 
94
+ btn_prost = tk.Button(root, text="Predict with Prost", command=run_prediction)
95
  btn_prost.pack(pady=5)
96
 
97
+ btn_esm = tk.Button(root, text="Predict with ESM C") #type: ignore
98
+ btn_esm.pack(pady=5)
99
 
100
  btn_exit = tk.Button(root, text="Exit", command=root.quit)
101
  btn_exit.pack(pady=5)
102
 
103
  root.mainloop()
104
 
 
 
105
  menu()
src/my_utils.py CHANGED
@@ -7,7 +7,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
7
  from urllib.error import HTTPError
8
  from typing import Literal, Optional
9
  import tkinter as tk
10
- from tkinter import filedialog
11
 
12
 
13
  import pandas as pd
@@ -39,6 +39,7 @@ import plotly.express as px
39
  from esm.models.esmc import ESMC
40
  from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
41
  from transformers import T5Tokenizer, T5EncoderModel, PreTrainedModel
 
42
 
43
  from joblib import load
44
 
@@ -503,10 +504,9 @@ def _fetch_sequence_for_row(idx, row):
503
 
504
  return idx, sequence
505
 
506
-
507
-
508
-
509
- def fetch_sequences_for_dataframe(df: pd.DataFrame, batch_size: Optional[int] = None, max_workers: int = 5) -> pd.DataFrame:
510
  """
511
  Add a 'sequence' column to the dataframe by fetching sequences from
512
  SwissProt or RefSeq based on available IDs, with parallel execution and a progress bar.
@@ -553,69 +553,74 @@ def fetch_sequences_for_dataframe(df: pd.DataFrame, batch_size: Optional[int] =
553
  f"({round(success_count/total_rows*100, 2)}%)")
554
  return result_df
555
 
556
- def esm_embed_sequence(model : Literal["esmc_300m", "esmc_600m"], sequence : str, device : str) -> LogitsOutput:
 
 
 
 
 
 
557
 
558
  """
559
- Embed a protein sequence using the specified ESM model.
560
  Args:
561
- model: Name of the ESM model to use.
562
- sequence: Protein sequence to embed.
 
 
563
  Returns:
564
- LogitsOutput: Contains the embeddings and logits for the sequence.
 
 
 
 
565
  """
566
 
567
- client = ESMC.from_pretrained(model).to(device)
568
-
569
- protein = ESMProtein(sequence=sequence)
570
- protein_tensor = client.encode(protein)
571
 
572
  if isinstance(protein_tensor, ESMProteinError):
573
-
574
  raise protein_tensor
 
 
575
 
576
- output = client.logits(protein_tensor, LogitsConfig(sequence=True, return_embeddings=True))
577
-
578
- return output
579
-
580
- def esm_save_emb(model: Literal["esmc_300m", "esmc_600m"],
581
- seq_list: list[str],
582
- id_list: list[str],
583
- path: str,
584
- device : Literal['cuda', 'cpu'] = 'cuda') -> None:
585
-
586
- """
587
- Save embeddings to disk.
588
-
589
- Args:
590
- model: ESM model name. Options are "esmc_300m" or "esmc_600m".
591
- seq_list: List of protein sequences.
592
- id_list: List of identifiers corresponding to the sequences.
593
- path: Directory to save the embeddings.
594
- """
595
-
596
- assert len(seq_list) == len(id_list), "Sequence and ID lists must be the same length."
597
- os.makedirs(path, exist_ok=True)
598
 
599
- for i, (seq, acc) in enumerate(
600
- tqdm(zip(seq_list, id_list),
601
- total=len(seq_list), desc="Saving embeddings")):
602
- try:
603
- output: LogitsOutput = esm_embed_sequence(model=model, sequence=seq, device = device)
604
- emb_array = output.embeddings.cpu().numpy()
 
 
 
 
 
 
 
 
605
 
606
- if len(emb_array.shape) == 3:
607
- emb_array = emb_array.squeeze(axis=0).mean(axis=0)
608
- elif len(emb_array.shape) == 2:
609
- emb_array = emb_array.mean(axis=0)
610
 
611
- np.save(os.path.join(path, f"{acc}.npy"), emb_array)
612
 
613
- except ESMProteinError as e:
614
- print(f"Error processing {acc}: {e}")
615
 
616
- if i % 100 == 0:
617
- gc.collect()
618
- torch.cuda.empty_cache()
619
 
620
  def prost_embed_sequence(seq : str,
621
  acc : str,
@@ -732,103 +737,96 @@ def save_predictions_to_txt(predictions_dict: dict[str, tuple[list[str], list[fl
732
 
733
  f.write(f"{seq_id},{pred_line}\n")
734
 
735
- def predict_with_prost():
736
  """
737
- Function to select a directory containing FASTA files and embed sequences using ProstT5.
738
  """
739
- root = tk.Tk()
740
- root.withdraw()
741
-
742
- fasta_path : str = filedialog.askopenfilename(
743
- title="Select a FASTA file",
744
- filetypes=[("FASTA files", "*.fasta *.fa")],
745
- initialdir="."
746
- )
747
-
748
- if not fasta_path:
749
- print("No file selected.")
750
  return
751
 
752
- # Select output directory for results
753
- output_dir: str = filedialog.askdirectory(
754
- title="Select output directory for results",
755
- initialdir="."
756
- )
757
 
 
758
  if not output_dir:
759
- print("No output directory selected.")
760
  return
761
 
762
  result = fasta_to_seq(fasta_path)
763
-
764
  if result is None:
765
- print("No sequences found in the FASTA file.")
766
- return {}
767
- else:
768
- sequences, ids = result
769
- print(f"Sequences loaded from {fasta_path}: {len(sequences)} sequences found.")
770
- print("Embedding sequences using ProstT5...")
771
 
772
- tokenizer : T5Tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False)
773
- model : PreTrainedModel = T5EncoderModel.from_pretrained("Rostlab/ProstT5")
 
 
 
 
 
 
 
 
774
 
775
- embeddings : dict[str, np.ndarray] = {}
776
-
777
- for seq, acc in tqdm(zip(sequences, ids), total=len(sequences), desc="Embedding sequences"):
778
- emb = prost_embed_sequence(seq, acc, tokenizer, model)
779
- if emb is not None:
780
- embeddings[acc] = emb
781
- else:
782
- print(f"Failed to embed sequence {acc}. Skipping.")
783
-
784
- print(f"Embedded {len(embeddings)} sequences successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
785
 
786
- print("Loading pre-trained SVM model for prediction...")
787
  try:
788
- predictor = load('/home/juan/ProteinLocationPredictor/Models/rfProst.joblib')
789
  except FileNotFoundError:
790
- print("Error: Could not find the model file '../ProteinLocationPredictor/Models/svmProst.joblib'")
791
- print("Please check the path to your trained model.")
792
  return
793
 
794
  sequence_ids = list(embeddings.keys())
795
- X = np.array(list(embeddings.values())) #type: ignore
 
796
  print("Making predictions...")
797
  y_pred_proba = predictor.predict_proba(X)
798
-
799
- # Get class names (you may need to adjust this based on your model)
800
  if hasattr(predictor, 'classes_'):
801
- class_names = predictor.classes_.tolist()
802
  else:
803
- # If class names are not available, use generic names
804
- n_classes = y_pred_proba.shape[1]
805
- class_names = [f"Class_{i}" for i in range(n_classes)]
806
-
807
- # Convert class names to strings if they aren't already
808
- class_names = [str(cls) for cls in class_names]
809
-
810
- # Create predictions dictionary
811
  predictions_dict = {}
812
  for i, seq_id in enumerate(sequence_ids):
813
- probabilities = y_pred_proba[i].tolist()
814
- class_prob_pairs = sorted(zip(class_names, probabilities), key=lambda x: x[1], reverse=True)
815
  sorted_classes, sorted_probs = zip(*class_prob_pairs)
816
  predictions_dict[seq_id] = (list(sorted_classes), list(sorted_probs))
817
-
818
- # Generate output filename
819
  input_filename = os.path.splitext(os.path.basename(fasta_path))[0]
820
  output_file = os.path.join(output_dir, f"{input_filename}_predictions.txt")
821
-
822
- # Save predictions to file
823
  print(f"Saving predictions to {output_file}...")
824
  save_predictions_to_txt(predictions_dict, output_file)
825
-
826
- print(f"Predictions saved successfully!")
827
  print(f"Total sequences processed: {len(embeddings)}")
828
- print(f"Output file: {output_file}")
829
-
830
- # Print a few sample predictions
831
  print("\nSample predictions:")
832
  for i, (seq_id, (classes, probs)) in enumerate(list(predictions_dict.items())[:3]):
833
  pred_str = ", ".join([f"{cls} ({prob:.4f})" for cls, prob in zip(classes, probs)])
834
- print(f"{seq_id}: {pred_str}")
 
 
7
  from urllib.error import HTTPError
8
  from typing import Literal, Optional
9
  import tkinter as tk
10
+ from tkinter import filedialog, messagebox, ttk
11
 
12
 
13
  import pandas as pd
 
39
  from esm.models.esmc import ESMC
40
  from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
41
  from transformers import T5Tokenizer, T5EncoderModel, PreTrainedModel
42
+ from esm.sdk.forge import ESM3ForgeInferenceClient
43
 
44
  from joblib import load
45
 
 
504
 
505
  return idx, sequence
506
 
507
+ def fetch_sequences_for_dataframe(df: pd.DataFrame,
508
+ batch_size: Optional[int] = None,
509
+ max_workers: int = 5) -> pd.DataFrame:
 
510
  """
511
  Add a 'sequence' column to the dataframe by fetching sequences from
512
  SwissProt or RefSeq based on available IDs, with parallel execution and a progress bar.
 
553
  f"({round(success_count/total_rows*100, 2)}%)")
554
  return result_df
555
 
556
def esm_embed(model: ESMC,
              seq: str,
              acc: str,
              device: torch.device = torch.device(
                  'cuda' if torch.cuda.is_available()
                  else 'cpu'
              )) -> Optional[np.ndarray]:
    """
    Generate a fixed-size embedding for a protein sequence using an ESM model.

    Args:
        model (ESMC): The ESM model used for encoding and generating embeddings.
        seq (str): The amino acid sequence of the protein.
        acc (str): The accession identifier for the protein (used for error reporting).
        device (torch.device, optional): Device to run the computation on.
            NOTE: the default is evaluated once at import time (CUDA if
            available at import, else CPU).

    Returns:
        Optional[np.ndarray]: Mean-pooled embedding vector for the sequence,
        or None if embedding could not be generated.

    Raises:
        ESMProteinError: If encoding the protein fails.

    Side Effects:
        Displays an error dialog via `messagebox.showerror` on failure.
    """
    protein: ESMProtein = ESMProtein(sequence=seq)

    # Encode first and check for an error BEFORE moving to the device: the
    # original code did `model.encode(protein).to(device)`, which would raise
    # an AttributeError on an ESMProteinError result and never reach the
    # intended error handling below.
    protein_tensor = model.encode(protein)
    if isinstance(protein_tensor, ESMProteinError):
        messagebox.showerror("Error", f"Error processing {acc}: {protein_tensor}")
        raise protein_tensor
    protein_tensor = protein_tensor.to(device)

    try:
        output: LogitsOutput = model.logits(protein_tensor,
                                            LogitsConfig(sequence=True,
                                                         return_embeddings=True))

        if output is not None and output.embeddings is not None:
            arr_output: np.ndarray = output.embeddings.cpu().numpy()

            # Collapse to one vector per protein: squeeze a leading batch
            # axis of size 1 if present, then mean-pool over residues.
            if len(arr_output.shape) == 3:
                arr_output = arr_output.squeeze(axis=0).mean(axis=0)
            elif len(arr_output.shape) == 2:
                arr_output = arr_output.mean(axis=0)

            return arr_output
        # No embeddings returned — treat as a soft failure.
        return None
    except ESMProteinError as e:
        messagebox.showerror("Error", f"Error processing {acc}: {e}")
        return None
 
 
 
 
 
605
 
606
def predict_with_esm(fasta_path : str,
                     model : Literal['esmc_600m', 'esmc_300m'],
                     device : torch.device = torch.device('cuda' if torch.cuda.is_available()
                                                          else 'cpu'),
                     ) -> None:
    """
    Validate a FASTA file and load its sequences for ESM-based prediction.

    NOTE(review): this is currently a stub — the sequences are unpacked into
    ``seq``/``ids`` but nothing further happens; presumably `esm_embed` and a
    classifier are meant to be wired in here. Confirm before relying on it.

    Args:
        fasta_path: Path to the input FASTA file; an error dialog is shown
            when it is None or does not exist.
        model: ESM C checkpoint name ('esmc_600m' or 'esmc_300m').
            Currently unused by the visible body.
        device: Device for computation. The default is evaluated once at
            import time (CUDA if available at import, else CPU).

    Returns:
        None. Errors are reported via `messagebox.showerror`.
    """
    if fasta_path is None or not os.path.exists(fasta_path):
        messagebox.showerror("Error", "Invalid FASTA file path.")
        return

    result = fasta_to_seq(fasta_path)
    if result is None:
        messagebox.showerror("Error", "No sequences found in FASTA file.")
        return
    # Parsed sequences and their identifiers; not yet consumed (see NOTE above).
    seq, ids = result
 
 
 
 
 
621
 
 
622
 
 
 
623
 
 
 
 
624
 
625
  def prost_embed_sequence(seq : str,
626
  acc : str,
 
737
 
738
  f.write(f"{seq_id},{pred_line}\n")
739
 
740
def predict_with_prost(fasta_path: str):
    """
    Embed sequences from a provided FASTA file using ProstT5 and predict
    subcellular locations with a pre-trained random-forest model.

    Workflow:
        1. Validate the FASTA path.
        2. Ask the user for an output directory via a Tk file dialog.
        3. Embed each sequence with ProstT5, driving a Tk progress bar.
        4. Load ``Models/rfProst.joblib`` (resolved relative to the project
           root, i.e. the parent of this module's directory) and run
           ``predict_proba`` on the stacked embeddings.
        5. Write per-sequence class probabilities, sorted most-likely-first,
           to ``<input>_predictions.txt`` in the chosen directory.

    Args:
        fasta_path: Path to the FASTA file to process.

    Returns:
        None. Results are written to disk; progress and errors are reported
        via the GUI and stdout.
    """
    if not fasta_path or not os.path.exists(fasta_path):
        print("Invalid FASTA file path.")
        return

    # Ask user for output directory
    root = tk.Tk()
    root.withdraw() # Hide root window
    # NOTE(review): `root` is never destroyed — repeated calls leak hidden Tk
    # interpreters; consider root.destroy() once the dialogs are done.

    output_dir = filedialog.askdirectory(title="Select output directory")
    if not output_dir:
        return

    result = fasta_to_seq(fasta_path)
    if result is None:
        messagebox.showerror("Error", "No sequences found in FASTA file.")
        return

    sequences, ids = result
    total = len(sequences)

    # Create progress bar window
    progress_win = tk.Toplevel(root)
    progress_win.title("Embedding Progress")
    progress_label = tk.Label(progress_win, text="Embedding sequences...")
    progress_label.pack(padx=10, pady=5)
    progress = ttk.Progressbar(progress_win, length=300, mode='determinate', maximum=total)
    progress.pack(padx=10, pady=10)

    # Load model/tokenizer once
    tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False, legacy=True)
    model = T5EncoderModel.from_pretrained("Rostlab/ProstT5")

    embeddings = {}

    # Sequences that fail to embed (prost_embed_sequence returns None) are
    # silently skipped; only successful embeddings are kept.
    for i, (seq, acc) in enumerate(zip(sequences, ids)):
        emb = prost_embed_sequence(seq, acc, tokenizer, model)
        if emb is not None:
            embeddings[acc] = emb

        # Update progress
        progress['value'] = i + 1
        progress_win.update_idletasks() # Keeps the window responsive

    progress_label.config(text="Embedding complete!")
    tk.Button(progress_win, text="Close", command=progress_win.destroy).pack(pady=5)

    # Load model
    messagebox.showinfo("Info", "Loading random forest model for predictions...")
    project_root: str = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    model_path = os.path.join(project_root, 'Models', 'rfProst.joblib')

    try:
        predictor = load(model_path)
    except FileNotFoundError:
        print(f"Error: Could not find the model file '{model_path}'")
        return

    sequence_ids = list(embeddings.keys())
    X = np.array(list(embeddings.values())) # type: ignore
    # NOTE(review): if every embedding failed, X is empty and predict_proba
    # below will raise — an explicit guard may be warranted.

    print("Making predictions...")
    y_pred_proba = predictor.predict_proba(X)

    # Get class names
    if hasattr(predictor, 'classes_'):
        class_names = [str(cls) for cls in predictor.classes_]
    else:
        class_names = [f"Class_{i}" for i in range(y_pred_proba.shape[1])]

    # Pair each class with its probability and sort descending so the most
    # likely location comes first in the output.
    predictions_dict = {}
    for i, seq_id in enumerate(sequence_ids):
        class_prob_pairs = sorted(zip(class_names, y_pred_proba[i]), key=lambda x: x[1], reverse=True)
        sorted_classes, sorted_probs = zip(*class_prob_pairs)
        predictions_dict[seq_id] = (list(sorted_classes), list(sorted_probs))

    # Save results
    input_filename = os.path.splitext(os.path.basename(fasta_path))[0]
    output_file = os.path.join(output_dir, f"{input_filename}_predictions.txt")

    print(f"Saving predictions to {output_file}...")
    save_predictions_to_txt(predictions_dict, output_file)
    print("Predictions saved successfully!")
    print(f"Total sequences processed: {len(embeddings)}")

    # Print up to three example predictions as a quick sanity check.
    print("\nSample predictions:")
    for i, (seq_id, (classes, probs)) in enumerate(list(predictions_dict.items())[:3]):
        pred_str = ", ".join([f"{cls} ({prob:.4f})" for cls, prob in zip(classes, probs)])
        print(f"{seq_id}: {pred_str}")