Refactor my_utils.py: Simplify type hints, enhance evaluate and training functions, and improve error handling in sequence fetching

Browse files

Files changed (2) hide show

notebooks/hyperparamsRF.ipynb +2 -2
src/my_utils.py +91 -49

notebooks/hyperparamsRF.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ed5fec8d6f5354ecaef873661bc650c07f91e4e425b3d20c5f221ab8d1d21b11
-size 707241

 version https://git-lfs.github.com/spec/v1
+oid sha256:be08020829c6e68c1b659bca93f71ede388f4c5d6fba3b7bd4aa85b363806f28
+size 101568

src/my_utils.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 import re
 from pprint import pprint
 from io import StringIO
-from typing import Literal, Optional, Union
 import tkinter as tk
 from tkinter import filedialog, messagebox, ttk
@@ -14,17 +14,26 @@ import numpy as np
 from sklearn.ensemble import RandomForestClassifier
 from sklearn import svm
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
-from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.pipeline import Pipeline
 from sklearn.manifold import TSNE
 from sklearn.model_selection import train_test_split
 from sklearn.utils import resample
 import umap
 import requests
 from Bio import Entrez
 from Bio import SeqIO
 from tqdm import tqdm
@@ -234,7 +243,9 @@ def plot_emb(x: np.ndarray, labels : np.ndarray, model_name: str):
     tsne_plot(x, labels, title=f't-SNE - {model_name}')
     plot_umap(x, labels, title=f'UMAP - {model_name}')
-def evaluate(model: Union[RandomForestClassifier, svm.SVC], X_test : np.ndarray, y_test : np.ndarray) -> dict:
     """
     Evaluates a classification model on test data and computes performance metrics.
@@ -253,7 +264,7 @@ def evaluate(model: Union[RandomForestClassifier, svm.SVC], X_test : np.ndarray,
     """
     result = {}
-    y_pred = model.predict(X_test)
     result['Accuracy'] = accuracy_score(y_test, y_pred)
     result['Recall'] = recall_score(y_test, y_pred, average = 'weighted')
@@ -261,8 +272,6 @@ def evaluate(model: Union[RandomForestClassifier, svm.SVC], X_test : np.ndarray,
     result['F1'] = f1_score(y_test, y_pred, average='weighted')
     pprint(result)
     return result
@@ -270,9 +279,27 @@ def evaluate(model: Union[RandomForestClassifier, svm.SVC], X_test : np.ndarray,
 def train_rf(title: str,
              x: np.ndarray,
              y : np.ndarray,
-             params: dict) -> tuple[RandomForestClassifier, dict]:
-    y_encoded = LabelEncoder().fit_transform(y)
     x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.33, stratify=y_encoded, random_state=42)
@@ -287,61 +314,78 @@ def train_rf(title: str,
     evaluation = evaluate(classifier, x_test, y_test)
-    print(classification_report(y_test, y_pred, zero_division=0))
-    confusion(title = title,
-              y_true = y_test,
-              y_pred = y_pred)
-    del x_train, x_test, y_train, y_test
-    return classifier, evaluation
-def train_svm(title: str, x: np.ndarray, y: list[str], params: dict) -> tuple[Pipeline, dict]:
     """
-    Train a Support Vector Machine (SVM) classifier with the provided data and parameters, evaluate its performance, and return the trained pipeline and evaluation metrics.
     Args:
         title (str): Title for the confusion matrix plot.
-        x (np.ndarray): Feature matrix.
-        y (list[str]): List of labels.
-        params (dict): Dictionary of parameters for the SVM.
     Returns:
-        tuple[Pipeline, dict]: The trained pipeline and a dictionary of evaluation metrics.
     """
     x_train, x_test, y_train, y_test = train_test_split(
-        x, y, test_size=0.33, stratify=y, random_state=42
     )
     svc_params = {k.replace('svm__', ''): v for k, v in params.items() if k.startswith('svm__')}
     pipeline = Pipeline([
         ('scaler', StandardScaler()),
-        ('svm', svm.SVC(**svc_params))
     ])
     pipeline.fit(x_train, y_train)
     y_pred = pipeline.predict(x_test)
-    evaluation = evaluate(model=pipeline, X_test=x_test, y_test=y_test)
-    confusion(title=title,
-              y_true=y_test,
-              y_pred=y_pred)
-    print(classification_report(y_test, y_pred, zero_division=0))
-    return pipeline, evaluation
-def randomSVM(X: list[np.ndarray], y = list[str]) -> dict:
-    X_train, _, y_train, _ = train_test_split(X,
-                                              y,
                                               test_size=0.33,
-                                              stratify=y,
-                                              random_state=42)
     pipeline = Pipeline([('scaler', StandardScaler()),
                          ('svm', svm.SVC())])
@@ -365,28 +409,26 @@ def randomSVM(X: list[np.ndarray], y = list[str]) -> dict:
         n_iter=50,
         scoring='f1_weighted',
         cv=3,
-        verbose=2,
         random_state=42,
         n_jobs=-1
     )
-    random_search.fit(X_sample, y_sample)
     pprint(random_search.best_params_)
     return random_search.best_params_
-def randomSearch(X: np.ndarray, y: np.ndarray) -> dict:
-    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
     classifier : RandomForestClassifier = RandomForestClassifier(random_state=42)
-    X_sample, y_sample = resample(X_train,
-                                  y_train,
-                                  n_samples = 3500,
-                                  stratify = y_train,
-                                  random_state = 42) #type: ignore
     param_grid = {
         'n_estimators': list(np.arange(500,4000, 400)),
         'max_depth': [None, 10, 20, 30, 40, 50],
@@ -404,10 +446,10 @@ def randomSearch(X: np.ndarray, y: np.ndarray) -> dict:
                                    n_iter= 50,
                                    scoring = 'f1_weighted',
                                    cv = 3,
-                                   verbose = 2,
                                    n_jobs = -1)
-    rf_random.fit(X = X_sample, y = y_sample)
     print('Best Params')
     pprint(rf_random.best_params_)
@@ -534,7 +576,7 @@ def _fetch_sequence_for_row(idx, row):
         try:
             sequence = fetch_uniprot_sequence(swiss_id)
         except HTTPError as e:
-            print(f"Warning: SwissProt fetch failed for {swiss_id} with HTTP {e.code}")
             sequence = None
     # Try RefSeq if no SwissProt
@@ -542,7 +584,7 @@ def _fetch_sequence_for_row(idx, row):
         try:
             sequence = fetch_refseq_sequence(row['Refseq_Accession'])
         except HTTPError as e:
-            print(f"Warning: RefSeq fetch failed for {row['Refseq_Accession']} with HTTP {e.code}")
             sequence = None
     # Try Other_Accession if still no sequence
@@ -550,7 +592,7 @@ def _fetch_sequence_for_row(idx, row):
         try:
             sequence = fetch_refseq_sequence(row['Other_Accession'])
         except HTTPError as e:
-            print(f"Warning: RefSeq fetch failed for {row['Other_Accession']} with HTTP {e.code}")
             sequence = None
     return idx, sequence

 import re
 from pprint import pprint
 from io import StringIO
+from typing import Literal, Optional
 import tkinter as tk
 from tkinter import filedialog, messagebox, ttk
 from sklearn.ensemble import RandomForestClassifier
 from sklearn import svm
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
+from sklearn.metrics import (
+    classification_report,
+    accuracy_score,
+    f1_score,
+    recall_score,
+    precision_score,
+    confusion_matrix,
+)
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.pipeline import Pipeline
 from sklearn.manifold import TSNE
 from sklearn.model_selection import train_test_split
 from sklearn.utils import resample
+from sklearn.base import BaseEstimator
 import umap
 import requests
+from requests.exceptions import HTTPError
 from Bio import Entrez
 from Bio import SeqIO
 from tqdm import tqdm
     tsne_plot(x, labels, title=f't-SNE - {model_name}')
     plot_umap(x, labels, title=f'UMAP - {model_name}')
+def evaluate(model: BaseEstimator,
+             x_test: np.ndarray,
+             y_test: np.ndarray) -> dict:
     """
     Evaluates a classification model on test data and computes performance metrics.
     """
     result = {}
+    y_pred = model.predict(x_test)  # type: ignore
     result['Accuracy'] = accuracy_score(y_test, y_pred)
     result['Recall'] = recall_score(y_test, y_pred, average = 'weighted')
     result['F1'] = f1_score(y_test, y_pred, average='weighted')
     pprint(result)
     return result
 def train_rf(title: str,
              x: np.ndarray,
              y : np.ndarray,
+             params: dict) -> tuple[RandomForestClassifier, dict, LabelEncoder]:
+    """
+    Trains a RandomForestClassifier on the provided data, evaluates its performance, and displays results.
+    Args:
+        title (str): Title for the confusion matrix plot.
+        x (np.ndarray): Feature matrix for training and testing.
+        y (np.ndarray): Target labels corresponding to the feature matrix.
+        params (dict): Parameters to initialize the RandomForestClassifier.
+    Returns:
+        tuple[RandomForestClassifier, dict, LabelEncoder]:
+            - Trained RandomForestClassifier instance,
+            - Evaluation metrics as a dictionary,
+            - Fitted LabelEncoder for label transformations.
+    Side Effects:
+        - Prints a classification report to stdout.
+        - Displays a confusion matrix plot.
+    """
+    le = LabelEncoder()
+    y_encoded = le.fit_transform(y)
     x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.33, stratify=y_encoded, random_state=42)
     evaluation = evaluate(classifier, x_test, y_test)
+    print(classification_report(y_test,
+                                y_pred,
+                                zero_division=0,
+                                target_names = le.classes_))
+    y_pred_str = le.inverse_transform(y_pred)
+    y_test_str = le.inverse_transform(y_test)
+    confusion(title=title, y_true=y_test_str, y_pred=y_pred_str)
+    return classifier, evaluation, le
def train_svm(title: str, x: np.ndarray, y: np.ndarray, params: dict) -> tuple[Pipeline, dict, LabelEncoder]:
    """
    Trains an SVM classifier using the provided data and parameters, evaluates its performance, and returns the trained pipeline, evaluation metrics, and label encoder.

    Labels are integer-encoded with a LabelEncoder, the data is split with a stratified
    67/33 train/test split (random_state=42), and a StandardScaler + SVC pipeline is
    fitted on the training part.

    Args:
        title (str): Title for the confusion matrix plot.
        x (np.ndarray): Feature matrix for training and testing.
        y (np.ndarray): Target labels corresponding to the feature matrix.
        params (dict): Dictionary of parameters for the SVM classifier. Only keys
            prefixed with 'svm__' are used (the prefix is stripped before the values
            are passed to SVC); all other keys are ignored. If 'svm__probability' is
            not supplied, probability estimates are enabled by default.

    Returns:
        tuple[Pipeline, dict, LabelEncoder]:
            - Trained scikit-learn Pipeline object containing the scaler and SVM.
            - Dictionary with evaluation metrics from the `evaluate` function.
            - Fitted LabelEncoder instance for encoding and decoding labels.

    Side Effects:
        - Displays a confusion matrix plot using the provided title.
        - Prints a classification report to the standard output.
    """
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y_encoded, test_size=0.33, stratify=y_encoded, random_state=42
    )

    # Strip only the leading pipeline-step prefix; str.replace would also remove
    # any later occurrence of 'svm__' inside a key.
    prefix = 'svm__'
    svc_params = {k.removeprefix(prefix): v for k, v in params.items() if k.startswith(prefix)}

    # randomSVM() adds 'svm__probability' to the params it returns. Passing
    # probability=True as a separate keyword next to **svc_params would then raise
    # "TypeError: got multiple values for keyword argument 'probability'", so the
    # default is merged into the dict instead.
    svc_params.setdefault('probability', True)

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', svm.SVC(**svc_params))
    ])
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)

    evaluation = evaluate(model=pipeline, x_test=x_test, y_test=y_test)

    # Plot the confusion matrix with the original (decoded) class labels.
    y_pred_str = le.inverse_transform(y_pred)
    y_test_str = le.inverse_transform(y_test)
    confusion(title=title, y_true=y_test_str, y_pred=y_pred_str)

    print(classification_report(y_test, y_pred, zero_division=0, target_names = le.classes_))
    return pipeline, evaluation, le
+def randomSVM(x: np.ndarray, y: np.ndarray) -> dict:
+    le = LabelEncoder()
+    y_encoded = le.fit_transform(y)
+    x_train, _, y_train, _ = train_test_split(x,
+                                              y_encoded,
                                               test_size=0.33,
+                                              stratify=y_encoded,
+                                              random_state=42)
     pipeline = Pipeline([('scaler', StandardScaler()),
                          ('svm', svm.SVC())])
         n_iter=50,
         scoring='f1_weighted',
         cv=3,
+        verbose=1,
         random_state=42,
         n_jobs=-1
     )
+    random_search.fit(x_train, y_train)
+    random_search.best_params_['svm__probability'] = True
     pprint(random_search.best_params_)
     return random_search.best_params_
+def randomSearch(x: np.ndarray, y: np.ndarray) -> dict:
+    le = LabelEncoder()
+    y_encoded = le.fit_transform(y)
+    x_train, _, y_train, _ = train_test_split(x, y_encoded, test_size=0.33, stratify=y_encoded, random_state=42)
     classifier : RandomForestClassifier = RandomForestClassifier(random_state=42)
     param_grid = {
         'n_estimators': list(np.arange(500,4000, 400)),
         'max_depth': [None, 10, 20, 30, 40, 50],
                                    n_iter= 50,
                                    scoring = 'f1_weighted',
                                    cv = 3,
+                                   verbose = 1,
                                    n_jobs = -1)
+    rf_random.fit(X = x_train, y = y_train)
     print('Best Params')
     pprint(rf_random.best_params_)
         try:
             sequence = fetch_uniprot_sequence(swiss_id)
         except HTTPError as e:
+            print(f"Warning: SwissProt fetch failed for {swiss_id} with HTTP {e}")
             sequence = None
     # Try RefSeq if no SwissProt
         try:
             sequence = fetch_refseq_sequence(row['Refseq_Accession'])
         except HTTPError as e:
+            print(f"Warning: RefSeq fetch failed for {row['Refseq_Accession']} with HTTP {e}")
             sequence = None
     # Try Other_Accession if still no sequence
         try:
             sequence = fetch_refseq_sequence(row['Other_Accession'])
         except HTTPError as e:
+            print(f"Warning: RefSeq fetch failed for {row['Other_Accession']} with HTTP {e}")
             sequence = None
     return idx, sequence