Refactor and optimize notebook utilities and model training functions

- Updated embedding loading function to return the processed embeddings stacked into a single 2D array (accession identifiers are collected alongside but not yet returned).
- Enhanced confusion matrix visualization with improved aesthetics.
- Consolidated PCA, t-SNE, and UMAP plotting functions for better clarity and consistency.
- Added LabelEncoder for encoding target labels in Random Forest and SVM training functions.
- Increased timeout for sequence fetching functions to improve reliability.
- Removed unused imports and cleaned up code for better readability.
- Updated documentation for functions to clarify parameters and return types.

Files changed (7)

Data/TaxDistributionPSORT.svg +0 -0
Data/trainingData.csv +0 -0
notebooks/EDA_Psort.ipynb +2 -2
notebooks/EmbAnalisis.ipynb +2 -2
notebooks/Get_embeddings.ipynb +2 -2
notebooks/hyperparamsRF.ipynb +2 -2
src/my_utils.py +172 -120

Data/TaxDistributionPSORT.svg CHANGED Viewed

Data/trainingData.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/EDA_Psort.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f3676927b7ed8eeef1a881f840243f931c72e429e8c5af70db1cbc4b8d82e900
-size 15130990

 version https://git-lfs.github.com/spec/v1
+oid sha256:888f3665e5b2bf5e597acbe20bb839018b5ece80c55c3bf0bfd911904399031e
+size 10331239

notebooks/EmbAnalisis.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f56a416d1a8fb454ba368583013118d8fc490964dd036d3b3ce8c5879a4393b3
-size 10635423

 version https://git-lfs.github.com/spec/v1
+oid sha256:4f58224a5c99d9990d0a3091944f16ed8d985de10690c166f59fc1739c9aabf9
+size 4648240

notebooks/Get_embeddings.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0e07c094294b597ad35d3ab0bd89cf3c5708a68cc09ef7ad66f9ca77490e9461
-size 15520

 version https://git-lfs.github.com/spec/v1
+oid sha256:f4d087d9e61aa44b98adedab8e1a483a1d981137f826da03f14a897617f8ef53
+size 10847

notebooks/hyperparamsRF.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3c4cbddf3a3a71e3c39bb9def4922e9c1f2fbb4fbbe241cd0e019af820cca6a4
-size 702978

 version https://git-lfs.github.com/spec/v1
+oid sha256:ed5fec8d6f5354ecaef873661bc650c07f91e4e425b3d20c5f221ab8d1d21b11
+size 707241

src/my_utils.py CHANGED Viewed

@@ -3,9 +3,7 @@ import os
 import re
 from pprint import pprint
 from io import StringIO
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from urllib.error import HTTPError
-from typing import Literal, Optional
 import tkinter as tk
 from tkinter import filedialog, messagebox, ttk
@@ -18,7 +16,7 @@ from sklearn import svm
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
 from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline
 from sklearn.manifold import TSNE
 from sklearn.model_selection import train_test_split
@@ -34,46 +32,66 @@ from tqdm import tqdm
 # Visualization libraries
 import seaborn as sns
 import matplotlib.pyplot as plt
-import plotly.express as px
 from esm.models.esmc import ESMC
 from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
 from transformers import T5Tokenizer, T5EncoderModel, PreTrainedModel
-from esm.sdk.forge import ESM3ForgeInferenceClient
 from joblib import load
 import torch
-import gc
 # Load one chunk of embeddings
-def load_emb(path: str, acc: list[str])->list[np.ndarray]:
-    """    Load embeddings from a specified path.
     Args:
-        path (str): Directory where embeddings are stored.
-        acc (list[str]): List of accession IDs corresponding to the embeddings.
     Returns:
-        list[np.ndarray]: List of loaded embeddings as numpy arrays.
     """
     if not os.path.exists(path):
         raise FileNotFoundError(f"The specified path does not exist: {path}")
-    X = []
-    for a in tqdm(acc, desc = 'Cargando embeddings'):
         emb : np.ndarray = np.load(os.path.join(path, f"{a}.npy"))
         if len(emb.shape) == 3:
             emb = emb.squeeze(axis = 0)
             emb = emb.mean(axis = 0)
-            X.append(emb)
         elif len(emb.shape) == 2:
             emb = emb.mean(axis = 0)
-            X.append(emb)
         else:
-            X.append(emb)
-    return X
 def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
@@ -88,10 +106,10 @@ def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
                           y_pred = y_pred,
                           normalize = 'pred')
-    class_names = np.unique(y_true)
-    plt.figure(figsize=(6, 4))
-    sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
-                xticklabels=class_names, yticklabels=class_names) #type: ignore
     plt.xlabel('Predicted Label')
     plt.ylabel('True Label')
@@ -99,27 +117,7 @@ def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
     plt.tight_layout()
     plt.show()
-def perplexity(X):
-    """
-    Plot the KL divergence for different perplexity values in t-SNE.
-    Args:
-        X (list[np.ndarray]): List of feature arrays to be reduced.
-    """
-    X_array = np.vstack(X)
-    perp= np.arange(5, 55, 5)
-    divergence = []
-    for i in perp:
-        model = TSNE(n_components=2, init="pca", perplexity=i)
-        divergence.append(model.kl_divergence_)
-    fig = px.line(x=perp, y=divergence, markers=True)
-    fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
-    fig.update_traces(line_color="red", line_width=1)
-    fig.show()
-def plot_umap(x: list[np.ndarray], y: list[str], title: str, org: list[str]) -> None:
     """
     Plot a 2D UMAP projection of high-dimensional data with color-coded labels and hover information.
@@ -133,66 +131,95 @@ def plot_umap(x: list[np.ndarray], y: list[str], title: str, org: list[str]) ->
         None: Displays an interactive UMAP scatter plot using Plotly.
     """
     reducer = umap.UMAP(n_neighbors=30, random_state=42)
-    x_array = np.vstack(x)
-    scaled_x = StandardScaler().fit_transform(x_array)
     embedding = reducer.fit_transform(scaled_x)
     embedding = np.array(embedding)  # Ensure it's a NumPy array for slicing
-    fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=y, hover_data=[org, y])
-    fig.update_layout(
-        title=title,
-        xaxis_title="First UMAP",
-        yaxis_title="Second UMAP",
-    )
-    fig.show()
-def plot_PCA(X: np.ndarray, labels: list[str], title: str, org : list[str], scale: bool) -> None:
-    X_array = np.vstack(X) #type: ignore
     pca = PCA(n_components=2, random_state=42)
-    if scale:
-        pipe = Pipeline([('scaler', StandardScaler()), ('pca', pca)])
-        Xt = pipe.fit_transform(X_array)
-        explained = pipe.named_steps['pca'].explained_variance_ratio_
-    else:
-        Xt = pca.fit_transform(X_array)
-        explained = pca.explained_variance_ratio_
     df_plot = pd.DataFrame({
-        'PC1': Xt[:, 0],
-        'PC2': Xt[:, 1],
         'Label': labels
     })
-    fig = px.scatter(df_plot, x='PC1', y='PC2', color='Label', hover_data= [org, labels])
-    fig.update_layout(
-        title=title,
-        xaxis_title=f'PC1 ({explained[0]*100:.1f}%)',
-        yaxis_title=f'PC2 ({explained[1]*100:.1f}%)'
-    )
-    fig.show()
-def tsne_plot(X, y, org: list[str]) -> None:
-    # If X is a list of arrays, stack them; if already ndarray, use as is
-    if isinstance(X, list):
-        X_array = np.vstack(X)
-    else:
-        X_array = X
-    X_array = StandardScaler().fit_transform(X_array)
     tsne = TSNE(n_components=2, perplexity=60, random_state=42)
-    tsne_fit = tsne.fit_transform(X_array)
-    fig = px.scatter(x=tsne_fit[:, 0], y=tsne_fit[:, 1], color=y, hover_data=[org, y])
-    fig.update_layout(
-        title="t-SNE",
-        xaxis_title="First t-SNE",
-        yaxis_title="Second t-SNE"
-    )
-    fig.show()
-def plot_emb(X, y, model_name, org : list[str]):
     """    Plot embeddings using PCA, t-SNE, and UMAP.
     Args:
@@ -203,12 +230,27 @@ def plot_emb(X, y, model_name, org : list[str]):
     """
     print(f"Plotting embeddings for: {model_name}")
-    plot_PCA(X, y, title="PCA", scale=True, org = org)
-    tsne_plot(X, y,org = org)
-    plot_umap(X, y, title="UMAP",org = org)
-def evaluate(model, X_test, y_test):
     result = {}
     y_pred = model.predict(X_test)
@@ -225,20 +267,25 @@ def evaluate(model, X_test, y_test):
-def train_rf(title : str, X : np.ndarray, y : np.ndarray, params: dict) -> tuple[RandomForestClassifier, dict]:
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
     # Initialize the RandomForestClassifier with specified parameters
     classifier: RandomForestClassifier = RandomForestClassifier(**params)
     # Fit the model on training data
-    classifier.fit(X_train, y_train)
     # Make predictions on the test data
-    y_pred = classifier.predict(X_test)
-    evaluation = evaluate(classifier, X_test, y_test)
     print(classification_report(y_test, y_pred, zero_division=0))
@@ -246,51 +293,55 @@ def train_rf(title : str, X : np.ndarray, y : np.ndarray, params: dict) -> tuple
               y_true = y_test,
               y_pred = y_pred)
-    del X_train, X_test, y_train, y_test
     return classifier, evaluation
-def train_svm(title : str, X: np.ndarray, y: np.ndarray, params:dict) -> tuple[Pipeline, dict]:
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.33, stratify=y, random_state=42
     )
     svc_params = {k.replace('svm__', ''): v for k, v in params.items() if k.startswith('svm__')}
     pipeline = Pipeline([
         ('scaler', StandardScaler()),
         ('svm', svm.SVC(**svc_params))
-        ])
-    pipeline.fit(X_train, y_train)
-    y_pred = pipeline.predict(X_test)
-    evaluation = evaluate(model=pipeline, X_test=X_test, y_test=y_test)
-    confusion(title = title,
-            y_true = y_test,
-            y_pred = y_pred)#type: ignore
     print(classification_report(y_test, y_pred, zero_division=0))
     return pipeline, evaluation
-def randomSVM(X: np.ndarray, y = np.ndarray) -> dict:
     X_train, _, y_train, _ = train_test_split(X,
                                               y,
                                               test_size=0.33,
-                                              stratify=y,#type: ignore
                                               random_state=42)
-    X_sample, y_sample = resample(X_train,
-                                   y_train,
-                                     n_samples = 3500,
-                                       stratify = y_train,
-                                         random_state = 42) #type: ignore
     pipeline = Pipeline([('scaler', StandardScaler()),
                          ('svm', svm.SVC())])
@@ -406,7 +457,7 @@ def fetch_uniprot_sequence(uniprot_id: str):
     """
     url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
-    response = requests.get(url, timeout=10)
     if response.status_code == 200:
         try:
@@ -418,7 +469,7 @@ def fetch_uniprot_sequence(uniprot_id: str):
         except ValueError:
             # fallback to UniSave if the standard endpoint is not available
             url = f"https://rest.uniprot.org/unisave/{uniprot_id}.fasta"
-            response = requests.get(url, timeout=10)
             if response.status_code == 200:
                 try:
@@ -461,7 +512,7 @@ def fetch_refseq_sequence(refseq_id : str):
     except (HTTPError, ValueError):
         url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
-        response = requests.get(url, timeout=10)
         if response.status_code == 200:
             try:
                 fasta_data = response.text
@@ -471,7 +522,7 @@ def fetch_refseq_sequence(refseq_id : str):
             except ValueError:
                 print(f"No se pudo convertir {fasta_data}, id: {refseq_id}")
-# Main function to fetch sequences for a DataFrame
 def _fetch_sequence_for_row(idx, row):
     """
     Helper to fetch sequence for a single row. Returns (idx, sequence).
@@ -504,6 +555,7 @@ def _fetch_sequence_for_row(idx, row):
     return idx, sequence
 def fetch_sequences_for_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     Add a 'sequence' column to the dataframe by fetching sequences from

 import re
 from pprint import pprint
 from io import StringIO
+from typing import Literal, Optional, Union
 import tkinter as tk
 from tkinter import filedialog, messagebox, ttk
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
 from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.pipeline import Pipeline
 from sklearn.manifold import TSNE
 from sklearn.model_selection import train_test_split
 # Visualization libraries
 import seaborn as sns
 import matplotlib.pyplot as plt
 from esm.models.esmc import ESMC
 from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
 from transformers import T5Tokenizer, T5EncoderModel, PreTrainedModel
 from joblib import load
 import torch
 # Load one chunk of embeddings
def load_emb(path: str, acc: list[str]) -> np.ndarray:
    """
    Load per-accession embedding files and stack them into a single array.

    For each accession in `acc`, the file `<path>/<accession>.npy` is loaded
    and reduced before being collected:
        - 3 dimensions: axis 0 is squeezed away, then the result is averaged over axis 0.
        - 2 dimensions: averaged over axis 0.
        - anything else: used as is.

    Args:
        path (str): Directory containing the embedding `.npy` files.
        acc (list[str]): Accession identifiers; one file is loaded per entry.

    Returns:
        np.ndarray: The reduced embeddings stacked with `np.vstack`, in the
        same order as `acc`. When every reduced embedding is 1D, row `i`
        corresponds to `acc[i]`, so the caller's `acc` already serves as the
        row index.

    Raises:
        FileNotFoundError: If `path` does not exist, or if the `.npy` file of
            an accession is missing (raised by `np.load`).
        ValueError: If `acc` is empty (`np.vstack` needs at least one array),
            or if a 3D embedding's first axis does not have length 1.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"The specified path does not exist: {path}")

    rows = []
    # tqdm derives the total from len(acc); the number of .npy files present
    # in the directory says nothing about how many embeddings are loaded here.
    for a in tqdm(acc, desc='Cargando embeddings'):
        emb: np.ndarray = np.load(os.path.join(path, f"{a}.npy"))
        if emb.ndim == 3:
            emb = emb.squeeze(axis=0).mean(axis=0)
        elif emb.ndim == 2:
            emb = emb.mean(axis=0)
        rows.append(emb)
    return np.vstack(rows)
 def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
                           y_pred = y_pred,
                           normalize = 'pred')
+    class_names = list(np.unique(y_true))
+    plt.figure(figsize=(10, 10))
+    sns.heatmap(cm, annot=True, fmt='.2f', cmap='Greys',
+                xticklabels=class_names, yticklabels=class_names)
     plt.xlabel('Predicted Label')
     plt.ylabel('True Label')
     plt.tight_layout()
     plt.show()
def plot_umap(x: np.ndarray, y: np.ndarray, title: str) -> None:
    """
    Plot a 2D UMAP projection of high-dimensional data, colored by label.

    The features are standardized with `StandardScaler`, reduced with UMAP
    (`n_neighbors=30`, `random_state=42`) and drawn as a static seaborn
    scatter plot with the legend placed outside the axes.

    Args:
        x (np.ndarray): Feature matrix; assumed to be (n_samples, n_features).
        y (np.ndarray): Label of each sample, used to color the points.
        title (str): Title of the plot.

    Returns:
        None: The figure is displayed with `plt.show()`.
    """
    scaled_x = StandardScaler().fit_transform(x)
    reducer = umap.UMAP(n_neighbors=30, random_state=42)
    # Wrap the result in np.array so it can be sliced by column below.
    embedding = np.array(reducer.fit_transform(scaled_x))

    df_plot = pd.DataFrame({
        'UMAP1': embedding[:, 0],
        'UMAP2': embedding[:, 1],
        'Label': y,
    })

    plt.figure(figsize=(14, 6))
    # sns.scatterplot returns the matplotlib Axes it drew on.
    ax = sns.scatterplot(data=df_plot, x='UMAP1', y='UMAP2', hue='Label', alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('UMAP Component 1')
    ax.set_ylabel('UMAP Component 2')
    plt.legend(title='Labels', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
def plot_pca(x : np.ndarray, labels: np.ndarray, title: str) -> None:
    """
    Plot the first two principal components of the data, colored by label.

    The data goes through a `StandardScaler` + `PCA` pipeline (2 components,
    `random_state=42`). The explained variance ratio of both components is
    appended to the plot title.

    Args:
        x (np.ndarray): Input data; assumed to be (n_samples, n_features).
        labels (np.ndarray): Class or group label of each sample, used to
            color the points.
        title (str): Title prefix for the plot.

    Returns:
        None: Displays a static seaborn scatter plot with `plt.show()`.
    """
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2, random_state=42)),
    ])
    projected = pipe.fit_transform(x)
    explained = pipe.named_steps['pca'].explained_variance_ratio_

    df_plot = pd.DataFrame({
        'PC1': projected[:, 0],
        'PC2': projected[:, 1],
        'Label': labels,
    })

    plt.figure(figsize=(14, 6))
    # sns.scatterplot returns the matplotlib Axes it drew on.
    ax = sns.scatterplot(data=df_plot, x='PC1', y='PC2', hue='Label', alpha=0.7)
    ax.set_title(f'{title} - Explained Variance: {explained[0]:.2f}, {explained[1]:.2f}')
    ax.set_xlabel('First Principal Component')
    ax.set_ylabel('Second Principal Component')
    plt.legend(title='Labels', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
def tsne_plot(x: np.ndarray, labels: np.ndarray, title: str, perplexity: float = 60) -> None:
    """
    Plot a 2D t-SNE projection of high-dimensional data, colored by label.

    The features are standardized with `StandardScaler` and embedded with
    t-SNE (2 components, `random_state=42`).

    Args:
        x (np.ndarray): Feature matrix; assumed to be (n_samples, n_features).
        labels (np.ndarray): Label of each sample, used to color the points.
        title (str): Title of the plot.
        perplexity (float): t-SNE perplexity. Defaults to 60. scikit-learn
            requires it to be smaller than the number of samples, so pass a
            lower value for small datasets.

    Returns:
        None: Displays a static seaborn scatter plot with `plt.show()`.
    """
    x_scaled = StandardScaler().fit_transform(x)
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    tsne_fit = tsne.fit_transform(x_scaled)

    df_plot = pd.DataFrame({
        't-SNE1': tsne_fit[:, 0],
        't-SNE2': tsne_fit[:, 1],
        'Label': labels,
    })

    plt.figure(figsize=(14, 6))
    # sns.scatterplot returns the matplotlib Axes it drew on.
    ax = sns.scatterplot(data=df_plot, x='t-SNE1', y='t-SNE2', hue='Label', alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('First t-SNE Component')
    ax.set_ylabel('Second t-SNE Component')
    plt.legend(title='Labels', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
def plot_emb(x: np.ndarray, labels : np.ndarray, model_name: str):
    """
    Draw the PCA, t-SNE and UMAP projections of one embedding set, in that order.

    Args:
        x (np.ndarray): Feature matrix, passed unchanged to each plotting helper.
        labels (np.ndarray): Per-sample labels, passed unchanged to each helper.
        model_name (str): Name printed before plotting and appended to the
            title of every plot.
    """
    print(f"Plotting embeddings for: {model_name}")
    plot_jobs = (
        (plot_pca, f'PCA - {model_name}'),
        (tsne_plot, f't-SNE - {model_name}'),
        (plot_umap, f'UMAP - {model_name}'),
    )
    for plotter, plot_title in plot_jobs:
        plotter(x, labels, title=plot_title)
+def evaluate(model: Union[RandomForestClassifier, svm.SVC], X_test : np.ndarray, y_test : np.ndarray) -> dict:
+    """
+    Evaluates a classification model on test data and computes performance metrics.
+    Parameters:
+        model: A trained classification model with a `predict` method.
+        X_test: Features of the test dataset.
+        y_test: True labels for the test dataset.
+    Returns:
+        dict: A dictionary containing the following evaluation metrics:
+            - 'Accuracy': Overall accuracy of the model.
+            - 'Recall': Weighted recall score.
+            - 'Precision': Weighted precision score.
+            - 'F1': Weighted F1 score.
+    Side Effects:
+        Prints the evaluation metrics using pprint.
+    """
     result = {}
     y_pred = model.predict(X_test)
+def train_rf(title: str,
+             x: np.ndarray,
+             y : np.ndarray,
+             params: dict) -> tuple[RandomForestClassifier, dict]:
+    y_encoded = LabelEncoder().fit_transform(y)
+    x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.33, stratify=y_encoded, random_state=42)
     # Initialize the RandomForestClassifier with specified parameters
     classifier: RandomForestClassifier = RandomForestClassifier(**params)
     # Fit the model on training data
+    classifier.fit(x_train, y_train)
     # Make predictions on the test data
+    y_pred = classifier.predict(x_test)
+    evaluation = evaluate(classifier, x_test, y_test)
     print(classification_report(y_test, y_pred, zero_division=0))
               y_true = y_test,
               y_pred = y_pred)
+    del x_train, x_test, y_train, y_test
     return classifier, evaluation
def train_svm(title: str, x: np.ndarray, y: list[str], params: dict) -> tuple[Pipeline, dict]:
    """
    Fit a scaled SVM classifier on a stratified split and report its test performance.

    The data is split 67/33 (stratified on `y`, `random_state=42`). A
    `StandardScaler` + `svm.SVC` pipeline is fitted on the training part and
    used to predict the test part. The test predictions are passed to
    `evaluate`, plotted with `confusion`, and summarized with a printed
    classification report.

    Args:
        title (str): Title for the confusion matrix plot.
        x (np.ndarray): Feature matrix.
        y (list[str]): Label of each sample.
        params (dict): SVM parameters keyed with an `svm__` prefix; keys
            without that prefix are ignored.

    Returns:
        tuple[Pipeline, dict]: The fitted pipeline and the metrics returned by `evaluate`.
    """
    split = train_test_split(x, y, test_size=0.33, stratify=y, random_state=42)
    x_train, x_test, y_train, y_test = split

    # Keep only the pipeline-style 'svm__*' keys and strip the step prefix for SVC.
    svc_params = {}
    for key, value in params.items():
        if key.startswith('svm__'):
            svc_params[key.replace('svm__', '')] = value

    scaler_step = ('scaler', StandardScaler())
    svm_step = ('svm', svm.SVC(**svc_params))
    pipeline = Pipeline([scaler_step, svm_step])
    pipeline.fit(x_train, y_train)

    y_pred = pipeline.predict(x_test)
    evaluation = evaluate(model=pipeline, X_test=x_test, y_test=y_test)
    confusion(title=title, y_true=y_test, y_pred=y_pred)
    print(classification_report(y_test, y_pred, zero_division=0))
    return pipeline, evaluation
+def randomSVM(X: list[np.ndarray], y = list[str]) -> dict:
     X_train, _, y_train, _ = train_test_split(X,
                                               y,
                                               test_size=0.33,
+                                              stratify=y,
                                               random_state=42)
     pipeline = Pipeline([('scaler', StandardScaler()),
                          ('svm', svm.SVC())])
     """
     url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
+    response = requests.get(url, timeout=60)
     if response.status_code == 200:
         try:
         except ValueError:
             # fallback to UniSave if the standard endpoint is not available
             url = f"https://rest.uniprot.org/unisave/{uniprot_id}.fasta"
+            response = requests.get(url, timeout=60)
             if response.status_code == 200:
                 try:
     except (HTTPError, ValueError):
         url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
+        response = requests.get(url, timeout=60)
         if response.status_code == 200:
             try:
                 fasta_data = response.text
             except ValueError:
                 print(f"No se pudo convertir {fasta_data}, id: {refseq_id}")
 def _fetch_sequence_for_row(idx, row):
     """
     Helper to fetch sequence for a single row. Returns (idx, sequence).
     return idx, sequence
+# Main function to fetch sequences for a DataFrame
 def fetch_sequences_for_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     Add a 'sequence' column to the dataframe by fetching sequences from