jpuglia committed on
Commit
816b1ef
·
1 Parent(s): e10ca7d

Refactor and optimize notebook utilities and model training functions

Browse files

- Updated embedding loading function to return processed embeddings and accession identifiers.
- Enhanced confusion matrix visualization with improved aesthetics.
- Consolidated PCA, t-SNE, and UMAP plotting functions for better clarity and consistency.
- Added LabelEncoder for encoding target labels in Random Forest and SVM training functions.
- Increased timeout for sequence fetching functions to improve reliability.
- Removed unused imports and cleaned up code for better readability.
- Updated documentation for functions to clarify parameters and return types.

Data/TaxDistributionPSORT.svg CHANGED
Data/trainingData.csv CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/EDA_Psort.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3676927b7ed8eeef1a881f840243f931c72e429e8c5af70db1cbc4b8d82e900
3
- size 15130990
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:888f3665e5b2bf5e597acbe20bb839018b5ece80c55c3bf0bfd911904399031e
3
+ size 10331239
notebooks/EmbAnalisis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f56a416d1a8fb454ba368583013118d8fc490964dd036d3b3ce8c5879a4393b3
3
- size 10635423
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f58224a5c99d9990d0a3091944f16ed8d985de10690c166f59fc1739c9aabf9
3
+ size 4648240
notebooks/Get_embeddings.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e07c094294b597ad35d3ab0bd89cf3c5708a68cc09ef7ad66f9ca77490e9461
3
- size 15520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4d087d9e61aa44b98adedab8e1a483a1d981137f826da03f14a897617f8ef53
3
+ size 10847
notebooks/hyperparamsRF.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c4cbddf3a3a71e3c39bb9def4922e9c1f2fbb4fbbe241cd0e019af820cca6a4
3
- size 702978
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed5fec8d6f5354ecaef873661bc650c07f91e4e425b3d20c5f221ab8d1d21b11
3
+ size 707241
src/my_utils.py CHANGED
@@ -3,9 +3,7 @@ import os
3
  import re
4
  from pprint import pprint
5
  from io import StringIO
6
- from concurrent.futures import ThreadPoolExecutor, as_completed
7
- from urllib.error import HTTPError
8
- from typing import Literal, Optional
9
  import tkinter as tk
10
  from tkinter import filedialog, messagebox, ttk
11
 
@@ -18,7 +16,7 @@ from sklearn import svm
18
  from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
19
  from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
20
  from sklearn.decomposition import PCA
21
- from sklearn.preprocessing import StandardScaler
22
  from sklearn.pipeline import Pipeline
23
  from sklearn.manifold import TSNE
24
  from sklearn.model_selection import train_test_split
@@ -34,46 +32,66 @@ from tqdm import tqdm
34
  # Visualization libraries
35
  import seaborn as sns
36
  import matplotlib.pyplot as plt
37
- import plotly.express as px
38
 
39
  from esm.models.esmc import ESMC
40
  from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
41
  from transformers import T5Tokenizer, T5EncoderModel, PreTrainedModel
42
- from esm.sdk.forge import ESM3ForgeInferenceClient
43
 
44
  from joblib import load
45
 
46
  import torch
47
- import gc
48
 
49
 
50
 
51
  # Load one chunk of embeddings
52
- def load_emb(path: str, acc: list[str])->list[np.ndarray]:
53
-
54
- """ Load embeddings from a specified path.
 
 
 
 
55
  Args:
56
- path (str): Directory where embeddings are stored.
57
- acc (list[str]): List of accession IDs corresponding to the embeddings.
58
  Returns:
59
- list[np.ndarray]: List of loaded embeddings as numpy arrays.
 
 
 
 
 
 
 
 
60
  """
 
61
  if not os.path.exists(path):
62
  raise FileNotFoundError(f"The specified path does not exist: {path}")
63
 
64
- X = []
65
- for a in tqdm(acc, desc = 'Cargando embeddings'):
 
 
 
 
 
66
  emb : np.ndarray = np.load(os.path.join(path, f"{a}.npy"))
 
67
  if len(emb.shape) == 3:
68
  emb = emb.squeeze(axis = 0)
69
  emb = emb.mean(axis = 0)
70
- X.append(emb)
 
71
  elif len(emb.shape) == 2:
72
  emb = emb.mean(axis = 0)
73
- X.append(emb)
 
74
  else:
75
- X.append(emb)
76
- return X
 
 
77
 
78
  def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
79
 
@@ -88,10 +106,10 @@ def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
88
  y_pred = y_pred,
89
  normalize = 'pred')
90
 
91
- class_names = np.unique(y_true)
92
- plt.figure(figsize=(6, 4))
93
- sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
94
- xticklabels=class_names, yticklabels=class_names) #type: ignore
95
 
96
  plt.xlabel('Predicted Label')
97
  plt.ylabel('True Label')
@@ -99,27 +117,7 @@ def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
99
  plt.tight_layout()
100
  plt.show()
101
 
102
- def perplexity(X):
103
-
104
- """
105
- Plot the KL divergence for different perplexity values in t-SNE.
106
- Args:
107
- X (list[np.ndarray]): List of feature arrays to be reduced.
108
- """
109
-
110
- X_array = np.vstack(X)
111
- perp= np.arange(5, 55, 5)
112
- divergence = []
113
-
114
- for i in perp:
115
- model = TSNE(n_components=2, init="pca", perplexity=i)
116
- divergence.append(model.kl_divergence_)
117
- fig = px.line(x=perp, y=divergence, markers=True)
118
- fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
119
- fig.update_traces(line_color="red", line_width=1)
120
- fig.show()
121
-
122
- def plot_umap(x: list[np.ndarray], y: list[str], title: str, org: list[str]) -> None:
123
  """
124
  Plot a 2D UMAP projection of high-dimensional data with color-coded labels and hover information.
125
 
@@ -133,66 +131,95 @@ def plot_umap(x: list[np.ndarray], y: list[str], title: str, org: list[str]) ->
133
  None: Displays an interactive UMAP scatter plot using Plotly.
134
  """
135
  reducer = umap.UMAP(n_neighbors=30, random_state=42)
136
- x_array = np.vstack(x)
137
 
138
- scaled_x = StandardScaler().fit_transform(x_array)
139
  embedding = reducer.fit_transform(scaled_x)
140
  embedding = np.array(embedding) # Ensure it's a NumPy array for slicing
141
 
142
- fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=y, hover_data=[org, y])
143
- fig.update_layout(
144
- title=title,
145
- xaxis_title="First UMAP",
146
- yaxis_title="Second UMAP",
147
- )
148
- fig.show()
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
 
 
 
 
 
150
 
151
- def plot_PCA(X: np.ndarray, labels: list[str], title: str, org : list[str], scale: bool) -> None:
152
- X_array = np.vstack(X) #type: ignore
 
153
  pca = PCA(n_components=2, random_state=42)
154
 
155
- if scale:
156
- pipe = Pipeline([('scaler', StandardScaler()), ('pca', pca)])
157
- Xt = pipe.fit_transform(X_array)
158
- explained = pipe.named_steps['pca'].explained_variance_ratio_
159
- else:
160
- Xt = pca.fit_transform(X_array)
161
- explained = pca.explained_variance_ratio_
162
 
163
  df_plot = pd.DataFrame({
164
- 'PC1': Xt[:, 0],
165
- 'PC2': Xt[:, 1],
166
  'Label': labels
167
  })
 
 
 
 
 
 
 
 
 
168
 
169
- fig = px.scatter(df_plot, x='PC1', y='PC2', color='Label', hover_data= [org, labels])
170
- fig.update_layout(
171
- title=title,
172
- xaxis_title=f'PC1 ({explained[0]*100:.1f}%)',
173
- yaxis_title=f'PC2 ({explained[1]*100:.1f}%)'
174
- )
175
- fig.show()
176
 
 
 
 
177
 
178
- def tsne_plot(X, y, org: list[str]) -> None:
179
- # If X is a list of arrays, stack them; if already ndarray, use as is
180
- if isinstance(X, list):
181
- X_array = np.vstack(X)
182
- else:
183
- X_array = X
184
- X_array = StandardScaler().fit_transform(X_array)
185
  tsne = TSNE(n_components=2, perplexity=60, random_state=42)
186
- tsne_fit = tsne.fit_transform(X_array)
187
 
188
- fig = px.scatter(x=tsne_fit[:, 0], y=tsne_fit[:, 1], color=y, hover_data=[org, y])
189
- fig.update_layout(
190
- title="t-SNE",
191
- xaxis_title="First t-SNE",
192
- yaxis_title="Second t-SNE"
193
- )
194
- fig.show()
195
- def plot_emb(X, y, model_name, org : list[str]):
 
 
 
 
 
 
 
 
 
196
 
197
  """ Plot embeddings using PCA, t-SNE, and UMAP.
198
  Args:
@@ -203,12 +230,27 @@ def plot_emb(X, y, model_name, org : list[str]):
203
  """
204
 
205
  print(f"Plotting embeddings for: {model_name}")
206
- plot_PCA(X, y, title="PCA", scale=True, org = org)
207
- tsne_plot(X, y,org = org)
208
- plot_umap(X, y, title="UMAP",org = org)
209
 
 
210
 
211
- def evaluate(model, X_test, y_test):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
  result = {}
214
  y_pred = model.predict(X_test)
@@ -225,20 +267,25 @@ def evaluate(model, X_test, y_test):
225
 
226
 
227
 
228
- def train_rf(title : str, X : np.ndarray, y : np.ndarray, params: dict) -> tuple[RandomForestClassifier, dict]:
 
 
 
229
 
230
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
 
 
231
 
232
  # Initialize the RandomForestClassifier with specified parameters
233
  classifier: RandomForestClassifier = RandomForestClassifier(**params)
234
 
235
  # Fit the model on training data
236
- classifier.fit(X_train, y_train)
237
 
238
  # Make predictions on the test data
239
- y_pred = classifier.predict(X_test)
240
 
241
- evaluation = evaluate(classifier, X_test, y_test)
242
 
243
  print(classification_report(y_test, y_pred, zero_division=0))
244
 
@@ -246,51 +293,55 @@ def train_rf(title : str, X : np.ndarray, y : np.ndarray, params: dict) -> tuple
246
  y_true = y_test,
247
  y_pred = y_pred)
248
 
249
- del X_train, X_test, y_train, y_test
250
 
251
  return classifier, evaluation
252
 
253
- def train_svm(title : str, X: np.ndarray, y: np.ndarray, params:dict) -> tuple[Pipeline, dict]:
 
 
254
 
255
- X_train, X_test, y_train, y_test = train_test_split(
256
- X, y, test_size=0.33, stratify=y, random_state=42
 
 
 
 
 
 
 
 
 
257
  )
258
 
259
  svc_params = {k.replace('svm__', ''): v for k, v in params.items() if k.startswith('svm__')}
260
  pipeline = Pipeline([
261
  ('scaler', StandardScaler()),
262
  ('svm', svm.SVC(**svc_params))
263
- ])
264
-
265
 
266
- pipeline.fit(X_train, y_train)
267
 
268
- y_pred = pipeline.predict(X_test)
269
 
270
- evaluation = evaluate(model=pipeline, X_test=X_test, y_test=y_test)
271
 
272
- confusion(title = title,
273
- y_true = y_test,
274
- y_pred = y_pred)#type: ignore
275
 
276
  print(classification_report(y_test, y_pred, zero_division=0))
277
 
278
  return pipeline, evaluation
279
 
280
 
281
- def randomSVM(X: np.ndarray, y = np.ndarray) -> dict:
282
 
283
  X_train, _, y_train, _ = train_test_split(X,
284
  y,
285
  test_size=0.33,
286
- stratify=y,#type: ignore
287
  random_state=42)
288
-
289
- X_sample, y_sample = resample(X_train,
290
- y_train,
291
- n_samples = 3500,
292
- stratify = y_train,
293
- random_state = 42) #type: ignore
294
 
295
  pipeline = Pipeline([('scaler', StandardScaler()),
296
  ('svm', svm.SVC())])
@@ -406,7 +457,7 @@ def fetch_uniprot_sequence(uniprot_id: str):
406
  """
407
 
408
  url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
409
- response = requests.get(url, timeout=10)
410
 
411
  if response.status_code == 200:
412
  try:
@@ -418,7 +469,7 @@ def fetch_uniprot_sequence(uniprot_id: str):
418
  except ValueError:
419
  # fallback to UniSave if the standard endpoint is not available
420
  url = f"https://rest.uniprot.org/unisave/{uniprot_id}.fasta"
421
- response = requests.get(url, timeout=10)
422
 
423
  if response.status_code == 200:
424
  try:
@@ -461,7 +512,7 @@ def fetch_refseq_sequence(refseq_id : str):
461
  except (HTTPError, ValueError):
462
 
463
  url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
464
- response = requests.get(url, timeout=10)
465
  if response.status_code == 200:
466
  try:
467
  fasta_data = response.text
@@ -471,7 +522,7 @@ def fetch_refseq_sequence(refseq_id : str):
471
  except ValueError:
472
  print(f"No se pudo convertir {fasta_data}, id: {refseq_id}")
473
 
474
- # Main function to fetch sequences for a DataFrame
475
  def _fetch_sequence_for_row(idx, row):
476
  """
477
  Helper to fetch sequence for a single row. Returns (idx, sequence).
@@ -504,6 +555,7 @@ def _fetch_sequence_for_row(idx, row):
504
 
505
  return idx, sequence
506
 
 
507
  def fetch_sequences_for_dataframe(df: pd.DataFrame) -> pd.DataFrame:
508
  """
509
  Add a 'sequence' column to the dataframe by fetching sequences from
 
3
  import re
4
  from pprint import pprint
5
  from io import StringIO
6
+ from typing import Literal, Optional, Union
 
 
7
  import tkinter as tk
8
  from tkinter import filedialog, messagebox, ttk
9
 
 
16
  from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
17
  from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
18
  from sklearn.decomposition import PCA
19
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
20
  from sklearn.pipeline import Pipeline
21
  from sklearn.manifold import TSNE
22
  from sklearn.model_selection import train_test_split
 
32
  # Visualization libraries
33
  import seaborn as sns
34
  import matplotlib.pyplot as plt
 
35
 
36
  from esm.models.esmc import ESMC
37
  from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
38
  from transformers import T5Tokenizer, T5EncoderModel, PreTrainedModel
 
39
 
40
  from joblib import load
41
 
42
  import torch
 
43
 
44
 
45
 
46
  # Load one chunk of embeddings
47
def load_emb(path: str, acc: list[str]) -> tuple[np.ndarray, np.ndarray]:
    """
    Load and process embedding files from a directory.

    For each accession in ``acc``, loads the corresponding ``.npy`` file
    from ``path``, averages the embedding over its sequence axis when it
    has 2 or 3 dimensions, and collects the result.

    Args:
        path (str): Directory containing the embedding ``.npy`` files.
        acc (list[str]): Accession identifiers naming the embedding files.

    Returns:
        tuple[np.ndarray, np.ndarray]:
            - 2D array where each row is a processed embedding.
            - 1D array of accession identifiers, aligned row-for-row.

    Raises:
        FileNotFoundError: If ``path`` does not exist.

    Notes:
        - 3D embeddings are squeezed along axis 0, then averaged over axis 0.
        - 2D embeddings are averaged over axis 0.
        - Any other shape is used as-is.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"The specified path does not exist: {path}")

    x = []
    y = []

    # Progress total is the number of accessions actually being loaded,
    # not the count of every .npy file present in the directory.
    for a in tqdm(acc, desc='Cargando embeddings', total=len(acc)):

        emb: np.ndarray = np.load(os.path.join(path, f"{a}.npy"))

        if len(emb.shape) == 3:
            emb = emb.squeeze(axis=0)
            emb = emb.mean(axis=0)
        elif len(emb.shape) == 2:
            emb = emb.mean(axis=0)
        # Other shapes pass through unchanged.

        x.append(emb)
        y.append(a)

    # Bug fix: the docstring (and the commit's stated intent) promise both
    # the embeddings and their accession identifiers, but previously only
    # np.vstack(x) was returned and y was silently discarded.
    return np.vstack(x), np.array(y)
95
 
96
  def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
97
 
 
106
  y_pred = y_pred,
107
  normalize = 'pred')
108
 
109
+ class_names = list(np.unique(y_true))
110
+ plt.figure(figsize=(10, 10))
111
+ sns.heatmap(cm, annot=True, fmt='.2f', cmap='Greys',
112
+ xticklabels=class_names, yticklabels=class_names)
113
 
114
  plt.xlabel('Predicted Label')
115
  plt.ylabel('True Label')
 
117
  plt.tight_layout()
118
  plt.show()
119
 
120
def plot_umap(x: np.ndarray, y: np.ndarray, title: str) -> None:
    """
    Plot a 2D UMAP projection of high-dimensional data with color-coded labels.

    Args:
        x (np.ndarray): Feature matrix of shape (n_samples, n_features).
        y (np.ndarray): Label for each sample, used to color the points.
        title (str): Title for the plot.

    Returns:
        None: Displays a seaborn/matplotlib UMAP scatter plot.
    """
    reducer = umap.UMAP(n_neighbors=30, random_state=42)

    # Standardize features before projecting.
    scaled_x = StandardScaler().fit_transform(x)
    embedding = reducer.fit_transform(scaled_x)
    embedding = np.array(embedding)  # Ensure it's a NumPy array for slicing

    df_plot = pd.DataFrame({
        'UMAP1': embedding[:, 0],
        'UMAP2': embedding[:, 1],
        'Label': y
    })

    plt.figure(figsize=(14, 6))
    fig = sns.scatterplot(data=df_plot, x='UMAP1', y='UMAP2', hue='Label', alpha=0.7)
    fig.set_title(title)
    fig.set_xlabel('UMAP Component 1')
    fig.set_ylabel('UMAP Component 2')
    plt.legend(title='Labels', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
153
+
154
+
155
+
156
def plot_pca(x: np.ndarray, labels: np.ndarray, title: str) -> None:
    """
    Plot the first two principal components of the given data.

    Features are standardized and reduced with PCA inside a single
    scikit-learn pipeline.

    Args:
        x (np.ndarray): Input data array of shape (n_samples, n_features).
        labels (np.ndarray): Class or group label for each sample.
        title (str): Title for the plot; the explained variance ratios of
            the first two components are appended to it.

    Returns:
        None: Displays a seaborn/matplotlib scatter plot of PC1 vs PC2.
    """
    pca = PCA(n_components=2, random_state=42)

    pipe = Pipeline([('scaler', StandardScaler()), ('pca', pca)])
    scaled_x = pipe.fit_transform(x)
    explained = pipe.named_steps['pca'].explained_variance_ratio_

    df_plot = pd.DataFrame({
        'PC1': scaled_x[:, 0],
        'PC2': scaled_x[:, 1],
        'Label': labels
    })

    plt.figure(figsize=(14, 6))
    fig = sns.scatterplot(data=df_plot, x='PC1', y='PC2', hue='Label', alpha=0.7)
    fig.set_title(f'{title} - Explained Variance: {explained[0]:.2f}, {explained[1]:.2f}')
    fig.set_xlabel('First Principal Component')
    fig.set_ylabel('Second Principal Component')
    plt.legend(title='Labels', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
191
 
 
 
 
 
 
 
 
192
 
193
def tsne_plot(x: np.ndarray, labels: np.ndarray, title: str) -> None:
    """
    Plot a 2D t-SNE projection of high-dimensional data with color-coded labels.

    Args:
        x (np.ndarray): Feature matrix of shape (n_samples, n_features).
        labels (np.ndarray): Label for each sample, used to color the points.
        title (str): Title for the plot.

    Returns:
        None: Displays a seaborn/matplotlib scatter plot.
    """
    # Standardize features before the embedding.
    x_scaled = StandardScaler().fit_transform(x)
    # NOTE(review): t-SNE requires n_samples > perplexity (60 here) —
    # confirm the inputs are always large enough.
    tsne = TSNE(n_components=2, perplexity=60, random_state=42)
    tsne_fit = tsne.fit_transform(x_scaled)

    df_plot = pd.DataFrame({
        't-SNE1': tsne_fit[:, 0],
        't-SNE2': tsne_fit[:, 1],
        'Label': labels
    })

    plt.figure(figsize=(14, 6))
    fig = sns.scatterplot(data=df_plot, x='t-SNE1', y='t-SNE2', hue='Label', alpha=0.7)
    fig.set_title(title)
    fig.set_xlabel('First t-SNE Component')
    fig.set_ylabel('Second t-SNE Component')
    plt.legend(title='Labels', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
221
+
222
def plot_emb(x: np.ndarray, labels: np.ndarray, model_name: str):
    """
    Plot embeddings using PCA, t-SNE, and UMAP.

    Args:
        x (np.ndarray): Feature matrix of shape (n_samples, n_features).
        labels (np.ndarray): Label for each sample.
        model_name (str): Name of the embedding model, used in the plot titles.
    """
    print(f"Plotting embeddings for: {model_name}")
    plot_pca(x, labels, title=f'PCA - {model_name}')
    tsne_plot(x, labels, title=f't-SNE - {model_name}')
    plot_umap(x, labels, title=f'UMAP - {model_name}')
236
 
237
+ def evaluate(model: Union[RandomForestClassifier, svm.SVC], X_test : np.ndarray, y_test : np.ndarray) -> dict:
238
 
239
+ """
240
+ Evaluates a classification model on test data and computes performance metrics.
241
+ Parameters:
242
+ model: A trained classification model with a `predict` method.
243
+ X_test: Features of the test dataset.
244
+ y_test: True labels for the test dataset.
245
+ Returns:
246
+ dict: A dictionary containing the following evaluation metrics:
247
+ - 'Accuracy': Overall accuracy of the model.
248
+ - 'Recall': Weighted recall score.
249
+ - 'Precision': Weighted precision score.
250
+ - 'F1': Weighted F1 score.
251
+ Side Effects:
252
+ Prints the evaluation metrics using pprint.
253
+ """
254
 
255
  result = {}
256
  y_pred = model.predict(X_test)
 
267
 
268
 
269
 
270
def train_rf(title: str,
             x: np.ndarray,
             y: np.ndarray,
             params: dict) -> tuple[RandomForestClassifier, dict]:
    """
    Train and evaluate a RandomForestClassifier on a stratified hold-out split.

    Args:
        title (str): Title for the confusion matrix plot.
        x (np.ndarray): Feature matrix.
        y (np.ndarray): Target labels (label-encoded internally).
        params (dict): Keyword arguments forwarded to RandomForestClassifier.

    Returns:
        tuple[RandomForestClassifier, dict]: The fitted classifier and a
        dictionary of evaluation metrics.
    """
    # NOTE(review): the fitted LabelEncoder is discarded, so the report and
    # confusion matrix show integer codes instead of the original label names.
    y_encoded = LabelEncoder().fit_transform(y)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y_encoded, test_size=0.33, stratify=y_encoded, random_state=42)

    # Initialize the RandomForestClassifier with specified parameters
    classifier: RandomForestClassifier = RandomForestClassifier(**params)

    # Fit the model on training data
    classifier.fit(x_train, y_train)

    # Make predictions on the test data
    y_pred = classifier.predict(x_test)

    evaluation = evaluate(classifier, x_test, y_test)

    print(classification_report(y_test, y_pred, zero_division=0))

    confusion(title=title,
              y_true=y_test,
              y_pred=y_pred)

    # Free the split arrays before returning.
    del x_train, x_test, y_train, y_test

    return classifier, evaluation
299
 
300
def train_svm(title: str, x: np.ndarray, y: list[str], params: dict) -> tuple[Pipeline, dict]:
    """
    Train a Support Vector Machine (SVM) classifier with the provided data and
    parameters, evaluate its performance, and return the trained pipeline and
    evaluation metrics.

    Args:
        title (str): Title for the confusion matrix plot.
        x (np.ndarray): Feature matrix.
        y (list[str]): List of labels.
        params (dict): Dictionary of parameters; keys prefixed with 'svm__'
            are forwarded to the SVC step.

    Returns:
        tuple[Pipeline, dict]: The trained pipeline and a dictionary of
        evaluation metrics.
    """
    # Stratified hold-out split for the final evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, stratify=y, random_state=42
    )

    # Keep only the SVC hyperparameters and strip their pipeline prefix.
    svc_params = {}
    for key, value in params.items():
        if key.startswith('svm__'):
            svc_params[key.replace('svm__', '')] = value

    # Scale features, then fit the SVM on the training portion.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', svm.SVC(**svc_params)),
    ])
    pipeline.fit(x_train, y_train)

    y_pred = pipeline.predict(x_test)
    evaluation = evaluate(model=pipeline, X_test=x_test, y_test=y_test)

    confusion(title=title,
              y_true=y_test,
              y_pred=y_pred)

    print(classification_report(y_test, y_pred, zero_division=0))

    return pipeline, evaluation
336
 
337
 
338
+ def randomSVM(X: list[np.ndarray], y = list[str]) -> dict:
339
 
340
  X_train, _, y_train, _ = train_test_split(X,
341
  y,
342
  test_size=0.33,
343
+ stratify=y,
344
  random_state=42)
 
 
 
 
 
 
345
 
346
  pipeline = Pipeline([('scaler', StandardScaler()),
347
  ('svm', svm.SVC())])
 
457
  """
458
 
459
  url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
460
+ response = requests.get(url, timeout=60)
461
 
462
  if response.status_code == 200:
463
  try:
 
469
  except ValueError:
470
  # fallback to UniSave if the standard endpoint is not available
471
  url = f"https://rest.uniprot.org/unisave/{uniprot_id}.fasta"
472
+ response = requests.get(url, timeout=60)
473
 
474
  if response.status_code == 200:
475
  try:
 
512
  except (HTTPError, ValueError):
513
 
514
  url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
515
+ response = requests.get(url, timeout=60)
516
  if response.status_code == 200:
517
  try:
518
  fasta_data = response.text
 
522
  except ValueError:
523
  print(f"No se pudo convertir {fasta_data}, id: {refseq_id}")
524
 
525
+
526
  def _fetch_sequence_for_row(idx, row):
527
  """
528
  Helper to fetch sequence for a single row. Returns (idx, sequence).
 
555
 
556
  return idx, sequence
557
 
558
+ # Main function to fetch sequences for a DataFrame
559
  def fetch_sequences_for_dataframe(df: pd.DataFrame) -> pd.DataFrame:
560
  """
561
  Add a 'sequence' column to the dataframe by fetching sequences from