jpuglia committed on
Commit
5f9e451
·
1 Parent(s): c39213e

Update dependencies in my_utils.py and clean up imports

Browse files
notebooks/02_Get_embeddings.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f052aee04179938913521a555b8224712135ec3671362d864b48721d005cc94
3
  size 10859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5636426c5de705c27b2ac837b352c878ac47dac9b1a86e8eb7d28146fe719084
3
  size 10859
src/my_utils.py CHANGED
@@ -6,6 +6,7 @@ from io import StringIO
6
  from typing import Literal, Optional
7
  import tkinter as tk
8
  from tkinter import filedialog, messagebox, ttk
 
9
 
10
 
11
  import pandas as pd
@@ -26,13 +27,11 @@ from sklearn.decomposition import PCA
26
  from sklearn.preprocessing import StandardScaler, LabelEncoder
27
  from sklearn.pipeline import Pipeline
28
  from sklearn.manifold import TSNE
 
29
  from sklearn.model_selection import train_test_split
30
  from sklearn.base import BaseEstimator
31
 
32
- import umap
33
-
34
  import requests
35
- from urllib.error import HTTPError as URLLibHTTPError
36
  from requests.exceptions import HTTPError as RequestsHTTPError
37
  from Bio import Entrez
38
  from Bio import SeqIO
@@ -376,6 +375,7 @@ def train_svm(title: str, x: np.ndarray, y: np.ndarray, params: dict) -> tuple[P
376
 
377
 
378
  def randomSVM(x: np.ndarray, y: np.ndarray) -> dict:
 
379
  """
380
  Performs randomized hyperparameter search for an SVM classifier using a pipeline with feature scaling.
381
 
@@ -525,12 +525,6 @@ def fetch_uniprot_sequence(uniprot_id: str):
525
  else:
526
  print(f'URL inválido o no accesible: {url}')
527
 
528
- from Bio import Entrez, SeqIO
529
-
530
- from io import StringIO
531
- import pandas as pd
532
- import requests
533
-
534
  def fetch_refseq_sequence(refseq_id: str) -> str | None:
535
  """
536
  Fetch the protein sequence for the given RefSeq ID using NCBI Entrez.
@@ -540,14 +534,6 @@ def fetch_refseq_sequence(refseq_id: str) -> str | None:
540
  Entrez.email = "puglia.jd@gmail.com"
541
  Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
542
 
543
- # ——— Validate input ———
544
- if not refseq_id or pd.isna(refseq_id):
545
- print(f"[SKIP] Empty or NaN RefSeq ID: `{refseq_id}`")
546
- return None
547
-
548
- # Clean up possible whitespace or comma‐separated values
549
- refseq_id = str(refseq_id).strip().split(",")[0]
550
-
551
  # ——— 1) Try NCBI Entrez ———
552
  try:
553
  handle = Entrez.efetch(
@@ -579,8 +565,6 @@ def fetch_refseq_sequence(refseq_id: str) -> str | None:
579
  # ——— All methods failed ———
580
  return None
581
 
582
-
583
-
584
  def _fetch_sequence_for_row(idx, row):
585
  """
586
  Helper to fetch sequence for a single row. Returns (idx, sequence).
@@ -874,8 +858,7 @@ def prost_embed_sequence(seq : str,
874
  if real_len <= 0:
875
  print(f"Sequence too short after tokenization for {acc}")
876
 
877
- # Extract and average embeddings
878
-
879
  emb = embedding_repr.last_hidden_state[0, 1:real_len]
880
  emb_avg = emb.mean(dim=0).cpu().numpy()
881
 
 
6
  from typing import Literal, Optional
7
  import tkinter as tk
8
  from tkinter import filedialog, messagebox, ttk
9
+ from urllib.error import HTTPError as URLLibHTTPError
10
 
11
 
12
  import pandas as pd
 
27
  from sklearn.preprocessing import StandardScaler, LabelEncoder
28
  from sklearn.pipeline import Pipeline
29
  from sklearn.manifold import TSNE
30
+ import umap
31
  from sklearn.model_selection import train_test_split
32
  from sklearn.base import BaseEstimator
33
 
 
 
34
  import requests
 
35
  from requests.exceptions import HTTPError as RequestsHTTPError
36
  from Bio import Entrez
37
  from Bio import SeqIO
 
375
 
376
 
377
  def randomSVM(x: np.ndarray, y: np.ndarray) -> dict:
378
+
379
  """
380
  Performs randomized hyperparameter search for an SVM classifier using a pipeline with feature scaling.
381
 
 
525
  else:
526
  print(f'URL inválido o no accesible: {url}')
527
 
 
 
 
 
 
 
528
  def fetch_refseq_sequence(refseq_id: str) -> str | None:
529
  """
530
  Fetch the protein sequence for the given RefSeq ID using NCBI Entrez.
 
534
  Entrez.email = "puglia.jd@gmail.com"
535
  Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
536
 
 
 
 
 
 
 
 
 
537
  # ——— 1) Try NCBI Entrez ———
538
  try:
539
  handle = Entrez.efetch(
 
565
  # ——— All methods failed ———
566
  return None
567
 
 
 
568
  def _fetch_sequence_for_row(idx, row):
569
  """
570
  Helper to fetch sequence for a single row. Returns (idx, sequence).
 
858
  if real_len <= 0:
859
  print(f"Sequence too short after tokenization for {acc}")
860
 
861
+ # Extract and average embeddings
 
862
  emb = embedding_repr.last_hidden_state[0, 1:real_len]
863
  emb_avg = emb.mean(dim=0).cpu().numpy()
864