Update dependencies in my_utils.py and clean up imports
Browse files
- notebooks/02_Get_embeddings.ipynb +1 -1
- src/my_utils.py +4 -21
notebooks/02_Get_embeddings.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 10859
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5636426c5de705c27b2ac837b352c878ac47dac9b1a86e8eb7d28146fe719084
|
| 3 |
size 10859
|
src/my_utils.py
CHANGED
|
@@ -6,6 +6,7 @@ from io import StringIO
|
|
| 6 |
from typing import Literal, Optional
|
| 7 |
import tkinter as tk
|
| 8 |
from tkinter import filedialog, messagebox, ttk
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
import pandas as pd
|
|
@@ -26,13 +27,11 @@ from sklearn.decomposition import PCA
|
|
| 26 |
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
| 27 |
from sklearn.pipeline import Pipeline
|
| 28 |
from sklearn.manifold import TSNE
|
|
|
|
| 29 |
from sklearn.model_selection import train_test_split
|
| 30 |
from sklearn.base import BaseEstimator
|
| 31 |
|
| 32 |
-
import umap
|
| 33 |
-
|
| 34 |
import requests
|
| 35 |
-
from urllib.error import HTTPError as URLLibHTTPError
|
| 36 |
from requests.exceptions import HTTPError as RequestsHTTPError
|
| 37 |
from Bio import Entrez
|
| 38 |
from Bio import SeqIO
|
|
@@ -376,6 +375,7 @@ def train_svm(title: str, x: np.ndarray, y: np.ndarray, params: dict) -> tuple[P
|
|
| 376 |
|
| 377 |
|
| 378 |
def randomSVM(x: np.ndarray, y: np.ndarray) -> dict:
|
|
|
|
| 379 |
"""
|
| 380 |
Performs randomized hyperparameter search for an SVM classifier using a pipeline with feature scaling.
|
| 381 |
|
|
@@ -525,12 +525,6 @@ def fetch_uniprot_sequence(uniprot_id: str):
|
|
| 525 |
else:
|
| 526 |
print(f'URL inválido o no accesible: {url}')
|
| 527 |
|
| 528 |
-
from Bio import Entrez, SeqIO
|
| 529 |
-
|
| 530 |
-
from io import StringIO
|
| 531 |
-
import pandas as pd
|
| 532 |
-
import requests
|
| 533 |
-
|
| 534 |
def fetch_refseq_sequence(refseq_id: str) -> str | None:
|
| 535 |
"""
|
| 536 |
Fetch the protein sequence for the given RefSeq ID using NCBI Entrez.
|
|
@@ -540,14 +534,6 @@ def fetch_refseq_sequence(refseq_id: str) -> str | None:
|
|
| 540 |
Entrez.email = "puglia.jd@gmail.com"
|
| 541 |
Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
|
| 542 |
|
| 543 |
-
# ——— Validate input ———
|
| 544 |
-
if not refseq_id or pd.isna(refseq_id):
|
| 545 |
-
print(f"[SKIP] Empty or NaN RefSeq ID: `{refseq_id}`")
|
| 546 |
-
return None
|
| 547 |
-
|
| 548 |
-
# Clean up possible whitespace or comma‐separated values
|
| 549 |
-
refseq_id = str(refseq_id).strip().split(",")[0]
|
| 550 |
-
|
| 551 |
# ——— 1) Try NCBI Entrez ———
|
| 552 |
try:
|
| 553 |
handle = Entrez.efetch(
|
|
@@ -579,8 +565,6 @@ def fetch_refseq_sequence(refseq_id: str) -> str | None:
|
|
| 579 |
# ——— All methods failed ———
|
| 580 |
return None
|
| 581 |
|
| 582 |
-
|
| 583 |
-
|
| 584 |
def _fetch_sequence_for_row(idx, row):
|
| 585 |
"""
|
| 586 |
Helper to fetch sequence for a single row. Returns (idx, sequence).
|
|
@@ -874,8 +858,7 @@ def prost_embed_sequence(seq : str,
|
|
| 874 |
if real_len <= 0:
|
| 875 |
print(f"Sequence too short after tokenization for {acc}")
|
| 876 |
|
| 877 |
-
|
| 878 |
-
|
| 879 |
emb = embedding_repr.last_hidden_state[0, 1:real_len]
|
| 880 |
emb_avg = emb.mean(dim=0).cpu().numpy()
|
| 881 |
|
|
|
|
| 6 |
from typing import Literal, Optional
|
| 7 |
import tkinter as tk
|
| 8 |
from tkinter import filedialog, messagebox, ttk
|
| 9 |
+
from urllib.error import HTTPError as URLLibHTTPError
|
| 10 |
|
| 11 |
|
| 12 |
import pandas as pd
|
|
|
|
| 27 |
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
| 28 |
from sklearn.pipeline import Pipeline
|
| 29 |
from sklearn.manifold import TSNE
|
| 30 |
+
import umap
|
| 31 |
from sklearn.model_selection import train_test_split
|
| 32 |
from sklearn.base import BaseEstimator
|
| 33 |
|
|
|
|
|
|
|
| 34 |
import requests
|
|
|
|
| 35 |
from requests.exceptions import HTTPError as RequestsHTTPError
|
| 36 |
from Bio import Entrez
|
| 37 |
from Bio import SeqIO
|
|
|
|
| 375 |
|
| 376 |
|
| 377 |
def randomSVM(x: np.ndarray, y: np.ndarray) -> dict:
|
| 378 |
+
|
| 379 |
"""
|
| 380 |
Performs randomized hyperparameter search for an SVM classifier using a pipeline with feature scaling.
|
| 381 |
|
|
|
|
| 525 |
else:
|
| 526 |
print(f'URL inválido o no accesible: {url}')
|
| 527 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
def fetch_refseq_sequence(refseq_id: str) -> str | None:
|
| 529 |
"""
|
| 530 |
Fetch the protein sequence for the given RefSeq ID using NCBI Entrez.
|
|
|
|
| 534 |
Entrez.email = "puglia.jd@gmail.com"
|
| 535 |
Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
|
| 536 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
# ——— 1) Try NCBI Entrez ———
|
| 538 |
try:
|
| 539 |
handle = Entrez.efetch(
|
|
|
|
| 565 |
# ——— All methods failed ———
|
| 566 |
return None
|
| 567 |
|
|
|
|
|
|
|
| 568 |
def _fetch_sequence_for_row(idx, row):
|
| 569 |
"""
|
| 570 |
Helper to fetch sequence for a single row. Returns (idx, sequence).
|
|
|
|
| 858 |
if real_len <= 0:
|
| 859 |
print(f"Sequence too short after tokenization for {acc}")
|
| 860 |
|
| 861 |
+
# Extract and average embeddings
|
|
|
|
| 862 |
emb = embedding_repr.last_hidden_state[0, 1:real_len]
|
| 863 |
emb_avg = emb.mean(dim=0).cpu().numpy()
|
| 864 |
|