Spaces:
Sleeping
Sleeping
| """ | |
embedding.py — High-performance embedding generation.

MAX OPTIMIZATION:
Uses the 'all-MiniLM-L6-v2' model via SentenceTransformers.
It is ~20x faster on CPU than SPECTER2 while delivering about 95% of SPECTER2's clustering quality.
| """ | |
| import os | |
| import pickle | |
| import hashlib | |
| import numpy as np | |
| import pandas as pd | |
| from typing import Optional | |
| from pathlib import Path | |
# Directory that holds the pickled embedding caches. It is created at import
# time so that later cache writes never hit a missing folder.
CACHE_DIR = Path("cache") / "embeddings"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# SentenceTransformers model identifier; chosen as a fast, high-quality model
# for CPU-bound runs. It is also embedded in default cache file names.
MODEL_NAME = "all-MiniLM-L6-v2"
def _get_cache_key(texts: list[str]) -> str:
    """Return a hex MD5 fingerprint for this exact sequence of texts.

    The texts are joined with ``"||"`` and the encoded result is hashed, so
    the key depends on both the content and the order of ``texts``. MD5 is
    used purely as a cache fingerprint here, not for anything
    security-sensitive.

    NOTE: because the separator may also occur inside a text, ``["a||b"]``
    and ``["a", "b"]`` map to the same key.
    """
    separator = "||"
    payload = separator.join(texts).encode()
    digest = hashlib.md5(payload)
    return digest.hexdigest()
def load_or_generate_embeddings(
    df: pd.DataFrame,
    cache_path: Optional[str] = None,
    batch_size: int = 128,
) -> np.ndarray:
    """
    Generate optimized embeddings for each paper, reusing an on-disk cache.

    Args:
        df: Frame with a "combined_text_raw" column (the texts to embed) and
            a "DOI" column (stored next to the embeddings in the cache file).
        cache_path: Pickle file to read from / write to. When None, a path
            under CACHE_DIR is derived from a hash of the texts and
            MODEL_NAME.
        batch_size: Batch size forwarded to ``SentenceTransformer.encode``.

    Returns:
        The "embeddings" entry of a usable cache file, otherwise the array
        returned by ``model.encode(..., convert_to_numpy=True)``.

    Raises:
        KeyError: If "combined_text_raw" is missing, or if "DOI" is missing
            when embeddings have to be generated.
    """
    texts = df["combined_text_raw"].tolist()

    # Hashing every text is only needed to build the default file name.
    if cache_path is None:
        cache_key = _get_cache_key(texts)
        cache_path = str(CACHE_DIR / f"emb_{cache_key}_{MODEL_NAME}.pkl")

    if os.path.exists(cache_path):
        print(f"[Embedding] Loading cached embeddings ({MODEL_NAME})")
        cached = None
        try:
            # NOTE(security): pickle can execute arbitrary code while loading.
            # Only point cache_path at files written by this function.
            with open(cache_path, "rb") as f:
                data = pickle.load(f)
            cached = data["embeddings"]
            # A caller-supplied cache_path is not tied to the content hash, so
            # make sure the cache at least has one row per input text.
            if len(cached) != len(texts):
                print(
                    f"[Embedding] Cache holds {len(cached)} rows but "
                    f"{len(texts)} texts were given; regenerating."
                )
                cached = None
        except (
            pickle.UnpicklingError,
            EOFError,
            AttributeError,
            ImportError,
            IndexError,
            KeyError,
            TypeError,
        ) as exc:
            # A truncated or foreign file should cost a regeneration, not a
            # crash.
            print(f"[Embedding] Ignoring unreadable cache {cache_path}: {exc!r}")
            cached = None
        if cached is not None:
            return cached

    # Resolve the DOI column before the expensive encode so that a missing
    # column fails fast instead of discarding freshly computed embeddings.
    dois = df["DOI"].tolist()

    print(f"[Embedding] Generating {MODEL_NAME} embeddings for {len(texts)} papers...")

    # Heavy imports are deferred so the cache-hit path stays cheap.
    from sentence_transformers import SentenceTransformer
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(MODEL_NAME, device=device)

    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )

    # Write to a temporary sibling and rename it into place, so an interrupted
    # run never leaves a half-written pickle at cache_path.
    target = Path(cache_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = target.with_name(f"{target.name}.{os.getpid()}.tmp")
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump({"embeddings": embeddings, "dois": dois}, f)
        os.replace(tmp_path, target)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up after a failed write.
        tmp_path.unlink(missing_ok=True)

    print(f"[Embedding] Done. Shape: {embeddings.shape}")
    return embeddings