Spaces:
Sleeping
Sleeping
| """ | |
| embedding.py — SPECTER2 embedding generation with caching. | |
| Uses AutoAdapterModel (from the `adapters` library) with the allenai/specter2 | |
| proximity adapter, which is the correct way to load SPECTER2 for document | |
| similarity and clustering tasks. | |
| Reference: https://huggingface.co/allenai/specter2 | |
| """ | |
| import os | |
| import pickle | |
| import hashlib | |
| import numpy as np | |
| import pandas as pd | |
| from typing import Optional | |
| from pathlib import Path | |
# Hugging Face Hub identifiers: the SPECTER2 base checkpoint and the adapter
# that is loaded on top of it.
MODEL_NAME = "allenai/specter2_base"
ADAPTER_NAME = "allenai/specter2"  # proximity adapter (for similarity / clustering)

# Directory (relative to the current working directory) that holds the
# pickled embedding caches. It is created at import time if it is missing.
CACHE_DIR = Path("cache")
CACHE_DIR.mkdir(exist_ok=True)
def _get_cache_key(texts: list[str]) -> str:
    """
    Build a deterministic cache key for a list of texts.

    The texts are joined in order with ``"||"``, encoded with the default
    UTF-8 codec and hashed with MD5, so the same texts in the same order
    always yield the same key.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    hasher = hashlib.md5()
    hasher.update("||".join(texts).encode())
    return hasher.hexdigest()
def _read_cached_embeddings(
    cache_path: str, cache_key: str, n_texts: int
) -> Optional[np.ndarray]:
    """
    Return the embeddings stored at ``cache_path`` if they are usable, else None.

    A cache file is usable when it can be unpickled and either
    * its stored ``cache_key`` equals ``cache_key``, or
    * it has no stored key (file written by an older version of this module)
      and its number of rows equals ``n_texts``.
    Unreadable or stale files are reported and ignored so the caller can
    regenerate them.
    """
    if not os.path.exists(cache_path):
        return None
    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # cache_path at files written by this module / trusted sources.
        with open(cache_path, "rb") as f:
            data = pickle.load(f)
        embeddings = data["embeddings"]
        stored_key = data.get("cache_key")
        n_rows = len(embeddings)
    except (
        OSError,
        EOFError,
        pickle.UnpicklingError,
        AttributeError,
        ImportError,
        IndexError,
        KeyError,
        TypeError,
    ) as exc:
        # A truncated or foreign file must not abort the run; fall back to
        # regenerating the embeddings.
        print(f"[Embedding] Ignoring unreadable cache {cache_path}: {exc!r}")
        return None

    if stored_key is None:
        # Legacy cache without a key: the row count is the only check we have.
        is_current = n_rows == n_texts
    else:
        is_current = stored_key == cache_key
    if not is_current:
        print(f"[Embedding] Cache {cache_path} does not match the input texts; regenerating")
        return None
    return embeddings


def load_or_generate_embeddings(
    df: pd.DataFrame,
    cache_path: Optional[str] = None,
    batch_size: int = 16,
) -> np.ndarray:
    """
    Generate SPECTER2 embeddings for each paper's combined_text_raw.
    Caches result to disk (pickle) together with the papers' DOIs and the
    cache key of the texts, so a cache that no longer matches ``df`` is
    detected and regenerated instead of being returned.

    Args:
        df: Must contain a ``combined_text_raw`` column; a ``DOI`` column is
            required whenever embeddings have to be (re)generated.
        cache_path: Pickle file to read/write. Defaults to a file inside
            ``CACHE_DIR`` named after the hash of the texts. An existing file
            at this path that is stale or unreadable is overwritten.
        batch_size: Number of texts per forward pass.

    Returns:
        np.ndarray of shape (n_papers, embedding_dim)

    Raises:
        KeyError: if a required column is missing from ``df``.
    """
    # Use combined_text_raw (original casing) for embeddings. Missing values
    # become "" and other non-strings are stringified so that every row keeps
    # its position in the output and hashing/tokenizing cannot fail on NaN.
    texts = df["combined_text_raw"].fillna("").astype(str).tolist()
    cache_key = _get_cache_key(texts)
    if cache_path is None:
        cache_path = str(CACHE_DIR / f"embeddings_{cache_key}.pkl")

    cached = _read_cached_embeddings(cache_path, cache_key, len(texts))
    if cached is not None:
        print(f"[Embedding] Loading cached embeddings from {cache_path}")
        return cached

    # Read the DOIs before the expensive model run so a missing column fails
    # immediately rather than after all embeddings have been computed.
    dois = df["DOI"].tolist()

    print(f"[Embedding] Generating SPECTER2 embeddings for {len(texts)} papers...")
    embeddings = _generate_specter2_embeddings(texts, batch_size=batch_size)

    # Cache with DOI mapping. Write to a temporary file and rename it into
    # place so an interrupted run never leaves a half-written cache behind.
    Path(cache_path).parent.mkdir(parents=True, exist_ok=True)
    tmp_path = f"{cache_path}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(
                {"embeddings": embeddings, "dois": dois, "cache_key": cache_key}, f
            )
        os.replace(tmp_path, cache_path)
    finally:
        # Only still present if dumping or renaming failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"[Embedding] Saved embeddings to {cache_path}")
    return embeddings
def _generate_specter2_embeddings(texts: list[str], batch_size: int = 16) -> np.ndarray:
    """
    Generate SPECTER2 embeddings using AutoAdapterModel with the proximity adapter.
    The adapters library allows loading task-specific adapter weights on top of
    the base SPECTER2 model. The 'proximity' adapter is appropriate for
    document similarity and clustering tasks.
    Runs on CPU; GPU is used automatically if available.

    Args:
        texts: One input string per document; each is truncated to 512 tokens.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        np.ndarray with one row per text, in input order. For empty ``texts``
        an array of shape (0, hidden_size) is returned.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    # Validate before the slow model load: a step of 0 would otherwise fail
    # inside range(), and a negative step would produce no batches at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Heavy dependencies are imported lazily so importing this module stays cheap.
    from adapters import AutoAdapterModel
    from transformers import AutoTokenizer
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[Embedding] Using device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Load base model using adapters' AutoAdapterModel (not transformers AutoModel)
    model = AutoAdapterModel.from_pretrained(MODEL_NAME)
    # Load and activate the proximity adapter from the Hub
    model.load_adapter(ADAPTER_NAME, source="hf", load_as="proximity", set_active=True)
    model.to(device)
    model.eval()

    total = len(texts)
    all_embeddings = []
    with torch.no_grad():
        for start in range(0, total, batch_size):
            batch = texts[start : start + batch_size]
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to(device)
            outputs = model(**inputs)
            # Use CLS token embedding (first token of last hidden state)
            batch_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(batch_emb)

            # Log every fifth batch, and always the last one so the final
            # count is reported.
            done = min(start + batch_size, total)
            if (start // batch_size) % 5 == 0 or done == total:
                print(f"[Embedding] Processed {done}/{total} papers")

    if all_embeddings:
        embeddings = np.vstack(all_embeddings)
    else:
        # np.vstack rejects an empty list; return a well-shaped empty result.
        embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)
    print(f"[Embedding] Done. Embedding shape: {embeddings.shape}")
    return embeddings