"""
embedding.py — SPECTER2 embedding generation with caching.

Uses AutoAdapterModel (from the `adapters` library) with the
allenai/specter2 proximity adapter, which is the correct way to load
SPECTER2 for document similarity and clustering tasks.

Reference: https://huggingface.co/allenai/specter2
"""

import os
import pickle
import hashlib
import numpy as np
import pandas as pd
from typing import Optional
from pathlib import Path

CACHE_DIR = Path("cache")
CACHE_DIR.mkdir(exist_ok=True)

MODEL_NAME = "allenai/specter2_base"
ADAPTER_NAME = "allenai/specter2"  # proximity adapter (for similarity / clustering)


def _get_cache_key(texts: list[str]) -> str:
    """Return a deterministic hex digest identifying the ordered input texts.

    The texts are joined with ``"||"`` and hashed with MD5. The digest is only
    used to name and validate cache files, not for security. The join format
    is kept stable on purpose so previously written cache files stay valid.
    """
    combined = "||".join(texts)
    # usedforsecurity=False leaves the digest unchanged but keeps this call
    # usable on builds that restrict MD5 to non-security purposes.
    return hashlib.md5(combined.encode(), usedforsecurity=False).hexdigest()


def _cache_is_valid(data: object, df: pd.DataFrame, cache_key: str) -> bool:
    """Return True when an unpickled cache payload matches the current request.

    Checks, in order: the payload is a dict holding "embeddings"; the number
    of embedding rows equals the number of rows in ``df``; the recorded model
    and adapter names (when present) equal the module constants; and either
    the recorded text hash equals ``cache_key`` or, for payloads written
    without a hash, the stored DOIs equal the DOIs in ``df``.
    """
    if not isinstance(data, dict) or "embeddings" not in data:
        return False
    # np.shape copes with anything array-like; a scalar yields () and fails.
    if np.shape(data["embeddings"])[:1] != (len(df),):
        return False
    if data.get("model", MODEL_NAME) != MODEL_NAME:
        return False
    if data.get("adapter", ADAPTER_NAME) != ADAPTER_NAME:
        return False
    if "cache_key" in data:
        return data["cache_key"] == cache_key
    if "dois" in data and "DOI" in df.columns:
        # Series.equals treats NaN at the same position as equal, so papers
        # without a DOI do not force a permanent cache miss.
        cached_dois = pd.Series(list(data["dois"]), dtype=object)
        current_dois = pd.Series(df["DOI"].tolist(), dtype=object)
        return cached_dois.equals(current_dois)
    return True


def load_or_generate_embeddings(
    df: pd.DataFrame,
    cache_path: Optional[str] = None,
    batch_size: int = 16,
) -> np.ndarray:
    """
    Generate SPECTER2 embeddings for each paper's combined_text_raw.

    Caches the result to disk (pickle) together with the DOI list, the hash
    of the input texts and the model/adapter names. An existing cache file is
    reused only when it matches the current DataFrame; a stale or unreadable
    file is regenerated and overwritten.

    Args:
        df: DataFrame with a "combined_text_raw" column (original casing) and
            a "DOI" column. "DOI" is required whenever embeddings have to be
            generated and saved.
        cache_path: Pickle file to read/write. Defaults to a file inside
            CACHE_DIR named after the hash of the input texts.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        np.ndarray of shape (n_papers, embedding_dim). An empty DataFrame
        yields an array of shape (0, 0) without touching the model or cache.

    Raises:
        KeyError: if a required column is missing from ``df``.
    """
    # Use combined_text_raw (original casing) for embeddings
    texts = df["combined_text_raw"].tolist()
    if not texts:
        return np.empty((0, 0), dtype=np.float32)

    cache_key = _get_cache_key(texts)
    if cache_path is None:
        cache_path = str(CACHE_DIR / f"embeddings_{cache_key}.pkl")

    if os.path.exists(cache_path):
        print(f"[Embedding] Loading cached embeddings from {cache_path}")
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # cache_path at files written by this module or another trusted source.
        try:
            with open(cache_path, "rb") as f:
                data = pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            data = None  # truncated or corrupt file: fall through and rebuild
        if _cache_is_valid(data, df, cache_key):
            return data["embeddings"]
        print(
            f"[Embedding] Cache at {cache_path} is stale or unreadable; "
            f"regenerating."
        )

    # Read the DOIs before the expensive step so a missing column fails fast.
    dois = df["DOI"].tolist()

    print(f"[Embedding] Generating SPECTER2 embeddings for {len(texts)} papers...")
    embeddings = _generate_specter2_embeddings(texts, batch_size=batch_size)

    # Cache with DOI mapping. Write to a temporary file and rename it so an
    # interrupted run never leaves a half-written pickle at cache_path.
    Path(cache_path).parent.mkdir(parents=True, exist_ok=True)
    tmp_path = f"{cache_path}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(
            {
                "embeddings": embeddings,
                "dois": dois,
                "cache_key": cache_key,
                "model": MODEL_NAME,
                "adapter": ADAPTER_NAME,
            },
            f,
        )
    os.replace(tmp_path, cache_path)
    print(f"[Embedding] Saved embeddings to {cache_path}")

    return embeddings


def _generate_specter2_embeddings(texts: list[str], batch_size: int = 16) -> np.ndarray:
    """
    Generate SPECTER2 embeddings using AutoAdapterModel with the proximity adapter.

    The adapters library allows loading task-specific adapter weights on top
    of the base SPECTER2 model. The 'proximity' adapter is appropriate for
    document similarity and clustering tasks.

    Runs on CPU; GPU is used automatically if available.

    Args:
        texts: Documents to embed, one string per paper.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        np.ndarray with one row per text, holding the first-token (CLS)
        vector of the last hidden state. Empty input yields shape (0, 0).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not texts:
        # Nothing to encode; skip the model load entirely.
        return np.empty((0, 0), dtype=np.float32)

    # Heavy dependencies are imported lazily so the module can be imported
    # (and cached embeddings loaded) without them installed.
    from adapters import AutoAdapterModel
    from transformers import AutoTokenizer
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[Embedding] Using device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load base model using adapters' AutoAdapterModel (not transformers AutoModel)
    model = AutoAdapterModel.from_pretrained(MODEL_NAME)

    # Load and activate the proximity adapter from the Hub
    model.load_adapter(ADAPTER_NAME, source="hf", load_as="proximity", set_active=True)

    model.to(device)
    model.eval()

    total = len(texts)
    all_embeddings = []

    with torch.no_grad():
        for i in range(0, total, batch_size):
            batch = texts[i : i + batch_size]
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to(device)

            outputs = model(**inputs)
            # Use CLS token embedding (first token of last hidden state)
            batch_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(batch_emb)

            # Report progress on every fifth batch.
            if (i // batch_size) % 5 == 0:
                print(
                    f"[Embedding] Processed "
                    f"{min(i + batch_size, total)}/{total} papers"
                )

    embeddings = np.vstack(all_embeddings)
    print(f"[Embedding] Done. Embedding shape: {embeddings.shape}")
    return embeddings