# agent-specter / embedding.py
# reyansh2005's picture
# first
# 3eefe42
"""
embedding.py — SPECTER2 embedding generation with caching.
Uses AutoAdapterModel (from the `adapters` library) with the allenai/specter2
proximity adapter, which is the correct way to load SPECTER2 for document
similarity and clustering tasks.
Reference: https://huggingface.co/allenai/specter2
"""
import os
import pickle
import hashlib
import numpy as np
import pandas as pd
from typing import Optional
from pathlib import Path
# Directory that holds the pickled embedding caches. The path is relative, so
# it resolves against the current working directory; it is created as a side
# effect of importing this module.
CACHE_DIR = Path("cache")
CACHE_DIR.mkdir(exist_ok=True)
# Checkpoint identifier passed to both AutoTokenizer.from_pretrained and
# AutoAdapterModel.from_pretrained.
MODEL_NAME = "allenai/specter2_base"
# Adapter identifier passed to model.load_adapter(..., source="hf").
ADAPTER_NAME = "allenai/specter2" # proximity adapter (for similarity / clustering)
def _get_cache_key(texts: list[str]) -> str:
"""Generate a deterministic cache key from input texts."""
combined = "||".join(texts)
return hashlib.md5(combined.encode()).hexdigest()
def load_or_generate_embeddings(
    df: pd.DataFrame,
    cache_path: Optional[str] = None,
    batch_size: int = 16,
) -> np.ndarray:
    """
    Generate SPECTER2 embeddings for each paper's combined_text_raw.

    Results are cached to disk as a pickle holding the embeddings, the DOI
    column and the cache key of the input texts. A cache file is reused only
    when its row count matches the input and, if it recorded a cache key, that
    key matches too; otherwise the embeddings are regenerated and the file is
    overwritten.

    Args:
        df: Frame with a ``combined_text_raw`` column (texts to embed) and a
            ``DOI`` column (stored next to the embeddings in the cache).
        cache_path: Pickle file to read/write. Defaults to a file inside
            ``CACHE_DIR`` named after the hash of the input texts.
        batch_size: Number of texts passed to the model per forward pass.

    Returns:
        np.ndarray of shape (n_papers, embedding_dim)

    Raises:
        KeyError: If ``df`` lacks the ``combined_text_raw`` column, or lacks
            the ``DOI`` column when embeddings have to be generated.
    """
    # Use combined_text_raw (original casing) for embeddings
    texts = df["combined_text_raw"].tolist()
    cache_key = _get_cache_key(texts)

    if cache_path is None:
        cache_path = str(CACHE_DIR / f"embeddings_{cache_key}.pkl")

    if os.path.exists(cache_path):
        print(f"[Embedding] Loading cached embeddings from {cache_path}")
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # cache_path at files this module wrote itself.
        with open(cache_path, "rb") as f:
            data = pickle.load(f)
        cached = data["embeddings"]
        # Files written by older versions carry no "cache_key"; for those only
        # the row count can be checked.
        stored_key = data.get("cache_key")
        if len(cached) == len(texts) and stored_key in (None, cache_key):
            return cached
        # An explicit cache_path may point at embeddings of a different
        # dataset; returning them would silently misalign rows with df.
        print(
            f"[Embedding] Cache at {cache_path} does not match the input; "
            f"regenerating"
        )

    # Read the DOI column before the expensive model run so that a missing
    # column fails immediately rather than after all embeddings are computed.
    dois = df["DOI"].tolist()

    print(f"[Embedding] Generating SPECTER2 embeddings for {len(texts)} papers...")
    embeddings = _generate_specter2_embeddings(texts, batch_size=batch_size)

    # Cache with DOI mapping. Write to a temporary sibling file and rename it
    # into place so an interrupted run never leaves a truncated pickle behind.
    target = Path(cache_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = target.with_name(target.name + ".tmp")
    with open(tmp_path, "wb") as f:
        pickle.dump(
            {"embeddings": embeddings, "dois": dois, "cache_key": cache_key},
            f,
        )
    os.replace(tmp_path, target)
    print(f"[Embedding] Saved embeddings to {cache_path}")
    return embeddings
def _generate_specter2_embeddings(texts: list[str], batch_size: int = 16) -> np.ndarray:
"""
Generate SPECTER2 embeddings using AutoAdapterModel with the proximity adapter.
The adapters library allows loading task-specific adapter weights on top of
the base SPECTER2 model. The 'proximity' adapter is appropriate for
document similarity and clustering tasks.
Runs on CPU; GPU is used automatically if available.
"""
from adapters import AutoAdapterModel
from transformers import AutoTokenizer
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Embedding] Using device: {device}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load base model using adapters' AutoAdapterModel (not transformers AutoModel)
model = AutoAdapterModel.from_pretrained(MODEL_NAME)
# Load and activate the proximity adapter from the Hub
model.load_adapter(ADAPTER_NAME, source="hf", load_as="proximity", set_active=True)
model.to(device)
model.eval()
all_embeddings = []
with torch.no_grad():
for i in range(0, len(texts), batch_size):
batch = texts[i : i + batch_size]
inputs = tokenizer(
batch,
padding=True,
truncation=True,
max_length=512,
return_tensors="pt",
).to(device)
outputs = model(**inputs)
# Use CLS token embedding (first token of last hidden state)
batch_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
all_embeddings.append(batch_emb)
if (i // batch_size) % 5 == 0:
print(
f"[Embedding] Processed "
f"{min(i + batch_size, len(texts))}/{len(texts)} papers"
)
embeddings = np.vstack(all_embeddings)
print(f"[Embedding] Done. Embedding shape: {embeddings.shape}")
return embeddings