Spaces:

reyansh2005
/

agent-specter

Sleeping

App Files Files Community

agent-specter / embedding.py

reyansh2005

first

3eefe42 28 days ago

raw

history blame contribute delete

4.01 kB

	"""
	embedding.py — SPECTER2 embedding generation with caching.

	Uses AutoAdapterModel (from the `adapters` library) with the allenai/specter2
	proximity adapter, which is the correct way to load SPECTER2 for document
	similarity and clustering tasks.

	Reference: https://huggingface.co/allenai/specter2
	"""

	import os
	import pickle
	import hashlib
	import numpy as np
	import pandas as pd
	from typing import Optional
	from pathlib import Path


	# Hugging Face Hub identifiers: the base checkpoint and the adapter loaded on
	# top of it (proximity adapter, for similarity / clustering).
	MODEL_NAME = "allenai/specter2_base"
	ADAPTER_NAME = "allenai/specter2"

	# Directory, relative to the working directory, where embedding pickles are
	# stored. It is created at import time if it does not exist yet.
	CACHE_DIR = Path("cache")
	CACHE_DIR.mkdir(exist_ok=True)


	def _get_cache_key(texts: list[str]) -> str:
	    """Return a deterministic hex digest identifying an ordered list of texts.

	    Each text is hashed together with its byte length, and the number of
	    texts is hashed first. This keeps distinct inputs distinct, where joining
	    with a delimiter does not: an empty list and a list holding one empty
	    string would hash the same, as would a text containing the delimiter and
	    the corresponding split list.

	    Note: keys differ from those produced by a delimiter-joined hash, so
	    cache files named after older keys will simply not be found and the
	    embeddings will be regenerated once.

	    Args:
	        texts: The strings to fingerprint; order is significant.

	    Returns:
	        A 32-character lowercase hexadecimal MD5 digest.
	    """
	    # MD5 is used purely as a fingerprint, not for security; saying so keeps
	    # the call usable on builds that restrict MD5 for security purposes.
	    digest = hashlib.md5(usedforsecurity=False)
	    digest.update(f"{len(texts)}:".encode("utf-8"))
	    for text in texts:
	        encoded = text.encode("utf-8")
	        # The length prefix makes the boundaries between texts unambiguous.
	        digest.update(f"{len(encoded)}:".encode("utf-8"))
	        digest.update(encoded)
	    return digest.hexdigest()


	def load_or_generate_embeddings(
	    df: pd.DataFrame,
	    cache_path: Optional[str] = None,
	    batch_size: int = 16,
	) -> np.ndarray:
	    """
	    Return SPECTER2 embeddings for each row's ``combined_text_raw``, cached on disk.

	    The cache is a pickle holding the embeddings, the ``DOI`` column of ``df``
	    and the cache key of the input texts. A cache file is reused only when
	    its row count matches ``df`` and, if it recorded a cache key, that key
	    matches the current texts; otherwise the embeddings are regenerated and
	    the file is overwritten.

	    Args:
	        df: Frame with a ``combined_text_raw`` column, and a ``DOI`` column
	            that is read when a new cache file has to be written.
	        cache_path: Pickle file to read/write. Defaults to a file inside
	            ``CACHE_DIR`` named after the cache key of the texts.
	        batch_size: Number of texts passed to the model per forward pass.

	    Returns:
	        np.ndarray of shape (n_papers, embedding_dim)

	    Raises:
	        KeyError: If a required column is missing from ``df``.
	    """
	    # Use combined_text_raw (original casing) for embeddings
	    texts = df["combined_text_raw"].tolist()
	    cache_key = _get_cache_key(texts)

	    if cache_path is None:
	        cache_path = str(CACHE_DIR / f"embeddings_{cache_key}.pkl")

	    if os.path.exists(cache_path):
	        print(f"[Embedding] Loading cached embeddings from {cache_path}")
	        # NOTE(security): pickle.load can execute arbitrary code. Only point
	        # cache_path at files written by this application.
	        with open(cache_path, "rb") as f:
	            data = pickle.load(f)
	        cached = data["embeddings"]
	        # Files written before the key was recorded have no "cache_key";
	        # for those only the row count can be checked.
	        stored_key = data.get("cache_key")
	        if len(cached) == len(texts) and stored_key in (None, cache_key):
	            return cached
	        print(
	            f"[Embedding] Cache at {cache_path} does not match the input papers; "
	            f"regenerating"
	        )

	    # Resolve everything the cache write needs BEFORE the expensive model run,
	    # so a missing column or directory fails fast instead of discarding work.
	    dois = df["DOI"].tolist()
	    Path(cache_path).parent.mkdir(parents=True, exist_ok=True)

	    print(f"[Embedding] Generating SPECTER2 embeddings for {len(texts)} papers...")
	    embeddings = _generate_specter2_embeddings(texts, batch_size=batch_size)

	    # Cache with DOI mapping. Write to a temporary file and rename it so an
	    # interrupted run never leaves a truncated pickle at cache_path.
	    tmp_path = f"{cache_path}.tmp"
	    with open(tmp_path, "wb") as f:
	        pickle.dump(
	            {"embeddings": embeddings, "dois": dois, "cache_key": cache_key},
	            f,
	        )
	    os.replace(tmp_path, cache_path)
	    print(f"[Embedding] Saved embeddings to {cache_path}")

	    return embeddings


	def _generate_specter2_embeddings(texts: list[str], batch_size: int = 16) -> np.ndarray:
	    """
	    Generate SPECTER2 embeddings using AutoAdapterModel with the proximity adapter.

	    The adapters library allows loading task-specific adapter weights on top of
	    the base SPECTER2 model. The 'proximity' adapter is appropriate for
	    document similarity and clustering tasks.

	    Runs on CPU; GPU is used automatically if available.

	    Args:
	        texts: Documents to embed. Each is truncated to 512 tokens.
	        batch_size: Number of texts per forward pass; must be at least 1.

	    Returns:
	        Array with one row per text, holding the first-token (CLS) vector of
	        the last hidden state. For empty ``texts`` an empty float32 array of
	        shape (0, 0) is returned without loading the model, so the embedding
	        width is not known in that case.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    # Validate up front: a zero step would only fail inside range() after the
	    # model has been loaded, and a negative one would yield no batches at all.
	    if batch_size < 1:
	        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

	    # np.vstack cannot stack an empty list, so handle "no papers" explicitly
	    # and skip the model download/load.
	    if not texts:
	        return np.empty((0, 0), dtype=np.float32)

	    # Imported lazily so the heavy dependencies are only needed when
	    # embeddings actually have to be computed.
	    from adapters import AutoAdapterModel
	    from transformers import AutoTokenizer
	    import torch

	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    print(f"[Embedding] Using device: {device}")

	    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	    # Load base model using adapters' AutoAdapterModel (not transformers AutoModel)
	    model = AutoAdapterModel.from_pretrained(MODEL_NAME)

	    # Load and activate the proximity adapter from the Hub
	    model.load_adapter(ADAPTER_NAME, source="hf", load_as="proximity", set_active=True)

	    model.to(device)
	    model.eval()

	    all_embeddings = []

	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        for i in range(0, len(texts), batch_size):
	            batch = texts[i : i + batch_size]
	            inputs = tokenizer(
	                batch,
	                padding=True,
	                truncation=True,
	                max_length=512,
	                return_tensors="pt",
	            ).to(device)

	            outputs = model(**inputs)
	            # Use CLS token embedding (first token of last hidden state)
	            batch_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
	            all_embeddings.append(batch_emb)

	            # Report progress on every fifth batch.
	            if (i // batch_size) % 5 == 0:
	                print(
	                    f"[Embedding] Processed "
	                    f"{min(i + batch_size, len(texts))}/{len(texts)} papers"
	                )

	    embeddings = np.vstack(all_embeddings)
	    print(f"[Embedding] Done. Embedding shape: {embeddings.shape}")
	    return embeddings