from typing import Dict, List

import torch
from transformers import AutoTokenizer
from adapters import AutoAdapterModel


class AdapterHandler:
    def __init__(self):
        self.tokenizer = None
        self.model = None

    def initialize(self, model_dir: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.model = AutoAdapterModel.from_pretrained(model_dir)
        # Load the proximity adapter from a local path - adjust as needed
        self.model.load_adapter("specter2_proximity", source="local")
        self.model.set_active_adapters("specter2_proximity")
        self.model.eval()
        return self

    def __call__(self, data: Dict[str, List[str]]) -> Dict[str, List[List[float]]]:
        titles = data.get("title", [""])
        abstracts = data.get("abstract", [""] * len(titles))

        # SPECTER2 expects each paper as a single "title[SEP]abstract" string
        combined = [
            title + self.tokenizer.sep_token + (abstract or "")
            for title, abstract in zip(titles, abstracts)
        ]

        # Tokenize the batch, truncating to the model's 512-token limit
        inputs = self.tokenizer(
            combined,
            padding=True,
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=False,
            max_length=512,
        )

        # Embed each document as the final hidden state of the [CLS] token
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()

        return {"embeddings": embeddings.tolist()}
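
# --- Usage sketch (not part of the handler itself) ---
# A minimal local smoke test, assuming the base model and a
# "specter2_proximity" adapter directory both live under MODEL_DIR.
# The path and input strings below are illustrative placeholders.
if __name__ == "__main__":
    MODEL_DIR = "./model"  # hypothetical path; adjust to your layout
    handler = AdapterHandler().initialize(MODEL_DIR)
    result = handler({
        "title": ["Attention Is All You Need"],
        "abstract": ["The dominant sequence transduction models are based on ..."],
    })
    # One embedding per input; each is a 768-dim vector for SPECTER2's base model
    print(len(result["embeddings"]), len(result["embeddings"][0]))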