| from transformers import AutoTokenizer, AutoModel |
| import torch |
| import numpy as np |
| from tqdm import tqdm |
|
|
| |
# Choose the compute device up front: the GPU when CUDA is available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the encoder from the same SPECTER2 base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")

# Move the weights onto the chosen device and switch the module to eval mode
# (both calls return the module itself, so they can be chained).
model = AutoModel.from_pretrained("allenai/specter2_base").to(device).eval()
|
|
def embed_texts_specter2(texts: list[str], batch_size=16) -> np.ndarray:
    """Embed texts with the module-level SPECTER2 model.

    Each text is tokenized (truncated to at most 512 tokens), run through
    the model without gradient tracking, and represented by the hidden
    state of its first token, L2-normalized along the feature axis.

    Args:
        texts: The strings to embed.
        batch_size: Number of texts sent through the model per forward
            pass. Must be at least 1.

    Returns:
        A 2-D array with one row per input text, in input order. For an
        empty ``texts`` the result has shape ``(0, hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    # np.vstack rejects an empty sequence, so with no texts there is nothing
    # to stack; return an empty matrix that still has the embedding width.
    # NOTE(review): float32 assumes the model was loaded with its default
    # dtype -- confirm if the checkpoint is ever loaded in half precision.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in tqdm(
        range(0, len(texts), batch_size), desc="Embedding with SPECTER2"
    ):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(device)

        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = model(**inputs)
            # Position 0 of every sequence is used as the text embedding.
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            # Unit-length rows, so dot products equal cosine similarity.
            cls_embeddings = torch.nn.functional.normalize(
                cls_embeddings, p=2, dim=1
            )

        # Move each batch to host memory right away so results do not
        # accumulate on the accelerator.
        embeddings.append(cls_embeddings.cpu().numpy())

    return np.vstack(embeddings)