import os

import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

class QwenEmbeddings:
    def __init__(self, model_name="Qwen/Qwen3-Embedding-8B", max_length=512, batch_size=8):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Left padding keeps every sequence's final real token at position -1,
        # which the last-token pooling below relies on.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.max_length = max_length
        self.batch_size = batch_size

    def last_token_pool(self, last_hidden_states, attention_mask):
        # With left padding, all sequences end at the last position, so the
        # pooled embedding is simply the final hidden state.
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            return last_hidden_states[:, -1]
        # Otherwise, index each sequence at its own last non-padding token.
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
    def encode(self, texts):
        all_embeddings = []
        for i in tqdm(range(0, len(texts), self.batch_size)):
            batch = texts[i:i + self.batch_size]
            enc = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            ).to(self.device)
            with torch.no_grad():
                out = self.model(**enc)
            pooled = self.last_token_pool(out.last_hidden_state, enc["attention_mask"])
            # L2-normalize so dot products between embeddings equal cosine similarity.
            pooled = F.normalize(pooled, p=2, dim=1)
            all_embeddings.append(pooled.cpu())
            if self.device == "cuda":
                torch.cuda.empty_cache()
        return torch.cat(all_embeddings).numpy()
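
# A quick sanity check for QwenEmbeddings, kept commented out because it
# downloads the 8B model and needs a large GPU. The (1, 4096) output shape is
# an assumption based on the Qwen3-Embedding-8B model card.
# emb = QwenEmbeddings().encode(["a test sentence"])
# print(emb.shape)  # expected: (1, 4096)
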
class Embedder:
    def __init__(self, backend="qwen"):
        """
        backend = "mini_lm" or "qwen"
        """
        self.backend = backend
        self.model = None

    def encode_books(self, df, text_column="combined_text", batch_size=32):
        # Lazily load the chosen backend on first use.
        if self.backend == "mini_lm":
            self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        elif self.backend == "qwen":
            self.model = QwenEmbeddings()
        else:
            raise ValueError("backend must be 'mini_lm' or 'qwen'")
        texts = df[text_column].tolist()
        if self.backend == "mini_lm":
            embeddings = self.model.encode(texts, batch_size=batch_size, show_progress_bar=True)
            return np.array(embeddings)
        # The qwen backend already returns a NumPy array.
        return self.model.encode(texts)

    def encode_query(self, text):
        # Both backends expose .encode() and return array-like results, so one
        # path covers them. Assumes encode_books() has already been called so
        # self.model is loaded.
        return self.model.encode([text])[0]

    def save_embeddings(self, embeddings, path="models/embeddings.npy"):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        np.save(path, embeddings)

    def load_embeddings(self, path="models/embeddings.npy"):
        return np.load(path)
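
# Example usage: a minimal end-to-end sketch, not part of the original module.
# The sample DataFrame and query are illustrative assumptions; "combined_text"
# matches the default column name above. The mini_lm backend is used to avoid
# downloading the 8B Qwen model.
if __name__ == "__main__":
    import pandas as pd

    df = pd.DataFrame({"combined_text": [
        "A detective investigates a murder in 1920s London.",
        "An astronaut stranded on Mars must grow food to survive.",
    ]})
    embedder = Embedder(backend="mini_lm")
    embeddings = embedder.encode_books(df)
    embedder.save_embeddings(embeddings)

    query_vec = embedder.encode_query("science fiction survival story")
    # Explicit cosine similarity, since MiniLM embeddings are not normalized
    # by default.
    scores = embeddings @ query_vec / (
        np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_vec)
    )
    print("best match:", df["combined_text"].iloc[int(np.argmax(scores))])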