# NOTE(review): the three lines that were here ("Spaces: / Sleeping / Sleeping")
# were Hugging Face Spaces page-status residue from extraction, not code.
| import torch | |
| import faiss | |
| import numpy as np | |
| from transformers import AutoTokenizer, AutoModel | |
# Shared encoder used for both queries and documents.
# NOTE: this is a plain BERT-style encoder, not a trained ColBERT checkpoint;
# the class below applies ColBERT-style late interaction on top of it.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Loaded once at import time; both are treated as module-level singletons
# by ColBERTRetriever below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
class ColBERTRetriever:
    """ColBERT-style late-interaction retriever over text chunks.

    Each chunk's per-token embeddings are scored against the query's
    per-token embeddings with the MaxSim operator:
    ``score = sum over query tokens of max over doc tokens of (q . d)``.

    Fixes vs. the previous version:
      * ``build_index`` now caches per-chunk token embeddings in
        ``self.doc_embeddings`` (previously dead state), so ``query`` no
        longer re-runs the encoder over the whole corpus on every call.
      * Empty-corpus input no longer crashes ``build_index`` and makes
        ``query`` return ``[]``.
    """

    def __init__(self):
        self.chunks = []           # list of {"text": ...} dicts (schema assumed from build_index)
        self.doc_embeddings = []   # cached (num_tokens, dim) arrays, one per chunk
        self.index = None          # FAISS inner-product index over mean-pooled vectors

    # -----------------------------
    # EMBED TEXT TOKENS
    # -----------------------------
    def embed(self, text):
        """Return per-token embeddings for *text*.

        Parameters
        ----------
        text : str
            Text to encode; truncated to 256 tokens.

        Returns
        -------
        numpy.ndarray of shape (num_tokens, hidden_dim).
        """
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=256,
        )
        with torch.no_grad():
            outputs = model(**inputs)
        # (1, seq_len, dim) -> (seq_len, dim); no_grad makes .numpy() safe.
        return outputs.last_hidden_state.squeeze(0).numpy()

    # -----------------------------
    # BUILD INDEX
    # -----------------------------
    def build_index(self, chunks):
        """Encode *chunks* (dicts with a ``"text"`` key) and build the index.

        Side effects: populates ``self.chunks``, ``self.doc_embeddings``
        (token-level cache used by :meth:`query`), and ``self.index``
        (FAISS index over mean-pooled chunk vectors).
        """
        self.chunks = chunks
        self.doc_embeddings = []
        pooled = []
        for chunk in chunks:
            emb = self.embed(chunk["text"])
            self.doc_embeddings.append(emb)   # cache for MaxSim scoring
            pooled.append(emb.mean(axis=0))   # mean-pool for the FAISS index
        if not pooled:
            # Empty corpus: np.array([]) is 1-D, so .shape[1] would raise.
            self.index = None
            return
        vectors = np.array(pooled).astype("float32")
        self.index = faiss.IndexFlatIP(vectors.shape[1])
        self.index.add(vectors)

    # -----------------------------
    # QUERY
    # -----------------------------
    def query(self, question, k=20):
        """Return the top-*k* chunks ranked by ColBERT MaxSim score.

        Parameters
        ----------
        question : str
            The query text.
        k : int, optional
            Maximum number of chunks to return (default 20).
        """
        if not self.chunks:
            return []
        # Backward compatible: if chunks were assigned without build_index,
        # fall back to embedding documents on the fly as before.
        doc_embeddings = self.doc_embeddings or [
            self.embed(chunk["text"]) for chunk in self.chunks
        ]
        q_emb = self.embed(question)  # (q_tokens, dim)
        scores = []
        for d_emb in doc_embeddings:
            sim = np.matmul(q_emb, d_emb.T)       # (q_tokens, d_tokens)
            scores.append(sim.max(axis=1).sum())  # MaxSim over doc tokens
        idx = np.argsort(scores)[::-1][:k]
        return [self.chunks[i] for i in idx]