Spaces:
Sleeping
Sleeping
| import os | |
| from typing import List, Dict, Any | |
| from pymongo import MongoClient | |
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
class MongoKB:
    """Knowledge base kept in a MongoDB collection and queried by embedding.

    Bundles a MongoDB collection with a SentenceTransformer model: queries
    are embedded with the model and matched against the ``embedding`` field
    of stored documents through the ``kb_vector_index`` vector search index.
    """

    def __init__(
        self,
        uri: str,
        db: str,
        kb_collection: str,
        embed_model_name: str,
        *,
        tls_allow_invalid_certificates: bool = True,
    ):
        """Connect to MongoDB and load the embedding model.

        Args:
            uri: MongoDB connection string.
            db: Name of the database holding the knowledge base.
            kb_collection: Name of the collection holding the KB documents.
            embed_model_name: Model name passed to ``SentenceTransformer``.
            tls_allow_invalid_certificates: Forwarded to ``MongoClient`` as
                ``tlsAllowInvalidCertificates``. Defaults to ``True`` only to
                keep existing deployments working.
        """
        # NOTE(security): accepting invalid certificates disables server
        # identity verification and exposes the connection to
        # man-in-the-middle attacks. Pass
        # ``tls_allow_invalid_certificates=False`` wherever possible.
        self.client = MongoClient(
            uri,
            tls=True,
            tlsAllowInvalidCertificates=tls_allow_invalid_certificates,
        )
        self.db = self.client[db]
        self.col = self.db[kb_collection]
        self.embedder = SentenceTransformer(embed_model_name)
        self.dim = self.embedder.get_sentence_embedding_dimension()

    def embed(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the embedding model, normalizing each vector."""
        return self.embedder.encode(texts, normalize_embeddings=True)

    def search(
        self, query: str, k: int = 3, num_candidates: int = 100
    ) -> List[Dict[str, Any]]:
        """Return up to ``k`` KB entries most similar to ``query``.

        Args:
            query: Free-text query to embed and search for.
            k: Maximum number of results. Values ``<= 0`` yield ``[]``
                without embedding the query or contacting the server.
            num_candidates: Number of nearest-neighbour candidates the vector
                index considers. Raised to ``k`` when smaller, because
                ``$vectorSearch`` rejects ``limit > numCandidates``.

        Returns:
            A list of dicts with the ``question``, ``answer`` and ``intent``
            fields of each match (when present) plus ``score``, the
            ``vectorSearchScore`` metadata value.
        """
        if k <= 0:
            return []

        q_vec = self.embed([query])[0].tolist()

        # MongoDB Atlas Vector Search $vectorSearch (MongoDB 7.0+ / Atlas)
        pipeline = [
            {
                "$vectorSearch": {
                    "index": "kb_vector_index",  # name you create in Atlas
                    "path": "embedding",
                    "queryVector": q_vec,
                    "numCandidates": max(num_candidates, k),
                    "limit": k,
                }
            },
            {
                "$project": {
                    "_id": 0,
                    "question": 1,
                    "answer": 1,
                    "intent": 1,
                    "score": {"$meta": "vectorSearchScore"},
                }
            },
        ]
        # Collection.aggregate returns a cursor; list() drains every batch
        # instead of stopping at the first one.
        return list(self.col.aggregate(pipeline))

    def insert_many(self, docs: List[Dict[str, Any]]):
        """Insert ``docs`` into the KB collection; an empty list is a no-op."""
        if not docs:
            # PyMongo raises on an empty batch; there is nothing to insert.
            return
        self.col.insert_many(docs)

    def ensure_indexes(self):
        """Create the non-vector fallback indexes on the KB collection."""
        # normal fallback indexes
        self.col.create_index("intent")
        self.col.create_index([("question", "text"), ("answer", "text")])