Spaces:
Sleeping
Sleeping
File size: 1,734 Bytes
17205ab | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | import os
from typing import List, Dict, Any
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
import numpy as np
class MongoKB:
    """Knowledge base kept in a MongoDB collection and queried by embedding.

    Queries are embedded with a SentenceTransformer model and matched against
    stored documents through an Atlas ``$vectorSearch`` aggregation stage.
    """

    # Name of the Atlas Vector Search index; it must be created in Atlas.
    VECTOR_INDEX = "kb_vector_index"
    # Document field the ``$vectorSearch`` stage compares the query against.
    VECTOR_PATH = "embedding"
    # Floor for ``numCandidates``; raised automatically when ``k`` is larger.
    MIN_NUM_CANDIDATES = 100

    def __init__(self, uri: str, db: str, kb_collection: str, embed_model_name: str):
        """Connect to MongoDB and load the embedding model.

        Args:
            uri: MongoDB connection string.
            db: Name of the database to use.
            kb_collection: Name of the collection holding the KB documents.
            embed_model_name: Model identifier passed to SentenceTransformer.
        """
        # NOTE(review): tlsAllowInvalidCertificates=True turns off certificate
        # validation, which exposes the connection to man-in-the-middle
        # attacks. Kept as-is so existing deployments keep working; prefer a
        # proper CA bundle and removing this flag.
        self.client = MongoClient(uri, tls=True, tlsAllowInvalidCertificates=True)
        self.db = self.client[db]
        self.col = self.db[kb_collection]
        self.embedder = SentenceTransformer(embed_model_name)
        self.dim = self.embedder.get_sentence_embedding_dimension()

    def embed(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the model, requesting normalized embeddings."""
        return self.embedder.encode(texts, normalize_embeddings=True)

    def search(self, query: str, k: int = 3) -> List[Dict[str, Any]]:
        """Return up to ``k`` documents most similar to ``query``.

        Args:
            query: Free-text query to embed and search for.
            k: Maximum number of hits; values below 1 yield an empty list.

        Returns:
            Projected documents (``question``, ``answer``, ``intent`` and the
            vector-search ``score``), in the order the pipeline yields them.
        """
        if k < 1:
            # Nothing was asked for; skip the embedding and the round trip.
            return []

        q_vec = self.embed([query])[0].tolist()
        # MongoDB Atlas Vector Search $vectorSearch (MongoDB 7.0+ / Atlas)
        pipeline = [
            {
                "$vectorSearch": {
                    "index": self.VECTOR_INDEX,
                    "path": self.VECTOR_PATH,
                    "queryVector": q_vec,
                    # Atlas rejects the stage when numCandidates < limit, so
                    # grow the candidate pool together with k.
                    "numCandidates": max(self.MIN_NUM_CANDIDATES, k),
                    "limit": k,
                }
            },
            {
                "$project": {
                    "_id": 0,
                    "question": 1,
                    "answer": 1,
                    "intent": 1,
                    "score": {"$meta": "vectorSearchScore"},
                }
            },
        ]
        # Drain the whole cursor: reading only the command's ``firstBatch``
        # would drop every hit past the first server batch for large k.
        return list(self.col.aggregate(pipeline))

    def insert_many(self, docs: List[Dict[str, Any]]):
        """Insert ``docs`` into the KB collection; an empty list is a no-op."""
        if not docs:
            # pymongo raises on an empty batch; there is nothing to write.
            return
        self.col.insert_many(docs)

    def ensure_indexes(self):
        """Create the regular (non-vector) fallback indexes."""
        # normal fallback indexes
        self.col.create_index("intent")
        self.col.create_index([("question", "text"), ("answer", "text")])
|