"""Smoke-test a MongoDB ``$vectorSearch`` index with one sample question.

The script connects to the configured collection, prints the document count
and the field names of one document, embeds a fixed query with a
SentenceTransformer model, and prints the top vector-search hits.

Configuration is read from the environment (a ``.env`` file is loaded first):

* ``MONGO_URI`` (required) - MongoDB connection string.
* ``MONGO_DB`` - database name, default ``legal_chatbot_db``.
* ``MONGO_COLLECTION`` - collection name, default ``datasets``.
* ``EMBED_MODEL`` - SentenceTransformer model name.
* ``MONGO_TLS_ALLOW_INVALID_CERTS`` - set to ``1``/``true``/``yes`` to skip
  TLS certificate validation (insecure; local debugging only).
"""

import os

from dotenv import load_dotenv
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer

load_dotenv()

MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("MONGO_DB", "legal_chatbot_db")
COLLECTION_NAME = os.getenv("MONGO_COLLECTION", "datasets")
EMBED_MODEL_NAME = os.getenv(
    "EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2"
)

# Certificate validation stays ON unless explicitly disabled: turning it off
# exposes the connection (and the credentials in MONGO_URI) to interception.
TLS_ALLOW_INVALID_CERTS = os.getenv(
    "MONGO_TLS_ALLOW_INVALID_CERTS", ""
).strip().lower() in {"1", "true", "yes"}

VECTOR_INDEX_NAME = "kb_vector_index"
VECTOR_FIELD = "embedding"
NUM_CANDIDATES = 100
RESULT_LIMIT = 3
SAMPLE_QUERY = "What are my rights in case of workplace harassment?"


def _build_pipeline(query_vector: list) -> list:
    """Return the aggregation pipeline that runs the vector search.

    Args:
        query_vector: Embedding of the query as a plain list of floats.

    Returns:
        A two-stage pipeline: ``$vectorSearch`` followed by a ``$project``
        that keeps ``intent``, ``question``, ``answer`` and the search score.
    """
    return [
        {
            "$vectorSearch": {
                "index": VECTOR_INDEX_NAME,
                "path": VECTOR_FIELD,
                "queryVector": query_vector,
                "numCandidates": NUM_CANDIDATES,
                "limit": RESULT_LIMIT,
            }
        },
        {
            "$project": {
                "_id": 0,
                "intent": 1,
                "question": 1,
                "answer": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]


def main() -> None:
    """Run the sanity checks and the sample vector-search query.

    Raises:
        SystemExit: If ``MONGO_URI`` is not configured.
    """
    if not MONGO_URI:
        # MongoClient(None) would silently target localhost and only fail
        # later with an opaque timeout; fail fast with a clear message.
        raise SystemExit(
            "MONGO_URI is not set; define it in the environment or in .env"
        )

    client_options = {"tls": True}
    if TLS_ALLOW_INVALID_CERTS:
        client_options["tlsAllowInvalidCertificates"] = True

    # The context manager closes the client's connections on exit, including
    # when a query raises.
    with MongoClient(MONGO_URI, **client_options) as client:
        col = client[DB_NAME][COLLECTION_NAME]
        embedder = SentenceTransformer(EMBED_MODEL_NAME)

        print("Docs count:", col.count_documents({}))

        # find_one returns None for an empty collection; guard before .keys().
        sample_doc = col.find_one({}, {"_id": 0})
        print("One doc keys:", list(sample_doc.keys()) if sample_doc else [])

        # encode() takes a batch; take the single resulting vector and turn
        # it into a plain list so it can be sent as part of the pipeline.
        q_vec = embedder.encode(
            [SAMPLE_QUERY], normalize_embeddings=True
        )[0].tolist()

        print("Query:", SAMPLE_QUERY)
        for hit in col.aggregate(_build_pipeline(q_vec)):
            print(hit)


if __name__ == "__main__":
    main()