""" Index knowledge corpus into Qdrant vector store. Uses BGE-M3 for embeddings (same as before). """ import sys, json sys.path.insert(0, '.') from backend.database.vector.client import vector_store from pathlib import Path def index_corpus(corpus_path: str = "knowledge/training/kenyan_finance_corpus.jsonl"): print(f"Loading corpus from {corpus_path}...") pairs = [] if not Path(corpus_path).exists(): print("Run build_corpus.py first") return with open(corpus_path) as f: for line in f: pairs.append(json.loads(line.strip())) print(f"Loaded {len(pairs)} pairs") print("Generating embeddings (this takes a few minutes)...") try: from FlagEmbedding import BGEM3FlagModel model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True) # Process in batches batch_size = 32 total_indexed = 0 for i in range(0, len(pairs), batch_size): batch = pairs[i:i+batch_size] texts = [p.get("answer", p.get("question", "")) for p in batch] metadatas = [ { "source": p.get("source", "senti"), "category": p.get("category", "general"), "jurisdiction": p.get("jurisdiction", "KE"), "language": p.get("language", "en"), "question": p.get("question", "")[:200] } for p in batch ] # Generate embeddings output = model.encode(texts, batch_size=12, max_length=512) embeddings = output['dense_vecs'].tolist() vector_store.create_collections() # Ensure they exist added = vector_store.add_documents( collection="knowledge", texts=texts, embeddings=embeddings, metadata=metadatas ) total_indexed += added print(f" Indexed {total_indexed}/{len(pairs)}...") print(f"\nDone. Total indexed: {total_indexed}") final = vector_store.get_count("knowledge") print(f"Qdrant knowledge collection: {final} documents") except ImportError: print("FlagEmbedding not available. Install: pip install FlagEmbedding") print("For now: corpus saved, will index when model available") if __name__ == "__main__": index_corpus()