Spaces:
Running
Running
"""
Index the knowledge corpus into the Qdrant vector store.
Uses BGE-M3 for embeddings (same as before).
"""
| import sys, json | |
| sys.path.insert(0, '.') | |
| from backend.database.vector.client import vector_store | |
| from pathlib import Path | |
def index_corpus(corpus_path: str = "knowledge/training/kenyan_finance_corpus.jsonl") -> None:
    """Embed a JSONL corpus with BGE-M3 and add it to the "knowledge" collection.

    Each non-blank line of the file is parsed as one JSON object. The text that
    gets embedded is the record's "answer", falling back to its "question" and
    then to an empty string. The "source", "category", "jurisdiction",
    "language" and (truncated) "question" fields are stored as metadata.

    Progress is reported with ``print``. The function returns early, without
    indexing anything, when the corpus file does not exist or when the
    embedding model cannot be imported.

    Args:
        corpus_path: Path to the JSONL corpus file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print(f"Loading corpus from {corpus_path}...")
    if not Path(corpus_path).exists():
        print("Run build_corpus.py first")
        return

    # Explicit UTF-8 so decoding does not depend on the platform locale.
    # Blank lines (e.g. a trailing newline at end of file) are skipped instead
    # of being handed to json.loads, which would raise on an empty string.
    with open(corpus_path, encoding="utf-8") as f:
        pairs = [json.loads(line) for line in f if line.strip()]
    print(f"Loaded {len(pairs)} pairs")

    print("Generating embeddings (this takes a few minutes)...")
    # Only the optional import and model construction are guarded, so an
    # ImportError raised later during indexing is not misreported as a
    # missing FlagEmbedding install.
    try:
        from FlagEmbedding import BGEM3FlagModel
        model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
    except ImportError:
        print("FlagEmbedding not available. Install: pip install FlagEmbedding")
        print("For now: corpus saved, will index when model available")
        return

    # Ensure the collections exist once up front rather than once per batch.
    vector_store.create_collections()

    # Process in batches
    batch_size = 32
    total_indexed = 0
    for start in range(0, len(pairs), batch_size):
        batch = pairs[start:start + batch_size]
        texts = [p.get("answer", p.get("question", "")) for p in batch]
        metadatas = [
            {
                "source": p.get("source", "senti"),
                "category": p.get("category", "general"),
                "jurisdiction": p.get("jurisdiction", "KE"),
                "language": p.get("language", "en"),
                # Only the first 200 characters of the question are stored.
                "question": p.get("question", "")[:200],
            }
            for p in batch
        ]

        # Generate embeddings
        output = model.encode(texts, batch_size=12, max_length=512)
        embeddings = output['dense_vecs'].tolist()

        added = vector_store.add_documents(
            collection="knowledge",
            texts=texts,
            embeddings=embeddings,
            metadata=metadatas,
        )
        total_indexed += added
        print(f" Indexed {total_indexed}/{len(pairs)}...")

    print(f"\nDone. Total indexed: {total_indexed}")
    final = vector_store.get_count("knowledge")
    print(f"Qdrant knowledge collection: {final} documents")
# Script entry point: index the default corpus when this file is run directly.
if __name__ == "__main__":
    index_corpus()