# senti-beta / senti / scripts / index_knowledge.py
# joseph njoroge kariuki
# Deploy Senti AI to Hugging Face Spaces
# 021e065
"""
Index knowledge corpus into Qdrant vector store.
Uses BGE-M3 for embeddings (same as before).
"""
import sys, json
sys.path.insert(0, '.')
from backend.database.vector.client import vector_store
from pathlib import Path
def index_corpus(corpus_path: str = "knowledge/training/kenyan_finance_corpus.jsonl") -> None:
    """Embed a JSONL corpus with BGE-M3 and add it to the "knowledge" collection.

    Every non-blank line of the corpus file is parsed as one JSON object.
    The text that gets embedded is the record's ``answer`` (or its
    ``question`` when the ``answer`` key is absent). ``source``,
    ``category``, ``jurisdiction``, ``language`` and the first 200
    characters of ``question`` are passed along as metadata, with the
    defaults ``"senti"``, ``"general"``, ``"KE"``, ``"en"`` and ``""``.

    Progress is printed to stdout. The function returns early, after
    printing a hint, when the corpus file does not exist or when the
    ``FlagEmbedding`` package cannot be imported.

    Args:
        corpus_path: Path to the JSONL corpus file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print(f"Loading corpus from {corpus_path}...")
    if not Path(corpus_path).exists():
        print("Run build_corpus.py first")
        return

    # Read as UTF-8 explicitly so decoding does not depend on the platform
    # locale, and skip blank lines (e.g. a trailing newline), which
    # json.loads would reject.
    with open(corpus_path, encoding="utf-8") as f:
        pairs = [json.loads(record) for record in map(str.strip, f) if record]
    print(f"Loaded {len(pairs)} pairs")

    print("Generating embeddings (this takes a few minutes)...")
    # Keep the try body to the import alone so that an unrelated ImportError
    # raised later is not misreported as a missing FlagEmbedding package.
    try:
        from FlagEmbedding import BGEM3FlagModel
    except ImportError:
        print("FlagEmbedding not available. Install: pip install FlagEmbedding")
        print("For now: corpus saved, will index when model available")
        return

    model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

    # Ensure the collections exist once, up front, instead of on every batch.
    vector_store.create_collections()

    # Process in batches
    batch_size = 32
    total_indexed = 0
    for start in range(0, len(pairs), batch_size):
        batch = pairs[start:start + batch_size]
        texts = [p.get("answer", p.get("question", "")) for p in batch]
        metadatas = [
            {
                "source": p.get("source", "senti"),
                "category": p.get("category", "general"),
                "jurisdiction": p.get("jurisdiction", "KE"),
                "language": p.get("language", "en"),
                "question": p.get("question", "")[:200],
            }
            for p in batch
        ]

        # Generate embeddings; only the dense vectors are stored.
        output = model.encode(texts, batch_size=12, max_length=512)
        embeddings = output['dense_vecs'].tolist()

        added = vector_store.add_documents(
            collection="knowledge",
            texts=texts,
            embeddings=embeddings,
            metadata=metadatas,
        )
        total_indexed += added
        print(f" Indexed {total_indexed}/{len(pairs)}...")

    print(f"\nDone. Total indexed: {total_indexed}")
    final = vector_store.get_count("knowledge")
    print(f"Qdrant knowledge collection: {final} documents")
# Script entry point: index the default corpus when this file is run directly.
if __name__ == "__main__":
    index_corpus()