""" ingest.py — Load FAQ JSON, create LangChain Documents, and store embeddings in a local ChromaDB collection. Run directly to (re)build the vector store: python -m src.ingest """ from __future__ import annotations import json import os from pathlib import Path from langchain_core.documents import Document from langchain_chroma import Chroma from langchain_ollama import OllamaEmbeddings # Paths: keep knowledge data and vector store under rag/ _HERE = Path(__file__).parent _RAG_DIR = _HERE.parent / "data" FAQ_PATH = _RAG_DIR / "faq.json" CHROMA_DIR = _RAG_DIR / "chroma_db" EMBED_MODEL = os.getenv("EMBED_MODEL", "embeddinggemma:latest") OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434") OLLAMA_AUTH_TOKEN = os.getenv("OLLAMA_AUTH_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN") COLLECTION_NAME = "naijalingo_faq" def _ollama_client_kwargs() -> dict: if not OLLAMA_AUTH_TOKEN: return {} return {"headers": {"Authorization": f"Bearer {OLLAMA_AUTH_TOKEN}"}} def load_faq_documents(faq_path: Path = FAQ_PATH) -> list[Document]: with open(faq_path, encoding="utf-8") as f: items = json.load(f) docs: list[Document] = [] for i, item in enumerate(items): question = item.get("question", "").strip() answer = item.get("answer", "").strip() content = f"Question: {question}\nAnswer: {answer}" docs.append( Document( page_content=content, metadata={"source": "faq.json", "index": i, "question": question}, ) ) return docs def build_vectorstore( faq_path: Path = FAQ_PATH, chroma_dir: Path = CHROMA_DIR, embed_model: str = EMBED_MODEL, ) -> Chroma: docs = load_faq_documents(faq_path) embeddings = OllamaEmbeddings( model=embed_model, base_url=OLLAMA_BASE_URL, client_kwargs=_ollama_client_kwargs(), ) chroma_dir.mkdir(parents=True, exist_ok=True) vectorstore = Chroma.from_documents( documents=docs, embedding=embeddings, collection_name=COLLECTION_NAME, persist_directory=str(chroma_dir), ) print(f"[ingest] Indexed {len(docs)} FAQ entries -> {chroma_dir}") return vectorstore def load_vectorstore( chroma_dir: Path = CHROMA_DIR, embed_model: str = EMBED_MODEL, ) -> Chroma: embeddings = OllamaEmbeddings( model=embed_model, base_url=OLLAMA_BASE_URL, client_kwargs=_ollama_client_kwargs(), ) return Chroma( collection_name=COLLECTION_NAME, embedding_function=embeddings, persist_directory=str(chroma_dir), ) def get_or_build_vectorstore() -> Chroma: if CHROMA_DIR.exists() and any(CHROMA_DIR.iterdir()): print("[ingest] Loading existing vector store from disk...") return load_vectorstore() print("[ingest] Building vector store for the first time...") return build_vectorstore() if __name__ == "__main__": build_vectorstore()