| """
|
| ingest.py — Load FAQ JSON, create LangChain Documents, and store
|
| embeddings in a local ChromaDB collection.
|
|
|
| Run directly to (re)build the vector store:
|
| python -m src.ingest
|
| """
|
|
|
| from __future__ import annotations
|
|
|
| import json
|
| import os
|
| from pathlib import Path
|
|
|
| from langchain_core.documents import Document
|
| from langchain_chroma import Chroma
|
| from langchain_ollama import OllamaEmbeddings
|
|
|
|
|
# Filesystem layout: the FAQ file and the persisted Chroma index both live in
# the "data" directory located next to the directory that contains this file.
_HERE = Path(__file__).parent
_RAG_DIR = _HERE.parent.joinpath("data")
FAQ_PATH = _RAG_DIR.joinpath("faq.json")
CHROMA_DIR = _RAG_DIR.joinpath("chroma_db")

# Ollama connection settings; each one can be overridden through the
# environment variable of the same name.
EMBED_MODEL = os.environ.get("EMBED_MODEL", "embeddinggemma:latest")
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")

# Optional bearer token: the first non-empty variable in this order wins.
OLLAMA_AUTH_TOKEN = (
    os.environ.get("OLLAMA_AUTH_TOKEN")
    or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    or os.environ.get("HF_TOKEN")
)

# Name of the Chroma collection that holds the FAQ embeddings.
COLLECTION_NAME = "naijalingo_faq"
|
|
|
|
|
def _ollama_client_kwargs() -> dict:
    """Build the ``client_kwargs`` passed to ``OllamaEmbeddings``.

    Returns:
        A dict carrying an ``Authorization: Bearer <token>`` header when
        ``OLLAMA_AUTH_TOKEN`` is set to a non-empty value, otherwise an
        empty dict.
    """
    if OLLAMA_AUTH_TOKEN:
        bearer = f"Bearer {OLLAMA_AUTH_TOKEN}"
        return {"headers": {"Authorization": bearer}}
    return {}
|
|
|
|
|
def load_faq_documents(faq_path: Path = FAQ_PATH) -> list[Document]:
    """Load FAQ entries from a JSON file and wrap each one in a ``Document``.

    The file is read as UTF-8 and iterated entry by entry; each entry is
    accessed with ``.get("question")`` / ``.get("answer")``, so it is expected
    to be a sequence of JSON objects.

    Args:
        faq_path: Location of the FAQ JSON file.

    Returns:
        One ``Document`` per entry, in file order. The page content is the
        stripped question and answer rendered as a ``Question: ...`` line
        followed by an ``Answer: ...`` line. The metadata holds the fixed
        source label ``"faq.json"``, the entry's position in the file
        (``index``) and the stripped question text.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(faq_path, encoding="utf-8") as f:
        items = json.load(f)

    docs: list[Document] = []
    for i, item in enumerate(items):
        # ``dict.get(key, "")`` only falls back when the key is absent; an
        # explicit JSON ``null`` would come back as ``None`` and crash on
        # ``.strip()``. ``or ""`` treats missing and null values alike.
        question = (item.get("question") or "").strip()
        answer = (item.get("answer") or "").strip()
        docs.append(
            Document(
                page_content=f"Question: {question}\nAnswer: {answer}",
                metadata={"source": "faq.json", "index": i, "question": question},
            )
        )
    return docs
|
|
|
|
|
def build_vectorstore(
    faq_path: Path = FAQ_PATH,
    chroma_dir: Path = CHROMA_DIR,
    embed_model: str = EMBED_MODEL,
) -> Chroma:
    """(Re)build the persisted Chroma collection from the FAQ file.

    The FAQ entries are embedded through Ollama and written to the
    ``COLLECTION_NAME`` collection under ``chroma_dir``. Entries are stored
    under stable ids derived from their position in the FAQ file, and any id
    in the collection that does not belong to the current FAQ file is removed
    afterwards, so running the build repeatedly leaves exactly one copy of
    each current entry.

    Args:
        faq_path: FAQ JSON file to ingest.
        chroma_dir: Directory in which Chroma persists the collection; it is
            created if it does not exist.
        embed_model: Name of the Ollama embedding model to use.

    Returns:
        The ``Chroma`` vector store backed by ``chroma_dir``.
    """
    docs = load_faq_documents(faq_path)
    embeddings = OllamaEmbeddings(
        model=embed_model,
        base_url=OLLAMA_BASE_URL,
        client_kwargs=_ollama_client_kwargs(),
    )

    chroma_dir.mkdir(parents=True, exist_ok=True)

    # Without explicit ids every run gets fresh random ids, so re-running the
    # build against an existing directory would append a second copy of every
    # FAQ entry. Stable ids turn the write into an overwrite of the same rows.
    doc_ids = [f"faq-{position}" for position, _ in enumerate(docs)]
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        ids=doc_ids,
        collection_name=COLLECTION_NAME,
        persist_directory=str(chroma_dir),
    )

    # Prune rows left over from earlier builds (entries since removed from the
    # FAQ file, or rows written under random ids). This runs only after the
    # new entries are stored, so a failed embedding call never empties an
    # index that was previously usable.
    stale_ids = set(vectorstore.get()["ids"]) - set(doc_ids)
    if stale_ids:
        vectorstore.delete(ids=sorted(stale_ids))

    print(f"[ingest] Indexed {len(docs)} FAQ entries -> {chroma_dir}")
    return vectorstore
|
|
|
|
|
def load_vectorstore(
    chroma_dir: Path = CHROMA_DIR,
    embed_model: str = EMBED_MODEL,
) -> Chroma:
    """Open the persisted FAQ collection without re-indexing anything.

    Args:
        chroma_dir: Directory where Chroma persists the collection.
        embed_model: Name of the Ollama embedding model attached to the store
            as its embedding function.

    Returns:
        A ``Chroma`` store bound to ``COLLECTION_NAME`` in ``chroma_dir``.
    """
    embedder = OllamaEmbeddings(
        model=embed_model,
        base_url=OLLAMA_BASE_URL,
        client_kwargs=_ollama_client_kwargs(),
    )
    store = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=embedder,
        persist_directory=str(chroma_dir),
    )
    return store
|
|
|
|
|
def get_or_build_vectorstore() -> Chroma:
    """Return the FAQ vector store, building it first if none is on disk.

    An index is considered present when ``CHROMA_DIR`` exists and contains at
    least one entry; in that case the store is loaded as-is. Otherwise the
    store is built from the FAQ file using the default settings.

    Returns:
        The loaded or freshly built ``Chroma`` store.
    """
    index_on_disk = CHROMA_DIR.exists() and any(CHROMA_DIR.iterdir())
    if not index_on_disk:
        print("[ingest] Building vector store for the first time...")
        return build_vectorstore()

    print("[ingest] Loading existing vector store from disk...")
    return load_vectorstore()
|
|
|
|
|
# Script entry point: running the module directly always builds the vector
# store with the default paths and embedding model.
if __name__ == "__main__":
    build_vectorstore()