"""
ingest.py — Load FAQ JSON, create LangChain Documents, and store
embeddings in a local ChromaDB collection.

Run directly to (re)build the vector store:

    python -m src.ingest
"""
from __future__ import annotations
import json
import os
from pathlib import Path
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
# Paths: the FAQ data and the persisted vector store both live in the
# "data" directory located one level above this module's directory
# (presumably rag/data — confirm against the repository layout).
_HERE = Path(__file__).parent
_RAG_DIR = _HERE.parent / "data"
FAQ_PATH = _RAG_DIR / "faq.json"
CHROMA_DIR = _RAG_DIR / "chroma_db"
# Embedding model name and Ollama endpoint; both can be overridden through
# environment variables of the same name.
EMBED_MODEL = os.getenv("EMBED_MODEL", "embeddinggemma:latest")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
# Optional bearer token: the first non-empty value among these variables is
# used; the result is falsy (None or "") when none of them is set.
OLLAMA_AUTH_TOKEN = os.getenv("OLLAMA_AUTH_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
# Collection name shared by the build and load paths below.
COLLECTION_NAME = "naijalingo_faq"
def _ollama_client_kwargs() -> dict:
    """Return extra client options for the Ollama embeddings client.

    Returns:
        An empty dict when ``OLLAMA_AUTH_TOKEN`` is unset or empty;
        otherwise a dict whose ``"headers"`` entry carries a bearer
        ``Authorization`` header built from the token.
    """
    if OLLAMA_AUTH_TOKEN:
        auth_header = {"Authorization": f"Bearer {OLLAMA_AUTH_TOKEN}"}
        return {"headers": auth_header}
    return {}
def load_faq_documents(faq_path: Path = FAQ_PATH) -> list[Document]:
    """Read the FAQ JSON file and turn every entry into a ``Document``.

    Each entry is rendered as ``"Question: ...\\nAnswer: ..."`` and tagged
    with metadata recording the source file name, the entry's position in
    the file, and the (stripped) question text.

    Args:
        faq_path: Path of the UTF-8 JSON file to read. The file is expected
            to hold a sequence of objects with ``"question"`` and
            ``"answer"`` keys.

    Returns:
        One ``Document`` per FAQ entry, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(faq_path, encoding="utf-8") as f:
        items = json.load(f)

    docs: list[Document] = []
    for i, item in enumerate(items):
        # A JSON ``null`` decodes to None, and ``.get(key, "")`` only applies
        # its default when the key is absent, so coerce falsy values to ""
        # before stripping instead of crashing with AttributeError.
        question = (item.get("question") or "").strip()
        answer = (item.get("answer") or "").strip()
        content = f"Question: {question}\nAnswer: {answer}"
        docs.append(
            Document(
                page_content=content,
                metadata={"source": "faq.json", "index": i, "question": question},
            )
        )
    return docs
def build_vectorstore(
    faq_path: Path = FAQ_PATH,
    chroma_dir: Path = CHROMA_DIR,
    embed_model: str = EMBED_MODEL,
) -> Chroma:
    """Embed the FAQ entries and (re)build the persisted Chroma collection.

    The function is safe to run repeatedly: every entry is written under a
    deterministic id derived from its position in the FAQ file, and ids that
    are present in the collection but no longer produced by the current FAQ
    (including randomly generated ids left by earlier runs) are removed.

    Args:
        faq_path: JSON file holding the FAQ entries.
        chroma_dir: Directory in which Chroma persists the collection; it is
            created when missing.
        embed_model: Name of the Ollama embedding model to use.

    Returns:
        The ``Chroma`` vector store bound to the rebuilt collection.
    """
    docs = load_faq_documents(faq_path)
    # Stable ids make re-indexing overwrite existing rows instead of
    # appending a second copy of every FAQ entry on each run.
    ids = [f"faq-{i}" for i in range(len(docs))]

    embeddings = OllamaEmbeddings(
        model=embed_model,
        base_url=OLLAMA_BASE_URL,
        client_kwargs=_ollama_client_kwargs(),
    )
    chroma_dir.mkdir(parents=True, exist_ok=True)
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        ids=ids,
        collection_name=COLLECTION_NAME,
        persist_directory=str(chroma_dir),
    )

    # Prune only after the new entries were written successfully, so a
    # failed embedding call never leaves the collection emptier than before.
    current_ids = set(ids)
    stale_ids = [
        doc_id for doc_id in vectorstore.get()["ids"] if doc_id not in current_ids
    ]
    if stale_ids:
        vectorstore.delete(ids=stale_ids)

    print(f"[ingest] Indexed {len(docs)} FAQ entries -> {chroma_dir}")
    return vectorstore
def load_vectorstore(
    chroma_dir: Path = CHROMA_DIR,
    embed_model: str = EMBED_MODEL,
) -> Chroma:
    """Open the persisted Chroma collection without re-indexing anything.

    Args:
        chroma_dir: Directory holding the persisted collection.
        embed_model: Name of the Ollama embedding model attached to the
            store as its embedding function.

    Returns:
        A ``Chroma`` instance bound to ``COLLECTION_NAME`` in ``chroma_dir``.
    """
    embedder = OllamaEmbeddings(
        model=embed_model,
        base_url=OLLAMA_BASE_URL,
        client_kwargs=_ollama_client_kwargs(),
    )
    store = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=embedder,
        persist_directory=str(chroma_dir),
    )
    return store
def get_or_build_vectorstore() -> Chroma:
    """Return the vector store, building it only when nothing is on disk.

    The store counts as present when ``CHROMA_DIR`` exists and contains at
    least one entry; otherwise it is built from the FAQ file.
    """
    store_on_disk = CHROMA_DIR.exists() and any(CHROMA_DIR.iterdir())
    if not store_on_disk:
        print("[ingest] Building vector store for the first time...")
        return build_vectorstore()

    print("[ingest] Loading existing vector store from disk...")
    return load_vectorstore()
if __name__ == "__main__":
    # Script entry point: (re)build the vector store with the default paths
    # and embedding model.
    build_vectorstore()