chatbot / src /ingest.py
okoliechykwuka
Add Bearer auth for private Ollama Space
c2659c1
"""
ingest.py — Load FAQ JSON, create LangChain Documents, and store
embeddings in a local ChromaDB collection.
Run directly to (re)build the vector store:
python -m src.ingest
"""
from __future__ import annotations
import json
import os
from pathlib import Path
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
# Paths: the FAQ source file and the persisted Chroma store both live in the
# "data" directory, a sibling of the directory that contains this module.
_HERE = Path(__file__).parent
_RAG_DIR = _HERE.parent / "data"
FAQ_PATH = _RAG_DIR / "faq.json"
CHROMA_DIR = _RAG_DIR / "chroma_db"
# Embedding model name and Ollama endpoint; each can be overridden through the
# environment variable of the same name.
EMBED_MODEL = os.getenv("EMBED_MODEL", "embeddinggemma:latest")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
# Optional bearer token: the first truthy value among OLLAMA_AUTH_TOKEN,
# HUGGINGFACE_HUB_TOKEN and HF_TOKEN. Falsy (None or "") when no token is set.
OLLAMA_AUTH_TOKEN = os.getenv("OLLAMA_AUTH_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
# Name of the Chroma collection used by both the build and the load paths.
COLLECTION_NAME = "naijalingo_faq"
def _ollama_client_kwargs() -> dict:
    """Return the ``client_kwargs`` passed to ``OllamaEmbeddings``.

    When ``OLLAMA_AUTH_TOKEN`` is truthy the result carries an
    ``Authorization: Bearer <token>`` header; otherwise it is an empty dict.
    """
    if OLLAMA_AUTH_TOKEN:
        auth_header = {"Authorization": f"Bearer {OLLAMA_AUTH_TOKEN}"}
        return {"headers": auth_header}
    return {}
def load_faq_documents(faq_path: Path = FAQ_PATH) -> list[Document]:
    """Read the FAQ JSON file and wrap each entry in a LangChain ``Document``.

    The file is parsed as JSON and iterated; each item is read through
    ``.get("question")`` / ``.get("answer")``. A field that is missing or
    explicitly ``null`` is treated as an empty string instead of raising.

    Args:
        faq_path: Location of the FAQ JSON file.

    Returns:
        One document per FAQ item, in file order. ``page_content`` is
        ``"Question: <q>\\nAnswer: <a>"`` and the metadata records the source
        name, the item's position and the stripped question text.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(faq_path, encoding="utf-8") as f:
        items = json.load(f)

    docs: list[Document] = []
    for i, item in enumerate(items):
        # dict.get only falls back to its default when the key is absent; a
        # JSON null comes back as None, so coerce falsy values to "" before
        # calling .strip().
        question = (item.get("question") or "").strip()
        answer = (item.get("answer") or "").strip()
        docs.append(
            Document(
                page_content=f"Question: {question}\nAnswer: {answer}",
                metadata={"source": "faq.json", "index": i, "question": question},
            )
        )
    return docs
def build_vectorstore(
    faq_path: Path = FAQ_PATH,
    chroma_dir: Path = CHROMA_DIR,
    embed_model: str = EMBED_MODEL,
) -> Chroma:
    """Embed the FAQ entries and persist them in the Chroma collection.

    Every document is written under a deterministic id derived from its
    position in the FAQ file, so running the build again replaces the existing
    entries instead of appending a second copy of each one. Entries left in
    the collection whose ids were not produced by this run (removed FAQ items,
    or randomly-identified copies from earlier builds) are deleted afterwards.

    Args:
        faq_path: FAQ JSON file to ingest.
        chroma_dir: Directory of the persisted Chroma store; created if needed.
        embed_model: Ollama embedding model name.

    Returns:
        The Chroma vector store holding the freshly indexed FAQ entries.
    """
    docs = load_faq_documents(faq_path)
    # Stable ids make the write idempotent: langchain-chroma upserts by id.
    doc_ids = [f"faq-{position}" for position in range(len(docs))]
    embeddings = OllamaEmbeddings(
        model=embed_model,
        base_url=OLLAMA_BASE_URL,
        client_kwargs=_ollama_client_kwargs(),
    )
    chroma_dir.mkdir(parents=True, exist_ok=True)
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        ids=doc_ids,
        collection_name=COLLECTION_NAME,
        persist_directory=str(chroma_dir),
    )
    # Prune only after the new entries are stored, so a failed embedding call
    # leaves the previous contents of the collection untouched.
    current_ids = set(doc_ids)
    stale_ids = [
        stored_id
        for stored_id in vectorstore.get()["ids"]
        if stored_id not in current_ids
    ]
    if stale_ids:
        vectorstore.delete(ids=stale_ids)
    print(f"[ingest] Indexed {len(docs)} FAQ entries -> {chroma_dir}")
    return vectorstore
def load_vectorstore(
    chroma_dir: Path = CHROMA_DIR,
    embed_model: str = EMBED_MODEL,
) -> Chroma:
    """Open the persisted Chroma collection without adding any documents.

    Args:
        chroma_dir: Directory of the persisted Chroma store.
        embed_model: Ollama embedding model name used for the store's
            embedding function.

    Returns:
        A ``Chroma`` instance bound to ``COLLECTION_NAME`` in ``chroma_dir``.
    """
    return Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=OllamaEmbeddings(
            model=embed_model,
            base_url=OLLAMA_BASE_URL,
            client_kwargs=_ollama_client_kwargs(),
        ),
        persist_directory=str(chroma_dir),
    )
def get_or_build_vectorstore() -> Chroma:
    """Return the vector store, building it only when nothing is on disk yet.

    The store is considered present when ``CHROMA_DIR`` exists and contains at
    least one entry; in that case it is loaded, otherwise it is built.
    """
    store_on_disk = CHROMA_DIR.exists() and any(CHROMA_DIR.iterdir())
    if not store_on_disk:
        print("[ingest] Building vector store for the first time...")
        return build_vectorstore()
    print("[ingest] Loading existing vector store from disk...")
    return load_vectorstore()
# Script entry point: running the module directly (e.g. `python -m src.ingest`)
# builds the vector store with the default paths and embedding model.
if __name__ == "__main__":
    build_vectorstore()