kgbchatbot / src /utils.py
thomascerniglia's picture
Add more demo documents for initial index
1b80ae8
import os
import pickle
import json
from src.config import Settings
from src.retrieval.vectorstore import VectorStore
from src.retrieval.embedder import get_embedder
def ensure_dirs():
    """Create the working directories ``data/raw`` and ``storage``.

    Both paths are relative, so they are resolved against the current
    working directory. Directories that already exist are left untouched.
    """
    required = ("data/raw", "storage")
    for directory in required:
        # exist_ok=True makes repeated calls harmless.
        os.makedirs(directory, exist_ok=True)
def save_pickle(obj, path: str):
    """Serialize ``obj`` to ``path`` using pickle.

    The parent directory of ``path`` is created when it does not exist, so
    the call does not depend on the directory having been prepared first.
    An existing file at ``path`` is overwritten.

    Args:
        obj: Any picklable object.
        path: Destination file path.
    """
    parent = os.path.dirname(path)
    # A bare file name has no directory component; nothing to create then.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)
def load_pickle(path: str):
    """Read the file at ``path`` and return the object pickled inside it.

    Note: unpickling can execute arbitrary code, so this should only be
    pointed at files the application itself produced.
    """
    with open(path, "rb") as stream:
        payload = pickle.load(stream)
    return payload
def write_json(obj, path: str):
    """Write ``obj`` to ``path`` as UTF-8 encoded, 2-space indented JSON.

    Non-ASCII characters are written as-is rather than ``\\u`` escaped. The
    parent directory of ``path`` is created when it does not exist. An
    existing file at ``path`` is overwritten.

    Args:
        obj: A JSON-serializable object.
        path: Destination file path.
    """
    parent = os.path.dirname(path)
    # A bare file name has no directory component; nothing to create then.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)
def bootstrap_demo_index():
    """Seed the app with a small demo index so it is usable before ingestion.

    Steps, in order:
      1. Pickle the built-in demo documents to ``settings.docs_path``.
      2. Obtain an embedder via ``get_embedder`` and build a ``VectorStore``
         over the demo documents.
      3. Call ``save()`` on the built store.
      4. Write a JSON metadata file (``demo`` flag and document ``count``)
         to ``settings.meta_path``.
    """
    samples = [
        "Directive: Reinforce border surveillance along this area and also that one too. [Source: KGB/1963/Example 1]",
        "Report: Intercepted correspondence near Moscow. Oh no! [Source: KGB/1972/Example 2]",
        "Memo: Field notes suggest supply shortages in the winter. Hopefully it wont be cold. [Source: KGB/1979/Example 3]",
        "Alert: Increased activity detected in the northern territories. Deploy additional units immediately. [Source: KGB/1965/Example 4]",
        "Communication: Agent reports suspicious behavior at the embassy. Further investigation required. [Source: KGB/1968/Example 5]",
        "Analysis: Economic data indicates potential instability in the region. Monitor closely. [Source: KGB/1974/Example 6]",
        "Order: All operatives must report to headquarters by end of month for briefing. No exceptions. [Source: KGB/1977/Example 7]",
        "Intelligence: Foreign delegation arriving next week. Ensure proper surveillance measures are in place. [Source: KGB/1980/Example 8]",
        "Warning: Communication channels may be compromised. Switch to backup protocols effective immediately. [Source: KGB/1982/Example 9]",
        "Summary: Operation in Leningrad completed successfully. All targets accounted for. [Source: KGB/1985/Example 10]",
    ]
    cfg = Settings()

    # Persist the raw documents first, then build and save the vector index.
    save_pickle(samples, cfg.docs_path)
    encoder = get_embedder(cfg)
    store = VectorStore(cfg)
    # save() is invoked on whatever build() returns, as in the chained form.
    built = store.build(samples, encoder)
    built.save()

    # Metadata marks this index as demo content and records its size.
    meta = {"demo": True, "count": len(samples)}
    write_json(meta, cfg.meta_path)