"""
web_ingest.py
-------------
Read persistent/web_cache.json -> chunk into smaller pieces -> attempt to
ingest into core.vector_store.

Produces:
    /home/user/app/persistent/web_chunks.jsonl (backup)
"""

import os
import json
import hashlib

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; without tqdm installed we still
    # want chunking and ingestion to work, so fall back to plain iteration.
    def tqdm(iterable, *_args, **_kwargs):
        """Pass-through stand-in used when tqdm is not installed."""
        return iterable


PERSISTENT_DIR = "/home/user/app/persistent"
WEB_CACHE = os.path.join(PERSISTENT_DIR, "web_cache.json")
OUT_JSONL = os.path.join(PERSISTENT_DIR, "web_chunks.jsonl")

# Chunk length and overlap are measured in whitespace-separated words.
CHUNK_SIZE = 420
OVERLAP = 80


def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into overlapping chunks of at most *size* words.

    Words are produced by ``str.split()`` (any run of whitespace is a
    separator).  Consecutive chunks share *overlap* words.

    Args:
        text: The text to split.
        size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < size``.

    Returns:
        A list of chunk strings.  Text that fits in one chunk is returned
        unchanged as a single-element list; longer text is re-joined with
        single spaces.  Empty or whitespace-only text yields ``[]`` so that
        no empty document is produced.

    Raises:
        ValueError: If *size* is not positive or *overlap* is outside
            ``[0, size)``; such values would prevent the window from
            advancing.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    words = text.split()
    if not words:
        return []
    if len(words) <= size:
        return [text]

    # Each window starts `step` words after the previous one.  Stopping the
    # range at len(words) - overlap ensures the final window is the first one
    # that reaches the end of the text (no trailing chunk that is pure overlap).
    step = size - overlap
    return [
        " ".join(words[start:start + size])
        for start in range(0, len(words) - overlap, step)
    ]


def make_id(url, idx):
    """Return a chunk id of the form ``<sha1(url)[:12]>::<idx>``."""
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:12]
    return f"{digest}::{idx}"


def ingest(auto_ingest=True):
    """Chunk every cached web page, write a JSONL backup and optionally ingest.

    Reads ``WEB_CACHE`` (a JSON object mapping URL -> metadata dict with
    ``"text"`` and ``"title"`` entries), chunks each page with
    :func:`chunk_text`, and writes one JSON document per line to
    ``OUT_JSONL``.

    Args:
        auto_ingest: When true, try to hand the documents to
            ``core.vector_store`` via ``add_documents`` or, failing that,
            ``ingest_documents_from_jsonl``.  Any failure is reported and
            swallowed; the JSONL file remains for manual ingestion.

    Returns:
        The list of document dicts that were written (``[]`` when the cache
        file does not exist).
    """
    if not os.path.exists(WEB_CACHE):
        print("No web_cache.json found; nothing to ingest.")
        return []

    with open(WEB_CACHE, "r", encoding="utf-8") as fh:
        pages = json.load(fh)

    docs = []
    for url, meta in tqdm(pages.items(), desc="Chunking web pages"):
        # `or ""` also covers keys that are present but null in the cache.
        text = meta.get("text") or ""
        title = meta.get("title") or ""
        for i, chunk in enumerate(chunk_text(text)):
            docs.append({
                "id": make_id(url, i),
                "file": url,
                "source": url,
                "type": "web",
                "title": title,
                "text": chunk,
                "term": "",
                "sources": [url],
            })

    # Write JSONL for downstream ingestion
    with open(OUT_JSONL, "w", encoding="utf-8") as fh:
        fh.writelines(json.dumps(d, ensure_ascii=False) + "\n" for d in docs)
    print(f"Wrote {len(docs)} web chunks to {OUT_JSONL}")

    if auto_ingest:
        # Deliberately best-effort: the JSONL backup already exists, so a
        # missing or failing vector store must not abort the run.
        try:
            import core.vector_store as vs

            if hasattr(vs, "add_documents"):
                print("Auto-ingest: calling core.vector_store.add_documents()")
                vs.add_documents(docs)
                print("Auto-ingest succeeded.")
            elif hasattr(vs, "ingest_documents_from_jsonl"):
                print("Auto-ingest: calling core.vector_store.ingest_documents_from_jsonl()")
                vs.ingest_documents_from_jsonl(OUT_JSONL)
                print("Auto-ingest succeeded.")
            else:
                print("No ingestion API found on core.vector_store; leaving jsonl for manual ingestion.")
        except Exception as e:
            print(f"Auto-ingest failed: {e}. JSONL remains for manual ingestion.")

    return docs


if __name__ == "__main__":
    ingest(auto_ingest=True)