# CT-Chat-V2 / web_ingest.py
# essprasad's picture
# Upload 13 files
# b4d63a8 verified
"""
web_ingest.py
-------------
Read persistent/web_cache.json -> chunk into smaller pieces -> attempt to ingest into core.vector_store.
Produces: /home/user/app/persistent/web_chunks.jsonl (backup)
"""
import os
import json
import hashlib
from tqdm import tqdm
# Directory that holds both the input cache and the generated chunk file.
PERSISTENT_DIR = "/home/user/app/persistent"
# Input: JSON file read by ingest() (iterated as a mapping of URL -> metadata).
WEB_CACHE = os.path.join(PERSISTENT_DIR, "web_cache.json")
# Output: one JSON document per line, written by ingest().
OUT_JSONL = os.path.join(PERSISTENT_DIR, "web_chunks.jsonl")
# Default maximum number of whitespace-separated words per chunk.
CHUNK_SIZE = 420
# Default number of words shared between two consecutive chunks.
OVERLAP = 80
def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into overlapping windows of whitespace-separated words.

    Args:
        text: The string to split.
        size: Maximum number of words per chunk.
        overlap: Number of words at the end of one chunk that are repeated
            at the start of the next one.

    Returns:
        A list of chunk strings. Text containing at most ``size`` words is
        returned unchanged as a single-element list (original whitespace
        preserved). Longer text yields chunks whose words are re-joined
        with single spaces.

    Raises:
        ValueError: If the text has to be windowed and ``size`` is not
            positive or ``overlap`` is not smaller than ``size``. Without
            this check the window would never advance and the loop would
            not terminate.
    """
    words = text.split()
    if len(words) <= size:
        return [text]

    # Each window starts `step` words after the previous one.
    step = size - overlap
    if size <= 0 or step <= 0:
        raise ValueError(
            f"chunk_text requires size > 0 and overlap < size "
            f"(got size={size}, overlap={overlap})"
        )

    chunks = []
    n = len(words)
    for start in range(0, n, step):
        end = min(start + size, n)
        chunks.append(" ".join(words[start:end]))
        if end == n:
            # The final window reached the last word; a further window
            # would only repeat the overlapping tail.
            break
    return chunks
def make_id(url, idx):
    """Build a chunk identifier from a page URL and a chunk index.

    The identifier is the first 12 hex characters of the SHA-1 digest of the
    UTF-8 encoded URL, followed by ``::`` and the index.
    """
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    prefix = digest[:12]
    return "{}::{}".format(prefix, idx)
def ingest(auto_ingest=True):
    """Chunk the cached web pages, write them to JSONL, and optionally ingest them.

    Reads the URL -> metadata mapping stored at ``WEB_CACHE``, splits each
    page's ``"text"`` with ``chunk_text()``, writes one JSON document per line
    to ``OUT_JSONL`` and, when requested, hands the documents to
    ``core.vector_store``.

    Args:
        auto_ingest: When true, try ``core.vector_store.add_documents(docs)``
            and fall back to ``ingest_documents_from_jsonl(OUT_JSONL)`` if
            only that function exists. Any failure during this step is
            printed and not raised; the JSONL file stays on disk.

    Returns:
        The list of chunk documents (dicts). The list is empty when the
        cache file does not exist.
    """
    if not os.path.exists(WEB_CACHE):
        print("No web_cache.json found; nothing to ingest.")
        return []

    with open(WEB_CACHE, "r", encoding="utf-8") as fh:
        pages = json.load(fh)

    docs = []
    for url, meta in tqdm(pages.items(), desc="Chunking web pages"):
        # `or ""` also covers an explicit null value, matching how the
        # title is handled; a None text would otherwise crash chunk_text().
        text = meta.get("text") or ""
        title = meta.get("title", "") or ""
        if not text.strip():
            # A page without any text would only yield an empty chunk.
            continue
        for i, chunk in enumerate(chunk_text(text)):
            docs.append({
                "id": make_id(url, i),
                "file": url,
                "source": url,
                "type": "web",
                "title": title,
                "text": chunk,
                "term": "",
                "sources": [url],
            })

    # Write JSONL for downstream ingestion
    with open(OUT_JSONL, "w", encoding="utf-8") as fh:
        for d in docs:
            fh.write(json.dumps(d, ensure_ascii=False) + "\n")
    print(f"Wrote {len(docs)} web chunks to {OUT_JSONL}")

    if auto_ingest:
        # Best-effort step: the import and the store call are both allowed
        # to fail, because the JSONL file above is the fallback.
        try:
            import core.vector_store as vs
            if hasattr(vs, "add_documents"):
                print("Auto-ingest: calling core.vector_store.add_documents()")
                vs.add_documents(docs)
                print("Auto-ingest succeeded.")
            elif hasattr(vs, "ingest_documents_from_jsonl"):
                print("Auto-ingest: calling core.vector_store.ingest_documents_from_jsonl()")
                vs.ingest_documents_from_jsonl(OUT_JSONL)
                print("Auto-ingest succeeded.")
            else:
                print("No ingestion API found on core.vector_store; leaving jsonl for manual ingestion.")
        except Exception as e:
            print(f"Auto-ingest failed: {e}. JSONL remains for manual ingestion.")
    return docs
if __name__ == "__main__":
    # Run as a script: chunk the cache and attempt the vector-store ingest.
    ingest(auto_ingest=True)