"""
web_ingest.py
-------------
Read persistent/web_cache.json -> chunk into smaller pieces -> attempt to ingest into core.vector_store.
Produces: /home/user/app/persistent/web_chunks.jsonl (backup)
"""

import os
import json
import hashlib
from tqdm import tqdm

# Locations on disk: the cache this script reads and the JSONL backup it writes.
# NOTE(review): paths are hard-coded to a container layout — confirm they match
# the deployment environment.
PERSISTENT_DIR = "/home/user/app/persistent"
WEB_CACHE = os.path.join(PERSISTENT_DIR, "web_cache.json")
OUT_JSONL = os.path.join(PERSISTENT_DIR, "web_chunks.jsonl")

# Default chunking parameters, in words (not characters).
CHUNK_SIZE = 420
OVERLAP = 80


def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into word-based chunks of at most *size* words.

    Consecutive chunks share *overlap* words so context is not lost at
    chunk boundaries. Words are whatever ``str.split()`` yields.

    Args:
        text: The text to split. If it has *size* or fewer words (including
            the empty string), it is returned unchanged as a single chunk.
        size: Maximum number of words per chunk.
        overlap: Number of words repeated between consecutive chunks.
            Must be smaller than *size*.

    Returns:
        A list of chunk strings covering all words of *text* in order.

    Raises:
        ValueError: If ``overlap >= size`` — the window would never
            advance, which previously caused an infinite loop.
    """
    if overlap >= size:
        raise ValueError(f"overlap ({overlap}) must be smaller than size ({size})")
    words = text.split()
    if len(words) <= size:
        return [text]
    chunks = []
    i = 0
    n = len(words)
    while i < n:
        end = min(i + size, n)
        chunks.append(" ".join(words[i:end]))
        if end == n:
            break
        # Step back by `overlap` so the next chunk repeats the tail of
        # this one; guaranteed to advance because overlap < size.
        i = end - overlap
    return chunks


def make_id(url, idx):
    """Build a stable, collision-resistant id for chunk *idx* of *url*."""
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    return "{}::{}".format(digest[:12], idx)


def ingest(auto_ingest=True):
    """Chunk every cached web page and (optionally) push chunks to the vector store.

    Reads WEB_CACHE — a JSON object mapping url -> page metadata (keys
    ``"text"`` and ``"title"`` are used) — splits each page's text with
    :func:`chunk_text`, writes every chunk record to OUT_JSONL as a backup,
    and, when *auto_ingest* is true, tries to hand the records to
    ``core.vector_store`` via whichever ingestion API it exposes.

    Args:
        auto_ingest: When true, attempt ingestion into core.vector_store;
            failures are logged, never raised, so the JSONL backup survives.

    Returns:
        The list of chunk record dicts (empty if WEB_CACHE does not exist).
    """
    if not os.path.exists(WEB_CACHE):
        print("No web_cache.json found; nothing to ingest.")
        return []

    with open(WEB_CACHE, "r", encoding="utf-8") as fh:
        pages = json.load(fh)

    docs = []
    for url, meta in tqdm(pages.items(), desc="Chunking web pages"):
        text = meta.get("text", "")
        # Skip pages with no extractable text: chunk_text("") returns [""],
        # which would otherwise emit a useless empty chunk into the store.
        if not text.strip():
            continue
        title = meta.get("title", "") or ""
        chunks = chunk_text(text)
        for i, chunk in enumerate(chunks):
            doc = {
                "id": make_id(url, i),
                "file": url,
                "source": url,
                "type": "web",
                "title": title,
                "text": chunk,
                "term": "",
                "sources": [url],
            }
            docs.append(doc)

    # Write JSONL for downstream ingestion
    with open(OUT_JSONL, "w", encoding="utf-8") as fh:
        for d in docs:
            fh.write(json.dumps(d, ensure_ascii=False) + "\n")

    print(f"Wrote {len(docs)} web chunks to {OUT_JSONL}")

    if auto_ingest:
        try:
            # Imported lazily so chunking + JSONL output still work when the
            # vector-store package is unavailable or broken.
            import core.vector_store as vs
            if hasattr(vs, "add_documents"):
                print("Auto-ingest: calling core.vector_store.add_documents()")
                vs.add_documents(docs)
                print("Auto-ingest succeeded.")
            elif hasattr(vs, "ingest_documents_from_jsonl"):
                print("Auto-ingest: calling core.vector_store.ingest_documents_from_jsonl()")
                vs.ingest_documents_from_jsonl(OUT_JSONL)
                print("Auto-ingest succeeded.")
            else:
                print("No ingestion API found on core.vector_store; leaving jsonl for manual ingestion.")
        except Exception as e:
            # Best-effort by design: the JSONL on disk is the durable output.
            print(f"Auto-ingest failed: {e}. JSONL remains for manual ingestion.")

    return docs


if __name__ == "__main__":
    # Script entry point: chunk the web cache and attempt vector-store ingestion.
    ingest(auto_ingest=True)