"""
web_ingest.py
-------------
Read persistent/web_cache.json -> chunk into smaller pieces -> attempt to ingest into core.vector_store.
Produces: /home/user/app/persistent/web_chunks.jsonl (backup)
"""

import os
import json
import hashlib
from tqdm import tqdm

# Locations on disk: the cache this script reads and the JSONL backup it writes.
# NOTE(review): paths are hard-coded to a container layout — confirm they match
# the deployment environment.
PERSISTENT_DIR = "/home/user/app/persistent"
WEB_CACHE = os.path.join(PERSISTENT_DIR, "web_cache.json")
OUT_JSONL = os.path.join(PERSISTENT_DIR, "web_chunks.jsonl")

# Default chunking parameters, in words (not characters).
CHUNK_SIZE = 420
OVERLAP = 80


def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into word-based chunks of at most *size* words.

    Consecutive chunks share *overlap* words so context is not lost at
    chunk boundaries. Words are whatever ``str.split()`` yields.

    Args:
        text: The text to split. If it has *size* or fewer words (including
            the empty string), it is returned unchanged as a single chunk.
        size: Maximum number of words per chunk.
        overlap: Number of words repeated between consecutive chunks.
            Must be smaller than *size*.

    Returns:
        A list of chunk strings covering all words of *text* in order.

    Raises:
        ValueError: If ``overlap >= size`` — the window would never
            advance, which previously caused an infinite loop.
    """
    if overlap >= size:
        raise ValueError(f"overlap ({overlap}) must be smaller than size ({size})")
    words = text.split()
    if len(words) <= size:
        return [text]
    chunks = []
    i = 0
    n = len(words)
    while i < n:
        end = min(i + size, n)
        chunks.append(" ".join(words[i:end]))
        if end == n:
            break
        # Step back by `overlap` so the next chunk repeats the tail of
        # this one; guaranteed to advance because overlap < size.
        i = end - overlap
    return chunks


def make_id(url, idx):
    """Build a stable, collision-resistant id for chunk *idx* of *url*."""
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    return "{}::{}".format(digest[:12], idx)


def ingest(auto_ingest=True):
    """Chunk every cached web page and (optionally) push chunks to the vector store.

    Reads WEB_CACHE — a JSON object mapping url -> page metadata (keys
    ``"text"`` and ``"title"`` are used) — splits each page's text with
    :func:`chunk_text`, writes every chunk record to OUT_JSONL as a backup,
    and, when *auto_ingest* is true, tries to hand the records to
    ``core.vector_store`` via whichever ingestion API it exposes.

    Args:
        auto_ingest: When true, attempt ingestion into core.vector_store;
            failures are logged, never raised, so the JSONL backup survives.

    Returns:
        The list of chunk record dicts (empty if WEB_CACHE does not exist).
    """
    if not os.path.exists(WEB_CACHE):
        print("No web_cache.json found; nothing to ingest.")
        return []

    with open(WEB_CACHE, "r", encoding="utf-8") as fh:
        pages = json.load(fh)

    docs = []
    for url, meta in tqdm(pages.items(), desc="Chunking web pages"):
        text = meta.get("text", "")
        # Skip pages with no extractable text: chunk_text("") returns [""],
        # which would otherwise emit a useless empty chunk into the store.
        if not text.strip():
            continue
        title = meta.get("title", "") or ""
        chunks = chunk_text(text)
        for i, chunk in enumerate(chunks):
            doc = {
                "id": make_id(url, i),
                "file": url,
                "source": url,
                "type": "web",
                "title": title,
                "text": chunk,
                "term": "",
                "sources": [url],
            }
            docs.append(doc)

    # Write JSONL for downstream ingestion
    with open(OUT_JSONL, "w", encoding="utf-8") as fh:
        for d in docs:
            fh.write(json.dumps(d, ensure_ascii=False) + "\n")

    print(f"Wrote {len(docs)} web chunks to {OUT_JSONL}")

    if auto_ingest:
        try:
            # Imported lazily so chunking + JSONL output still work when the
            # vector-store package is unavailable or broken.
            import core.vector_store as vs
            if hasattr(vs, "add_documents"):
                print("Auto-ingest: calling core.vector_store.add_documents()")
                vs.add_documents(docs)
                print("Auto-ingest succeeded.")
            elif hasattr(vs, "ingest_documents_from_jsonl"):
                print("Auto-ingest: calling core.vector_store.ingest_documents_from_jsonl()")
                vs.ingest_documents_from_jsonl(OUT_JSONL)
                print("Auto-ingest succeeded.")
            else:
                print("No ingestion API found on core.vector_store; leaving jsonl for manual ingestion.")
        except Exception as e:
            # Best-effort by design: the JSONL on disk is the durable output.
            print(f"Auto-ingest failed: {e}. JSONL remains for manual ingestion.")

    return docs


if __name__ == "__main__":
    # Script entry point: chunk the web cache and attempt vector-store ingestion.
    ingest(auto_ingest=True)