Spaces:
Sleeping
Sleeping
"""
web_ingest.py
-------------
Read persistent/web_cache.json, split each page into smaller chunks, and attempt to ingest the chunks into core.vector_store.
Produces: /home/user/app/persistent/web_chunks.jsonl (a JSONL backup of the chunks).
"""
| import os | |
| import json | |
| import hashlib | |
| from tqdm import tqdm | |
# Root directory for every file this script reads or writes.
PERSISTENT_DIR = "/home/user/app/persistent"

# Input: the cached web pages, loaded as JSON.
WEB_CACHE = os.path.join(PERSISTENT_DIR, "web_cache.json")

# Output: the chunk documents, one JSON object per line.
OUT_JSONL = os.path.join(PERSISTENT_DIR, "web_chunks.jsonl")
# Default chunk length and default overlap between neighbouring chunks,
# both counted in whitespace-separated words.
CHUNK_SIZE = 420
OVERLAP = 80


def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into overlapping chunks of at most *size* words.

    Words are the pieces produced by ``str.split()`` with no separator.
    Text that already fits in one chunk is returned unchanged (original
    whitespace included) as a single-element list; this also means an
    empty string yields ``[""]``. Longer text is re-joined with single
    spaces, and each chunk after the first starts ``overlap`` words
    before the end of the previous one.

    Args:
        text: The string to split.
        size: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks.

    Returns:
        A list of chunk strings, in reading order.

    Raises:
        ValueError: If the text has to be split but ``size`` is not
            positive or ``overlap`` is not smaller than ``size``; the
            window could never advance, so the loop would not terminate.
    """
    words = text.split()
    if len(words) <= size:
        return [text]

    # Only reached when splitting is required. Reject parameters that
    # would make the window stand still or move backwards.
    if size <= 0 or overlap >= size:
        raise ValueError(
            f"chunk_text needs size > 0 and overlap < size "
            f"(got size={size}, overlap={overlap})"
        )

    total = len(words)
    step = size - overlap
    chunks = []
    for start in range(0, total, step):
        end = min(start + size, total)
        chunks.append(" ".join(words[start:end]))
        if end == total:
            # The tail has been emitted; a further window would only
            # repeat words that are already covered.
            break
    return chunks
def make_id(url, idx):
    """Build a chunk id from a URL and the chunk's position.

    The id is the first 12 hex characters of the SHA-1 digest of the
    UTF-8 encoded URL, followed by ``::`` and *idx*.
    """
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    prefix = digest[:12]
    return "{}::{}".format(prefix, idx)
def _build_web_docs(pages):
    """Chunk every cached page and return one document dict per chunk."""
    docs = []
    for url, meta in tqdm(pages.items(), desc="Chunking web pages"):
        # ``or ""`` also covers an explicit JSON null, so a page without
        # text cannot crash the chunker (the title is read the same way).
        text = meta.get("text") or ""
        title = meta.get("title", "") or ""
        if not text.strip():
            # Nothing to index on this page; skip it rather than emit a
            # chunk whose text is empty.
            continue
        for i, chunk in enumerate(chunk_text(text)):
            docs.append({
                "id": make_id(url, i),
                "file": url,
                "source": url,
                "type": "web",
                "title": title,
                "text": chunk,
                "term": "",
                "sources": [url],
            })
    return docs


def _write_jsonl(docs, path):
    """Write *docs* to *path*, one JSON object per line (UTF-8, non-ASCII kept)."""
    with open(path, "w", encoding="utf-8") as fh:
        fh.writelines(json.dumps(d, ensure_ascii=False) + "\n" for d in docs)


def _auto_ingest(docs, jsonl_path):
    """Hand the chunks to core.vector_store via whichever entry point it exposes."""
    # Deliberately best-effort: the JSONL file has already been written,
    # so any failure here (including a missing core package) is reported
    # and the file is left for manual ingestion.
    try:
        import core.vector_store as vs

        if hasattr(vs, "add_documents"):
            print("Auto-ingest: calling core.vector_store.add_documents()")
            vs.add_documents(docs)
            print("Auto-ingest succeeded.")
        elif hasattr(vs, "ingest_documents_from_jsonl"):
            print("Auto-ingest: calling core.vector_store.ingest_documents_from_jsonl()")
            vs.ingest_documents_from_jsonl(jsonl_path)
            print("Auto-ingest succeeded.")
        else:
            print("No ingestion API found on core.vector_store; leaving jsonl for manual ingestion.")
    except Exception as e:
        print(f"Auto-ingest failed: {e}. JSONL remains for manual ingestion.")


def ingest(auto_ingest=True):
    """Chunk the cached web pages, write them to JSONL and optionally ingest them.

    Loads ``WEB_CACHE`` as JSON and iterates it as a ``{url: meta}``
    mapping, splits each page's ``text`` with ``chunk_text``, writes every
    chunk document to ``OUT_JSONL`` and, when *auto_ingest* is true, tries
    to pass the documents to ``core.vector_store``.

    Pages whose text is missing, null or whitespace-only contribute no
    documents.

    Args:
        auto_ingest: When true, attempt the vector-store hand-off after
            the JSONL file has been written. Failures in that step are
            printed, not raised.

    Returns:
        The list of chunk documents, or ``[]`` when ``WEB_CACHE`` does
        not exist.
    """
    if not os.path.exists(WEB_CACHE):
        print("No web_cache.json found; nothing to ingest.")
        return []

    with open(WEB_CACHE, "r", encoding="utf-8") as fh:
        pages = json.load(fh)

    docs = _build_web_docs(pages)

    # Write JSONL first so downstream/manual ingestion is possible even
    # if the automatic step below fails.
    _write_jsonl(docs, OUT_JSONL)
    print(f"Wrote {len(docs)} web chunks to {OUT_JSONL}")

    if auto_ingest:
        _auto_ingest(docs, OUT_JSONL)
    return docs
if __name__ == "__main__":
    # Run as a script: chunk the cache, write the JSONL and attempt the
    # vector-store hand-off.
    ingest(auto_ingest=True)