Spaces:

essprasad
/

CT-Chat-V2

Sleeping

App Files Files Community

CT-Chat-V2 / web_ingest.py

essprasad

Upload 13 files

b4d63a8 verified 3 months ago

raw

history blame contribute delete

2.82 kB

	"""
	web_ingest.py
	-------------
	Read persistent/web_cache.json -> chunk into smaller pieces -> attempt to ingest into core.vector_store.
	Produces: /home/user/app/persistent/web_chunks.jsonl (backup)
	"""

	import os
	import json
	import hashlib
	from tqdm import tqdm

	# Directory that holds both the input cache and the JSONL output of this module.
	PERSISTENT_DIR = "/home/user/app/persistent"
	# Input: JSON file that ingest() loads and iterates as url -> page-metadata pairs.
	WEB_CACHE = os.path.join(PERSISTENT_DIR, "web_cache.json")
	# Output: one JSON document per line; the file is rewritten on every ingest() run.
	OUT_JSONL = os.path.join(PERSISTENT_DIR, "web_chunks.jsonl")

	# Default chunking window, measured in whitespace-separated words.
	CHUNK_SIZE = 420
	OVERLAP = 80


	def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
	    """Split *text* into overlapping chunks of whitespace-separated words.

	    Args:
	        text: The string to split.
	        size: Maximum number of words per chunk; must be positive.
	        overlap: Number of trailing words of one chunk that are repeated at
	            the start of the next; must satisfy ``0 <= overlap < size``.

	    Returns:
	        A list of chunk strings. When *text* has at most *size* words it is
	        returned unchanged as the only element (original whitespace kept);
	        otherwise each chunk is its words re-joined with single spaces.

	    Raises:
	        ValueError: If *size* is not positive or *overlap* is outside
	            ``[0, size)``. With such values the window would never advance
	            (endless loop) or would silently skip words between chunks.
	    """
	    if size <= 0:
	        raise ValueError(f"size must be positive, got {size}")
	    if not 0 <= overlap < size:
	        raise ValueError(
	            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
	        )

	    words = text.split()
	    total = len(words)
	    if total <= size:
	        return [text]

	    # Each window starts `step` words after the previous one. A window is
	    # needed only while the previous one stopped short of the end, i.e.
	    # while start + overlap < total, hence the range bound below.
	    step = size - overlap
	    return [
	        " ".join(words[start:start + size])
	        for start in range(0, total - overlap, step)
	    ]


	def make_id(url, idx):
	    """Build a chunk identifier from a page URL and the chunk's position.

	    The identifier is the first 12 hex characters of the SHA-1 digest of the
	    UTF-8 encoded *url*, followed by ``::`` and *idx*.
	    """
	    digest = hashlib.sha1(url.encode("utf-8"))
	    prefix = digest.hexdigest()[:12]
	    return "{}::{}".format(prefix, idx)


	def ingest(auto_ingest=True):
	    """Chunk the cached web pages, write them to JSONL, and optionally index them.

	    Loads the JSON mapping stored at ``WEB_CACHE`` (iterated as
	    ``url -> metadata`` pairs, where the ``"text"`` and ``"title"`` keys of
	    the metadata are read), splits every page's text with ``chunk_text`` and
	    builds one document dict per chunk. All documents are written to
	    ``OUT_JSONL``, one JSON object per line, overwriting any previous file.

	    Args:
	        auto_ingest: When true, try to hand the documents to
	            ``core.vector_store`` via ``add_documents(docs)`` or, failing
	            that, ``ingest_documents_from_jsonl(OUT_JSONL)``. Any failure is
	            reported on stdout and does not propagate; the JSONL file
	            remains available either way.

	    Returns:
	        The list of document dicts that were written; an empty list when the
	        cache file does not exist.
	    """
	    if not os.path.exists(WEB_CACHE):
	        print("No web_cache.json found; nothing to ingest.")
	        return []

	    with open(WEB_CACHE, "r", encoding="utf-8") as fh:
	        pages = json.load(fh)

	    docs = []
	    for url, meta in tqdm(pages.items(), desc="Chunking web pages"):
	        # A JSON null arrives here as None; normalise both fields the same
	        # way so chunk_text() never receives a non-string.
	        text = meta.get("text") or ""
	        title = meta.get("title") or ""
	        # Pages without any text would only yield an empty document.
	        if not text.strip():
	            continue
	        for i, chunk in enumerate(chunk_text(text)):
	            docs.append({
	                "id": make_id(url, i),
	                "file": url,
	                "source": url,
	                "type": "web",
	                "title": title,
	                "text": chunk,
	                "term": "",
	                "sources": [url],
	            })

	    # Write JSONL for downstream ingestion
	    with open(OUT_JSONL, "w", encoding="utf-8") as fh:
	        fh.writelines(json.dumps(d, ensure_ascii=False) + "\n" for d in docs)

	    print(f"Wrote {len(docs)} web chunks to {OUT_JSONL}")

	    if not auto_ingest:
	        return docs

	    # Best-effort boundary: neither a missing vector store nor an indexing
	    # error may lose the chunks, which are already on disk at this point.
	    try:
	        import core.vector_store as vs
	        if hasattr(vs, "add_documents"):
	            print("Auto-ingest: calling core.vector_store.add_documents()")
	            vs.add_documents(docs)
	            print("Auto-ingest succeeded.")
	        elif hasattr(vs, "ingest_documents_from_jsonl"):
	            print("Auto-ingest: calling core.vector_store.ingest_documents_from_jsonl()")
	            vs.ingest_documents_from_jsonl(OUT_JSONL)
	            print("Auto-ingest succeeded.")
	        else:
	            print("No ingestion API found on core.vector_store; leaving jsonl for manual ingestion.")
	    except Exception as e:
	        print(f"Auto-ingest failed: {e}. JSONL remains for manual ingestion.")

	    return docs


	if __name__ == "__main__":
	    # Script entry point: run the full pipeline; auto_ingest defaults to True.
	    ingest()