Spaces:

IPTS-PRODDEV
/

ProBas_RAG_Assistant

Running

App Files Files Community

ProBas_RAG_Assistant / enrich_bundle.py

Mohamed284

Deploy ProBas RAG Assistant with enriched prebuilt index

0ca97fd 21 days ago

Raw

History Blame Contribute Delete

2.86 kB

	"""One-off migration: add a `key_impacts` field to every record in the existing
	prebuilt bundle, extracted from the raw dataset's exchanges / LCIA results.

	The rag_text stored in the bundle only previews the first few exchanges, which
	miss the actual emission outputs (CO2, SO2, NOx) and key impact indicators
	(GWP / Treibhauseffekt, cumulative energy demand). This script reads the raw
	dataset, computes the compact key-impacts block per record, and writes it back
	into the bundle JSON. The embeddings (.npy) are left untouched, so this does NOT
	re-embed anything — retrieval already finds the right records; this only enriches
	the context the model sees.

	Run once:

	python enrich_bundle.py
	"""
	from __future__ import annotations

	import glob
	import json
	from pathlib import Path

	import app


	def build_uuid_to_impacts() -> dict[str, str]:
	    """Map each raw-dataset record id to its key-impacts text.

	    Every ``*.json`` file directly under ``app.DATA_DIR`` is read in sorted
	    order. A file holding a single JSON object is treated as a one-item list.
	    For each item the key is its normalised ``uuid`` or, when that is empty,
	    ``"<file stem>-<position in file>"``. The value is whatever
	    ``app.compose_key_impacts`` returns for the item's ``exchanges`` and
	    ``lcia_results`` (missing or falsy fields are passed as empty lists).

	    Items that yield no impacts are left out. When the same key occurs more
	    than once, the longest impacts value wins.

	    Returns:
	        The mapping from record key to impacts text.
	    """
	    json_paths = [Path(name) for name in sorted(glob.glob(str(app.DATA_DIR / "*.json")))]
	    total = len(json_paths)
	    best: dict[str, str] = {}

	    for position, json_path in enumerate(json_paths, start=1):
	        payload = json.loads(json_path.read_text(encoding="utf-8"))
	        # A lone object is handled exactly like a list with one entry.
	        entries = [payload] if isinstance(payload, dict) else payload

	        for offset, entry in enumerate(entries):
	            key = app.normalize_text(entry.get("uuid")) or f"{json_path.stem}-{offset}"
	            summary = app.compose_key_impacts(
	                entry.get("exchanges") or [],
	                entry.get("lcia_results") or [],
	            )
	            # Only a non-empty value that is strictly longer than the one
	            # already stored for this key replaces it (richest variant wins).
	            if summary and len(summary) > len(best.get(key, "")):
	                best[key] = summary

	        # Progress line every 25 files.
	        if position % 25 == 0:
	            print(f" scanned {position}/{total} files, {len(best)} records with impacts")

	    return best


	def main() -> None:
	    """Write a ``key_impacts`` field into every record of each prebuilt bundle.

	    The impacts are taken from ``build_uuid_to_impacts()`` and looked up by the
	    record's normalised ``uuid``; records without a match get an empty string.
	    Each bundle file matching ``bundle_<app.CACHE_VERSION>_*.json`` under
	    ``app.CACHE_DIR`` is rewritten through ``app.atomic_write_text``.

	    Raises:
	        SystemExit: If no bundle file matches the pattern.
	    """
	    print("Extracting key impacts from the raw dataset...")
	    impacts_by_uuid = build_uuid_to_impacts()
	    print(f"Extracted impacts for {len(impacts_by_uuid)} records.")

	    targets = sorted(app.CACHE_DIR.glob(f"bundle_{app.CACHE_VERSION}_*.json"))
	    if not targets:
	        raise SystemExit(f"No bundle found under {app.CACHE_DIR}")

	    for target in targets:
	        print(f"Enriching {target.name} ...")
	        bundle = json.loads(target.read_text(encoding="utf-8"))
	        entries = bundle.get("records", [])

	        # Every record receives the field, even when nothing was found for it.
	        for entry in entries:
	            lookup_key = app.normalize_text(entry.get("uuid"))
	            entry["key_impacts"] = impacts_by_uuid.get(lookup_key, "")
	        hits = sum(1 for entry in entries if entry["key_impacts"])

	        serialized = json.dumps(bundle, ensure_ascii=False, sort_keys=True)
	        app.atomic_write_text(target, serialized)
	        print(f" wrote {hits}/{len(entries)} records with key impacts.")

	    print("Done. The embeddings (.npy) were not touched — no re-embedding needed.")


	# Run the migration only when this file is executed as a script,
	# so importing the module has no side effects.
	if __name__ == "__main__":
	    main()