"""One-off migration: add a `key_impacts` field to every record in the existing prebuilt bundle, extracted from the raw dataset's exchanges / LCIA results. The rag_text stored in the bundle only previews the first few exchanges, which miss the actual emission outputs (CO2, SO2, NOx) and key impact indicators (GWP / Treibhauseffekt, cumulative energy demand). This script reads the raw dataset, computes the compact key-impacts block per record, and writes it back into the bundle JSON. The embeddings (.npy) are left untouched, so this does NOT re-embed anything — retrieval already finds the right records; this only enriches the context the model sees. Run once: python enrich_bundle.py """ from __future__ import annotations import glob import json from pathlib import Path import app def build_uuid_to_impacts() -> dict[str, str]: mapping: dict[str, str] = {} files = sorted(glob.glob(str(app.DATA_DIR / "*.json"))) for fi, path in enumerate(files, start=1): data = json.loads(Path(path).read_text(encoding="utf-8")) if isinstance(data, dict): data = [data] stem = Path(path).stem for index, item in enumerate(data): uuid = app.normalize_text(item.get("uuid")) or f"{stem}-{index}" impacts = app.compose_key_impacts(item.get("exchanges") or [], item.get("lcia_results") or []) if not impacts: continue # Keep the richest variant if a uuid appears more than once. if len(impacts) > len(mapping.get(uuid, "")): mapping[uuid] = impacts if fi % 25 == 0: print(f" scanned {fi}/{len(files)} files, {len(mapping)} records with impacts") return mapping def main() -> None: print("Extracting key impacts from the raw dataset...") mapping = build_uuid_to_impacts() print(f"Extracted impacts for {len(mapping)} records.") bundle_paths = sorted(app.CACHE_DIR.glob(f"bundle_{app.CACHE_VERSION}_*.json")) if not bundle_paths: raise SystemExit(f"No bundle found under {app.CACHE_DIR}") for bundle_path in bundle_paths: print(f"Enriching {bundle_path.name} ...") meta = json.loads(bundle_path.read_text(encoding="utf-8")) records = meta.get("records", []) enriched = 0 for record in records: impacts = mapping.get(app.normalize_text(record.get("uuid")), "") record["key_impacts"] = impacts if impacts: enriched += 1 app.atomic_write_text( bundle_path, json.dumps(meta, ensure_ascii=False, sort_keys=True), ) print(f" wrote {enriched}/{len(records)} records with key impacts.") print("Done. The embeddings (.npy) were not touched — no re-embedding needed.") if __name__ == "__main__": main()