| """One-off migration: add a `key_impacts` field to every record in the existing |
| prebuilt bundle, extracted from the raw dataset's exchanges / LCIA results. |
| |
| The rag_text stored in the bundle only previews the first few exchanges, which |
| miss the actual emission outputs (CO2, SO2, NOx) and key impact indicators |
| (GWP / Treibhauseffekt, cumulative energy demand). This script reads the raw |
| dataset, computes the compact key-impacts block per record, and writes it back |
| into the bundle JSON. The embeddings (.npy) are left untouched, so this does NOT |
| re-embed anything — retrieval already finds the right records; this only enriches |
| the context the model sees. |
| |
| Run once: |
| |
| python enrich_bundle.py |
| """ |
| from __future__ import annotations |
|
|
| import glob |
| import json |
| from pathlib import Path |
|
|
| import app |
|
|
|
|
def build_uuid_to_impacts() -> dict[str, str]:
    """Map each raw-dataset record's UUID to its key-impacts text.

    Scans every ``*.json`` file directly under ``app.DATA_DIR`` in sorted
    order. A file may hold a single record (a JSON object) or a JSON array
    of records. For each record, ``app.compose_key_impacts`` is called with
    the record's ``exchanges`` and ``lcia_results`` (missing or falsy values
    are passed as empty lists); records for which it returns something falsy
    are left out of the result.

    Keys are the record's ``uuid`` passed through ``app.normalize_text``.
    When that is falsy, the key falls back to ``"<file stem>-<index>"``,
    where the index is the record's position in its file. If the same key
    occurs more than once, the longest impacts text wins (the first one seen
    wins on a tie).

    Values that are not JSON objects -- a non-object element inside an
    array, or a file whose top-level value is neither an object nor an
    array -- are skipped instead of aborting the scan.

    Progress is printed after every 25th file.

    Returns:
        A dict from record key to non-empty impacts text.
    """
    mapping: dict[str, str] = {}
    files = sorted(glob.glob(str(app.DATA_DIR / "*.json")))
    total = len(files)
    for fi, path in enumerate(files, start=1):
        source = Path(path)
        data = json.loads(source.read_text(encoding="utf-8"))
        if isinstance(data, dict):
            # A single-record file is handled like a one-element array.
            data = [data]
        elif not isinstance(data, list):
            # Top-level string / number / null: there are no records to read.
            data = []
        stem = source.stem
        for index, item in enumerate(data):
            if not isinstance(item, dict):
                # Skipped, but still counted by enumerate so the positional
                # fallback keys of the following records stay unchanged.
                continue
            uuid = app.normalize_text(item.get("uuid")) or f"{stem}-{index}"
            impacts = app.compose_key_impacts(
                item.get("exchanges") or [],
                item.get("lcia_results") or [],
            )
            if not impacts:
                continue
            # Duplicate keys: keep whichever impacts text is longest.
            if len(impacts) > len(mapping.get(uuid, "")):
                mapping[uuid] = impacts
        if fi % 25 == 0:
            print(f" scanned {fi}/{total} files, {len(mapping)} records with impacts")
    return mapping
|
|
|
|
def main() -> None:
    """Write a ``key_impacts`` entry into every record of each cached bundle.

    Builds the key-to-impacts mapping with ``build_uuid_to_impacts()``, then
    loads every ``bundle_<app.CACHE_VERSION>_*.json`` file under
    ``app.CACHE_DIR``. Each entry of the bundle's ``records`` list gets
    ``key_impacts`` set to the mapped text for its normalized ``uuid``, or
    to ``""`` when the mapping has no entry for it. The bundle is then
    re-serialized with sorted keys and written back through
    ``app.atomic_write_text``. Progress is reported on stdout.

    Raises:
        SystemExit: if no bundle file matches the current cache version.
    """
    print("Extracting key impacts from the raw dataset...")
    impacts_by_uuid = build_uuid_to_impacts()
    print(f"Extracted impacts for {len(impacts_by_uuid)} records.")

    pattern = f"bundle_{app.CACHE_VERSION}_*.json"
    bundles = sorted(app.CACHE_DIR.glob(pattern))
    if not bundles:
        raise SystemExit(f"No bundle found under {app.CACHE_DIR}")

    for bundle in bundles:
        print(f"Enriching {bundle.name} ...")
        payload = json.loads(bundle.read_text(encoding="utf-8"))
        entries = payload.get("records", [])
        for entry in entries:
            key = app.normalize_text(entry.get("uuid"))
            # Unmatched records still receive the field, as an empty string.
            entry["key_impacts"] = impacts_by_uuid.get(key, "")
        hits = sum(1 for entry in entries if entry["key_impacts"])
        serialized = json.dumps(payload, ensure_ascii=False, sort_keys=True)
        app.atomic_write_text(bundle, serialized)
        print(f" wrote {hits}/{len(entries)} records with key impacts.")
    print("Done. The embeddings (.npy) were not touched — no re-embedding needed.")
|
|
|
|
# Run the migration only when this file is executed as a script, so that
# importing the module does not rewrite any bundle.
if __name__ == "__main__":
    main()
|
|