# Source: Hugging Face Space IPTS-PRODDEV / ProBas_RAG_Assistant (status: Running)
# File size: 2,856 bytes; revision 0ca97fd

"""One-off migration: add a `key_impacts` field to every record in the existing
prebuilt bundle, extracted from the raw dataset's exchanges / LCIA results.

The rag_text stored in the bundle only previews the first few exchanges, so it
misses the actual emission outputs (CO2, SO2, NOx) and key impact indicators
(GWP / Treibhauseffekt, cumulative energy demand). This script reads the raw
dataset, computes the compact key-impacts block per record, and writes it back
into the bundle JSON. The embeddings (.npy) are left untouched, so this does NOT
re-embed anything — retrieval already finds the right records; this only enriches
the context the model sees.

Run once:

    python enrich_bundle.py
"""
from __future__ import annotations

import glob
import json
from pathlib import Path

import app


def build_uuid_to_impacts() -> dict[str, str]:
    """Map each raw-dataset record's uuid to its key-impacts text.

    Every ``*.json`` file matched under ``app.DATA_DIR`` is read in sorted
    order. A file whose top-level value is a dict is treated as a single
    record; otherwise the value is iterated as a sequence of records.

    The key is ``app.normalize_text(record["uuid"])`` or, when that is empty,
    ``"<file stem>-<index>"``. The value is whatever
    ``app.compose_key_impacts`` returns for the record's ``exchanges`` and
    ``lcia_results``. Records with an empty result are left out.

    Returns:
        The uuid -> key-impacts mapping. When a uuid occurs several times the
        longest impacts value is kept (the first one seen wins ties).
    """
    best_by_uuid: dict[str, str] = {}
    dataset_files = sorted(glob.glob(str(app.DATA_DIR / "*.json")))
    total = len(dataset_files)

    for file_no, filename in enumerate(dataset_files, start=1):
        source = Path(filename)
        payload = json.loads(source.read_text(encoding="utf-8"))
        # A lone object is handled as a one-element dataset.
        entries = [payload] if isinstance(payload, dict) else payload

        for position, entry in enumerate(entries):
            key = app.normalize_text(entry.get("uuid"))
            if not key:
                key = f"{source.stem}-{position}"
            block = app.compose_key_impacts(
                entry.get("exchanges") or [],
                entry.get("lcia_results") or [],
            )
            # Only a non-empty block that is strictly longer than the one
            # already stored replaces it.
            if block and len(block) > len(best_by_uuid.get(key, "")):
                best_by_uuid[key] = block

        # Progress report every 25 files.
        if file_no % 25 == 0:
            print(f"  scanned {file_no}/{total} files, {len(best_by_uuid)} records with impacts")

    return best_by_uuid


def main() -> None:
    """Write a ``key_impacts`` field into every record of each cached bundle.

    Builds the uuid -> impacts mapping from the raw dataset, then rewrites
    every ``bundle_<CACHE_VERSION>_*.json`` file found in ``app.CACHE_DIR``.
    Each record gets ``key_impacts`` set to the mapped text, or to an empty
    string when its uuid has no entry. Files are written through
    ``app.atomic_write_text``.

    Raises:
        SystemExit: if no bundle file matches in ``app.CACHE_DIR``.
    """
    print("Extracting key impacts from the raw dataset...")
    impacts_by_uuid = build_uuid_to_impacts()
    print(f"Extracted impacts for {len(impacts_by_uuid)} records.")

    bundles = sorted(app.CACHE_DIR.glob(f"bundle_{app.CACHE_VERSION}_*.json"))
    if not bundles:
        raise SystemExit(f"No bundle found under {app.CACHE_DIR}")

    for bundle in bundles:
        print(f"Enriching {bundle.name} ...")
        payload = json.loads(bundle.read_text(encoding="utf-8"))
        entries = payload.get("records", [])

        for entry in entries:
            key = app.normalize_text(entry.get("uuid"))
            # Unmatched records still get the field, as an empty string.
            entry["key_impacts"] = impacts_by_uuid.get(key, "")
        hits = sum(1 for entry in entries if entry["key_impacts"])

        serialized = json.dumps(payload, ensure_ascii=False, sort_keys=True)
        app.atomic_write_text(bundle, serialized)
        print(f"  wrote {hits}/{len(entries)} records with key impacts.")

    print("Done. The embeddings (.npy) were not touched — no re-embedding needed.")


# Script entry point: run the migration only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()