File size: 2,856 Bytes
"""One-off migration: add a `key_impacts` field to every record in the existing
prebuilt bundle, extracted from the raw dataset's exchanges / LCIA results.
The rag_text stored in the bundle only previews the first few exchanges, which
miss the actual emission outputs (CO2, SO2, NOx) and key impact indicators
(GWP / Treibhauseffekt, cumulative energy demand). This script reads the raw
dataset, computes the compact key-impacts block per record, and writes it back
into the bundle JSON. The embeddings (.npy) are left untouched, so this does NOT
re-embed anything — retrieval already finds the right records; this only enriches
the context the model sees.
Run once:
python enrich_bundle.py
"""
from __future__ import annotations
import glob
import json
from pathlib import Path
import app
def build_uuid_to_impacts() -> dict[str, str]:
    """Scan the raw dataset and map each record uuid to its key-impacts value.

    Every ``*.json`` file matched directly under ``app.DATA_DIR`` is parsed in
    sorted path order. A file whose top-level JSON value is an object is
    treated as a one-element list. For each entry the key is the result of
    ``app.normalize_text`` on its ``"uuid"`` field, falling back to
    ``"<file stem>-<index>"`` when that result is falsy. The value is whatever
    ``app.compose_key_impacts`` returns for the entry's ``"exchanges"`` and
    ``"lcia_results"`` (each defaulting to an empty list).

    Entries with a falsy impacts value are skipped. When the same key shows up
    more than once, the longest impacts value wins. A progress line is printed
    after every 25th file.

    Returns:
        Mapping from record key to its composed key-impacts value.
    """
    best_by_uuid: dict[str, str] = {}
    pattern = str(app.DATA_DIR / "*.json")
    json_paths = sorted(glob.glob(pattern))

    for file_no, raw_path in enumerate(json_paths, start=1):
        source = Path(raw_path)
        payload = json.loads(source.read_text(encoding="utf-8"))
        # A lone JSON object is handled as a single-record dataset.
        entries = [payload] if isinstance(payload, dict) else payload
        fallback_prefix = Path(raw_path).stem

        for position, entry in enumerate(entries):
            key = app.normalize_text(entry.get("uuid"))
            if not key:
                key = f"{fallback_prefix}-{position}"
            text = app.compose_key_impacts(
                entry.get("exchanges") or [],
                entry.get("lcia_results") or [],
            )
            # Duplicate keys: only replace when the new value is longer,
            # so the richest variant survives.
            if text and len(text) > len(best_by_uuid.get(key, "")):
                best_by_uuid[key] = text

        if not file_no % 25:
            print(f" scanned {file_no}/{len(json_paths)} files, {len(best_by_uuid)} records with impacts")

    return best_by_uuid
def main() -> None:
    """Write a ``key_impacts`` field into every record of each cached bundle.

    First builds the key -> impacts mapping via ``build_uuid_to_impacts``.
    Then, for every ``bundle_<CACHE_VERSION>_*.json`` file under
    ``app.CACHE_DIR`` (in sorted order), loads the JSON, sets each record's
    ``"key_impacts"`` to the mapped value for its ``app.normalize_text``-ed
    ``"uuid"`` (or ``""`` when there is no entry), and writes the file back
    through ``app.atomic_write_text``. Progress is reported on stdout.

    Raises:
        SystemExit: If no matching bundle file is found.
    """
    print("Extracting key impacts from the raw dataset...")
    impacts_by_uuid = build_uuid_to_impacts()
    print(f"Extracted impacts for {len(impacts_by_uuid)} records.")

    bundles = sorted(app.CACHE_DIR.glob(f"bundle_{app.CACHE_VERSION}_*.json"))
    if not bundles:
        raise SystemExit(f"No bundle found under {app.CACHE_DIR}")

    for bundle in bundles:
        print(f"Enriching {bundle.name} ...")
        payload = json.loads(bundle.read_text(encoding="utf-8"))
        entries = payload.get("records", [])

        hits = 0
        for entry in entries:
            lookup_key = app.normalize_text(entry.get("uuid"))
            found = impacts_by_uuid.get(lookup_key, "")
            # The field is always set, even to "", so every record has it.
            entry["key_impacts"] = found
            hits += 1 if found else 0

        serialized = json.dumps(payload, ensure_ascii=False, sort_keys=True)
        app.atomic_write_text(bundle, serialized)
        print(f" wrote {hits}/{len(entries)} records with key impacts.")

    print("Done. The embeddings (.npy) were not touched — no re-embedding needed.")
# Run the migration only when this file is executed as a script, so that
# importing the module does not trigger any bundle rewrite.
if __name__ == "__main__":
    main()
|