Spaces:
Sleeping
Sleeping
"""
Enrich section chunks with report-level metadata from md_recursive chunks.

Reads  : data/processed/chunks_md_section.json
         data/processed/chunks_md_md_recursive.json
Writes : data/processed/chunks_md_section_enriched.json

Only report-level fields are joined (ntsb_no, event_date, make, model, state,
phase_of_flight, weather, source_filename, entity_id, context_summary).
Chunk-level fields (entities, aircraft_components, numerics) are intentionally
excluded — they are specific to each md_recursive chunk's text and would be
wrong if copied to a different chunk boundary.
"""
| import json | |
| import sys | |
| from pathlib import Path | |
# Two levels above this file; the data/ tree is resolved relative to it.
BASE_DIR = Path(__file__).resolve().parent.parent

# Inputs: section-level chunks and the md_recursive chunks that carry metadata.
SECTION_FILE = BASE_DIR.joinpath("data", "processed", "chunks_md_section.json")
MDREC_FILE = BASE_DIR.joinpath("data", "processed", "chunks_md_md_recursive.json")
# Output: section chunks with the report-level fields joined in.
OUT_FILE = BASE_DIR.joinpath("data", "processed", "chunks_md_section_enriched.json")

# Fields copied from an md_recursive chunk onto every section chunk that
# shares its report_id. Order is preserved from the original definition.
REPORT_LEVEL_FIELDS = [
    "ntsb_no", "event_date",
    "make", "model",
    "state", "phase_of_flight", "weather",
    "source_filename", "entity_id", "context_summary",
]
def build_report_meta(mdrec_chunks: list[dict]) -> dict[str, dict]:
    """Map each report_id to the report-level fields of its first chunk.

    Only the first chunk encountered for a given ``report_id`` contributes;
    later chunks for the same report are ignored. Chunks whose ``report_id``
    is missing or falsy are skipped. Any field in ``REPORT_LEVEL_FIELDS``
    that the chunk lacks is recorded as an empty string.
    """
    lookup = {}
    for record in mdrec_chunks:
        report_id = record.get("report_id")
        # Guard clause: nothing to key on, or this report is already captured.
        if not report_id or report_id in lookup:
            continue
        fields = {}
        for name in REPORT_LEVEL_FIELDS:
            fields[name] = record.get(name, "")
        lookup[report_id] = fields
    return lookup
def enrich(section_chunks: list[dict], report_meta: dict[str, dict]) -> list[dict]:
    """Join report-level metadata onto each section chunk by ``report_id``.

    Args:
        section_chunks: Section chunk dicts. Each is matched on the value
            stored under its ``report_id`` key ("" when the key is absent).
        report_meta: Mapping of report_id -> dict of report-level fields.

    Returns:
        A new list, in input order, with one shallow copy per input chunk.
        Chunks with a metadata match have those fields merged in; chunks
        without a match are copied through unchanged. Input dicts are never
        mutated and never aliased in the result.

    Side effects:
        Prints a single warning line to stdout listing the report ids that
        had no metadata match, if there were any.
    """
    blank = ("", None)
    enriched = []
    missing = set()
    for chunk in section_chunks:
        rid = chunk.get("report_id", "")
        meta = report_meta.get(rid)
        # Always copy, so matched and unmatched chunks are treated alike.
        merged = dict(chunk)
        if meta is None:
            missing.add(rid)
        else:
            for field, value in meta.items():
                # A blank metadata value is only a "not available" placeholder;
                # it must not wipe out a real value the chunk already carries.
                if value in blank and merged.get(field) not in blank:
                    continue
                merged[field] = value
        enriched.append(merged)
    if missing:
        # key=str: ids can mix None (JSON null) with strings, which plain
        # sorted() cannot order. For all-string ids the order is unchanged.
        print(f" WARNING: {len(missing)} report(s) had no metadata match: {sorted(missing, key=str)}")
    return enriched
def main():
    """Load both chunk files, join the metadata, report coverage, and write.

    Reads SECTION_FILE and MDREC_FILE as UTF-8 JSON, builds the per-report
    lookup with build_report_meta, applies enrich, prints how many resulting
    chunks carry a truthy ``ntsb_no``, and writes the result to OUT_FILE as
    indented, non-ASCII-escaped JSON. Progress is printed at every step.
    """
    print("Loading section chunks...")
    with SECTION_FILE.open("r", encoding="utf-8") as section_fp:
        section_chunks = json.load(section_fp)
    print(f" {len(section_chunks)} section chunks loaded")

    print("Loading md_recursive chunks for metadata...")
    with MDREC_FILE.open("r", encoding="utf-8") as mdrec_fp:
        mdrec_chunks = json.load(mdrec_fp)
    print(f" {len(mdrec_chunks)} md_recursive chunks loaded")

    report_meta = build_report_meta(mdrec_chunks)
    print(f" Built metadata lookup for {len(report_meta)} reports")

    print("Enriching section chunks...")
    enriched = enrich(section_chunks, report_meta)

    # Sanity check: count chunks that ended up with a non-empty ntsb_no.
    with_ntsb = [item for item in enriched if item.get("ntsb_no")]
    print(f" {len(with_ntsb)}/{len(enriched)} chunks now have ntsb_no")

    print(f"Writing enriched chunks to {OUT_FILE.name}...")
    with OUT_FILE.open("w", encoding="utf-8") as out_fp:
        json.dump(enriched, out_fp, indent=2, ensure_ascii=False)
    print(f" Done — {OUT_FILE}")


if __name__ == "__main__":
    main()