Records: {context.get('total_records', 0)}
" f"Generated: {context.get('generated_at', '')}
" f"Install jinja2 for the full HTML report.
# ---------------------------------------------------------------------------
# Export helpers
# ---------------------------------------------------------------------------


def export_jsonl(records: List[NormalizedRecord], out_path: Path) -> Path:
    """Write records as newline-delimited JSON.

    Args:
        records: Records to export. Each one is serialized by calling its
            ``model_dump_json()`` method and written on its own line.
        out_path: Destination file. Missing parent directories are created
            and an existing file is overwritten.

    Returns:
        ``out_path``, so the caller can register it as an artifact.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(r.model_dump_json() + "\n" for r in records)
    logger.info("Exported %d records to %s", len(records), out_path)
    return out_path


def _csv_cell(value: object) -> object:
    """Return *value* in a form that fits into a single CSV cell.

    Containers become JSON text, paths become plain strings and ``None``
    becomes an empty string; every other value is returned untouched and
    left to the ``csv`` module to stringify.
    """
    if isinstance(value, (dict, list)):
        # default=str keeps the dump from failing on values JSON cannot
        # encode natively; they are written via str().
        return json.dumps(value, ensure_ascii=False, default=str)
    if isinstance(value, Path):
        return str(value)
    if value is None:
        return ""
    return value


def export_csv(records: List[NormalizedRecord], out_path: Path) -> Path:
    """Write records as CSV.

    The header row is taken from the keys of the first record's
    ``model_dump()``; all records are assumed to share those keys
    (``csv.DictWriter`` raises ``ValueError`` on an unknown key).

    Args:
        records: Records to export. With no records an empty file is written
            (no header row) and nothing is logged.
        out_path: Destination file. Missing parent directories are created
            and an existing file is overwritten.

    Returns:
        ``out_path``, so the caller can register it as an artifact.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if not records:
        out_path.write_text("")
        return out_path

    fieldnames = list(records[0].model_dump().keys())
    # newline="" lets the csv module control line endings itself.
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for r in records:
            # Build a new dict instead of rewriting the dumped one while
            # iterating over it.
            writer.writerow({k: _csv_cell(v) for k, v in r.model_dump().items()})
    logger.info("Exported %d records to %s", len(records), out_path)
    return out_path


# ---------------------------------------------------------------------------
# Main report generation
# ---------------------------------------------------------------------------


def generate_report(
    run_id: str,
    run_dir: Path,
    input_spec: InputSpec,
    records: List[NormalizedRecord],
    stage_outputs: Dict[str, EngineOutput],
    *,
    preview_limit: int = 50,
) -> List[Artifact]:
    """
    Generate the full report suite:

    - HTML report at ``run_dir/report/index.html``
    - JSONL export at ``run_dir/exports/normalized.jsonl``
    - CSV export at ``run_dir/exports/records.csv``

    Args:
        run_id: Identifier of the run, passed to the template as ``run_id``.
        run_dir: Root directory of the run; ``report/`` and ``exports/`` are
            created beneath it.
        input_spec: Description of the inputs; its ``files`` are listed in
            the report (name, file type, size, SHA-256).
        records: Records to export and to preview in the report.
        stage_outputs: Mapping of stage name to that stage's output; status,
            summary and error of each are listed in the report.
        preview_limit: Maximum number of records embedded in the HTML
            preview. Negative values are treated as 0.

    Returns a list of ``Artifact`` objects, in the order JSONL, CSV, HTML.
    """
    report_dir = run_dir / "report"
    exports_dir = run_dir / "exports"
    report_dir.mkdir(parents=True, exist_ok=True)
    exports_dir.mkdir(parents=True, exist_ok=True)

    artifacts: List[Artifact] = []

    # -- JSONL export --------------------------------------------------------
    jsonl_path = export_jsonl(records, exports_dir / "normalized.jsonl")
    artifacts.append(Artifact(
        name="normalized.jsonl",
        path=jsonl_path,
        mime_type="application/jsonl",
        description="All normalized records in JSONL format",
    ))

    # -- CSV export ----------------------------------------------------------
    csv_path = export_csv(records, exports_dir / "records.csv")
    artifacts.append(Artifact(
        name="records.csv",
        path=csv_path,
        mime_type="text/csv",
        description="All normalized records in CSV format",
    ))

    # -- HTML report ---------------------------------------------------------
    preview_limit = max(preview_limit, 0)

    stages_data = [
        {
            "stage": name,
            "status": out.status.value,
            "summary": out.summary,
            "error": out.error,
        }
        for name, out in stage_outputs.items()
    ]

    input_files_data = [
        {
            "name": f.path.name,
            "file_type": f.file_type.value,
            "size_bytes": f.size_bytes,
            "sha256": f.sha256,
        }
        for f in input_spec.files
    ]

    records_preview = []
    for r in records[:preview_limit]:
        d = r.model_dump()
        # Only ``source_type`` is normalized for the template: an enum-like
        # value is replaced by its ``.value``, and a missing key becomes "".
        source_type = d.get("source_type", "")
        d["source_type"] = getattr(source_type, "value", source_type)
        records_preview.append(d)

    # Build relative paths for download links. Only the export artifacts
    # exist at this point; they live in ``exports/`` next to ``report/``, so
    # ``relative_to`` raises ValueError and the link climbs one level up.
    artifacts_data = []
    for a in artifacts:
        try:
            rel = a.path.relative_to(report_dir)
        except ValueError:
            rel = Path("..") / "exports" / a.path.name
        # as_posix() keeps the link forward-slashed on every platform;
        # str() would emit backslashes on Windows.
        artifacts_data.append({"name": a.name, "rel_path": rel.as_posix()})

    context = {
        "run_id": run_id,
        "generated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
        "total_records": len(records),
        "input_file_count": len(input_spec.files),
        "stages": stages_data,
        "input_files": input_files_data,
        "records_preview": records_preview,
        "preview_limit": preview_limit,
        "artifacts": artifacts_data,
    }

    html_content = _render_html("report.html", context)
    html_path = report_dir / "index.html"
    html_path.write_text(html_content, encoding="utf-8")
    logger.info("HTML report written to %s", html_path)

    artifacts.append(Artifact(
        name="index.html",
        path=html_path,
        mime_type="text/html",
        description="Human-browsable pipeline report",
    ))

    return artifacts