""" Reporting — HTML report generation + JSONL/CSV exports. Produces: runs//report/index.html – human-browsable report runs//exports/normalized.jsonl runs//exports/records.csv """ from __future__ import annotations import csv import json import logging from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List from engine.io_contract import ( Artifact, EngineOutput, InputSpec, NormalizedRecord, ) logger = logging.getLogger("engine.reporting") # --------------------------------------------------------------------------- # Jinja2 setup (lazy import so the module can be imported without jinja2) # --------------------------------------------------------------------------- _TEMPLATE_DIR = Path(__file__).parent / "templates" def _render_html(template_name: str, context: Dict[str, Any]) -> str: """Render a Jinja2 template from the engine/templates/ directory.""" try: from jinja2 import Environment, FileSystemLoader except ImportError: logger.warning("jinja2 not installed — HTML report will be a plain summary") return _fallback_html(context) env = Environment( loader=FileSystemLoader(str(_TEMPLATE_DIR)), autoescape=True, ) template = env.get_template(template_name) return template.render(**context) def _fallback_html(context: Dict[str, Any]) -> str: """Minimal HTML when Jinja2 is unavailable.""" return ( f"

MOD-OSINT Report — {context.get('run_id', '?')}

" f"

Records: {context.get('total_records', 0)}

" f"

Generated: {context.get('generated_at', '')}

" f"

Install jinja2 for the full HTML report.

" f"" ) # --------------------------------------------------------------------------- # Export helpers # --------------------------------------------------------------------------- def export_jsonl(records: List[NormalizedRecord], out_path: Path) -> Path: """Write records as newline-delimited JSON.""" out_path.parent.mkdir(parents=True, exist_ok=True) with open(out_path, "w", encoding="utf-8") as f: for r in records: f.write(r.model_dump_json() + "\n") logger.info("Exported %d records to %s", len(records), out_path) return out_path def export_csv(records: List[NormalizedRecord], out_path: Path) -> Path: """Write records as CSV.""" out_path.parent.mkdir(parents=True, exist_ok=True) if not records: out_path.write_text("") return out_path fieldnames = list(records[0].model_dump().keys()) with open(out_path, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() for r in records: row = r.model_dump() # Serialize complex fields for k, v in row.items(): if isinstance(v, (dict, list)): row[k] = json.dumps(v, ensure_ascii=False, default=str) elif isinstance(v, Path): row[k] = str(v) elif v is None: row[k] = "" writer.writerow(row) logger.info("Exported %d records to %s", len(records), out_path) return out_path # --------------------------------------------------------------------------- # Main report generation # --------------------------------------------------------------------------- def generate_report( run_id: str, run_dir: Path, input_spec: InputSpec, records: List[NormalizedRecord], stage_outputs: Dict[str, EngineOutput], ) -> List[Artifact]: """ Generate the full report suite: - HTML report at ``run_dir/report/index.html`` - JSONL export at ``run_dir/exports/normalized.jsonl`` - CSV export at ``run_dir/exports/records.csv`` Returns a list of ``Artifact`` objects. """ report_dir = run_dir / "report" exports_dir = run_dir / "exports" report_dir.mkdir(parents=True, exist_ok=True) exports_dir.mkdir(parents=True, exist_ok=True) artifacts: List[Artifact] = [] # -- JSONL export -------------------------------------------------------- jsonl_path = export_jsonl(records, exports_dir / "normalized.jsonl") artifacts.append(Artifact( name="normalized.jsonl", path=jsonl_path, mime_type="application/jsonl", description="All normalized records in JSONL format", )) # -- CSV export ---------------------------------------------------------- csv_path = export_csv(records, exports_dir / "records.csv") artifacts.append(Artifact( name="records.csv", path=csv_path, mime_type="text/csv", description="All normalized records in CSV format", )) # -- HTML report --------------------------------------------------------- preview_limit = 50 stages_data = [] for name, out in stage_outputs.items(): stages_data.append({ "stage": name, "status": out.status.value, "summary": out.summary, "error": out.error, }) input_files_data = [] for f in input_spec.files: input_files_data.append({ "name": f.path.name, "file_type": f.file_type.value, "size_bytes": f.size_bytes, "sha256": f.sha256, }) records_preview = [] for r in records[:preview_limit]: d = r.model_dump() # Convert Path/enum to string for template d["source_type"] = d.get("source_type", "") if hasattr(d["source_type"], "value"): d["source_type"] = d["source_type"].value records_preview.append(d) # Build relative paths for download links artifacts_data = [] for a in artifacts: try: rel = a.path.relative_to(report_dir) except ValueError: rel = Path("..") / "exports" / a.path.name artifacts_data.append({"name": a.name, "rel_path": str(rel)}) context = { "run_id": run_id, "generated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"), "total_records": len(records), "input_file_count": len(input_spec.files), "stages": stages_data, "input_files": input_files_data, "records_preview": records_preview, "preview_limit": preview_limit, "artifacts": artifacts_data, } html_content = _render_html("report.html", context) html_path = report_dir / "index.html" html_path.write_text(html_content, encoding="utf-8") logger.info("HTML report written to %s", html_path) artifacts.append(Artifact( name="index.html", path=html_path, mime_type="text/html", description="Human-browsable pipeline report", )) return artifacts