"""
Reporting — HTML report generation + JSONL/CSV exports.
Produces:
runs/<run_id>/report/index.html – human-browsable report
runs/<run_id>/exports/normalized.jsonl
runs/<run_id>/exports/records.csv
"""
from __future__ import annotations
import csv
import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List
from engine.io_contract import (
Artifact,
EngineOutput,
InputSpec,
NormalizedRecord,
)
# Module-level logger registered under the fixed name "engine.reporting".
logger = logging.getLogger("engine.reporting")
# ---------------------------------------------------------------------------
# Jinja2 setup (lazy import so the module can be imported without jinja2)
# ---------------------------------------------------------------------------
# Templates are loaded from the "templates" directory located next to this file.
_TEMPLATE_DIR = Path(__file__).parent / "templates"
def _render_html(template_name: str, context: Dict[str, Any]) -> str:
    """Render ``template_name`` with ``context`` and return the resulting markup.

    Jinja2 is imported on first use so this module stays importable without
    it. Templates are loaded from ``_TEMPLATE_DIR`` with autoescaping enabled.
    When the import fails, a warning is logged and the result of
    ``_fallback_html(context)`` is returned instead.
    """
    try:
        from jinja2 import Environment, FileSystemLoader
    except ImportError:
        logger.warning("jinja2 not installed — HTML report will be a plain summary")
        markup = _fallback_html(context)
    else:
        loader = FileSystemLoader(str(_TEMPLATE_DIR))
        environment = Environment(loader=loader, autoescape=True)
        markup = environment.get_template(template_name).render(**context)
    return markup
def _fallback_html(context: Dict[str, Any]) -> str:
"""Minimal HTML when Jinja2 is unavailable."""
return (
f"<html><body><h1>MOD-OSINT Report — {context.get('run_id', '?')}</h1>"
f"<p>Records: {context.get('total_records', 0)}</p>"
f"<p>Generated: {context.get('generated_at', '')}</p>"
f"<p><em>Install jinja2 for the full HTML report.</em></p>"
f"</body></html>"
)
# ---------------------------------------------------------------------------
# Export helpers
# ---------------------------------------------------------------------------
def export_jsonl(records: List[NormalizedRecord], out_path: Path) -> Path:
    """Serialize ``records`` to ``out_path`` as JSON Lines (one object per line).

    The parent directory of ``out_path`` is created if missing. Each record
    is serialized with its ``model_dump_json()`` method and terminated by a
    newline. The file is written as UTF-8 and overwritten if it exists.

    Returns ``out_path``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as handle:
        handle.writelines(record.model_dump_json() + "\n" for record in records)
    logger.info("Exported %d records to %s", len(records), out_path)
    return out_path
def export_csv(records: List[NormalizedRecord], out_path: Path) -> Path:
    """Serialize ``records`` to ``out_path`` as a CSV file with a header row.

    The parent directory of ``out_path`` is created if missing. With no
    records, an empty file is written and the function returns immediately.
    Otherwise the column names are the keys of the first record's
    ``model_dump()``, and each record becomes one row.

    Cell conversion: dicts and lists are JSON-encoded, ``Path`` values are
    stringified, ``None`` becomes an empty string, and everything else is
    passed to the CSV writer unchanged.

    Returns ``out_path``.
    """

    def _to_cell(value: Any) -> Any:
        # Flatten values the csv module cannot represent meaningfully.
        if isinstance(value, (dict, list)):
            return json.dumps(value, ensure_ascii=False, default=str)
        if isinstance(value, Path):
            return str(value)
        return "" if value is None else value

    out_path.parent.mkdir(parents=True, exist_ok=True)
    if not records:
        out_path.write_text("")
        return out_path

    header = list(records[0].model_dump())
    with out_path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=header)
        writer.writeheader()
        for record in records:
            dumped = record.model_dump()
            writer.writerow({key: _to_cell(value) for key, value in dumped.items()})
    logger.info("Exported %d records to %s", len(records), out_path)
    return out_path
# ---------------------------------------------------------------------------
# Main report generation
# ---------------------------------------------------------------------------
def generate_report(
    run_id: str,
    run_dir: Path,
    input_spec: InputSpec,
    records: List[NormalizedRecord],
    stage_outputs: Dict[str, EngineOutput],
) -> List[Artifact]:
    """
    Generate the full report suite:
    - HTML report at ``run_dir/report/index.html``
    - JSONL export at ``run_dir/exports/normalized.jsonl``
    - CSV export at ``run_dir/exports/records.csv``

    Args:
        run_id: Identifier shown in the report.
        run_dir: Root directory of the run; ``report/`` and ``exports/`` are
            created beneath it if missing.
        input_spec: Its ``files`` are listed in the report (name, file type,
            size in bytes, sha256).
        records: Records to export; only the first 50 are passed to the
            HTML template as a preview.
        stage_outputs: Mapping of stage name to output; each entry's status
            value, summary and error are listed in the report.

    Returns:
        A list of ``Artifact`` objects in the order JSONL, CSV, HTML.
    """
    report_dir = run_dir / "report"
    exports_dir = run_dir / "exports"
    report_dir.mkdir(parents=True, exist_ok=True)
    exports_dir.mkdir(parents=True, exist_ok=True)
    artifacts: List[Artifact] = []

    # -- JSONL export --------------------------------------------------------
    jsonl_path = export_jsonl(records, exports_dir / "normalized.jsonl")
    artifacts.append(Artifact(
        name="normalized.jsonl",
        path=jsonl_path,
        mime_type="application/jsonl",
        description="All normalized records in JSONL format",
    ))

    # -- CSV export ----------------------------------------------------------
    csv_path = export_csv(records, exports_dir / "records.csv")
    artifacts.append(Artifact(
        name="records.csv",
        path=csv_path,
        mime_type="text/csv",
        description="All normalized records in CSV format",
    ))

    # -- HTML report ---------------------------------------------------------
    preview_limit = 50
    stages_data = [
        {
            "stage": name,
            "status": out.status.value,
            "summary": out.summary,
            "error": out.error,
        }
        for name, out in stage_outputs.items()
    ]
    input_files_data = [
        {
            "name": f.path.name,
            "file_type": f.file_type.value,
            "size_bytes": f.size_bytes,
            "sha256": f.sha256,
        }
        for f in input_spec.files
    ]

    records_preview = []
    for r in records[:preview_limit]:
        d = r.model_dump()
        # Ensure the key exists and unwrap enum-like values for the template.
        source_type = d.get("source_type", "")
        d["source_type"] = getattr(source_type, "value", source_type)
        records_preview.append(d)

    # Build relative paths for download links. Only the exports collected so
    # far are linked; the HTML artifact itself is appended after rendering.
    artifacts_data = []
    for a in artifacts:
        try:
            rel = a.path.relative_to(report_dir)
        except ValueError:
            # Not under report/, so link via the sibling exports/ directory.
            rel = Path("..") / "exports" / a.path.name
        # as_posix() keeps forward slashes in the link on every platform;
        # str() would emit backslashes on Windows.
        artifacts_data.append({"name": a.name, "rel_path": rel.as_posix()})

    context = {
        "run_id": run_id,
        "generated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
        "total_records": len(records),
        "input_file_count": len(input_spec.files),
        "stages": stages_data,
        "input_files": input_files_data,
        "records_preview": records_preview,
        "preview_limit": preview_limit,
        "artifacts": artifacts_data,
    }
    html_content = _render_html("report.html", context)
    html_path = report_dir / "index.html"
    html_path.write_text(html_content, encoding="utf-8")
    logger.info("HTML report written to %s", html_path)
    artifacts.append(Artifact(
        name="index.html",
        path=html_path,
        mime_type="text/html",
        description="Human-browsable pipeline report",
    ))
    return artifacts
|