# Initial commit: AutoML Engineer Agent with README, LICENSE, and project files
# (commit 5908c8b — stray web-page header text converted to a comment so the module parses)
"""
Report generator: rich markdown/HTML from pipeline results.
"""
from __future__ import annotations

import base64
import html
import json
from datetime import datetime
from pathlib import Path
from typing import Any
# Brand palette (match evaluate.py / UI)
# "primary" colors headings and the callout border, "secondary" underlines
# section headings; "accent" is not referenced in this module's visible code —
# presumably kept for parity with evaluate.py / the UI (confirm there).
PALETTE = {
    "primary": "#534AB7",
    "secondary": "#1D9E75",
    "accent": "#D85A30",
}
def generate_report(
    objective: str,
    dataset_name: str,
    metrics: dict,
    best_model: str,
    output_dir: str | Path = "outputs",
) -> Path:
    """
    Generate a timestamped markdown report for a pipeline run.

    Args:
        objective: Natural-language description of the modeling goal.
        dataset_name: Human-readable name of the dataset used.
        metrics: Evaluation metrics; serialized as JSON in the report.
        best_model: Name of the best-performing model.
        output_dir: Directory the report is written into (created if missing).

    Returns:
        Path to the written ``report_<timestamp>.md`` file.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = output_dir / f"report_{timestamp}.md"
    # Serialize as real JSON so the ```json fence is valid markdown/JSON.
    # str(metrics) would emit a Python repr (single quotes, True/None), which
    # is not JSON. default=str keeps non-serializable values (e.g. numpy
    # scalars) readable instead of raising.
    metrics_json = json.dumps(metrics, indent=2, default=str)
    content = f"""# AutoML Report
**Generated:** {datetime.now().isoformat()}
## Objective
{objective}
## Dataset
{dataset_name}
## Best Model
{best_model}
## Metrics
```json
{metrics_json}
```
---
*Report generated by automl-engineer-agent*
"""
    report_path.write_text(content, encoding="utf-8")
    return report_path
def _dataset_size_label(dp: dict[str, Any]) -> str:
n = int(dp.get("n_rows", 0) or 0)
if n < 1000:
return "Small (< 1000 rows)"
if n <= 10000:
return "Medium"
return "Large (> 10000 rows)"
def _embed_png(path: str) -> str:
try:
p = Path(path)
if not p.exists():
return ""
raw = p.read_bytes()
return base64.b64encode(raw).decode("ascii")
except Exception:
return ""
def _shap_status(result: dict[str, Any]) -> str:
pp = result.get("plot_paths") or {}
ev = result.get("eval") or {}
if ev.get("has_shap") or any(
k in pp for k in ("shap_bar", "shap_summary")
) or any(str(k).startswith("shap_dependence_") for k in pp):
return "Complete"
return "Not available"
def _best_model_tuned_line(result: dict[str, Any]) -> str:
name = result.get("best_model_name") or "—"
tune = result.get("tune") or {}
if tune.get("success"):
return f"{name} (tuned)"
return str(name)
def _tuning_improvement_line(result: dict[str, Any]) -> str:
tune = result.get("tune") or {}
if not tune.get("success"):
return "Not run"
imp = tune.get("improvement")
if imp is None:
return "—"
return f"{float(imp):+.4f}"
def _count_pipeline_steps(result: dict[str, Any]) -> int:
n = 0
if result.get("eda"):
n += 1
if result.get("task"):
n += 1
if result.get("prep"):
n += 1
if result.get("plan"):
n += 1
if result.get("train"):
n += 1
if result.get("tune"):
n += 1
if result.get("eval") or result.get("plot_paths"):
n += 1
return n
def _generate_next_steps(result: dict[str, Any]) -> list[str]:
steps: list[str] = []
eda = result.get("eda") or {}
prep = result.get("prep") or {}
ti = eda.get("target_info") or {}
miss = eda.get("missing", {}).get("by_column", {}) or {}
train_data = result.get("train", {}) or {}
overfit_warnings = list(train_data.get("overfitting_warnings", []))
if eda.get("overview", {}).get("rows", 0) < 500:
steps.append("Collect more rows or use simpler models to reduce variance.")
if overfit_warnings:
steps.append("Reduce model complexity, add regularization, or gather more diverse data.")
if any(info.get("pct", 0) > 30 for info in miss.values()):
steps.append("Impute or drop high-missing columns before retraining.")
if ti.get("imbalance_ratio", 0) and ti.get("imbalance_ratio", 0) > 5:
steps.append("Try class_weight='balanced', SMOTE, or stratified sampling.")
if prep.get("target_leakage_suspicion"):
steps.append("Review columns flagged for possible target leakage.")
tune = result.get("tune") or {}
if tune.get("success"):
imp = float(tune.get("improvement") or 0.0)
if imp < 0.01:
steps.append(
"Hyperparameter tuning showed minimal improvement — consider feature engineering instead."
)
if imp > 0.05:
steps.append(
"Significant improvement from tuning — consider expanding the search space with more trials."
)
if tune.get("overfit"):
steps.append(
"Even after tuning the model overfit — try adding more training data or reducing model complexity."
)
steps.append("Run k-fold cross-validation on the training split to validate stability.")
return steps[:12]
def _build_plan_section_md(plan: dict[str, Any]) -> list[str]:
    """Render the "Training Plan" section of the markdown report as lines."""
    profile = plan.get("dataset_profile") or {}
    section: list[str] = [
        "## 3. Training Plan",
        "",
        "### Dataset profile",
        "",
        "| Field | Value |",
        "|-------|-------|",
        f"| Rows | {profile.get('n_rows', '—')} |",
        f"| Features | {profile.get('n_features', '—')} |",
        f"| Size category | {_dataset_size_label(profile)} |",
        f"| Imbalance ratio | {profile.get('imbalance_ratio', '—')} |",
        "",
        "### Models selected",
        "",
        "| Model | Selected | Reason |",
        "|-------|----------|--------|",
    ]
    skip_reasons = plan.get("skip_reasons") or {}
    # Recommended models first, then skipped ones with their reasons.
    section.extend(
        f"| {model} | Yes | Included in the training plan for this dataset. |"
        for model in plan.get("recommended_models") or []
    )
    section.extend(
        f"| {model} | No | {skip_reasons.get(model, 'Excluded by training plan rules.')} |"
        for model in plan.get("skip_models") or []
    )
    section.extend([
        "",
        "### Primary metric",
        "",
        f"- **Metric:** `{plan.get('primary_metric', '—')}`",
        f"- **Reasoning:** {plan.get('metric_reasoning', '—')}",
        "",
        "### Tuning budget",
        "",
        f"- **Optuna trials:** {plan.get('n_trials', '—')}",
        f"- **Timeout (s):** {plan.get('timeout', '—')}",
        "",
    ])
    plan_warnings = plan.get("warnings") or []
    if plan_warnings:
        section.extend(["### Warnings", ""])
        section.extend(f"- ⚠️ {warning}" for warning in plan_warnings)
        section.append("")
    return section
def _build_tune_section_md(tune: dict[str, Any]) -> list[str]:
if not tune.get("success"):
return [
"## 5. Hyperparameter Tuning",
"",
f"Tuning did not complete successfully: {tune.get('error', 'unknown')}",
"",
]
bp = tune.get("best_params") or {}
lines = [
"## 5. Hyperparameter Tuning",
"",
"| Metric | Value |",
"|--------|-------|",
f"| Baseline score (test) | {tune.get('baseline_score', '—')} |",
f"| Best score after tuning | {tune.get('best_score', '—')} |",
f"| Improvement | {tune.get('improvement', '—')} |",
f"| Trials run | {tune.get('n_trials_run', '—')} |",
f"| Tuning time (s) | {tune.get('tuning_time_s', '—')} |",
f"| Overfit (train–test gap heuristic) | {tune.get('overfit', '—')} |",
"",
"### Best hyperparameters",
"",
"| Parameter | Value |",
"|-----------|-------|",
]
for k, v in sorted(bp.items())[:48]:
lines.append(f"| `{k}` | `{v!r}` |")
lines.append("")
return lines
def _build_markdown(result: dict[str, Any]) -> str:
    """Full markdown report from a pipeline result dict.

    Sections: summary table, overview (with a CV-based selection rationale
    for the best model when available), preprocessing, training plan, model
    comparison, hyperparameter tuning, evaluation metrics/plots, SHAP text,
    and recommended next steps. Returns one newline-joined string ending
    with a single trailing newline.
    """
    # Summary table + start of the Overview section.
    lines: list[str] = [
        "# AutoML pipeline report",
        "",
        "| Field | Value |",
        "|-------|-------|",
        f"| **Best model (tuned)** | {_best_model_tuned_line(result)} |",
        f"| **Tuning improvement** | {_tuning_improvement_line(result)} |",
        f"| **SHAP analysis** | {_shap_status(result)} |",
        f"| **Total pipeline steps** | {_count_pipeline_steps(result)} |",
        f"| **Target** | {result.get('target_col', '—')} |",
        f"| **Task** | {result.get('task_type', '—')} |",
        "",
        "## 1. Overview",
        "",
        f"**Best model:** {result.get('best_model_name', '—')}",
    ]
    tr_ov = result.get("train") or {}
    # Strip any " (tuned)" suffix so the name matches entries in train results.
    _bn = str(result.get("best_model_name", "—")).replace(" (tuned)", "").strip()
    # Fall back to the task's conventional primary metric when none was recorded.
    _pm = tr_ov.get("metric_name") or (
        "roc_auc" if result.get("task_type") == "classification" else "r2"
    )
    # Find the best model's CV stats and explain how it was selected.
    for _r in tr_ov.get("results") or []:
        if _r.get("name") == _bn:
            _cvm = _r.get("cv_mean")
            _cvs = _r.get("cv_std")
            _nf = tr_ov.get("cv_folds_used") or 5
            if _cvm is not None and _cvs is not None:
                lines.append(
                    f"\n**{_bn}** was selected using cross-validated performance: "
                    f"CV mean {float(_cvm):.4f} ± {float(_cvs):.4f} across {_nf} folds. "
                    f"That average reflects how the model scores when validated on different held-out "
                    f"subsets of the training data (primary metric: `{_pm}`)."
                )
            break
    lines.append("")
    # 2. Preprocessing — only emitted if the prep stage produced output.
    prep = result.get("prep") or {}
    if prep:
        lines += [
            "## 2. Preprocessing",
            "",
            f"- **Final feature count:** {prep.get('final_feature_count', '—')}",
            f"- **Train / test size:** {prep.get('train_size', '—')} / {prep.get('test_size', '—')}",
            "",
        ]
    # 3. Training plan (delegated), with a placeholder when absent.
    if result.get("plan"):
        lines += _build_plan_section_md(result["plan"])
    else:
        lines += ["## 3. Training Plan", "", "*Not available for this run.*", ""]
    # 4. Model comparison table.
    cdf = result.get("comparison_df")
    lines += [
        "## 4. Model comparison",
        "",
        "The table includes **CV Mean**, **CV Std**, **CV Train Mean**, and **CV Overfit** when "
        "cross-validation ran; otherwise those cells are empty.",
        "",
    ]
    if cdf is not None:
        try:
            # cdf is presumably a pandas DataFrame (to_markdown needs the
            # optional `tabulate` package) — fall back to str() on failure.
            lines.append(cdf.to_markdown(index=False))
        except Exception:
            lines.append(str(cdf))
    else:
        lines.append("*No comparison table.*")
    lines.append("")
    # 5. Hyperparameter tuning (delegated), with a placeholder when not run.
    if result.get("tune") is not None:
        lines += _build_tune_section_md(result["tune"])
    else:
        lines += ["## 5. Hyperparameter Tuning", "", "*Not run.*", ""]
    # 6. Raw metrics in a plain code fence (Python repr, not JSON).
    lines += [
        "## 6. Evaluation metrics",
        "",
        "```",
        str(result.get("metrics", {})),
        "```",
        "",
    ]
    # Plot file listing (paths only — embedding happens in the HTML report).
    pp = result.get("plot_paths") or {}
    if pp:
        lines += ["### Plots generated", ""]
        for name, path in sorted(pp.items()):
            lines.append(f"- `{name}` → `{path}`")
        lines.append("")
    # Optional SHAP narrative produced by the eval stage.
    expl = (result.get("eval") or {}).get("shap_explanation_text") or ""
    if expl.strip():
        lines += ["### SHAP (example row)", "", expl, ""]
    # 7. Recommendations.
    lines += ["## 7. Recommended next steps", ""]
    for s in _generate_next_steps(result):
        lines.append(f"- {s}")
    lines += ["", "*Generated by automl-engineer*", ""]
    return "\n".join(lines).strip() + "\n"
def _build_html(result: dict[str, Any]) -> str:
    """Self-contained HTML report with embedded PNG plots.

    All plots are inlined as base64 data URIs, so the resulting file has no
    external dependencies and can be emailed or shared as-is. Section
    numbering (3, 4, 5, 6) mirrors the markdown report; sections 1–2 have
    no HTML equivalent here.
    """
    title = html.escape("AutoML pipeline report")
    # Inline style constants; brand colors come from the module-level PALETTE.
    bg = "#ffffff"
    fg = "#111111"
    border = "#d4d4d4"
    # Document head, CSS, and the summary metadata table.
    parts: list[str] = [
        f"""<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"><title>{title}</title>
<style>
body {{ font-family: Segoe UI, system-ui, sans-serif; background:{bg}; color:{fg}; padding: 28px; max-width: 960px; margin: 0 auto; }}
h1 {{ color: {PALETTE["primary"]}; }}
h2 {{ border-bottom: 2px solid {PALETTE["secondary"]}; padding-bottom: 6px; }}
table.meta {{ border-collapse: collapse; width: 100%; margin: 16px 0; }}
table.meta td, table.meta th {{ border: 1px solid {border}; padding: 8px 12px; text-align: left; }}
table.data {{ border-collapse: collapse; width: 100%; margin: 12px 0; font-size: 14px; }}
table.data th, table.data td {{ border: 1px solid {border}; padding: 6px 10px; }}
img.plot {{ max-width: 100%; height: auto; display: block; margin: 16px 0; border: 1px solid {border}; border-radius: 6px; }}
.caption {{ font-size: 13px; color: #444; margin-bottom: 24px; }}
.note {{ background: #f5f5f5; border-left: 4px solid {PALETTE["primary"]}; padding: 12px 16px; margin: 20px 0; }}
</style></head><body>
<h1>{title}</h1>
<table class="meta">
<tr><th>Best model (tuned)</th><td>{html.escape(_best_model_tuned_line(result))}</td></tr>
<tr><th>Tuning improvement</th><td>{html.escape(_tuning_improvement_line(result))}</td></tr>
<tr><th>SHAP analysis</th><td>{html.escape(_shap_status(result))}</td></tr>
<tr><th>Total pipeline steps</th><td>{_count_pipeline_steps(result)}</td></tr>
<tr><th>Target</th><td>{html.escape(str(result.get("target_col", "—")))}</td></tr>
<tr><th>Task</th><td>{html.escape(str(result.get("task_type", "—")))}</td></tr>
</table>
""",
    ]
    # 3. Training Plan — dataset profile, model selection, metric, budget.
    if result.get("plan"):
        plan = result["plan"]
        dp = plan.get("dataset_profile") or {}
        parts.append("<h2>3. Training Plan</h2>")
        parts.append("<h3>Dataset profile</h3><table class='data'>")
        parts.append(
            f"<tr><th>Rows</th><td>{html.escape(str(dp.get('n_rows', '—')))}</td></tr>"
            f"<tr><th>Features</th><td>{html.escape(str(dp.get('n_features', '—')))}</td></tr>"
            f"<tr><th>Size category</th><td>{html.escape(_dataset_size_label(dp))}</td></tr>"
            f"<tr><th>Imbalance ratio</th><td>{html.escape(str(dp.get('imbalance_ratio', '—')))}</td></tr>"
            "</table>"
        )
        parts.append("<h3>Models</h3><table class='data'><tr><th>Model</th><th>Selected</th><th>Reason</th></tr>")
        reasons = plan.get("skip_reasons") or {}
        for m in plan.get("recommended_models") or []:
            parts.append(
                "<tr><td>"
                + html.escape(str(m))
                + "</td><td>Yes</td><td>Included in the training plan.</td></tr>"
            )
        for m in plan.get("skip_models") or []:
            r = reasons.get(m, "Excluded by training plan rules.")
            parts.append(
                f"<tr><td>{html.escape(str(m))}</td><td>No</td><td>{html.escape(str(r))}</td></tr>"
            )
        parts.append("</table>")
        parts.append(
            f"<h3>Primary metric</h3><p><strong>{html.escape(str(plan.get('primary_metric', '—')))}</strong> — "
            f"{html.escape(str(plan.get('metric_reasoning', '')))}</p>"
        )
        parts.append(
            f"<h3>Tuning budget</h3><p>Trials: {html.escape(str(plan.get('n_trials', '—')))}, "
            f"timeout (s): {html.escape(str(plan.get('timeout', '—')))}</p>"
        )
        for w in plan.get("warnings") or []:
            parts.append(f"<p class='caption'>⚠️ {html.escape(str(w))}</p>")
    # 4. Model comparison table (only when a comparison frame exists).
    cdf = result.get("comparison_df")
    if cdf is not None:
        parts.append("<h2>4. Model comparison</h2>")
        parts.append(
            "<p class='caption'>CV Mean, CV Std, CV Train Mean, and CV Overfit are from k-fold "
            "cross-validation on the training data when sample size allows.</p>"
        )
        try:
            # cdf is presumably a pandas DataFrame — fall back to a <pre>
            # dump if to_html is unavailable or fails.
            parts.append(cdf.to_html(index=False, classes="data", border=0, escape=True))
        except Exception:
            parts.append(f"<pre>{html.escape(str(cdf))}</pre>")
    trh = result.get("train") or {}
    # Same CV-rationale lookup as in _build_markdown: strip the tuned suffix,
    # fall back to the task's conventional primary metric.
    bnh = str(result.get("best_model_name", "—")).replace(" (tuned)", "").strip()
    pmh = trh.get("metric_name") or (
        "roc_auc" if result.get("task_type") == "classification" else "r2"
    )
    for rh in trh.get("results") or []:
        if rh.get("name") == bnh:
            cvm = rh.get("cv_mean")
            cvs = rh.get("cv_std")
            nf = trh.get("cv_folds_used") or 5
            if cvm is not None and cvs is not None:
                parts.append(
                    "<p><strong>"
                    + html.escape(bnh)
                    + "</strong> was selected using cross-validated performance: CV mean "
                    f"{float(cvm):.4f} ± {float(cvs):.4f} across {nf} folds "
                    f"(primary metric: <code>{html.escape(str(pmh))}</code>).</p>"
                )
            break
    # 5. Hyperparameter tuning summary (or the tuning error message).
    tune = result.get("tune")
    if tune is not None:
        parts.append("<h2>5. Hyperparameter Tuning</h2>")
        if tune.get("success"):
            parts.append(
                "<table class='data'><tr><th>Baseline</th><td>"
                f"{html.escape(str(tune.get('baseline_score')))}</td></tr>"
                f"<tr><th>After tuning</th><td>{html.escape(str(tune.get('best_score')))}</td></tr>"
                f"<tr><th>Improvement</th><td>{html.escape(str(tune.get('improvement')))}</td></tr>"
                f"<tr><th>Trials</th><td>{html.escape(str(tune.get('n_trials_run')))}</td></tr>"
                f"<tr><th>Time (s)</th><td>{html.escape(str(tune.get('tuning_time_s')))}</td></tr>"
                f"<tr><th>Overfit flag</th><td>{html.escape(str(tune.get('overfit')))}</td></tr></table>"
            )
            bp = tune.get("best_params") or {}
            if bp:
                parts.append("<h3>Best hyperparameters</h3><table class='data'><tr><th>Parameter</th><th>Value</th></tr>")
                # Cap at 48 rows so a huge search space cannot bloat the report.
                for k, v in sorted(bp.items())[:48]:
                    parts.append(
                        f"<tr><td>{html.escape(str(k))}</td><td>{html.escape(repr(v))}</td></tr>"
                    )
                parts.append("</table>")
        else:
            parts.append(f"<p>{html.escape(str(tune.get('error', 'Unknown error')))}</p>")
    # 6. Raw metrics as a <pre> block (Python repr, not JSON).
    parts.append("<h2>6. Evaluation metrics</h2>")
    parts.append(f"<pre>{html.escape(str(result.get('metrics', {})))}</pre>")
    # Figures: embed each known base plot as a base64 data URI; plots whose
    # files are missing/unreadable are skipped (see _embed_png).
    pp = result.get("plot_paths") or {}
    base_keys = [
        "confusion_matrix",
        "roc_curve",
        "actual_vs_predicted",
        "residuals",
        "feature_importance",
        "shap_bar",
        "shap_summary",
        "shap_waterfall",
    ]
    parts.append("<h2>Figures</h2>")
    embedded = 0
    for key in base_keys:
        path = pp.get(key)
        if not path:
            continue
        b64 = _embed_png(path)
        if not b64:
            continue
        parts.append(
            f"<h3>{html.escape(key.replace('_', ' ').title())}</h3>"
            f"<img class='plot' alt=\"{html.escape(key)}\" "
            f'src="data:image/png;base64,{b64}" />'
        )
        embedded += 1
    # SHAP per-feature dependence plots, each with an explanatory caption.
    dep_keys = sorted(k for k in pp if str(k).startswith("shap_dependence_"))
    if dep_keys:
        parts.append("<h2>SHAP Feature Deep Dive</h2>")
        parts.append(
            "<p>Dependence plots show how each top feature individually affects the model output on test samples. "
            "Color encodes the most interacting feature (when available).</p>"
        )
        for key in dep_keys:
            path = pp.get(key)
            if not path:
                continue
            b64 = _embed_png(path)
            if not b64:
                continue
            # Recover a human-readable feature name from the plot key.
            feat_part = str(key).replace("shap_dependence_", "", 1).replace("_", " ")
            parts.append(
                f"<h3>{html.escape(feat_part)}</h3>"
                f"<img class='plot' alt=\"{html.escape(key)}\" src=\"data:image/png;base64,{b64}\" />"
                "<p class='caption'>This plot shows how <strong>"
                f"{html.escape(feat_part)}</strong> affects the model prediction. "
                "Each dot is one test sample. The color shows the value of the most interacting feature.</p>"
            )
            embedded += 1
    parts.append(
        f"<p class='note'><strong>Plots embedded:</strong> {embedded}. "
        "The HTML report is fully self-contained — all plots are embedded. Safe to email or share.</p>"
    )
    # Recommendations and document close.
    parts.append("<h2>Recommended next steps</h2><ul>")
    for s in _generate_next_steps(result):
        parts.append(f"<li>{html.escape(s)}</li>")
    parts.append("</ul></body></html>")
    return "".join(parts)
def count_embedded_plots_html(html_str: str) -> int:
    """Count PNG plots embedded as base64 data URIs in a rendered HTML report."""
    marker = "data:image/png;base64,"
    return html_str.count(marker)