File size: 12,521 Bytes

6f90f5c

"""Feedback loop for eval agent — computes effectiveness of auxiliary metrics."""

from __future__ import annotations

import json
import logging
import math
from pathlib import Path
from typing import Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Statistics helpers (no scipy dependency)
# ---------------------------------------------------------------------------

def _rank(values: List[float]) -> List[float]:
    """Assign ranks to values (average rank for ties)."""
    n = len(values)
    indexed = sorted(range(n), key=lambda i: values[i])
    ranks = [0.0] * n
    i = 0
    while i < n:
        j = i
        while j < n - 1 and values[indexed[j + 1]] == values[indexed[j]]:
            j += 1
        avg_rank = (i + j) / 2.0 + 1  # 1-based
        for k in range(i, j + 1):
            ranks[indexed[k]] = avg_rank
        i = j + 1
    return ranks


def _spearman_correlation(x: List[float], y: List[float]) -> float:
    """Spearman rank correlation without scipy."""
    n = len(x)
    if n < 3:
        return 0.0
    rx = _rank(x)
    ry = _rank(y)
    d_sq = sum((a - b) ** 2 for a, b in zip(rx, ry))
    denom = n * (n * n - 1)
    if denom == 0:
        return 0.0
    return 1.0 - (6.0 * d_sq) / denom


def _variance(values: List[float]) -> float:
    """Population variance."""
    n = len(values)
    if n < 2:
        return 0.0
    mean = sum(values) / n
    return sum((v - mean) ** 2 for v in values) / n


def _linear_trend(values: List[float]) -> float:
    """Slope of linear regression (values indexed 0..n-1)."""
    n = len(values)
    if n < 2:
        return 0.0
    x_mean = (n - 1) / 2.0
    y_mean = sum(values) / n
    num = sum((i - x_mean) * (v - y_mean) for i, v in enumerate(values))
    den = sum((i - x_mean) ** 2 for i in range(n))
    if den == 0:
        return 0.0
    return num / den


# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------

def _load_generation_metrics(
    results_dir: Path, gen_start: int, gen_end: int
) -> List[Tuple[int, Dict]]:
    """Load metrics.json for a range of generations. Returns [(gen, data), ...]."""
    records = []
    for gen in range(gen_start, gen_end + 1):
        metrics_path = results_dir / f"gen_{gen}" / "results" / "metrics.json"
        if not metrics_path.exists():
            continue
        try:
            with open(metrics_path) as f:
                data = json.load(f)
            records.append((gen, data))
        except Exception:
            continue
    return records


def _extract_aux_metric_names(records: List[Tuple[int, Dict]]) -> List[str]:
    """Collect all aux_ metric names from public metrics across records.
    Filters out framework metrics (aux_aux_metric_*) that are always present."""
    FRAMEWORK_PREFIXES = ("aux_aux_metric_eval_success", "aux_aux_metric_error_code",
                          "aux_aux_metric_error_message_length", "aux_aux_metric_error_detail_length",
                          "aux_aux_metric_non_numeric_dropped_count")
    names: set[str] = set()
    for _, data in records:
        public = data.get("public", {})
        for key in public:
            if key.startswith("aux_") and key not in FRAMEWORK_PREFIXES:
                names.add(key)
    return sorted(names)


# ---------------------------------------------------------------------------
# Feedback computation
# ---------------------------------------------------------------------------

def _classify_correlation(rho: float) -> Tuple[str, str]:
    """Classify correlation strength. Returns (label, emoji)."""
    abs_rho = abs(rho)
    if abs_rho >= 0.6:
        if rho > 0:
            return "strong positive", "USEFUL"
        return "strong negative", "MISLEADING"
    if abs_rho >= 0.3:
        if rho > 0:
            return "moderate positive", "USEFUL"
        return "moderate negative", "REVIEW"
    return "weak/none", "LOW SIGNAL"


def compute_metric_feedback(
    results_dir: Path,
    current_gen: int,
    last_agent_gen: int,
    pre_window: int = 10,
) -> str:
    """
    Compute feedback on auxiliary metrics effectiveness since last agent intervention.

    Args:
        results_dir: Experiment root directory.
        current_gen: Current generation number.
        last_agent_gen: Generation when agent last ran.
        pre_window: Number of generations before agent intervention to compare.

    Returns:
        Formatted markdown string for inclusion in agent task message.
        Empty string if insufficient data.
    """
    if last_agent_gen < 0 or current_gen <= last_agent_gen:
        return ""

    results_dir = Path(results_dir)

    # Load post-intervention data (last_agent_gen+1 .. current_gen)
    post_records = _load_generation_metrics(results_dir, last_agent_gen + 1, current_gen)
    if len(post_records) < 2:
        return ""  # Not enough data for meaningful feedback

    # Load pre-intervention data for trajectory comparison
    pre_start = max(0, last_agent_gen - pre_window)
    pre_records = _load_generation_metrics(results_dir, pre_start, last_agent_gen)

    # --- Section 1: Score trajectory comparison ---
    trajectory_lines = _build_trajectory_section(pre_records, post_records, last_agent_gen)

    # --- Section 2: Per-metric effectiveness ---
    aux_names = _extract_aux_metric_names(post_records)
    if not aux_names:
        # No auxiliary metrics to evaluate
        if trajectory_lines:
            return (
                f"\n\n📋 Feedback on Previous Actions (gen {last_agent_gen} → gen {current_gen}):\n\n"
                + "\n".join(trajectory_lines)
                + "\n\nNo auxiliary metrics found in recent generations.\n"
            )
        return ""

    metric_lines = _build_metric_effectiveness_section(post_records, aux_names)

    # --- Section 3: Actionable recommendation ---
    recommendation = _build_recommendation(post_records, aux_names)

    # Assemble
    parts = [
        f"\n\n📋 Feedback on Your Previous Actions (gen {last_agent_gen} → gen {current_gen}):",
        "",
    ]
    if trajectory_lines:
        parts.extend(trajectory_lines)
        parts.append("")
    parts.extend(metric_lines)
    if recommendation:
        parts.append("")
        parts.append(recommendation)

    return "\n".join(parts)


def _build_trajectory_section(
    pre_records: List[Tuple[int, Dict]],
    post_records: List[Tuple[int, Dict]],
    last_agent_gen: int,
) -> List[str]:
    """Build score trajectory comparison lines."""
    lines: List[str] = ["Score Trajectory:"]

    pre_scores = [d.get("combined_score", 0.0) for _, d in pre_records if isinstance(d.get("combined_score"), (int, float))]
    post_scores = [d.get("combined_score", 0.0) for _, d in post_records if isinstance(d.get("combined_score"), (int, float))]

    if pre_scores:
        pre_avg = sum(pre_scores) / len(pre_scores)
        pre_trend = _linear_trend(pre_scores)
        lines.append(f"- Before intervention ({len(pre_scores)} gens): avg {pre_avg:.4f}, trend {pre_trend:+.4f}/gen")

    if post_scores:
        post_avg = sum(post_scores) / len(post_scores)
        post_trend = _linear_trend(post_scores)
        lines.append(f"- After intervention ({len(post_scores)} gens): avg {post_avg:.4f}, trend {post_trend:+.4f}/gen")

    if pre_scores and post_scores:
        delta = post_avg - pre_avg
        lines.append(f"- Net impact: avg score {'improved' if delta > 0 else 'declined'} by {delta:+.4f}")
        # Score decline warning
        if pre_avg > 0 and delta < 0 and abs(delta) / pre_avg > 0.05:
            lines.append("")
            lines.append("⚠️ WARNING: Score has DECLINED significantly since agent intervention.")
            lines.append("Consider simplifying or clearing auxiliary metrics to eliminate potential interference.")
            lines.append("Focus diagnostic_report.md on algorithmic advice rather than adding new metrics.")

    return lines if len(lines) > 1 else []


def _build_metric_effectiveness_section(
    post_records: List[Tuple[int, Dict]],
    aux_names: List[str],
) -> List[str]:
    """Build per-metric effectiveness table."""
    scores = []
    for _, data in post_records:
        cs = data.get("combined_score")
        if isinstance(cs, (int, float)) and math.isfinite(cs):
            scores.append(cs)
        else:
            scores.append(None)

    lines = [
        "Metric Effectiveness Report:",
        "| Metric | Corr w/ Score | Variance | Coverage | Signal |",
        "|--------|--------------|----------|----------|--------|",
    ]

    for name in aux_names:
        metric_vals: List[Optional[float]] = []
        for _, data in post_records:
            v = data.get("public", {}).get(name)
            if isinstance(v, (int, float)) and math.isfinite(v):
                metric_vals.append(float(v))
            else:
                metric_vals.append(None)

        # Paired non-None values for correlation
        paired_m = []
        paired_s = []
        for mv, sv in zip(metric_vals, scores):
            if mv is not None and sv is not None:
                paired_m.append(mv)
                paired_s.append(sv)

        n_total = len(metric_vals)
        n_present = sum(1 for v in metric_vals if v is not None)
        coverage = n_present / n_total if n_total > 0 else 0.0

        if len(paired_m) >= 3:
            rho = _spearman_correlation(paired_m, paired_s)
            var = _variance(paired_m)
            label, signal = _classify_correlation(rho)
            # Override signal if variance is near zero
            if var < 1e-9:
                signal = "LOW SIGNAL"
            lines.append(
                f"| {name} | {rho:+.2f} ({label}) | {var:.4f} | {coverage:.0%} | {signal} |"
            )
        else:
            lines.append(
                f"| {name} | N/A (< 3 pts) | N/A | {coverage:.0%} | INSUFFICIENT |"
            )

    return lines


def _build_recommendation(
    post_records: List[Tuple[int, Dict]],
    aux_names: List[str],
) -> str:
    """Build actionable recommendation based on metric analysis."""
    scores = [
        data.get("combined_score", 0.0)
        for _, data in post_records
        if isinstance(data.get("combined_score"), (int, float))
    ]

    to_remove: List[str] = []
    to_review: List[str] = []

    for name in aux_names:
        paired_m = []
        paired_s = []
        for _, data in post_records:
            v = data.get("public", {}).get(name)
            cs = data.get("combined_score")
            if (
                isinstance(v, (int, float)) and math.isfinite(v)
                and isinstance(cs, (int, float)) and math.isfinite(cs)
            ):
                paired_m.append(float(v))
                paired_s.append(float(cs))

        if len(paired_m) < 3:
            continue

        rho = _spearman_correlation(paired_m, paired_s)
        var = _variance(paired_m)

        if var < 1e-9:
            # Distinguish: constant because code broken vs constant because metric is bad
            all_scores_zero = all(s <= 0.01 for s in paired_s)
            if all_scores_zero:
                # Don't recommend removal — metric is constant because code never worked
                pass  # skip this metric, it may become useful when code starts working
            else:
                to_remove.append(f"{name} (constant value, no signal)")
        elif rho < -0.3:
            to_review.append(f"{name} (negative correlation {rho:+.2f}, may mislead optimizer)")
        elif abs(rho) < 0.1:
            to_remove.append(f"{name} (near-zero correlation {rho:+.2f})")

    parts: List[str] = []
    if to_remove:
        parts.append("Consider REMOVING: " + "; ".join(to_remove) + ".")
    if to_review:
        parts.append("Consider REVIEWING: " + "; ".join(to_review) + ".")
    if not parts:
        parts.append("All current metrics show reasonable signal. Focus on diagnostic_report.md rather than adding new metrics.")

    # Churn warning: if most metrics are flagged for removal, the agent is likely churning
    if len(to_remove) >= 3:
        parts.append(
            "⚠️ CHURN WARNING: Most metrics have low signal. "
            "Stop modifying metrics and focus on writing a useful diagnostic_report.md instead."
        )

    return "Recommendation: " + " ".join(parts)