"""
Human evaluation of recommendation explanations.

Generates 50 samples from the recommendation pipeline, presents them
interactively for Likert-scale rating, and computes aggregate metrics.

Dimensions (1-5 Likert scale):
    Comprehension: "I understood why this item was recommended"
    Trust:         "I trust this explanation is accurate"
    Usefulness:    "This explanation helped me make a decision"
    Satisfaction:  "I am satisfied with this explanation"

Usage:
    python scripts/human_eval.py --generate   # Generate 50 samples
    python scripts/human_eval.py --annotate   # Rate samples (resumable)
    python scripts/human_eval.py --analyze    # Compute results
    python scripts/human_eval.py --status     # Show progress

Run from project root.
"""

import argparse
import json
import math
import sys
from datetime import datetime

from sage.core import AggregationMethod
from sage.config import (
    DATA_DIR,
    EVAL_DIMENSIONS,
    EVALUATION_QUERIES,
    HELPFULNESS_TARGET,
    MAX_EVIDENCE,
    RESULTS_DIR,
    get_logger,
    log_banner,
    save_results,
)

logger = get_logger(__name__)

SAMPLES_DIR = DATA_DIR / "human_eval"
SAMPLES_FILE = SAMPLES_DIR / "samples.json"

TARGET_SAMPLES = 50
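# Target mix: 35 category-balanced natural queries + 15 config queries (see selection helpers below).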
NATURAL_QUERIES_FILE = DATA_DIR / "eval" / "eval_natural_queries.json"


# ============================================================================
# Sample Generation
# ============================================================================


def _select_diverse_natural_queries(target: int = 35) -> list[str]:
    """Select diverse queries from natural eval dataset, balanced by category."""
    if not NATURAL_QUERIES_FILE.exists():
        logger.error(
            "Natural queries file not found: %s  "
            "Run 'make eval' first to build eval datasets.",
            NATURAL_QUERIES_FILE,
        )
        return []

    with open(NATURAL_QUERIES_FILE, encoding="utf-8") as f:
        data = json.load(f)

    # Group by category
    by_category: dict[str, list[str]] = {}
    for item in data:
        cat = item["category"]
        by_category.setdefault(cat, []).append(item["query"])

    if not by_category:
        return []

    # Round-robin across categories
    selected = []
    categories = sorted(by_category.keys())
    max_cat_len = max(len(v) for v in by_category.values())
    idx = 0
    while len(selected) < target and idx < max_cat_len:
        for cat in categories:
            queries = by_category[cat]
            if idx < len(queries) and len(selected) < target:
                q = queries[idx]
                if q not in selected:
                    selected.append(q)
        idx += 1

    return selected


def _select_config_queries(exclude: set[str], target: int = 15) -> list[str]:
    """Select queries from EVALUATION_QUERIES config, excluding duplicates."""
    selected = []
    for q in EVALUATION_QUERIES:
        if q not in exclude and len(selected) < target:
            selected.append(q)
    return selected


def generate_samples(force: bool = False, seed: int = 42):
    """Generate recommendation+explanation samples for human evaluation."""
    import random

    from sage.services import get_explanation_services
    from sage.services.retrieval import get_candidates

    # Protect existing rated samples from accidental overwrite
    if SAMPLES_FILE.exists() and not force:
        with open(SAMPLES_FILE, encoding="utf-8") as f:
            existing = json.load(f)
        rated = sum(1 for s in existing if s.get("rating") is not None)
        if rated > 0:
            logger.error(
                "%s contains %d rated samples. "
                "Use --force to overwrite, or run --annotate to continue.",
                SAMPLES_FILE,
                rated,
            )
            sys.exit(1)

    SAMPLES_DIR.mkdir(parents=True, exist_ok=True)
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    log_banner(logger, "GENERATING HUMAN EVAL SAMPLES")
    logger.info("Random seed: %d", seed)

    # Set seed for reproducibility
    random.seed(seed)

    # Select diverse query set
    natural = _select_diverse_natural_queries(35)
    config = _select_config_queries(set(natural), 15)
    all_queries = natural + config

    # Shuffle with seeded random for reproducibility
    random.shuffle(all_queries)
    logger.info(
        "Queries: %d natural + %d config = %d total",
        len(natural),
        len(config),
        len(all_queries),
    )

    if len(all_queries) < TARGET_SAMPLES:
        logger.warning(
            "Only %d unique queries available (target: %d). "
            "Results will lack statistical power. "
            "Run 'make eval' to build natural query dataset.",
            len(all_queries),
            TARGET_SAMPLES,
        )

    # Initialize services
    explainer, detector = get_explanation_services()

    samples = []
    for i, query in enumerate(all_queries, 1):
        logger.info('[%d/%d] "%s"', i, len(all_queries), query)

        products = get_candidates(
            query=query,
            k=1,
            min_rating=4.0,
            aggregation=AggregationMethod.MAX,
        )
        if not products:
            logger.info("  No products found, skipping")
            continue

        product = products[0]
        try:
            expl = explainer.generate_explanation(
                query,
                product,
                max_evidence=MAX_EVIDENCE,
            )
            hhem = detector.check_explanation(
                expl.evidence_texts,
                expl.explanation,
            )

            sample = {
                "id": len(samples) + 1,
                "query": query,
                "product_id": product.product_id,
                "avg_rating": round(product.avg_rating, 1),
                "explanation": expl.explanation,
                "evidence": expl.to_evidence_dicts(),
                "hhem_score": round(hhem.score, 4),
                "rating": None,
            }
            samples.append(sample)
            logger.info(
                "  %s (%.1f stars) HHEM=%.3f",
                product.product_id,
                product.avg_rating,
                hhem.score,
            )
        except ValueError as exc:
            logger.info("  Quality gate refusal: %s", exc)
        except Exception:
            logger.exception("  Error generating sample")

    # Save
    with open(SAMPLES_FILE, "w", encoding="utf-8") as f:
        json.dump(samples, f, indent=2)

    logger.info("Generated %d samples -> %s", len(samples), SAMPLES_FILE)
    return samples


# ============================================================================
# Interactive Annotation
# ============================================================================


def _load_samples() -> list[dict]:
    """Load samples from disk."""
    if not SAMPLES_FILE.exists():
        logger.error("No samples file. Run --generate first.")
        sys.exit(1)

    with open(SAMPLES_FILE, encoding="utf-8") as f:
        return json.load(f)


def _save_samples(samples: list[dict]):
    """Save samples back to disk."""
    with open(SAMPLES_FILE, "w", encoding="utf-8") as f:
        json.dump(samples, f, indent=2)


def _get_likert_input(prompt: str) -> int:
    """Prompt user for a 1-5 Likert rating. Returns rating or raises KeyboardInterrupt."""
    while True:
        try:
            raw = input(f"  {prompt} [1-5]: ").strip()
        except EOFError:
            raise KeyboardInterrupt
        if raw in ("1", "2", "3", "4", "5"):
            return int(raw)
        print("    Enter a number from 1 to 5.")


def annotate_samples():
    """Interactive CLI loop for rating samples."""
    samples = _load_samples()
    total = len(samples)
    rated = sum(1 for s in samples if s["rating"] is not None)
    unrated = [s for s in samples if s["rating"] is None]

    log_banner(logger, "HUMAN EVALUATION ANNOTATION")
    print(f"\nProgress: {rated}/{total} rated, {len(unrated)} remaining\n")

    if not unrated:
        print("All samples have been rated. Run --analyze to compute results.")
        return

    print("Rate each dimension from 1 (strongly disagree) to 5 (strongly agree).")
    print("Press Ctrl+C to save progress and quit.\n")
    print("-" * 60)

    try:
        for sample in unrated:
            rated = sum(1 for s in samples if s["rating"] is not None)
            print(f"\n--- Sample {sample['id']} ({rated + 1}/{total}) ---\n")

            # Display product and query
            print(f"PRODUCT: {sample['product_id']}  ({sample['avg_rating']} stars)")
            print(f"QUERY:   {sample['query']}\n")

            # Display explanation
            print(f"EXPLANATION:\n{sample['explanation']}\n")

            # Display evidence (truncated)
            print("EVIDENCE:")
            for ev in sample["evidence"]:
                text = ev["text"]
                if len(text) > 200:
                    text = text[:200] + "..."
                print(f'  [{ev["id"]}]: "{text}"')
            print()

            # Collect ratings
            rating = {}
            for dim_key, dim_prompt in EVAL_DIMENSIONS.items():
                rating[dim_key] = _get_likert_input(dim_prompt)

            sample["rating"] = rating
            _save_samples(samples)
            scores_str = ", ".join(f"{k}={v}" for k, v in rating.items())
            print(f"  -> Saved ({scores_str})")
            print("-" * 60)

    except KeyboardInterrupt:
        _save_samples(samples)
        rated_now = sum(1 for s in samples if s["rating"] is not None)
        print(f"\n\nProgress saved: {rated_now}/{total} rated.")
        print("Run --annotate again to continue.")


# ============================================================================
# Analysis
# ============================================================================


def analyze_results():
    """Compute aggregate metrics from rated samples."""
    samples = _load_samples()
    rated = [s for s in samples if s["rating"] is not None]

    log_banner(logger, "HUMAN EVALUATION ANALYSIS")

    if not rated:
        logger.error("No rated samples. Run --annotate first.")
        return None

    logger.info("Rated samples: %d/%d", len(rated), len(samples))

    # Per-dimension statistics
    dimensions_results = {}
    for dim_key in EVAL_DIMENSIONS:
        scores = [s["rating"][dim_key] for s in rated]
        n = len(scores)
        mean = sum(scores) / n
        variance = sum((x - mean) ** 2 for x in scores) / (n - 1) if n > 1 else 0.0
        std = variance**0.5
        dimensions_results[dim_key] = {
            "mean": round(mean, 2),
            "std": round(std, 2),
            "min": min(scores),
            "max": max(scores),
        }
        logger.info(
            "  %-15s mean=%.2f  std=%.2f  range=[%d, %d]",
            dim_key + ":",
            mean,
            std,
            min(scores),
            max(scores),
        )

    # Overall helpfulness: mean of per-sample averages
    per_sample_means = []
    for s in rated:
        r = s["rating"]
        sample_mean = sum(r[k] for k in EVAL_DIMENSIONS) / len(EVAL_DIMENSIONS)
        per_sample_means.append(sample_mean)
    overall = sum(per_sample_means) / len(per_sample_means)
    passed = overall >= HELPFULNESS_TARGET

    logger.info("")
    logger.info(
        "Overall helpfulness: %.2f (target: %.1f) [%s]",
        overall,
        HELPFULNESS_TARGET,
        "PASS" if passed else "FAIL",
    )

    # HHEM vs Trust correlation (Spearman)
    correlation = _compute_hhem_trust_correlation(rated)
    if correlation:
        logger.info(
            "HHEM-Trust correlation: r=%.3f, p=%.4f",
            correlation["spearman_r"],
            correlation["p_value"],
        )

    # Save results
    results = {
        "timestamp": datetime.now().isoformat(),
        "n_samples": len(rated),
        "n_total": len(samples),
        "methodology": {
            "evaluator": "Single rater (developer/researcher)",
            "instructions": "Rate each dimension 1-5 Likert: 1=strongly disagree, 5=strongly agree",
            "dimensions": {
                "comprehension": "I understood why this item was recommended",
                "trust": "I trust this explanation is accurate",
                "usefulness": "This explanation helped me make a decision",
                "satisfaction": "I am satisfied with this explanation",
            },
            "sample_selection": "35 natural queries (balanced by category) + 15 config queries",
            "inter_annotator_agreement": "N/A (single rater)",
        },
        "dimensions": dimensions_results,
        "overall_helpfulness": round(overall, 2),
        "target": HELPFULNESS_TARGET,
        "pass": passed,
    }
    if correlation:
        results["hhem_trust_correlation"] = correlation

    ts_file = save_results(results, "human_eval")
    logger.info("Saved: %s", ts_file)

    return results


def _compute_hhem_trust_correlation(rated: list[dict]) -> dict | None:
    """Compute Spearman correlation between HHEM score and trust rating."""
    hhem_scores = [s["hhem_score"] for s in rated]
    trust_scores = [s["rating"]["trust"] for s in rated]

    if len(set(hhem_scores)) < 2 or len(set(trust_scores)) < 2:
        return None

    try:
        from scipy.stats import spearmanr

        r, p = spearmanr(hhem_scores, trust_scores)
        return {"spearman_r": round(float(r), 4), "p_value": round(float(p), 4)}
    except ImportError:
        # Fall back: compute rank correlation manually
        return _manual_spearman(hhem_scores, trust_scores)


def _manual_spearman(x: list[float], y: list[float]) -> dict | None:
    """Rank-based Spearman without scipy."""
    n = len(x)
    if n < 3:
        return None

    def _rank(vals):
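        # Ties receive the average of the ranks they span (fractional ranking).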
        order = sorted(range(n), key=lambda i: vals[i])
        ranks = [0.0] * n
        i = 0
        while i < n:
            j = i
            while j < n - 1 and vals[order[j + 1]] == vals[order[j]]:
                j += 1
            avg_rank = (i + j) / 2 + 1
            for k in range(i, j + 1):
                ranks[order[k]] = avg_rank
            i = j + 1
        return ranks

    rx = _rank(x)
    ry = _rank(y)

    d_sq = sum((rx[i] - ry[i]) ** 2 for i in range(n))
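    # Classic formula rho = 1 - 6*sum(d^2) / (n*(n^2 - 1)); exact only when there are no ties.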
    rho = 1 - (6 * d_sq) / (n * (n**2 - 1))

    # Approximate p-value via t-distribution (large sample)
    if abs(rho) >= 1.0:
        p = 0.0
    else:
        t = rho * math.sqrt((n - 2) / (1 - rho**2))
        # Two-tailed p-value approximation
        p = 2 * (1 - _t_cdf_approx(abs(t), n - 2))

    return {"spearman_r": round(rho, 4), "p_value": round(max(p, 0.0), 4)}


def _t_cdf_approx(t: float, df: int) -> float:
    """Rough t-distribution CDF approximation (good enough for p < 0.05 checks)."""
    # The exact t CDF involves the regularized incomplete beta function;
    # here we fall back on normal-based approximations.
    # For df > 30, the normal approximation (with a small correction) is fine
    if df > 30:
        z = t * (1 - 1 / (4 * df))
        return 0.5 * (1 + math.erf(z / math.sqrt(2)))
    # For smaller df, use a rougher bound
    return 0.5 * (1 + math.erf(t / math.sqrt(2 + t * t / df)))


# ============================================================================
# Status
# ============================================================================


def show_status():
    """Show annotation progress."""
    if not SAMPLES_FILE.exists():
        print("No samples generated yet. Run --generate first.")
        return

    samples = _load_samples()
    total = len(samples)
    rated = sum(1 for s in samples if s["rating"] is not None)
    print(f"Human Evaluation Status: {rated}/{total} samples rated")

    if rated == total:
        print("All samples rated. Run --analyze to compute results.")
    elif rated > 0:
        print(f"  {total - rated} remaining. Run --annotate to continue.")
    else:
        print("  No ratings yet. Run --annotate to start.")


# ============================================================================
# Main
# ============================================================================


def main():
    parser = argparse.ArgumentParser(
        description="Human evaluation of recommendation explanations",
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--generate", action="store_true", help="Generate recommendation samples"
    )
    group.add_argument(
        "--annotate", action="store_true", help="Rate samples interactively (resumable)"
    )
    group.add_argument(
        "--analyze", action="store_true", help="Compute aggregate results from ratings"
    )
    group.add_argument("--status", action="store_true", help="Show annotation progress")
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite existing rated samples (with --generate)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for query selection (with --generate)",
    )
    args = parser.parse_args()

    if args.force and not args.generate:
        parser.error("--force can only be used with --generate")

    if args.seed != 42 and not args.generate:
        parser.error("--seed can only be used with --generate")

    if args.generate:
        generate_samples(force=args.force, seed=args.seed)
    elif args.annotate:
        annotate_samples()
    elif args.analyze:
        analyze_results()
    elif args.status:
        show_status()


if __name__ == "__main__":
    main()