| | |
| | from __future__ import annotations |
| |
|
| | import argparse |
| | import csv |
| | import json |
| | import subprocess |
| | from pathlib import Path |
| | from typing import Any |
| |
|
# Repository root: the directory one level above the folder holding this file.
ROOT = Path(__file__).resolve().parents[1]

# Scorer scripts that are launched as subprocesses for each run.
CHALLENGE_SCRIPT = ROOT.joinpath('scripts', 'score_hf_hub_community_challenges.py')
COVERAGE_SCRIPT = ROOT.joinpath('scripts', 'score_hf_hub_community_coverage.py')

# Default destination for per-run artifacts, summaries and plots.
OUT_DIR = ROOT.joinpath('docs', 'hf_hub_prompt_ab')
| |
|
| |
|
def parse_variants(raw: str) -> list[tuple[str, Path]]:
    """Parse a comma-separated ``id=cards_dir`` list into ordered pairs.

    Whitespace around entries, ids and paths is stripped, and empty entries
    (for example a trailing comma) are ignored. Only the first ``=`` of an
    entry separates the id from the path, so paths may contain ``=``.

    Args:
        raw: Text such as ``"baseline=cards/a,compact=cards/b"``.

    Returns:
        ``(variant_id, cards_dir)`` tuples in the order they were given.

    Raises:
        ValueError: If an entry has no ``=``, has an empty id or path, repeats
            an id already seen, or if fewer than two variants remain.
    """
    out: list[tuple[str, Path]] = []
    seen: set[str] = set()
    for chunk in raw.split(','):
        chunk = chunk.strip()
        if not chunk:
            continue
        vid, sep, path = chunk.partition('=')
        vid, path = vid.strip(), path.strip()
        # An empty path would silently become Path('.'), and an empty id would
        # produce an unnamed output directory, so reject both up front.
        if not sep or not vid or not path:
            raise ValueError(f"Invalid variant '{chunk}'. Use id=/path/to/cards")
        # Ids are used as output directory names and as lookup keys, so a
        # repeated id would overwrite the earlier variant's results.
        if vid in seen:
            raise ValueError(f"Duplicate variant id '{vid}'.")
        seen.add(vid)
        out.append((vid, Path(path)))
    if len(out) < 2:
        raise ValueError('Provide at least two variants.')
    return out
| |
|
| |
|
def run_cmd(cmd: list[str]) -> None:
    """Echo *cmd* to stdout, then run it and wait for it to finish.

    Args:
        cmd: Argument vector passed to ``subprocess.run`` (no shell involved).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    # Flush explicitly: when stdout is a pipe or file it is block-buffered, and
    # without the flush the echo would show up after the child's own output.
    print('\n[run]', ' '.join(cmd), flush=True)
    subprocess.run(cmd, check=True)
| |
|
| |
|
def _scorer_cmd(
    script: Path,
    *,
    model: str,
    agent: str,
    cards_dir: Path,
    timeout: int,
    raw_dir: Path,
    json_out: Path,
    md_out: Path,
) -> list[str]:
    """Build the argument vector shared by both scorer scripts."""
    import sys  # local import: the file's import block does not provide ``sys``

    return [
        # Prefer the interpreter running this script so the scorer sees the
        # same environment; fall back to PATH lookup if it is unknown.
        sys.executable or 'python', str(script),
        '--model', model,
        '--agent', agent,
        '--agent-cards', str(cards_dir),
        '--timeout', str(timeout),
        '--raw-results-dir', str(raw_dir),
        '--json-out', str(json_out),
        '--md-out', str(md_out),
    ]


def run_variant_model(
    *,
    variant_id: str,
    cards_dir: Path,
    model: str,
    agent: str,
    timeout: int,
    out_dir: Path,
    challenge_start: int | None,
    challenge_end: int | None,
    coverage_cases: Path | None,
) -> dict[str, Any]:
    """Run both scorer scripts for one (variant, model) pair and summarize them.

    Artifacts are written under ``out_dir / variant_id / <model>``, where any
    ``/`` in the model id is replaced by ``_``. After both scripts finish,
    their JSON outputs are read back and condensed into a single row.

    Args:
        variant_id: Label of the prompt variant; used as a directory name.
        cards_dir: Directory passed to the scorers as ``--agent-cards``.
        model: Model id passed as ``--model``.
        agent: Agent name passed as ``--agent``.
        timeout: Value passed as ``--timeout``.
        out_dir: Root directory for all artifacts.
        challenge_start: Optional ``--start`` for the challenge scorer.
        challenge_end: Optional ``--end`` for the challenge scorer.
        coverage_cases: Optional ``--cases`` file for the coverage scorer.

    Returns:
        A dict with the variant, model, case counts, average scores, coverage
        match rates, combined tool-call and token totals, the composite score,
        and the paths of the two JSON result files.

    Raises:
        subprocess.CalledProcessError: If a scorer script exits non-zero.
        KeyError: If a challenge row has no ``score.total`` entry.
    """
    vm_dir = out_dir / variant_id / model.replace('/', '_')
    vm_dir.mkdir(parents=True, exist_ok=True)

    ch_json = vm_dir / 'challenges.json'
    cov_json = vm_dir / 'coverage.json'

    ch_cmd = _scorer_cmd(
        CHALLENGE_SCRIPT,
        model=model,
        agent=agent,
        cards_dir=cards_dir,
        timeout=timeout,
        raw_dir=vm_dir / 'raw' / 'challenges',
        json_out=ch_json,
        md_out=vm_dir / 'challenges.md',
    )
    if challenge_start is not None:
        ch_cmd.extend(['--start', str(challenge_start)])
    if challenge_end is not None:
        ch_cmd.extend(['--end', str(challenge_end)])
    run_cmd(ch_cmd)

    cov_cmd = _scorer_cmd(
        COVERAGE_SCRIPT,
        model=model,
        agent=agent,
        cards_dir=cards_dir,
        timeout=timeout,
        raw_dir=vm_dir / 'raw' / 'coverage',
        json_out=cov_json,
        md_out=vm_dir / 'coverage.md',
    )
    if coverage_cases is not None:
        cov_cmd.extend(['--cases', str(coverage_cases)])
    run_cmd(cov_cmd)

    ch_rows = json.loads(ch_json.read_text(encoding='utf-8'))
    cov_payload = json.loads(cov_json.read_text(encoding='utf-8'))
    # ``or`` (rather than a .get default) also covers explicit JSON nulls.
    cov_summary = cov_payload.get('summary') or {}
    cov_rows = cov_payload.get('rows') or []

    n_ch = len(ch_rows)
    # A missing score is left as a hard error: defaulting it to 0 would
    # quietly drag the average down.
    ch_total = sum(r['score']['total'] for r in ch_rows) if n_ch else 0
    ch_avg = (ch_total / n_ch) if n_ch else 0.0
    ch_tool_calls = sum(int(r.get('tool_calls_count') or 0) for r in ch_rows)
    ch_tokens = sum(int((r.get('usage') or {}).get('total_tokens') or 0) for r in ch_rows)

    cov_n = int(cov_summary.get('n_cases') or 0)
    cov_avg_score = float(cov_summary.get('avg_score_total') or 0.0)
    cov_endpoint_rate = float(cov_summary.get('endpoint_match_rate') or 0.0)
    cov_method_rate = float(cov_summary.get('method_match_rate') or 0.0)

    # Usage accounting is best-effort, matching the challenge rows above: a
    # case without usage data counts as zero instead of aborting the run
    # after both scorers have already completed.
    cov_tool_calls = 0
    cov_tokens = 0
    for r in cov_rows:
        observed = r.get('observed') or {}
        cov_tool_calls += int(observed.get('tool_calls_count') or 0)
        cov_tokens += int((observed.get('usage') or {}).get('total_tokens') or 0)

    # Weighted blend: 60% challenge average (divided by 10; presumably a 0-10
    # scale - confirm against the challenge scorer), 30% endpoint match rate,
    # 10% method match rate.
    composite = (0.6 * (ch_avg / 10.0)) + (0.3 * cov_endpoint_rate) + (0.1 * cov_method_rate)

    return {
        'variant': variant_id,
        'model': model,
        'challenge_cases': n_ch,
        'challenge_avg_score': round(ch_avg, 3),
        'coverage_cases': cov_n,
        'coverage_avg_score': round(cov_avg_score, 3),
        'coverage_endpoint_rate': round(cov_endpoint_rate, 4),
        'coverage_method_rate': round(cov_method_rate, 4),
        'total_tool_calls': ch_tool_calls + cov_tool_calls,
        'total_tokens': ch_tokens + cov_tokens,
        'composite': round(composite, 4),
        'paths': {
            'challenges_json': str(ch_json),
            'coverage_json': str(cov_json),
        },
    }
| |
|
| |
|
def write_outputs(rows: list[dict[str, Any]], out_dir: Path) -> None:
    """Write the A/B summary as JSON, CSV and Markdown into *out_dir*.

    The Markdown report holds a ranking table (best composite first, ties
    broken by fewer tokens, then variant, then model). It is followed by one
    pairwise-delta section per remaining variant, each compared against the
    alphabetically first variant. Models lacking a row for either side of a
    comparison are left out of that section.

    Args:
        rows: Summary rows containing at least the keys listed in ``fields``
            below; for the CSV, a missing key is written as an empty cell.
        out_dir: Destination directory; created if it does not exist.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    json_path = out_dir / 'prompt_ab_summary.json'
    csv_path = out_dir / 'prompt_ab_summary.csv'
    md_path = out_dir / 'prompt_ab_summary.md'

    json_path.write_text(json.dumps(rows, indent=2), encoding='utf-8')

    fields = [
        'variant', 'model',
        'challenge_cases', 'challenge_avg_score',
        'coverage_cases', 'coverage_avg_score',
        'coverage_endpoint_rate', 'coverage_method_rate',
        'total_tool_calls', 'total_tokens', 'composite',
    ]
    with csv_path.open('w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, fieldnames=fields)
        w.writeheader()
        # Project each row onto ``fields`` so extra keys (e.g. 'paths') are
        # dropped instead of making DictWriter raise.
        w.writerows({k: r.get(k) for k in fields} for r in rows)

    lines = [
        '# HF Hub Prompt A/B Summary',
        '',
        '| Variant | Model | Ch avg (/10) | Cov endpoint | Cov method | Composite | Calls | Tokens |',
        '|---|---|---:|---:|---:|---:|---:|---:|',
    ]
    for r in sorted(rows, key=lambda x: (-x['composite'], x['total_tokens'], x['variant'], x['model'])):
        lines.append(
            f"| {r['variant']} | {r['model']} | {r['challenge_avg_score']} | {r['coverage_endpoint_rate']} | {r['coverage_method_rate']} | {r['composite']} | {r['total_tool_calls']} | {r['total_tokens']} |"
        )

    variants = sorted({r['variant'] for r in rows})
    models = sorted({r['model'] for r in rows})
    by = {(r['variant'], r['model']): r for r in rows}
    # Compare every other variant against the first one. With exactly two
    # variants this yields the single "(second - first)" section.
    for b in variants[1:]:
        a = variants[0]
        lines += [
            '',
            f'## Pairwise delta ({b} - {a})',
            '',
            '| Model | Δ Ch avg | Δ Cov endpoint | Δ Cov method | Δ Composite | Δ Calls | Δ Tokens |',
            '|---|---:|---:|---:|---:|---:|---:|',
        ]
        for model in models:
            ra = by.get((a, model))
            rb = by.get((b, model))
            if not ra or not rb:
                continue
            lines.append(
                f"| {model} | {rb['challenge_avg_score'] - ra['challenge_avg_score']:+.3f} | "
                f"{rb['coverage_endpoint_rate'] - ra['coverage_endpoint_rate']:+.4f} | "
                f"{rb['coverage_method_rate'] - ra['coverage_method_rate']:+.4f} | "
                f"{rb['composite'] - ra['composite']:+.4f} | "
                f"{rb['total_tool_calls'] - ra['total_tool_calls']:+d} | "
                f"{rb['total_tokens'] - ra['total_tokens']:+d} |"
            )

    md_path.write_text('\n'.join(lines) + '\n', encoding='utf-8')

    print('\nWrote:')
    print(f'- {json_path}')
    print(f'- {csv_path}')
    print(f'- {md_path}')
| |
|
| |
|
def try_plot(rows: list[dict[str, Any]], out_dir: Path) -> None:
    """Render summary charts into *out_dir* when matplotlib can be imported.

    Produces one bar chart of composite score per variant for every model,
    plus a single scatter plot of total tokens against challenge average
    score. If importing matplotlib fails, a notice is printed and nothing is
    written.

    Args:
        rows: Summary rows; the keys read here are ``variant``, ``model``,
            ``composite``, ``total_tokens`` and ``challenge_avg_score``.
        out_dir: Directory that receives the PNG files; created if needed.
    """
    try:
        import matplotlib.pyplot as plt
    except Exception as e:
        # Plotting is optional, so any import failure only skips the charts.
        print(f'Plotting skipped (matplotlib missing): {e}')
        return

    out_dir.mkdir(parents=True, exist_ok=True)

    variant_ids = sorted({row['variant'] for row in rows})
    model_ids = sorted({row['model'] for row in rows})
    positions = list(range(len(variant_ids)))

    def composite_for(model_id: str, variant_id: str) -> Any:
        """Return the composite of the first matching row, or None."""
        for row in rows:
            if row['model'] == model_id and row['variant'] == variant_id:
                return row['composite']
        return None

    # One bar chart per model, with a bar for every variant.
    for model_id in model_ids:
        scores = [composite_for(model_id, variant_id) for variant_id in variant_ids]
        # Missing combinations get a zero-height bar and no value label.
        heights = [0 if score is None else score for score in scores]

        plt.figure(figsize=(6, 4))
        plt.bar(positions, heights)
        plt.xticks(positions, variant_ids, rotation=20, ha='right')
        plt.ylim(0, 1.0)
        plt.title(f'Composite score by variant ({model_id})')
        plt.ylabel('Composite')
        for pos, score in zip(positions, scores):
            if score is None:
                continue
            plt.text(pos, score + 0.01, f'{score:.3f}', ha='center', va='bottom', fontsize=8)
        plt.tight_layout()
        safe_model = model_id.replace('/', '_')
        plt.savefig(out_dir / f'prompt_ab_composite_{safe_model}.png', dpi=170)
        plt.close()

    # Single scatter plot across all rows: token cost against challenge score.
    plt.figure(figsize=(6.8, 4.6))
    for row in rows:
        tokens = row['total_tokens']
        score = row['challenge_avg_score']
        plt.scatter(tokens, score, s=55)
        plt.text(tokens, score, f" {row['variant']}:{row['model']}", fontsize=7)
    plt.xlabel('Total tokens (challenge + coverage)')
    plt.ylabel('Challenge avg score (/10)')
    plt.title('Prompt A/B efficiency vs quality')
    plt.grid(alpha=0.25)
    plt.tight_layout()
    plt.savefig(out_dir / 'prompt_ab_scatter_tokens_vs_challenge.png', dpi=170)
    plt.close()

    print(f'Plots written to: {out_dir}')
| |
|
| |
|
def main() -> None:
    """Command-line entry point: run every variant/model pair and report.

    Parses the arguments, runs ``run_variant_model`` for each combination of
    variant and model (variants in the outer loop), then writes the summary
    files and attempts to plot them.

    Invalid ``--variants`` values and an empty ``--models`` list are reported
    through argparse (usage message, exit status 2) rather than as a
    traceback or a silent no-op run.
    """
    ap = argparse.ArgumentParser(description='A/B compare hf_hub_community prompt variants over challenge + coverage packs')
    ap.add_argument('--variants', required=True, help='Comma-separated id=cards_dir entries. Example: baseline=.fast-agent/tool-cards,compact=.fast-agent/evals/compact/cards')
    ap.add_argument('--models', default='gpt-oss', help='Comma-separated model IDs')
    ap.add_argument('--agent', default='hf_hub_community')
    ap.add_argument('--timeout', type=int, default=240)
    ap.add_argument('--out-dir', type=Path, default=OUT_DIR)
    ap.add_argument('--challenge-start', type=int, default=None)
    ap.add_argument('--challenge-end', type=int, default=None)
    ap.add_argument('--coverage-cases', type=Path, default=None)
    args = ap.parse_args()

    try:
        variants = parse_variants(args.variants)
    except ValueError as exc:
        # ap.error prints the usage line plus the message and exits.
        ap.error(str(exc))
    models = [m.strip() for m in args.models.split(',') if m.strip()]
    if not models:
        # Without this check nothing would run and empty reports would be written.
        ap.error('Provide at least one model via --models.')

    rows: list[dict[str, Any]] = []
    for vid, cards in variants:
        for model in models:
            row = run_variant_model(
                variant_id=vid,
                cards_dir=cards,
                model=model,
                agent=args.agent,
                timeout=args.timeout,
                out_dir=args.out_dir,
                challenge_start=args.challenge_start,
                challenge_end=args.challenge_end,
                coverage_cases=args.coverage_cases,
            )
            rows.append(row)
            print(f"[done] {vid}/{model}: composite={row['composite']} tokens={row['total_tokens']}")

    write_outputs(rows, args.out_dir)
    try_plot(rows, args.out_dir)


if __name__ == '__main__':
    main()
| |
|