#!/usr/bin/env python3
"""Run the endpoint-coverage prompt pack for the hf_hub_community agent.

Each case drives `fast-agent go` with a single prompt, then parses the
recorded session file to check which Hub API endpoints and HTTP methods the
agent actually called. Per-case scores are aggregated into Markdown and JSON
reports.
"""
from __future__ import annotations

import argparse
import json
import re
import subprocess
from pathlib import Path
from typing import Any

ROOT = Path(__file__).resolve().parents[1]
DEFAULT_CARDS_DIR = ROOT / '.fast-agent' / 'tool-cards'
DEFAULT_AGENT = 'hf_hub_community'
PROMPTS_FILE = ROOT / 'scripts' / 'hf_hub_community_coverage_prompts.json'
REPORT_MD = ROOT / 'docs' / 'hf_hub_community_coverage_report.md'
REPORT_JSON = ROOT / 'docs' / 'hf_hub_community_coverage_report.json'

# Matches ANSI escape sequences so CLI output can be inspected as plain text.
ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")


def strip_ansi(text: str) -> str:
    return ANSI_RE.sub('', text)


def load_cases(path: Path) -> list[dict[str, Any]]:
    rows = json.loads(path.read_text(encoding='utf-8'))
    if not isinstance(rows, list):
        raise ValueError('coverage prompts file must be a JSON list')
    out: list[dict[str, Any]] = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        out.append({
            'id': int(row['id']),
            'prompt': str(row['prompt']),
            'expected_endpoint_any': list(row.get('expected_endpoint_any', [])),
            'expected_method_any': [str(x).upper() for x in row.get('expected_method_any', ['GET'])],
        })
    return out


def _session_extract(result_path: Path) -> dict[str, Any]:
    """Pull endpoints, methods, tool usage, and token counts out of a session file."""
    data = json.loads(result_path.read_text(encoding='utf-8'))
    messages = data.get('messages', []) if isinstance(data, dict) else []
    endpoints: list[str] = []
    methods: list[str] = []
    tool_names: list[str] = []
    tool_calls_count = 0
    merged_parts: list[str] = []
    tool_error = False
    usage_input_tokens = 0
    usage_output_tokens = 0
    usage_total_tokens = 0
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        if msg.get('role') == 'assistant':
            # Collect visible text and reasoning for the clarity check.
            for item in msg.get('content', []) or []:
                if isinstance(item, dict) and item.get('type') == 'text' and item.get('text'):
                    merged_parts.append(str(item['text']))
            channels = msg.get('channels') or {}
            for item in channels.get('reasoning', []) or []:
                if isinstance(item, dict) and item.get('text'):
                    merged_parts.append(str(item['text']))
            # Per-turn token usage arrives as JSON strings on a side channel.
            for item in channels.get('fast-agent-usage', []) or []:
                if not isinstance(item, dict):
                    continue
                txt = item.get('text')
                if not isinstance(txt, str):
                    continue
                try:
                    payload = json.loads(txt)
                except Exception:
                    continue
                turn = payload.get('turn', {}) if isinstance(payload, dict) else {}
                if isinstance(turn, dict):
                    usage_input_tokens += int(turn.get('input_tokens') or 0)
                    usage_output_tokens += int(turn.get('output_tokens') or 0)
                    usage_total_tokens += int(turn.get('total_tokens') or 0)
            # Record which tools were called and with which endpoint/method.
            tool_calls = msg.get('tool_calls') or {}
            if isinstance(tool_calls, dict):
                tool_calls_count += len(tool_calls)
                for tc in tool_calls.values():
                    params = (tc or {}).get('params', {}) if isinstance(tc, dict) else {}
                    name = params.get('name') if isinstance(params, dict) else None
                    args = params.get('arguments', {}) if isinstance(params, dict) else {}
                    if isinstance(name, str):
                        tool_names.append(name)
                        merged_parts.append(f'tool call - {name}')
                    if isinstance(args, dict):
                        ep = args.get('endpoint')
                        if isinstance(ep, str):
                            endpoints.append(ep)
                        method = args.get('method')
                        methods.append(str(method).upper() if method else 'GET')
                        merged_parts.append(json.dumps(args, ensure_ascii=False))
        if msg.get('role') == 'user':
            # Tool results come back on user messages; flag explicit errors.
            tool_results = msg.get('tool_results') or {}
            if isinstance(tool_results, dict):
                for tr in tool_results.values():
                    if bool((tr or {}).get('isError')):
                        tool_error = True
                    for item in (tr or {}).get('content', []) or []:
                        if isinstance(item, dict) and item.get('type') == 'text' and item.get('text'):
                            text = str(item['text'])
                            merged_parts.append(text)
                            if 'Error executing tool' in text:
                                tool_error = True
    return {
        'endpoints': endpoints,
        'methods': methods,
        'tool_names': tool_names,
        'tool_calls_count': tool_calls_count,
        'tool_error': tool_error,
        'merged': '\n'.join(merged_parts).strip(),
        'usage': {
            'input_tokens': usage_input_tokens,
            'output_tokens': usage_output_tokens,
            'total_tokens': usage_total_tokens,
        },
    }
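
# For reference, a minimal sketch of the session-file shape _session_extract
# walks. This is inferred from the accessors above, not from a published
# fast-agent schema, so treat the exact nesting as an assumption:
#
#   {"messages": [
#     {"role": "assistant",
#      "content": [{"type": "text", "text": "..."}],
#      "channels": {"reasoning": [{"text": "..."}],
#                   "fast-agent-usage": [{"text": "{\"turn\": {\"input_tokens\": 1, ...}}"}]},
#      "tool_calls": {"0": {"params": {"name": "hf_api_request",
#                                      "arguments": {"endpoint": "/api/models", "method": "GET"}}}}},
#     {"role": "user",
#      "tool_results": {"0": {"isError": false,
#                             "content": [{"type": "text", "text": "..."}]}}}
#   ]}
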
def run_case(
    case: dict[str, Any],
    timeout_sec: int,
    model: str,
    agent_cards: Path,
    agent: str,
    result_path: Path,
) -> dict[str, Any]:
    prompt = case['prompt']
    result_path.parent.mkdir(parents=True, exist_ok=True)
    cmd = [
        'fast-agent', 'go', '--no-env',
        '--model', model,
        '--agent-cards', str(agent_cards),
        '--agent', agent,
        '--results', str(result_path),
        '-m', prompt,
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_sec)
    out = strip_ansi(proc.stdout or '')
    err = strip_ansi(proc.stderr or '')
    if not result_path.exists():
        raise RuntimeError(f'Expected --results file not written: {result_path}')
    parsed = _session_extract(result_path)
    expected_ep = [re.compile(p) for p in case.get('expected_endpoint_any', [])]
    expected_methods = [m.upper() for m in case.get('expected_method_any', ['GET'])]
    # A case with no endpoint expectations can never earn the endpoint points.
    endpoint_ok = any(p.search(ep) for p in expected_ep for ep in parsed['endpoints']) if expected_ep else False
    # No observed methods counts as an implicit GET.
    method_ok = any(m in expected_methods for m in parsed['methods']) if parsed['methods'] else ('GET' in expected_methods)
    tool_ok = any(t == 'hf_api_request' for t in parsed['tool_names'])
    success = proc.returncode == 0 and 'Traceback' not in (out + '\n' + err)
    clarity = len(parsed['merged']) > 20
    score = int(endpoint_ok) * 4 + int(method_ok) * 2 + int(tool_ok) * 2 + int(success) + int(clarity)
    return {
        'id': case['id'],
        'prompt': prompt,
        'returncode': proc.returncode,
        'result_file': str(result_path),
        'observed': {
            'endpoints': parsed['endpoints'],
            'methods': parsed['methods'],
            'tool_names': parsed['tool_names'],
            'tool_calls_count': parsed['tool_calls_count'],
            'tool_error': parsed['tool_error'],
            'usage': parsed['usage'],
        },
        'expected': {
            'endpoint_any': case.get('expected_endpoint_any', []),
            'method_any': expected_methods,
        },
        'eval': {
            'endpoint_ok': endpoint_ok,
            'method_ok': method_ok,
            'tool_ok': tool_ok,
            'success': success,
            'clarity': clarity,
            'score_total': score,
        },
        'merged': parsed['merged'],
    }
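
# Scoring rubric applied by run_case above (10 points max):
#   endpoint regex match -> 4, expected HTTP method -> 2,
#   hf_api_request used  -> 2, clean exit with no traceback -> 1,
#   non-trivial merged output (> 20 chars) -> 1.
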
def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]:
    n = len(rows)
    if n == 0:
        return {'n_cases': 0}
    endpoint_rate = sum(1 for r in rows if r['eval']['endpoint_ok']) / n
    method_rate = sum(1 for r in rows if r['eval']['method_ok']) / n
    tool_rate = sum(1 for r in rows if r['eval']['tool_ok']) / n
    success_rate = sum(1 for r in rows if r['eval']['success']) / n
    avg_score = sum(r['eval']['score_total'] for r in rows) / n
    avg_calls = sum(r['observed']['tool_calls_count'] for r in rows) / n
    avg_tokens = sum(int(r['observed']['usage'].get('total_tokens') or 0) for r in rows) / n
    tool_error_rate = sum(1 for r in rows if r['observed']['tool_error']) / n
    return {
        'n_cases': n,
        'endpoint_match_rate': round(endpoint_rate, 4),
        'method_match_rate': round(method_rate, 4),
        'tool_use_rate': round(tool_rate, 4),
        'success_rate': round(success_rate, 4),
        'tool_error_rate': round(tool_error_rate, 4),
        'avg_score_total': round(avg_score, 3),
        'avg_tool_calls': round(avg_calls, 3),
        'avg_total_tokens': round(avg_tokens, 1),
    }


def render_markdown(rows: list[dict[str, Any]], summary: dict[str, Any], model: str, agent: str) -> str:
    out = [
        '# HF Hub Community Coverage Report',
        '',
        f'- Model: `{model}`',
        f'- Agent: `{agent}`',
        '',
        '## Summary',
        '',
        f"- Cases: **{summary.get('n_cases', 0)}**",
        f"- Endpoint match rate: **{summary.get('endpoint_match_rate')}**",
        f"- Method match rate: **{summary.get('method_match_rate')}**",
        f"- Tool use rate: **{summary.get('tool_use_rate')}**",
        f"- Success rate: **{summary.get('success_rate')}**",
        f"- Tool error rate: **{summary.get('tool_error_rate')}**",
        f"- Avg score (/10): **{summary.get('avg_score_total')}**",
        f"- Avg tool calls: **{summary.get('avg_tool_calls')}**",
        f"- Avg total tokens: **{summary.get('avg_total_tokens')}**",
        '',
        '| # | Score | Endpoint OK | Method OK | Calls | Tokens | Prompt |',
        '|---|------:|------------:|----------:|------:|-------:|--------|',
    ]
    for r in rows:
        score = r['eval']['score_total']
        calls = r['observed']['tool_calls_count']
        tokens = int(r['observed']['usage'].get('total_tokens') or 0)
        out.append(
            f"| {r['id']} | {score} | {int(r['eval']['endpoint_ok'])} | {int(r['eval']['method_ok'])} | {calls} | {tokens} | {r['prompt'][:72].replace('|','/')} |"
        )
    return '\n'.join(out) + '\n'


def main() -> None:
    ap = argparse.ArgumentParser(description='Run endpoint-coverage pack for hf_hub_community')
    ap.add_argument('--model', default='gpt-oss')
    ap.add_argument('--agent', default=DEFAULT_AGENT)
    ap.add_argument('--agent-cards', type=Path, default=DEFAULT_CARDS_DIR)
    ap.add_argument('--cases', type=Path, default=PROMPTS_FILE)
    ap.add_argument('--timeout', type=int, default=240)
    ap.add_argument('--raw-results-dir', type=Path, default=ROOT / 'docs' / 'hf_hub_community_coverage_results')
    ap.add_argument('--json-out', type=Path, default=REPORT_JSON)
    ap.add_argument('--md-out', type=Path, default=REPORT_MD)
    args = ap.parse_args()

    cases = load_cases(args.cases)
    rows: list[dict[str, Any]] = []
    for case in cases:
        result_file = args.raw_results_dir / f"coverage_{args.agent}_{args.model.replace('/', '_')}_case_{case['id']:02d}.json"
        row = run_case(
            case=case,
            timeout_sec=args.timeout,
            model=args.model,
            agent_cards=args.agent_cards,
            agent=args.agent,
            result_path=result_file,
        )
        rows.append(row)
        print(f"[{case['id']}] score={row['eval']['score_total']}/10 endpoint_ok={row['eval']['endpoint_ok']} method_ok={row['eval']['method_ok']}")

    summary = summarize(rows)
    payload = {
        'summary': summary,
        'rows': rows,
    }
    args.json_out.parent.mkdir(parents=True, exist_ok=True)
    args.md_out.parent.mkdir(parents=True, exist_ok=True)
    args.json_out.write_text(json.dumps(payload, indent=2), encoding='utf-8')
    args.md_out.write_text(render_markdown(rows, summary, model=args.model, agent=args.agent), encoding='utf-8')
    print(f"\nWrote:\n- {args.json_out}\n- {args.md_out}")


if __name__ == '__main__':
    main()
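
# Example invocations (the script path is illustrative; defaults for cases,
# cards, and report locations come from the constants at the top of the file):
#   python scripts/run_hf_hub_community_coverage.py
#   python scripts/run_hf_hub_community_coverage.py --model provider/some-model --timeout 300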