Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Extract inference request and token usage data for all 'react with code' agents. | |
| This script reads session.jsonl files to count inference requests and estimate token usage. | |
| """ | |
| import json | |
| import sys | |
| from pathlib import Path | |
| from dataclasses import dataclass | |
| import ast | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from tqdm import tqdm | |
| PROJECT_ROOT = Path(__file__).parent.parent | |
| sys.path.insert(0, str(PROJECT_ROOT)) | |
| from analysis_src.utils import ( | |
| get_model_name, | |
| find_react_with_code_dirs, | |
| get_runs_stats, | |
| filter_scenarios_with_min_runs, | |
| find_latest_rollout_file | |
| ) | |
| from analysis_src.model_styles import ( | |
| get_model_style, MIN_FONT_SIZE, SINGLE_COLUMN_WIDTH, DOUBLE_COLUMN_WIDTH, get_color_palette, PLOT_PARAMETERS | |
| ) | |
# Input and output locations, all resolved relative to the project root.
LEADERBOARD_DIR = PROJECT_ROOT.joinpath(
    "ITBench-SRE-Agent", "ITBench-Trajectories", "ReAct-Agent-Trajectories"
)
RESULTS_JSON_DIR = LEADERBOARD_DIR.joinpath("results")
OUTPUT_DIR = PROJECT_ROOT.joinpath(
    "ITBench-SRE-Agent", "ITBench-Trajectories", "output", "inferences"
)

# An agent is kept only if at least MIN_QUALIFYING_SCENARIOS of its scenarios
# have MIN_RUNS_PER_SCENARIO or more runs.
MIN_RUNS_PER_SCENARIO = 3
MIN_QUALIFYING_SCENARIOS = 20

# Characters per token, used when token counts must be estimated from text.
CHARS_PER_TOKEN = 4
def extract_tokens_from_rollout(rollout_file: Path) -> dict:
    """
    Estimate token usage and count tool calls from a rollout JSONL file.

    Every line is parsed as a JSON object carrying a ``type`` and a
    ``payload``. Character counts are accumulated per category and turned
    into token estimates by integer division with ``CHARS_PER_TOKEN``.

    Counts:
    - INPUT: system prompt + user messages + tool outputs
    - OUTPUT: assistant messages + tool call names and arguments
    - TOOLS: calls per tool name, plus the number of code-execution calls

    Lines that are not valid JSON, are not JSON objects, or whose payload is
    not an object are skipped, so a single malformed record does not discard
    the statistics of the whole file.

    Args:
        rollout_file: Path to the rollout ``.jsonl`` file.

    Returns:
        A dict with the per-category character counts, the derived
        ``input_tokens`` / ``output_tokens`` estimates,
        ``assistant_msg_count``, ``tool_call_count``, ``tool_counts``
        (tool name -> number of calls) and ``code_execution_count``.
        ``None`` is returned if the file cannot be opened or read.
    """
    # Substrings of a lower-cased tool name that indicate code execution.
    code_tool_markers = ('execute_python', 'run_python', 'python', 'execute_code',
                         'run_code', 'shell', 'bash', 'terminal', 'exec')

    system_prompt_chars = 0
    user_input_chars = 0
    assistant_output_chars = 0
    tool_call_chars = 0
    tool_output_chars = 0
    assistant_msg_count = 0
    tool_call_count = 0
    tool_counts = {}  # tool_name -> count
    code_execution_count = 0

    try:
        # errors='replace' keeps a stray undecodable byte from losing the file;
        # only character counts are derived from the text.
        with open(rollout_file, encoding='utf-8', errors='replace') as f:
            for line in f:
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(record, dict):
                    continue
                payload = record.get('payload', {})
                if not isinstance(payload, dict):
                    continue

                msg_type = record.get('type', '')
                if msg_type == 'session_meta':
                    # System prompt
                    system_prompt_chars += len(str(payload.get('instructions', '')))
                    continue
                if msg_type != 'response_item':
                    continue

                item_type = payload.get('type', '')
                if item_type == 'message':
                    content = payload.get('content', [])
                    if isinstance(content, list):
                        text = ' '.join(
                            str(part.get('text') or '') if isinstance(part, dict) else str(part)
                            for part in content
                        )
                    else:
                        text = str(content)
                    role = payload.get('role', '')
                    if role == 'user':
                        user_input_chars += len(text)
                    elif role == 'assistant':
                        assistant_output_chars += len(text)
                        assistant_msg_count += 1
                elif item_type == 'function_call':
                    # Tool call (produced by the model, so it counts as output)
                    name = str(payload.get('name') or '')
                    arguments = str(payload.get('arguments', ''))
                    tool_call_chars += len(name) + len(arguments)
                    tool_call_count += 1
                    tool_counts[name] = tool_counts.get(name, 0) + 1

                    # A call is code execution if the tool name says so or the
                    # arguments look like Python source. Each call is counted
                    # at most once, even when both signals are present.
                    name_lower = name.lower()
                    args_lower = arguments.lower()
                    if (any(marker in name_lower for marker in code_tool_markers)
                            or 'def ' in args_lower or 'import ' in args_lower):
                        code_execution_count += 1
                elif item_type == 'function_call_output':
                    # Tool output (fed back to the model, so it counts as input)
                    tool_output_chars += len(str(payload.get('output', '')))
    except OSError:
        return None

    # INPUT = system + user + tool outputs (fed back to model)
    input_chars = system_prompt_chars + user_input_chars + tool_output_chars
    # OUTPUT = assistant responses + tool call names/arguments
    output_chars = assistant_output_chars + tool_call_chars

    return {
        'system_prompt_chars': system_prompt_chars,
        'user_input_chars': user_input_chars,
        'assistant_output_chars': assistant_output_chars,
        'tool_call_chars': tool_call_chars,
        'tool_output_chars': tool_output_chars,
        'input_chars': input_chars,
        'output_chars': output_chars,
        'input_tokens': input_chars // CHARS_PER_TOKEN,
        'output_tokens': output_chars // CHARS_PER_TOKEN,
        'assistant_msg_count': assistant_msg_count,
        'tool_call_count': tool_call_count,
        'tool_counts': tool_counts,
        'code_execution_count': code_execution_count,
    }
def extract_session_stats(session_file: Path) -> dict:
    """
    Extract inference stats for one trial from its session and trace files.

    The number of inference requests is the number of ``response_item``
    records in ``session.jsonl``. Token counts are taken from the first
    ``turn.completed`` event in ``traces/stdout.log`` that reports non-zero
    usage; when no such event exists they fall back to the character-based
    estimates of ``extract_tokens_from_rollout``. Tool statistics always come
    from the rollout file returned by ``find_latest_rollout_file``.

    Args:
        session_file: Path to the trial's ``session.jsonl``.

    Returns:
        A dict with ``inference_count``, ``input_tokens``,
        ``cached_input_tokens``, ``output_tokens``, ``total_tokens``,
        ``has_real_tokens``, ``tool_call_count``, ``tool_counts`` and
        ``code_execution_count``; ``None`` if the session file does not exist
        or cannot be read.
    """
    if not session_file.exists():
        return None

    trial_dir = session_file.parent

    # Count inference requests from session.jsonl. Lines that are not JSON
    # objects are skipped rather than invalidating the whole trial.
    inference_count = 0
    try:
        with open(session_file, encoding='utf-8', errors='replace') as f:
            for line in f:
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if isinstance(record, dict) and record.get('type') == 'response_item':
                    inference_count += 1
    except OSError as e:
        print(f" Warning: Error reading {session_file}: {e}")
        return None

    # First check stdout.log for real token counts (OpenAI models)
    stdout_log = trial_dir / "traces" / "stdout.log"
    has_real_tokens = False
    input_tokens = 0
    output_tokens = 0
    cached_input_tokens = 0

    if stdout_log.exists():
        try:
            with open(stdout_log, encoding='utf-8', errors='replace') as f:
                for line in f:
                    try:
                        event = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(event, dict) or event.get('type') != 'turn.completed':
                        continue
                    usage = event.get('usage')
                    if not isinstance(usage, dict):
                        continue
                    # Read into temporaries and commit together, so a bad
                    # value can never leave the counters half-updated.
                    reported_in = usage.get('input_tokens') or 0
                    reported_out = usage.get('output_tokens') or 0
                    reported_cached = usage.get('cached_input_tokens') or 0
                    if reported_in > 0 or reported_out > 0:
                        input_tokens = reported_in
                        output_tokens = reported_out
                        cached_input_tokens = reported_cached
                        has_real_tokens = True
                        break
        except (OSError, TypeError) as e:
            # Best effort: fall back to the rollout-based estimate below.
            print(f" Warning: Error reading {stdout_log}: {e}")

    # Extract from latest rollout file for tokens (if needed) and tool counts
    tool_call_count = 0
    tool_counts = {}
    code_execution_count = 0

    latest_rollout = find_latest_rollout_file(trial_dir)
    if latest_rollout:
        rollout_stats = extract_tokens_from_rollout(latest_rollout)
        if rollout_stats:
            # Use rollout estimates only when there is no real API token data
            if not has_real_tokens:
                input_tokens = rollout_stats['input_tokens']
                output_tokens = rollout_stats['output_tokens']
            # Always use the rollout for tool counts
            tool_call_count = rollout_stats['tool_call_count']
            tool_counts = rollout_stats['tool_counts']
            code_execution_count = rollout_stats['code_execution_count']

    return {
        'inference_count': inference_count,
        'input_tokens': input_tokens,
        'cached_input_tokens': cached_input_tokens,
        'output_tokens': output_tokens,
        'total_tokens': input_tokens + output_tokens,
        'has_real_tokens': has_real_tokens,
        'tool_call_count': tool_call_count,
        'tool_counts': tool_counts,
        'code_execution_count': code_execution_count,
    }
def read_agent_stats(agent_dir: Path) -> dict[str, list[dict]]:
    """
    Read session stats from all scenarios/trials for an agent.

    ``agent_dir`` may contain the ``Scenario*`` folders directly, or hold them
    one level deeper (e.g. ``agent_dir/sre/Scenario-1``). In the nested case
    the only non-hidden sub-directory is used, or, when there are several,
    the first one (in sorted order) that contains ``Scenario*`` folders.

    Scenario and trial directories are visited in sorted order so the result
    does not depend on filesystem enumeration order.

    Args:
        agent_dir: Directory holding one agent's trajectories.

    Returns:
        Dict mapping scenario_id -> list of stats (one per trial). Scenarios
        without any usable trial are omitted; an empty dict is returned when
        ``agent_dir`` is not a directory.
    """
    def list_dirs(path: Path) -> list:
        """Return the sub-directories of *path*, sorted."""
        return sorted(entry for entry in path.iterdir() if entry.is_dir())

    def has_scenarios(dirs: list) -> bool:
        """Tell whether any directory in *dirs* is a Scenario folder."""
        return any(entry.name.startswith("Scenario") for entry in dirs)

    if not agent_dir.is_dir():
        return {}

    candidates = list_dirs(agent_dir)
    if not has_scenarios(candidates):
        # Look for subdirectories that might contain scenarios (sre, finops, etc.)
        subdirs = [entry for entry in candidates if not entry.name.startswith(".")]
        if len(subdirs) == 1:
            candidates = list_dirs(subdirs[0])
        else:
            for subdir in subdirs:
                nested = list_dirs(subdir)
                if has_scenarios(nested):
                    candidates = nested
                    break

    scenario_data = {}
    for scenario_dir in candidates:
        if not scenario_dir.name.startswith("Scenario"):
            continue
        trials = []
        for trial_dir in list_dirs(scenario_dir):
            stats = extract_session_stats(trial_dir / "session.jsonl")
            if stats:
                trials.append(stats)
        if trials:
            scenario_data[scenario_dir.name] = trials
    return scenario_data
def load_performance_data() -> pd.DataFrame:
    """Return the (model, performance) rows for the root-cause entity F1 metric.

    The rows are read from the ``performance_data.csv`` file of the
    consistency analysis output. An empty DataFrame is returned when that
    file does not exist.
    """
    csv_path = PROJECT_ROOT.joinpath(
        "ITBench-SRE-Agent", "ITBench-Trajectories", "output", "consistency",
        "performance_data.csv",
    )
    if not csv_path.exists():
        return pd.DataFrame()

    table = pd.read_csv(csv_path)
    is_rc_entity_f1 = table["metric_raw"] == "root_cause_entity_f1"
    return table[is_rc_entity_f1][["model", "performance"]]
def _summarize_agent_trials(model_name: str, scenario_data: dict) -> tuple[dict, list[dict]]:
    """Build one agent's summary record and its per-trial detail rows."""
    detail_rows = []
    tool_totals = {}  # tool_name -> calls summed over every trial

    for scenario_id, trials in tqdm(scenario_data.items(), desc=f" {model_name} scenarios", leave=False):
        for trial in trials:
            detail_rows.append({
                'model': model_name,
                'scenario': scenario_id,
                'inference_count': trial['inference_count'],
                'input_tokens': trial['input_tokens'],
                'cached_input_tokens': trial.get('cached_input_tokens', 0),
                'output_tokens': trial['output_tokens'],
                'total_tokens': trial['total_tokens'],
                'tool_call_count': trial.get('tool_call_count', 0),
                'code_execution_count': trial.get('code_execution_count', 0),
            })
            for tool_name, count in trial.get('tool_counts', {}).items():
                tool_totals[tool_name] = tool_totals.get(tool_name, 0) + count

    def column(key: str) -> list:
        """Collect one field across all trials of this agent."""
        return [row[key] for row in detail_rows]

    inference_counts = column('inference_count')
    total_tokens = column('total_tokens')
    tool_call_counts = column('tool_call_count')
    code_execution_counts = column('code_execution_count')

    summary = {
        'model': model_name,
        'n_scenarios': len(scenario_data),
        'n_trials': len(inference_counts),
        'avg_inference_count': np.mean(inference_counts),
        'std_inference_count': np.std(inference_counts),
        'avg_input_tokens': np.mean(column('input_tokens')),
        'avg_cached_tokens': np.mean(column('cached_input_tokens')),
        'avg_output_tokens': np.mean(column('output_tokens')),
        'avg_total_tokens': np.mean(total_tokens),
        'total_inference_count': sum(inference_counts),
        'total_tokens': sum(total_tokens),
        'avg_tool_call_count': np.mean(tool_call_counts) if tool_call_counts else 0,
        'total_tool_calls': sum(tool_call_counts),
        'avg_code_execution_count': np.mean(code_execution_counts) if code_execution_counts else 0,
        'total_code_executions': sum(code_execution_counts),
        # The ten most frequently called tools, most used first.
        'top_tools': dict(sorted(tool_totals.items(), key=lambda item: -item[1])[:10]),
    }
    return summary, detail_rows


def extract_all_data() -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Extract inference data for all agents.

    Agents are skipped when no session data is found or when fewer than
    ``MIN_QUALIFYING_SCENARIOS`` of their scenarios have at least
    ``MIN_RUNS_PER_SCENARIO`` runs. For the remaining agents only the
    qualifying scenarios are aggregated.

    Returns:
        - summary_df: Aggregated stats per model, left-joined with the
          performance data on ``model`` when that data is available
        - detail_df: One row per trial (model, scenario and its counters)

        Both frames are empty when no agent qualifies.
    """
    agent_dirs = find_react_with_code_dirs(LEADERBOARD_DIR)
    print(f"Found {len(agent_dirs)} 'react with code' agent directories")

    summary_records = []
    detail_records = []

    for agent_dir in tqdm(agent_dirs, desc="Processing agents"):
        model_name = get_model_name(agent_dir.name)
        print(f"\nReading: {agent_dir.name}")

        scenario_data = read_agent_stats(agent_dir)
        n_scenarios, _, _, n_qualifying = get_runs_stats(scenario_data, MIN_RUNS_PER_SCENARIO)

        if n_scenarios == 0:
            print(f" SKIPPING {model_name}: No session data found")
            continue
        if n_qualifying < MIN_QUALIFYING_SCENARIOS:
            print(f" SKIPPING {model_name}: Only {n_qualifying}/{n_scenarios} scenarios have {MIN_RUNS_PER_SCENARIO}+ runs")
            continue

        # Keep only scenarios with enough runs
        scenario_data = filter_scenarios_with_min_runs(scenario_data, MIN_RUNS_PER_SCENARIO)
        print(f" Processing: {model_name} ({len(scenario_data)} scenarios)")

        summary, detail_rows = _summarize_agent_trials(model_name, scenario_data)
        summary_records.append(summary)
        detail_records.extend(detail_rows)

    summary_df = pd.DataFrame(summary_records)
    detail_df = pd.DataFrame(detail_records)

    # Merge with performance data. An empty summary has no 'model' column, so
    # merging it would raise KeyError instead of letting the caller report
    # that nothing was extracted.
    if not summary_df.empty:
        perf_df = load_performance_data()
        if len(perf_df) > 0:
            summary_df = pd.merge(summary_df, perf_df, on='model', how='left')

    return summary_df, detail_df
def save_data(summary_df: pd.DataFrame, detail_df: pd.DataFrame):
    """Write the summary and detail tables as CSV files under OUTPUT_DIR.

    The output directory is created if needed, and the paths of the written
    files are printed afterwards.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    outputs = (
        (summary_df, OUTPUT_DIR / "inference_summary.csv"),
        (detail_df, OUTPUT_DIR / "inference_detail.csv"),
    )
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)

    print("\nData saved to:")
    for _, destination in outputs:
        print(f" - {destination}")
def print_summary(summary_df: pd.DataFrame):
    """Print a fixed-width table of per-model averages.

    Rows are ordered by average inference count, highest first, and show the
    average inference count and the average total, input and output tokens.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Inference Summary")
    print(banner)

    ranked = summary_df.sort_values("avg_inference_count", ascending=False)

    header = f"\n{'Model':<25} {'Avg Infer':>10} {'Avg Tokens':>12} {'Avg In':>10} {'Avg Out':>10}"
    print(header)
    print("-" * 70)

    for record in ranked.to_dict("records"):
        cells = (
            f"{record['model']:<25}",
            f"{record['avg_inference_count']:>10.1f}",
            f"{record['avg_total_tokens']:>12.0f}",
            f"{record['avg_input_tokens']:>10.0f}",
            f"{record['avg_output_tokens']:>10.0f}",
        )
        print(" ".join(cells))
def plot_tool_usage(summary_df: pd.DataFrame):
    """
    Figure: Tool usage per model - total tool calls and code execution.

    Draws two horizontal bar charts that share the model order (ascending
    average tool calls): average tool calls on the left and average code
    executions on the right. The figure is written to
    ``OUTPUT_DIR / "fig_tool_usage.png"`` and then shown.

    Nothing is plotted when ``summary_df`` is empty or lacks the
    ``avg_tool_call_count`` / ``avg_code_execution_count`` columns.

    Args:
        summary_df: Per-model summary with ``model``, ``avg_tool_call_count``
            and ``avg_code_execution_count`` columns.
    """
    plt.rcParams.update(PLOT_PARAMETERS)

    required_columns = ("avg_tool_call_count", "avg_code_execution_count")
    if summary_df.empty or any(col not in summary_df.columns for col in required_columns):
        print("Skipping tool usage: no tool data")
        return

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(SINGLE_COLUMN_WIDTH * 2, 2.5))

    data = summary_df.sort_values("avg_tool_call_count", ascending=True)
    color_palette = get_color_palette(len(data))
    colors = [color_palette[i % len(color_palette)] for i in range(len(data))]

    # Left: Total tool calls
    bars1 = ax1.barh(data["model"], data["avg_tool_call_count"], color=colors,
                     edgecolor='black', linewidth=0.5)
    ax1.set_xlabel("Avg. Tool Calls per Scenario")
    for bar, val in zip(bars1, data["avg_tool_call_count"]):
        ax1.text(val + 1, bar.get_y() + bar.get_height() / 2,
                 f'{val:.0f}', va='center', ha='left', fontsize=MIN_FONT_SIZE - 1)
    # Lower bound of 1 keeps the axis from collapsing to (0, 0) when every
    # value is zero, matching the right-hand panel.
    ax1.set_xlim(0, max(data["avg_tool_call_count"].max() * 1.15, 1))

    # Right: Code executions
    bars2 = ax2.barh(data["model"], data["avg_code_execution_count"], color=colors,
                     edgecolor='black', linewidth=0.5)
    ax2.set_xlabel("Avg. Code Executions per Scenario")
    for bar, val in zip(bars2, data["avg_code_execution_count"]):
        if val > 0:
            ax2.text(val + 0.5, bar.get_y() + bar.get_height() / 2,
                     f'{val:.0f}', va='center', ha='left', fontsize=MIN_FONT_SIZE - 1)
    ax2.set_xlim(0, max(data["avg_code_execution_count"].max() * 1.3, 1))
    ax2.set_yticklabels([])  # model names are already shown on the left panel

    # A figure-level title covers both panels; plt.title would only label the
    # most recently created axes (the right-hand one).
    fig.suptitle("Tool Call Distribution")
    fig.tight_layout()

    # Save before showing so the file is written even if the interactive
    # window is closed, and make sure the target directory exists.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUTPUT_DIR / "fig_tool_usage.png")
    plt.show()
    plt.close(fig)
    print("Saved: fig_tool_usage.png")
def plot_inference_vs_performance(summary_df: pd.DataFrame):
    """
    Figure: Inference count vs Performance scatter.

    Plots one labelled point per model (average inference requests on x,
    performance on y) and writes the figure to
    ``OUTPUT_DIR / "fig_inference_vs_performance.png"`` before showing it.

    Nothing is plotted when ``summary_df`` has no ``performance`` column or
    no row with a non-missing performance value.

    Args:
        summary_df: Per-model summary with ``model``, ``avg_inference_count``
            and ``performance`` columns.
    """
    if 'performance' not in summary_df.columns:
        print("Skipping inference vs performance: no performance data")
        return

    data = summary_df.dropna(subset=['performance'])
    if data.empty:
        # Without rows the axis limits below would be NaN.
        print("Skipping inference vs performance: no performance data")
        return

    plt.rcParams.update(PLOT_PARAMETERS)
    fig, ax = plt.subplots(figsize=(SINGLE_COLUMN_WIDTH, SINGLE_COLUMN_WIDTH))

    # Manual label offsets to avoid overlap: (dx, dy, ha, va) in points
    label_offsets = {
        "GPT-5.1": (-5, -8, "right", "top"),
        "o4-mini": (5, -8, "left", "top"),
        "GPT-OSS-120B": (5, 3, "left", "bottom"),
        "Gemini-2.5-Pro": (-5, 3, "right", "bottom"),
        "Gemini-3-Flash": (5, 3, "left", "bottom"),
        "gemini-3-pro-preview": (5, 3, "left", "bottom"),
        "Kimi-K2": (5, 3, "left", "bottom"),
    }
    default_offset = (5, 3, "left", "bottom")

    color_palette = get_color_palette(len(data))

    for i, (_, row) in enumerate(data.iterrows()):
        ax.scatter(row["avg_inference_count"], row["performance"],
                   c=[color_palette[i % len(color_palette)]], s=60, edgecolors='black',
                   linewidth=0.5, zorder=10)
        dx, dy, ha, va = label_offsets.get(row["model"], default_offset)
        ax.annotate(row["model"],
                    (row["avg_inference_count"], row["performance"]),
                    xytext=(dx, dy), textcoords='offset points',
                    fontsize=MIN_FONT_SIZE - 1, ha=ha, va=va)

    ax.set_xlabel("Avg. Inference Requests")
    ax.set_ylabel("Performance (RC Entity F1)")
    # Lower bound of 1 avoids a degenerate (0, 0) range when every count is 0.
    ax.set_xlim(0, max(data["avg_inference_count"].max() * 1.2, 1))
    # Keep the 0.7 ceiling, but grow it when a model scores higher so that no
    # point is silently clipped out of the figure.
    ax.set_ylim(0, max(0.7, data["performance"].max() * 1.1))

    ax.set_title("Inference Requests vs. Performance")
    fig.tight_layout()

    # Save before showing so the file is written even if the interactive
    # window is closed, and make sure the target directory exists.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUTPUT_DIR / "fig_inference_vs_performance.png")
    plt.show()
    plt.close(fig)
    print("Saved: fig_inference_vs_performance.png")
def plot_tool_breakdown_heatmap(summary_df: pd.DataFrame):
    """
    Generate a heatmap showing which tools each agent uses most.

    Each cell is a tool's total call count for a model divided by that
    model's ``n_scenarios``. Tools (columns) and models (rows) are both
    ordered by decreasing total. The figure is written to
    ``OUTPUT_DIR / "fig_tool_usage_heatmap.png"`` and then shown.

    Args:
        summary_df: Per-model summary with ``model``, ``top_tools``,
            ``total_tool_calls`` and ``n_scenarios`` columns. ``top_tools``
            may be a dict or its string form (as read back from a CSV file).
    """
    tool_usage = []
    for _, row in summary_df.iterrows():
        tools = row.get('top_tools')
        if isinstance(tools, str):
            # A summary reloaded from CSV stores the dict as its repr.
            try:
                tools = ast.literal_eval(tools)
            except (ValueError, SyntaxError):
                tools = None
        if not isinstance(tools, dict):
            print("pd.isna")
            continue

        if row['total_tool_calls'] == 0:
            print("No tool calls")
            continue

        for tool, count in tools.items():
            tool_usage.append({
                'model': row['model'],
                'tool': tool,
                'count': count,
                'avg_per_scenario': count / row['n_scenarios']
            })

    df = pd.DataFrame(tool_usage)
    if len(df) == 0:
        print("No tool usage data found")
        return

    # Pivot for heatmap
    pivot_df = df.pivot(index='model', columns='tool', values='avg_per_scenario').fillna(0)

    # Order tools from most to least used across all models (all are kept)
    top_tools = pivot_df.sum().sort_values(ascending=False).index
    pivot_df = pivot_df[top_tools]

    # Sort models by total tool usage
    pivot_df['total'] = pivot_df.sum(axis=1)
    pivot_df = pivot_df.sort_values('total', ascending=False).drop('total', axis=1)

    # Apply the smaller font through a copy: writing into PLOT_PARAMETERS
    # would change the shared style dict for every later plot.
    plt.rcParams.update({**PLOT_PARAMETERS, 'font.size': 8})

    fig, ax = plt.subplots(figsize=(SINGLE_COLUMN_WIDTH * 2, 4))
    sns.heatmap(pivot_df, annot=True, fmt='.1f', cmap='YlOrRd', ax=ax,
                cbar_kws={'label': 'Avg. Calls per Scenario'})
    ax.set_xlabel("")
    ax.set_ylabel("")
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.title("Tool Call Distribution")
    plt.tight_layout()

    # Save before showing so the file is written even if the interactive
    # window is closed, and make sure the target directory exists.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUTPUT_DIR / "fig_tool_usage_heatmap.png")
    plt.show()
    plt.close(fig)
    print("Saved: fig_tool_usage_heatmap.png")
def main():
    """Extract the inference data, save it as CSV and print the summary table.

    Stops after a notice when the extraction yields no summary rows.
    """
    for message in (
        "Extracting inference data for 'react with code' agents...",
        f"Reading from directories: {LEADERBOARD_DIR}",
        f"Output directory: {OUTPUT_DIR}",
    ):
        print(message)

    summary_df, detail_df = extract_all_data()
    if not len(summary_df):
        print("No data extracted!")
        return

    save_data(summary_df, detail_df)
    print_summary(summary_df)


if __name__ == "__main__":
    main()