import json
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Model release dates (sorted by date)
model_release_dates = {
    'Claude-4-Sonnet': datetime(2025, 5, 23),
    'Qwen3-Coder': datetime(2025, 7, 23),
    'DeepSeek-V3.1-Terminus': datetime(2025, 9, 22),
    'Claude-4.5-Sonnet': datetime(2025, 9, 29),
    'GLM-4.6': datetime(2025, 9, 30),
    'MiniMax-M2': datetime(2025, 10, 27),
    'Claude-4.5-Opus': datetime(2025, 11, 1),
    'Gemini-3-Pro': datetime(2025, 11, 18),
    'GPT-5.1-Codex': datetime(2025, 11, 19),
    'DeepSeek-V3.2': datetime(2025, 12, 1),
}

# Model names ordered by release date, earliest first. Sorting explicitly
# (rather than trusting the literal's order) keeps the ordering correct if an
# entry is later added out of sequence. Iterating a dict yields its keys and
# dict.get maps each key to its date, so no .keys()/lambda is needed.
sorted_models = sorted(model_release_dates, key=model_release_dates.get)
# Load CVE dates from fliter-excel.xlsx
df = pd.read_excel('/workspace/terminal-bench/hello-world-task/Result_CVE/fliter-excel.xlsx')
# Map each value of the 'CVE' column to its parsed 'Date' value.
# NOTE(review): blank/unparseable date cells presumably become NaT here --
# confirm against the spreadsheet.
cve_dates = dict(zip(df['CVE'], pd.to_datetime(df['Date'])))
print(f"Loaded {len(cve_dates)} CVE dates from fliter-excel.xlsx")

# Load task stats. calculate_stats() reads this as
# {"<agent>+<model>": {cve_id: {"success": ...}}}.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('task_stats_filtered.json', 'r', encoding='utf-8') as f:
    task_stats = json.load(f)
def calculate_stats(agent_name, model_name):
    """Calculate before/after release stats for a model with an agent.

    Looks up the results stored under ``"{agent_name}+{model_name}"`` in the
    module-level ``task_stats`` and splits them into two buckets according to
    whether each CVE's date (from ``cve_dates``) is strictly earlier than the
    model's release date (from ``model_release_dates``).

    CVEs that are absent from ``cve_dates`` or whose date is missing (NaT/None)
    are skipped; a missing date would otherwise fail the ``<`` comparison and
    be counted as "after release".

    Args:
        agent_name: Agent part of the ``task_stats`` key.
        model_name: Model part of the ``task_stats`` key; also used to look up
            the release date.

    Returns:
        ``None`` when ``task_stats`` has no entry for the agent/model pair.
        Otherwise a dict with ``before_total``, ``before_success``,
        ``before_rate``, ``after_total``, ``after_success`` and ``after_rate``.
        Rates are percentages, and 0 when the bucket is empty.

    Raises:
        KeyError: If results exist for the pair but ``model_name`` is not in
            ``model_release_dates``.
    """
    cve_results = task_stats.get(f"{agent_name}+{model_name}")
    if cve_results is None:
        return None

    release_date = model_release_dates[model_name]
    totals = {'before': 0, 'after': 0}
    successes = {'before': 0, 'after': 0}

    for cve_id, result in cve_results.items():
        cve_date = cve_dates.get(cve_id)
        # Covers both "CVE not in the date table" (None) and "date cell was
        # empty/unparseable" (NaT): neither can be placed relative to release.
        if pd.isna(cve_date):
            continue

        # A CVE dated exactly on the release date counts as "after".
        bucket = 'before' if cve_date < release_date else 'after'
        totals[bucket] += 1
        if result.get('success', False):
            successes[bucket] += 1

    def pass_rate(bucket):
        # Percentage of successful tasks; 0 for an empty bucket to avoid
        # dividing by zero.
        total = totals[bucket]
        return (successes[bucket] / total * 100) if total > 0 else 0

    return {
        'before_total': totals['before'],
        'before_success': successes['before'],
        'before_rate': pass_rate('before'),
        'after_total': totals['after'],
        'after_success': successes['after'],
        'after_rate': pass_rate('after'),
    }
def create_chart(agent_name, output_filename):
    """Create a chart for a specific agent.

    For every model in ``sorted_models`` that has results for ``agent_name``,
    draws a pair of bars (number of CVE tasks dated before / after the model's
    release) on the left axis and a short line joining the before/after pass
    rates on the right axis. The figure is saved to ``output_filename`` and a
    text summary of the same numbers is printed.

    Args:
        agent_name: Agent whose results are charted; passed to
            ``calculate_stats`` together with each model name.
        output_filename: Path the image is written to.

    Returns:
        None. If no model has data for the agent, a message is printed and
        nothing is saved.
    """
    # Calculate stats for all models. sorted_models is in release-date order
    # and dicts keep insertion order, so the keys of `stats` are the models
    # with data, still in release-date order.
    stats = {}
    for model in sorted_models:
        result = calculate_stats(agent_name, model)
        if result:
            stats[model] = result
    available_models = list(stats)

    if not available_models:
        print(f"No data available for {agent_name}")
        return

    # Prepare data
    x = np.arange(len(available_models))
    before_counts = [stats[m]['before_total'] for m in available_models]
    after_counts = [stats[m]['after_total'] for m in available_models]
    before_rates = [stats[m]['before_rate'] for m in available_models]
    after_rates = [stats[m]['after_rate'] for m in available_models]

    # Create figure
    fig, ax1 = plt.subplots(figsize=(16, 8))

    # Bar width; also the horizontal distance between a model's two bars
    width = 0.35

    # Plot bars
    bars1 = ax1.bar(x - width/2, before_counts, width,
                    label='CVE Count (Before Release)', color='#5B9BD5')
    bars2 = ax1.bar(x + width/2, after_counts, width,
                    label='CVE Count (After Release)', color='#ED7D31')

    # Add value labels just above each bar
    for bars, counts in ((bars1, before_counts), (bars2, after_counts)):
        for bar, count in zip(bars, counts):
            ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, str(count),
                     ha='center', va='bottom', fontsize=9, fontweight='bold')

    ax1.set_xlabel('Model', fontsize=12)
    ax1.set_ylabel('Number of CVE Tasks', fontsize=12)
    ax1.set_xticks(x)
    ax1.set_xticklabels(available_models, rotation=45, ha='right', fontsize=10)
    # 15% headroom for the value labels. The floor of 1 keeps the axis from
    # collapsing to (0, 0) when every count is zero.
    ax1.set_ylim(0, max(*before_counts, *after_counts, 1) * 1.15)

    # Create second y-axis for pass rates
    ax2 = ax1.twinx()

    # Define colors for each model
    colors = plt.cm.tab10(np.linspace(0, 1, len(available_models)))

    # Plot pass rate lines
    for i, (model, color) in enumerate(zip(available_models, colors)):
        left, right = x[i] - width/2, x[i] + width/2
        # Draw line connecting before and after
        ax2.plot([left, right],
                 [before_rates[i], after_rates[i]],
                 'o-', color=color, linewidth=2, markersize=8,
                 label=model)
        # Add rate labels slightly above each marker
        for x_pos, rate in ((left, before_rates[i]), (right, after_rates[i])):
            ax2.annotate(f'{rate:.1f}%',
                         (x_pos, rate),
                         textcoords="offset points", xytext=(0, 8),
                         ha='center', fontsize=8, color=color)

    ax2.set_ylabel('Pass Rate (%)', fontsize=12)
    # Keep the 0-60 range for typical data, but grow the axis (with headroom
    # for the labels) when a pass rate would otherwise be drawn off the chart.
    ax2.set_ylim(0, max(60, max(before_rates + after_rates) * 1.15))

    # Add legends
    ax1.legend(loc='upper left', fontsize=9)
    ax2.legend(loc='upper right', fontsize=8, title='Pass Rate', ncol=2)

    # Title
    # NOTE(review): the CVE count in the title is hard-coded, not derived from
    # the loaded data -- confirm it matches the dataset.
    plt.title(f'CVE Tasks Before/After Model Release - {agent_name} (Full Year 2025, 189 CVEs)',
              fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(output_filename, dpi=150, bbox_inches='tight')
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)
    print(f"Saved: {output_filename}")

    # Print stats
    print(f"\nStatistics for {agent_name}:")
    print("-" * 80)
    for model in available_models:
        s = stats[model]
        print(f"{model:25s}: Before={s['before_total']:3d} ({s['before_rate']:.1f}%), "
              f"After={s['after_total']:3d} ({s['after_rate']:.1f}%)")
# Render one chart per agent; each pair is (agent name, output image file).
for agent, png_path in (
    ('terminus-2', 'terminus-2_full_year_188.png'),
    ('mini-swe-agent', 'mini-swe-agent_full_year_188.png'),
):
    create_chart(agent, png_path)
print("\nDone!")
Xet Storage Details
- Size: 6.16 kB
- Xet hash: fbf31aeb261538fc41965c29a73e19d4c3e6c9706502228b244f9aef82c26b79
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.