Tsukihjy/testcase / LiveCVEBench-io /create_combined_chart.py
Tsukihjy's picture
download
raw
5.89 kB
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
# Release date of each evaluated model, keyed by the model name used in the
# task-stats keys and on the chart's x axis. Entries are listed oldest first.
model_release_dates = {
    'Claude-4-Sonnet': datetime(2025, 5, 23),
    'Qwen3-Coder': datetime(2025, 7, 23),
    'DeepSeek-V3.1-Terminus': datetime(2025, 9, 22),
    'Claude-4.5-Sonnet': datetime(2025, 9, 29),
    'GLM-4.6': datetime(2025, 9, 30),
    'MiniMax-M2': datetime(2025, 10, 27),
    'Claude-4.5-Opus': datetime(2025, 11, 1),
    'Gemini-3-Pro': datetime(2025, 11, 18),
    'GPT-5.1-Codex': datetime(2025, 11, 19),
    'DeepSeek-V3.2': datetime(2025, 12, 1),
}

# Model names ordered by release date (oldest first). Sorting explicitly keeps
# the order correct even if the literal above is later edited out of order.
sorted_models = sorted(model_release_dates, key=model_release_dates.get)
# Load CVE dates from both Excel files. Only the 'CVE' and 'Date' columns are
# kept from each workbook.
df1 = pd.read_excel('Multi-Agent-Tasks-Result.xlsx')[['CVE', 'Date']]
df2 = pd.read_excel('result_01-27-batch (3).xlsx')[['CVE', 'Date']]
combined_df = pd.concat([df1, df2], ignore_index=True)
# A CVE that appears in both workbooks keeps the row from the first one.
combined_df = combined_df.drop_duplicates(subset='CVE', keep='first')
# Map CVE id -> parsed date; a blank 'Date' cell becomes NaT in this mapping.
cve_dates = dict(zip(combined_df['CVE'], pd.to_datetime(combined_df['Date'])))

# Load task stats. The encoding is given explicitly so the JSON is decoded as
# UTF-8 regardless of the platform's locale default.
with open('task_stats_filtered.json', 'r', encoding='utf-8') as f:
    task_stats = json.load(f)
def calculate_stats(agent_name, model_name):
    """Calculate before/after release stats for a model with an agent.

    Reads the per-CVE results stored in ``task_stats`` under the key
    ``"{agent_name}+{model_name}"`` and splits them by whether the CVE's date
    in ``cve_dates`` is strictly earlier than the model's release date in
    ``model_release_dates``. A CVE dated exactly on the release date counts
    as "after".

    CVEs that have no entry in ``cve_dates``, or whose date is missing
    (``NaT``), are skipped: they cannot be placed on either side of the
    release date.

    Args:
        agent_name: Agent part of the ``task_stats`` key.
        model_name: Model part of the ``task_stats`` key; also the key into
            ``model_release_dates``.

    Returns:
        ``None`` when ``task_stats`` has no entry for the agent/model pair.
        Otherwise a dict with ``before_total``, ``before_success``,
        ``before_rate``, ``after_total``, ``after_success`` and
        ``after_rate``. Rates are percentages and are ``0`` when the
        corresponding total is zero.

    Raises:
        KeyError: If the pair has results but ``model_name`` is not in
            ``model_release_dates``.
    """
    key = f"{agent_name}+{model_name}"
    if key not in task_stats:
        return None

    cve_results = task_stats[key]
    release_date = model_release_dates[model_name]

    before_total = 0
    before_success = 0
    after_total = 0
    after_success = 0

    for cve_id, result in cve_results.items():
        if cve_id not in cve_dates:
            continue
        cve_date = cve_dates[cve_id]
        # NaT compares False against any date, so without this guard an
        # undated CVE would silently be counted as "after release".
        if pd.isna(cve_date):
            continue

        success = result.get('success', False)
        if cve_date < release_date:
            before_total += 1
            if success:
                before_success += 1
        else:
            after_total += 1
            if success:
                after_success += 1

    before_rate = (before_success / before_total * 100) if before_total > 0 else 0
    after_rate = (after_success / after_total * 100) if after_total > 0 else 0

    return {
        'before_total': before_total,
        'before_success': before_success,
        'before_rate': before_rate,
        'after_total': after_total,
        'after_success': after_success,
        'after_rate': after_rate,
    }
# Create a combined figure with two stacked subplots, one per agent.
fig, axes = plt.subplots(2, 1, figsize=(18, 16))

# (key used in the task-stats lookup, display name used in the subplot title)
agents = [('terminus-2', 'Terminus-2'), ('mini-swe-agent', 'Mini-SWE-Agent')]

for ax_idx, (agent_key, agent_name) in enumerate(agents):
    ax1 = axes[ax_idx]

    # Calculate stats for all models; calculate_stats returns None for a
    # model that has no results with this agent, and such models are omitted.
    stats = {}
    for model in sorted_models:
        result = calculate_stats(agent_key, model)
        if result:
            stats[model] = result

    # Filter models that have data, preserving release-date order.
    available_models = [m for m in sorted_models if m in stats]
    if not available_models:
        # Nothing to draw for this agent; its subplot is left empty.
        continue

    # Prepare data
    x = np.arange(len(available_models))
    before_counts = [stats[m]['before_total'] for m in available_models]
    after_counts = [stats[m]['after_total'] for m in available_models]
    before_rates = [stats[m]['before_rate'] for m in available_models]
    after_rates = [stats[m]['after_rate'] for m in available_models]

    # Bar width; the "before" and "after" bars sit side by side around each tick.
    width = 0.35

    # Plot bars
    bars1 = ax1.bar(x - width/2, before_counts, width,
                    label='CVE Count (Before Release)', color='#5B9BD5')
    bars2 = ax1.bar(x + width/2, after_counts, width,
                    label='CVE Count (After Release)', color='#ED7D31')

    # Add value labels just above each bar.
    for bars, counts in ((bars1, before_counts), (bars2, after_counts)):
        for bar, count in zip(bars, counts):
            ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, str(count),
                     ha='center', va='bottom', fontsize=9, fontweight='bold')

    ax1.set_xlabel('Model', fontsize=12)
    ax1.set_ylabel('Number of CVE Tasks', fontsize=12)
    ax1.set_xticks(x)
    ax1.set_xticklabels(available_models, rotation=45, ha='right', fontsize=10)
    # Leave 15% headroom for the value labels. The floor of 1 avoids asking
    # matplotlib for a degenerate (0, 0) range when every count is zero.
    peak_count = max(before_counts + after_counts)
    ax1.set_ylim(0, max(peak_count, 1) * 1.15)

    # Create second y-axis for pass rates
    ax2 = ax1.twinx()

    # Define colors for each model
    colors = plt.cm.tab10(np.linspace(0, 1, len(available_models)))

    # Plot pass rate lines
    for i, (model, color) in enumerate(zip(available_models, colors)):
        # Draw line connecting the before-release and after-release rates,
        # positioned over the centres of the two bars for this model.
        ax2.plot([x[i] - width/2, x[i] + width/2],
                 [before_rates[i], after_rates[i]],
                 'o-', color=color, linewidth=2, markersize=8,
                 label=model)

        # Add rate labels
        ax2.annotate(f'{before_rates[i]:.1f}%',
                     (x[i] - width/2, before_rates[i]),
                     textcoords="offset points", xytext=(0, 8),
                     ha='center', fontsize=8, color=color)
        ax2.annotate(f'{after_rates[i]:.1f}%',
                     (x[i] + width/2, after_rates[i]),
                     textcoords="offset points", xytext=(0, 8),
                     ha='center', fontsize=8, color=color)

    ax2.set_ylabel('Pass Rate (%)', fontsize=12)
    # Keep the 0-60 range as the baseline, but grow it when a pass rate would
    # otherwise be drawn outside the axes.
    peak_rate = max(before_rates + after_rates)
    ax2.set_ylim(0, max(60, peak_rate * 1.1))

    # Add legends
    ax1.legend(loc='upper left', fontsize=9)
    ax2.legend(loc='upper right', fontsize=8, title='Pass Rate', ncol=2)

    # Title
    # NOTE(review): the "189 CVEs" figure is hard-coded in the title — confirm
    # it still matches the loaded data.
    ax1.set_title(f'CVE Tasks Before/After Model Release - {agent_name} (Full Year 2025, 189 CVEs)',
                  fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('terminus-2_vs_mini-swe-agent_comparison.png', dpi=150, bbox_inches='tight')
plt.close()
print("Saved: terminus-2_vs_mini-swe-agent_comparison.png")

Xet Storage Details

Size:
5.89 kB
·
Xet hash:
e07b17167771acd55d39d9860dbd2a67a3e5da086fbec53274836760c0819d68

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.