Tsukihjy's picture
download
raw
8.41 kB
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import matplotlib.font_manager as fm
# Try to use Liberation Serif (Times New Roman compatible). Registering the
# font file is the only step that can fail, so it is the only statement kept
# inside the `try`.
try:
    fm.fontManager.addfont('/usr/share/fonts/truetype/liberation/LiberationSerif-Regular.ttf')
except Exception:
    # Best-effort fallback: a missing or unreadable font file must not abort
    # the script. Catching `Exception` instead of using a bare `except` lets
    # KeyboardInterrupt and SystemExit propagate.
    plt.rcParams['font.family'] = 'serif'
else:
    plt.rcParams['font.family'] = 'Liberation Serif'
# One row per evaluated model: (model id, release date, multi-line display name).
# Rows are listed in chronological release order.
_MODEL_TABLE = [
    ('Claude-4-Sonnet', datetime(2025, 5, 23), 'Claude-4\n-Sonnet'),
    ('Qwen3-Coder', datetime(2025, 7, 23), 'Qwen3\n-Coder'),
    ('DeepSeek-V3.1-Terminus', datetime(2025, 9, 22), 'DeepSeek\n-V3.1\n-Terminus'),
    ('Claude-4.5-Sonnet', datetime(2025, 9, 29), 'Claude-4.5\n-Sonnet'),
    ('GLM-4.6', datetime(2025, 9, 30), 'GLM\n-4.6'),
    ('MiniMax-M2', datetime(2025, 10, 27), 'MiniMax\n-M2'),
    ('Claude-4.5-Opus', datetime(2025, 11, 1), 'Claude-4.5\n-Opus'),
    ('Gemini-3-Pro', datetime(2025, 11, 18), 'Gemini-3\n-Pro'),
    ('GPT-5.1-Codex', datetime(2025, 11, 19), 'GPT-5.1\n-Codex'),
    ('DeepSeek-V3.2', datetime(2025, 12, 1), 'DeepSeek\n-V3.2'),
]

# Model id -> release date.
model_release_dates = {name: released for name, released, _ in _MODEL_TABLE}

# Model id -> display name with explicit line breaks.
model_display_names = {name: label for name, _, label in _MODEL_TABLE}

# Model ids ordered by release date, earliest first.
sorted_models = sorted(model_release_dates, key=model_release_dates.get)
# Load CVE dates: read the spreadsheet and map each value of its 'CVE' column
# to the matching 'Date' value parsed by pandas.
df = pd.read_excel('fliter-excel.xlsx')
cve_dates = dict(zip(df['CVE'], pd.to_datetime(df['Date'])))

# Load task stats. The encoding is passed explicitly so that decoding the JSON
# file does not depend on the platform's default locale encoding.
with open('task_stats_filtered.json', 'r', encoding='utf-8') as f:
    task_stats = json.load(f)
def calculate_stats(agent_name, model_name, *, stats=None, release_dates=None,
                    dates=None):
    """Calculate before/after release stats for a model with an agent.

    Each CVE result stored under the key ``"{agent_name}+{model_name}"`` is
    placed in the "before" bucket when its CVE date is strictly earlier than
    the model's release date, and in the "after" bucket otherwise.

    Args:
        agent_name: Agent part of the results key.
        model_name: Model part of the results key; also used to look up the
            release date.
        stats: Optional mapping of ``"agent+model"`` to ``{cve_id: result}``.
            Defaults to the module-level ``task_stats``.
        release_dates: Optional mapping of model name to release date.
            Defaults to the module-level ``model_release_dates``.
        dates: Optional mapping of CVE id to CVE date. Defaults to the
            module-level ``cve_dates``.

    Returns:
        ``None`` when there are no results for the agent/model pair, otherwise
        a dict with ``before_total``, ``before_rate``, ``after_total`` and
        ``after_rate``. Rates are percentages; a bucket with no CVEs has a
        rate of 0.

    Raises:
        KeyError: If results exist for the pair but ``model_name`` has no
            release date.
    """
    # Fall back to the module-level tables only when no override was given.
    stats = task_stats if stats is None else stats
    release_dates = model_release_dates if release_dates is None else release_dates
    dates = cve_dates if dates is None else dates

    key = f"{agent_name}+{model_name}"
    if key not in stats:
        return None

    release_date = release_dates[model_name]
    before_total = before_success = after_total = after_success = 0

    for cve_id, result in stats[key].items():
        cve_date = dates.get(cve_id)
        # Skip CVEs with no usable date. A missing date (None / NaN / NaT)
        # compares False against the release date, so without this check it
        # would be counted silently as "after release".
        if pd.isna(cve_date):
            continue

        succeeded = bool(result.get('success', False))
        if cve_date < release_date:
            before_total += 1
            if succeeded:
                before_success += 1
        else:
            after_total += 1
            if succeeded:
                after_success += 1

    before_rate = (before_success / before_total * 100) if before_total > 0 else 0
    after_rate = (after_success / after_total * 100) if after_total > 0 else 0
    return {
        'before_total': before_total,
        'before_rate': before_rate,
        'after_total': after_total,
        'after_rate': after_rate,
    }
def _collect_agent_stats(agent_name):
    """Return {model: stats} for every model that has results with `agent_name`."""
    per_model = ((model, calculate_stats(agent_name, model)) for model in sorted_models)
    return {model: stats for model, stats in per_model if stats}


# Per-agent statistics; models without results for an agent are left out.
terminus_stats = _collect_agent_stats('terminus-2')
mini_swe_stats = _collect_agent_stats('mini-swe-agent')

# Only models that have results for both agents are plotted, in release order.
available_models = [
    model for model in sorted_models
    if model in terminus_stats and model in mini_swe_stats
]


def _column(stats_by_model, field):
    """Return `field` for each available model, in plotting order."""
    return [stats_by_model[model][field] for model in available_models]


# One x position per plotted model.
x = np.arange(len(available_models))

# Bar heights are taken from the Terminus-2 statistics.
before_counts = _column(terminus_stats, 'before_total')
after_counts = _column(terminus_stats, 'after_total')

# Pass rates (percent) for each agent, before and after the model release.
terminus_before_rates = _column(terminus_stats, 'before_rate')
terminus_after_rates = _column(terminus_stats, 'after_rate')
mini_swe_before_rates = _column(mini_swe_stats, 'before_rate')
mini_swe_after_rates = _column(mini_swe_stats, 'after_rate')
# Create figure
fig, ax1 = plt.subplots(figsize=(20, 10))
width = 0.35  # bar width; the before/after bars sit side by side around each x

# Colors - slightly deepened blue/purple palette that harmonizes with the green
bar_color1 = '#A8C8E8'   # slightly deeper light blue - Before Release
bar_color2 = '#B8B8D8'   # slightly deeper light purple - After Release
line_color1 = '#1E5D3A'  # dark green - Terminus-2
line_color2 = '#5A3A8A'  # dark purple - Mini-SWE-Agent

# Plot bars
bars1 = ax1.bar(x - width/2, before_counts, width, label='CVE Count (Before Release)', color=bar_color1, edgecolor='#7AA0C0', linewidth=1.5)
bars2 = ax1.bar(x + width/2, after_counts, width, label='CVE Count (After Release)', color=bar_color2, edgecolor='#9090B0', linewidth=1.5)

# Bar value labels: print each count centered just above its bar.
for bars, counts in ((bars1, before_counts), (bars2, after_counts)):
    for bar, count in zip(bars, counts):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2, str(count),
                 ha='center', va='bottom', fontsize=16, fontweight='bold', color='#333333')

# Axis settings
ax1.set_ylabel('Number of CVE Tasks', fontsize=20)
ax1.set_xticks(x)
ax1.set_xticklabels([model_display_names[m] for m in available_models], rotation=0, ha='center', fontsize=16)
# Leave 18% headroom above the tallest bar for the value labels. `default=0`
# avoids a ValueError when there are no models to plot, and the fallback upper
# limit of 1 avoids a degenerate (0, 0) axis range when every count is zero.
tallest = max([*before_counts, *after_counts], default=0)
ax1.set_ylim(0, tallest * 1.18 if tallest > 0 else 1)
ax1.tick_params(axis='y', labelsize=17)
# Second y-axis for pass rates
ax2 = ax1.twinx()

# One entry per agent: (before rates, after rates, marker/line format, color, legend label).
line_series = (
    (terminus_before_rates, terminus_after_rates, 'o-', line_color1, 'Terminus-2'),
    (mini_swe_before_rates, mini_swe_after_rates, 's-', line_color2, 'Mini-SWE-Agent'),
)

# For every model draw a short segment from the "before" bar center to the
# "after" bar center. Only the first segment of each agent carries a legend
# label, so each agent appears once in the legend.
for before_rates, after_rates, fmt, color, legend_label in line_series:
    for idx, (center, rate_before, rate_after) in enumerate(zip(x, before_rates, after_rates)):
        ax2.plot([center - width/2, center + width/2], [rate_before, rate_after],
                 fmt, color=color, linewidth=4, markersize=14,
                 label=legend_label if idx == 0 else '')
# Rate labels. A few models use hand-tuned (dx, dy) offsets, in points, to
# avoid overlapping labels; every other model uses the default given at the
# lookup site.
terminus_before_offsets = {'DeepSeek-V3.1-Terminus': (-5, 16), 'Claude-4.5-Opus': (0, 14)}
terminus_after_offsets = {'GLM-4.6': (5, 16)}
mini_swe_before_offsets = {'DeepSeek-V3.1-Terminus': (-5, -22), 'Claude-4.5-Opus': (0, -22)}


def _annotate_rate(rate, x_pos, offset, color):
    """Write `rate` as a one-decimal percentage, shifted by `offset` points from (x_pos, rate)."""
    ax2.annotate(f'{rate:.1f}%', (x_pos, rate),
                 textcoords="offset points", xytext=offset, ha='center',
                 fontsize=14, color=color, fontweight='bold')


for i, model in enumerate(available_models):
    left = x[i] - width/2
    right = x[i] + width/2

    # Terminus-2 labels (positive vertical offsets: placed above the markers)
    _annotate_rate(terminus_before_rates[i], left,
                   terminus_before_offsets.get(model, (-5, 12)), line_color1)
    _annotate_rate(terminus_after_rates[i], right,
                   terminus_after_offsets.get(model, (5, 12)), line_color1)

    # Mini-SWE-Agent labels (negative vertical offsets: placed below the markers)
    _annotate_rate(mini_swe_before_rates[i], left,
                   mini_swe_before_offsets.get(model, (-5, -20)), line_color2)
    _annotate_rate(mini_swe_after_rates[i], right, (5, -20), line_color2)

# Secondary axis settings.
# NOTE(review): the range is fixed at 0-55, so a pass rate above 55% would
# fall outside the visible axis - confirm this matches the data.
ax2.set_ylabel('Pass Rate (%)', fontsize=20)
ax2.set_ylim(0, 55)
ax2.tick_params(axis='y', labelsize=17)
# Legends: bar legend on the left, pass-rate line legend on the right.
for axis, corner in ((ax1, 'upper left'), (ax2, 'upper right')):
    axis.legend(loc=corner, fontsize=17)

plt.tight_layout()

# Write the figure as a PDF and as a 150-dpi PNG, then close it.
pdf_path = 'terminus2_vs_miniswe.pdf'
png_path = 'terminus2_vs_miniswe_final.png'
plt.savefig(pdf_path, format='pdf', bbox_inches='tight')
plt.savefig(png_path, dpi=150, bbox_inches='tight')
plt.close()

print(f"Saved: {pdf_path} and {png_path}")

Xet Storage Details

Size:
8.41 kB
·
Xet hash:
88fcce8d6a35a17822db302874a2d6f263d292b938e7b141c056b0eb9f5d7c67

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.