| import json | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| from datetime import datetime | |
| import matplotlib.font_manager as fm | |
| from label_offsets_config import label_offsets, bar_label_offsets | |
# Try to use Liberation Serif (Times New Roman compatible); fall back to the
# generic serif family when the font file cannot be registered.
try:
    fm.fontManager.addfont('/usr/share/fonts/truetype/liberation/LiberationSerif-Regular.ttf')
    plt.rcParams['font.family'] = 'Liberation Serif'
except Exception:
    # Best-effort fallback. `Exception` (rather than a bare `except:`) keeps
    # KeyboardInterrupt and SystemExit from being swallowed here.
    plt.rcParams['font.family'] = 'serif'
# Model release dates, listed in chronological order.
model_release_dates = {
    'Claude-4-Sonnet': datetime(2025, 5, 23),
    'Qwen3-Coder': datetime(2025, 7, 23),
    'DeepSeek-V3.1-Terminus': datetime(2025, 9, 22),
    'Claude-4.5-Sonnet': datetime(2025, 9, 29),
    'GLM-4.6': datetime(2025, 9, 30),
    'MiniMax-M2': datetime(2025, 10, 27),
    'Claude-4.5-Opus': datetime(2025, 11, 1),
    'Gemini-3-Pro': datetime(2025, 11, 18),
    'GPT-5.1-Codex': datetime(2025, 11, 19),
    'DeepSeek-V3.2': datetime(2025, 12, 1),
}

# Model display names used as x tick labels; long names are broken over two
# lines with an explicit newline.
model_display_names = {
    'Claude-4-Sonnet': 'Claude-4-\nSonnet',
    'Qwen3-Coder': 'Qwen3\n-Coder',
    'DeepSeek-V3.1-Terminus': 'DeepSeek-V3.1\n-Terminus',
    'Claude-4.5-Sonnet': 'Claude-4.5\n-Sonnet',
    'GLM-4.6': 'GLM-4.6',
    'MiniMax-M2': 'MiniMax-M2',
    'Claude-4.5-Opus': 'Claude-4.5\n-Opus',
    'Gemini-3-Pro': 'Gemini-3\n-Pro',
    'GPT-5.1-Codex': 'GPT-5.1\n-Codex',
    'DeepSeek-V3.2': 'DeepSeek-V3.2',
}

# Model names ordered by release date (oldest first). Sorting explicitly keeps
# the order correct even if the dict above is edited out of order.
sorted_models = sorted(model_release_dates, key=model_release_dates.get)
# Load CVE dates: map each value of the 'CVE' column to its parsed 'Date'.
# NOTE(review): the file name looks like a typo of "filter"; it is left
# unchanged because it is a runtime path.
df = pd.read_excel('fliter-excel.xlsx')
cve_dates = dict(zip(df['CVE'], pd.to_datetime(df['Date'])))

# Load task stats. The encoding is given explicitly so the result does not
# depend on the platform's default locale encoding.
with open('task_stats_filtered.json', 'r', encoding='utf-8') as f:
    task_stats = json.load(f)
def calculate_stats(agent_name, model_name):
    """Calculate before/after release stats for a model with an agent.

    Results are looked up in ``task_stats`` under the key
    ``"<agent_name>+<model_name>"``. Each CVE in that entry is assigned to the
    "before" bucket when its date in ``cve_dates`` is strictly earlier than the
    model's release date, and to the "after" bucket otherwise.

    CVEs that are absent from ``cve_dates`` or whose date is missing (NaT) are
    skipped. A missing date compares as "not before" and would otherwise be
    counted as an after-release task.

    Args:
        agent_name: Agent part of the ``task_stats`` key.
        model_name: Model part of the key; must be a key of
            ``model_release_dates``.

    Returns:
        ``None`` when there is no entry for the agent/model pair, otherwise a
        dict with ``before_total``, ``before_rate``, ``after_total`` and
        ``after_rate``. Rates are percentages, and 0 when the bucket is empty.
    """
    key = f"{agent_name}+{model_name}"
    if key not in task_stats:
        return None

    release_date = model_release_dates[model_name]
    before_total = before_success = after_total = after_success = 0

    for cve_id, result in task_stats[key].items():
        if cve_id not in cve_dates:
            continue
        cve_date = cve_dates[cve_id]
        if pd.isna(cve_date):
            # Undated CVE: it cannot be placed on either side of the release.
            continue

        success = result.get('success', False)
        if cve_date < release_date:
            before_total += 1
            if success:
                before_success += 1
        else:
            after_total += 1
            if success:
                after_success += 1

    before_rate = (before_success / before_total * 100) if before_total > 0 else 0
    after_rate = (after_success / after_total * 100) if after_total > 0 else 0
    return {
        'before_total': before_total,
        'before_rate': before_rate,
        'after_total': after_total,
        'after_rate': after_rate,
    }
# Calculate stats for both agents. calculate_stats returns None when an
# agent/model pair has no results, so those entries are filtered out.
terminus_stats = {m: calculate_stats('terminus-2', m) for m in sorted_models}
mini_swe_stats = {m: calculate_stats('mini-swe-agent', m) for m in sorted_models}
terminus_stats = {k: v for k, v in terminus_stats.items() if v}
mini_swe_stats = {k: v for k, v in mini_swe_stats.items() if v}

# Only models evaluated with both agents are plotted, in release-date order.
available_models = [m for m in sorted_models if m in terminus_stats and m in mini_swe_stats]
if not available_models:
    # Fail early with a clear message; otherwise the plotting code would
    # later crash on max() of an empty sequence.
    raise SystemExit("No model has results for both agents; nothing to plot.")

# Prepare data
x = np.arange(len(available_models))
# Bar heights (CVE counts) are taken from the terminus-2 results only.
# NOTE(review): presumably both agents ran on the same CVE set - confirm.
before_counts = [terminus_stats[m]['before_total'] for m in available_models]
after_counts = [terminus_stats[m]['after_total'] for m in available_models]
terminus_before_rates = [terminus_stats[m]['before_rate'] for m in available_models]
terminus_after_rates = [terminus_stats[m]['after_rate'] for m in available_models]
mini_swe_before_rates = [mini_swe_stats[m]['before_rate'] for m in available_models]
mini_swe_after_rates = [mini_swe_stats[m]['after_rate'] for m in available_models]
# Create figure - single-column paper, full page width, 1/3 page height
# (compressed height).
# Baseline size and current size.
BASE_FIGSIZE = (20, 10)
CURRENT_FIGSIZE = (28, 9.5)
# Offset scale factors, derived automatically from the size change.
SCALE_X = CURRENT_FIGSIZE[0] / BASE_FIGSIZE[0]  # 1.4
SCALE_Y = CURRENT_FIGSIZE[1] / BASE_FIGSIZE[1]  # 0.95


def scale_offset(offset):
    """Scale an (x, y) offset to follow the change in figure size.

    Args:
        offset: Indexable pair whose first item is the x offset and whose
            second item is the y offset.

    Returns:
        A tuple ``(x * SCALE_X, y * SCALE_Y)``.
    """
    return (offset[0] * SCALE_X, offset[1] * SCALE_Y)
fig, ax1 = plt.subplots(figsize=CURRENT_FIGSIZE)
width = 0.35  # width of one bar; each model gets a before/after pair

# Remove top spine
ax1.spines['top'].set_visible(False)

# Colors - rose & teal tones
bar_color1 = '#F0D0D8'   # pale rose - Before Release
bar_color2 = '#C8E0E0'   # pale teal - After Release
line_color1 = '#C04060'  # deep rose red - Terminus-2
line_color2 = '#206868'  # deep teal - Mini-SWE-Agent

# Plot bars: the "before" bar sits left of each tick, the "after" bar right.
bars1 = ax1.bar(x - width/2, before_counts, width, label='CVE Count (Before Release)',
                color=bar_color1, edgecolor='#D0A0B0', linewidth=1.5)
bars2 = ax1.bar(x + width/2, after_counts, width, label='CVE Count (After Release)',
                color=bar_color2, edgecolor='#90B8B8', linewidth=1.5)

# Bar value labels (using config offsets with auto-scaling). The offsets are
# added to the bar's centre x and its height, i.e. they are in data units.
for bars, counts, side in ((bars1, before_counts, 'before'), (bars2, after_counts, 'after')):
    for model, bar, count in zip(available_models, bars, counts):
        offset = bar_label_offsets.get(model, {}).get(side, (0, 2))
        scaled = scale_offset(offset)
        ax1.text(bar.get_x() + bar.get_width()/2 + scaled[0], bar.get_height() + scaled[1], str(count),
                 ha='center', va='bottom', fontsize=24, fontweight='bold', color='#333333')
# Axis settings for the primary (CVE count) axis.
ax1.set_ylabel('Number of CVE Tasks', fontsize=32)
ax1.set_xticks(x)
ax1.set_xticklabels([model_display_names[m] for m in available_models],
                    rotation=0, ha='center', fontsize=24)
# Upper limit is 18% above the tallest bar (presumably headroom for the value
# labels and legends).
ax1.set_ylim(0, max(max(before_counts), max(after_counts)) * 1.18)
ax1.tick_params(axis='y', labelsize=24)
# Second y-axis for pass rates
ax2 = ax1.twinx()
ax2.spines['top'].set_visible(False)

# Plot lines: for every model, one short segment per agent joins the
# before-release rate (over the left bar) to the after-release rate (over the
# right bar). Only the first segment of each agent carries a legend label so
# the legend shows each agent once.
for agent_label, line_style, color, before_rates, after_rates in (
    ('Terminus-2', 'o-', line_color1, terminus_before_rates, terminus_after_rates),
    ('Mini-SWE-Agent', 's-', line_color2, mini_swe_before_rates, mini_swe_after_rates),
):
    for i, (rate_before, rate_after) in enumerate(zip(before_rates, after_rates)):
        ax2.plot([x[i] - width/2, x[i] + width/2], [rate_before, rate_after],
                 line_style, color=color, linewidth=4, markersize=14,
                 label=agent_label if i == 0 else '')
# Rate labels using offsets from config file (with auto-scaling).
# Fallback offsets (in points) applied per key, so a config entry that only
# overrides some of the four labels no longer raises KeyError.
default_rate_label_offsets = {
    't_before': (-5, 12), 't_after': (5, 12),
    'm_before': (-5, -20), 'm_after': (5, -20)
}
for i, model in enumerate(available_models):
    # Get offsets from config and apply scaling
    raw_offsets = {**default_rate_label_offsets, **label_offsets.get(model, {})}
    offsets = {k: scale_offset(v) for k, v in raw_offsets.items()}

    labelled_points = (
        # Terminus-2 labels (rose, same color as its line)
        ('t_before', x[i] - width/2, terminus_before_rates[i], line_color1),
        ('t_after', x[i] + width/2, terminus_after_rates[i], line_color1),
        # Mini-SWE-Agent labels (teal, same color as its line)
        ('m_before', x[i] - width/2, mini_swe_before_rates[i], line_color2),
        ('m_after', x[i] + width/2, mini_swe_after_rates[i], line_color2),
    )
    for offset_key, x_pos, rate, color in labelled_points:
        ax2.annotate(f'{rate:.1f}', (x_pos, rate),
                     textcoords="offset points", xytext=offsets[offset_key],
                     ha='center', fontsize=20, color=color, fontweight='bold')
ax2.set_ylabel('Pass Rate (%)', fontsize=32)
# Keep the fixed 0-55 range while every rate fits in it; grow the axis only
# when a rate exceeds 55 so that lines and labels are not clipped.
highest_rate = max(
    terminus_before_rates + terminus_after_rates
    + mini_swe_before_rates + mini_swe_after_rates,
    default=0,
)
ax2.set_ylim(0, 55 if highest_rate <= 55 else highest_rate * 1.1)
ax2.tick_params(axis='y', labelsize=24)

# Legends: bar legend on the left axis, agent legend on the right axis.
ax1.legend(loc='upper left', fontsize=24)
ax2.legend(loc='upper right', fontsize=24)

# Compact layout with reduced margins. subplots_adjust runs after
# tight_layout, so its explicit margins are the ones that take effect.
plt.tight_layout(pad=0.5)
plt.subplots_adjust(left=0.055, right=0.945, top=0.97, bottom=0.12)

pdf_path = 'terminus2_vs_miniswe.pdf'
png_path = 'terminus2_vs_miniswe_final.png'
plt.savefig(pdf_path, format='pdf', bbox_inches='tight', pad_inches=0.05)
plt.savefig(png_path, dpi=300, bbox_inches='tight', pad_inches=0.05)
plt.close()
print(f"Saved: {pdf_path} and {png_path}")
Xet Storage Details
- Size:
- 8.67 kB
- Xet hash:
- 9f8127a54192fa7cab55f3935a86cdf91411c044c840386be053815889d946cd
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.