diff --git a/README.md b/README.md
index 8cf69bc2df1860fce1000109ea502ff1ce62bfa4..2af69967b32a311992ef1b99b8434760e307e860 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,4 @@ app_file: app.py
pinned: false
license: apache-2.0
short_description: Realistic and Comprehensive Bilingual Long-Context Benchmark
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
\ No newline at end of file
+---
\ No newline at end of file
diff --git a/app.py b/app.py
index cbffdf1ba490e3ae1fb244c10909cccfa7652993..45df31031d17acbf8aeaaf420b343b8cd685e430 100644
--- a/app.py
+++ b/app.py
@@ -1,7 +1,880 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+LongBenchmark 结果可视化
+"""
+
+import json
+import re
+import pandas as pd
+from pathlib import Path
import gradio as gr
+import plotly.graph_objects as go
+
+# Load per-model metadata once at import time. The leaderboard code looks up
+# each model name in this mapping and reads its "type", "context_length" and
+# "url" entries when the value is a dict.
+with open('./results/model_info.json', 'r', encoding='utf-8') as f:
+ MODLE_INFO_DICT = json.load(f)
+
+def get_color(index):
+    """Return a CSS ``hsl(...)`` color string for a palette index.
+
+    The hue advances by the golden angle (about 137.508 degrees) per index
+    and wraps at 360, so any number of indices get well-spread hues.
+    Saturation and lightness are fixed at 70% and 60%.
+    """
+    golden_angle = 137.508
+    return f"hsl({(index * golden_angle) % 360}, 70%, 60%)"
+
+class ResultParser:
+    """Collect ``*_summary.json`` result files and build the leaderboard table.
+
+    Every immediate sub-directory of ``output_dir`` is treated as one model
+    and every ``*_summary.json`` file inside it as one evaluation run of that
+    model. Parsed runs are stored in ``self.results`` as plain dicts.
+    """
+
+    def __init__(self, output_dir: str):
+        """Remember the results root; nothing is read until ``scan_all_results``."""
+        self.output_dir = Path(output_dir)
+        self.results = []
+
+    def parse_filename(self, filename: str):
+        """Extract the run settings encoded in a summary file name.
+
+        Args:
+            filename: File name such as
+                ``nonthinking_context-120000_bon-3_summary.json``.
+
+        Returns:
+            ``(context_length, has_thinking, has_nonthinking)``.
+            ``context_length`` is the integer after ``context-`` (0 when the
+            marker is absent). At most one of the two flags is true.
+        """
+        context_match = re.search(r'context-(\d+)', filename)
+        context_length = int(context_match.group(1)) if context_match else 0
+
+        filename_lower = filename.lower()
+        # "nonthinking" contains "thinking", so it has to be ruled out first.
+        # The separator is optional so that "non-thinking" / "non_thinking"
+        # spellings are not mistaken for thinking runs.
+        has_nonthinking = re.search(r'non[-_ ]?thinking', filename_lower) is not None
+        has_thinking = 'thinking' in filename_lower and not has_nonthinking
+
+        return context_length, has_thinking, has_nonthinking
+
+    def parse_result_file(self, model_name: str, file_path: Path):
+        """Parse one summary file into a result dict.
+
+        Args:
+            model_name: Name of the model the file belongs to.
+            file_path: Path of the ``*_summary.json`` file.
+
+        Returns:
+            A dict with the overall score, the per-dimension score mappings,
+            the BoN overall scores and the pass@k values, or ``None`` when the
+            file cannot be read or parsed (the error is printed).
+        """
+        # Deliberate best-effort boundary: one broken file must not stop the
+        # scan, so any failure is reported and the file is skipped.
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            context_length, has_thinking, has_nonthinking = self.parse_filename(file_path.name)
+
+            # Only the overall metric of each BoN section is kept.
+            bon_data = {}
+            for bon_key in ['BoN-1', 'BoN-2', 'BoN-3']:
+                if bon_key in data and 'overall_metric' in data[bon_key]:
+                    bon_data[bon_key] = data[bon_key]['overall_metric']
+
+            return {
+                'model_name': model_name,
+                # The evaluation date comes from the file's own "date" field.
+                'eval_date': data.get('date', "未知"),
+                'context_length': context_length,
+                'has_thinking': has_thinking,
+                'has_nonthinking': has_nonthinking,
+                'overall_metric': data.get('average_overall_metric', 0.0),
+                'token_length_metrics': data.get('average_token_length_metric', {}),
+                'contextual_requirement': data.get('average_contextual_requirement_metric', {}),
+                'difficulty': data.get('average_difficulty_metric', {}),
+                'primary_task': data.get('average_primary_task_metric', {}),
+                'language': data.get('average_language_metric', {}),
+                'bon_data': bon_data,
+                'pass_at_k': {
+                    'pass@1': data.get('pass@1'),
+                    'pass@2': data.get('pass@2'),
+                    'pass@3': data.get('pass@3')
+                }
+            }
+        except Exception as e:
+            print(f"解析文件 {file_path} 时出错: {e}")
+            return None
+
+    def scan_all_results(self):
+        """Rebuild ``self.results`` from every summary file under ``output_dir``.
+
+        Directories and files are visited in sorted order so the result list
+        does not depend on filesystem enumeration order. A missing output
+        directory is reported and leaves ``self.results`` empty.
+        """
+        self.results = []
+
+        if not self.output_dir.exists():
+            print(f"输出目录不存在: {self.output_dir}")
+            return
+
+        for model_dir in sorted(self.output_dir.iterdir()):
+            if not model_dir.is_dir():
+                continue
+
+            model_name = model_dir.name
+            print(f"扫描模型: {model_name}")
+
+            for file_path in sorted(model_dir.glob("*_summary.json")):
+                print(f"  解析文件: {file_path.name}")
+                result = self.parse_result_file(model_name, file_path)
+                if result:
+                    self.results.append(result)
+
+        print(f"总共解析了 {len(self.results)} 个结果文件")
+
+    @staticmethod
+    def _format_context_length(length):
+        """Format a token count as ``"1M"`` / ``"1.5M"`` / ``"120k"`` / ``"512"``."""
+        if length >= 1000000:
+            return f"{length/1000000:.0f}M" if length % 1000000 == 0 else f"{length/1000000:.1f}M"
+        if length >= 1000:
+            return f"{length/1000:.0f}k" if length % 1000 == 0 else f"{length/1000:.1f}k"
+        return str(length)
+
+    def get_leaderboard_data(self):
+        """Aggregate the parsed runs into one leaderboard row per model.
+
+        Runs flagged as thinking feed the thinking score; every other run
+        (including file names with neither marker) feeds the non-thinking
+        score. Each score is the mean over the model's runs, shown times 100
+        with two decimals, or ``"-"`` when the model has no run of that kind.
+
+        Returns:
+            A ``pandas.DataFrame`` sorted by the higher of the two mean
+            scores, descending. It is empty when no results are loaded.
+        """
+        if not self.results:
+            return pd.DataFrame()
+
+        # Group the runs by model name.
+        model_groups = {}
+        for result in self.results:
+            group = model_groups.setdefault(result['model_name'], {
+                'contexts': [],
+                'thinking_scores': [],
+                'non_thinking_scores': []
+            })
+            group['contexts'].append(result['context_length'])
+
+            score = result['overall_metric']
+            if result['has_thinking']:
+                group['thinking_scores'].append(score)
+            else:
+                group['non_thinking_scores'].append(score)
+
+        leaderboard_data = []
+        for model_name, group in model_groups.items():
+            # The largest context length among the runs is shown as the
+            # truncation length.
+            max_context = max(group['contexts']) if group['contexts'] else 0
+            context_str = self._format_context_length(max_context)
+
+            # Model metadata: a dict entry supplies type, context length and
+            # URL; any other entry is used as the type string itself.
+            model_type = "Unknown"
+            model_context = "-"
+            model_url = ""
+            if model_name in MODLE_INFO_DICT:
+                model_info = MODLE_INFO_DICT[model_name]
+                if isinstance(model_info, dict):
+                    model_type = model_info.get("type", "Unknown")
+                    model_context = model_info.get("context_length", "-")
+                    model_url = model_info.get("url", "")
+                else:
+                    model_type = str(model_info)
+
+            # Render the name as a markdown link when a URL is known.
+            display_model_name = model_name
+            if model_url:
+                display_model_name = f"[{display_model_name}]({model_url})"
+
+            nt_score_val = 0
+            nt_score_str = "-"
+            if group['non_thinking_scores']:
+                nt_score_val = sum(group['non_thinking_scores']) / len(group['non_thinking_scores'])
+                nt_score_str = f"{nt_score_val * 100:.2f}"
+
+            t_score_val = 0
+            t_score_str = "-"
+            if group['thinking_scores']:
+                t_score_val = sum(group['thinking_scores']) / len(group['thinking_scores'])
+                t_score_str = f"{t_score_val * 100:.2f}"
+
+            leaderboard_data.append({
+                '模型名称': display_model_name,
+                '模型类型': model_type,
+                '上下文长度': model_context,
+                '截断长度': context_str,
+                '非思考得分': nt_score_str,
+                '思考得分': t_score_str,
+                # Helper column, dropped again after sorting.
+                '_sort_score': max(nt_score_val, t_score_val)
+            })
+
+        df = pd.DataFrame(leaderboard_data)
+        if not df.empty:
+            df = df.sort_values('_sort_score', ascending=False).drop(columns=['_sort_score']).reset_index(drop=True)
+
+        return df
+
+def get_display_name_for_result(result):
+    """Return the label used for a result in selectors and chart legends.
+
+    The model name gets a ``_nonthinking`` or ``_thinking`` suffix according
+    to the result's flags (non-thinking wins when both are set) and is
+    returned unchanged when neither flag is set.
+    """
+    if result.get('has_nonthinking'):
+        suffix = "_nonthinking"
+    elif result.get('has_thinking'):
+        suffix = "_thinking"
+    else:
+        return result['model_name']
+    return f"{result['model_name']}{suffix}"
+
+def get_model_color_index(model_name, all_models):
+    """Return the position of ``model_name`` in ``all_models``, or 0 if absent."""
+    try:
+        position = all_models.index(model_name)
+    except ValueError:
+        # Unknown models fall back to the first palette slot.
+        position = 0
+    return position
+
+def create_contextual_requirement_chart(results, selected_models):
+    """Build a grouped bar chart of scores per contextual-requirement type.
+
+    Args:
+        results: Parsed result dicts; each must carry a
+            ``'contextual_requirement'`` mapping of type name to score.
+        selected_models: Display names of the models to plot. Their order
+            sets the trace order and the color given to each model.
+
+    Returns:
+        A ``go.Figure`` with one bar trace per selected model that has data,
+        or an empty figure when nothing is selected.
+    """
+    if not selected_models:
+        return go.Figure()
+
+    # Single pass: record each score (times 100) and the order in which the
+    # requirement types are first seen. Values are taken from the summary as
+    # is; a later result with the same display name overwrites earlier ones.
+    scores_by_model = {}
+    requirement_types = []
+    for entry in results:
+        label = get_display_name_for_result(entry)
+        if label not in selected_models:
+            continue
+        model_scores = scores_by_model.setdefault(label, {})
+        for requirement, value in entry['contextual_requirement'].items():
+            model_scores[requirement] = value * 100
+            if requirement not in requirement_types:
+                requirement_types.append(requirement)
+
+    figure = go.Figure()
+    for label in selected_models:
+        if label not in scores_by_model:
+            continue
+        # Types a model has no score for are drawn as 0.
+        bar_heights = [scores_by_model[label].get(kind, 0) for kind in requirement_types]
+        figure.add_trace(go.Bar(
+            name=label,
+            x=requirement_types,
+            y=bar_heights,
+            marker_color=get_color(get_model_color_index(label, selected_models)),
+            text=[f"{height:.2f}" for height in bar_heights],
+            textposition='auto'
+        ))
+
+    # Horizontal legend placed below the plot; the bottom margin makes room.
+    legend_below_plot = dict(
+        orientation="h",
+        yanchor="top",
+        y=-0.25,
+        xanchor="center",
+        x=0.5
+    )
+    figure.update_layout(
+        title='模型在不同上下文需求上的性能对比',
+        xaxis_title='上下文需求类型',
+        yaxis_title='平均得分',
+        barmode='group',
+        autosize=True,
+        legend=legend_below_plot,
+        margin=dict(b=100)
+    )
+
+    return figure
+
+def create_primary_task_radar_chart(results, selected_models):
+    """Build a radar chart of mean scores per primary-task prefix.
+
+    Task keys are grouped by the text before their first ``'.'`` and the
+    scores inside each group are averaged per model. Only the first 11
+    prefixes, in order of first appearance, become radar axes.
+
+    Args:
+        results: Parsed result dicts; ``'primary_task'`` maps task key to score.
+        selected_models: Display names of the models to plot, in trace order.
+
+    Returns:
+        A ``go.Figure`` with one closed polygon per selected model that has
+        data, or an empty figure when nothing is selected.
+    """
+    if not selected_models:
+        return go.Figure()
+
+    axis_prefixes = []   # prefixes in order of first appearance
+    grouped_scores = {}  # display name -> prefix -> list of scores (times 100)
+    for entry in results:
+        label = get_display_name_for_result(entry)
+        if label not in selected_models:
+            continue
+        buckets = grouped_scores.setdefault(label, {})
+        for task_key, value in entry.get('primary_task', {}).items():
+            if isinstance(task_key, str):
+                prefix = task_key.split('.')[0].strip()
+            else:
+                prefix = str(task_key)
+            if prefix not in axis_prefixes:
+                axis_prefixes.append(prefix)
+            buckets.setdefault(prefix, []).append(value * 100)
+
+    categories = axis_prefixes[:11]
+
+    figure = go.Figure()
+    for label in selected_models:
+        buckets = grouped_scores.get(label)
+        if buckets is None:
+            continue
+        # Mean per prefix; a prefix the model has no score for counts as 0.
+        averages = []
+        for prefix in categories:
+            samples = buckets.get(prefix, [])
+            averages.append(sum(samples) / len(samples) if samples else 0)
+        # Repeat the first point at the end so the outline closes.
+        closed_r = averages + averages[:1]
+        closed_theta = categories + categories[:1]
+        figure.add_trace(go.Scatterpolar(
+            r=closed_r,
+            theta=closed_theta,
+            mode='lines+markers',
+            name=label,
+            line=dict(color=get_color(get_model_color_index(label, selected_models)), width=3),
+            marker=dict(size=6),
+            fill='toself'
+        ))
+
+    legend_below_plot = dict(
+        orientation="h",
+        yanchor="top",
+        y=-0.2,
+        xanchor="center",
+        x=0.5
+    )
+    figure.update_layout(
+        title='模型在不同主要任务上的性能对比',
+        polar=dict(
+            radialaxis=dict(visible=True, range=[0, 100])
+        ),
+        legend=legend_below_plot,
+        margin=dict(b=100)
+    )
+
+    return figure
+
+def create_language_chart(results, selected_models):
+    """Build a grouped bar chart of scores per language.
+
+    Args:
+        results: Parsed result dicts; each must carry a ``'language'``
+            mapping of language name to score.
+        selected_models: Display names of the models to plot. Their order
+            sets the trace order and the color given to each model.
+
+    Returns:
+        A ``go.Figure`` with one bar trace per selected model that has data,
+        or an empty figure when nothing is selected.
+    """
+    if not selected_models:
+        return go.Figure()
+
+    # Single pass: record each score (times 100) and the order in which the
+    # languages are first seen. Values are taken from the summary as is; a
+    # later result with the same display name overwrites earlier ones.
+    scores_by_model = {}
+    languages = []
+    for entry in results:
+        label = get_display_name_for_result(entry)
+        if label not in selected_models:
+            continue
+        model_scores = scores_by_model.setdefault(label, {})
+        for lang_name, value in entry['language'].items():
+            model_scores[lang_name] = value * 100
+            if lang_name not in languages:
+                languages.append(lang_name)
+
+    figure = go.Figure()
+    for label in selected_models:
+        if label not in scores_by_model:
+            continue
+        # Languages a model has no score for are drawn as 0.
+        bar_heights = [scores_by_model[label].get(lang_name, 0) for lang_name in languages]
+        figure.add_trace(go.Bar(
+            name=label,
+            x=languages,
+            y=bar_heights,
+            marker_color=get_color(get_model_color_index(label, selected_models)),
+            text=[f"{height:.2f}" for height in bar_heights],
+            textposition='auto'
+        ))
+
+    # Horizontal legend placed below the plot; the bottom margin makes room.
+    legend_below_plot = dict(
+        orientation="h",
+        yanchor="top",
+        y=-0.25,
+        xanchor="center",
+        x=0.5
+    )
+    figure.update_layout(
+        title='模型在不同语言上的性能对比',
+        xaxis_title='语言类型',
+        yaxis_title='平均得分',
+        barmode='group',
+        autosize=True,
+        legend=legend_below_plot,
+        margin=dict(b=100)
+    )
+
+    return figure
+
+def create_difficulty_chart(results, selected_models):
+    """Build a grouped bar chart of scores per difficulty level.
+
+    Args:
+        results: Parsed result dicts; each must carry a ``'difficulty'``
+            mapping of difficulty name to score.
+        selected_models: Display names of the models to plot. Their order
+            sets the trace order and the color given to each model.
+
+    Returns:
+        A ``go.Figure`` with one bar trace per selected model that has data,
+        or an empty figure when nothing is selected.
+    """
+    if not selected_models:
+        return go.Figure()
+
+    # Single pass: record each score (times 100) and the order in which the
+    # difficulty levels are first seen. Values are taken from the summary as
+    # is; a later result with the same display name overwrites earlier ones.
+    scores_by_model = {}
+    difficulty_levels = []
+    for entry in results:
+        label = get_display_name_for_result(entry)
+        if label not in selected_models:
+            continue
+        model_scores = scores_by_model.setdefault(label, {})
+        for level, value in entry['difficulty'].items():
+            model_scores[level] = value * 100
+            if level not in difficulty_levels:
+                difficulty_levels.append(level)
+
+    figure = go.Figure()
+    for label in selected_models:
+        if label not in scores_by_model:
+            continue
+        # Levels a model has no score for are drawn as 0.
+        bar_heights = [scores_by_model[label].get(level, 0) for level in difficulty_levels]
+        figure.add_trace(go.Bar(
+            name=label,
+            x=difficulty_levels,
+            y=bar_heights,
+            marker_color=get_color(get_model_color_index(label, selected_models)),
+            text=[f"{height:.2f}" for height in bar_heights],
+            textposition='auto'
+        ))
+
+    # Horizontal legend placed below the plot; the bottom margin makes room.
+    legend_below_plot = dict(
+        orientation="h",
+        yanchor="top",
+        y=-0.25,
+        xanchor="center",
+        x=0.5
+    )
+    figure.update_layout(
+        title='模型在不同难度上的性能对比',
+        xaxis_title='难度类型',
+        yaxis_title='平均得分',
+        barmode='group',
+        autosize=True,
+        legend=legend_below_plot,
+        margin=dict(b=100)
+    )
+
+    return figure
+
+def create_length_heatmap(results, selected_models):
+    """Build a heatmap of scores per context-length bucket (x) and model (y).
+
+    Args:
+        results: Parsed result dicts; ``'token_length_metrics'`` maps a
+            bucket key such as ``'8k'`` to a score.
+        selected_models: Display names of the models to include.
+
+    Returns:
+        A ``go.Figure`` with one heatmap row per matching result, in the
+        order the results are stored, or an empty figure when nothing is
+        selected. Buckets without a score are left empty and labelled "N/A".
+    """
+    if not selected_models:
+        return go.Figure()
+
+    # The bucket keys are used both to look up scores and as x-axis labels,
+    # so the two cannot drift apart.
+    length_keys = ['8k', '16k', '32k', '64k', '128k', '256k']
+
+    heatmap_data = []
+    model_names = []
+    for result in results:
+        display_name = get_display_name_for_result(result)
+        if display_name not in selected_models:
+            continue
+        model_names.append(display_name)
+
+        token_length_metrics = result.get('token_length_metrics', {})
+        # Scores are shown times 100; None marks a bucket with no data.
+        heatmap_data.append([
+            token_length_metrics[key] * 100 if key in token_length_metrics else None
+            for key in length_keys
+        ])
+
+    fig = go.Figure(data=go.Heatmap(
+        z=heatmap_data,
+        x=length_keys,
+        y=model_names,
+        # NOTE(review): "_r" reverses RdYlBu, so low scores should map to the
+        # blue end and high scores to the red end (the previous comment said
+        # the opposite) - confirm this is the intended direction.
+        colorscale='RdYlBu_r',
+        showscale=True,
+        text=[[f"{val:.2f}" if val is not None else "N/A" for val in row] for row in heatmap_data],
+        texttemplate="%{text}",
+        textfont={"size": 10},
+        hoverongaps=False
+    ))
+
+    fig.update_layout(
+        title='模型在不同Context长度上的性能热力图',
+        xaxis_title='Context长度 (tokens)',
+        yaxis_title='模型名称',
+        autosize=True,
+        # Grow with the number of rows, but never below 400 px.
+        height=max(400, len(model_names) * 50),
+        # The wide left margin leaves room for the model names.
+        margin=dict(l=150, r=50, t=80, b=80)
+    )
+
+    return fig
+
+def create_bon_chart(results, selected_models):
+    """Build a line chart of the overall score at Best-of-N for N = 1..3.
+
+    Args:
+        results: Parsed result dicts; ``'bon_data'`` maps ``'BoN-1'`` ..
+            ``'BoN-3'`` to an overall score.
+        selected_models: Display names of the models to plot. Their order
+            sets the trace order and the color given to each model.
+
+    Returns:
+        A ``go.Figure`` with one line per selected model that has at least
+        one BoN value, or an empty figure when nothing is selected.
+    """
+    if not selected_models:
+        return go.Figure()
+
+    bon_labels = ['BoN-1', 'BoN-2', 'BoN-3']
+    bon_indices = [1, 2, 3]
+
+    # display name -> {N: score times 100}
+    model_data = {}
+    for result in results:
+        display_name = get_display_name_for_result(result)
+        if display_name not in selected_models:
+            continue
+        per_model = model_data.setdefault(display_name, {})
+        bon_data = result.get('bon_data', {})
+        for bon_index, bon_key in zip(bon_indices, bon_labels):
+            if bon_key in bon_data:
+                per_model[bon_index] = bon_data[bon_key] * 100
+
+    fig = go.Figure()
+
+    # Follow the selection order, like the other comparison charts.
+    for model_name in selected_models:
+        data = model_data.get(model_name)
+        if not data:
+            continue
+
+        # None leaves a gap at a missing N (connectgaps is off below).
+        y_values = [data.get(bon_index) for bon_index in bon_indices]
+        text_values = ["" if value is None else f"{value:.2f}" for value in y_values]
+
+        color_index = get_model_color_index(model_name, selected_models)
+
+        fig.add_trace(go.Scatter(
+            x=bon_indices,
+            y=y_values,
+            # "text" has to be part of the mode, otherwise the value labels
+            # (and textposition) have no effect on the plot itself.
+            mode='lines+markers+text',
+            name=model_name,
+            line=dict(color=get_color(color_index), width=3),
+            marker=dict(size=10),
+            text=text_values,
+            textposition='top center',
+            connectgaps=False
+        ))
+
+    fig.update_layout(
+        title='模型在不同Best-of-N下的对比',
+        xaxis_title='N',
+        yaxis_title='平均得分',
+        autosize=True,
+        # Show the BoN labels instead of the numeric positions 1..3.
+        xaxis=dict(
+            tickmode='array',
+            tickvals=bon_indices,
+            ticktext=bon_labels,
+            tickangle=0
+        ),
+        legend=dict(
+            orientation="h",
+            yanchor="top",
+            y=-0.25,
+            xanchor="center",
+            x=0.5
+        ),
+        margin=dict(b=100)
+    )
+
+    return fig
+
+def create_pass_k_chart(results, selected_models):
+    """Build a line chart of Pass@N for N = 1..3.
+
+    Args:
+        results: Parsed result dicts; ``'pass_at_k'`` maps ``'pass@1'`` ..
+            ``'pass@3'`` to a value, or to ``None`` when it is not available.
+        selected_models: Display names of the models to plot. Their order
+            sets the trace order and the color given to each model.
+
+    Returns:
+        A ``go.Figure`` with one line per selected model that has at least
+        one Pass@N value, or an empty figure when nothing is selected.
+    """
+    if not selected_models:
+        return go.Figure()
+
+    k_labels = ['pass@1', 'pass@2', 'pass@3']
+    k_indices = [1, 2, 3]
+
+    # display name -> {N: value times 100}
+    model_data = {}
+    for result in results:
+        display_name = get_display_name_for_result(result)
+        if display_name not in selected_models:
+            continue
+        per_model = model_data.setdefault(display_name, {})
+        pass_data = result.get('pass_at_k', {})
+        for k_index, k_key in zip(k_indices, k_labels):
+            val = pass_data.get(k_key)
+            if val is not None:
+                per_model[k_index] = val * 100
+
+    fig = go.Figure()
+
+    # Follow the selection order, like the other comparison charts.
+    for model_name in selected_models:
+        data = model_data.get(model_name)
+        if not data:
+            continue
+
+        # None leaves a gap at a missing N (connectgaps is off below).
+        y_values = [data.get(k_index) for k_index in k_indices]
+        text_values = ["" if value is None else f"{value:.2f}" for value in y_values]
+
+        color_index = get_model_color_index(model_name, selected_models)
+
+        fig.add_trace(go.Scatter(
+            x=k_indices,
+            y=y_values,
+            # "text" has to be part of the mode, otherwise the value labels
+            # (and textposition) have no effect on the plot itself.
+            mode='lines+markers+text',
+            name=model_name,
+            line=dict(color=get_color(color_index), width=3),
+            marker=dict(size=10),
+            text=text_values,
+            textposition='top center',
+            connectgaps=False
+        ))
+
+    fig.update_layout(
+        title='模型在不同Pass@N下的对比',
+        xaxis_title='N',
+        yaxis_title='Pass@N (%)',
+        autosize=True,
+        # Show the pass@k labels instead of the numeric positions 1..3.
+        xaxis=dict(
+            tickmode='array',
+            tickvals=k_indices,
+            ticktext=k_labels,
+            tickangle=0
+        ),
+        legend=dict(
+            orientation="h",
+            yanchor="top",
+            y=-0.25,
+            xanchor="center",
+            x=0.5
+        ),
+        margin=dict(b=100)
+    )
+
+    return fig
+
+def create_gradio_interface(parser: ResultParser):
+    """Build the Gradio app: a leaderboard table plus per-dimension charts.
+
+    Args:
+        parser: Result parser whose ``results`` feed the table and charts.
+            It is re-scanned every time the page loads.
+
+    Returns:
+        The ``gr.Blocks`` app, not yet launched.
+    """
+
+    def refresh_data():
+        """Re-scan the result files and return the leaderboard table."""
+        parser.scan_all_results()
+        return parser.get_leaderboard_data()
+
+    def get_model_choices():
+        """Return the sorted, de-duplicated display names of all results."""
+        if not parser.results:
+            return []
+        return sorted({get_display_name_for_result(r) for r in parser.results})
+
+    def update_charts(selected_models):
+        """Rebuild all seven charts for the selected models.
+
+        The tuple order must match the ``outputs`` list of the button's
+        click handler below.
+        """
+        if not selected_models:
+            return None, None, None, None, None, None, None
+
+        length_heatmap = create_length_heatmap(parser.results, selected_models)
+        contextual_chart = create_contextual_requirement_chart(parser.results, selected_models)
+        primary_task_radar_chart = create_primary_task_radar_chart(parser.results, selected_models)
+        language_chart = create_language_chart(parser.results, selected_models)
+        difficulty_chart = create_difficulty_chart(parser.results, selected_models)
+        bon_chart = create_bon_chart(parser.results, selected_models)
+        pass_k_chart = create_pass_k_chart(parser.results, selected_models)
+
+        return length_heatmap, contextual_chart, primary_task_radar_chart, language_chart, difficulty_chart, bon_chart, pass_k_chart
+
+    # Custom CSS:
+    # 1. center every table header, including its inner button/text wrappers;
+    # 2. center every body column except the model-name column.
+    custom_css = """
+    /* 强制标题居中 */
+    h1 {
+        text-align: center;
+        display: block;
+    }
+
+    /* 表头居中 */
+    #leaderboard_table th,
+    #leaderboard_table th button,
+    #leaderboard_table th span {
+        text-align: center !important;
+        justify-content: center !important;
+    }
+
+    /* 内容列居中:从第3列开始(跳过行号和模型名称) */
+    #leaderboard_table td:nth-child(n+3) {
+        text-align: center !important;
+    }
+    """
+
+    with gr.Blocks(title="LongBench Pro 结果可视化", theme=gr.themes.Soft(), css=custom_css) as demo:
+        gr.Markdown("# LongBench Pro 结果可视化")
+
+        # NOTE(review): this HTML block is empty in the patch as shown -
+        # confirm whether markup was meant to be here.
+        gr.HTML("""
+
+        """)
+
+        # Leaderboard section.
+        gr.Markdown("## 🏆 总体性能排行榜")
+        gr.Markdown("""
+        - *思考模型和混合思考模型的思考得分,使用本身的思考能力(Non-Thinking Prompt)*
+        - *指令模型的思考得分,使用思考提示获得(Thinking Prompt)*
+        """)
+        leaderboard_df = gr.Dataframe(
+            headers=["模型名称", "模型类型", "上下文长度", "截断长度", "非思考得分", "思考得分"],
+            datatype=["markdown", "str", "str", "str", "str", "str"],
+            interactive=False,
+            wrap=True,
+            show_row_numbers=True,
+            show_search="filter",
+            show_fullscreen_button=True,
+            max_height=800,
+            column_widths=["250px", "100px", "100px", "100px", "120px", "120px"],
+            elem_id="leaderboard_table"
+        )
+
+        # Model filter and chart section.
+        # NOTE(review): this literal was split across two lines in the patch
+        # (an unterminated string, i.e. a SyntaxError). It is restored here as
+        # a line-break spacer - confirm the intended markup.
+        gr.HTML("<br>")
+        gr.Markdown("## 📊 特定维度对比")
+        with gr.Row():
+            with gr.Column(scale=4):
+                model_selector = gr.Dropdown(
+                    choices=[],
+                    label="选择模型",
+                    value=[],
+                    multiselect=True,
+                    interactive=True
+                )
+            with gr.Column(scale=1):
+                update_charts_btn = gr.Button("更新图表", variant="primary", size="lg")
+
+        with gr.Tabs():
+            with gr.TabItem("语言维度"):
+                language_plot = gr.Plot()
+
+            with gr.TabItem("难度维度"):
+                difficulty_plot = gr.Plot()
+
+            with gr.TabItem("长度维度"):
+                length_heatmap = gr.Plot()
+
+            with gr.TabItem("主要任务维度"):
+                primary_task_radar_plot = gr.Plot()
+
+            with gr.TabItem("上下文需求维度"):
+                contextual_plot = gr.Plot()
+
+            with gr.TabItem("BoN维度"):
+                bon_plot = gr.Plot()
+
+            with gr.TabItem("Pass@N维度"):
+                pass_k_plot = gr.Plot()
+
+        # Event wiring.
+        def update_model_choices():
+            """Refill the model dropdown and clear the current selection."""
+            models = get_model_choices()
+            return gr.Dropdown(choices=models, value=[])
+
+        update_charts_btn.click(
+            fn=update_charts,
+            inputs=[model_selector],
+            outputs=[length_heatmap, contextual_plot, primary_task_radar_plot, language_plot, difficulty_plot, bon_plot, pass_k_plot]
+        )
+
+        # On page load: refresh the leaderboard first, then fill the dropdown
+        # from the freshly scanned results.
+        demo.load(
+            fn=refresh_data,
+            outputs=[leaderboard_df]
+        ).then(
+            fn=update_model_choices,
+            outputs=[model_selector]
+        )
+
+    return demo
-def greet(name):
- return "Hello " + name + "!!"
+def main():
+    """Scan ``./results``, build the Gradio app and launch it."""
+    print("初始化结果解析器...")
+    parser = ResultParser("./results")
+
+    print("扫描结果文件...")
+    parser.scan_all_results()
+
+    print("创建Gradio界面...")
+    app = create_gradio_interface(parser)
+
+    print("启动服务器...")
+    app.launch()
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
\ No newline at end of file
+# Start the app only when this file is run as a script, not when imported.
+if __name__ == "__main__":
+ main()
diff --git a/results/Claude-3.7-Sonnet/nonthinking_context-120000_bon-3_summary.json b/results/Claude-3.7-Sonnet/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..14c2164381f9749a23ed31700de4b037cae30702
--- /dev/null
+++ b/results/Claude-3.7-Sonnet/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5144730485997339,
+ "inference_iteration_1_overall_metric": 0.5192628714494713,
+ "inference_iteration_2_overall_metric": 0.5090899475543829,
+ "inference_iteration_3_overall_metric": 0.515066326795347,
+ "average_token_length_metric": {
+ "8k": 0.5927607589235461,
+ "16k": 0.5922491004183165,
+ "32k": 0.5555486925170308,
+ "64k": 0.4991997081584744,
+ "128k": 0.45285894052515324,
+ "256k": 0.39422109105588254
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.47584256909012274,
+ "Partial": 0.5636391134301498
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6868572950582806,
+ "Moderate": 0.48375113564429373,
+ "Hard": 0.4728683670759167,
+ "Extreme": 0.3731393645349295
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7586982224527067,
+ "T2. Sequencing & Structure Reconstruction": 0.7545327049493711,
+ "T3. Evidence-Grounded QA": 0.5277777777777779,
+ "T4. Summarization & Synthesis": 0.5250996637138268,
+ "T5. Attribution & Citation Alignment": 0.5254132304220211,
+ "T6. Aggregation & Clustering": 0.47394883159992857,
+ "T7. Consistency & Compliance Checking": 0.3040021982475052,
+ "T8. Structured & Numeric Reasoning": 0.41188271604938276,
+ "T9. Version & Code Diff Analysis": 0.6042705189653765,
+ "T10. Rule Induction & In-Context Learning": 0.5114814814814815,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.43888888888888894
+ },
+ "average_language_metric": {
+ "Chinese": 0.5100199196277736,
+ "English": 0.5189261775716956
+ },
+ "BoN-1": {
+ "overall_metric": 0.5192628714494713,
+ "token_length": {
+ "8k": 0.5937761874854375,
+ "16k": 0.606154781504802,
+ "32k": 0.5701163293545726,
+ "64k": 0.49747085680734393,
+ "128k": 0.4476635155122931,
+ "256k": 0.40039555803238175
+ },
+ "contextual_requirement": {
+ "Full": 0.47604900489990065,
+ "Partial": 0.5742623379671082
+ },
+ "difficulty": {
+ "Easy": 0.6993477849390543,
+ "Moderate": 0.4824572359285609,
+ "Hard": 0.47426941765067004,
+ "Extreme": 0.37571516352087697
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7482072090157142,
+ "T2. Sequencing & Structure Reconstruction": 0.7523087560587559,
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
+ "T4. Summarization & Synthesis": 0.5249609995597003,
+ "T5. Attribution & Citation Alignment": 0.5307787048666445,
+ "T6. Aggregation & Clustering": 0.47337460590728553,
+ "T7. Consistency & Compliance Checking": 0.30808530916861365,
+ "T8. Structured & Numeric Reasoning": 0.40740740740740744,
+ "T9. Version & Code Diff Analysis": 0.6209514621148434,
+ "T10. Rule Induction & In-Context Learning": 0.5469444444444443,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.45
+ },
+ "language": {
+ "Chinese": 0.5151862466463957,
+ "English": 0.5233394962525484
+ }
+ },
+ "pass@1": 0.2673333333333333,
+ "BoN-2": {
+ "overall_metric": 0.5638452937577435,
+ "token_length": {
+ "8k": 0.6334745299899752,
+ "16k": 0.6535009894669588,
+ "32k": 0.6109603205298609,
+ "64k": 0.5566414838337063,
+ "128k": 0.5030500434216845,
+ "256k": 0.4254443953042751
+ },
+ "contextual_requirement": {
+ "Full": 0.523183868231744,
+ "Partial": 0.6155961989726518
+ },
+ "difficulty": {
+ "Easy": 0.7554808778953406,
+ "Moderate": 0.5292531239954449,
+ "Hard": 0.5124984336029716,
+ "Extreme": 0.41036353942072434
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8018540164669292,
+ "T2. Sequencing & Structure Reconstruction": 0.7873358123358121,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5413926274234249,
+ "T5. Attribution & Citation Alignment": 0.5675265408978069,
+ "T6. Aggregation & Clustering": 0.5409163851157314,
+ "T7. Consistency & Compliance Checking": 0.34558583778191654,
+ "T8. Structured & Numeric Reasoning": 0.4685185185185185,
+ "T9. Version & Code Diff Analysis": 0.6508982849457906,
+ "T10. Rule Induction & In-Context Learning": 0.6081944444444445,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.49166666666666664
+ },
+ "language": {
+ "Chinese": 0.5545031574164562,
+ "English": 0.5731874300990306
+ }
+ },
+ "pass@2": 0.30533333333333335,
+ "BoN-3": {
+ "overall_metric": 0.5987860690936202,
+ "token_length": {
+ "8k": 0.6767030215651172,
+ "16k": 0.6801366595488965,
+ "32k": 0.6345247903374839,
+ "64k": 0.6039272497657204,
+ "128k": 0.5233376257525678,
+ "256k": 0.47408706759193875
+ },
+ "contextual_requirement": {
+ "Full": 0.5614328850984434,
+ "Partial": 0.6463264850874838
+ },
+ "difficulty": {
+ "Easy": 0.7882244354415668,
+ "Moderate": 0.5766160107480841,
+ "Hard": 0.5655081578600745,
+ "Extreme": 0.42768418415872295
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8178727620501064,
+ "T2. Sequencing & Structure Reconstruction": 0.8166698116698113,
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
+ "T4. Summarization & Synthesis": 0.5469270547891242,
+ "T5. Attribution & Citation Alignment": 0.581425502230592,
+ "T6. Aggregation & Clustering": 0.5699768544212985,
+ "T7. Consistency & Compliance Checking": 0.3875679491876082,
+ "T8. Structured & Numeric Reasoning": 0.5462962962962963,
+ "T9. Version & Code Diff Analysis": 0.6792246386283735,
+ "T10. Rule Induction & In-Context Learning": 0.6452777777777778,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
+ },
+ "language": {
+ "Chinese": 0.5867683815613326,
+ "English": 0.6108037566259096
+ }
+ },
+ "pass@3": 0.3433333333333333
+}
\ No newline at end of file
diff --git a/results/Claude-3.7-Sonnet/thinking_context-120000_bon-3_summary.json b/results/Claude-3.7-Sonnet/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..23a24c84a7e4bd03fb7073fed7fd86e496ac58f6
--- /dev/null
+++ b/results/Claude-3.7-Sonnet/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5966078087059191,
+ "inference_iteration_1_overall_metric": 0.5938171820634314,
+ "inference_iteration_2_overall_metric": 0.5955816438384393,
+ "inference_iteration_3_overall_metric": 0.6004246002158852,
+ "average_token_length_metric": {
+ "8k": 0.6997135645823386,
+ "16k": 0.6577212798228894,
+ "32k": 0.6419035800281319,
+ "64k": 0.6238264957040918,
+ "128k": 0.523846643485212,
+ "256k": 0.43263528861285133
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5527640561663963,
+ "Partial": 0.652408948301675
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7825568834883242,
+ "Moderate": 0.6155843766907921,
+ "Hard": 0.5658238514809286,
+ "Extreme": 0.4006643574451805
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8486700113628681,
+ "T2. Sequencing & Structure Reconstruction": 0.8000456983629063,
+ "T3. Evidence-Grounded QA": 0.5027777777777777,
+ "T4. Summarization & Synthesis": 0.5309981037882555,
+ "T5. Attribution & Citation Alignment": 0.5878801280848494,
+ "T6. Aggregation & Clustering": 0.5732629573374424,
+ "T7. Consistency & Compliance Checking": 0.3939611740106759,
+ "T8. Structured & Numeric Reasoning": 0.6212962962962962,
+ "T9. Version & Code Diff Analysis": 0.69342672946219,
+ "T10. Rule Induction & In-Context Learning": 0.5610185185185187,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5499999999999997
+ },
+ "average_language_metric": {
+ "Chinese": 0.588350975552306,
+ "English": 0.6048646418595319
+ },
+ "BoN-1": {
+ "overall_metric": 0.5938171820634314,
+ "token_length": {
+ "8k": 0.7191856133745022,
+ "16k": 0.6402554520543442,
+ "32k": 0.6341044882273853,
+ "64k": 0.6259136300211012,
+ "128k": 0.5177437687626877,
+ "256k": 0.4257001399405685
+ },
+ "contextual_requirement": {
+ "Full": 0.5550251903269423,
+ "Partial": 0.6431888079098727
+ },
+ "difficulty": {
+ "Easy": 0.7745826629942724,
+ "Moderate": 0.6051939662024207,
+ "Hard": 0.5720458613452599,
+ "Extreme": 0.40262413493324634
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8535130690589529,
+ "T2. Sequencing & Structure Reconstruction": 0.7808294450361011,
+ "T3. Evidence-Grounded QA": 0.5,
+ "T4. Summarization & Synthesis": 0.5297721568932299,
+ "T5. Attribution & Citation Alignment": 0.5849862779597039,
+ "T6. Aggregation & Clustering": 0.5655938313957183,
+ "T7. Consistency & Compliance Checking": 0.3878929800328972,
+ "T8. Structured & Numeric Reasoning": 0.6314814814814815,
+ "T9. Version & Code Diff Analysis": 0.6772724985908704,
+ "T10. Rule Induction & In-Context Learning": 0.5855555555555555,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
+ },
+ "language": {
+ "Chinese": 0.5905648959587577,
+ "English": 0.5970694681681056
+ }
+ },
+ "pass@1": 0.37266666666666665,
+ "BoN-2": {
+ "overall_metric": 0.6528466974974912,
+ "token_length": {
+ "8k": 0.7638003125719123,
+ "16k": 0.7101654799888735,
+ "32k": 0.7028814358570394,
+ "64k": 0.6870948174265773,
+ "128k": 0.565071084237125,
+ "256k": 0.4880670549034259
+ },
+ "contextual_requirement": {
+ "Full": 0.6177398094156152,
+ "Partial": 0.6975281914198818
+ },
+ "difficulty": {
+ "Easy": 0.8281250142111442,
+ "Moderate": 0.6919969423858446,
+ "Hard": 0.6370194614390577,
+ "Extreme": 0.4455182924797047
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8893518121473104,
+ "T2. Sequencing & Structure Reconstruction": 0.8424985248669454,
+ "T3. Evidence-Grounded QA": 0.6,
+ "T4. Summarization & Synthesis": 0.544816527949887,
+ "T5. Attribution & Citation Alignment": 0.6380851790586051,
+ "T6. Aggregation & Clustering": 0.6286177167059521,
+ "T7. Consistency & Compliance Checking": 0.45873422235423905,
+ "T8. Structured & Numeric Reasoning": 0.6578703703703704,
+ "T9. Version & Code Diff Analysis": 0.7510537661056169,
+ "T10. Rule Induction & In-Context Learning": 0.618611111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333
+ },
+ "language": {
+ "Chinese": 0.6376749344767387,
+ "English": 0.6680184605182464
+ }
+ },
+ "pass@2": 0.44333333333333336,
+ "BoN-3": {
+ "overall_metric": 0.6830970097115105,
+ "token_length": {
+ "8k": 0.7837282981657816,
+ "16k": 0.7434736805597991,
+ "32k": 0.7293201301488285,
+ "64k": 0.7233152422607191,
+ "128k": 0.6019561639441493,
+ "256k": 0.5167885431897875
+ },
+ "contextual_requirement": {
+ "Full": 0.6474666039819896,
+ "Partial": 0.7284447988218108
+ },
+ "difficulty": {
+ "Easy": 0.8607457235279341,
+ "Moderate": 0.7303808913214545,
+ "Hard": 0.6594975210053281,
+ "Extreme": 0.47293457878264067
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9066883296818435,
+ "T2. Sequencing & Structure Reconstruction": 0.8643315529499735,
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
+ "T4. Summarization & Synthesis": 0.5522881837833883,
+ "T5. Attribution & Citation Alignment": 0.6507666087400348,
+ "T6. Aggregation & Clustering": 0.6674614963830652,
+ "T7. Consistency & Compliance Checking": 0.48674980053348876,
+ "T8. Structured & Numeric Reasoning": 0.7092592592592593,
+ "T9. Version & Code Diff Analysis": 0.7716543341971472,
+ "T10. Rule Induction & In-Context Learning": 0.6727777777777777,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334
+ },
+ "language": {
+ "Chinese": 0.6635013624534305,
+ "English": 0.7026926569695916
+ }
+ },
+ "pass@3": 0.4826666666666667
+}
\ No newline at end of file
diff --git a/results/Claude-4-Sonnet/nonthinking_context-1000000_bon-3_summary.json b/results/Claude-4-Sonnet/nonthinking_context-1000000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..f9b4f9410b13418916d6f5d5f814cc03fde28168
--- /dev/null
+++ b/results/Claude-4-Sonnet/nonthinking_context-1000000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5606565628619046,
+ "inference_iteration_1_overall_metric": 0.5620036050651629,
+ "inference_iteration_2_overall_metric": 0.5631248059457928,
+ "inference_iteration_3_overall_metric": 0.5568412775747574,
+ "average_token_length_metric": {
+ "8k": 0.6071528197414233,
+ "16k": 0.5816154959256097,
+ "32k": 0.5612446325027117,
+ "64k": 0.5254403501645888,
+ "128k": 0.5465188702735214,
+ "256k": 0.541967208563576
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5264359183584719,
+ "Partial": 0.6042101104117302
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6842286640015465,
+ "Moderate": 0.5396282888053806,
+ "Hard": 0.5757154269645611,
+ "Extreme": 0.4292097599695439
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8091570544332052,
+ "T2. Sequencing & Structure Reconstruction": 0.8207229190562522,
+ "T3. Evidence-Grounded QA": 0.5027777777777778,
+ "T4. Summarization & Synthesis": 0.5379205858992453,
+ "T5. Attribution & Citation Alignment": 0.6634794615844591,
+ "T6. Aggregation & Clustering": 0.5193980953038955,
+ "T7. Consistency & Compliance Checking": 0.41333574812040713,
+ "T8. Structured & Numeric Reasoning": 0.3430555555555555,
+ "T9. Version & Code Diff Analysis": 0.7423632867012375,
+ "T10. Rule Induction & In-Context Learning": 0.5125462962962964,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5055555555555556
+ },
+ "average_language_metric": {
+ "Chinese": 0.5499191967861178,
+ "English": 0.5713939289376934
+ },
+ "BoN-1": {
+ "overall_metric": 0.5620036050651629,
+ "token_length": {
+ "8k": 0.6116650341526972,
+ "16k": 0.5789009200875381,
+ "32k": 0.564884526756138,
+ "64k": 0.5150575277083638,
+ "128k": 0.5512053153279016,
+ "256k": 0.5503083063583438
+ },
+ "contextual_requirement": {
+ "Full": 0.5327035707633319,
+ "Partial": 0.5992945578129499
+ },
+ "difficulty": {
+ "Easy": 0.6903796852415736,
+ "Moderate": 0.5418819297458669,
+ "Hard": 0.5712079185155163,
+ "Extreme": 0.4285441662812896
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8022069774152061,
+ "T2. Sequencing & Structure Reconstruction": 0.8176220076220074,
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
+ "T4. Summarization & Synthesis": 0.5377531684167628,
+ "T5. Attribution & Citation Alignment": 0.6702134910265403,
+ "T6. Aggregation & Clustering": 0.509165168922684,
+ "T7. Consistency & Compliance Checking": 0.4132211721134369,
+ "T8. Structured & Numeric Reasoning": 0.336574074074074,
+ "T9. Version & Code Diff Analysis": 0.755058796168736,
+ "T10. Rule Induction & In-Context Learning": 0.5204166666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
+ },
+ "language": {
+ "Chinese": 0.5525156622096248,
+ "English": 0.5714915479207029
+ }
+ },
+ "pass@1": 0.2986666666666667,
+ "BoN-2": {
+ "overall_metric": 0.6070413152649775,
+ "token_length": {
+ "8k": 0.648428309386666,
+ "16k": 0.6138312363156012,
+ "32k": 0.6050527198490672,
+ "64k": 0.5709459527531863,
+ "128k": 0.6043903989020145,
+ "256k": 0.5995992743833368
+ },
+ "contextual_requirement": {
+ "Full": 0.5725088578790201,
+ "Partial": 0.6509917155743812
+ },
+ "difficulty": {
+ "Easy": 0.7433988294310949,
+ "Moderate": 0.5959678783418261,
+ "Hard": 0.6127141026903381,
+ "Extreme": 0.461215101348602
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8352274736404431,
+ "T2. Sequencing & Structure Reconstruction": 0.8427542827542828,
+ "T3. Evidence-Grounded QA": 0.6,
+ "T4. Summarization & Synthesis": 0.5511890444295781,
+ "T5. Attribution & Citation Alignment": 0.709250434569062,
+ "T6. Aggregation & Clustering": 0.5590519848415796,
+ "T7. Consistency & Compliance Checking": 0.4538880045271121,
+ "T8. Structured & Numeric Reasoning": 0.400462962962963,
+ "T9. Version & Code Diff Analysis": 0.7896296658102724,
+ "T10. Rule Induction & In-Context Learning": 0.5898611111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.55
+ },
+ "language": {
+ "Chinese": 0.6018498726474579,
+ "English": 0.6122327578825001
+ }
+ },
+ "pass@2": 0.3486666666666667,
+ "BoN-3": {
+ "overall_metric": 0.6359674546066088,
+ "token_length": {
+ "8k": 0.6791089276748106,
+ "16k": 0.6344789904129863,
+ "32k": 0.6382754639404674,
+ "64k": 0.6155932927177382,
+ "128k": 0.6246025171745627,
+ "256k": 0.6237455357190941
+ },
+ "contextual_requirement": {
+ "Full": 0.5997037562420825,
+ "Partial": 0.6821212525251004
+ },
+ "difficulty": {
+ "Easy": 0.7810114340069705,
+ "Moderate": 0.6237269051463531,
+ "Hard": 0.6375808018524918,
+ "Extreme": 0.4840585077179298
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8531556276526557,
+ "T2. Sequencing & Structure Reconstruction": 0.8630080105080103,
+ "T3. Evidence-Grounded QA": 0.6416666666666667,
+ "T4. Summarization & Synthesis": 0.5557327849332047,
+ "T5. Attribution & Citation Alignment": 0.7262293044645985,
+ "T6. Aggregation & Clustering": 0.5966415249311195,
+ "T7. Consistency & Compliance Checking": 0.47966164058768634,
+ "T8. Structured & Numeric Reasoning": 0.4393518518518518,
+ "T9. Version & Code Diff Analysis": 0.7950682623015005,
+ "T10. Rule Induction & In-Context Learning": 0.6329166666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6083333333333333
+ },
+ "language": {
+ "Chinese": 0.6245809400158543,
+ "English": 0.6473539691973664
+ }
+ },
+ "pass@3": 0.38
+}
\ No newline at end of file
diff --git a/results/Claude-4-Sonnet/thinking_context-1000000_bon-3_summary.json b/results/Claude-4-Sonnet/thinking_context-1000000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..56c05dee9caa23c7aa666e1e62f40e3fb8b47e9a
--- /dev/null
+++ b/results/Claude-4-Sonnet/thinking_context-1000000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 3,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.6987364832054667,
+ "inference_iteration_1_overall_metric": 0.7019992434982991,
+ "inference_iteration_2_overall_metric": 0.6978899327024527,
+ "inference_iteration_3_overall_metric": 0.6963202734156487,
+ "average_token_length_metric": {
+ "8k": 0.7273068305229948,
+ "16k": 0.7148161402734813,
+ "32k": 0.7282156837997693,
+ "64k": 0.7051754330841736,
+ "128k": 0.6642984844940268,
+ "256k": 0.6526063270583587
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.6617044440872328,
+ "Partial": 0.7458681693559481
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8377531760390221,
+ "Moderate": 0.7658446956684767,
+ "Hard": 0.7472224806628969,
+ "Extreme": 0.4705256363582413
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.907681462321177,
+ "T2. Sequencing & Structure Reconstruction": 0.8890326186159514,
+ "T3. Evidence-Grounded QA": 0.661111111111111,
+ "T4. Summarization & Synthesis": 0.5383660231848545,
+ "T5. Attribution & Citation Alignment": 0.7860152219301051,
+ "T6. Aggregation & Clustering": 0.6671470819716809,
+ "T7. Consistency & Compliance Checking": 0.5518199768375653,
+ "T8. Structured & Numeric Reasoning": 0.6859567901234568,
+ "T9. Version & Code Diff Analysis": 0.8575767924690506,
+ "T10. Rule Induction & In-Context Learning": 0.6481481481481483,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5888888888888888
+ },
+ "average_language_metric": {
+ "Chinese": 0.6865315143381795,
+ "English": 0.7109414520727555
+ },
+ "BoN-1": {
+ "overall_metric": 0.7019992434982991,
+ "token_length": {
+ "8k": 0.7253942984634015,
+ "16k": 0.7347686831241128,
+ "32k": 0.7405843026072749,
+ "64k": 0.6852109109698611,
+ "128k": 0.6670707753146399,
+ "256k": 0.6589664905105064
+ },
+ "contextual_requirement": {
+ "Full": 0.6664641511659741,
+ "Partial": 0.747225724648532
+ },
+ "difficulty": {
+ "Easy": 0.8574789158116471,
+ "Moderate": 0.7537775763318981,
+ "Hard": 0.7422849474268556,
+ "Extreme": 0.47120899649989867
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9235851079801015,
+ "T2. Sequencing & Structure Reconstruction": 0.8866411828911827,
+ "T3. Evidence-Grounded QA": 0.7166666666666667,
+ "T4. Summarization & Synthesis": 0.5397488846450174,
+ "T5. Attribution & Citation Alignment": 0.7968571187727533,
+ "T6. Aggregation & Clustering": 0.6625413639156731,
+ "T7. Consistency & Compliance Checking": 0.5348173324914223,
+ "T8. Structured & Numeric Reasoning": 0.7013888888888888,
+ "T9. Version & Code Diff Analysis": 0.8605924270512658,
+ "T10. Rule Induction & In-Context Learning": 0.6277777777777778,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.575
+ },
+ "language": {
+ "Chinese": 0.7011338968631923,
+ "English": 0.7028645901334066
+ }
+ },
+ "pass@1": 0.4786666666666667,
+ "BoN-2": {
+ "overall_metric": 0.7662921811826703,
+ "token_length": {
+ "8k": 0.7864626566034867,
+ "16k": 0.7903224578099031,
+ "32k": 0.7829418518359753,
+ "64k": 0.776573418015718,
+ "128k": 0.7369784925906059,
+ "256k": 0.724474210240336
+ },
+ "contextual_requirement": {
+ "Full": 0.7387739445109663,
+ "Partial": 0.8013153914921128
+ },
+ "difficulty": {
+ "Easy": 0.9044185942106279,
+ "Moderate": 0.8489744455895216,
+ "Hard": 0.8300963355442819,
+ "Extreme": 0.5187660309473054
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9337455622019767,
+ "T2. Sequencing & Structure Reconstruction": 0.920133061383061,
+ "T3. Evidence-Grounded QA": 0.8,
+ "T4. Summarization & Synthesis": 0.5512587507775879,
+ "T5. Attribution & Citation Alignment": 0.851460921546061,
+ "T6. Aggregation & Clustering": 0.7263341037175177,
+ "T7. Consistency & Compliance Checking": 0.6403373459035246,
+ "T8. Structured & Numeric Reasoning": 0.7763888888888889,
+ "T9. Version & Code Diff Analysis": 0.8952412388875778,
+ "T10. Rule Induction & In-Context Learning": 0.7372222222222222,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.675
+ },
+ "language": {
+ "Chinese": 0.7559027655426326,
+ "English": 0.7766815968227087
+ }
+ },
+ "pass@2": 0.5586666666666666,
+ "BoN-3": {
+ "overall_metric": 0.7907893209368455,
+ "token_length": {
+ "8k": 0.8018918816218435,
+ "16k": 0.8122398647423218,
+ "32k": 0.8052123235968958,
+ "64k": 0.8059913515132954,
+ "128k": 0.7658095463175019,
+ "256k": 0.7535909578292158
+ },
+ "contextual_requirement": {
+ "Full": 0.7602540929686584,
+ "Partial": 0.8296523383509014
+ },
+ "difficulty": {
+ "Easy": 0.9273787695935313,
+ "Moderate": 0.8740334338695857,
+ "Hard": 0.8656496547603084,
+ "Extreme": 0.5373159132889705
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9422092659363656,
+ "T2. Sequencing & Structure Reconstruction": 0.9295053557553554,
+ "T3. Evidence-Grounded QA": 0.8416666666666667,
+ "T4. Summarization & Synthesis": 0.5573876341160701,
+ "T5. Attribution & Citation Alignment": 0.8621875560226953,
+ "T6. Aggregation & Clustering": 0.7544804518953127,
+ "T7. Consistency & Compliance Checking": 0.6657566567798248,
+ "T8. Structured & Numeric Reasoning": 0.8055555555555556,
+ "T9. Version & Code Diff Analysis": 0.9043321479784867,
+ "T10. Rule Induction & In-Context Learning": 0.7838888888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.725
+ },
+ "language": {
+ "Chinese": 0.7791329047815275,
+ "English": 0.8024457370921636
+ }
+ },
+ "pass@3": 0.598
+}
\ No newline at end of file
diff --git a/results/DeepSeek-R1-0528/thinking_context-120000_bon-3_summary.json b/results/DeepSeek-R1-0528/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..46a5fb503fa4d93cb24b38f9f1f1a1d872fd58d0
--- /dev/null
+++ b/results/DeepSeek-R1-0528/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 1,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.61893761586453,
+ "inference_iteration_1_overall_metric": 0.6270481753191374,
+ "inference_iteration_2_overall_metric": 0.6115350419668117,
+ "inference_iteration_3_overall_metric": 0.6182296303076407,
+ "average_token_length_metric": {
+ "8k": 0.7165288754873151,
+ "16k": 0.6828199674990499,
+ "32k": 0.6181133860648209,
+ "64k": 0.6286866574208946,
+ "128k": 0.5812085902020846,
+ "256k": 0.48626821851301744
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5840521215265608,
+ "Partial": 0.6633373359310379
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8266775116602355,
+ "Moderate": 0.6653109531944901,
+ "Hard": 0.5367579918507135,
+ "Extreme": 0.41488622286022364
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8218247532961501,
+ "T2. Sequencing & Structure Reconstruction": 0.8170893828393826,
+ "T3. Evidence-Grounded QA": 0.5638888888888887,
+ "T4. Summarization & Synthesis": 0.5505430321723027,
+ "T5. Attribution & Citation Alignment": 0.5766419140172199,
+ "T6. Aggregation & Clustering": 0.5593864809441226,
+ "T7. Consistency & Compliance Checking": 0.44029922393263027,
+ "T8. Structured & Numeric Reasoning": 0.6998456790123458,
+ "T9. Version & Code Diff Analysis": 0.7073888549627423,
+ "T10. Rule Induction & In-Context Learning": 0.613935185185185,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5361111111111111
+ },
+ "average_language_metric": {
+ "Chinese": 0.6388577912805221,
+ "English": 0.5990174404485396
+ },
+ "BoN-1": {
+ "overall_metric": 0.6270481753191374,
+ "token_length": {
+ "8k": 0.7085495640791158,
+ "16k": 0.6852891854029057,
+ "32k": 0.6523714114738722,
+ "64k": 0.6349276261566131,
+ "128k": 0.5990851877476868,
+ "256k": 0.48206607705463184
+ },
+ "contextual_requirement": {
+ "Full": 0.5954038691856903,
+ "Partial": 0.6673227467617081
+ },
+ "difficulty": {
+ "Easy": 0.842688379979359,
+ "Moderate": 0.6856872320332753,
+ "Hard": 0.5371806890772243,
+ "Extreme": 0.4113355332448204
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8085658637642787,
+ "T2. Sequencing & Structure Reconstruction": 0.8118172105672106,
+ "T3. Evidence-Grounded QA": 0.6083333333333333,
+ "T4. Summarization & Synthesis": 0.5488292098363808,
+ "T5. Attribution & Citation Alignment": 0.5873852854613878,
+ "T6. Aggregation & Clustering": 0.5544843141923278,
+ "T7. Consistency & Compliance Checking": 0.4479896544786345,
+ "T8. Structured & Numeric Reasoning": 0.7194444444444444,
+ "T9. Version & Code Diff Analysis": 0.704738113297963,
+ "T10. Rule Induction & In-Context Learning": 0.6438888888888888,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
+ },
+ "language": {
+ "Chinese": 0.6382004484945908,
+ "English": 0.6158959021436853
+ }
+ },
+ "pass@1": 0.39066666666666666,
+ "BoN-2": {
+ "overall_metric": 0.6908377344426165,
+ "token_length": {
+ "8k": 0.7595240418657973,
+ "16k": 0.7570136230136708,
+ "32k": 0.7096605299566262,
+ "64k": 0.7135170329369038,
+ "128k": 0.6539368768633997,
+ "256k": 0.5513743020193068
+ },
+ "contextual_requirement": {
+ "Full": 0.6581544826620869,
+ "Partial": 0.7324346003451124
+ },
+ "difficulty": {
+ "Easy": 0.900422588090864,
+ "Moderate": 0.7645982045756979,
+ "Hard": 0.6173528116491919,
+ "Extreme": 0.46106606935258343
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8656746491642889,
+ "T2. Sequencing & Structure Reconstruction": 0.8850374162874161,
+ "T3. Evidence-Grounded QA": 0.6833333333333333,
+ "T4. Summarization & Synthesis": 0.5661734768963829,
+ "T5. Attribution & Citation Alignment": 0.6630436055268355,
+ "T6. Aggregation & Clustering": 0.6284207499424888,
+ "T7. Consistency & Compliance Checking": 0.520982919531347,
+ "T8. Structured & Numeric Reasoning": 0.7888888888888889,
+ "T9. Version & Code Diff Analysis": 0.7536592506692632,
+ "T10. Rule Induction & In-Context Learning": 0.7194444444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5916666666666667
+ },
+ "language": {
+ "Chinese": 0.6996429460445758,
+ "English": 0.6820325228406605
+ }
+ },
+ "pass@2": 0.4533333333333333,
+ "BoN-3": {
+ "overall_metric": 0.7254771584689355,
+ "token_length": {
+ "8k": 0.7994316297691582,
+ "16k": 0.7937454228456009,
+ "32k": 0.7297465435731235,
+ "64k": 0.7326783552830499,
+ "128k": 0.6951047141031044,
+ "256k": 0.6021562852395752
+ },
+ "contextual_requirement": {
+ "Full": 0.6913631461546793,
+ "Partial": 0.7688949923234434
+ },
+ "difficulty": {
+ "Easy": 0.9337047416686951,
+ "Moderate": 0.8141591847806626,
+ "Hard": 0.6485103013588345,
+ "Extreme": 0.4896785698290395
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8968239339375051,
+ "T2. Sequencing & Structure Reconstruction": 0.8988072575572574,
+ "T3. Evidence-Grounded QA": 0.7583333333333333,
+ "T4. Summarization & Synthesis": 0.5734497230075248,
+ "T5. Attribution & Citation Alignment": 0.6928069332150858,
+ "T6. Aggregation & Clustering": 0.6670419649341216,
+ "T7. Consistency & Compliance Checking": 0.5624760870987693,
+ "T8. Structured & Numeric Reasoning": 0.8055555555555556,
+ "T9. Version & Code Diff Analysis": 0.7817439995394256,
+ "T10. Rule Induction & In-Context Learning": 0.7638888888888888,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.65
+ },
+ "language": {
+ "Chinese": 0.7321739274049238,
+ "English": 0.7187803895329472
+ }
+ },
+ "pass@3": 0.49666666666666665
+}
\ No newline at end of file
diff --git a/results/DeepSeek-R1/thinking_context-120000_bon-3_summary.json b/results/DeepSeek-R1/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..37ba1159b595642c379f48be640b3e6204da9346
--- /dev/null
+++ b/results/DeepSeek-R1/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.6006714049681133,
+ "inference_iteration_1_overall_metric": 0.6007584621917721,
+ "inference_iteration_2_overall_metric": 0.5960043654782469,
+ "inference_iteration_3_overall_metric": 0.6052513872343173,
+ "average_token_length_metric": {
+ "8k": 0.6896237775198697,
+ "16k": 0.66847824761939,
+ "32k": 0.6242811862728697,
+ "64k": 0.5907117819226532,
+ "128k": 0.526720556197483,
+ "256k": 0.5042128802764103
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5734808170096616,
+ "Partial": 0.6352776078243236
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8244195631460464,
+ "Moderate": 0.5882837964508552,
+ "Hard": 0.5338546774181954,
+ "Extreme": 0.4075883160627708
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8460279171139484,
+ "T2. Sequencing & Structure Reconstruction": 0.7927840387644306,
+ "T3. Evidence-Grounded QA": 0.5666666666666665,
+ "T4. Summarization & Synthesis": 0.5315482688091906,
+ "T5. Attribution & Citation Alignment": 0.46763122932017526,
+ "T6. Aggregation & Clustering": 0.5661396588973091,
+ "T7. Consistency & Compliance Checking": 0.4411785360364781,
+ "T8. Structured & Numeric Reasoning": 0.6290123456790124,
+ "T9. Version & Code Diff Analysis": 0.7118775193966861,
+ "T10. Rule Induction & In-Context Learning": 0.6290277777777776,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5083333333333333
+ },
+ "average_language_metric": {
+ "Chinese": 0.5813030699136731,
+ "English": 0.6200397400225522
+ },
+ "BoN-1": {
+ "overall_metric": 0.6007584621917721,
+ "token_length": {
+ "8k": 0.6794004597378533,
+ "16k": 0.6605152745514365,
+ "32k": 0.637696287010787,
+ "64k": 0.6015965809771497,
+ "128k": 0.5259184504039809,
+ "256k": 0.49942372046943184
+ },
+ "contextual_requirement": {
+ "Full": 0.5724096385120696,
+ "Partial": 0.6368387832386698
+ },
+ "difficulty": {
+ "Easy": 0.8239541273708798,
+ "Moderate": 0.5859117167110014,
+ "Hard": 0.541012801830159,
+ "Extreme": 0.40525140462840953
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8346607474979665,
+ "T2. Sequencing & Structure Reconstruction": 0.7960157843246078,
+ "T3. Evidence-Grounded QA": 0.5916666666666667,
+ "T4. Summarization & Synthesis": 0.5314348105743746,
+ "T5. Attribution & Citation Alignment": 0.46439938615714244,
+ "T6. Aggregation & Clustering": 0.5590113115895492,
+ "T7. Consistency & Compliance Checking": 0.4443221207730568,
+ "T8. Structured & Numeric Reasoning": 0.612962962962963,
+ "T9. Version & Code Diff Analysis": 0.7031087891880523,
+ "T10. Rule Induction & In-Context Learning": 0.6470833333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5166666666666667
+ },
+ "language": {
+ "Chinese": 0.5732559357352887,
+ "English": 0.6282609886482589
+ }
+ },
+ "pass@1": 0.3433333333333333,
+ "BoN-2": {
+ "overall_metric": 0.6591761776037405,
+ "token_length": {
+ "8k": 0.7392791599059808,
+ "16k": 0.7298238663653037,
+ "32k": 0.6777532203191601,
+ "64k": 0.6787515982515117,
+ "128k": 0.5821796500623371,
+ "256k": 0.5472695707181553
+ },
+ "contextual_requirement": {
+ "Full": 0.6319274531975012,
+ "Partial": 0.6938563723025926
+ },
+ "difficulty": {
+ "Easy": 0.8783150831551243,
+ "Moderate": 0.678502573566839,
+ "Hard": 0.5963536523109759,
+ "Extreme": 0.4476885160139848
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8861689034562782,
+ "T2. Sequencing & Structure Reconstruction": 0.8374043195366726,
+ "T3. Evidence-Grounded QA": 0.625,
+ "T4. Summarization & Synthesis": 0.5455058096240588,
+ "T5. Attribution & Citation Alignment": 0.5369499475317325,
+ "T6. Aggregation & Clustering": 0.6381104251141014,
+ "T7. Consistency & Compliance Checking": 0.5019132623573087,
+ "T8. Structured & Numeric Reasoning": 0.699537037037037,
+ "T9. Version & Code Diff Analysis": 0.7605821531353517,
+ "T10. Rule Induction & In-Context Learning": 0.7220833333333334,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667
+ },
+ "language": {
+ "Chinese": 0.6390711207410246,
+ "English": 0.6792812344664584
+ }
+ },
+ "pass@2": 0.4093333333333333,
+ "BoN-3": {
+ "overall_metric": 0.6982292810132508,
+ "token_length": {
+ "8k": 0.783866781971667,
+ "16k": 0.7689573555512769,
+ "32k": 0.7150318293064207,
+ "64k": 0.707631405403529,
+ "128k": 0.6291537694328186,
+ "256k": 0.5847345444137952
+ },
+ "contextual_requirement": {
+ "Full": 0.6747138288729008,
+ "Partial": 0.7281580382827888
+ },
+ "difficulty": {
+ "Easy": 0.9164006912153285,
+ "Moderate": 0.7293351521236777,
+ "Hard": 0.6332713918179128,
+ "Extreme": 0.48146703898856563
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9025424539049985,
+ "T2. Sequencing & Structure Reconstruction": 0.8813529526029528,
+ "T3. Evidence-Grounded QA": 0.6666666666666666,
+ "T4. Summarization & Synthesis": 0.5516760401143904,
+ "T5. Attribution & Citation Alignment": 0.5770877507616411,
+ "T6. Aggregation & Clustering": 0.6710035196738627,
+ "T7. Consistency & Compliance Checking": 0.5581333121650413,
+ "T8. Structured & Numeric Reasoning": 0.7560185185185184,
+ "T9. Version & Code Diff Analysis": 0.8001126786344129,
+ "T10. Rule Induction & In-Context Learning": 0.7456944444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.625
+ },
+ "language": {
+ "Chinese": 0.6861725652789407,
+ "English": 0.7102859967475628
+ }
+ },
+ "pass@3": 0.45866666666666667
+}
\ No newline at end of file
diff --git a/results/DeepSeek-V3-0324/nonthinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3-0324/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..332e56ff1213d74b488f7947e01df363fea2b2f3
--- /dev/null
+++ b/results/DeepSeek-V3-0324/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5169762636111047,
+ "inference_iteration_1_overall_metric": 0.5181528065498966,
+ "inference_iteration_2_overall_metric": 0.5148683077997773,
+ "inference_iteration_3_overall_metric": 0.5179076764836414,
+ "average_token_length_metric": {
+ "8k": 0.5649502252093578,
+ "16k": 0.5347800008319371,
+ "32k": 0.5556420045489457,
+ "64k": 0.5214603320658495,
+ "128k": 0.4864319755387441,
+ "256k": 0.4385930434717982
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.477916384525216,
+ "Partial": 0.566688836993147
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6526369307346341,
+ "Moderate": 0.4967551723461267,
+ "Hard": 0.48299903154456436,
+ "Extreme": 0.4039646133594433
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7877679677233363,
+ "T2. Sequencing & Structure Reconstruction": 0.7655772360963239,
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
+ "T4. Summarization & Synthesis": 0.5446412810076181,
+ "T5. Attribution & Citation Alignment": 0.5358172435200781,
+ "T6. Aggregation & Clustering": 0.4889882988114357,
+ "T7. Consistency & Compliance Checking": 0.3557205172395749,
+ "T8. Structured & Numeric Reasoning": 0.24367283950617283,
+ "T9. Version & Code Diff Analysis": 0.6358733797519817,
+ "T10. Rule Induction & In-Context Learning": 0.6043981481481482,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4138888888888889
+ },
+ "average_language_metric": {
+ "Chinese": 0.5177833454262655,
+ "English": 0.5161691817959461
+ },
+ "BoN-1": {
+ "overall_metric": 0.5181528065498966,
+ "token_length": {
+ "8k": 0.5625899513794126,
+ "16k": 0.5301053854931655,
+ "32k": 0.5507157770563107,
+ "64k": 0.5301772699202785,
+ "128k": 0.4898481902149404,
+ "256k": 0.44548026523527584
+ },
+ "contextual_requirement": {
+ "Full": 0.4761374399130544,
+ "Partial": 0.571626909542243
+ },
+ "difficulty": {
+ "Easy": 0.6469710156877221,
+ "Moderate": 0.5054097919629513,
+ "Hard": 0.48366558283696526,
+ "Extreme": 0.4080599930595185
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7857412502114394,
+ "T2. Sequencing & Structure Reconstruction": 0.7680012781036988,
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
+ "T4. Summarization & Synthesis": 0.5417591122015827,
+ "T5. Attribution & Citation Alignment": 0.5399240477985636,
+ "T6. Aggregation & Clustering": 0.4939990937730982,
+ "T7. Consistency & Compliance Checking": 0.3541605841917901,
+ "T8. Structured & Numeric Reasoning": 0.24212962962962964,
+ "T9. Version & Code Diff Analysis": 0.6325782099444311,
+ "T10. Rule Induction & In-Context Learning": 0.6151388888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
+ },
+ "language": {
+ "Chinese": 0.5160793653421631,
+ "English": 0.5202262477576317
+ }
+ },
+ "pass@1": 0.23666666666666666,
+ "BoN-2": {
+ "overall_metric": 0.5442270665213056,
+ "token_length": {
+ "8k": 0.594568981140541,
+ "16k": 0.5576674233487121,
+ "32k": 0.5818313481616337,
+ "64k": 0.5446955915354488,
+ "128k": 0.5245780781334816,
+ "256k": 0.4620209768080174
+ },
+ "contextual_requirement": {
+ "Full": 0.5071422110822568,
+ "Partial": 0.591425973443732
+ },
+ "difficulty": {
+ "Easy": 0.6783617155027842,
+ "Moderate": 0.5372286722751785,
+ "Hard": 0.5038179540985808,
+ "Extreme": 0.4284267679263639
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8015300122926023,
+ "T2. Sequencing & Structure Reconstruction": 0.787322466174887,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5591478963061529,
+ "T5. Attribution & Citation Alignment": 0.5635377708713467,
+ "T6. Aggregation & Clustering": 0.5151496972716835,
+ "T7. Consistency & Compliance Checking": 0.37875998396209754,
+ "T8. Structured & Numeric Reasoning": 0.29953703703703705,
+ "T9. Version & Code Diff Analysis": 0.6509912195762164,
+ "T10. Rule Induction & In-Context Learning": 0.6334722222222222,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.45
+ },
+ "language": {
+ "Chinese": 0.5466945351249929,
+ "English": 0.5417595979176187
+ }
+ },
+ "pass@2": 0.26066666666666666,
+ "BoN-3": {
+ "overall_metric": 0.5555999285609496,
+ "token_length": {
+ "8k": 0.6111433308773455,
+ "16k": 0.5701468581090299,
+ "32k": 0.5948383611006594,
+ "64k": 0.5601715262733609,
+ "128k": 0.5307382241597844,
+ "256k": 0.4665612708455233
+ },
+ "contextual_requirement": {
+ "Full": 0.5192835049000674,
+ "Partial": 0.6018208314020757
+ },
+ "difficulty": {
+ "Easy": 0.6899946609782567,
+ "Moderate": 0.55112703775723,
+ "Hard": 0.5166174713215149,
+ "Extreme": 0.4369188707412474
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8147581778505403,
+ "T2. Sequencing & Structure Reconstruction": 0.8027788153812361,
+ "T3. Evidence-Grounded QA": 0.5916666666666667,
+ "T4. Summarization & Synthesis": 0.5641241115145229,
+ "T5. Attribution & Citation Alignment": 0.5716268051520625,
+ "T6. Aggregation & Clustering": 0.5308177346459271,
+ "T7. Consistency & Compliance Checking": 0.3957991644610091,
+ "T8. Structured & Numeric Reasoning": 0.29953703703703705,
+ "T9. Version & Code Diff Analysis": 0.6606747373420039,
+ "T10. Rule Induction & In-Context Learning": 0.6418055555555555,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4583333333333333
+ },
+ "language": {
+ "Chinese": 0.5602606340750268,
+ "English": 0.550939223046875
+ }
+ },
+ "pass@3": 0.272
+}
\ No newline at end of file
diff --git a/results/DeepSeek-V3-0324/thinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3-0324/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cc7a303d9b6ca379e1b6d87bfbe3e42476de989
--- /dev/null
+++ b/results/DeepSeek-V3-0324/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5670800708470047,
+ "inference_iteration_1_overall_metric": 0.5592880863605422,
+ "inference_iteration_2_overall_metric": 0.5704394040472569,
+ "inference_iteration_3_overall_metric": 0.5715127221332137,
+ "average_token_length_metric": {
+ "8k": 0.6286936796561847,
+ "16k": 0.6309027535519853,
+ "32k": 0.5969011989319307,
+ "64k": 0.5427727165403452,
+ "128k": 0.5275644070173147,
+ "256k": 0.4756456693842662
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5368141482212665,
+ "Partial": 0.605600336007035
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7920271132046242,
+ "Moderate": 0.5713738824882528,
+ "Hard": 0.4620098327210685,
+ "Extreme": 0.3868526000236004
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8062267128065881,
+ "T2. Sequencing & Structure Reconstruction": 0.7434196920363564,
+ "T3. Evidence-Grounded QA": 0.5111111111111112,
+ "T4. Summarization & Synthesis": 0.5146898974811284,
+ "T5. Attribution & Citation Alignment": 0.537979547816877,
+ "T6. Aggregation & Clustering": 0.5248106585111237,
+ "T7. Consistency & Compliance Checking": 0.3681804918962003,
+ "T8. Structured & Numeric Reasoning": 0.5841049382716048,
+ "T9. Version & Code Diff Analysis": 0.6466057172430281,
+ "T10. Rule Induction & In-Context Learning": 0.6183796296296297,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4944444444444444
+ },
+ "average_language_metric": {
+ "Chinese": 0.5527161660546936,
+ "English": 0.5814439756393159
+ },
+ "BoN-1": {
+ "overall_metric": 0.5592880863605422,
+ "token_length": {
+ "8k": 0.6182266355968743,
+ "16k": 0.6224911215102044,
+ "32k": 0.6006190818475612,
+ "64k": 0.5386968289399401,
+ "128k": 0.5299641908835836,
+ "256k": 0.4457306593850912
+ },
+ "contextual_requirement": {
+ "Full": 0.530042096148415,
+ "Partial": 0.5965102557214322
+ },
+ "difficulty": {
+ "Easy": 0.7861925110638937,
+ "Moderate": 0.563744851626512,
+ "Hard": 0.4535437025457247,
+ "Extreme": 0.377252152391456
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7951398867629517,
+ "T2. Sequencing & Structure Reconstruction": 0.751023606023606,
+ "T3. Evidence-Grounded QA": 0.4666666666666667,
+ "T4. Summarization & Synthesis": 0.5136736975702932,
+ "T5. Attribution & Citation Alignment": 0.510566852786565,
+ "T6. Aggregation & Clustering": 0.5116905358775884,
+ "T7. Consistency & Compliance Checking": 0.37914696318940605,
+ "T8. Structured & Numeric Reasoning": 0.5847222222222223,
+ "T9. Version & Code Diff Analysis": 0.6690241210962063,
+ "T10. Rule Induction & In-Context Learning": 0.5966666666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.475
+ },
+ "language": {
+ "Chinese": 0.5492580351840802,
+ "English": 0.569318137537005
+ }
+ },
+ "pass@1": 0.30666666666666664,
+ "BoN-2": {
+ "overall_metric": 0.6262109782348795,
+ "token_length": {
+ "8k": 0.6883768372773764,
+ "16k": 0.7011104482454619,
+ "32k": 0.647383679165818,
+ "64k": 0.6092322863406843,
+ "128k": 0.5965297187489229,
+ "256k": 0.5146328996310173
+ },
+ "contextual_requirement": {
+ "Full": 0.5949295492853987,
+ "Partial": 0.6660237059887659
+ },
+ "difficulty": {
+ "Easy": 0.8509257637159784,
+ "Moderate": 0.66205670168914,
+ "Hard": 0.5144444684176784,
+ "Extreme": 0.42991229790988206
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8583478908910738,
+ "T2. Sequencing & Structure Reconstruction": 0.7814144189144188,
+ "T3. Evidence-Grounded QA": 0.5916666666666667,
+ "T4. Summarization & Synthesis": 0.5277169061289491,
+ "T5. Attribution & Citation Alignment": 0.5960013921722541,
+ "T6. Aggregation & Clustering": 0.5992683212004715,
+ "T7. Consistency & Compliance Checking": 0.4404830107556436,
+ "T8. Structured & Numeric Reasoning": 0.6166666666666667,
+ "T9. Version & Code Diff Analysis": 0.7117518441173546,
+ "T10. Rule Induction & In-Context Learning": 0.7094444444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667
+ },
+ "language": {
+ "Chinese": 0.6172660802474611,
+ "English": 0.6351558762222995
+ }
+ },
+ "pass@2": 0.37133333333333335,
+ "BoN-3": {
+ "overall_metric": 0.656766091226133,
+ "token_length": {
+ "8k": 0.7136143676151803,
+ "16k": 0.723535463035206,
+ "32k": 0.676630607848943,
+ "64k": 0.6371662004749992,
+ "128k": 0.6230203333837121,
+ "256k": 0.5666295749987625
+ },
+ "contextual_requirement": {
+ "Full": 0.6274188340913682,
+ "Partial": 0.6941171457612914
+ },
+ "difficulty": {
+ "Easy": 0.8771215787680936,
+ "Moderate": 0.7073341252772278,
+ "Hard": 0.5445129149531545,
+ "Extreme": 0.4558925937418165
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8681438231282913,
+ "T2. Sequencing & Structure Reconstruction": 0.8017822455322453,
+ "T3. Evidence-Grounded QA": 0.625,
+ "T4. Summarization & Synthesis": 0.5352432221044823,
+ "T5. Attribution & Citation Alignment": 0.6492191718976088,
+ "T6. Aggregation & Clustering": 0.6200836170157672,
+ "T7. Consistency & Compliance Checking": 0.4675370424175185,
+ "T8. Structured & Numeric Reasoning": 0.6592592592592593,
+ "T9. Version & Code Diff Analysis": 0.7376455774030053,
+ "T10. Rule Induction & In-Context Learning": 0.7722222222222221,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
+ },
+ "language": {
+ "Chinese": 0.6466196520294557,
+ "English": 0.6669125304228131
+ }
+ },
+ "pass@3": 0.39866666666666667
+}
\ No newline at end of file
diff --git a/results/DeepSeek-V3.1/nonthinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3.1/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3f059aa81e94cd065a0953bae9f538b4e86e7b8
--- /dev/null
+++ b/results/DeepSeek-V3.1/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.513858634133048,
+ "inference_iteration_1_overall_metric": 0.5123343209652136,
+ "inference_iteration_2_overall_metric": 0.5169477472023125,
+ "inference_iteration_3_overall_metric": 0.5122938342316177,
+ "average_token_length_metric": {
+ "8k": 0.5798800160519532,
+ "16k": 0.557162234839459,
+ "32k": 0.5231647768475723,
+ "64k": 0.5020895430155518,
+ "128k": 0.47482295470763564,
+ "256k": 0.44603227933611866
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.4799762392602454,
+ "Partial": 0.556981682152979
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6361477184952151,
+ "Moderate": 0.4879546435756716,
+ "Hard": 0.4929175841249406,
+ "Extreme": 0.4106651751804595
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7899439606505894,
+ "T2. Sequencing & Structure Reconstruction": 0.7633837790575241,
+ "T3. Evidence-Grounded QA": 0.5472222222222223,
+ "T4. Summarization & Synthesis": 0.5504457320532966,
+ "T5. Attribution & Citation Alignment": 0.5405950417654154,
+ "T6. Aggregation & Clustering": 0.4826886948879801,
+ "T7. Consistency & Compliance Checking": 0.3782532668616311,
+ "T8. Structured & Numeric Reasoning": 0.20864197530864195,
+ "T9. Version & Code Diff Analysis": 0.6426366556970474,
+ "T10. Rule Induction & In-Context Learning": 0.5235185185185185,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4611111111111112
+ },
+ "average_language_metric": {
+ "Chinese": 0.5242036714829956,
+ "English": 0.5035135967831006
+ },
+ "BoN-1": {
+ "overall_metric": 0.5123343209652136,
+ "token_length": {
+ "8k": 0.5775546994411811,
+ "16k": 0.5639037168302903,
+ "32k": 0.5253784942631851,
+ "64k": 0.4921258363031359,
+ "128k": 0.46961700213634144,
+ "256k": 0.4454261768171497
+ },
+ "contextual_requirement": {
+ "Full": 0.4805828046453364,
+ "Partial": 0.5527453417359676
+ },
+ "difficulty": {
+ "Easy": 0.6289501731523275,
+ "Moderate": 0.48863452317355005,
+ "Hard": 0.49859679584956984,
+ "Extreme": 0.4091764699789028
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7832093443146274,
+ "T2. Sequencing & Structure Reconstruction": 0.758390884975293,
+ "T3. Evidence-Grounded QA": 0.525,
+ "T4. Summarization & Synthesis": 0.5526444826651147,
+ "T5. Attribution & Citation Alignment": 0.550721576382896,
+ "T6. Aggregation & Clustering": 0.49314760273206115,
+ "T7. Consistency & Compliance Checking": 0.36201890631349554,
+ "T8. Structured & Numeric Reasoning": 0.19351851851851853,
+ "T9. Version & Code Diff Analysis": 0.6396574046033499,
+ "T10. Rule Induction & In-Context Learning": 0.5548611111111109,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667
+ },
+ "language": {
+ "Chinese": 0.5269577477981102,
+ "English": 0.49771089413231795
+ }
+ },
+ "pass@1": 0.24,
+ "BoN-2": {
+ "overall_metric": 0.5575639036526701,
+ "token_length": {
+ "8k": 0.6295556911757817,
+ "16k": 0.600201088330424,
+ "32k": 0.568079050385207,
+ "64k": 0.540874671214598,
+ "128k": 0.5142278846286528,
+ "256k": 0.4924450361813656
+ },
+ "contextual_requirement": {
+ "Full": 0.5270669681600962,
+ "Partial": 0.596378185188677
+ },
+ "difficulty": {
+ "Easy": 0.6861712816104073,
+ "Moderate": 0.5385181729697447,
+ "Hard": 0.5320539370618487,
+ "Extreme": 0.4459453589628649
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7998040423032986,
+ "T2. Sequencing & Structure Reconstruction": 0.7974421612945821,
+ "T3. Evidence-Grounded QA": 0.6083333333333333,
+ "T4. Summarization & Synthesis": 0.5644516725623473,
+ "T5. Attribution & Citation Alignment": 0.603125221836478,
+ "T6. Aggregation & Clustering": 0.5375390883957035,
+ "T7. Consistency & Compliance Checking": 0.42616915873421196,
+ "T8. Structured & Numeric Reasoning": 0.2569444444444444,
+ "T9. Version & Code Diff Analysis": 0.6918022158557025,
+ "T10. Rule Induction & In-Context Learning": 0.5736111111111112,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
+ },
+ "language": {
+ "Chinese": 0.5701747476373757,
+ "English": 0.5449530596679679
+ }
+ },
+ "pass@2": 0.278,
+ "BoN-3": {
+ "overall_metric": 0.5783189478755758,
+ "token_length": {
+ "8k": 0.6444603085006093,
+ "16k": 0.6168368410260194,
+ "32k": 0.591356956659339,
+ "64k": 0.5741881409846333,
+ "128k": 0.5329008044819139,
+ "256k": 0.5101706356009489
+ },
+ "contextual_requirement": {
+ "Full": 0.5453502508711524,
+ "Partial": 0.6202791076993917
+ },
+ "difficulty": {
+ "Easy": 0.700691507162887,
+ "Moderate": 0.5586892040625308,
+ "Hard": 0.5577221017016739,
+ "Extreme": 0.47068692726136246
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8323002597882068,
+ "T2. Sequencing & Structure Reconstruction": 0.8148032724056932,
+ "T3. Evidence-Grounded QA": 0.625,
+ "T4. Summarization & Synthesis": 0.5726119582175254,
+ "T5. Attribution & Citation Alignment": 0.625566004790705,
+ "T6. Aggregation & Clustering": 0.5576354028999125,
+ "T7. Consistency & Compliance Checking": 0.4599095603777464,
+ "T8. Structured & Numeric Reasoning": 0.2736111111111111,
+ "T9. Version & Code Diff Analysis": 0.7066934638816516,
+ "T10. Rule Induction & In-Context Learning": 0.5819444444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
+ },
+ "language": {
+ "Chinese": 0.5882921354915299,
+ "English": 0.5683457602596252
+ }
+ },
+ "pass@3": 0.3
+}
\ No newline at end of file
diff --git a/results/DeepSeek-V3.1/thinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3.1/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef74a0fdd4a8c72e383463d2456585b9f2f92385
--- /dev/null
+++ b/results/DeepSeek-V3.1/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 8,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.6621817899708398,
+ "inference_iteration_1_overall_metric": 0.6612230154154042,
+ "inference_iteration_2_overall_metric": 0.6610111426397741,
+ "inference_iteration_3_overall_metric": 0.6643112118573413,
+ "average_token_length_metric": {
+ "8k": 0.7494820895775017,
+ "16k": 0.7158886748078707,
+ "32k": 0.668616684861116,
+ "64k": 0.7028333128738413,
+ "128k": 0.6150251691532579,
+ "256k": 0.5212448085514543
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.6306995662884327,
+ "Partial": 0.7022500746575411
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8572162949660228,
+ "Moderate": 0.7353266184513482,
+ "Hard": 0.622190936275892,
+ "Extreme": 0.4267542215146938
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8547667520039813,
+ "T2. Sequencing & Structure Reconstruction": 0.8502990373823708,
+ "T3. Evidence-Grounded QA": 0.5944444444444446,
+ "T4. Summarization & Synthesis": 0.5592502973941748,
+ "T5. Attribution & Citation Alignment": 0.664951753589773,
+ "T6. Aggregation & Clustering": 0.6143401320362227,
+ "T7. Consistency & Compliance Checking": 0.5020434872004602,
+ "T8. Structured & Numeric Reasoning": 0.7200617283950619,
+ "T9. Version & Code Diff Analysis": 0.7327346609657337,
+ "T10. Rule Induction & In-Context Learning": 0.6883796296296296,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5777777777777778
+ },
+ "average_language_metric": {
+ "Chinese": 0.6626168849921547,
+ "English": 0.6617466949495263
+ },
+ "BoN-1": {
+ "overall_metric": 0.6612230154154042,
+ "token_length": {
+ "8k": 0.7464911564030304,
+ "16k": 0.7250299866543051,
+ "32k": 0.658322634935698,
+ "64k": 0.7169507057254954,
+ "128k": 0.6020278750216188,
+ "256k": 0.5185157337522811
+ },
+ "contextual_requirement": {
+ "Full": 0.6306907002796491,
+ "Partial": 0.7000823255881854
+ },
+ "difficulty": {
+ "Easy": 0.8474131666511834,
+ "Moderate": 0.7235563816620875,
+ "Hard": 0.6392066996365914,
+ "Extreme": 0.4307791961407244
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.858023793743841,
+ "T2. Sequencing & Structure Reconstruction": 0.8565782365782365,
+ "T3. Evidence-Grounded QA": 0.6083333333333333,
+ "T4. Summarization & Synthesis": 0.5579680390729822,
+ "T5. Attribution & Citation Alignment": 0.6494356501600668,
+ "T6. Aggregation & Clustering": 0.6299150042042198,
+ "T7. Consistency & Compliance Checking": 0.5158224119304962,
+ "T8. Structured & Numeric Reasoning": 0.6930555555555555,
+ "T9. Version & Code Diff Analysis": 0.7300925156020261,
+ "T10. Rule Induction & In-Context Learning": 0.6966666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.55
+ },
+ "language": {
+ "Chinese": 0.6664246885361451,
+ "English": 0.6560213422946652
+ }
+ },
+ "pass@1": 0.44533333333333336,
+ "BoN-2": {
+ "overall_metric": 0.7247597749822192,
+ "token_length": {
+ "8k": 0.7863113902541472,
+ "16k": 0.7674558325218811,
+ "32k": 0.7281178415241295,
+ "64k": 0.7821445768686631,
+ "128k": 0.6970380441541918,
+ "256k": 0.5874909645703058
+ },
+ "contextual_requirement": {
+ "Full": 0.6960065402743163,
+ "Partial": 0.7613548009740968
+ },
+ "difficulty": {
+ "Easy": 0.9116793007094617,
+ "Moderate": 0.8131496116751571,
+ "Hard": 0.7011503473822069,
+ "Extreme": 0.47744898037225214
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8830939332648011,
+ "T2. Sequencing & Structure Reconstruction": 0.8806919931919931,
+ "T3. Evidence-Grounded QA": 0.6916666666666667,
+ "T4. Summarization & Synthesis": 0.5727671170579177,
+ "T5. Attribution & Citation Alignment": 0.7446276254629812,
+ "T6. Aggregation & Clustering": 0.683646617964251,
+ "T7. Consistency & Compliance Checking": 0.5837976903893948,
+ "T8. Structured & Numeric Reasoning": 0.7902777777777777,
+ "T9. Version & Code Diff Analysis": 0.7892333891029187,
+ "T10. Rule Induction & In-Context Learning": 0.7691666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6416666666666667
+ },
+ "language": {
+ "Chinese": 0.7206824886096936,
+ "English": 0.7288370613547459
+ }
+ },
+ "pass@2": 0.5166666666666667,
+ "BoN-3": {
+ "overall_metric": 0.7535692341250043,
+ "token_length": {
+ "8k": 0.8201462848118084,
+ "16k": 0.7948417017172265,
+ "32k": 0.7537778254297324,
+ "64k": 0.8032112869269382,
+ "128k": 0.7298931033645517,
+ "256k": 0.6195452024997666
+ },
+ "contextual_requirement": {
+ "Full": 0.726098496655499,
+ "Partial": 0.7885319909043751
+ },
+ "difficulty": {
+ "Easy": 0.9344410438349966,
+ "Moderate": 0.844684360754373,
+ "Hard": 0.7404763342771397,
+ "Extreme": 0.5041859708975693
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9022474827372764,
+ "T2. Sequencing & Structure Reconstruction": 0.8967201779701777,
+ "T3. Evidence-Grounded QA": 0.7333333333333333,
+ "T4. Summarization & Synthesis": 0.5812947594742044,
+ "T5. Attribution & Citation Alignment": 0.7893345155800883,
+ "T6. Aggregation & Clustering": 0.7043090297107774,
+ "T7. Consistency & Compliance Checking": 0.6227060379432763,
+ "T8. Structured & Numeric Reasoning": 0.8129629629629629,
+ "T9. Version & Code Diff Analysis": 0.8129681115419459,
+ "T10. Rule Induction & In-Context Learning": 0.79375,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7
+ },
+ "language": {
+ "Chinese": 0.7447219230644647,
+ "English": 0.7624165451855438
+ }
+ },
+ "pass@3": 0.552
+}
\ No newline at end of file
diff --git a/results/DeepSeek-V3.2/nonthinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3.2/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..d2fb18fa953399c601262552941ba97a0d56aa03
--- /dev/null
+++ b/results/DeepSeek-V3.2/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5167049903246114,
+ "inference_iteration_1_overall_metric": 0.5175993160365915,
+ "inference_iteration_2_overall_metric": 0.5135807596157895,
+ "inference_iteration_3_overall_metric": 0.5189348953214519,
+ "average_token_length_metric": {
+ "8k": 0.5691616296699119,
+ "16k": 0.5676134549372556,
+ "32k": 0.5289760437098003,
+ "64k": 0.5015696259811485,
+ "128k": 0.4857001095282947,
+ "256k": 0.44720907812125815
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.48158748704864085,
+ "Partial": 0.5613999944940294
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6212240545937193,
+ "Moderate": 0.5136346834318135,
+ "Hard": 0.5163049021668649,
+ "Extreme": 0.4044885248516518
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7713866456415331,
+ "T2. Sequencing & Structure Reconstruction": 0.762687420604087,
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
+ "T4. Summarization & Synthesis": 0.5515992544844097,
+ "T5. Attribution & Citation Alignment": 0.5944535310423185,
+ "T6. Aggregation & Clustering": 0.4789878465188747,
+ "T7. Consistency & Compliance Checking": 0.39465891182344254,
+ "T8. Structured & Numeric Reasoning": 0.2149691358024691,
+ "T9. Version & Code Diff Analysis": 0.6451135379199672,
+ "T10. Rule Induction & In-Context Learning": 0.49787037037037035,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.46944444444444444
+ },
+ "average_language_metric": {
+ "Chinese": 0.5273254322066815,
+ "English": 0.5060845484425422
+ },
+ "BoN-1": {
+ "overall_metric": 0.5175993160365915,
+ "token_length": {
+ "8k": 0.5773249164903291,
+ "16k": 0.5604627326667504,
+ "32k": 0.5374659261246943,
+ "64k": 0.5074991434398594,
+ "128k": 0.4702901190904467,
+ "256k": 0.45255305840747634
+ },
+ "contextual_requirement": {
+ "Full": 0.4801143243240927,
+ "Partial": 0.5653074873070485
+ },
+ "difficulty": {
+ "Easy": 0.6321632804118208,
+ "Moderate": 0.5036215268809261,
+ "Hard": 0.5164886663122836,
+ "Extreme": 0.40201006150807705
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7671950408606149,
+ "T2. Sequencing & Structure Reconstruction": 0.7640499777999779,
+ "T3. Evidence-Grounded QA": 0.5833333333333334,
+ "T4. Summarization & Synthesis": 0.5488326711816013,
+ "T5. Attribution & Citation Alignment": 0.58422686569879,
+ "T6. Aggregation & Clustering": 0.47446486157270457,
+ "T7. Consistency & Compliance Checking": 0.3846286380881271,
+ "T8. Structured & Numeric Reasoning": 0.2226851851851852,
+ "T9. Version & Code Diff Analysis": 0.6587133120918426,
+ "T10. Rule Induction & In-Context Learning": 0.5076388888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.43333333333333335
+ },
+ "language": {
+ "Chinese": 0.5202198683535451,
+ "English": 0.5149787637196416
+ }
+ },
+ "pass@1": 0.24533333333333332,
+ "BoN-2": {
+ "overall_metric": 0.5858993921620037,
+ "token_length": {
+ "8k": 0.6332517244583726,
+ "16k": 0.6310887297252004,
+ "32k": 0.6114096243459353,
+ "64k": 0.5723573466459502,
+ "128k": 0.5523069746695444,
+ "256k": 0.5149819531270248
+ },
+ "contextual_requirement": {
+ "Full": 0.5499254609160144,
+ "Partial": 0.6316843955659928
+ },
+ "difficulty": {
+ "Easy": 0.6997370485006186,
+ "Moderate": 0.578931807315116,
+ "Hard": 0.59328992133015,
+ "Extreme": 0.46091761656187924
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8076556299673503,
+ "T2. Sequencing & Structure Reconstruction": 0.8115853128353127,
+ "T3. Evidence-Grounded QA": 0.65,
+ "T4. Summarization & Synthesis": 0.5673030178914519,
+ "T5. Attribution & Citation Alignment": 0.6729248677257385,
+ "T6. Aggregation & Clustering": 0.5463889541830715,
+ "T7. Consistency & Compliance Checking": 0.4736347042901523,
+ "T8. Structured & Numeric Reasoning": 0.2773148148148148,
+ "T9. Version & Code Diff Analysis": 0.7103491970064771,
+ "T10. Rule Induction & In-Context Learning": 0.5745833333333332,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5833333333333334
+ },
+ "language": {
+ "Chinese": 0.5980713707246905,
+ "English": 0.5737274135993191
+ }
+ },
+ "pass@2": 0.312,
+ "BoN-3": {
+ "overall_metric": 0.6250975643039252,
+ "token_length": {
+ "8k": 0.6624896013016575,
+ "16k": 0.667371637798531,
+ "32k": 0.6458484887458542,
+ "64k": 0.619893995338142,
+ "128k": 0.5995137452171235,
+ "256k": 0.5554679174222451
+ },
+ "contextual_requirement": {
+ "Full": 0.5850937206249248,
+ "Partial": 0.6760115471681097
+ },
+ "difficulty": {
+ "Easy": 0.7532527253719965,
+ "Moderate": 0.6215376875723243,
+ "Hard": 0.6270141217985696,
+ "Extreme": 0.48578877254181324
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8312391426962816,
+ "T2. Sequencing & Structure Reconstruction": 0.8401638639138637,
+ "T3. Evidence-Grounded QA": 0.725,
+ "T4. Summarization & Synthesis": 0.5737811360033489,
+ "T5. Attribution & Citation Alignment": 0.7032234702446885,
+ "T6. Aggregation & Clustering": 0.5739851542792719,
+ "T7. Consistency & Compliance Checking": 0.5085181572307016,
+ "T8. Structured & Numeric Reasoning": 0.33425925925925926,
+ "T9. Version & Code Diff Analysis": 0.7396125292314827,
+ "T10. Rule Induction & In-Context Learning": 0.6588888888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6166666666666667
+ },
+ "language": {
+ "Chinese": 0.6402707582944462,
+ "English": 0.6099243703134061
+ }
+ },
+ "pass@3": 0.3526666666666667
+}
\ No newline at end of file
diff --git a/results/DeepSeek-V3.2/thinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3.2/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..67a1cc00e9cb43fb03fcc17e28986ce26181ea7a
--- /dev/null
+++ b/results/DeepSeek-V3.2/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.6782077426413915,
+ "inference_iteration_1_overall_metric": 0.671629754030229,
+ "inference_iteration_2_overall_metric": 0.6777556491690084,
+ "inference_iteration_3_overall_metric": 0.6852378247249357,
+ "average_token_length_metric": {
+ "8k": 0.755369154280727,
+ "16k": 0.7449467265987637,
+ "32k": 0.6953336880653428,
+ "64k": 0.6946800210314833,
+ "128k": 0.6477035080898761,
+ "256k": 0.5312133577821595
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.6459619783148297,
+ "Partial": 0.7192478063297452
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8502179380964533,
+ "Moderate": 0.7507860067400632,
+ "Hard": 0.6772551692268365,
+ "Extreme": 0.4427333362390087
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8628300416379104,
+ "T2. Sequencing & Structure Reconstruction": 0.8633894500561163,
+ "T3. Evidence-Grounded QA": 0.6277777777777778,
+ "T4. Summarization & Synthesis": 0.5645627813985595,
+ "T5. Attribution & Citation Alignment": 0.7367830500533472,
+ "T6. Aggregation & Clustering": 0.6168551563610202,
+ "T7. Consistency & Compliance Checking": 0.5431477714039084,
+ "T8. Structured & Numeric Reasoning": 0.6640432098765434,
+ "T9. Version & Code Diff Analysis": 0.7821104015574073,
+ "T10. Rule Induction & In-Context Learning": 0.6818518518518517,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.622222222222222
+ },
+ "average_language_metric": {
+ "Chinese": 0.6775197474946019,
+ "English": 0.6788957377881832
+ },
+ "BoN-1": {
+ "overall_metric": 0.671629754030229,
+ "token_length": {
+ "8k": 0.7397920433374541,
+ "16k": 0.7269924173975423,
+ "32k": 0.7007145536231846,
+ "64k": 0.6696695962094932,
+ "128k": 0.655131428243527,
+ "256k": 0.5374784853701818
+ },
+ "contextual_requirement": {
+ "Full": 0.6384885103680042,
+ "Partial": 0.7138095186912471
+ },
+ "difficulty": {
+ "Easy": 0.8524722619644284,
+ "Moderate": 0.7391126766592921,
+ "Hard": 0.6576844523580323,
+ "Extreme": 0.4383605238465565
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8608914983834914,
+ "T2. Sequencing & Structure Reconstruction": 0.8649913049913043,
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
+ "T4. Summarization & Synthesis": 0.5629979913624931,
+ "T5. Attribution & Citation Alignment": 0.727192234350903,
+ "T6. Aggregation & Clustering": 0.6142105299342737,
+ "T7. Consistency & Compliance Checking": 0.5448247044942024,
+ "T8. Structured & Numeric Reasoning": 0.6222222222222223,
+ "T9. Version & Code Diff Analysis": 0.7868571557580843,
+ "T10. Rule Induction & In-Context Learning": 0.7038888888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
+ },
+ "language": {
+ "Chinese": 0.6695591460858414,
+ "English": 0.6737003619746216
+ }
+ },
+ "pass@1": 0.442,
+ "BoN-2": {
+ "overall_metric": 0.739062545668539,
+ "token_length": {
+ "8k": 0.7985986496818439,
+ "16k": 0.8081758798304621,
+ "32k": 0.7769565912085228,
+ "64k": 0.7312553059908137,
+ "128k": 0.7215196949254835,
+ "256k": 0.5978691523741125
+ },
+ "contextual_requirement": {
+ "Full": 0.7083303283149,
+ "Partial": 0.7781762768459
+ },
+ "difficulty": {
+ "Easy": 0.9085052195225997,
+ "Moderate": 0.8289156864861236,
+ "Hard": 0.7590532764337315,
+ "Extreme": 0.4812983463842697
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8977317620746387,
+ "T2. Sequencing & Structure Reconstruction": 0.9110066322566314,
+ "T3. Evidence-Grounded QA": 0.725,
+ "T4. Summarization & Synthesis": 0.5777725397440469,
+ "T5. Attribution & Citation Alignment": 0.807338444836897,
+ "T6. Aggregation & Clustering": 0.6747522573464864,
+ "T7. Consistency & Compliance Checking": 0.6096717826867489,
+ "T8. Structured & Numeric Reasoning": 0.7398148148148148,
+ "T9. Version & Code Diff Analysis": 0.8172408263391231,
+ "T10. Rule Induction & In-Context Learning": 0.7575,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334
+ },
+ "language": {
+ "Chinese": 0.7379779669884229,
+ "English": 0.7401471243486567
+ }
+ },
+ "pass@2": 0.5286666666666666,
+ "BoN-3": {
+ "overall_metric": 0.769484517884144,
+ "token_length": {
+ "8k": 0.8183989834849181,
+ "16k": 0.8287125912826261,
+ "32k": 0.8000120047613876,
+ "64k": 0.7929440904260506,
+ "128k": 0.761968193898885,
+ "256k": 0.6148712434509996
+ },
+ "contextual_requirement": {
+ "Full": 0.7396412839189731,
+ "Partial": 0.8074668156579995
+ },
+ "difficulty": {
+ "Easy": 0.9315740410553816,
+ "Moderate": 0.8685767211690967,
+ "Hard": 0.8013938214601453,
+ "Extreme": 0.5058786414037999
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.904701887970005,
+ "T2. Sequencing & Structure Reconstruction": 0.9204745254745246,
+ "T3. Evidence-Grounded QA": 0.7583333333333333,
+ "T4. Summarization & Synthesis": 0.5854934073644178,
+ "T5. Attribution & Citation Alignment": 0.8343370156947629,
+ "T6. Aggregation & Clustering": 0.6936792642304824,
+ "T7. Consistency & Compliance Checking": 0.6489698568119598,
+ "T8. Structured & Numeric Reasoning": 0.789814814814815,
+ "T9. Version & Code Diff Analysis": 0.8323537332622081,
+ "T10. Rule Induction & In-Context Learning": 0.8091666666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.775
+ },
+ "language": {
+ "Chinese": 0.76992326145198,
+ "English": 0.7690457743163093
+ }
+ },
+ "pass@3": 0.572
+}
\ No newline at end of file
diff --git a/results/GLM-4.5/nonthinking_context-120000_bon-3_summary.json b/results/GLM-4.5/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a211dbb51908ddda1f1dde58e649fe3509fdfa0
--- /dev/null
+++ b/results/GLM-4.5/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.43035083788419254,
+ "inference_iteration_1_overall_metric": 0.4323298899239496,
+ "inference_iteration_2_overall_metric": 0.42711968234411496,
+ "inference_iteration_3_overall_metric": 0.43160294138451205,
+ "average_token_length_metric": {
+ "8k": 0.5314683958937002,
+ "16k": 0.49409349830535854,
+ "32k": 0.5016963643416883,
+ "64k": 0.41773091498134723,
+ "128k": 0.3415048455402667,
+ "256k": 0.2956110082427921
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.40615033864921796,
+ "Partial": 0.46115147327415856
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5567819400237102,
+ "Moderate": 0.36920960557839383,
+ "Hard": 0.4020802519560651,
+ "Extreme": 0.3505786202440932
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.698931526590424,
+ "T2. Sequencing & Structure Reconstruction": 0.6796812769380679,
+ "T3. Evidence-Grounded QA": 0.4333333333333333,
+ "T4. Summarization & Synthesis": 0.5382688702946171,
+ "T5. Attribution & Citation Alignment": 0.46682998936207576,
+ "T6. Aggregation & Clustering": 0.3861931118799582,
+ "T7. Consistency & Compliance Checking": 0.2699805385521367,
+ "T8. Structured & Numeric Reasoning": 0.19089506172839513,
+ "T9. Version & Code Diff Analysis": 0.5555800013857407,
+ "T10. Rule Induction & In-Context Learning": 0.4250462962962961,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3111111111111111
+ },
+ "average_language_metric": {
+ "Chinese": 0.4302365847397278,
+ "English": 0.4304650910286559
+ },
+ "BoN-1": {
+ "overall_metric": 0.4323298899239496,
+ "token_length": {
+ "8k": 0.5493363038285968,
+ "16k": 0.5003136640414932,
+ "32k": 0.48767860893851595,
+ "64k": 0.427526424572054,
+ "128k": 0.3355553710370271,
+ "256k": 0.29356896712600833
+ },
+ "contextual_requirement": {
+ "Full": 0.4069098702566493,
+ "Partial": 0.4646826422277853
+ },
+ "difficulty": {
+ "Easy": 0.549387464064302,
+ "Moderate": 0.38289221280334496,
+ "Hard": 0.4057019361680662,
+ "Extreme": 0.35405992762316457
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6989719229147292,
+ "T2. Sequencing & Structure Reconstruction": 0.6906849631849629,
+ "T3. Evidence-Grounded QA": 0.4583333333333333,
+ "T4. Summarization & Synthesis": 0.5384461382258066,
+ "T5. Attribution & Citation Alignment": 0.4818936352277487,
+ "T6. Aggregation & Clustering": 0.37379481108384327,
+ "T7. Consistency & Compliance Checking": 0.26307329406050667,
+ "T8. Structured & Numeric Reasoning": 0.19120370370370374,
+ "T9. Version & Code Diff Analysis": 0.5459081401129251,
+ "T10. Rule Induction & In-Context Learning": 0.43111111111111106,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
+ },
+ "language": {
+ "Chinese": 0.42606506993377413,
+ "English": 0.4385947099141243
+ }
+ },
+ "pass@1": 0.188,
+ "BoN-2": {
+ "overall_metric": 0.5016598810015709,
+ "token_length": {
+ "8k": 0.6177853878601806,
+ "16k": 0.5606584472775635,
+ "32k": 0.5705417377303723,
+ "64k": 0.49224051142500497,
+ "128k": 0.4148610132238459,
+ "256k": 0.3538721884924622
+ },
+ "contextual_requirement": {
+ "Full": 0.4781661168763403,
+ "Partial": 0.531561035342776
+ },
+ "difficulty": {
+ "Easy": 0.6469329658307705,
+ "Moderate": 0.4476620656378584,
+ "Hard": 0.47548119381384835,
+ "Extreme": 0.39518120452359784
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7612773821912874,
+ "T2. Sequencing & Structure Reconstruction": 0.7455342342842339,
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
+ "T4. Summarization & Synthesis": 0.5515008189706921,
+ "T5. Attribution & Citation Alignment": 0.5415377453921782,
+ "T6. Aggregation & Clustering": 0.46179351546743685,
+ "T7. Consistency & Compliance Checking": 0.33666684704302524,
+ "T8. Structured & Numeric Reasoning": 0.24537037037037035,
+ "T9. Version & Code Diff Analysis": 0.6269577879155583,
+ "T10. Rule Induction & In-Context Learning": 0.5281944444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
+ },
+ "language": {
+ "Chinese": 0.48816204809318214,
+ "English": 0.5151577139099619
+ }
+ },
+ "pass@2": 0.23933333333333334,
+ "BoN-3": {
+ "overall_metric": 0.5384969000760513,
+ "token_length": {
+ "8k": 0.6356799930117969,
+ "16k": 0.5929083561253189,
+ "32k": 0.6076554475255213,
+ "64k": 0.5450784408185039,
+ "128k": 0.4563507575799897,
+ "256k": 0.3933084053951815
+ },
+ "contextual_requirement": {
+ "Full": 0.5134959721986003,
+ "Partial": 0.5703162628291728
+ },
+ "difficulty": {
+ "Easy": 0.6991002895184023,
+ "Moderate": 0.4873593462709992,
+ "Hard": 0.49915491813734847,
+ "Extreme": 0.4219917912549902
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7936494394174134,
+ "T2. Sequencing & Structure Reconstruction": 0.7682061919561918,
+ "T3. Evidence-Grounded QA": 0.6,
+ "T4. Summarization & Synthesis": 0.5604787610213925,
+ "T5. Attribution & Citation Alignment": 0.5757231079007503,
+ "T6. Aggregation & Clustering": 0.4903242104980166,
+ "T7. Consistency & Compliance Checking": 0.3811432372555578,
+ "T8. Structured & Numeric Reasoning": 0.274537037037037,
+ "T9. Version & Code Diff Analysis": 0.6567859123578721,
+ "T10. Rule Induction & In-Context Learning": 0.5740277777777778,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
+ },
+ "language": {
+ "Chinese": 0.5274718845754229,
+ "English": 0.5495219155766814
+ }
+ },
+ "pass@3": 0.27466666666666667
+}
\ No newline at end of file
diff --git a/results/GLM-4.5/thinking_context-120000_bon-3_summary.json b/results/GLM-4.5/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b3d4b1d4a70ce982dac62d983a27d967fb394cf
--- /dev/null
+++ b/results/GLM-4.5/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 2,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5547937875815533,
+ "inference_iteration_1_overall_metric": 0.5516398292433491,
+ "inference_iteration_2_overall_metric": 0.5535867950098665,
+ "inference_iteration_3_overall_metric": 0.559154738491441,
+ "average_token_length_metric": {
+ "8k": 0.6972820277601768,
+ "16k": 0.6560112539595868,
+ "32k": 0.6029656036576351,
+ "64k": 0.5486294944947675,
+ "128k": 0.4403706772307823,
+ "256k": 0.38350366838636923
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5245120995377551,
+ "Partial": 0.593334117819114
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7655351117837857,
+ "Moderate": 0.5513197070461822,
+ "Hard": 0.473832729215578,
+ "Extreme": 0.37939478048385467
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.81334590378291,
+ "T2. Sequencing & Structure Reconstruction": 0.7505160653892534,
+ "T3. Evidence-Grounded QA": 0.5027777777777779,
+ "T4. Summarization & Synthesis": 0.5448942860165678,
+ "T5. Attribution & Citation Alignment": 0.5301683776855378,
+ "T6. Aggregation & Clustering": 0.5045905245734419,
+ "T7. Consistency & Compliance Checking": 0.3519808997100079,
+ "T8. Structured & Numeric Reasoning": 0.6371913580246912,
+ "T9. Version & Code Diff Analysis": 0.6439646495440424,
+ "T10. Rule Induction & In-Context Learning": 0.5808333333333334,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3277777777777777
+ },
+ "average_language_metric": {
+ "Chinese": 0.5739046099726642,
+ "English": 0.5356829651904418
+ },
+ "BoN-1": {
+ "overall_metric": 0.5516398292433491,
+ "token_length": {
+ "8k": 0.7055429088605052,
+ "16k": 0.6578481199386829,
+ "32k": 0.5956492591185892,
+ "64k": 0.5401522196952867,
+ "128k": 0.42395107911700697,
+ "256k": 0.386695388730029
+ },
+ "contextual_requirement": {
+ "Full": 0.5177580971479406,
+ "Partial": 0.5947620337284167
+ },
+ "difficulty": {
+ "Easy": 0.752722499925911,
+ "Moderate": 0.5341124686360985,
+ "Hard": 0.4868060322041808,
+ "Extreme": 0.3854592094497618
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.802009002313031,
+ "T2. Sequencing & Structure Reconstruction": 0.7465347152847152,
+ "T3. Evidence-Grounded QA": 0.5,
+ "T4. Summarization & Synthesis": 0.545377281117479,
+ "T5. Attribution & Citation Alignment": 0.5499549412575729,
+ "T6. Aggregation & Clustering": 0.5019983393746833,
+ "T7. Consistency & Compliance Checking": 0.36021434419736664,
+ "T8. Structured & Numeric Reasoning": 0.6138888888888889,
+ "T9. Version & Code Diff Analysis": 0.6552473446554453,
+ "T10. Rule Induction & In-Context Learning": 0.5405555555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
+ },
+ "language": {
+ "Chinese": 0.5760533640670983,
+ "English": 0.5272262944196019
+ }
+ },
+ "pass@1": 0.304,
+ "BoN-2": {
+ "overall_metric": 0.634985055308529,
+ "token_length": {
+ "8k": 0.7829957270581746,
+ "16k": 0.720868655519701,
+ "32k": 0.6955532292535292,
+ "64k": 0.6191323026708053,
+ "128k": 0.5371685233314611,
+ "256k": 0.45419189401750604
+ },
+ "contextual_requirement": {
+ "Full": 0.6004623267171497,
+ "Partial": 0.67892307351574
+ },
+ "difficulty": {
+ "Easy": 0.8494656930118193,
+ "Moderate": 0.6521983830806934,
+ "Hard": 0.561519594238419,
+ "Extreme": 0.43697868974062276
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8580534190067889,
+ "T2. Sequencing & Structure Reconstruction": 0.8009605209605205,
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
+ "T4. Summarization & Synthesis": 0.5623513196545108,
+ "T5. Attribution & Citation Alignment": 0.6128841050494817,
+ "T6. Aggregation & Clustering": 0.5858431630098295,
+ "T7. Consistency & Compliance Checking": 0.42985731617797807,
+ "T8. Structured & Numeric Reasoning": 0.7472222222222222,
+ "T9. Version & Code Diff Analysis": 0.7035408856813825,
+ "T10. Rule Induction & In-Context Learning": 0.6718055555555555,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667
+ },
+ "language": {
+ "Chinese": 0.6487038966161633,
+ "English": 0.6212662140008957
+ }
+ },
+ "pass@2": 0.4,
+ "BoN-3": {
+ "overall_metric": 0.6753829492782434,
+ "token_length": {
+ "8k": 0.8185554078642306,
+ "16k": 0.7599530890518512,
+ "32k": 0.7311990883460826,
+ "64k": 0.6754068771726057,
+ "128k": 0.5730612976856176,
+ "256k": 0.49412193554907724
+ },
+ "contextual_requirement": {
+ "Full": 0.6412352384515387,
+ "Partial": 0.718843672148596
+ },
+ "difficulty": {
+ "Easy": 0.8844767087888782,
+ "Moderate": 0.717837978263145,
+ "Hard": 0.600883682680195,
+ "Extreme": 0.4673774778829585
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.896714960990196,
+ "T2. Sequencing & Structure Reconstruction": 0.8421179283679281,
+ "T3. Evidence-Grounded QA": 0.675,
+ "T4. Summarization & Synthesis": 0.5712469046961466,
+ "T5. Attribution & Citation Alignment": 0.6300235179241888,
+ "T6. Aggregation & Clustering": 0.6348800775177585,
+ "T7. Consistency & Compliance Checking": 0.4736376213352886,
+ "T8. Structured & Numeric Reasoning": 0.7824074074074073,
+ "T9. Version & Code Diff Analysis": 0.7424625612755764,
+ "T10. Rule Induction & In-Context Learning": 0.765,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
+ },
+ "language": {
+ "Chinese": 0.6895150466449506,
+ "English": 0.6612508519115379
+ }
+ },
+ "pass@3": 0.444
+}
\ No newline at end of file
diff --git a/results/GLM-4.6/nonthinking_context-120000_bon-3_summary.json b/results/GLM-4.6/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..80081a99bad607c4b85bc39af6499da435db9b66
--- /dev/null
+++ b/results/GLM-4.6/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.45854238430368943,
+ "inference_iteration_1_overall_metric": 0.44890397156188516,
+ "inference_iteration_2_overall_metric": 0.4676756901179884,
+ "inference_iteration_3_overall_metric": 0.45904749123119587,
+ "average_token_length_metric": {
+ "8k": 0.539826456202734,
+ "16k": 0.49883990878468565,
+ "32k": 0.5226004279628154,
+ "64k": 0.4617605114078172,
+ "128k": 0.3868307627999842,
+ "256k": 0.34139623866410224
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.43758451425043066,
+ "Partial": 0.4852160370987465
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5881815896217503,
+ "Moderate": 0.40045739164324096,
+ "Hard": 0.43071501653405486,
+ "Extreme": 0.3729573279423014
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7391294273870368,
+ "T2. Sequencing & Structure Reconstruction": 0.7168876379566528,
+ "T3. Evidence-Grounded QA": 0.48333333333333334,
+ "T4. Summarization & Synthesis": 0.54430615481068,
+ "T5. Attribution & Citation Alignment": 0.5036524549754317,
+ "T6. Aggregation & Clustering": 0.4230515938862694,
+ "T7. Consistency & Compliance Checking": 0.28024190494681,
+ "T8. Structured & Numeric Reasoning": 0.204783950617284,
+ "T9. Version & Code Diff Analysis": 0.5493453618981832,
+ "T10. Rule Induction & In-Context Learning": 0.48856481481481484,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3444444444444445
+ },
+ "average_language_metric": {
+ "Chinese": 0.4607293656684521,
+ "English": 0.45635540293892746
+ },
+ "BoN-1": {
+ "overall_metric": 0.44890397156188516,
+ "token_length": {
+ "8k": 0.5369079824065298,
+ "16k": 0.4950184758232463,
+ "32k": 0.5119378052749619,
+ "64k": 0.4359616248204175,
+ "128k": 0.3900490912931948,
+ "256k": 0.323548849752962
+ },
+ "contextual_requirement": {
+ "Full": 0.42475510461227917,
+ "Partial": 0.47963889313411256
+ },
+ "difficulty": {
+ "Easy": 0.5839417584803529,
+ "Moderate": 0.38543423470966093,
+ "Hard": 0.4251833017579716,
+ "Extreme": 0.3582444584458006
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7552782575667283,
+ "T2. Sequencing & Structure Reconstruction": 0.72015059015059,
+ "T3. Evidence-Grounded QA": 0.425,
+ "T4. Summarization & Synthesis": 0.5415035414147457,
+ "T5. Attribution & Citation Alignment": 0.49621823831328055,
+ "T6. Aggregation & Clustering": 0.41038094275648807,
+ "T7. Consistency & Compliance Checking": 0.27110571290315905,
+ "T8. Structured & Numeric Reasoning": 0.1953703703703704,
+ "T9. Version & Code Diff Analysis": 0.5256412558109732,
+ "T10. Rule Induction & In-Context Learning": 0.4822222222222222,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35
+ },
+ "language": {
+ "Chinese": 0.44671558878002343,
+ "English": 0.45109235434374795
+ }
+ },
+ "pass@1": 0.19266666666666668,
+ "BoN-2": {
+ "overall_metric": 0.5266630332287439,
+ "token_length": {
+ "8k": 0.604236555945032,
+ "16k": 0.5564543546519637,
+ "32k": 0.5870438443448399,
+ "64k": 0.5387565602861155,
+ "128k": 0.46692762105066604,
+ "256k": 0.4065592630938467
+ },
+ "contextual_requirement": {
+ "Full": 0.5015019335956029,
+ "Partial": 0.5586862509436522
+ },
+ "difficulty": {
+ "Easy": 0.6773656505276844,
+ "Moderate": 0.47328676466113107,
+ "Hard": 0.49837787163941455,
+ "Extreme": 0.4152118781770787
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7867939033692525,
+ "T2. Sequencing & Structure Reconstruction": 0.7854105213890441,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5572174880251567,
+ "T5. Attribution & Citation Alignment": 0.573956288098141,
+ "T6. Aggregation & Clustering": 0.4915724205045227,
+ "T7. Consistency & Compliance Checking": 0.33460325146257974,
+ "T8. Structured & Numeric Reasoning": 0.2578703703703704,
+ "T9. Version & Code Diff Analysis": 0.6423128731937172,
+ "T10. Rule Induction & In-Context Learning": 0.6198611111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
+ },
+ "language": {
+ "Chinese": 0.518790207475404,
+ "English": 0.5345358589820849
+ }
+ },
+ "pass@2": 0.25466666666666665,
+ "BoN-3": {
+ "overall_metric": 0.5609610306922399,
+ "token_length": {
+ "8k": 0.6386076789983793,
+ "16k": 0.5860184400135818,
+ "32k": 0.6210153091497477,
+ "64k": 0.5871225156541778,
+ "128k": 0.4913536643924111,
+ "256k": 0.4416485759451446
+ },
+ "contextual_requirement": {
+ "Full": 0.5370910562726694,
+ "Partial": 0.5913409981353319
+ },
+ "difficulty": {
+ "Easy": 0.7100406216614994,
+ "Moderate": 0.5174437099567866,
+ "Hard": 0.5265960486680222,
+ "Extreme": 0.44880562762488274
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.815117939706351,
+ "T2. Sequencing & Structure Reconstruction": 0.8064498061783285,
+ "T3. Evidence-Grounded QA": 0.6416666666666667,
+ "T4. Summarization & Synthesis": 0.5680717589841464,
+ "T5. Attribution & Citation Alignment": 0.604565984118908,
+ "T6. Aggregation & Clustering": 0.5283992168363204,
+ "T7. Consistency & Compliance Checking": 0.35872230397543203,
+ "T8. Structured & Numeric Reasoning": 0.28935185185185186,
+ "T9. Version & Code Diff Analysis": 0.6707362245587523,
+ "T10. Rule Induction & In-Context Learning": 0.6406944444444443,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
+ },
+ "language": {
+ "Chinese": 0.5609588769304155,
+ "English": 0.5609631844540666
+ }
+ },
+ "pass@3": 0.286
+}
\ No newline at end of file
diff --git a/results/GLM-4.6/thinking_context-120000_bon-3_summary.json b/results/GLM-4.6/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..42a80ff8e04de9b7cdc2bb58d04268e6b7a11256
--- /dev/null
+++ b/results/GLM-4.6/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5820993757625644,
+ "inference_iteration_1_overall_metric": 0.5900318347862288,
+ "inference_iteration_2_overall_metric": 0.5774825139114689,
+ "inference_iteration_3_overall_metric": 0.5787837785899949,
+ "average_token_length_metric": {
+ "8k": 0.7122784818137915,
+ "16k": 0.6603518496747058,
+ "32k": 0.6352743108645184,
+ "64k": 0.5897286272690893,
+ "128k": 0.475467875017661,
+ "256k": 0.4194951099356217
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5470143190278319,
+ "Partial": 0.6267530843340428
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7978473417092227,
+ "Moderate": 0.6094768922677877,
+ "Hard": 0.4892370620605133,
+ "Extreme": 0.3887688912252786
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8197327977970514,
+ "T2. Sequencing & Structure Reconstruction": 0.8006519321293782,
+ "T3. Evidence-Grounded QA": 0.538888888888889,
+ "T4. Summarization & Synthesis": 0.5408566771607968,
+ "T5. Attribution & Citation Alignment": 0.5337112988588841,
+ "T6. Aggregation & Clustering": 0.5397680321862239,
+ "T7. Consistency & Compliance Checking": 0.380513781495624,
+ "T8. Structured & Numeric Reasoning": 0.6123456790123456,
+ "T9. Version & Code Diff Analysis": 0.6754501038965057,
+ "T10. Rule Induction & In-Context Learning": 0.6013425925925924,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.46666666666666673
+ },
+ "average_language_metric": {
+ "Chinese": 0.5991918774535788,
+ "English": 0.5650068740715505
+ },
+ "BoN-1": {
+ "overall_metric": 0.5900318347862288,
+ "token_length": {
+ "8k": 0.7190910013269595,
+ "16k": 0.6680291983169964,
+ "32k": 0.6447298296516131,
+ "64k": 0.5905857251798682,
+ "128k": 0.4766512837488421,
+ "256k": 0.4411039704930917
+ },
+ "contextual_requirement": {
+ "Full": 0.5455741223917344,
+ "Partial": 0.6466143778337665
+ },
+ "difficulty": {
+ "Easy": 0.8187580590040169,
+ "Moderate": 0.6184177733393077,
+ "Hard": 0.4890540077544932,
+ "Extreme": 0.38715232500749663
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8264146064969754,
+ "T2. Sequencing & Structure Reconstruction": 0.8039418272654707,
+ "T3. Evidence-Grounded QA": 0.55,
+ "T4. Summarization & Synthesis": 0.5427878753470068,
+ "T5. Attribution & Citation Alignment": 0.5419657251498339,
+ "T6. Aggregation & Clustering": 0.5399584229450126,
+ "T7. Consistency & Compliance Checking": 0.37011974365960826,
+ "T8. Structured & Numeric Reasoning": 0.6416666666666667,
+ "T9. Version & Code Diff Analysis": 0.693087317328304,
+ "T10. Rule Induction & In-Context Learning": 0.6145833333333334,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.475
+ },
+ "language": {
+ "Chinese": 0.6075617792305364,
+ "English": 0.5725018903419208
+ }
+ },
+ "pass@1": 0.36466666666666664,
+ "BoN-2": {
+ "overall_metric": 0.6659160211594685,
+ "token_length": {
+ "8k": 0.7935291454736249,
+ "16k": 0.7469698033059613,
+ "32k": 0.7147019641303554,
+ "64k": 0.6716314717341791,
+ "128k": 0.5707409059251077,
+ "256k": 0.497922836387586
+ },
+ "contextual_requirement": {
+ "Full": 0.6309312423015924,
+ "Partial": 0.7104421033422219
+ },
+ "difficulty": {
+ "Easy": 0.8923409007483016,
+ "Moderate": 0.7327459154582487,
+ "Hard": 0.5632013922542815,
+ "Extreme": 0.4414476037490934
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.880882952904511,
+ "T2. Sequencing & Structure Reconstruction": 0.8476685999185997,
+ "T3. Evidence-Grounded QA": 0.6666666666666666,
+ "T4. Summarization & Synthesis": 0.5583277247093672,
+ "T5. Attribution & Citation Alignment": 0.6177785331058078,
+ "T6. Aggregation & Clustering": 0.6268647075743846,
+ "T7. Consistency & Compliance Checking": 0.4566279411326888,
+ "T8. Structured & Numeric Reasoning": 0.7217592592592593,
+ "T9. Version & Code Diff Analysis": 0.7464145919055787,
+ "T10. Rule Induction & In-Context Learning": 0.7400000000000001,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5583333333333333
+ },
+ "language": {
+ "Chinese": 0.6881369643762367,
+ "English": 0.6436950779427024
+ }
+ },
+ "pass@2": 0.43866666666666665,
+ "BoN-3": {
+ "overall_metric": 0.706844154531168,
+ "token_length": {
+ "8k": 0.8222844269368486,
+ "16k": 0.7772591556882481,
+ "32k": 0.7466099276083229,
+ "64k": 0.7234900151981559,
+ "128k": 0.62014724878678,
+ "256k": 0.551274152968658
+ },
+ "contextual_requirement": {
+ "Full": 0.6725119993190923,
+ "Partial": 0.7505396248010854
+ },
+ "difficulty": {
+ "Easy": 0.9285919042851156,
+ "Moderate": 0.7918869916930842,
+ "Hard": 0.6026672046571206,
+ "Extreme": 0.4764972072411785
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9055704999975606,
+ "T2. Sequencing & Structure Reconstruction": 0.8679013780100734,
+ "T3. Evidence-Grounded QA": 0.7416666666666667,
+ "T4. Summarization & Synthesis": 0.5683880603721403,
+ "T5. Attribution & Citation Alignment": 0.6545164467516165,
+ "T6. Aggregation & Clustering": 0.6671955091257745,
+ "T7. Consistency & Compliance Checking": 0.5102082868832639,
+ "T8. Structured & Numeric Reasoning": 0.7717592592592593,
+ "T9. Version & Code Diff Analysis": 0.7612642969391072,
+ "T10. Rule Induction & In-Context Learning": 0.7541666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333
+ },
+ "language": {
+ "Chinese": 0.7225009620618484,
+ "English": 0.6911873470004898
+ }
+ },
+ "pass@3": 0.48133333333333334
+}
\ No newline at end of file
diff --git a/results/GPT-4o/nonthinking_context-120000_bon-3_summary.json b/results/GPT-4o/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..f34052712384c2995792bfc0438bdf36c43f10ec
--- /dev/null
+++ b/results/GPT-4o/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.46665010092977977,
+ "inference_iteration_1_overall_metric": 0.4658401067882854,
+ "inference_iteration_2_overall_metric": 0.46753496394327626,
+ "inference_iteration_3_overall_metric": 0.466575232057776,
+ "average_token_length_metric": {
+ "8k": 0.5113488376851383,
+ "16k": 0.4997009141224516,
+ "32k": 0.5251055066966325,
+ "64k": 0.45692433752384126,
+ "128k": 0.4357776587958875,
+ "256k": 0.37104335075472816
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.4340554247509889,
+ "Partial": 0.5081342342482407
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5937760942513476,
+ "Moderate": 0.4302951347009006,
+ "Hard": 0.4487990540053617,
+ "Extreme": 0.3629928487032055
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7349022081832958,
+ "T2. Sequencing & Structure Reconstruction": 0.7250800279966945,
+ "T3. Evidence-Grounded QA": 0.522222222222222,
+ "T4. Summarization & Synthesis": 0.5082664460738612,
+ "T5. Attribution & Citation Alignment": 0.5342878338439898,
+ "T6. Aggregation & Clustering": 0.4265212400920285,
+ "T7. Consistency & Compliance Checking": 0.27964395302062434,
+ "T8. Structured & Numeric Reasoning": 0.21157407407407405,
+ "T9. Version & Code Diff Analysis": 0.5920364002998717,
+ "T10. Rule Induction & In-Context Learning": 0.4702777777777778,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.36944444444444435
+ },
+ "average_language_metric": {
+ "Chinese": 0.4566116864733059,
+ "English": 0.47668851538625334
+ },
+ "BoN-1": {
+ "overall_metric": 0.4658401067882854,
+ "token_length": {
+ "8k": 0.49942481947489,
+ "16k": 0.5075293232399243,
+ "32k": 0.535471182543319,
+ "64k": 0.4466006941173288,
+ "128k": 0.43362259025283534,
+ "256k": 0.37239203110142105
+ },
+ "contextual_requirement": {
+ "Full": 0.434998076864218,
+ "Partial": 0.5050935994189195
+ },
+ "difficulty": {
+ "Easy": 0.5852393493724929,
+ "Moderate": 0.43887833780663893,
+ "Hard": 0.4558206214636842,
+ "Extreme": 0.3593336239903742
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7408543245056541,
+ "T2. Sequencing & Structure Reconstruction": 0.7499819162319159,
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
+ "T4. Summarization & Synthesis": 0.5048929077004661,
+ "T5. Attribution & Citation Alignment": 0.5359141447270879,
+ "T6. Aggregation & Clustering": 0.42758922871826105,
+ "T7. Consistency & Compliance Checking": 0.27436727289246843,
+ "T8. Structured & Numeric Reasoning": 0.2152777777777778,
+ "T9. Version & Code Diff Analysis": 0.592312178161249,
+ "T10. Rule Induction & In-Context Learning": 0.44819444444444445,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334
+ },
+ "language": {
+ "Chinese": 0.4424240788239187,
+ "English": 0.48925613475265434
+ }
+ },
+ "pass@1": 0.194,
+ "BoN-2": {
+ "overall_metric": 0.5398849816112041,
+ "token_length": {
+ "8k": 0.5759687409807404,
+ "16k": 0.549906030476181,
+ "32k": 0.6066920241775022,
+ "64k": 0.5317817512096467,
+ "128k": 0.5214872601681293,
+ "256k": 0.4534740826550301
+ },
+ "contextual_requirement": {
+ "Full": 0.5083509910328655,
+ "Partial": 0.5800191514381833
+ },
+ "difficulty": {
+ "Easy": 0.6782240284299118,
+ "Moderate": 0.5120970500900949,
+ "Hard": 0.5256120585356611,
+ "Extreme": 0.41596717800169664
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7993838319887435,
+ "T2. Sequencing & Structure Reconstruction": 0.7934912309912311,
+ "T3. Evidence-Grounded QA": 0.6666666666666666,
+ "T4. Summarization & Synthesis": 0.5276777173176291,
+ "T5. Attribution & Citation Alignment": 0.6042284709538017,
+ "T6. Aggregation & Clustering": 0.4920141039764696,
+ "T7. Consistency & Compliance Checking": 0.34899250013259275,
+ "T8. Structured & Numeric Reasoning": 0.26296296296296295,
+ "T9. Version & Code Diff Analysis": 0.6671322238361709,
+ "T10. Rule Induction & In-Context Learning": 0.5840277777777777,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.45
+ },
+ "language": {
+ "Chinese": 0.525731927704756,
+ "English": 0.5540380355176548
+ }
+ },
+ "pass@2": 0.25466666666666665,
+ "BoN-3": {
+ "overall_metric": 0.5744593750081309,
+ "token_length": {
+ "8k": 0.606512147802086,
+ "16k": 0.5958370007923852,
+ "32k": 0.6384127504911996,
+ "64k": 0.5707759455131253,
+ "128k": 0.5568976109117445,
+ "256k": 0.4783207945382497
+ },
+ "contextual_requirement": {
+ "Full": 0.5391592397427731,
+ "Partial": 0.6193868198913163
+ },
+ "difficulty": {
+ "Easy": 0.7196856800163218,
+ "Moderate": 0.540664277732673,
+ "Hard": 0.5601282438717835,
+ "Extreme": 0.44698074091055134
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8103075764869724,
+ "T2. Sequencing & Structure Reconstruction": 0.8166357716357717,
+ "T3. Evidence-Grounded QA": 0.7,
+ "T4. Summarization & Synthesis": 0.5377648665242384,
+ "T5. Attribution & Citation Alignment": 0.6423496422537093,
+ "T6. Aggregation & Clustering": 0.532547329213996,
+ "T7. Consistency & Compliance Checking": 0.40458496685251083,
+ "T8. Structured & Numeric Reasoning": 0.30046296296296293,
+ "T9. Version & Code Diff Analysis": 0.6915969977123051,
+ "T10. Rule Induction & In-Context Learning": 0.6340277777777777,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.49166666666666664
+ },
+ "language": {
+ "Chinese": 0.563099563818372,
+ "English": 0.5858191861978923
+ }
+ },
+ "pass@3": 0.286
+}
\ No newline at end of file
diff --git a/results/GPT-4o/thinking_context-120000_bon-3_summary.json b/results/GPT-4o/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..09045b4f19a91cc5d02399fa1e1d6a228181932f
--- /dev/null
+++ b/results/GPT-4o/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.4943586900400841,
+ "inference_iteration_1_overall_metric": 0.4968481802669354,
+ "inference_iteration_2_overall_metric": 0.4953906178052376,
+ "inference_iteration_3_overall_metric": 0.49083727204807814,
+ "average_token_length_metric": {
+ "8k": 0.5879811335998006,
+ "16k": 0.5326400416753286,
+ "32k": 0.512948102728002,
+ "64k": 0.4721690409999518,
+ "128k": 0.44658724759711643,
+ "256k": 0.41382657364030534
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.46488316610562114,
+ "Partial": 0.5318729932294004
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7183848834775743,
+ "Moderate": 0.43069679620968054,
+ "Hard": 0.41352044386464876,
+ "Extreme": 0.34385849736921437
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7592784040430812,
+ "T2. Sequencing & Structure Reconstruction": 0.6992785706820794,
+ "T3. Evidence-Grounded QA": 0.41944444444444456,
+ "T4. Summarization & Synthesis": 0.4904128501144795,
+ "T5. Attribution & Citation Alignment": 0.5555445495468067,
+ "T6. Aggregation & Clustering": 0.46463843332938043,
+ "T7. Consistency & Compliance Checking": 0.26892122251820344,
+ "T8. Structured & Numeric Reasoning": 0.4810185185185185,
+ "T9. Version & Code Diff Analysis": 0.5265279154913766,
+ "T10. Rule Induction & In-Context Learning": 0.48212962962962963,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4250000000000001
+ },
+ "average_language_metric": {
+ "Chinese": 0.46257560421163496,
+ "English": 0.526141775868533
+ },
+ "BoN-1": {
+ "overall_metric": 0.4968481802669354,
+ "token_length": {
+ "8k": 0.5741867935281576,
+ "16k": 0.535374756046493,
+ "32k": 0.5202544808789716,
+ "64k": 0.47647427072064424,
+ "128k": 0.4412042204935842,
+ "256k": 0.4335945599337654
+ },
+ "contextual_requirement": {
+ "Full": 0.47247358832994213,
+ "Partial": 0.5278703881867464
+ },
+ "difficulty": {
+ "Easy": 0.7337781908034996,
+ "Moderate": 0.4336235016656763,
+ "Hard": 0.39945319018108144,
+ "Extreme": 0.3428000420213732
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.75748852621513,
+ "T2. Sequencing & Structure Reconstruction": 0.7128378857984122,
+ "T3. Evidence-Grounded QA": 0.4166666666666667,
+ "T4. Summarization & Synthesis": 0.4915611304736425,
+ "T5. Attribution & Citation Alignment": 0.5573645055278006,
+ "T6. Aggregation & Clustering": 0.4608094132930736,
+ "T7. Consistency & Compliance Checking": 0.2337605934000851,
+ "T8. Structured & Numeric Reasoning": 0.5046296296296297,
+ "T9. Version & Code Diff Analysis": 0.5617174175041987,
+ "T10. Rule Induction & In-Context Learning": 0.5141666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4
+ },
+ "language": {
+ "Chinese": 0.46378379039245493,
+ "English": 0.5299125701414172
+ }
+ },
+ "pass@1": 0.25266666666666665,
+ "BoN-2": {
+ "overall_metric": 0.5733973874130963,
+ "token_length": {
+ "8k": 0.6607408168420358,
+ "16k": 0.6168176180801352,
+ "32k": 0.6026001586229682,
+ "64k": 0.5549119793217003,
+ "128k": 0.5219672618111347,
+ "256k": 0.4833464898006067
+ },
+ "contextual_requirement": {
+ "Full": 0.5468729804164447,
+ "Partial": 0.6071557235906547
+ },
+ "difficulty": {
+ "Easy": 0.832354149230665,
+ "Moderate": 0.5035087598472016,
+ "Hard": 0.482911736547395,
+ "Extreme": 0.39505876757369657
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8194314757440243,
+ "T2. Sequencing & Structure Reconstruction": 0.7647144522144521,
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
+ "T4. Summarization & Synthesis": 0.507756894787762,
+ "T5. Attribution & Citation Alignment": 0.6431090618973574,
+ "T6. Aggregation & Clustering": 0.5325685690744169,
+ "T7. Consistency & Compliance Checking": 0.31928948521783374,
+ "T8. Structured & Numeric Reasoning": 0.5949074074074073,
+ "T9. Version & Code Diff Analysis": 0.5981405988039616,
+ "T10. Rule Induction & In-Context Learning": 0.6058333333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
+ },
+ "language": {
+ "Chinese": 0.5418805636874472,
+ "English": 0.6049142111387469
+ }
+ },
+ "pass@2": 0.33,
+ "BoN-3": {
+ "overall_metric": 0.6133026799603023,
+ "token_length": {
+ "8k": 0.7092280973251848,
+ "16k": 0.6573848999983455,
+ "32k": 0.6374049433271965,
+ "64k": 0.5884360272634136,
+ "128k": 0.5736764233126421,
+ "256k": 0.513685688535035
+ },
+ "contextual_requirement": {
+ "Full": 0.585559935676702,
+ "Partial": 0.6486116272303409
+ },
+ "difficulty": {
+ "Easy": 0.8579938866329095,
+ "Moderate": 0.5531379960724733,
+ "Hard": 0.5375404413799146,
+ "Extreme": 0.4345338594537549
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8365910773415853,
+ "T2. Sequencing & Structure Reconstruction": 0.7977771302771303,
+ "T3. Evidence-Grounded QA": 0.5916666666666667,
+ "T4. Summarization & Synthesis": 0.5159806774505266,
+ "T5. Attribution & Citation Alignment": 0.6935011362804332,
+ "T6. Aggregation & Clustering": 0.5705653708431486,
+ "T7. Consistency & Compliance Checking": 0.381137305153889,
+ "T8. Structured & Numeric Reasoning": 0.6296296296296297,
+ "T9. Version & Code Diff Analysis": 0.6584905752696673,
+ "T10. Rule Induction & In-Context Learning": 0.6336111111111112,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667
+ },
+ "language": {
+ "Chinese": 0.5780672262010988,
+ "English": 0.6485381337195074
+ }
+ },
+ "pass@3": 0.36666666666666664
+}
\ No newline at end of file
diff --git a/results/GPT-5/thinking_context-272000_bon-3_summary.json b/results/GPT-5/thinking_context-272000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..184b0204bdfe3a96f1df404699b209378784eb42
--- /dev/null
+++ b/results/GPT-5/thinking_context-272000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.726053089253122,
+ "inference_iteration_1_overall_metric": 0.7242860759291603,
+ "inference_iteration_2_overall_metric": 0.72436075729001,
+ "inference_iteration_3_overall_metric": 0.729512434540192,
+ "average_token_length_metric": {
+ "8k": 0.7537078410340138,
+ "16k": 0.7627066310839429,
+ "32k": 0.7434290864816196,
+ "64k": 0.7646193918174649,
+ "128k": 0.6936202889645278,
+ "256k": 0.638235296137159
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.6915568234658586,
+ "Partial": 0.7699574275278195
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8523326045847652,
+ "Moderate": 0.8231088494697211,
+ "Hard": 0.787367547123676,
+ "Extreme": 0.4836991814871219
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.9032376385150938,
+ "T2. Sequencing & Structure Reconstruction": 0.9075063054229715,
+ "T3. Evidence-Grounded QA": 0.6666666666666666,
+ "T4. Summarization & Synthesis": 0.5256066584699448,
+ "T5. Attribution & Citation Alignment": 0.8116994715897818,
+ "T6. Aggregation & Clustering": 0.6716265654111317,
+ "T7. Consistency & Compliance Checking": 0.631179283519898,
+ "T8. Structured & Numeric Reasoning": 0.7979938271604939,
+ "T9. Version & Code Diff Analysis": 0.818404768269679,
+ "T10. Rule Induction & In-Context Learning": 0.6802314814814814,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6111111111111112
+ },
+ "average_language_metric": {
+ "Chinese": 0.7196645097291159,
+ "English": 0.7324416687771269
+ },
+ "BoN-1": {
+ "overall_metric": 0.7242860759291603,
+ "token_length": {
+ "8k": 0.7638228227994025,
+ "16k": 0.7511485364018967,
+ "32k": 0.7397315002658593,
+ "64k": 0.7648062624572959,
+ "128k": 0.6947065191324134,
+ "256k": 0.6315008145180959
+ },
+ "contextual_requirement": {
+ "Full": 0.6845638507619599,
+ "Partial": 0.7748416352328712
+ },
+ "difficulty": {
+ "Easy": 0.8419121655420269,
+ "Moderate": 0.8140896757444649,
+ "Hard": 0.8018107002313927,
+ "Extreme": 0.4855278214669571
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9022908711992736,
+ "T2. Sequencing & Structure Reconstruction": 0.9003492803492802,
+ "T3. Evidence-Grounded QA": 0.6666666666666666,
+ "T4. Summarization & Synthesis": 0.525285592483348,
+ "T5. Attribution & Citation Alignment": 0.8350389199886978,
+ "T6. Aggregation & Clustering": 0.6728116198035761,
+ "T7. Consistency & Compliance Checking": 0.6250527729039961,
+ "T8. Structured & Numeric Reasoning": 0.7824074074074074,
+ "T9. Version & Code Diff Analysis": 0.8228424738103258,
+ "T10. Rule Induction & In-Context Learning": 0.6890277777777778,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5916666666666667
+ },
+ "language": {
+ "Chinese": 0.7225808137285838,
+ "English": 0.725991338129738
+ }
+ },
+ "pass@1": 0.5033333333333333,
+ "BoN-2": {
+ "overall_metric": 0.773365567880672,
+ "token_length": {
+ "8k": 0.7988567725267066,
+ "16k": 0.7953552672252621,
+ "32k": 0.7853032014648265,
+ "64k": 0.8171591510524335,
+ "128k": 0.7387615265550217,
+ "256k": 0.7047574884597809
+ },
+ "contextual_requirement": {
+ "Full": 0.740254405395005,
+ "Partial": 0.8155070474078848
+ },
+ "difficulty": {
+ "Easy": 0.8943471956938479,
+ "Moderate": 0.8694949682853881,
+ "Hard": 0.8603608124174508,
+ "Extreme": 0.5205560974396651
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9235081407527015,
+ "T2. Sequencing & Structure Reconstruction": 0.9270526695526693,
+ "T3. Evidence-Grounded QA": 0.7583333333333333,
+ "T4. Summarization & Synthesis": 0.5388141391367185,
+ "T5. Attribution & Citation Alignment": 0.8662194687189113,
+ "T6. Aggregation & Clustering": 0.724952326567939,
+ "T7. Consistency & Compliance Checking": 0.6769275451403334,
+ "T8. Structured & Numeric Reasoning": 0.837962962962963,
+ "T9. Version & Code Diff Analysis": 0.8518498172294341,
+ "T10. Rule Induction & In-Context Learning": 0.749861111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6916666666666667
+ },
+ "language": {
+ "Chinese": 0.7653921632804664,
+ "English": 0.7813389724808776
+ }
+ },
+ "pass@2": 0.5773333333333334,
+ "BoN-3": {
+ "overall_metric": 0.7997603117800453,
+ "token_length": {
+ "8k": 0.8156058789899132,
+ "16k": 0.8312258319915683,
+ "32k": 0.8146647150412942,
+ "64k": 0.8402343004850696,
+ "128k": 0.7648319163907665,
+ "256k": 0.7319992277816549
+ },
+ "contextual_requirement": {
+ "Full": 0.7681553658992594,
+ "Partial": 0.8399847883555894
+ },
+ "difficulty": {
+ "Easy": 0.9168344057692764,
+ "Moderate": 0.9117105202934518,
+ "Hard": 0.8867394849893248,
+ "Extreme": 0.5408505285512573
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9362853987173203,
+ "T2. Sequencing & Structure Reconstruction": 0.9359547859547858,
+ "T3. Evidence-Grounded QA": 0.7916666666666666,
+ "T4. Summarization & Synthesis": 0.5458038576746401,
+ "T5. Attribution & Citation Alignment": 0.8823540286034711,
+ "T6. Aggregation & Clustering": 0.7446436845926303,
+ "T7. Consistency & Compliance Checking": 0.6987021524631377,
+ "T8. Structured & Numeric Reasoning": 0.8824074074074073,
+ "T9. Version & Code Diff Analysis": 0.8622815151611319,
+ "T10. Rule Induction & In-Context Learning": 0.8040277777777778,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.75
+ },
+ "language": {
+ "Chinese": 0.7871986587353806,
+ "English": 0.8123219648247083
+ }
+ },
+ "pass@3": 0.6106666666666667
+}
\ No newline at end of file
diff --git a/results/GPT-OSS-120B/thinking_context-120000_bon-3_summary.json b/results/GPT-OSS-120B/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..aa68bddd3ad0e0c1a53cc58998c214b93c7c6405
--- /dev/null
+++ b/results/GPT-OSS-120B/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5260760130553013,
+ "inference_iteration_1_overall_metric": 0.5251990705311491,
+ "inference_iteration_2_overall_metric": 0.5187040802401437,
+ "inference_iteration_3_overall_metric": 0.5343248883946079,
+ "average_token_length_metric": {
+ "8k": 0.6379995894817992,
+ "16k": 0.6200629617253591,
+ "32k": 0.5668769322787303,
+ "64k": 0.5173492904735919,
+ "128k": 0.4362186866504548,
+ "256k": 0.3779486177218682
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.49116755127749995,
+ "Partial": 0.5705049644088642
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7406025674065024,
+ "Moderate": 0.506610347347898,
+ "Hard": 0.44966953179643426,
+ "Extreme": 0.3540424932279647
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7441033370277687,
+ "T2. Sequencing & Structure Reconstruction": 0.7329905896572565,
+ "T3. Evidence-Grounded QA": 0.5333333333333334,
+ "T4. Summarization & Synthesis": 0.5106082800845382,
+ "T5. Attribution & Citation Alignment": 0.46625824816375694,
+ "T6. Aggregation & Clustering": 0.5279484217060981,
+ "T7. Consistency & Compliance Checking": 0.31563292534840204,
+ "T8. Structured & Numeric Reasoning": 0.5515432098765432,
+ "T9. Version & Code Diff Analysis": 0.5119880580465573,
+ "T10. Rule Induction & In-Context Learning": 0.5589814814814815,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
+ },
+ "average_language_metric": {
+ "Chinese": 0.505405756924764,
+ "English": 0.5467462691858365
+ },
+ "BoN-1": {
+ "overall_metric": 0.5251990705311491,
+ "token_length": {
+ "8k": 0.6529081123251393,
+ "16k": 0.6200667957335821,
+ "32k": 0.5763521514454887,
+ "64k": 0.49832867440843903,
+ "128k": 0.4350202675435077,
+ "256k": 0.36851842173074006
+ },
+ "contextual_requirement": {
+ "Full": 0.48686804900920944,
+ "Partial": 0.5739840070136185
+ },
+ "difficulty": {
+ "Easy": 0.7411278216421984,
+ "Moderate": 0.5064062158524808,
+ "Hard": 0.4567280838491506,
+ "Extreme": 0.34597541625321154
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.737237169976606,
+ "T2. Sequencing & Structure Reconstruction": 0.7325725663225663,
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
+ "T4. Summarization & Synthesis": 0.5064758252528795,
+ "T5. Attribution & Citation Alignment": 0.4582161191244139,
+ "T6. Aggregation & Clustering": 0.5344712887432478,
+ "T7. Consistency & Compliance Checking": 0.3124319752506027,
+ "T8. Structured & Numeric Reasoning": 0.5666666666666667,
+ "T9. Version & Code Diff Analysis": 0.5286040271943487,
+ "T10. Rule Induction & In-Context Learning": 0.5565277777777777,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
+ },
+ "language": {
+ "Chinese": 0.5156423632056797,
+ "English": 0.534755777856619
+ }
+ },
+ "pass@1": 0.2833333333333333,
+ "BoN-2": {
+ "overall_metric": 0.6024165651661463,
+ "token_length": {
+ "8k": 0.7164315027505042,
+ "16k": 0.700257643310589,
+ "32k": 0.6507634459310141,
+ "64k": 0.5868187511846459,
+ "128k": 0.5172068676740627,
+ "256k": 0.4430211801460688
+ },
+ "contextual_requirement": {
+ "Full": 0.5670686758499749,
+ "Partial": 0.6474047879321861
+ },
+ "difficulty": {
+ "Easy": 0.8314482789657763,
+ "Moderate": 0.5941446742173757,
+ "Hard": 0.5272666327730237,
+ "Extreme": 0.4063157035624808
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8187672440796736,
+ "T2. Sequencing & Structure Reconstruction": 0.7801595626595622,
+ "T3. Evidence-Grounded QA": 0.6583333333333333,
+ "T4. Summarization & Synthesis": 0.5244604747568062,
+ "T5. Attribution & Citation Alignment": 0.5425411826774184,
+ "T6. Aggregation & Clustering": 0.5998244613513222,
+ "T7. Consistency & Compliance Checking": 0.38919737648291697,
+ "T8. Structured & Numeric Reasoning": 0.6231481481481482,
+ "T9. Version & Code Diff Analysis": 0.5981069547631331,
+ "T10. Rule Induction & In-Context Learning": 0.6645833333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
+ },
+ "language": {
+ "Chinese": 0.5840400269819745,
+ "English": 0.620793103350321
+ }
+ },
+ "pass@2": 0.3506666666666667,
+ "BoN-3": {
+ "overall_metric": 0.6337631142743206,
+ "token_length": {
+ "8k": 0.7401560588202145,
+ "16k": 0.7330726977884732,
+ "32k": 0.6780170211387931,
+ "64k": 0.6366671272752144,
+ "128k": 0.543143313800391,
+ "256k": 0.47152246682284277
+ },
+ "contextual_requirement": {
+ "Full": 0.5976534402407667,
+ "Partial": 0.6797208812261188
+ },
+ "difficulty": {
+ "Easy": 0.858812640816625,
+ "Moderate": 0.6436207584661483,
+ "Hard": 0.5564637457814393,
+ "Extreme": 0.43152853820526416
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.841381085118233,
+ "T2. Sequencing & Structure Reconstruction": 0.8196503034003031,
+ "T3. Evidence-Grounded QA": 0.6916666666666667,
+ "T4. Summarization & Synthesis": 0.5298447017215417,
+ "T5. Attribution & Citation Alignment": 0.5830791662518957,
+ "T6. Aggregation & Clustering": 0.6214286606830465,
+ "T7. Consistency & Compliance Checking": 0.41851724869569523,
+ "T8. Structured & Numeric Reasoning": 0.6564814814814816,
+ "T9. Version & Code Diff Analysis": 0.626359252313376,
+ "T10. Rule Induction & In-Context Learning": 0.7104166666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.575
+ },
+ "language": {
+ "Chinese": 0.6166297449202884,
+ "English": 0.6508964836283548
+ }
+ },
+ "pass@3": 0.382
+}
\ No newline at end of file
diff --git a/results/GPT-OSS-20B/thinking_context-120000_bon-3_summary.json b/results/GPT-OSS-20B/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b91f8679753c4a7cb443194902a9199a89c5481
--- /dev/null
+++ b/results/GPT-OSS-20B/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.4466309565832364,
+ "inference_iteration_1_overall_metric": 0.4454625807266656,
+ "inference_iteration_2_overall_metric": 0.45246537487177085,
+ "inference_iteration_3_overall_metric": 0.44196491415127315,
+ "average_token_length_metric": {
+ "8k": 0.5748339290561163,
+ "16k": 0.520513959710621,
+ "32k": 0.4891012266007553,
+ "64k": 0.41584677147603494,
+ "128k": 0.358630149540046,
+ "256k": 0.3208597031158458
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.415365323316177,
+ "Partial": 0.4864235807413135
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.650502297368124,
+ "Moderate": 0.39329906469313236,
+ "Hard": 0.35893228463928295,
+ "Extreme": 0.3159306081507978
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7015484192056548,
+ "T2. Sequencing & Structure Reconstruction": 0.685767192683859,
+ "T3. Evidence-Grounded QA": 0.45555555555555555,
+ "T4. Summarization & Synthesis": 0.4908914699997827,
+ "T5. Attribution & Citation Alignment": 0.36677196742848295,
+ "T6. Aggregation & Clustering": 0.4730052458390773,
+ "T7. Consistency & Compliance Checking": 0.20816491065985157,
+ "T8. Structured & Numeric Reasoning": 0.41743827160493835,
+ "T9. Version & Code Diff Analysis": 0.447495265816877,
+ "T10. Rule Induction & In-Context Learning": 0.4786111111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3083333333333334
+ },
+ "average_language_metric": {
+ "Chinese": 0.4149338600461589,
+ "English": 0.4783280531203148
+ },
+ "BoN-1": {
+ "overall_metric": 0.4454625807266656,
+ "token_length": {
+ "8k": 0.568519720930888,
+ "16k": 0.5262031339471792,
+ "32k": 0.4861844372065094,
+ "64k": 0.41353173269416393,
+ "128k": 0.3591760759962795,
+ "256k": 0.3191603835849737
+ },
+ "contextual_requirement": {
+ "Full": 0.4087106311919467,
+ "Partial": 0.49223778922539846
+ },
+ "difficulty": {
+ "Easy": 0.6501986428504741,
+ "Moderate": 0.4020783848645907,
+ "Hard": 0.35262004275477665,
+ "Extreme": 0.3106597264865292
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6988816931417082,
+ "T2. Sequencing & Structure Reconstruction": 0.6855601343101344,
+ "T3. Evidence-Grounded QA": 0.44166666666666665,
+ "T4. Summarization & Synthesis": 0.48711155580421095,
+ "T5. Attribution & Citation Alignment": 0.3737909056226912,
+ "T6. Aggregation & Clustering": 0.47376675235942955,
+ "T7. Consistency & Compliance Checking": 0.19543673928650457,
+ "T8. Structured & Numeric Reasoning": 0.4592592592592593,
+ "T9. Version & Code Diff Analysis": 0.4319105105134516,
+ "T10. Rule Induction & In-Context Learning": 0.4483333333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.30833333333333335
+ },
+ "language": {
+ "Chinese": 0.4135726150175988,
+ "English": 0.4773525464357322
+ }
+ },
+ "pass@1": 0.22066666666666668,
+ "BoN-2": {
+ "overall_metric": 0.5272106840251494,
+ "token_length": {
+ "8k": 0.6646954568347576,
+ "16k": 0.5904928422287453,
+ "32k": 0.5839842384318158,
+ "64k": 0.5063967419452635,
+ "128k": 0.43057993780384596,
+ "256k": 0.38711488690647056
+ },
+ "contextual_requirement": {
+ "Full": 0.49180621097136906,
+ "Partial": 0.5722709224572338
+ },
+ "difficulty": {
+ "Easy": 0.7548882368495022,
+ "Moderate": 0.4956196438725791,
+ "Hard": 0.4286236931227538,
+ "Extreme": 0.3633035715559389
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7734553050177705,
+ "T2. Sequencing & Structure Reconstruction": 0.7497803122803125,
+ "T3. Evidence-Grounded QA": 0.5833333333333334,
+ "T4. Summarization & Synthesis": 0.5045962797540592,
+ "T5. Attribution & Citation Alignment": 0.4402260724918136,
+ "T6. Aggregation & Clustering": 0.5442968593297539,
+ "T7. Consistency & Compliance Checking": 0.271121809138241,
+ "T8. Structured & Numeric Reasoning": 0.5342592592592592,
+ "T9. Version & Code Diff Analysis": 0.5242253558461997,
+ "T10. Rule Induction & In-Context Learning": 0.5733333333333334,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
+ },
+ "language": {
+ "Chinese": 0.5001287057888837,
+ "English": 0.5542926622614154
+ }
+ },
+ "pass@2": 0.288,
+ "BoN-3": {
+ "overall_metric": 0.5630326206114822,
+ "token_length": {
+ "8k": 0.696307151429279,
+ "16k": 0.6363828018704142,
+ "32k": 0.6080757988876709,
+ "64k": 0.5447446048393001,
+ "128k": 0.4651876442546681,
+ "256k": 0.4274977223875639
+ },
+ "contextual_requirement": {
+ "Full": 0.5231691902067062,
+ "Partial": 0.6137678956721072
+ },
+ "difficulty": {
+ "Easy": 0.7981617653401918,
+ "Moderate": 0.5378309814958724,
+ "Hard": 0.4581227829612033,
+ "Extreme": 0.3909189138526289
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7979486004255343,
+ "T2. Sequencing & Structure Reconstruction": 0.787096468346468,
+ "T3. Evidence-Grounded QA": 0.6416666666666667,
+ "T4. Summarization & Synthesis": 0.5114673228538231,
+ "T5. Attribution & Citation Alignment": 0.4791973901175429,
+ "T6. Aggregation & Clustering": 0.5794154259483205,
+ "T7. Consistency & Compliance Checking": 0.29817240493883673,
+ "T8. Structured & Numeric Reasoning": 0.5800925925925926,
+ "T9. Version & Code Diff Analysis": 0.5598440073472041,
+ "T10. Rule Induction & In-Context Learning": 0.6325000000000001,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665
+ },
+ "language": {
+ "Chinese": 0.543096347456943,
+ "English": 0.5829688937660222
+ }
+ },
+ "pass@3": 0.316
+}
\ No newline at end of file
diff --git a/results/Gemini-2.5-Flash/nonthinking_context-1000000_bon-3_summary.json b/results/Gemini-2.5-Flash/nonthinking_context-1000000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..bc1d77e02113183f0d8f163734122d86e16697c1
--- /dev/null
+++ b/results/Gemini-2.5-Flash/nonthinking_context-1000000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5591861836855936,
+ "inference_iteration_1_overall_metric": 0.555222408533961,
+ "inference_iteration_2_overall_metric": 0.5555746542742924,
+ "inference_iteration_3_overall_metric": 0.5667614882485269,
+ "average_token_length_metric": {
+ "8k": 0.5794836437092291,
+ "16k": 0.585038678968723,
+ "32k": 0.5764993408909757,
+ "64k": 0.5298001757287436,
+ "128k": 0.5583690767328653,
+ "256k": 0.5259261860830253
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5219144924948039,
+ "Partial": 0.6066228815647814
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6655203056614667,
+ "Moderate": 0.5398880938056573,
+ "Hard": 0.5786822600966999,
+ "Extreme": 0.4425719452767774
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7771168693642845,
+ "T2. Sequencing & Structure Reconstruction": 0.8032048969548965,
+ "T3. Evidence-Grounded QA": 0.6055555555555554,
+ "T4. Summarization & Synthesis": 0.5340660905787081,
+ "T5. Attribution & Citation Alignment": 0.745788551044203,
+ "T6. Aggregation & Clustering": 0.5023487328603856,
+ "T7. Consistency & Compliance Checking": 0.4407587176859518,
+ "T8. Structured & Numeric Reasoning": 0.2706790123456789,
+ "T9. Version & Code Diff Analysis": 0.7292026752712853,
+ "T10. Rule Induction & In-Context Learning": 0.5269907407407407,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44722222222222224
+ },
+ "average_language_metric": {
+ "Chinese": 0.5654483346944664,
+ "English": 0.5529240326767215
+ },
+ "BoN-1": {
+ "overall_metric": 0.555222408533961,
+ "token_length": {
+ "8k": 0.5888731703826585,
+ "16k": 0.5734020363631814,
+ "32k": 0.5716727282141728,
+ "64k": 0.5201046130976303,
+ "128k": 0.5511174700611993,
+ "256k": 0.5261644330849247
+ },
+ "contextual_requirement": {
+ "Full": 0.5109299072772209,
+ "Partial": 0.6115946828607216
+ },
+ "difficulty": {
+ "Easy": 0.6581097400970598,
+ "Moderate": 0.5327009022057297,
+ "Hard": 0.5733517044448927,
+ "Extreme": 0.4453988971639295
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7847565381129994,
+ "T2. Sequencing & Structure Reconstruction": 0.8209740259740258,
+ "T3. Evidence-Grounded QA": 0.5833333333333334,
+ "T4. Summarization & Synthesis": 0.5367973042699341,
+ "T5. Attribution & Citation Alignment": 0.7270779373385174,
+ "T6. Aggregation & Clustering": 0.49367147369310305,
+ "T7. Consistency & Compliance Checking": 0.444842452883972,
+ "T8. Structured & Numeric Reasoning": 0.27037037037037037,
+ "T9. Version & Code Diff Analysis": 0.7099867444467561,
+ "T10. Rule Induction & In-Context Learning": 0.522361111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665
+ },
+ "language": {
+ "Chinese": 0.5642007220552895,
+ "English": 0.5462440950126328
+ }
+ },
+ "pass@1": 0.2846666666666667,
+ "BoN-2": {
+ "overall_metric": 0.600215470701642,
+ "token_length": {
+ "8k": 0.6308006025002828,
+ "16k": 0.6218713098115006,
+ "32k": 0.6209629478481974,
+ "64k": 0.5627834642545474,
+ "128k": 0.6034759308063585,
+ "256k": 0.5613985689889686
+ },
+ "contextual_requirement": {
+ "Full": 0.5620277169409751,
+ "Partial": 0.6488180663970381
+ },
+ "difficulty": {
+ "Easy": 0.7092035834801983,
+ "Moderate": 0.562063871268018,
+ "Hard": 0.6309834472774163,
+ "Extreme": 0.4856736448985854
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8017289450670718,
+ "T2. Sequencing & Structure Reconstruction": 0.8388936988936988,
+ "T3. Evidence-Grounded QA": 0.6833333333333333,
+ "T4. Summarization & Synthesis": 0.5495701538780565,
+ "T5. Attribution & Citation Alignment": 0.7927380382693102,
+ "T6. Aggregation & Clustering": 0.5444118952178385,
+ "T7. Consistency & Compliance Checking": 0.5074368060401414,
+ "T8. Structured & Numeric Reasoning": 0.3101851851851852,
+ "T9. Version & Code Diff Analysis": 0.751572829108757,
+ "T10. Rule Induction & In-Context Learning": 0.5584722222222223,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
+ },
+ "language": {
+ "Chinese": 0.6052369717987128,
+ "English": 0.5951939696045724
+ }
+ },
+ "pass@2": 0.32666666666666666,
+ "BoN-3": {
+ "overall_metric": 0.6297638749448518,
+ "token_length": {
+ "8k": 0.6514500993442873,
+ "16k": 0.6579228544698497,
+ "32k": 0.6569374765913272,
+ "64k": 0.5879896058573842,
+ "128k": 0.6298609557348528,
+ "256k": 0.5944222576714168
+ },
+ "contextual_requirement": {
+ "Full": 0.5933694722646927,
+ "Partial": 0.6760840238105122
+ },
+ "difficulty": {
+ "Easy": 0.7491261498971669,
+ "Moderate": 0.6049484175447234,
+ "Hard": 0.6548738677798389,
+ "Extreme": 0.49881447206374147
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8141979383533716,
+ "T2. Sequencing & Structure Reconstruction": 0.8490187590187587,
+ "T3. Evidence-Grounded QA": 0.7083333333333334,
+ "T4. Summarization & Synthesis": 0.5562735797066128,
+ "T5. Attribution & Citation Alignment": 0.8231149616655856,
+ "T6. Aggregation & Clustering": 0.5745805749141115,
+ "T7. Consistency & Compliance Checking": 0.5414909010178753,
+ "T8. Structured & Numeric Reasoning": 0.3379629629629629,
+ "T9. Version & Code Diff Analysis": 0.7854748730572404,
+ "T10. Rule Induction & In-Context Learning": 0.6129166666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
+ },
+ "language": {
+ "Chinese": 0.6363071790468551,
+ "English": 0.6232205708428518
+ }
+ },
+ "pass@3": 0.3606666666666667
+}
\ No newline at end of file
diff --git a/results/Gemini-2.5-Flash/thinking_context-1000000_bon-3_summary.json b/results/Gemini-2.5-Flash/thinking_context-1000000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..b1ca1664790b9260467fadb6436ca12c6afe917b
--- /dev/null
+++ b/results/Gemini-2.5-Flash/thinking_context-1000000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.674056323049449,
+ "inference_iteration_1_overall_metric": 0.6759983703704746,
+ "inference_iteration_2_overall_metric": 0.6798720821649146,
+ "inference_iteration_3_overall_metric": 0.6662985166129568,
+ "average_token_length_metric": {
+ "8k": 0.7135890855897297,
+ "16k": 0.6856610849454701,
+ "32k": 0.6818807100418771,
+ "64k": 0.7027829296522448,
+ "128k": 0.6399411623659867,
+ "256k": 0.6204829657013889
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.63656677094683,
+ "Partial": 0.7217702984527835
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7982292314606038,
+ "Moderate": 0.7239074886730374,
+ "Hard": 0.7218614768527792,
+ "Extreme": 0.47388809993909664
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8642581912430608,
+ "T2. Sequencing & Structure Reconstruction": 0.877125374625374,
+ "T3. Evidence-Grounded QA": 0.6638888888888889,
+ "T4. Summarization & Synthesis": 0.545007270213042,
+ "T5. Attribution & Citation Alignment": 0.7904062945014397,
+ "T6. Aggregation & Clustering": 0.6528080258554949,
+ "T7. Consistency & Compliance Checking": 0.5102049505643657,
+ "T8. Structured & Numeric Reasoning": 0.6658950617283949,
+ "T9. Version & Code Diff Analysis": 0.8004985540165189,
+ "T10. Rule Induction & In-Context Learning": 0.605046296296296,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5361111111111112
+ },
+ "average_language_metric": {
+ "Chinese": 0.6759423962737819,
+ "English": 0.6721702498251175
+ },
+ "BoN-1": {
+ "overall_metric": 0.6759983703704746,
+ "token_length": {
+ "8k": 0.7102710318952188,
+ "16k": 0.6883832260010031,
+ "32k": 0.6744900827350674,
+ "64k": 0.7119047863552107,
+ "128k": 0.6560068019942544,
+ "256k": 0.6149342932420981
+ },
+ "contextual_requirement": {
+ "Full": 0.6308759268961498,
+ "Partial": 0.733426934792345
+ },
+ "difficulty": {
+ "Easy": 0.8006604241201102,
+ "Moderate": 0.7377405126394252,
+ "Hard": 0.7224536765391667,
+ "Extreme": 0.46837070558456173
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8740026641040577,
+ "T2. Sequencing & Structure Reconstruction": 0.8621174196174196,
+ "T3. Evidence-Grounded QA": 0.675,
+ "T4. Summarization & Synthesis": 0.5455538913362303,
+ "T5. Attribution & Citation Alignment": 0.8061189254081019,
+ "T6. Aggregation & Clustering": 0.6554340790288137,
+ "T7. Consistency & Compliance Checking": 0.5169546620879129,
+ "T8. Structured & Numeric Reasoning": 0.6708333333333333,
+ "T9. Version & Code Diff Analysis": 0.7797147286011522,
+ "T10. Rule Induction & In-Context Learning": 0.5926388888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.55
+ },
+ "language": {
+ "Chinese": 0.6768990117658723,
+ "English": 0.6750977289750792
+ }
+ },
+ "pass@1": 0.4493333333333333,
+ "BoN-2": {
+ "overall_metric": 0.7479747877310018,
+ "token_length": {
+ "8k": 0.7719367863380663,
+ "16k": 0.7848557527755085,
+ "32k": 0.7435010923393187,
+ "64k": 0.7892992664474747,
+ "128k": 0.7226763130932595,
+ "256k": 0.6755795153923868
+ },
+ "contextual_requirement": {
+ "Full": 0.720714483291413,
+ "Partial": 0.782669720654116
+ },
+ "difficulty": {
+ "Easy": 0.8637385553396211,
+ "Moderate": 0.8121373016562715,
+ "Hard": 0.8141424576405533,
+ "Extreme": 0.535557607922782
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9027833860231022,
+ "T2. Sequencing & Structure Reconstruction": 0.9054959854959853,
+ "T3. Evidence-Grounded QA": 0.7666666666666667,
+ "T4. Summarization & Synthesis": 0.5582670711283242,
+ "T5. Attribution & Citation Alignment": 0.8550801223307993,
+ "T6. Aggregation & Clustering": 0.7273381143811253,
+ "T7. Consistency & Compliance Checking": 0.6357469182183698,
+ "T8. Structured & Numeric Reasoning": 0.7416666666666667,
+ "T9. Version & Code Diff Analysis": 0.8585696216489627,
+ "T10. Rule Induction & In-Context Learning": 0.712361111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6333333333333333
+ },
+ "language": {
+ "Chinese": 0.7535034625567544,
+ "English": 0.74244611290525
+ }
+ },
+ "pass@2": 0.5313333333333333,
+ "BoN-3": {
+ "overall_metric": 0.7790736538060705,
+ "token_length": {
+ "8k": 0.800474143876513,
+ "16k": 0.8203674165938436,
+ "32k": 0.767107753488941,
+ "64k": 0.8124208427670737,
+ "128k": 0.743086283346713,
+ "256k": 0.7309854827633406
+ },
+ "contextual_requirement": {
+ "Full": 0.7557801236733843,
+ "Partial": 0.808719964884035
+ },
+ "difficulty": {
+ "Easy": 0.8954381880917556,
+ "Moderate": 0.8399060564631339,
+ "Hard": 0.848259677254879,
+ "Extreme": 0.5662031295553962
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9242311389541524,
+ "T2. Sequencing & Structure Reconstruction": 0.9315938690938689,
+ "T3. Evidence-Grounded QA": 0.7916666666666666,
+ "T4. Summarization & Synthesis": 0.5695550592103452,
+ "T5. Attribution & Citation Alignment": 0.8809133228802023,
+ "T6. Aggregation & Clustering": 0.7566450728547504,
+ "T7. Consistency & Compliance Checking": 0.662115306165705,
+ "T8. Structured & Numeric Reasoning": 0.7824074074074074,
+ "T9. Version & Code Diff Analysis": 0.8867644916844079,
+ "T10. Rule Induction & In-Context Learning": 0.7519444444444443,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7
+ },
+ "language": {
+ "Chinese": 0.7890668786410705,
+ "English": 0.7690804289710705
+ }
+ },
+ "pass@3": 0.5786666666666667
+}
\ No newline at end of file
diff --git a/results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json b/results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..44824d02ab11c2aeb3b9632106705ebdf4b2a670
--- /dev/null
+++ b/results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.7342184707317124,
+ "inference_iteration_1_overall_metric": 0.7402405885346022,
+ "inference_iteration_2_overall_metric": 0.7288378446496467,
+ "inference_iteration_3_overall_metric": 0.7335769790108894,
+ "average_token_length_metric": {
+ "8k": 0.7449778241967657,
+ "16k": 0.7478649041506191,
+ "32k": 0.7530566835243759,
+ "64k": 0.7417918268320294,
+ "128k": 0.6999601003776742,
+ "256k": 0.7176594853088111
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.7006912685258201,
+ "Partial": 0.7768894553573948
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8440057387459964,
+ "Moderate": 0.819848501651939,
+ "Hard": 0.8102915033262061,
+ "Extreme": 0.5077419967802616
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.900910721502263,
+ "T2. Sequencing & Structure Reconstruction": 0.9242053162886497,
+ "T3. Evidence-Grounded QA": 0.6500000000000001,
+ "T4. Summarization & Synthesis": 0.5430214244860422,
+ "T5. Attribution & Citation Alignment": 0.8428063760922413,
+ "T6. Aggregation & Clustering": 0.7039837824498163,
+ "T7. Consistency & Compliance Checking": 0.6274987753728497,
+ "T8. Structured & Numeric Reasoning": 0.7824074074074073,
+ "T9. Version & Code Diff Analysis": 0.873498394228399,
+ "T10. Rule Induction & In-Context Learning": 0.683564814814815,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5888888888888888
+ },
+ "average_language_metric": {
+ "Chinese": 0.7449054000919034,
+ "English": 0.7235315413715225
+ },
+ "BoN-1": {
+ "overall_metric": 0.7402405885346022,
+ "token_length": {
+ "8k": 0.7531611316917413,
+ "16k": 0.7524897292361332,
+ "32k": 0.759794274989024,
+ "64k": 0.7435484076033682,
+ "128k": 0.6968353298720406,
+ "256k": 0.7356146578153082
+ },
+ "contextual_requirement": {
+ "Full": 0.6997313831727073,
+ "Partial": 0.7917977589951957
+ },
+ "difficulty": {
+ "Easy": 0.8496811955485003,
+ "Moderate": 0.8519813060901164,
+ "Hard": 0.7987384600115967,
+ "Extreme": 0.5085375776002995
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8998469455079252,
+ "T2. Sequencing & Structure Reconstruction": 0.9226606264106262,
+ "T3. Evidence-Grounded QA": 0.7083333333333334,
+ "T4. Summarization & Synthesis": 0.5451740468187636,
+ "T5. Attribution & Citation Alignment": 0.8298635734563725,
+ "T6. Aggregation & Clustering": 0.6845419570109973,
+ "T7. Consistency & Compliance Checking": 0.6246870318062281,
+ "T8. Structured & Numeric Reasoning": 0.7995370370370369,
+ "T9. Version & Code Diff Analysis": 0.8931464590407817,
+ "T10. Rule Induction & In-Context Learning": 0.6825,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6083333333333333
+ },
+ "language": {
+ "Chinese": 0.7482902342281601,
+ "English": 0.7321909428410444
+ }
+ },
+ "pass@1": 0.5366666666666666,
+ "BoN-2": {
+ "overall_metric": 0.7920257143166666,
+ "token_length": {
+ "8k": 0.8033887159307824,
+ "16k": 0.7898793857734576,
+ "32k": 0.8117863707215878,
+ "64k": 0.7964277126734358,
+ "128k": 0.7607965077861976,
+ "256k": 0.7898755930145346
+ },
+ "contextual_requirement": {
+ "Full": 0.7559933032110803,
+ "Partial": 0.8378851466328657
+ },
+ "difficulty": {
+ "Easy": 0.8992430876416225,
+ "Moderate": 0.8976673772472249,
+ "Hard": 0.8677753697095277,
+ "Extreme": 0.5554328394573533
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9332256652370721,
+ "T2. Sequencing & Structure Reconstruction": 0.9412599437599435,
+ "T3. Evidence-Grounded QA": 0.775,
+ "T4. Summarization & Synthesis": 0.5583020906699263,
+ "T5. Attribution & Citation Alignment": 0.8863658130468474,
+ "T6. Aggregation & Clustering": 0.7465324819181888,
+ "T7. Consistency & Compliance Checking": 0.7073416442539386,
+ "T8. Structured & Numeric Reasoning": 0.8560185185185186,
+ "T9. Version & Code Diff Analysis": 0.9071622825418993,
+ "T10. Rule Induction & In-Context Learning": 0.7341666666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7
+ },
+ "language": {
+ "Chinese": 0.795156398417639,
+ "English": 0.7888950302156931
+ }
+ },
+ "pass@2": 0.6133333333333333,
+ "BoN-3": {
+ "overall_metric": 0.8133867657039734,
+ "token_length": {
+ "8k": 0.8126699858808082,
+ "16k": 0.8091582494700531,
+ "32k": 0.8359194281957039,
+ "64k": 0.816367429901851,
+ "128k": 0.7927970026749335,
+ "256k": 0.8134084981004824
+ },
+ "contextual_requirement": {
+ "Full": 0.7838454845628277,
+ "Partial": 0.85098475988361
+ },
+ "difficulty": {
+ "Easy": 0.9143269591461097,
+ "Moderate": 0.9126631081599108,
+ "Hard": 0.9014414032827288,
+ "Extreme": 0.5797689782741141
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9453427087326437,
+ "T2. Sequencing & Structure Reconstruction": 0.9562455137455134,
+ "T3. Evidence-Grounded QA": 0.7833333333333333,
+ "T4. Summarization & Synthesis": 0.5633383564725131,
+ "T5. Attribution & Citation Alignment": 0.9113749401192192,
+ "T6. Aggregation & Clustering": 0.7887683935896461,
+ "T7. Consistency & Compliance Checking": 0.7334094152440016,
+ "T8. Structured & Numeric Reasoning": 0.8671296296296297,
+ "T9. Version & Code Diff Analysis": 0.922905227868178,
+ "T10. Rule Induction & In-Context Learning": 0.7925,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334
+ },
+ "language": {
+ "Chinese": 0.8203398785387466,
+ "English": 0.8064336528691968
+ }
+ },
+ "pass@3": 0.6466666666666666
+}
\ No newline at end of file
diff --git a/results/Gemma-3-12B-It/nonthinking_context-120000_bon-3_summary.json b/results/Gemma-3-12B-It/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c4c1db11176d1956439c3f5e1741429cf59baad
--- /dev/null
+++ b/results/Gemma-3-12B-It/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3215537579100566,
+ "inference_iteration_1_overall_metric": 0.32433034575226716,
+ "inference_iteration_2_overall_metric": 0.31844270123543494,
+ "inference_iteration_3_overall_metric": 0.3218882267424678,
+ "average_token_length_metric": {
+ "8k": 0.3884654162699951,
+ "16k": 0.3447777819230472,
+ "32k": 0.34075916239810233,
+ "64k": 0.2830133627729955,
+ "128k": 0.2903427416083741,
+ "256k": 0.2819640824878266
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.2967079738599621,
+ "Partial": 0.3531756648829045
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.43662085782547294,
+ "Moderate": 0.23390254455699586,
+ "Hard": 0.30432253509954527,
+ "Extreme": 0.26439167130106106
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.5521425232273246,
+ "T2. Sequencing & Structure Reconstruction": 0.6018784456284454,
+ "T3. Evidence-Grounded QA": 0.39722222222222225,
+ "T4. Summarization & Synthesis": 0.4889337252719512,
+ "T5. Attribution & Citation Alignment": 0.17289417311274363,
+ "T6. Aggregation & Clustering": 0.31608511775757386,
+ "T7. Consistency & Compliance Checking": 0.18810666022578687,
+ "T8. Structured & Numeric Reasoning": 0.06064814814814815,
+ "T9. Version & Code Diff Analysis": 0.34441506928983295,
+ "T10. Rule Induction & In-Context Learning": 0.39800925925925923,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666667
+ },
+ "average_language_metric": {
+ "Chinese": 0.31275985343364654,
+ "English": 0.3303476623864672
+ },
+ "BoN-1": {
+ "overall_metric": 0.32433034575226716,
+ "token_length": {
+ "8k": 0.3906271090949922,
+ "16k": 0.3561137033219127,
+ "32k": 0.34838703683305916,
+ "64k": 0.2733930672728249,
+ "128k": 0.295394704679806,
+ "256k": 0.2820664533110089
+ },
+ "contextual_requirement": {
+ "Full": 0.299184653750444,
+ "Partial": 0.35633395375458804
+ },
+ "difficulty": {
+ "Easy": 0.4375571061543292,
+ "Moderate": 0.2456864337252399,
+ "Hard": 0.3046160927997635,
+ "Extreme": 0.26489900749156403
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5440288685497278,
+ "T2. Sequencing & Structure Reconstruction": 0.595296185296185,
+ "T3. Evidence-Grounded QA": 0.4166666666666667,
+ "T4. Summarization & Synthesis": 0.4882012721729454,
+ "T5. Attribution & Citation Alignment": 0.16495676080926253,
+ "T6. Aggregation & Clustering": 0.3232791449049618,
+ "T7. Consistency & Compliance Checking": 0.1944817769562575,
+ "T8. Structured & Numeric Reasoning": 0.0699074074074074,
+ "T9. Version & Code Diff Analysis": 0.345977074505613,
+ "T10. Rule Induction & In-Context Learning": 0.4008333333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666667
+ },
+ "language": {
+ "Chinese": 0.3144798192828198,
+ "English": 0.334180872221715
+ }
+ },
+ "pass@1": 0.11133333333333334,
+ "BoN-2": {
+ "overall_metric": 0.3467523917671754,
+ "token_length": {
+ "8k": 0.4142271322547509,
+ "16k": 0.377736900046965,
+ "32k": 0.37302728004656277,
+ "64k": 0.3062912069169447,
+ "128k": 0.3073157553107305,
+ "256k": 0.3019160760270996
+ },
+ "contextual_requirement": {
+ "Full": 0.32135117390700496,
+ "Partial": 0.3790812144983023
+ },
+ "difficulty": {
+ "Easy": 0.46615306988638716,
+ "Moderate": 0.26112276416188485,
+ "Hard": 0.32737280433310617,
+ "Extreme": 0.28492633788743743
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5650465241334548,
+ "T2. Sequencing & Structure Reconstruction": 0.6249731287231288,
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
+ "T4. Summarization & Synthesis": 0.5032712591907067,
+ "T5. Attribution & Citation Alignment": 0.20597827766959523,
+ "T6. Aggregation & Clustering": 0.346419862594183,
+ "T7. Consistency & Compliance Checking": 0.21497503949007407,
+ "T8. Structured & Numeric Reasoning": 0.07962962962962963,
+ "T9. Version & Code Diff Analysis": 0.3894322431353122,
+ "T10. Rule Induction & In-Context Learning": 0.4174999999999999,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334
+ },
+ "language": {
+ "Chinese": 0.3387201257962417,
+ "English": 0.3547846577381098
+ }
+ },
+ "pass@2": 0.122,
+ "BoN-3": {
+ "overall_metric": 0.36208312298536466,
+ "token_length": {
+ "8k": 0.4291254746141897,
+ "16k": 0.3819529993839447,
+ "32k": 0.3886685107360013,
+ "64k": 0.3331907556566436,
+ "128k": 0.3232918491379485,
+ "256k": 0.3162691483834608
+ },
+ "contextual_requirement": {
+ "Full": 0.33738791539283736,
+ "Partial": 0.39351338719403606
+ },
+ "difficulty": {
+ "Easy": 0.4781807777959498,
+ "Moderate": 0.2766679371680117,
+ "Hard": 0.3532458860700918,
+ "Extreme": 0.29681012423769865
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5905198790888515,
+ "T2. Sequencing & Structure Reconstruction": 0.6396154771154773,
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
+ "T4. Summarization & Synthesis": 0.5090709574388708,
+ "T5. Attribution & Citation Alignment": 0.21374662530636385,
+ "T6. Aggregation & Clustering": 0.3778424215167421,
+ "T7. Consistency & Compliance Checking": 0.2211746842442865,
+ "T8. Structured & Numeric Reasoning": 0.09351851851851851,
+ "T9. Version & Code Diff Analysis": 0.4124771063926183,
+ "T10. Rule Induction & In-Context Learning": 0.43847222222222215,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
+ },
+ "language": {
+ "Chinese": 0.35324378584530425,
+ "English": 0.3709224601254252
+ }
+ },
+ "pass@3": 0.12733333333333333
+}
\ No newline at end of file
diff --git a/results/Gemma-3-12B-It/thinking_context-120000_bon-3_summary.json b/results/Gemma-3-12B-It/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..d0715104a7ead06c8a3b275d80b3f8fd378066ac
--- /dev/null
+++ b/results/Gemma-3-12B-It/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3191893284499226,
+ "inference_iteration_1_overall_metric": 0.3198032509438655,
+ "inference_iteration_2_overall_metric": 0.3185140430839788,
+ "inference_iteration_3_overall_metric": 0.3192506913219244,
+ "average_token_length_metric": {
+ "8k": 0.39039471003518295,
+ "16k": 0.36666548253930176,
+ "32k": 0.3454075697967419,
+ "64k": 0.2965559300528977,
+ "128k": 0.2634390883568726,
+ "256k": 0.25267318991854026
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.29023833980013936,
+ "Partial": 0.3560360412769198
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.45478741131761163,
+ "Moderate": 0.2260826308323087,
+ "Hard": 0.28024203424188526,
+ "Extreme": 0.2573832550303484
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.5500567032378852,
+ "T2. Sequencing & Structure Reconstruction": 0.5750085972908512,
+ "T3. Evidence-Grounded QA": 0.275,
+ "T4. Summarization & Synthesis": 0.4560070506311603,
+ "T5. Attribution & Citation Alignment": 0.1618052577444313,
+ "T6. Aggregation & Clustering": 0.3308854760173438,
+ "T7. Consistency & Compliance Checking": 0.16988798875914354,
+ "T8. Structured & Numeric Reasoning": 0.254783950617284,
+ "T9. Version & Code Diff Analysis": 0.3307362069623843,
+ "T10. Rule Induction & In-Context Learning": 0.3245833333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.18333333333333338
+ },
+ "average_language_metric": {
+ "Chinese": 0.29411514963064544,
+ "English": 0.3442635072692001
+ },
+ "BoN-1": {
+ "overall_metric": 0.3198032509438655,
+ "token_length": {
+ "8k": 0.3780018581730685,
+ "16k": 0.3712141189933461,
+ "32k": 0.32728165209503113,
+ "64k": 0.30571589667403015,
+ "128k": 0.2747567763160667,
+ "256k": 0.2618492034116515
+ },
+ "contextual_requirement": {
+ "Full": 0.29216027329955657,
+ "Partial": 0.3549852224911681
+ },
+ "difficulty": {
+ "Easy": 0.45511408988195684,
+ "Moderate": 0.23953366795757702,
+ "Hard": 0.27643841537644914,
+ "Extreme": 0.25278171138445865
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5667485461728832,
+ "T2. Sequencing & Structure Reconstruction": 0.562965737965738,
+ "T3. Evidence-Grounded QA": 0.25,
+ "T4. Summarization & Synthesis": 0.4542471570812729,
+ "T5. Attribution & Citation Alignment": 0.17077813789682644,
+ "T6. Aggregation & Clustering": 0.331058456491463,
+ "T7. Consistency & Compliance Checking": 0.17425550039116783,
+ "T8. Structured & Numeric Reasoning": 0.25555555555555554,
+ "T9. Version & Code Diff Analysis": 0.34802456680209815,
+ "T10. Rule Induction & In-Context Learning": 0.32847222222222217,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.175
+ },
+ "language": {
+ "Chinese": 0.29038552736401346,
+ "English": 0.34922097452371764
+ }
+ },
+ "pass@1": 0.11466666666666667,
+ "BoN-2": {
+ "overall_metric": 0.38355638163031974,
+ "token_length": {
+ "8k": 0.4753389925367892,
+ "16k": 0.42098100793684207,
+ "32k": 0.4193716441650546,
+ "64k": 0.35186094157335357,
+ "128k": 0.3173777879671563,
+ "256k": 0.3164079156027232
+ },
+ "contextual_requirement": {
+ "Full": 0.3475360182865405,
+ "Partial": 0.4294004804314932
+ },
+ "difficulty": {
+ "Easy": 0.5523314770092703,
+ "Moderate": 0.2771664695259602,
+ "Hard": 0.3369385202392175,
+ "Extreme": 0.29916922382926153
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6227581461675343,
+ "T2. Sequencing & Structure Reconstruction": 0.6340152902652901,
+ "T3. Evidence-Grounded QA": 0.35,
+ "T4. Summarization & Synthesis": 0.47044783692403785,
+ "T5. Attribution & Citation Alignment": 0.21419149989714506,
+ "T6. Aggregation & Clustering": 0.40641717668678456,
+ "T7. Consistency & Compliance Checking": 0.22278173964383363,
+ "T8. Structured & Numeric Reasoning": 0.31851851851851853,
+ "T9. Version & Code Diff Analysis": 0.41479917818461753,
+ "T10. Rule Induction & In-Context Learning": 0.4166666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
+ },
+ "language": {
+ "Chinese": 0.35221914805982085,
+ "English": 0.41489361520081863
+ }
+ },
+ "pass@2": 0.154,
+ "BoN-3": {
+ "overall_metric": 0.41783270358538355,
+ "token_length": {
+ "8k": 0.5097501627706587,
+ "16k": 0.4551451198013833,
+ "32k": 0.4565584105132001,
+ "64k": 0.39691938688026035,
+ "128k": 0.34453029473355373,
+ "256k": 0.3440928468132463
+ },
+ "contextual_requirement": {
+ "Full": 0.37691399280812576,
+ "Partial": 0.46991106275643846
+ },
+ "difficulty": {
+ "Easy": 0.6081488352407031,
+ "Moderate": 0.3030799752726802,
+ "Hard": 0.35694868313688616,
+ "Extreme": 0.32471144207202707
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6492859989190713,
+ "T2. Sequencing & Structure Reconstruction": 0.673747440830774,
+ "T3. Evidence-Grounded QA": 0.4083333333333333,
+ "T4. Summarization & Synthesis": 0.47953981981487215,
+ "T5. Attribution & Citation Alignment": 0.23614408266148504,
+ "T6. Aggregation & Clustering": 0.44385472207531035,
+ "T7. Consistency & Compliance Checking": 0.24389274305473457,
+ "T8. Structured & Numeric Reasoning": 0.3634259259259259,
+ "T9. Version & Code Diff Analysis": 0.4490980326738026,
+ "T10. Rule Induction & In-Context Learning": 0.4583333333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667
+ },
+ "language": {
+ "Chinese": 0.3946655531503397,
+ "English": 0.440999854020427
+ }
+ },
+ "pass@3": 0.174
+}
\ No newline at end of file
diff --git a/results/Gemma-3-27B-It/nonthinking_context-120000_bon-3_summary.json b/results/Gemma-3-27B-It/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..5399cedea6ec59a9ac2f1b923301c979c57d75f9
--- /dev/null
+++ b/results/Gemma-3-27B-It/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3613898319544999,
+ "inference_iteration_1_overall_metric": 0.35966281601422384,
+ "inference_iteration_2_overall_metric": 0.3610482900444428,
+ "inference_iteration_3_overall_metric": 0.3634583898048339,
+ "average_token_length_metric": {
+ "8k": 0.43644157643949566,
+ "16k": 0.3804621509069283,
+ "32k": 0.39249485549033103,
+ "64k": 0.3508346036478247,
+ "128k": 0.30224089028156714,
+ "256k": 0.30586491496085444
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.33698292855454914,
+ "Partial": 0.3924531635544373
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.49956833239118315,
+ "Moderate": 0.2504305317277634,
+ "Hard": 0.3319623187177785,
+ "Extreme": 0.30223017713736006
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6186817382099091,
+ "T2. Sequencing & Structure Reconstruction": 0.6828640898869156,
+ "T3. Evidence-Grounded QA": 0.475,
+ "T4. Summarization & Synthesis": 0.4863291170840516,
+ "T5. Attribution & Citation Alignment": 0.20718550263597674,
+ "T6. Aggregation & Clustering": 0.35878702296933646,
+ "T7. Consistency & Compliance Checking": 0.19534470405785112,
+ "T8. Structured & Numeric Reasoning": 0.11558641975308641,
+ "T9. Version & Code Diff Analysis": 0.3981518981106548,
+ "T10. Rule Induction & In-Context Learning": 0.37236111111111114,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.27222222222222225
+ },
+ "average_language_metric": {
+ "Chinese": 0.3481327108034932,
+ "English": 0.3746469531055068
+ },
+ "BoN-1": {
+ "overall_metric": 0.35966281601422384,
+ "token_length": {
+ "8k": 0.43768168643426314,
+ "16k": 0.37280230082410293,
+ "32k": 0.39131018375907956,
+ "64k": 0.3439379322622389,
+ "128k": 0.30914303161909973,
+ "256k": 0.30310176118655724
+ },
+ "contextual_requirement": {
+ "Full": 0.33431999039428895,
+ "Partial": 0.39191732134868573
+ },
+ "difficulty": {
+ "Easy": 0.49522975795181967,
+ "Moderate": 0.24738817424396956,
+ "Hard": 0.33414580218141465,
+ "Extreme": 0.30165945795823235
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6149553491444276,
+ "T2. Sequencing & Structure Reconstruction": 0.6777184081350744,
+ "T3. Evidence-Grounded QA": 0.4666666666666667,
+ "T4. Summarization & Synthesis": 0.48496676951441003,
+ "T5. Attribution & Citation Alignment": 0.2117639703302691,
+ "T6. Aggregation & Clustering": 0.3687808030960292,
+ "T7. Consistency & Compliance Checking": 0.19168084686723558,
+ "T8. Structured & Numeric Reasoning": 0.1189814814814815,
+ "T9. Version & Code Diff Analysis": 0.3861048947753823,
+ "T10. Rule Induction & In-Context Learning": 0.35944444444444446,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.275
+ },
+ "language": {
+ "Chinese": 0.34703709850451614,
+ "English": 0.3722885335239309
+ }
+ },
+ "pass@1": 0.13066666666666665,
+ "BoN-2": {
+ "overall_metric": 0.38518154623771866,
+ "token_length": {
+ "8k": 0.45699608191277485,
+ "16k": 0.3970070268223878,
+ "32k": 0.4155514974235712,
+ "64k": 0.3795777424428599,
+ "128k": 0.3342775351649673,
+ "256k": 0.32767939365975096
+ },
+ "contextual_requirement": {
+ "Full": 0.36193530889736725,
+ "Partial": 0.4147676664890755
+ },
+ "difficulty": {
+ "Easy": 0.5260736620896884,
+ "Moderate": 0.2746052566038148,
+ "Hard": 0.3586645680646715,
+ "Extreme": 0.32088598840944504
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6391494300203041,
+ "T2. Sequencing & Structure Reconstruction": 0.6911707890874557,
+ "T3. Evidence-Grounded QA": 0.49166666666666664,
+ "T4. Summarization & Synthesis": 0.4979131525726192,
+ "T5. Attribution & Citation Alignment": 0.24413521215425465,
+ "T6. Aggregation & Clustering": 0.3897620242575365,
+ "T7. Consistency & Compliance Checking": 0.2135542070257249,
+ "T8. Structured & Numeric Reasoning": 0.14675925925925926,
+ "T9. Version & Code Diff Analysis": 0.4478430638786239,
+ "T10. Rule Induction & In-Context Learning": 0.39444444444444443,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2833333333333333
+ },
+ "language": {
+ "Chinese": 0.37305464561658386,
+ "English": 0.3973084468588535
+ }
+ },
+ "pass@2": 0.144,
+ "BoN-3": {
+ "overall_metric": 0.4011866600471999,
+ "token_length": {
+ "8k": 0.4633867051191647,
+ "16k": 0.42281662150014376,
+ "32k": 0.44096928608538427,
+ "64k": 0.39220811203028316,
+ "128k": 0.34412857203690767,
+ "256k": 0.34361066351131797
+ },
+ "contextual_requirement": {
+ "Full": 0.37845160215956763,
+ "Partial": 0.4301221882678235
+ },
+ "difficulty": {
+ "Easy": 0.5455848419176232,
+ "Moderate": 0.2912498230785689,
+ "Hard": 0.36825828865136306,
+ "Extreme": 0.33684148176489936
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.654875485368947,
+ "T2. Sequencing & Structure Reconstruction": 0.7061728240894906,
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
+ "T4. Summarization & Synthesis": 0.5049097313487289,
+ "T5. Attribution & Citation Alignment": 0.2565335955530918,
+ "T6. Aggregation & Clustering": 0.4143461915069394,
+ "T7. Consistency & Compliance Checking": 0.22258770429716687,
+ "T8. Structured & Numeric Reasoning": 0.175462962962963,
+ "T9. Version & Code Diff Analysis": 0.4681907705235851,
+ "T10. Rule Induction & In-Context Learning": 0.40555555555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2833333333333333
+ },
+ "language": {
+ "Chinese": 0.3916620761944417,
+ "English": 0.410711243899959
+ }
+ },
+ "pass@3": 0.15533333333333332
+}
\ No newline at end of file
diff --git a/results/Gemma-3-27B-It/thinking_context-120000_bon-3_summary.json b/results/Gemma-3-27B-It/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..01fcada94b951d98f1f3bac4624de962402c1239
--- /dev/null
+++ b/results/Gemma-3-27B-It/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.37338415281793874,
+ "inference_iteration_1_overall_metric": 0.3756110685938797,
+ "inference_iteration_2_overall_metric": 0.3716321861397887,
+ "inference_iteration_3_overall_metric": 0.3729092037201496,
+ "average_token_length_metric": {
+ "8k": 0.44812577930836095,
+ "16k": 0.4266217475899872,
+ "32k": 0.4074453646105579,
+ "64k": 0.35662526806956907,
+ "128k": 0.2952141304786102,
+ "256k": 0.3062726268505501
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3443713298159222,
+ "Partial": 0.4103095639114165
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5780767692142667,
+ "Moderate": 0.24533089723723267,
+ "Hard": 0.3056384367420397,
+ "Extreme": 0.27775702033096106
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6037064651554387,
+ "T2. Sequencing & Structure Reconstruction": 0.6278458897510606,
+ "T3. Evidence-Grounded QA": 0.3361111111111111,
+ "T4. Summarization & Synthesis": 0.45719209902963875,
+ "T5. Attribution & Citation Alignment": 0.23234121031762375,
+ "T6. Aggregation & Clustering": 0.38387242742350736,
+ "T7. Consistency & Compliance Checking": 0.18133975282134737,
+ "T8. Structured & Numeric Reasoning": 0.3114197530864198,
+ "T9. Version & Code Diff Analysis": 0.44895353115875314,
+ "T10. Rule Induction & In-Context Learning": 0.35731481481481475,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.28888888888888886
+ },
+ "average_language_metric": {
+ "Chinese": 0.3378195347928154,
+ "English": 0.40894877084306375
+ },
+ "BoN-1": {
+ "overall_metric": 0.3756110685938797,
+ "token_length": {
+ "8k": 0.4545820286350837,
+ "16k": 0.4352872653228386,
+ "32k": 0.3950079365533934,
+ "64k": 0.35311614333477187,
+ "128k": 0.3114655730243775,
+ "256k": 0.3042074646928149
+ },
+ "contextual_requirement": {
+ "Full": 0.35375594987355935,
+ "Partial": 0.4034266742379251
+ },
+ "difficulty": {
+ "Easy": 0.5844518111059833,
+ "Moderate": 0.23662855463799365,
+ "Hard": 0.3096811226362121,
+ "Extreme": 0.281425757285206
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6068670183757786,
+ "T2. Sequencing & Structure Reconstruction": 0.6163485749654042,
+ "T3. Evidence-Grounded QA": 0.36666666666666664,
+ "T4. Summarization & Synthesis": 0.4560619845050425,
+ "T5. Attribution & Citation Alignment": 0.22005868934189987,
+ "T6. Aggregation & Clustering": 0.39305351710005704,
+ "T7. Consistency & Compliance Checking": 0.1715251657008684,
+ "T8. Structured & Numeric Reasoning": 0.33888888888888885,
+ "T9. Version & Code Diff Analysis": 0.44004517714509755,
+ "T10. Rule Induction & In-Context Learning": 0.35888888888888887,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.275
+ },
+ "language": {
+ "Chinese": 0.34160224855312066,
+ "English": 0.40961988863463966
+ }
+ },
+ "pass@1": 0.15533333333333332,
+ "BoN-2": {
+ "overall_metric": 0.4363206399942735,
+ "token_length": {
+ "8k": 0.5234602086730173,
+ "16k": 0.5018004637617909,
+ "32k": 0.46794475339715547,
+ "64k": 0.4185517955545234,
+ "128k": 0.3516627837551589,
+ "256k": 0.354503834823997
+ },
+ "contextual_requirement": {
+ "Full": 0.4063870314951086,
+ "Partial": 0.4744179599023025
+ },
+ "difficulty": {
+ "Easy": 0.6923421245761895,
+ "Moderate": 0.2843297701377906,
+ "Hard": 0.35026505438423683,
+ "Extreme": 0.31221398104277637
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6567331296259676,
+ "T2. Sequencing & Structure Reconstruction": 0.6726985976985973,
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
+ "T4. Summarization & Synthesis": 0.4711965663860271,
+ "T5. Attribution & Citation Alignment": 0.27489752350046465,
+ "T6. Aggregation & Clustering": 0.44484952313131765,
+ "T7. Consistency & Compliance Checking": 0.22198381203531814,
+ "T8. Structured & Numeric Reasoning": 0.40925925925925927,
+ "T9. Version & Code Diff Analysis": 0.5240655133007459,
+ "T10. Rule Induction & In-Context Learning": 0.4486111111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334
+ },
+ "language": {
+ "Chinese": 0.3992628958468014,
+ "English": 0.47337838414174643
+ }
+ },
+ "pass@2": 0.19933333333333333,
+ "BoN-3": {
+ "overall_metric": 0.4678951184844386,
+ "token_length": {
+ "8k": 0.5595081038285018,
+ "16k": 0.5288508385707865,
+ "32k": 0.5128049001652387,
+ "64k": 0.45281870495664023,
+ "128k": 0.37498216355710723,
+ "256k": 0.3784059998283628
+ },
+ "contextual_requirement": {
+ "Full": 0.4378657173135933,
+ "Partial": 0.5061143563382442
+ },
+ "difficulty": {
+ "Easy": 0.7390417273273124,
+ "Moderate": 0.31763207699520324,
+ "Hard": 0.37625789112044933,
+ "Extreme": 0.3297508281124345
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6917909626535441,
+ "T2. Sequencing & Structure Reconstruction": 0.7037670200170199,
+ "T3. Evidence-Grounded QA": 0.475,
+ "T4. Summarization & Synthesis": 0.4763487195551507,
+ "T5. Attribution & Citation Alignment": 0.30687631368745927,
+ "T6. Aggregation & Clustering": 0.4772314892310187,
+ "T7. Consistency & Compliance Checking": 0.24817243122652502,
+ "T8. Structured & Numeric Reasoning": 0.4217592592592592,
+ "T9. Version & Code Diff Analysis": 0.5559667511226697,
+ "T10. Rule Induction & In-Context Learning": 0.5098611111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
+ },
+ "language": {
+ "Chinese": 0.4238794242578244,
+ "English": 0.511910812711055
+ }
+ },
+ "pass@3": 0.22333333333333333
+}
\ No newline at end of file
diff --git a/results/Gemma-3-4B-It/nonthinking_context-120000_bon-3_summary.json b/results/Gemma-3-4B-It/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..58b773f2e218e0914f011b011369ec92b670647a
--- /dev/null
+++ b/results/Gemma-3-4B-It/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.2175748220994214,
+ "inference_iteration_1_overall_metric": 0.21852109706784154,
+ "inference_iteration_2_overall_metric": 0.2163322668515703,
+ "inference_iteration_3_overall_metric": 0.21787110237885274,
+ "average_token_length_metric": {
+ "8k": 0.24656523663590132,
+ "16k": 0.2205604877341683,
+ "32k": 0.23963284248634728,
+ "64k": 0.21111513028758372,
+ "128k": 0.19343593120899555,
+ "256k": 0.19413930424353257
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.21444597727329182,
+ "Partial": 0.22155698824176823
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.28179521332096,
+ "Moderate": 0.15821172453341914,
+ "Hard": 0.20704875252188354,
+ "Extreme": 0.19312877831692504
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.4236351950773965,
+ "T2. Sequencing & Structure Reconstruction": 0.45359587924627603,
+ "T3. Evidence-Grounded QA": 0.3194444444444444,
+ "T4. Summarization & Synthesis": 0.443681734303759,
+ "T5. Attribution & Citation Alignment": 0.042786026910229404,
+ "T6. Aggregation & Clustering": 0.19637703429305803,
+ "T7. Consistency & Compliance Checking": 0.09142599749178396,
+ "T8. Structured & Numeric Reasoning": 0.02438271604938272,
+ "T9. Version & Code Diff Analysis": 0.1616430041389551,
+ "T10. Rule Induction & In-Context Learning": 0.22050925925925927,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.18611111111111114
+ },
+ "average_language_metric": {
+ "Chinese": 0.20889372279815283,
+ "English": 0.22625592140069015
+ },
+ "BoN-1": {
+ "overall_metric": 0.21852109706784154,
+ "token_length": {
+ "8k": 0.2476866042969573,
+ "16k": 0.21620176526480195,
+ "32k": 0.24753627633784483,
+ "64k": 0.21130221395252485,
+ "128k": 0.19685176433002144,
+ "256k": 0.19154795822489784
+ },
+ "contextual_requirement": {
+ "Full": 0.21588524494794425,
+ "Partial": 0.22187581794771047
+ },
+ "difficulty": {
+ "Easy": 0.28399652990881596,
+ "Moderate": 0.1580374343562769,
+ "Hard": 0.20857790675358978,
+ "Extreme": 0.19305337410218493
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.4309006915611223,
+ "T2. Sequencing & Structure Reconstruction": 0.4447710584938352,
+ "T3. Evidence-Grounded QA": 0.3333333333333333,
+ "T4. Summarization & Synthesis": 0.441308668504869,
+ "T5. Attribution & Citation Alignment": 0.03940722221903115,
+ "T6. Aggregation & Clustering": 0.20337572553598166,
+ "T7. Consistency & Compliance Checking": 0.08406090392051238,
+ "T8. Structured & Numeric Reasoning": 0.02361111111111111,
+ "T9. Version & Code Diff Analysis": 0.16341557282886326,
+ "T10. Rule Induction & In-Context Learning": 0.22013888888888886,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
+ },
+ "language": {
+ "Chinese": 0.2120533839166827,
+ "English": 0.22498881021900005
+ }
+ },
+ "pass@1": 0.06733333333333333,
+ "BoN-2": {
+ "overall_metric": 0.23271347684478086,
+ "token_length": {
+ "8k": 0.2631313909667352,
+ "16k": 0.2312324535272521,
+ "32k": 0.25978977734748787,
+ "64k": 0.23161875208541116,
+ "128k": 0.20537856042145283,
+ "256k": 0.20512992672034427
+ },
+ "contextual_requirement": {
+ "Full": 0.2292600969188946,
+ "Partial": 0.23710868765954476
+ },
+ "difficulty": {
+ "Easy": 0.2946012095946107,
+ "Moderate": 0.1735404680186527,
+ "Hard": 0.22252519869057716,
+ "Extreme": 0.21047603309909335
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.4536230189628364,
+ "T2. Sequencing & Structure Reconstruction": 0.4791046371928724,
+ "T3. Evidence-Grounded QA": 0.3333333333333333,
+ "T4. Summarization & Synthesis": 0.4555393638488407,
+ "T5. Attribution & Citation Alignment": 0.04458180082240233,
+ "T6. Aggregation & Clustering": 0.22301925498410874,
+ "T7. Consistency & Compliance Checking": 0.10512521699205389,
+ "T8. Structured & Numeric Reasoning": 0.029166666666666667,
+ "T9. Version & Code Diff Analysis": 0.18246404287967255,
+ "T10. Rule Induction & In-Context Learning": 0.23263888888888887,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
+ },
+ "language": {
+ "Chinese": 0.22667394776839053,
+ "English": 0.2387530059211707
+ }
+ },
+ "pass@2": 0.06866666666666667,
+ "BoN-3": {
+ "overall_metric": 0.2401600102211412,
+ "token_length": {
+ "8k": 0.26882845096050667,
+ "16k": 0.24012842113745536,
+ "32k": 0.2657505147595882,
+ "64k": 0.237575165952952,
+ "128k": 0.21797743491880378,
+ "256k": 0.21070007359754095
+ },
+ "contextual_requirement": {
+ "Full": 0.23644784183701267,
+ "Partial": 0.24488458816457748
+ },
+ "difficulty": {
+ "Easy": 0.3019876106242685,
+ "Moderate": 0.17780290261890433,
+ "Hard": 0.22967603723970634,
+ "Extreme": 0.22027403043562677
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.4596998605405389,
+ "T2. Sequencing & Structure Reconstruction": 0.5016641610023963,
+ "T3. Evidence-Grounded QA": 0.3333333333333333,
+ "T4. Summarization & Synthesis": 0.4669275398584054,
+ "T5. Attribution & Citation Alignment": 0.051513017753619265,
+ "T6. Aggregation & Clustering": 0.23601198455578568,
+ "T7. Consistency & Compliance Checking": 0.11322244187918541,
+ "T8. Structured & Numeric Reasoning": 0.03148148148148148,
+ "T9. Version & Code Diff Analysis": 0.19070502006795878,
+ "T10. Rule Induction & In-Context Learning": 0.23541666666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
+ },
+ "language": {
+ "Chinese": 0.23450699509250633,
+ "English": 0.24581302534977606
+ }
+ },
+ "pass@3": 0.07
+}
\ No newline at end of file
diff --git a/results/Gemma-3-4B-It/thinking_context-120000_bon-3_summary.json b/results/Gemma-3-4B-It/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e659459e0da902fd55fd9f70dd35ea2331e62a4
--- /dev/null
+++ b/results/Gemma-3-4B-It/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.2119885030064203,
+ "inference_iteration_1_overall_metric": 0.21437249924782262,
+ "inference_iteration_2_overall_metric": 0.21469684223951344,
+ "inference_iteration_3_overall_metric": 0.20689616753192464,
+ "average_token_length_metric": {
+ "8k": 0.24366425705090342,
+ "16k": 0.2312288563166909,
+ "32k": 0.24934489050979397,
+ "64k": 0.17455165550407764,
+ "128k": 0.18287496802077124,
+ "256k": 0.19026639063628464
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.20450724123760522,
+ "Partial": 0.221510108894003
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.28661876230483024,
+ "Moderate": 0.13848484214026374,
+ "Hard": 0.1987191002737751,
+ "Extreme": 0.18722857209328425
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.4122355140839005,
+ "T2. Sequencing & Structure Reconstruction": 0.4238909558076955,
+ "T3. Evidence-Grounded QA": 0.2361111111111111,
+ "T4. Summarization & Synthesis": 0.4241502347645632,
+ "T5. Attribution & Citation Alignment": 0.04263401505989564,
+ "T6. Aggregation & Clustering": 0.23701628551906095,
+ "T7. Consistency & Compliance Checking": 0.07531749025982754,
+ "T8. Structured & Numeric Reasoning": 0.10648148148148145,
+ "T9. Version & Code Diff Analysis": 0.15631527456623664,
+ "T10. Rule Induction & In-Context Learning": 0.20962962962962953,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.11666666666666664
+ },
+ "average_language_metric": {
+ "Chinese": 0.19120313153416005,
+ "English": 0.23277387447868045
+ },
+ "BoN-1": {
+ "overall_metric": 0.21437249924782262,
+ "token_length": {
+ "8k": 0.260899815427902,
+ "16k": 0.2556087799887465,
+ "32k": 0.23432924859855547,
+ "64k": 0.1786455712537597,
+ "128k": 0.1740181124720787,
+ "256k": 0.18273346774589402
+ },
+ "contextual_requirement": {
+ "Full": 0.2082323257071287,
+ "Partial": 0.22218726557234228
+ },
+ "difficulty": {
+ "Easy": 0.28518994612968396,
+ "Moderate": 0.1458906399979874,
+ "Hard": 0.19668309266803527,
+ "Extreme": 0.19339405931078624
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.44068453349253894,
+ "T2. Sequencing & Structure Reconstruction": 0.42484177261436434,
+ "T3. Evidence-Grounded QA": 0.25,
+ "T4. Summarization & Synthesis": 0.42183521824061226,
+ "T5. Attribution & Citation Alignment": 0.03826565166477447,
+ "T6. Aggregation & Clustering": 0.25040130008880007,
+ "T7. Consistency & Compliance Checking": 0.08179456674127736,
+ "T8. Structured & Numeric Reasoning": 0.08703703703703704,
+ "T9. Version & Code Diff Analysis": 0.16976304211815604,
+ "T10. Rule Induction & In-Context Learning": 0.19708333333333336,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.10833333333333334
+ },
+ "language": {
+ "Chinese": 0.19621205437007752,
+ "English": 0.23253294412556783
+ }
+ },
+ "pass@1": 0.06333333333333334,
+ "BoN-2": {
+ "overall_metric": 0.2636576810412479,
+ "token_length": {
+ "8k": 0.3079135549113116,
+ "16k": 0.28734590159692097,
+ "32k": 0.30413774794877135,
+ "64k": 0.21523254007725492,
+ "128k": 0.22804447155407118,
+ "256k": 0.23927187015915638
+ },
+ "contextual_requirement": {
+ "Full": 0.250251500957686,
+ "Partial": 0.28072009205669
+ },
+ "difficulty": {
+ "Easy": 0.364020302669296,
+ "Moderate": 0.1823512513190994,
+ "Hard": 0.23524288009416136,
+ "Extreme": 0.22578189022008766
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5017988282740397,
+ "T2. Sequencing & Structure Reconstruction": 0.5025411389881287,
+ "T3. Evidence-Grounded QA": 0.275,
+ "T4. Summarization & Synthesis": 0.44235758157965793,
+ "T5. Attribution & Citation Alignment": 0.07123185213097494,
+ "T6. Aggregation & Clustering": 0.3076327414869081,
+ "T7. Consistency & Compliance Checking": 0.11226263632567846,
+ "T8. Structured & Numeric Reasoning": 0.14583333333333334,
+ "T9. Version & Code Diff Analysis": 0.2253096564350266,
+ "T10. Rule Induction & In-Context Learning": 0.26222222222222225,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.16666666666666666
+ },
+ "language": {
+ "Chinese": 0.24369127122121384,
+ "English": 0.28362409086128176
+ }
+ },
+ "pass@2": 0.086,
+ "BoN-3": {
+ "overall_metric": 0.29217739130674947,
+ "token_length": {
+ "8k": 0.3390735712300384,
+ "16k": 0.3084943145981662,
+ "32k": 0.331141630537573,
+ "64k": 0.24932737796255805,
+ "128k": 0.25196083809503234,
+ "256k": 0.27306661541712973
+ },
+ "contextual_requirement": {
+ "Full": 0.2733927975152606,
+ "Partial": 0.3160850561322812
+ },
+ "difficulty": {
+ "Easy": 0.4041733526262008,
+ "Moderate": 0.2194670736422517,
+ "Hard": 0.2580927963130063,
+ "Extreme": 0.23963574676642313
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5332538419210622,
+ "T2. Sequencing & Structure Reconstruction": 0.5533578363062058,
+ "T3. Evidence-Grounded QA": 0.3416666666666667,
+ "T4. Summarization & Synthesis": 0.4493150256833664,
+ "T5. Attribution & Citation Alignment": 0.09079534419446701,
+ "T6. Aggregation & Clustering": 0.3434824981209785,
+ "T7. Consistency & Compliance Checking": 0.12257916371054065,
+ "T8. Structured & Numeric Reasoning": 0.16805555555555557,
+ "T9. Version & Code Diff Analysis": 0.24931951714865616,
+ "T10. Rule Induction & In-Context Learning": 0.30833333333333335,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.175
+ },
+ "language": {
+ "Chinese": 0.27088663804629004,
+ "English": 0.3134681445672092
+ }
+ },
+ "pass@3": 0.102
+}
\ No newline at end of file
diff --git a/results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json b/results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..38091a0c0cac6468902915e449a15a7abc238bb9
--- /dev/null
+++ b/results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 67,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5009443422920304,
+ "inference_iteration_1_overall_metric": 0.5011015308802983,
+ "inference_iteration_2_overall_metric": 0.49751406897312744,
+ "inference_iteration_3_overall_metric": 0.5042174270226657,
+ "average_token_length_metric": {
+ "8k": 0.5193469215810047,
+ "16k": 0.5532046525085649,
+ "32k": 0.5393076869166767,
+ "64k": 0.45954315717941974,
+ "128k": 0.4753071835553842,
+ "256k": 0.4589564520111373
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.45716785858755954,
+ "Partial": 0.5566598670068132
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6491770551060967,
+ "Moderate": 0.4905460752618018,
+ "Hard": 0.43431147544571097,
+ "Extreme": 0.3960536599333797
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7424451818013483,
+ "T2. Sequencing & Structure Reconstruction": 0.7428685203685202,
+ "T3. Evidence-Grounded QA": 0.5472222222222223,
+ "T4. Summarization & Synthesis": 0.5115863734748296,
+ "T5. Attribution & Citation Alignment": 0.5310286936858898,
+ "T6. Aggregation & Clustering": 0.481867796853936,
+ "T7. Consistency & Compliance Checking": 0.36661627375742456,
+ "T8. Structured & Numeric Reasoning": 0.24089506172839517,
+ "T9. Version & Code Diff Analysis": 0.607908662662019,
+ "T10. Rule Induction & In-Context Learning": 0.5085648148148147,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.43611111111111117
+ },
+ "average_language_metric": {
+ "Chinese": 0.502898779221276,
+ "English": 0.4989899053627864
+ },
+ "BoN-1": {
+ "overall_metric": 0.5011015308802983,
+ "token_length": {
+ "8k": 0.5266204875905753,
+ "16k": 0.5431095505457598,
+ "32k": 0.5392474254502099,
+ "64k": 0.46556965866611255,
+ "128k": 0.48161604654304363,
+ "256k": 0.45044601648609367
+ },
+ "contextual_requirement": {
+ "Full": 0.45533929299371073,
+ "Partial": 0.5593443790995934
+ },
+ "difficulty": {
+ "Easy": 0.6458118886638174,
+ "Moderate": 0.4926159658473186,
+ "Hard": 0.42892012991547307,
+ "Extreme": 0.4021536227747844
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.734408014297419,
+ "T2. Sequencing & Structure Reconstruction": 0.7401580826580821,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5077956681433179,
+ "T5. Attribution & Citation Alignment": 0.5196800787679438,
+ "T6. Aggregation & Clustering": 0.4941189417411527,
+ "T7. Consistency & Compliance Checking": 0.3706991980056276,
+ "T8. Structured & Numeric Reasoning": 0.22268518518518515,
+ "T9. Version & Code Diff Analysis": 0.6228334158501364,
+ "T10. Rule Induction & In-Context Learning": 0.5243055555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
+ },
+ "language": {
+ "Chinese": 0.49773496712013804,
+ "English": 0.5044680946404603
+ }
+ },
+ "pass@1": 0.23,
+ "BoN-2": {
+ "overall_metric": 0.5483962332490746,
+ "token_length": {
+ "8k": 0.5725846797858735,
+ "16k": 0.6070506360109902,
+ "32k": 0.5785761030801342,
+ "64k": 0.5092135264066221,
+ "128k": 0.5131800223023555,
+ "256k": 0.5097724319084751
+ },
+ "contextual_requirement": {
+ "Full": 0.49887561173193595,
+ "Partial": 0.6114224788163434
+ },
+ "difficulty": {
+ "Easy": 0.7046240276873015,
+ "Moderate": 0.5462638174512109,
+ "Hard": 0.4707831873116643,
+ "Extreme": 0.4364568187575384
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7623386988134085,
+ "T2. Sequencing & Structure Reconstruction": 0.7700949513449513,
+ "T3. Evidence-Grounded QA": 0.6,
+ "T4. Summarization & Synthesis": 0.5249959487118497,
+ "T5. Attribution & Citation Alignment": 0.5953546052259285,
+ "T6. Aggregation & Clustering": 0.5412045721601162,
+ "T7. Consistency & Compliance Checking": 0.41849900851913713,
+ "T8. Structured & Numeric Reasoning": 0.28935185185185186,
+ "T9. Version & Code Diff Analysis": 0.6584466738317519,
+ "T10. Rule Induction & In-Context Learning": 0.5701388888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
+ },
+ "language": {
+ "Chinese": 0.5438314702987085,
+ "English": 0.552960996199442
+ }
+ },
+ "pass@2": 0.27666666666666667,
+ "BoN-3": {
+ "overall_metric": 0.5729921291255787,
+ "token_length": {
+ "8k": 0.5918227174634976,
+ "16k": 0.6198567950677695,
+ "32k": 0.6115768945303457,
+ "64k": 0.5284625404433138,
+ "128k": 0.558149430962295,
+ "256k": 0.528084396286257
+ },
+ "contextual_requirement": {
+ "Full": 0.524120078602167,
+ "Partial": 0.6351929207008328
+ },
+ "difficulty": {
+ "Easy": 0.728216785648972,
+ "Moderate": 0.5755547819370206,
+ "Hard": 0.5066701999617829,
+ "Extreme": 0.45151519711603216
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7860047126136753,
+ "T2. Sequencing & Structure Reconstruction": 0.7853528878528878,
+ "T3. Evidence-Grounded QA": 0.625,
+ "T4. Summarization & Synthesis": 0.5322270993764735,
+ "T5. Attribution & Citation Alignment": 0.6208079019292253,
+ "T6. Aggregation & Clustering": 0.5652450687006129,
+ "T7. Consistency & Compliance Checking": 0.44884599567602773,
+ "T8. Structured & Numeric Reasoning": 0.31157407407407406,
+ "T9. Version & Code Diff Analysis": 0.6765946379547446,
+ "T10. Rule Induction & In-Context Learning": 0.5895833333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5583333333333333
+ },
+ "language": {
+ "Chinese": 0.5755180331103958,
+ "English": 0.5704662251407642
+ }
+ },
+ "pass@3": 0.3
+}
\ No newline at end of file
diff --git a/results/Kimi-K2-Instruct-0905/thinking_context-224000_bon-3_summary.json b/results/Kimi-K2-Instruct-0905/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..26db8474a6b75109bab5053a74aa49a112ec2fd2
--- /dev/null
+++ b/results/Kimi-K2-Instruct-0905/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 69,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5553060678788313,
+ "inference_iteration_1_overall_metric": 0.558917739810572,
+ "inference_iteration_2_overall_metric": 0.5552262066724464,
+ "inference_iteration_3_overall_metric": 0.5517742571534756,
+ "average_token_length_metric": {
+ "8k": 0.5978532013581613,
+ "16k": 0.5816609532803436,
+ "32k": 0.5872894997726004,
+ "64k": 0.5360933501085343,
+ "128k": 0.522886026665569,
+ "256k": 0.5060533760877814
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5076499747745465,
+ "Partial": 0.6159592772842868
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7729188134795828,
+ "Moderate": 0.5733088402612271,
+ "Hard": 0.4375074213815535,
+ "Extreme": 0.38246894115073926
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.758377055109974,
+ "T2. Sequencing & Structure Reconstruction": 0.7423694091857211,
+ "T3. Evidence-Grounded QA": 0.4861111111111112,
+ "T4. Summarization & Synthesis": 0.5011656658098056,
+ "T5. Attribution & Citation Alignment": 0.6197584764672828,
+ "T6. Aggregation & Clustering": 0.5164556923382113,
+ "T7. Consistency & Compliance Checking": 0.3547519606397262,
+ "T8. Structured & Numeric Reasoning": 0.5962962962962963,
+ "T9. Version & Code Diff Analysis": 0.6299270957790389,
+ "T10. Rule Induction & In-Context Learning": 0.5606944444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666654
+ },
+ "average_language_metric": {
+ "Chinese": 0.5409680916435047,
+ "English": 0.5696440441141594
+ },
+ "BoN-1": {
+ "overall_metric": 0.558917739810572,
+ "token_length": {
+ "8k": 0.6032704706738693,
+ "16k": 0.5808397170448323,
+ "32k": 0.5927696772272222,
+ "64k": 0.5485389223926213,
+ "128k": 0.5293584568340762,
+ "256k": 0.49872919469081073
+ },
+ "contextual_requirement": {
+ "Full": 0.5058707162683681,
+ "Partial": 0.6264321334097424
+ },
+ "difficulty": {
+ "Easy": 0.7833558783440772,
+ "Moderate": 0.5768808845916201,
+ "Hard": 0.4381739958885578,
+ "Extreme": 0.3805641270346401
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7528903747933631,
+ "T2. Sequencing & Structure Reconstruction": 0.7690795456301782,
+ "T3. Evidence-Grounded QA": 0.525,
+ "T4. Summarization & Synthesis": 0.5001505956406523,
+ "T5. Attribution & Citation Alignment": 0.6016113058386018,
+ "T6. Aggregation & Clustering": 0.5066860396628982,
+ "T7. Consistency & Compliance Checking": 0.37783095340307493,
+ "T8. Structured & Numeric Reasoning": 0.5875000000000001,
+ "T9. Version & Code Diff Analysis": 0.6056866583526187,
+ "T10. Rule Induction & In-Context Learning": 0.5740277777777777,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.45
+ },
+ "language": {
+ "Chinese": 0.5344377355706056,
+ "English": 0.5833977440505398
+ }
+ },
+ "pass@1": 0.30733333333333335,
+ "BoN-2": {
+ "overall_metric": 0.6315444476656525,
+ "token_length": {
+ "8k": 0.6765293955798586,
+ "16k": 0.6653958948787116,
+ "32k": 0.6614274068144226,
+ "64k": 0.6099720675525434,
+ "128k": 0.5887552831784975,
+ "256k": 0.5871866379898839
+ },
+ "contextual_requirement": {
+ "Full": 0.5906087497944075,
+ "Partial": 0.6836444267745122
+ },
+ "difficulty": {
+ "Easy": 0.8546630775122501,
+ "Moderate": 0.6784971908255039,
+ "Hard": 0.5110824387985737,
+ "Extreme": 0.4354103526732191
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7926072573330768,
+ "T2. Sequencing & Structure Reconstruction": 0.814456423206423,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5156065007848037,
+ "T5. Attribution & Citation Alignment": 0.705413217864198,
+ "T6. Aggregation & Clustering": 0.6176306197741449,
+ "T7. Consistency & Compliance Checking": 0.42951044567324476,
+ "T8. Structured & Numeric Reasoning": 0.6861111111111111,
+ "T9. Version & Code Diff Analysis": 0.7066217095721875,
+ "T10. Rule Induction & In-Context Learning": 0.6680555555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
+ },
+ "language": {
+ "Chinese": 0.617638270540212,
+ "English": 0.6454506247910954
+ }
+ },
+ "pass@2": 0.37066666666666664,
+ "BoN-3": {
+ "overall_metric": 0.6596591269299542,
+ "token_length": {
+ "8k": 0.6994761842119754,
+ "16k": 0.6880103332979634,
+ "32k": 0.6831628614701334,
+ "64k": 0.6414281237947528,
+ "128k": 0.6223896794908902,
+ "256k": 0.6234875793140215
+ },
+ "contextual_requirement": {
+ "Full": 0.6186943779973499,
+ "Partial": 0.7117960801169108
+ },
+ "difficulty": {
+ "Easy": 0.865163305432102,
+ "Moderate": 0.7223592192252923,
+ "Hard": 0.5480741090923853,
+ "Extreme": 0.4666471483928459
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8175679720885881,
+ "T2. Sequencing & Structure Reconstruction": 0.8413347300847297,
+ "T3. Evidence-Grounded QA": 0.6,
+ "T4. Summarization & Synthesis": 0.5241839616339089,
+ "T5. Attribution & Citation Alignment": 0.7286969407202896,
+ "T6. Aggregation & Clustering": 0.6482898910584165,
+ "T7. Consistency & Compliance Checking": 0.47389920059132556,
+ "T8. Structured & Numeric Reasoning": 0.7092592592592593,
+ "T9. Version & Code Diff Analysis": 0.7448385112889893,
+ "T10. Rule Induction & In-Context Learning": 0.7002777777777779,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
+ },
+ "language": {
+ "Chinese": 0.646426154128056,
+ "English": 0.6728920997318576
+ }
+ },
+ "pass@3": 0.4013333333333333
+}
\ No newline at end of file
diff --git a/results/Llama-3.1-405B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Llama-3.1-405B-Instruct/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..813cd67a14dd030568e5c15988acd0c513604750
--- /dev/null
+++ b/results/Llama-3.1-405B-Instruct/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.4006972406362581,
+ "inference_iteration_1_overall_metric": 0.4033767470362484,
+ "inference_iteration_2_overall_metric": 0.4037400979033875,
+ "inference_iteration_3_overall_metric": 0.3949748769691391,
+ "average_token_length_metric": {
+ "8k": 0.495611810737427,
+ "16k": 0.47999448108480186,
+ "32k": 0.4902670612376324,
+ "64k": 0.40596651011726403,
+ "128k": 0.2929513776114342,
+ "256k": 0.23939220302898997
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3708303649326616,
+ "Partial": 0.4387096278953802
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5544944910188809,
+ "Moderate": 0.2906684222492303,
+ "Hard": 0.35510805093665054,
+ "Extreme": 0.33443208075583397
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7010722319989281,
+ "T2. Sequencing & Structure Reconstruction": 0.6852361434861434,
+ "T3. Evidence-Grounded QA": 0.39722222222222225,
+ "T4. Summarization & Synthesis": 0.49860889459519425,
+ "T5. Attribution & Citation Alignment": 0.33280185377001925,
+ "T6. Aggregation & Clustering": 0.4288546372402087,
+ "T7. Consistency & Compliance Checking": 0.22662521432331084,
+ "T8. Structured & Numeric Reasoning": 0.1651234567901234,
+ "T9. Version & Code Diff Analysis": 0.4037951252761809,
+ "T10. Rule Induction & In-Context Learning": 0.43962962962962965,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3194444444444445
+ },
+ "average_language_metric": {
+ "Chinese": 0.38105191405127925,
+ "English": 0.42034256722123686
+ },
+ "BoN-1": {
+ "overall_metric": 0.4033767470362484,
+ "token_length": {
+ "8k": 0.5193261381744707,
+ "16k": 0.47458776670121805,
+ "32k": 0.4893823156329057,
+ "64k": 0.41443272045065593,
+ "128k": 0.28371722198812693,
+ "256k": 0.2388143192701119
+ },
+ "contextual_requirement": {
+ "Full": 0.3743223052410306,
+ "Partial": 0.44035512750288913
+ },
+ "difficulty": {
+ "Easy": 0.5734527544487493,
+ "Moderate": 0.2799762087674356,
+ "Hard": 0.35650549807929216,
+ "Extreme": 0.3289038173440243
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6946832479137959,
+ "T2. Sequencing & Structure Reconstruction": 0.6867736892736891,
+ "T3. Evidence-Grounded QA": 0.4083333333333333,
+ "T4. Summarization & Synthesis": 0.5005294746426308,
+ "T5. Attribution & Citation Alignment": 0.3263529995384955,
+ "T6. Aggregation & Clustering": 0.4192752041935995,
+ "T7. Consistency & Compliance Checking": 0.23609519970144652,
+ "T8. Structured & Numeric Reasoning": 0.18101851851851852,
+ "T9. Version & Code Diff Analysis": 0.391786542964143,
+ "T10. Rule Induction & In-Context Learning": 0.47083333333333327,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.30833333333333335
+ },
+ "language": {
+ "Chinese": 0.3889859691853749,
+ "English": 0.41776752488712193
+ }
+ },
+ "pass@1": 0.16266666666666665,
+ "BoN-2": {
+ "overall_metric": 0.44522764896517386,
+ "token_length": {
+ "8k": 0.558048061407528,
+ "16k": 0.5158119021911939,
+ "32k": 0.530639639804942,
+ "64k": 0.45350021661205264,
+ "128k": 0.3440588360232635,
+ "256k": 0.26930723775206405
+ },
+ "contextual_requirement": {
+ "Full": 0.40918538567791846,
+ "Partial": 0.49109962042168165
+ },
+ "difficulty": {
+ "Easy": 0.6186902100237465,
+ "Moderate": 0.32736405593203205,
+ "Hard": 0.39034570869637836,
+ "Extreme": 0.36866970508796515
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7336012615268711,
+ "T2. Sequencing & Structure Reconstruction": 0.737358012358012,
+ "T3. Evidence-Grounded QA": 0.45,
+ "T4. Summarization & Synthesis": 0.5147616333746151,
+ "T5. Attribution & Citation Alignment": 0.3674427224531845,
+ "T6. Aggregation & Clustering": 0.4743354751511784,
+ "T7. Consistency & Compliance Checking": 0.26985986799607303,
+ "T8. Structured & Numeric Reasoning": 0.20462962962962963,
+ "T9. Version & Code Diff Analysis": 0.44727785652000346,
+ "T10. Rule Induction & In-Context Learning": 0.5249999999999999,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664
+ },
+ "language": {
+ "Chinese": 0.433626244820898,
+ "English": 0.4568290531094507
+ }
+ },
+ "pass@2": 0.19,
+ "BoN-3": {
+ "overall_metric": 0.4623473432019363,
+ "token_length": {
+ "8k": 0.5703117586774046,
+ "16k": 0.531567048206503,
+ "32k": 0.5504750677715584,
+ "64k": 0.47066633396240054,
+ "128k": 0.3715170610120738,
+ "256k": 0.27954678958167684
+ },
+ "contextual_requirement": {
+ "Full": 0.42433070289568703,
+ "Partial": 0.510732158137163
+ },
+ "difficulty": {
+ "Easy": 0.6297066688011407,
+ "Moderate": 0.34570653296360615,
+ "Hard": 0.4161126844074583,
+ "Extreme": 0.3859923786829926
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7453735974615648,
+ "T2. Sequencing & Structure Reconstruction": 0.7443156843156841,
+ "T3. Evidence-Grounded QA": 0.45,
+ "T4. Summarization & Synthesis": 0.52546578139621,
+ "T5. Attribution & Citation Alignment": 0.4112102181587475,
+ "T6. Aggregation & Clustering": 0.5005878653860251,
+ "T7. Consistency & Compliance Checking": 0.2880921279803352,
+ "T8. Structured & Numeric Reasoning": 0.22685185185185183,
+ "T9. Version & Code Diff Analysis": 0.4880120741980102,
+ "T10. Rule Induction & In-Context Learning": 0.5249999999999999,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664
+ },
+ "language": {
+ "Chinese": 0.4464982468640505,
+ "English": 0.47819643953982244
+ }
+ },
+ "pass@3": 0.20266666666666666
+}
\ No newline at end of file
diff --git a/results/Llama-3.1-405B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Llama-3.1-405B-Instruct/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0dac0fb76a0adce17e358a97f522b4efb811204
--- /dev/null
+++ b/results/Llama-3.1-405B-Instruct/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.40659333298471173,
+ "inference_iteration_1_overall_metric": 0.3990307184980547,
+ "inference_iteration_2_overall_metric": 0.40746318832781453,
+ "inference_iteration_3_overall_metric": 0.4132860921282645,
+ "average_token_length_metric": {
+ "8k": 0.5237859741060642,
+ "16k": 0.517961066700275,
+ "32k": 0.4641044292483723,
+ "64k": 0.4182090511525332,
+ "128k": 0.260073356838944,
+ "256k": 0.25542611986208036
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3729971231698154,
+ "Partial": 0.44935214547639707
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.613595833316379,
+ "Moderate": 0.2921863382975652,
+ "Hard": 0.3409469495726698,
+ "Extreme": 0.29809383550926016
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6642543814799384,
+ "T2. Sequencing & Structure Reconstruction": 0.660868776285443,
+ "T3. Evidence-Grounded QA": 0.3,
+ "T4. Summarization & Synthesis": 0.4699204609681677,
+ "T5. Attribution & Citation Alignment": 0.35415278369391223,
+ "T6. Aggregation & Clustering": 0.44523374235335905,
+ "T7. Consistency & Compliance Checking": 0.2145859605426568,
+ "T8. Structured & Numeric Reasoning": 0.36867283950617286,
+ "T9. Version & Code Diff Analysis": 0.43511107590777826,
+ "T10. Rule Induction & In-Context Learning": 0.37759259259259265,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2777777777777778
+ },
+ "average_language_metric": {
+ "Chinese": 0.36859109108368215,
+ "English": 0.4445955748857405
+ },
+ "BoN-1": {
+ "overall_metric": 0.3990307184980547,
+ "token_length": {
+ "8k": 0.5319602144193065,
+ "16k": 0.495294502488435,
+ "32k": 0.44900241142401065,
+ "64k": 0.4104061454120568,
+ "128k": 0.24974039710926457,
+ "256k": 0.2577806401352564
+ },
+ "contextual_requirement": {
+ "Full": 0.3663953815207406,
+ "Partial": 0.4405666019237288
+ },
+ "difficulty": {
+ "Easy": 0.6016370051267634,
+ "Moderate": 0.26872089549030526,
+ "Hard": 0.34181177303603094,
+ "Extreme": 0.3002570456178896
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6857190555998647,
+ "T2. Sequencing & Structure Reconstruction": 0.6476817164317161,
+ "T3. Evidence-Grounded QA": 0.2916666666666667,
+ "T4. Summarization & Synthesis": 0.4722890782260215,
+ "T5. Attribution & Citation Alignment": 0.3720362720390496,
+ "T6. Aggregation & Clustering": 0.4527252876757789,
+ "T7. Consistency & Compliance Checking": 0.19573002717715302,
+ "T8. Structured & Numeric Reasoning": 0.3111111111111111,
+ "T9. Version & Code Diff Analysis": 0.4434471088718592,
+ "T10. Rule Induction & In-Context Learning": 0.39402777777777775,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
+ },
+ "language": {
+ "Chinese": 0.3646393289213864,
+ "English": 0.43342210807472437
+ }
+ },
+ "pass@1": 0.17866666666666667,
+ "BoN-2": {
+ "overall_metric": 0.47482444558969183,
+ "token_length": {
+ "8k": 0.6017952145070679,
+ "16k": 0.591088648529682,
+ "32k": 0.5375509398549342,
+ "64k": 0.4892562129103632,
+ "128k": 0.3164459575452793,
+ "256k": 0.3128097001908253
+ },
+ "contextual_requirement": {
+ "Full": 0.43173242380596516,
+ "Partial": 0.5296688369507991
+ },
+ "difficulty": {
+ "Easy": 0.7232714101187123,
+ "Moderate": 0.3321291637725583,
+ "Hard": 0.3952399665178113,
+ "Extreme": 0.3486594773940952
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7289955546187882,
+ "T2. Sequencing & Structure Reconstruction": 0.7412526825026818,
+ "T3. Evidence-Grounded QA": 0.4166666666666667,
+ "T4. Summarization & Synthesis": 0.49118408476251574,
+ "T5. Attribution & Citation Alignment": 0.40752656877934634,
+ "T6. Aggregation & Clustering": 0.5058127366024506,
+ "T7. Consistency & Compliance Checking": 0.2513867378923854,
+ "T8. Structured & Numeric Reasoning": 0.4481481481481482,
+ "T9. Version & Code Diff Analysis": 0.5031863563544513,
+ "T10. Rule Induction & In-Context Learning": 0.48847222222222225,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35
+ },
+ "language": {
+ "Chinese": 0.43768030437389926,
+ "English": 0.511968586805485
+ }
+ },
+ "pass@2": 0.24,
+ "BoN-3": {
+ "overall_metric": 0.5112966568518927,
+ "token_length": {
+ "8k": 0.638281295176006,
+ "16k": 0.6423408107009556,
+ "32k": 0.5654314097480697,
+ "64k": 0.5253595874826821,
+ "128k": 0.3530891462917609,
+ "256k": 0.3432776917118821
+ },
+ "contextual_requirement": {
+ "Full": 0.4709592628636537,
+ "Partial": 0.5626351582914695
+ },
+ "difficulty": {
+ "Easy": 0.7672241772316003,
+ "Moderate": 0.375515223666526,
+ "Hard": 0.43497012537117896,
+ "Extreme": 0.3702560590461619
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7536336494527094,
+ "T2. Sequencing & Structure Reconstruction": 0.7626347726347721,
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
+ "T4. Summarization & Synthesis": 0.5004237684828063,
+ "T5. Attribution & Citation Alignment": 0.44213397827806017,
+ "T6. Aggregation & Clustering": 0.5445203343972627,
+ "T7. Consistency & Compliance Checking": 0.2924260189234064,
+ "T8. Structured & Numeric Reasoning": 0.5148148148148148,
+ "T9. Version & Code Diff Analysis": 0.5454347340415279,
+ "T10. Rule Induction & In-Context Learning": 0.5093055555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
+ },
+ "language": {
+ "Chinese": 0.47723166126529054,
+ "English": 0.545361652438495
+ }
+ },
+ "pass@3": 0.27466666666666667
+}
\ No newline at end of file
diff --git a/results/Llama-3.1-70B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Llama-3.1-70B-Instruct/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e84d47ed865a21f77a21e18b459481428432968
--- /dev/null
+++ b/results/Llama-3.1-70B-Instruct/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.31531891526483563,
+ "inference_iteration_1_overall_metric": 0.3187840899451787,
+ "inference_iteration_2_overall_metric": 0.3181153572604232,
+ "inference_iteration_3_overall_metric": 0.3090572985889053,
+ "average_token_length_metric": {
+ "8k": 0.44072420920233435,
+ "16k": 0.4154170524608382,
+ "32k": 0.39938052404397517,
+ "64k": 0.3038357678876172,
+ "128k": 0.16668959617065898,
+ "256k": 0.16586634182358947
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.28629550819966637,
+ "Partial": 0.3522577969841428
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.4401547192469389,
+ "Moderate": 0.21463302825178993,
+ "Hard": 0.2886301431775311,
+ "Extreme": 0.26222895835717064
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.5528929885667343,
+ "T2. Sequencing & Structure Reconstruction": 0.5351228410297036,
+ "T3. Evidence-Grounded QA": 0.34722222222222227,
+ "T4. Summarization & Synthesis": 0.48251270202602564,
+ "T5. Attribution & Citation Alignment": 0.2383585369805011,
+ "T6. Aggregation & Clustering": 0.3094103218555832,
+ "T7. Consistency & Compliance Checking": 0.18317006381802675,
+ "T8. Structured & Numeric Reasoning": 0.11111111111111113,
+ "T9. Version & Code Diff Analysis": 0.35298805295632424,
+ "T10. Rule Induction & In-Context Learning": 0.3324074074074074,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.1944444444444444
+ },
+ "average_language_metric": {
+ "Chinese": 0.27963370499725654,
+ "English": 0.3510041255324155
+ },
+ "BoN-1": {
+ "overall_metric": 0.3187840899451787,
+ "token_length": {
+ "8k": 0.45086107163290673,
+ "16k": 0.42082616936389317,
+ "32k": 0.4024958091173256,
+ "64k": 0.31412697939763573,
+ "128k": 0.16271506723441695,
+ "256k": 0.16167944292489206
+ },
+ "contextual_requirement": {
+ "Full": 0.28863047320759827,
+ "Partial": 0.35716142033846177
+ },
+ "difficulty": {
+ "Easy": 0.4450445268436371,
+ "Moderate": 0.23675467668036354,
+ "Hard": 0.2806267700317323,
+ "Extreme": 0.25941235199849716
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5578290894275468,
+ "T2. Sequencing & Structure Reconstruction": 0.5340221352721353,
+ "T3. Evidence-Grounded QA": 0.36666666666666664,
+ "T4. Summarization & Synthesis": 0.4824250532281757,
+ "T5. Attribution & Citation Alignment": 0.22435495508076153,
+ "T6. Aggregation & Clustering": 0.3143695347862014,
+ "T7. Consistency & Compliance Checking": 0.1819753578820025,
+ "T8. Structured & Numeric Reasoning": 0.11342592592592592,
+ "T9. Version & Code Diff Analysis": 0.33790255230380384,
+ "T10. Rule Induction & In-Context Learning": 0.3586111111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
+ },
+ "language": {
+ "Chinese": 0.29265028040862706,
+ "English": 0.34491789948172935
+ }
+ },
+ "pass@1": 0.116,
+ "BoN-2": {
+ "overall_metric": 0.3704962078912661,
+ "token_length": {
+ "8k": 0.5007140188951568,
+ "16k": 0.471793141596409,
+ "32k": 0.4656246883405684,
+ "64k": 0.3555228262932919,
+ "128k": 0.22278285337549444,
+ "256k": 0.20653971884667408
+ },
+ "contextual_requirement": {
+ "Full": 0.33766430263043345,
+ "Partial": 0.41228226913232546
+ },
+ "difficulty": {
+ "Easy": 0.505817704967975,
+ "Moderate": 0.26539475589230027,
+ "Hard": 0.33902165222515473,
+ "Extreme": 0.3119632638554326
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6325785537327107,
+ "T2. Sequencing & Structure Reconstruction": 0.6024494976700858,
+ "T3. Evidence-Grounded QA": 0.4083333333333333,
+ "T4. Summarization & Synthesis": 0.5068628671369674,
+ "T5. Attribution & Citation Alignment": 0.27736426767676775,
+ "T6. Aggregation & Clustering": 0.39468559218559207,
+ "T7. Consistency & Compliance Checking": 0.23698781644898675,
+ "T8. Structured & Numeric Reasoning": 0.1412037037037037,
+ "T9. Version & Code Diff Analysis": 0.42207618836131144,
+ "T10. Rule Induction & In-Context Learning": 0.3888888888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334
+ },
+ "language": {
+ "Chinese": 0.3386454170505693,
+ "English": 0.4023469987319626
+ }
+ },
+ "pass@2": 0.144,
+ "BoN-3": {
+ "overall_metric": 0.39272719549993146,
+ "token_length": {
+ "8k": 0.5352185196666011,
+ "16k": 0.48774731079355566,
+ "32k": 0.4914359262892438,
+ "64k": 0.38089490889752375,
+ "128k": 0.23106162164056493,
+ "256k": 0.23000488571210054
+ },
+ "contextual_requirement": {
+ "Full": 0.3601448943540025,
+ "Partial": 0.434195578776569
+ },
+ "difficulty": {
+ "Easy": 0.5296491139243411,
+ "Moderate": 0.28398341296374197,
+ "Hard": 0.3678982241467773,
+ "Extreme": 0.3304684709396022
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6534337905352087,
+ "T2. Sequencing & Structure Reconstruction": 0.6269316794316794,
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
+ "T4. Summarization & Synthesis": 0.5138761303847846,
+ "T5. Attribution & Citation Alignment": 0.30993542257550877,
+ "T6. Aggregation & Clustering": 0.4248105135605135,
+ "T7. Consistency & Compliance Checking": 0.25284892696838046,
+ "T8. Structured & Numeric Reasoning": 0.1523148148148148,
+ "T9. Version & Code Diff Analysis": 0.4435626489175109,
+ "T10. Rule Induction & In-Context Learning": 0.4330555555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
+ },
+ "language": {
+ "Chinese": 0.3526743750393581,
+ "English": 0.4327800159605054
+ }
+ },
+ "pass@3": 0.16266666666666665
+}
\ No newline at end of file
diff --git a/results/Llama-3.1-70B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Llama-3.1-70B-Instruct/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..a114871cfe65a4d1acbb6687755e515db1a58be0
--- /dev/null
+++ b/results/Llama-3.1-70B-Instruct/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3212355454655496,
+ "inference_iteration_1_overall_metric": 0.3168848382877898,
+ "inference_iteration_2_overall_metric": 0.3235694833261471,
+ "inference_iteration_3_overall_metric": 0.32325231478271244,
+ "average_token_length_metric": {
+ "8k": 0.44607201777886324,
+ "16k": 0.43551597572008266,
+ "32k": 0.40532179664339874,
+ "64k": 0.3372735574136524,
+ "128k": 0.14963906519016765,
+ "256k": 0.15359086004713354
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.2920710323592253,
+ "Partial": 0.3583540166917813
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.4845503461095368,
+ "Moderate": 0.21437000317705404,
+ "Hard": 0.28040007010006374,
+ "Extreme": 0.2393443186282747
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.5203183043430563,
+ "T2. Sequencing & Structure Reconstruction": 0.5048829192634423,
+ "T3. Evidence-Grounded QA": 0.2972222222222221,
+ "T4. Summarization & Synthesis": 0.46869982746667466,
+ "T5. Attribution & Citation Alignment": 0.24051072181402314,
+ "T6. Aggregation & Clustering": 0.3352447566072413,
+ "T7. Consistency & Compliance Checking": 0.16831334519236535,
+ "T8. Structured & Numeric Reasoning": 0.24182098765432103,
+ "T9. Version & Code Diff Analysis": 0.3504639112512836,
+ "T10. Rule Induction & In-Context Learning": 0.3041666666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.21111111111111108
+ },
+ "average_language_metric": {
+ "Chinese": 0.27396983223922683,
+ "English": 0.368501258691873
+ },
+ "BoN-1": {
+ "overall_metric": 0.3168848382877898,
+ "token_length": {
+ "8k": 0.4288512287020653,
+ "16k": 0.431350020549353,
+ "32k": 0.41805229286031653,
+ "64k": 0.3101016124828498,
+ "128k": 0.16200619520020057,
+ "256k": 0.1509476799319523
+ },
+ "contextual_requirement": {
+ "Full": 0.28780546371048366,
+ "Partial": 0.3538949513861792
+ },
+ "difficulty": {
+ "Easy": 0.48546407013873055,
+ "Moderate": 0.1945019101312392,
+ "Hard": 0.2709484383728626,
+ "Extreme": 0.24276914751620637
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.48095324443543325,
+ "T2. Sequencing & Structure Reconstruction": 0.4873936033102699,
+ "T3. Evidence-Grounded QA": 0.275,
+ "T4. Summarization & Synthesis": 0.46817349027530025,
+ "T5. Attribution & Citation Alignment": 0.21679364691461467,
+ "T6. Aggregation & Clustering": 0.3425653712663516,
+ "T7. Consistency & Compliance Checking": 0.17647214053474497,
+ "T8. Structured & Numeric Reasoning": 0.25462962962962965,
+ "T9. Version & Code Diff Analysis": 0.347662448182329,
+ "T10. Rule Induction & In-Context Learning": 0.31625,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
+ },
+ "language": {
+ "Chinese": 0.2661805594268294,
+ "English": 0.36758911714875
+ }
+ },
+ "pass@1": 0.13733333333333334,
+ "BoN-2": {
+ "overall_metric": 0.39714621701263747,
+ "token_length": {
+ "8k": 0.5101215226314286,
+ "16k": 0.5452446091546882,
+ "32k": 0.5033835517180962,
+ "64k": 0.40629425415229514,
+ "128k": 0.20367087402793838,
+ "256k": 0.21416249039138233
+ },
+ "contextual_requirement": {
+ "Full": 0.3628015637327161,
+ "Partial": 0.4408575939143573
+ },
+ "difficulty": {
+ "Easy": 0.6103509331261675,
+ "Moderate": 0.2739929015343184,
+ "Hard": 0.33264890818640663,
+ "Extreme": 0.2868424835064885
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6190525866570429,
+ "T2. Sequencing & Structure Reconstruction": 0.5863848211097236,
+ "T3. Evidence-Grounded QA": 0.4,
+ "T4. Summarization & Synthesis": 0.48619725203488445,
+ "T5. Attribution & Citation Alignment": 0.290019754922306,
+ "T6. Aggregation & Clustering": 0.4354357416367219,
+ "T7. Consistency & Compliance Checking": 0.21250441110110105,
+ "T8. Structured & Numeric Reasoning": 0.3388888888888889,
+ "T9. Version & Code Diff Analysis": 0.4174297354939513,
+ "T10. Rule Induction & In-Context Learning": 0.42666666666666675,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336
+ },
+ "language": {
+ "Chinese": 0.3417787747514596,
+ "English": 0.45251365927381704
+ }
+ },
+ "pass@2": 0.19133333333333333,
+ "BoN-3": {
+ "overall_metric": 0.43806378317359107,
+ "token_length": {
+ "8k": 0.5718190971548373,
+ "16k": 0.5932030702540269,
+ "32k": 0.5401634596906353,
+ "64k": 0.45712267932316264,
+ "128k": 0.2296143794444798,
+ "256k": 0.23646001317440798
+ },
+ "contextual_requirement": {
+ "Full": 0.39971488113838277,
+ "Partial": 0.48687147667294856
+ },
+ "difficulty": {
+ "Easy": 0.6591788734633496,
+ "Moderate": 0.3103483415629278,
+ "Hard": 0.38512512860737047,
+ "Extreme": 0.3144991474359921
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6739172902122323,
+ "T2. Sequencing & Structure Reconstruction": 0.6553062801836215,
+ "T3. Evidence-Grounded QA": 0.475,
+ "T4. Summarization & Synthesis": 0.5017174053624821,
+ "T5. Attribution & Citation Alignment": 0.31632987998243095,
+ "T6. Aggregation & Clustering": 0.4726401785278596,
+ "T7. Consistency & Compliance Checking": 0.24365040952524628,
+ "T8. Structured & Numeric Reasoning": 0.36944444444444446,
+ "T9. Version & Code Diff Analysis": 0.44797944073835727,
+ "T10. Rule Induction & In-Context Learning": 0.46027777777777784,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
+ },
+ "language": {
+ "Chinese": 0.386238080047615,
+ "English": 0.48988948629956824
+ }
+ },
+ "pass@3": 0.21933333333333332
+}
\ No newline at end of file
diff --git a/results/Llama-3.1-8B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Llama-3.1-8B-Instruct/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..dbbed086dc03a451c3ba5d51dc42498caa85232d
--- /dev/null
+++ b/results/Llama-3.1-8B-Instruct/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.21094590782574696,
+ "inference_iteration_1_overall_metric": 0.20814425242445228,
+ "inference_iteration_2_overall_metric": 0.213015185500322,
+ "inference_iteration_3_overall_metric": 0.21167828555246626,
+ "average_token_length_metric": {
+ "8k": 0.24549737122739362,
+ "16k": 0.2608710428868677,
+ "32k": 0.2249354240045269,
+ "64k": 0.18691854981764278,
+ "128k": 0.18010527298228765,
+ "256k": 0.16734778603576234
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.1813007104244737,
+ "Partial": 0.24867615906373072
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.254652823450579,
+ "Moderate": 0.13823151671179162,
+ "Hard": 0.21215047305696197,
+ "Extreme": 0.21003592225516218
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.4667231034949644,
+ "T2. Sequencing & Structure Reconstruction": 0.4213756913607651,
+ "T3. Evidence-Grounded QA": 0.15000000000000002,
+ "T4. Summarization & Synthesis": 0.47465443881044483,
+ "T5. Attribution & Citation Alignment": 0.08095709533952888,
+ "T6. Aggregation & Clustering": 0.1895252817222955,
+ "T7. Consistency & Compliance Checking": 0.12098051997071714,
+ "T8. Structured & Numeric Reasoning": 0.04969135802469136,
+ "T9. Version & Code Diff Analysis": 0.21578074220253873,
+ "T10. Rule Induction & In-Context Learning": 0.16759259259259265,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.11944444444444445
+ },
+ "average_language_metric": {
+ "Chinese": 0.17905397547001678,
+ "English": 0.24283784018147686
+ },
+ "BoN-1": {
+ "overall_metric": 0.20814425242445228,
+ "token_length": {
+ "8k": 0.22656456269661684,
+ "16k": 0.25006742264480875,
+ "32k": 0.22983820975916858,
+ "64k": 0.17867648708372652,
+ "128k": 0.1893868580863639,
+ "256k": 0.17433197427602828
+ },
+ "contextual_requirement": {
+ "Full": 0.1770456924597648,
+ "Partial": 0.24772423783405423
+ },
+ "difficulty": {
+ "Easy": 0.2566570988180275,
+ "Moderate": 0.1324196374719661,
+ "Hard": 0.20899042479807298,
+ "Extreme": 0.2041821568416995
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.4630702505342932,
+ "T2. Sequencing & Structure Reconstruction": 0.4120432796938637,
+ "T3. Evidence-Grounded QA": 0.13333333333333333,
+ "T4. Summarization & Synthesis": 0.47400084179308405,
+ "T5. Attribution & Citation Alignment": 0.0690637373143065,
+ "T6. Aggregation & Clustering": 0.19812149190741143,
+ "T7. Consistency & Compliance Checking": 0.11267079352137302,
+ "T8. Structured & Numeric Reasoning": 0.04490740740740741,
+ "T9. Version & Code Diff Analysis": 0.19979772893803915,
+ "T10. Rule Induction & In-Context Learning": 0.18361111111111109,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.13333333333333333
+ },
+ "language": {
+ "Chinese": 0.17869850347586846,
+ "English": 0.2375900013730359
+ }
+ },
+ "pass@1": 0.052,
+ "BoN-2": {
+ "overall_metric": 0.24847856382430364,
+ "token_length": {
+ "8k": 0.28288621820265775,
+ "16k": 0.30147254873111595,
+ "32k": 0.2728297990630907,
+ "64k": 0.21066076283394117,
+ "128k": 0.21820825855623124,
+ "256k": 0.2048137955587849
+ },
+ "contextual_requirement": {
+ "Full": 0.2142227788917067,
+ "Partial": 0.29207683555669967
+ },
+ "difficulty": {
+ "Easy": 0.3015279708931293,
+ "Moderate": 0.1575465527579005,
+ "Hard": 0.2601383702201079,
+ "Extreme": 0.24243277505755167
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5278328421453239,
+ "T2. Sequencing & Structure Reconstruction": 0.4858897020853544,
+ "T3. Evidence-Grounded QA": 0.18333333333333332,
+ "T4. Summarization & Synthesis": 0.4923937781072418,
+ "T5. Attribution & Citation Alignment": 0.10321693015669395,
+ "T6. Aggregation & Clustering": 0.24202529361366723,
+ "T7. Consistency & Compliance Checking": 0.1496372306528247,
+ "T8. Structured & Numeric Reasoning": 0.06435185185185185,
+ "T9. Version & Code Diff Analysis": 0.2640161200205543,
+ "T10. Rule Induction & In-Context Learning": 0.22361111111111112,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
+ },
+ "language": {
+ "Chinese": 0.21521719416329613,
+ "English": 0.2817399334853111
+ }
+ },
+ "pass@2": 0.068,
+ "BoN-3": {
+ "overall_metric": 0.27478587280873845,
+ "token_length": {
+ "8k": 0.32764512992755657,
+ "16k": 0.3226696445637056,
+ "32k": 0.29882237152612906,
+ "64k": 0.24198009489674097,
+ "128k": 0.23991327950411545,
+ "256k": 0.21768471643418316
+ },
+ "contextual_requirement": {
+ "Full": 0.2383436635644805,
+ "Partial": 0.32116686639233955
+ },
+ "difficulty": {
+ "Easy": 0.33550759893050514,
+ "Moderate": 0.18287651202405897,
+ "Hard": 0.28514599318022166,
+ "Extreme": 0.26183100573765283
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5538072480868509,
+ "T2. Sequencing & Structure Reconstruction": 0.5290554353054353,
+ "T3. Evidence-Grounded QA": 0.2,
+ "T4. Summarization & Synthesis": 0.5032056223860172,
+ "T5. Attribution & Citation Alignment": 0.1333833560986498,
+ "T6. Aggregation & Clustering": 0.2737365591929987,
+ "T7. Consistency & Compliance Checking": 0.17502387399050007,
+ "T8. Structured & Numeric Reasoning": 0.08750000000000001,
+ "T9. Version & Code Diff Analysis": 0.3068144317903629,
+ "T10. Rule Induction & In-Context Learning": 0.2625,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
+ },
+ "language": {
+ "Chinese": 0.23247692481372026,
+ "English": 0.3170948208037568
+ }
+ },
+ "pass@3": 0.078
+}
\ No newline at end of file
diff --git a/results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f58089e21405a416305ba97c005165e2257a2d9
--- /dev/null
+++ b/results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.20055372622856252,
+ "inference_iteration_1_overall_metric": 0.20930536348826667,
+ "inference_iteration_2_overall_metric": 0.19814963328771615,
+ "inference_iteration_3_overall_metric": 0.19420618190970523,
+ "average_token_length_metric": {
+ "8k": 0.25813666494087695,
+ "16k": 0.2584728735658432,
+ "32k": 0.22849707778354275,
+ "64k": 0.18730191383793596,
+ "128k": 0.12903377597359986,
+ "256k": 0.14188005126957745
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.1871659292157323,
+ "Partial": 0.21759274060852876
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.2628031197661334,
+ "Moderate": 0.12316795604778738,
+ "Hard": 0.17987117557385604,
+ "Extreme": 0.19677540131116525
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.4057709287198289,
+ "T2. Sequencing & Structure Reconstruction": 0.34219177348181007,
+ "T3. Evidence-Grounded QA": 0.12222222222222225,
+ "T4. Summarization & Synthesis": 0.4554008546339488,
+ "T5. Attribution & Citation Alignment": 0.09150573083223558,
+ "T6. Aggregation & Clustering": 0.2205869310535933,
+ "T7. Consistency & Compliance Checking": 0.10384155329285091,
+ "T8. Structured & Numeric Reasoning": 0.10123456790123456,
+ "T9. Version & Code Diff Analysis": 0.17017808218806194,
+ "T10. Rule Induction & In-Context Learning": 0.1756018518518519,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.10555555555555557
+ },
+ "average_language_metric": {
+ "Chinese": 0.14712355199329824,
+ "English": 0.2539839004638273
+ },
+ "BoN-1": {
+ "overall_metric": 0.20930536348826667,
+ "token_length": {
+ "8k": 0.2793229214860049,
+ "16k": 0.271683549702249,
+ "32k": 0.23348902534053415,
+ "64k": 0.19344554869301367,
+ "128k": 0.12028061678065591,
+ "256k": 0.15761051892714206
+ },
+ "contextual_requirement": {
+ "Full": 0.18602878507502096,
+ "Partial": 0.2389300996505795
+ },
+ "difficulty": {
+ "Easy": 0.28424670669624624,
+ "Moderate": 0.1253791912220988,
+ "Hard": 0.17541923812572474,
+ "Extreme": 0.20459178755292767
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.4385523704155171,
+ "T2. Sequencing & Structure Reconstruction": 0.33931459481923876,
+ "T3. Evidence-Grounded QA": 0.11666666666666667,
+ "T4. Summarization & Synthesis": 0.45953789724020144,
+ "T5. Attribution & Citation Alignment": 0.0722496761788766,
+ "T6. Aggregation & Clustering": 0.21044140806468803,
+ "T7. Consistency & Compliance Checking": 0.1283895768466609,
+ "T8. Structured & Numeric Reasoning": 0.1125,
+ "T9. Version & Code Diff Analysis": 0.2038326942491424,
+ "T10. Rule Induction & In-Context Learning": 0.19249999999999998,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.11666666666666667
+ },
+ "language": {
+ "Chinese": 0.15245924739914177,
+ "English": 0.26615147957739155
+ }
+ },
+ "pass@1": 0.06,
+ "BoN-2": {
+ "overall_metric": 0.2668017232451993,
+ "token_length": {
+ "8k": 0.3461725655993255,
+ "16k": 0.34802165706779187,
+ "32k": 0.31037609052110204,
+ "64k": 0.2389649239073623,
+ "128k": 0.1748998439051164,
+ "256k": 0.18237525847049635
+ },
+ "contextual_requirement": {
+ "Full": 0.2499435538255567,
+ "Partial": 0.28825757523383516
+ },
+ "difficulty": {
+ "Easy": 0.3588173402056394,
+ "Moderate": 0.16793760263800153,
+ "Hard": 0.24236535579298768,
+ "Extreme": 0.246988799777556
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5197176048343898,
+ "T2. Sequencing & Structure Reconstruction": 0.48746135503643234,
+ "T3. Evidence-Grounded QA": 0.16666666666666666,
+ "T4. Summarization & Synthesis": 0.4794299302901568,
+ "T5. Attribution & Citation Alignment": 0.15065800813584843,
+ "T6. Aggregation & Clustering": 0.3052547184784026,
+ "T7. Consistency & Compliance Checking": 0.15302354691124467,
+ "T8. Structured & Numeric Reasoning": 0.14444444444444443,
+ "T9. Version & Code Diff Analysis": 0.2507539108503574,
+ "T10. Rule Induction & In-Context Learning": 0.23458333333333334,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
+ },
+ "language": {
+ "Chinese": 0.19465748810477185,
+ "English": 0.33894595838562674
+ }
+ },
+ "pass@2": 0.082,
+ "BoN-3": {
+ "overall_metric": 0.29957877493017654,
+ "token_length": {
+ "8k": 0.3888483252966101,
+ "16k": 0.39060272417227027,
+ "32k": 0.34430797611932984,
+ "64k": 0.2620211212858359,
+ "128k": 0.19544431047747687,
+ "256k": 0.21624819222953548
+ },
+ "contextual_requirement": {
+ "Full": 0.2794795135612338,
+ "Partial": 0.3251596530361036
+ },
+ "difficulty": {
+ "Easy": 0.4075586546242195,
+ "Moderate": 0.19423764623204026,
+ "Hard": 0.27460228265341846,
+ "Extreme": 0.26688670776930307
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5783680297471054,
+ "T2. Sequencing & Structure Reconstruction": 0.5321006964107361,
+ "T3. Evidence-Grounded QA": 0.20833333333333334,
+ "T4. Summarization & Synthesis": 0.4907425557642246,
+ "T5. Attribution & Citation Alignment": 0.1724297780962502,
+ "T6. Aggregation & Clustering": 0.3362168666306595,
+ "T7. Consistency & Compliance Checking": 0.16743722857143514,
+ "T8. Structured & Numeric Reasoning": 0.1921296296296296,
+ "T9. Version & Code Diff Analysis": 0.2933902615835246,
+ "T10. Rule Induction & In-Context Learning": 0.2673611111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.15833333333333333
+ },
+ "language": {
+ "Chinese": 0.22893100795326488,
+ "English": 0.37022654190708804
+ }
+ },
+ "pass@3": 0.09866666666666667
+}
\ No newline at end of file
diff --git a/results/Llama-3.2-3B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Llama-3.2-3B-Instruct/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..11639dccb9160ccc28868b95572f9c9314dbd63f
--- /dev/null
+++ b/results/Llama-3.2-3B-Instruct/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.15708345836639478,
+ "inference_iteration_1_overall_metric": 0.1522499746285183,
+ "inference_iteration_2_overall_metric": 0.16292071303675215,
+ "inference_iteration_3_overall_metric": 0.15607968743391326,
+ "average_token_length_metric": {
+ "8k": 0.19074751544937305,
+ "16k": 0.18311448739692587,
+ "32k": 0.15849934191444578,
+ "64k": 0.13711682904337855,
+ "128k": 0.13633492271322553,
+ "256k": 0.13668765368101904
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.15215903834853456,
+ "Partial": 0.16335090202548921
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.18487754128195516,
+ "Moderate": 0.10373568516116806,
+ "Hard": 0.15009434290092064,
+ "Extreme": 0.16626666941305895
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.3024747670482544,
+ "T2. Sequencing & Structure Reconstruction": 0.30977286822552064,
+ "T3. Evidence-Grounded QA": 0.11388888888888889,
+ "T4. Summarization & Synthesis": 0.4341408375280332,
+ "T5. Attribution & Citation Alignment": 0.0344779091920434,
+ "T6. Aggregation & Clustering": 0.15408667402382106,
+ "T7. Consistency & Compliance Checking": 0.07795789080091817,
+ "T8. Structured & Numeric Reasoning": 0.04398148148148148,
+ "T9. Version & Code Diff Analysis": 0.12905444479341674,
+ "T10. Rule Induction & In-Context Learning": 0.1479166666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.07777777777777778
+ },
+ "average_language_metric": {
+ "Chinese": 0.10513615313151659,
+ "English": 0.20903076360127287
+ },
+ "BoN-1": {
+ "overall_metric": 0.1522499746285183,
+ "token_length": {
+ "8k": 0.18381886404580258,
+ "16k": 0.1763640779318065,
+ "32k": 0.1530403977981516,
+ "64k": 0.1336449793468121,
+ "128k": 0.13665833409911293,
+ "256k": 0.1299731945494239
+ },
+ "contextual_requirement": {
+ "Full": 0.14616662224309246,
+ "Partial": 0.15999242311906023
+ },
+ "difficulty": {
+ "Easy": 0.17269103520136797,
+ "Moderate": 0.10010586621394478,
+ "Hard": 0.1522959770717054,
+ "Extreme": 0.16407670515037534
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.31616711715222934,
+ "T2. Sequencing & Structure Reconstruction": 0.30997657033436127,
+ "T3. Evidence-Grounded QA": 0.075,
+ "T4. Summarization & Synthesis": 0.433290104470351,
+ "T5. Attribution & Citation Alignment": 0.040719448989792455,
+ "T6. Aggregation & Clustering": 0.1420105345821374,
+ "T7. Consistency & Compliance Checking": 0.0818116476937141,
+ "T8. Structured & Numeric Reasoning": 0.04583333333333333,
+ "T9. Version & Code Diff Analysis": 0.12251594627374464,
+ "T10. Rule Induction & In-Context Learning": 0.12597222222222224,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.075
+ },
+ "language": {
+ "Chinese": 0.10313580635298143,
+ "English": 0.2013641429040551
+ }
+ },
+ "pass@1": 0.03333333333333333,
+ "BoN-2": {
+ "overall_metric": 0.1921634438744948,
+ "token_length": {
+ "8k": 0.21983364549684287,
+ "16k": 0.21955667723795985,
+ "32k": 0.18995358273392637,
+ "64k": 0.16240795470998307,
+ "128k": 0.17591239432067907,
+ "256k": 0.18531640874757638
+ },
+ "contextual_requirement": {
+ "Full": 0.183991826715688,
+ "Partial": 0.20256368389479382
+ },
+ "difficulty": {
+ "Easy": 0.23050124787481288,
+ "Moderate": 0.13024657792029828,
+ "Hard": 0.18670800635922485,
+ "Extreme": 0.19441747608931817
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.3586662331587295,
+ "T2. Sequencing & Structure Reconstruction": 0.3847515389630534,
+ "T3. Evidence-Grounded QA": 0.16666666666666666,
+ "T4. Summarization & Synthesis": 0.4623366120534231,
+ "T5. Attribution & Citation Alignment": 0.04912393153232278,
+ "T6. Aggregation & Clustering": 0.19169925535926752,
+ "T7. Consistency & Compliance Checking": 0.09784140674649551,
+ "T8. Structured & Numeric Reasoning": 0.06805555555555555,
+ "T9. Version & Code Diff Analysis": 0.17896485067612064,
+ "T10. Rule Induction & In-Context Learning": 0.17347222222222225,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666
+ },
+ "language": {
+ "Chinese": 0.1340947957448097,
+ "English": 0.25023209200417945
+ }
+ },
+ "pass@2": 0.052,
+ "BoN-3": {
+ "overall_metric": 0.2126068902708674,
+ "token_length": {
+ "8k": 0.24600389478810633,
+ "16k": 0.23558970928733117,
+ "32k": 0.21547322231985894,
+ "64k": 0.18904442314365716,
+ "128k": 0.19236619778234643,
+ "256k": 0.19716389430390446
+ },
+ "contextual_requirement": {
+ "Full": 0.20471895219678654,
+ "Partial": 0.22264608418333381
+ },
+ "difficulty": {
+ "Easy": 0.2591916862859481,
+ "Moderate": 0.1495495342063559,
+ "Hard": 0.20304294133612527,
+ "Extreme": 0.20927435723794804
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.3790383678432267,
+ "T2. Sequencing & Structure Reconstruction": 0.41972832869160254,
+ "T3. Evidence-Grounded QA": 0.19166666666666668,
+ "T4. Summarization & Synthesis": 0.47495636922941775,
+ "T5. Attribution & Citation Alignment": 0.058968622858899346,
+ "T6. Aggregation & Clustering": 0.22320851430069694,
+ "T7. Consistency & Compliance Checking": 0.10913808800759871,
+ "T8. Structured & Numeric Reasoning": 0.08194444444444444,
+ "T9. Version & Code Diff Analysis": 0.19665231407803055,
+ "T10. Rule Induction & In-Context Learning": 0.21513888888888888,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.1
+ },
+ "language": {
+ "Chinese": 0.1489641004694211,
+ "English": 0.27624968007231393
+ }
+ },
+ "pass@3": 0.059333333333333335
+}
\ No newline at end of file
diff --git a/results/Llama-3.2-3B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Llama-3.2-3B-Instruct/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..55789f7fa33a798169bfbac5733132f51548c16e
--- /dev/null
+++ b/results/Llama-3.2-3B-Instruct/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.12579532534277046,
+ "inference_iteration_1_overall_metric": 0.12189144574406009,
+ "inference_iteration_2_overall_metric": 0.1293014670146224,
+ "inference_iteration_3_overall_metric": 0.12619306326962915,
+ "average_token_length_metric": {
+ "8k": 0.15520842809468816,
+ "16k": 0.1484137730398096,
+ "32k": 0.13566213672791996,
+ "64k": 0.10339908100452032,
+ "128k": 0.11523413736015264,
+ "256k": 0.09685439582953186
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.11497129858976732,
+ "Partial": 0.13957135939204723
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.14352977294197386,
+ "Moderate": 0.07165328990150331,
+ "Hard": 0.10476288088788434,
+ "Extreme": 0.15574400931361643
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.18027953901797078,
+ "T2. Sequencing & Structure Reconstruction": 0.2032463362205395,
+ "T3. Evidence-Grounded QA": 0.11944444444444445,
+ "T4. Summarization & Synthesis": 0.42455853122133613,
+ "T5. Attribution & Citation Alignment": 0.03812352880578544,
+ "T6. Aggregation & Clustering": 0.12499067187507265,
+ "T7. Consistency & Compliance Checking": 0.06067661860338902,
+ "T8. Structured & Numeric Reasoning": 0.029629629629629627,
+ "T9. Version & Code Diff Analysis": 0.09370491802352847,
+ "T10. Rule Induction & In-Context Learning": 0.09847222222222225,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666
+ },
+ "average_language_metric": {
+ "Chinese": 0.08710157372538106,
+ "English": 0.16448907696016007
+ },
+ "BoN-1": {
+ "overall_metric": 0.12189144574406009,
+ "token_length": {
+ "8k": 0.14926024363329887,
+ "16k": 0.14140607600695068,
+ "32k": 0.12814754734467546,
+ "64k": 0.09111044280283606,
+ "128k": 0.1221228193452808,
+ "256k": 0.09930154533131832
+ },
+ "contextual_requirement": {
+ "Full": 0.11202506506940242,
+ "Partial": 0.13444865751180604
+ },
+ "difficulty": {
+ "Easy": 0.14039643942766825,
+ "Moderate": 0.056507474186827354,
+ "Hard": 0.10349493747731114,
+ "Extreme": 0.15664860872958616
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.16389165825733903,
+ "T2. Sequencing & Structure Reconstruction": 0.1787449973585681,
+ "T3. Evidence-Grounded QA": 0.09166666666666666,
+ "T4. Summarization & Synthesis": 0.42251209889763974,
+ "T5. Attribution & Citation Alignment": 0.029073247426826414,
+ "T6. Aggregation & Clustering": 0.13197902480468204,
+ "T7. Consistency & Compliance Checking": 0.06122238755841022,
+ "T8. Structured & Numeric Reasoning": 0.044444444444444446,
+ "T9. Version & Code Diff Analysis": 0.08781339576018317,
+ "T10. Rule Induction & In-Context Learning": 0.10180555555555555,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666
+ },
+ "language": {
+ "Chinese": 0.07947872192042414,
+ "English": 0.16430416956769595
+ }
+ },
+ "pass@1": 0.03333333333333333,
+ "BoN-2": {
+ "overall_metric": 0.16673174742075972,
+ "token_length": {
+ "8k": 0.21235596929835865,
+ "16k": 0.19139324741892721,
+ "32k": 0.17847356237746767,
+ "64k": 0.1323473405869264,
+ "128k": 0.15654182190882684,
+ "256k": 0.12927854293405058
+ },
+ "contextual_requirement": {
+ "Full": 0.15641485621192497,
+ "Partial": 0.1798623362320038
+ },
+ "difficulty": {
+ "Easy": 0.20556729410789223,
+ "Moderate": 0.09843476269293053,
+ "Hard": 0.13991984386404396,
+ "Extreme": 0.1866584183549311
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.23716632921131517,
+ "T2. Sequencing & Structure Reconstruction": 0.283700678823595,
+ "T3. Evidence-Grounded QA": 0.18333333333333332,
+ "T4. Summarization & Synthesis": 0.44817949561782316,
+ "T5. Attribution & Citation Alignment": 0.07044117047262258,
+ "T6. Aggregation & Clustering": 0.1774625349060259,
+ "T7. Consistency & Compliance Checking": 0.0801161795021524,
+ "T8. Structured & Numeric Reasoning": 0.05,
+ "T9. Version & Code Diff Analysis": 0.14648554146631595,
+ "T10. Rule Induction & In-Context Learning": 0.14513888888888887,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.10833333333333334
+ },
+ "language": {
+ "Chinese": 0.11086445603583714,
+ "English": 0.22259903880568205
+ }
+ },
+ "pass@2": 0.048,
+ "BoN-3": {
+ "overall_metric": 0.19563257554771482,
+ "token_length": {
+ "8k": 0.2530151029327433,
+ "16k": 0.2145335526961687,
+ "32k": 0.21507949798738746,
+ "64k": 0.15928462128906565,
+ "128k": 0.18536894288047534,
+ "256k": 0.14651373550044794
+ },
+ "contextual_requirement": {
+ "Full": 0.18678441171314086,
+ "Partial": 0.20689387497353642
+ },
+ "difficulty": {
+ "Easy": 0.24756743178790308,
+ "Moderate": 0.1259095390094924,
+ "Hard": 0.1628721973860393,
+ "Extreme": 0.20605327132157794
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.3006781841143776,
+ "T2. Sequencing & Structure Reconstruction": 0.34291202176503843,
+ "T3. Evidence-Grounded QA": 0.20833333333333334,
+ "T4. Summarization & Synthesis": 0.46018159312540347,
+ "T5. Attribution & Citation Alignment": 0.0815602367002352,
+ "T6. Aggregation & Clustering": 0.20248617019109905,
+ "T7. Consistency & Compliance Checking": 0.08636604532829192,
+ "T8. Structured & Numeric Reasoning": 0.07777777777777778,
+ "T9. Version & Code Diff Analysis": 0.17054683536229345,
+ "T10. Rule Induction & In-Context Learning": 0.18958333333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
+ },
+ "language": {
+ "Chinese": 0.13358536850999,
+ "English": 0.25767978258543983
+ }
+ },
+ "pass@3": 0.06133333333333333
+}
\ No newline at end of file
diff --git a/results/Llama-3.3-70B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Llama-3.3-70B-Instruct/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..62ef2beeaf886aba533300cd870538dbf5265a5f
--- /dev/null
+++ b/results/Llama-3.3-70B-Instruct/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3189017909858673,
+ "inference_iteration_1_overall_metric": 0.3156125985423413,
+ "inference_iteration_2_overall_metric": 0.32123074093436993,
+ "inference_iteration_3_overall_metric": 0.31986203348089143,
+ "average_token_length_metric": {
+ "8k": 0.4593043661575621,
+ "16k": 0.4357005279195819,
+ "32k": 0.4042163423898818,
+ "64k": 0.33700892278371447,
+ "128k": 0.1425979596199625,
+ "256k": 0.13458262704450225
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.291318303255093,
+ "Partial": 0.35400804809776254
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.440396417724264,
+ "Moderate": 0.22606271262921054,
+ "Hard": 0.29071041097108385,
+ "Extreme": 0.2653049554891388
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.5698993177933688,
+ "T2. Sequencing & Structure Reconstruction": 0.5207791047484032,
+ "T3. Evidence-Grounded QA": 0.33055555555555555,
+ "T4. Summarization & Synthesis": 0.4889794661436827,
+ "T5. Attribution & Citation Alignment": 0.24622344831110268,
+ "T6. Aggregation & Clustering": 0.31517757482240366,
+ "T7. Consistency & Compliance Checking": 0.19554447151545906,
+ "T8. Structured & Numeric Reasoning": 0.11126543209876544,
+ "T9. Version & Code Diff Analysis": 0.3449561289681397,
+ "T10. Rule Induction & In-Context Learning": 0.3185648148148148,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334
+ },
+ "average_language_metric": {
+ "Chinese": 0.28656733073968105,
+ "English": 0.35123625123205404
+ },
+ "BoN-1": {
+ "overall_metric": 0.3156125985423413,
+ "token_length": {
+ "8k": 0.4598857003901339,
+ "16k": 0.43280589089049554,
+ "32k": 0.4031552748630893,
+ "64k": 0.32858688094254546,
+ "128k": 0.13499648051535806,
+ "256k": 0.1342453636524253
+ },
+ "contextual_requirement": {
+ "Full": 0.28665010278500924,
+ "Partial": 0.3524739567789461
+ },
+ "difficulty": {
+ "Easy": 0.4365832341037067,
+ "Moderate": 0.2165303315022285,
+ "Hard": 0.2928446211124342,
+ "Extreme": 0.26312822197701774
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5433299618590155,
+ "T2. Sequencing & Structure Reconstruction": 0.526070226070226,
+ "T3. Evidence-Grounded QA": 0.325,
+ "T4. Summarization & Synthesis": 0.49127697260080533,
+ "T5. Attribution & Citation Alignment": 0.24102740431192388,
+ "T6. Aggregation & Clustering": 0.31258403804363555,
+ "T7. Consistency & Compliance Checking": 0.20132235403207502,
+ "T8. Structured & Numeric Reasoning": 0.11481481481481483,
+ "T9. Version & Code Diff Analysis": 0.34245443993484,
+ "T10. Rule Induction & In-Context Learning": 0.30791666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.225
+ },
+ "language": {
+ "Chinese": 0.28075822912737913,
+ "English": 0.3504669679573037
+ }
+ },
+ "pass@1": 0.11266666666666666,
+ "BoN-2": {
+ "overall_metric": 0.34649569529069124,
+ "token_length": {
+ "8k": 0.47647116314268717,
+ "16k": 0.4687523820747641,
+ "32k": 0.4305852114495849,
+ "64k": 0.3618707734054544,
+ "128k": 0.17753523030724935,
+ "256k": 0.16375941136440833
+ },
+ "contextual_requirement": {
+ "Full": 0.3168861362630559,
+ "Partial": 0.3841805885985913
+ },
+ "difficulty": {
+ "Easy": 0.4708835305116446,
+ "Moderate": 0.2536880874829281,
+ "Hard": 0.31598267575873745,
+ "Extreme": 0.29123370602859766
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.613795052914502,
+ "T2. Sequencing & Structure Reconstruction": 0.572449494949495,
+ "T3. Evidence-Grounded QA": 0.35833333333333334,
+ "T4. Summarization & Synthesis": 0.5065243852070945,
+ "T5. Attribution & Citation Alignment": 0.26010872017700837,
+ "T6. Aggregation & Clustering": 0.3543572676296878,
+ "T7. Consistency & Compliance Checking": 0.2180957885923682,
+ "T8. Structured & Numeric Reasoning": 0.1287037037037037,
+ "T9. Version & Code Diff Analysis": 0.3871389535524576,
+ "T10. Rule Induction & In-Context Learning": 0.3311111111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
+ },
+ "language": {
+ "Chinese": 0.3170227577667401,
+ "English": 0.3759686328146428
+ }
+ },
+ "pass@2": 0.126,
+ "BoN-3": {
+ "overall_metric": 0.358435386899099,
+ "token_length": {
+ "8k": 0.48710556132647576,
+ "16k": 0.4795308481526823,
+ "32k": 0.4449722178774269,
+ "64k": 0.37149023172449164,
+ "128k": 0.19457817433016786,
+ "256k": 0.17293528798335014
+ },
+ "contextual_requirement": {
+ "Full": 0.33005566769629513,
+ "Partial": 0.39455502952084975
+ },
+ "difficulty": {
+ "Easy": 0.48736321739782795,
+ "Moderate": 0.2624427701465103,
+ "Hard": 0.3272030727162918,
+ "Extreme": 0.30076445676260405
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6366814621432787,
+ "T2. Sequencing & Structure Reconstruction": 0.5821049783549782,
+ "T3. Evidence-Grounded QA": 0.36666666666666664,
+ "T4. Summarization & Synthesis": 0.5152102198399457,
+ "T5. Attribution & Citation Alignment": 0.27428748176856066,
+ "T6. Aggregation & Clustering": 0.36977067074988085,
+ "T7. Consistency & Compliance Checking": 0.22440889882489728,
+ "T8. Structured & Numeric Reasoning": 0.1300925925925926,
+ "T9. Version & Code Diff Analysis": 0.39630550643647333,
+ "T10. Rule Induction & In-Context Learning": 0.37277777777777776,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
+ },
+ "language": {
+ "Chinese": 0.33234090111179226,
+ "English": 0.3845298726864061
+ }
+ },
+ "pass@3": 0.13466666666666666
+}
\ No newline at end of file
diff --git a/results/Llama-3.3-70B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Llama-3.3-70B-Instruct/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d5b8a496fc89154ed9b99552066db4e6375dc58
--- /dev/null
+++ b/results/Llama-3.3-70B-Instruct/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3368788983987977,
+ "inference_iteration_1_overall_metric": 0.3346445205602255,
+ "inference_iteration_2_overall_metric": 0.34105124981338825,
+ "inference_iteration_3_overall_metric": 0.3349409248227798,
+ "average_token_length_metric": {
+ "8k": 0.48257436937624887,
+ "16k": 0.4570891611420083,
+ "32k": 0.43164967032208246,
+ "64k": 0.37974005621997625,
+ "128k": 0.14494662029982153,
+ "256k": 0.1252735130326494
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3148605467957358,
+ "Partial": 0.3649022549845135
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5193942390521589,
+ "Moderate": 0.22606077851307846,
+ "Hard": 0.2859477654450278,
+ "Extreme": 0.2431814890253722
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.5716856431399633,
+ "T2. Sequencing & Structure Reconstruction": 0.5180708023900146,
+ "T3. Evidence-Grounded QA": 0.3222222222222222,
+ "T4. Summarization & Synthesis": 0.4675755720039648,
+ "T5. Attribution & Citation Alignment": 0.2472281333613823,
+ "T6. Aggregation & Clustering": 0.36222233277682875,
+ "T7. Consistency & Compliance Checking": 0.16955304749357702,
+ "T8. Structured & Numeric Reasoning": 0.2820987654320987,
+ "T9. Version & Code Diff Analysis": 0.34705004572107623,
+ "T10. Rule Induction & In-Context Learning": 0.31356481481481485,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2027777777777778
+ },
+ "average_language_metric": {
+ "Chinese": 0.28230462501190684,
+ "English": 0.39145317178568884
+ },
+ "BoN-1": {
+ "overall_metric": 0.3346445205602255,
+ "token_length": {
+ "8k": 0.4542953722813034,
+ "16k": 0.44790102117035924,
+ "32k": 0.4589206338089406,
+ "64k": 0.3903508423882444,
+ "128k": 0.1390327640597696,
+ "256k": 0.11736648965273745
+ },
+ "contextual_requirement": {
+ "Full": 0.3229561312874356,
+ "Partial": 0.34952065236195895
+ },
+ "difficulty": {
+ "Easy": 0.5259881421298087,
+ "Moderate": 0.21937379810223762,
+ "Hard": 0.272924266331999,
+ "Extreme": 0.24128717207335554
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5894507286480519,
+ "T2. Sequencing & Structure Reconstruction": 0.49666058774955946,
+ "T3. Evidence-Grounded QA": 0.31666666666666665,
+ "T4. Summarization & Synthesis": 0.4602747124060712,
+ "T5. Attribution & Citation Alignment": 0.2717574105274334,
+ "T6. Aggregation & Clustering": 0.34752011368589375,
+ "T7. Consistency & Compliance Checking": 0.15669004787219404,
+ "T8. Structured & Numeric Reasoning": 0.3101851851851852,
+ "T9. Version & Code Diff Analysis": 0.3494311586679074,
+ "T10. Rule Induction & In-Context Learning": 0.2688888888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
+ },
+ "language": {
+ "Chinese": 0.28273488892882487,
+ "English": 0.3865541521916267
+ }
+ },
+ "pass@1": 0.14066666666666666,
+ "BoN-2": {
+ "overall_metric": 0.3958348563743987,
+ "token_length": {
+ "8k": 0.5375237165063974,
+ "16k": 0.5279796876480686,
+ "32k": 0.5128361604334021,
+ "64k": 0.44897938061236586,
+ "128k": 0.18481857856663617,
+ "256k": 0.16287161447952125
+ },
+ "contextual_requirement": {
+ "Full": 0.37597657573399246,
+ "Partial": 0.4211090317349159
+ },
+ "difficulty": {
+ "Easy": 0.6024719774255488,
+ "Moderate": 0.2783382985783699,
+ "Hard": 0.3365596754926361,
+ "Extreme": 0.28558017485446086
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6560921957788185,
+ "T2. Sequencing & Structure Reconstruction": 0.590698368130755,
+ "T3. Evidence-Grounded QA": 0.38333333333333336,
+ "T4. Summarization & Synthesis": 0.4863977172263582,
+ "T5. Attribution & Citation Alignment": 0.29159793863898964,
+ "T6. Aggregation & Clustering": 0.424325611415665,
+ "T7. Consistency & Compliance Checking": 0.20595551459420022,
+ "T8. Structured & Numeric Reasoning": 0.3527777777777778,
+ "T9. Version & Code Diff Analysis": 0.3998111292235962,
+ "T10. Rule Induction & In-Context Learning": 0.42375,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
+ },
+ "language": {
+ "Chinese": 0.3369511703916512,
+ "English": 0.4547185423571466
+ }
+ },
+ "pass@2": 0.18533333333333332,
+ "BoN-3": {
+ "overall_metric": 0.4231932102533526,
+ "token_length": {
+ "8k": 0.5825111026223746,
+ "16k": 0.5470777230462098,
+ "32k": 0.5441468137547846,
+ "64k": 0.476981081390983,
+ "128k": 0.19851254776601635,
+ "256k": 0.189929992939747
+ },
+ "contextual_requirement": {
+ "Full": 0.4008399164475989,
+ "Partial": 0.4516428569152216
+ },
+ "difficulty": {
+ "Easy": 0.6383633989735824,
+ "Moderate": 0.29622937081220263,
+ "Hard": 0.3631324161592812,
+ "Extreme": 0.31032522872728074
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6823167917941978,
+ "T2. Sequencing & Structure Reconstruction": 0.6268177888335086,
+ "T3. Evidence-Grounded QA": 0.38333333333333336,
+ "T4. Summarization & Synthesis": 0.49909323423974894,
+ "T5. Attribution & Citation Alignment": 0.30308485751721276,
+ "T6. Aggregation & Clustering": 0.4702913178424183,
+ "T7. Consistency & Compliance Checking": 0.22495072081956943,
+ "T8. Structured & Numeric Reasoning": 0.38055555555555554,
+ "T9. Version & Code Diff Analysis": 0.4307393977892574,
+ "T10. Rule Induction & In-Context Learning": 0.47583333333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.275
+ },
+ "language": {
+ "Chinese": 0.36349427675262524,
+ "English": 0.48289214375408057
+ }
+ },
+ "pass@3": 0.20866666666666667
+}
\ No newline at end of file
diff --git a/results/Magistral-Small-2509/thinking_context-120000_bon-3_summary.json b/results/Magistral-Small-2509/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..e75935e36612f86d6bb7a6e93be99e5989405b6f
--- /dev/null
+++ b/results/Magistral-Small-2509/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.38398116897357343,
+ "inference_iteration_1_overall_metric": 0.3775125226933459,
+ "inference_iteration_2_overall_metric": 0.3891788972494985,
+ "inference_iteration_3_overall_metric": 0.38525208697787594,
+ "average_token_length_metric": {
+ "8k": 0.5449912961150511,
+ "16k": 0.47444243155128724,
+ "32k": 0.4264163670766384,
+ "64k": 0.3029575907045873,
+ "128k": 0.302750216644183,
+ "256k": 0.2523291117496938
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3475865533347475,
+ "Partial": 0.4303015888775337
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5425168562606535,
+ "Moderate": 0.2943739375717643,
+ "Hard": 0.3291735952793175,
+ "Extreme": 0.30516679942900543
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6519835369865774,
+ "T2. Sequencing & Structure Reconstruction": 0.6259667241346109,
+ "T3. Evidence-Grounded QA": 0.438888888888889,
+ "T4. Summarization & Synthesis": 0.5181402784690045,
+ "T5. Attribution & Citation Alignment": 0.2964646210831104,
+ "T6. Aggregation & Clustering": 0.3430731095683128,
+ "T7. Consistency & Compliance Checking": 0.1940507194662843,
+ "T8. Structured & Numeric Reasoning": 0.23070987654320987,
+ "T9. Version & Code Diff Analysis": 0.40948667090743257,
+ "T10. Rule Induction & In-Context Learning": 0.4654166666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2416666666666666
+ },
+ "average_language_metric": {
+ "Chinese": 0.3639911930284364,
+ "English": 0.40397114491871056
+ },
+ "BoN-1": {
+ "overall_metric": 0.3775125226933459,
+ "token_length": {
+ "8k": 0.5435818039060978,
+ "16k": 0.4750140116790471,
+ "32k": 0.40129268121677775,
+ "64k": 0.28670320465634697,
+ "128k": 0.3074867892397562,
+ "256k": 0.25099664546205136
+ },
+ "contextual_requirement": {
+ "Full": 0.33866755109330426,
+ "Partial": 0.42695157745703605
+ },
+ "difficulty": {
+ "Easy": 0.5415570936201182,
+ "Moderate": 0.2821803430686346,
+ "Hard": 0.32058466511584804,
+ "Extreme": 0.29781631261319746
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6532513121925126,
+ "T2. Sequencing & Structure Reconstruction": 0.6111094803277807,
+ "T3. Evidence-Grounded QA": 0.425,
+ "T4. Summarization & Synthesis": 0.5176378460686947,
+ "T5. Attribution & Citation Alignment": 0.2913492653600891,
+ "T6. Aggregation & Clustering": 0.33400917389507784,
+ "T7. Consistency & Compliance Checking": 0.1887200677483396,
+ "T8. Structured & Numeric Reasoning": 0.22129629629629627,
+ "T9. Version & Code Diff Analysis": 0.41785365614151276,
+ "T10. Rule Induction & In-Context Learning": 0.445,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
+ },
+ "language": {
+ "Chinese": 0.34935070830064097,
+ "English": 0.4056743370860514
+ }
+ },
+ "pass@1": 0.15666666666666668,
+ "BoN-2": {
+ "overall_metric": 0.4410818050309274,
+ "token_length": {
+ "8k": 0.6043547908072043,
+ "16k": 0.5353872646463821,
+ "32k": 0.50503501278225,
+ "64k": 0.35415051008140713,
+ "128k": 0.3590897028976117,
+ "256k": 0.28847354897071054
+ },
+ "contextual_requirement": {
+ "Full": 0.4052447124894285,
+ "Partial": 0.48669265008374507
+ },
+ "difficulty": {
+ "Easy": 0.6224120794085322,
+ "Moderate": 0.35760055550726977,
+ "Hard": 0.3739866973836827,
+ "Extreme": 0.3413440208772583
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7183527826830406,
+ "T2. Sequencing & Structure Reconstruction": 0.6865933482781307,
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
+ "T4. Summarization & Synthesis": 0.5317396047677334,
+ "T5. Attribution & Citation Alignment": 0.34712734561624925,
+ "T6. Aggregation & Clustering": 0.4084056110628732,
+ "T7. Consistency & Compliance Checking": 0.2396263809618489,
+ "T8. Structured & Numeric Reasoning": 0.28148148148148144,
+ "T9. Version & Code Diff Analysis": 0.4526614935043571,
+ "T10. Rule Induction & In-Context Learning": 0.5494444444444445,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3
+ },
+ "language": {
+ "Chinese": 0.4165337844683723,
+ "English": 0.4656298255934835
+ }
+ },
+ "pass@2": 0.202,
+ "BoN-3": {
+ "overall_metric": 0.47447373045805247,
+ "token_length": {
+ "8k": 0.6472581172783606,
+ "16k": 0.572013026736313,
+ "32k": 0.5323134358536711,
+ "64k": 0.38848449233451177,
+ "128k": 0.38802278615487296,
+ "256k": 0.31875052439058804
+ },
+ "contextual_requirement": {
+ "Full": 0.43884732486807027,
+ "Partial": 0.5198164284816681
+ },
+ "difficulty": {
+ "Easy": 0.6584366917924486,
+ "Moderate": 0.38635877266139323,
+ "Hard": 0.40375289537189885,
+ "Extreme": 0.3772769049579495
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7364126543991428,
+ "T2. Sequencing & Structure Reconstruction": 0.718547042731825,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5424760798885467,
+ "T5. Attribution & Citation Alignment": 0.3887278739210803,
+ "T6. Aggregation & Clustering": 0.45972240993615016,
+ "T7. Consistency & Compliance Checking": 0.258469847158553,
+ "T8. Structured & Numeric Reasoning": 0.32592592592592595,
+ "T9. Version & Code Diff Analysis": 0.48746959414301055,
+ "T10. Rule Induction & In-Context Learning": 0.5911111111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333
+ },
+ "language": {
+ "Chinese": 0.45726250895870557,
+ "English": 0.4916849519574011
+ }
+ },
+ "pass@3": 0.22866666666666666
+}
\ No newline at end of file
diff --git a/results/MiniMax-M2/thinking_context-1000000_bon-3_summary.json b/results/MiniMax-M2/thinking_context-1000000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c813b1a43cd4c125b6b414abea6d24710902e82
--- /dev/null
+++ b/results/MiniMax-M2/thinking_context-1000000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5320685707653132,
+ "inference_iteration_1_overall_metric": 0.535180398833494,
+ "inference_iteration_2_overall_metric": 0.5311849506804371,
+ "inference_iteration_3_overall_metric": 0.5298403627820072,
+ "average_token_length_metric": {
+ "8k": 0.654795970947119,
+ "16k": 0.5832041701523042,
+ "32k": 0.5830505446766833,
+ "64k": 0.5201561955794758,
+ "128k": 0.5060838591020447,
+ "256k": 0.3451206841342513
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.4938467607068266,
+ "Partial": 0.5807145108397509
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7219874781362817,
+ "Moderate": 0.599199335465557,
+ "Hard": 0.4257653962693645,
+ "Extreme": 0.34975019139747615
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.767571983047427,
+ "T2. Sequencing & Structure Reconstruction": 0.7186696477094124,
+ "T3. Evidence-Grounded QA": 0.4972222222222222,
+ "T4. Summarization & Synthesis": 0.4696599254603241,
+ "T5. Attribution & Citation Alignment": 0.54344042963745,
+ "T6. Aggregation & Clustering": 0.5123089198769455,
+ "T7. Consistency & Compliance Checking": 0.31381086481875964,
+ "T8. Structured & Numeric Reasoning": 0.6038580246913581,
+ "T9. Version & Code Diff Analysis": 0.5619188050015754,
+ "T10. Rule Induction & In-Context Learning": 0.5529629629629632,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.39444444444444454
+ },
+ "average_language_metric": {
+ "Chinese": 0.5354797348772966,
+ "English": 0.5286574066533296
+ },
+ "BoN-1": {
+ "overall_metric": 0.535180398833494,
+ "token_length": {
+ "8k": 0.6571274187960493,
+ "16k": 0.5855452098864022,
+ "32k": 0.6094638772285274,
+ "64k": 0.5094373867375244,
+ "128k": 0.5028727484199556,
+ "256k": 0.34663575193250185
+ },
+ "contextual_requirement": {
+ "Full": 0.4983496461876354,
+ "Partial": 0.5820559022009494
+ },
+ "difficulty": {
+ "Easy": 0.7348029222520711,
+ "Moderate": 0.6076522249303262,
+ "Hard": 0.4165082385065274,
+ "Extreme": 0.3468482177079349
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7483964432006224,
+ "T2. Sequencing & Structure Reconstruction": 0.6994320017261199,
+ "T3. Evidence-Grounded QA": 0.5,
+ "T4. Summarization & Synthesis": 0.46659438196842223,
+ "T5. Attribution & Citation Alignment": 0.5466093432829364,
+ "T6. Aggregation & Clustering": 0.5244645023077399,
+ "T7. Consistency & Compliance Checking": 0.32026132009110975,
+ "T8. Structured & Numeric Reasoning": 0.6097222222222223,
+ "T9. Version & Code Diff Analysis": 0.5581618594200692,
+ "T10. Rule Induction & In-Context Learning": 0.5638888888888888,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
+ },
+ "language": {
+ "Chinese": 0.5363273796225,
+ "English": 0.5340334180444871
+ }
+ },
+ "pass@1": 0.30133333333333334,
+ "BoN-2": {
+ "overall_metric": 0.631079137318095,
+ "token_length": {
+ "8k": 0.7607802022833232,
+ "16k": 0.6922523863155777,
+ "32k": 0.696226877019834,
+ "64k": 0.6061494664574102,
+ "128k": 0.5803070617163435,
+ "256k": 0.45075883011608214
+ },
+ "contextual_requirement": {
+ "Full": 0.5863061857618213,
+ "Partial": 0.6880628938442622
+ },
+ "difficulty": {
+ "Easy": 0.8429412379045521,
+ "Moderate": 0.7260325873797594,
+ "Hard": 0.5037591992235261,
+ "Extreme": 0.42025273404272534
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.854967268320159,
+ "T2. Sequencing & Structure Reconstruction": 0.7950413527388575,
+ "T3. Evidence-Grounded QA": 0.6333333333333333,
+ "T4. Summarization & Synthesis": 0.48790929716965253,
+ "T5. Attribution & Citation Alignment": 0.6628232709674524,
+ "T6. Aggregation & Clustering": 0.6070531962911038,
+ "T7. Consistency & Compliance Checking": 0.4273910542891768,
+ "T8. Structured & Numeric Reasoning": 0.6976851851851852,
+ "T9. Version & Code Diff Analysis": 0.6589983180763119,
+ "T10. Rule Induction & In-Context Learning": 0.663888888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
+ },
+ "language": {
+ "Chinese": 0.6243713115466586,
+ "English": 0.6377869630895318
+ }
+ },
+ "pass@2": 0.38133333333333336,
+ "BoN-3": {
+ "overall_metric": 0.6838483190204042,
+ "token_length": {
+ "8k": 0.8056053063071357,
+ "16k": 0.754121676530954,
+ "32k": 0.7309373525434467,
+ "64k": 0.6802921620132278,
+ "128k": 0.6428076963616377,
+ "256k": 0.48932572036602695
+ },
+ "contextual_requirement": {
+ "Full": 0.6418563038987366,
+ "Partial": 0.7372927019025284
+ },
+ "difficulty": {
+ "Easy": 0.8932565923719491,
+ "Moderate": 0.7958991372015439,
+ "Hard": 0.5571346872299623,
+ "Extreme": 0.46408187669686923
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8856810957992655,
+ "T2. Sequencing & Structure Reconstruction": 0.8257039309014353,
+ "T3. Evidence-Grounded QA": 0.7083333333333334,
+ "T4. Summarization & Synthesis": 0.5020042367803014,
+ "T5. Attribution & Citation Alignment": 0.729994986816228,
+ "T6. Aggregation & Clustering": 0.6651081533166491,
+ "T7. Consistency & Compliance Checking": 0.49051401515979876,
+ "T8. Structured & Numeric Reasoning": 0.7680555555555556,
+ "T9. Version & Code Diff Analysis": 0.7036475958542688,
+ "T10. Rule Induction & In-Context Learning": 0.7072222222222223,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
+ },
+ "language": {
+ "Chinese": 0.6727473750313149,
+ "English": 0.694949263009495
+ }
+ },
+ "pass@3": 0.444
+}
\ No newline at end of file
diff --git a/results/MiniMax-Text-01/nonthinking_context-1000000_bon-3_summary.json b/results/MiniMax-Text-01/nonthinking_context-1000000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..f73a7335187aa418a6d75eeed9e1ea7dd3d2f56e
--- /dev/null
+++ b/results/MiniMax-Text-01/nonthinking_context-1000000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.4113523778378889,
+ "inference_iteration_1_overall_metric": 0.4026546189679395,
+ "inference_iteration_2_overall_metric": 0.41422198000018023,
+ "inference_iteration_3_overall_metric": 0.41718053454554826,
+ "average_token_length_metric": {
+ "8k": 0.45750122785552744,
+ "16k": 0.40648581074103435,
+ "32k": 0.41953181726499883,
+ "64k": 0.3963813527019971,
+ "128k": 0.41323756281622565,
+ "256k": 0.3749764956475515
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.37732447212646125,
+ "Partial": 0.45466062147061553
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5125950929989945,
+ "Moderate": 0.38228847113922254,
+ "Hard": 0.3867421547849868,
+ "Extreme": 0.33569972963459577
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7019344870456294,
+ "T2. Sequencing & Structure Reconstruction": 0.6935557265385518,
+ "T3. Evidence-Grounded QA": 0.5000000000000001,
+ "T4. Summarization & Synthesis": 0.525289467915154,
+ "T5. Attribution & Citation Alignment": 0.40960389859884994,
+ "T6. Aggregation & Clustering": 0.3855189408594916,
+ "T7. Consistency & Compliance Checking": 0.2570183735053335,
+ "T8. Structured & Numeric Reasoning": 0.16126543209876543,
+ "T9. Version & Code Diff Analysis": 0.3763262824393013,
+ "T10. Rule Induction & In-Context Learning": 0.3850462962962962,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.34444444444444444
+ },
+ "average_language_metric": {
+ "Chinese": 0.4206201869029405,
+ "English": 0.40208456877283766
+ },
+ "BoN-1": {
+ "overall_metric": 0.4026546189679395,
+ "token_length": {
+ "8k": 0.4484815946744958,
+ "16k": 0.40023341947584756,
+ "32k": 0.39365195091822286,
+ "64k": 0.4050265329266902,
+ "128k": 0.40626760527764794,
+ "256k": 0.3622666105347326
+ },
+ "contextual_requirement": {
+ "Full": 0.3618050193216267,
+ "Partial": 0.45464501851779227
+ },
+ "difficulty": {
+ "Easy": 0.49189201078601713,
+ "Moderate": 0.3847984308236515,
+ "Hard": 0.39113612973801154,
+ "Extreme": 0.32419293466074633
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7071317552591362,
+ "T2. Sequencing & Structure Reconstruction": 0.692217342415818,
+ "T3. Evidence-Grounded QA": 0.475,
+ "T4. Summarization & Synthesis": 0.5252872492452957,
+ "T5. Attribution & Citation Alignment": 0.3965042482839467,
+ "T6. Aggregation & Clustering": 0.38900319686695384,
+ "T7. Consistency & Compliance Checking": 0.24881818692821855,
+ "T8. Structured & Numeric Reasoning": 0.1462962962962963,
+ "T9. Version & Code Diff Analysis": 0.35572673286895423,
+ "T10. Rule Induction & In-Context Learning": 0.36347222222222214,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
+ },
+ "language": {
+ "Chinese": 0.4101250015901318,
+ "English": 0.3951842363457473
+ }
+ },
+ "pass@1": 0.15533333333333332,
+ "BoN-2": {
+ "overall_metric": 0.4807038949944852,
+ "token_length": {
+ "8k": 0.5303281501019884,
+ "16k": 0.4819497908714715,
+ "32k": 0.47954691765928337,
+ "64k": 0.48083012165065453,
+ "128k": 0.465293133114307,
+ "256k": 0.44627525656921
+ },
+ "contextual_requirement": {
+ "Full": 0.44656912126099607,
+ "Partial": 0.5241481524734732
+ },
+ "difficulty": {
+ "Easy": 0.596920990471646,
+ "Moderate": 0.4603818463137054,
+ "Hard": 0.4590470067460482,
+ "Extreme": 0.3809658785230146
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7449388025193358,
+ "T2. Sequencing & Structure Reconstruction": 0.7399444536944532,
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
+ "T4. Summarization & Synthesis": 0.54417036696111,
+ "T5. Attribution & Citation Alignment": 0.5088222013004289,
+ "T6. Aggregation & Clustering": 0.4705462063266301,
+ "T7. Consistency & Compliance Checking": 0.3211976903039678,
+ "T8. Structured & Numeric Reasoning": 0.2083333333333333,
+ "T9. Version & Code Diff Analysis": 0.4528903513431796,
+ "T10. Rule Induction & In-Context Learning": 0.45958333333333334,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665
+ },
+ "language": {
+ "Chinese": 0.48759226495133245,
+ "English": 0.47381552503763963
+ }
+ },
+ "pass@2": 0.206,
+ "BoN-3": {
+ "overall_metric": 0.5286875532565248,
+ "token_length": {
+ "8k": 0.5760868208214227,
+ "16k": 0.5315447995369911,
+ "32k": 0.5297979513353553,
+ "64k": 0.5139951126923608,
+ "128k": 0.5285145377275431,
+ "256k": 0.49218609742548064
+ },
+ "contextual_requirement": {
+ "Full": 0.4922563052647642,
+ "Partial": 0.5750545961551319
+ },
+ "difficulty": {
+ "Easy": 0.6571166886876132,
+ "Moderate": 0.5008690293131257,
+ "Hard": 0.5070400734318661,
+ "Extreme": 0.42048944373649116
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7851733453547192,
+ "T2. Sequencing & Structure Reconstruction": 0.7758457283457282,
+ "T3. Evidence-Grounded QA": 0.7083333333333334,
+ "T4. Summarization & Synthesis": 0.5534698517064113,
+ "T5. Attribution & Citation Alignment": 0.5639720868179612,
+ "T6. Aggregation & Clustering": 0.503918026189678,
+ "T7. Consistency & Compliance Checking": 0.34945026972752397,
+ "T8. Structured & Numeric Reasoning": 0.25277777777777777,
+ "T9. Version & Code Diff Analysis": 0.5104976262726122,
+ "T10. Rule Induction & In-Context Learning": 0.5270833333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
+ },
+ "language": {
+ "Chinese": 0.5330279709661675,
+ "English": 0.5243471355468845
+ }
+ },
+ "pass@3": 0.24266666666666667
+}
\ No newline at end of file
diff --git a/results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json b/results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..984f604d36398edfbdb83e4dc1374752d6d0b1fd
--- /dev/null
+++ b/results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.4499528005964066,
+ "inference_iteration_1_overall_metric": 0.4519835462001885,
+ "inference_iteration_2_overall_metric": 0.4481755772504262,
+ "inference_iteration_3_overall_metric": 0.4496992783386054,
+ "average_token_length_metric": {
+ "8k": 0.485225729559654,
+ "16k": 0.4524723240855649,
+ "32k": 0.46920448352940436,
+ "64k": 0.44046374240515457,
+ "128k": 0.4133092627171987,
+ "256k": 0.43904126128146514
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.4116545212336913,
+ "Partial": 0.49869606523986354
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6191934548978654,
+ "Moderate": 0.4082147550465631,
+ "Hard": 0.3801988071084879,
+ "Extreme": 0.33778735493415807
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6830330861399296,
+ "T2. Sequencing & Structure Reconstruction": 0.6403219944448011,
+ "T3. Evidence-Grounded QA": 0.4833333333333333,
+ "T4. Summarization & Synthesis": 0.5086176566073063,
+ "T5. Attribution & Citation Alignment": 0.416914270509611,
+ "T6. Aggregation & Clustering": 0.4334853794839026,
+ "T7. Consistency & Compliance Checking": 0.27119391146489646,
+ "T8. Structured & Numeric Reasoning": 0.38966049382716056,
+ "T9. Version & Code Diff Analysis": 0.4348929522191275,
+ "T10. Rule Induction & In-Context Learning": 0.41300925925925924,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4027777777777778
+ },
+ "average_language_metric": {
+ "Chinese": 0.45819903421860664,
+ "English": 0.4417065669742075
+ },
+ "BoN-1": {
+ "overall_metric": 0.4519835462001885,
+ "token_length": {
+ "8k": 0.4879779124929164,
+ "16k": 0.4554840853531918,
+ "32k": 0.4648286187996774,
+ "64k": 0.42985632449506034,
+ "128k": 0.4307020670264534,
+ "256k": 0.443052269033835
+ },
+ "contextual_requirement": {
+ "Full": 0.41228070711895354,
+ "Partial": 0.5025144323035801
+ },
+ "difficulty": {
+ "Easy": 0.6285595261886431,
+ "Moderate": 0.4057015689049336,
+ "Hard": 0.37791019658117175,
+ "Extreme": 0.33760415329971205
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6904671153390595,
+ "T2. Sequencing & Structure Reconstruction": 0.6319390331890332,
+ "T3. Evidence-Grounded QA": 0.44166666666666665,
+ "T4. Summarization & Synthesis": 0.5079368349605524,
+ "T5. Attribution & Citation Alignment": 0.3963567606333699,
+ "T6. Aggregation & Clustering": 0.4315669444489273,
+ "T7. Consistency & Compliance Checking": 0.26717481095169254,
+ "T8. Structured & Numeric Reasoning": 0.40648148148148144,
+ "T9. Version & Code Diff Analysis": 0.4533152836127507,
+ "T10. Rule Induction & In-Context Learning": 0.4119444444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4583333333333333
+ },
+ "language": {
+ "Chinese": 0.47043321599568005,
+ "English": 0.43353387640469826
+ }
+ },
+ "pass@1": 0.21,
+ "BoN-2": {
+ "overall_metric": 0.5523435379453717,
+ "token_length": {
+ "8k": 0.6041368153338821,
+ "16k": 0.553143416205592,
+ "32k": 0.5547357356840433,
+ "64k": 0.5474714891955119,
+ "128k": 0.5080001305944092,
+ "256k": 0.5465736406587951
+ },
+ "contextual_requirement": {
+ "Full": 0.5094377873914124,
+ "Partial": 0.6069508568322307
+ },
+ "difficulty": {
+ "Easy": 0.7582302423860908,
+ "Moderate": 0.5069058318579235,
+ "Hard": 0.47636905813527697,
+ "Extreme": 0.40654974290892704
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7753422134076285,
+ "T2. Sequencing & Structure Reconstruction": 0.7188864376364378,
+ "T3. Evidence-Grounded QA": 0.6083333333333333,
+ "T4. Summarization & Synthesis": 0.5276281423571613,
+ "T5. Attribution & Citation Alignment": 0.573374443874177,
+ "T6. Aggregation & Clustering": 0.5278895685136558,
+ "T7. Consistency & Compliance Checking": 0.35338346649949204,
+ "T8. Structured & Numeric Reasoning": 0.5027777777777779,
+ "T9. Version & Code Diff Analysis": 0.552570101188694,
+ "T10. Rule Induction & In-Context Learning": 0.5220833333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.55
+ },
+ "language": {
+ "Chinese": 0.5649919855932303,
+ "English": 0.5396950902975145
+ }
+ },
+ "pass@2": 0.2753333333333333,
+ "BoN-3": {
+ "overall_metric": 0.5997056103547938,
+ "token_length": {
+ "8k": 0.6457585156659336,
+ "16k": 0.6123141997231359,
+ "32k": 0.6242961953070552,
+ "64k": 0.5876928890236057,
+ "128k": 0.5497742714361217,
+ "256k": 0.5783975909729129
+ },
+ "contextual_requirement": {
+ "Full": 0.5540426758661396,
+ "Partial": 0.6578220724312633
+ },
+ "difficulty": {
+ "Easy": 0.8056166900447767,
+ "Moderate": 0.5614066990728871,
+ "Hard": 0.5178805893116146,
+ "Extreme": 0.45303896497156343
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8083334868935526,
+ "T2. Sequencing & Structure Reconstruction": 0.7593085155585156,
+ "T3. Evidence-Grounded QA": 0.6833333333333333,
+ "T4. Summarization & Synthesis": 0.5344316475303361,
+ "T5. Attribution & Citation Alignment": 0.6383957562170883,
+ "T6. Aggregation & Clustering": 0.5743997782942697,
+ "T7. Consistency & Compliance Checking": 0.39861351698347697,
+ "T8. Structured & Numeric Reasoning": 0.5527777777777778,
+ "T9. Version & Code Diff Analysis": 0.5853585580965909,
+ "T10. Rule Induction & In-Context Learning": 0.5984722222222222,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
+ },
+ "language": {
+ "Chinese": 0.6144058958264887,
+ "English": 0.5850053248830991
+ }
+ },
+ "pass@3": 0.31933333333333336
+}
\ No newline at end of file
diff --git a/results/Ministral-3-14B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json b/results/Ministral-3-14B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..034d947740b5b771402360bda5c3d57661ebc5a3
--- /dev/null
+++ b/results/Ministral-3-14B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.40137741846169367,
+ "inference_iteration_1_overall_metric": 0.40238223426918757,
+ "inference_iteration_2_overall_metric": 0.4002952301959775,
+ "inference_iteration_3_overall_metric": 0.4014547909199151,
+ "average_token_length_metric": {
+ "8k": 0.43375517745982745,
+ "16k": 0.4624624276537502,
+ "32k": 0.4225675952668474,
+ "64k": 0.39702430034744873,
+ "128k": 0.3821065286583368,
+ "256k": 0.3103484813839513
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3639638275660222,
+ "Partial": 0.4489947159652748
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5260224045368944,
+ "Moderate": 0.3401707697729334,
+ "Hard": 0.35036961025265867,
+ "Extreme": 0.33853899745082133
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7189334347722878,
+ "T2. Sequencing & Structure Reconstruction": 0.6668259209925876,
+ "T3. Evidence-Grounded QA": 0.4583333333333333,
+ "T4. Summarization & Synthesis": 0.5221962563770883,
+ "T5. Attribution & Citation Alignment": 0.3136369167899536,
+ "T6. Aggregation & Clustering": 0.39798286937745314,
+ "T7. Consistency & Compliance Checking": 0.21946000799150195,
+ "T8. Structured & Numeric Reasoning": 0.14151234567901233,
+ "T9. Version & Code Diff Analysis": 0.47709977467470854,
+ "T10. Rule Induction & In-Context Learning": 0.4217592592592592,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.30000000000000004
+ },
+ "average_language_metric": {
+ "Chinese": 0.40560935669062276,
+ "English": 0.397145480232764
+ },
+ "BoN-1": {
+ "overall_metric": 0.40238223426918757,
+ "token_length": {
+ "8k": 0.433884589163934,
+ "16k": 0.46516909415802493,
+ "32k": 0.42676245820656594,
+ "64k": 0.3945954502711027,
+ "128k": 0.38087905781855047,
+ "256k": 0.31300275599695
+ },
+ "contextual_requirement": {
+ "Full": 0.36618501990567515,
+ "Partial": 0.44845141618638706
+ },
+ "difficulty": {
+ "Easy": 0.5313007708587812,
+ "Moderate": 0.33905574344991224,
+ "Hard": 0.35279460774167126,
+ "Extreme": 0.33532188262609514
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7231148921645373,
+ "T2. Sequencing & Structure Reconstruction": 0.6661998186998181,
+ "T3. Evidence-Grounded QA": 0.4666666666666667,
+ "T4. Summarization & Synthesis": 0.5230622050479962,
+ "T5. Attribution & Citation Alignment": 0.30176586689960616,
+ "T6. Aggregation & Clustering": 0.3921100544588917,
+ "T7. Consistency & Compliance Checking": 0.219946553696774,
+ "T8. Structured & Numeric Reasoning": 0.14675925925925926,
+ "T9. Version & Code Diff Analysis": 0.4747724555416158,
+ "T10. Rule Induction & In-Context Learning": 0.4193055555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
+ },
+ "language": {
+ "Chinese": 0.40159850236109407,
+ "English": 0.4031659661772829
+ }
+ },
+ "pass@1": 0.158,
+ "BoN-2": {
+ "overall_metric": 0.4167918691419028,
+ "token_length": {
+ "8k": 0.4448864222174602,
+ "16k": 0.481483520242527,
+ "32k": 0.43929332248789354,
+ "64k": 0.4125546400739453,
+ "128k": 0.39499054247940135,
+ "256k": 0.3275427673501904
+ },
+ "contextual_requirement": {
+ "Full": 0.37697160991132156,
+ "Partial": 0.4674721990717337
+ },
+ "difficulty": {
+ "Easy": 0.543547780022054,
+ "Moderate": 0.3573487107807427,
+ "Hard": 0.3645996932831712,
+ "Extreme": 0.3512606476539851
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7338149078373205,
+ "T2. Sequencing & Structure Reconstruction": 0.6898572261072259,
+ "T3. Evidence-Grounded QA": 0.475,
+ "T4. Summarization & Synthesis": 0.5324649457390374,
+ "T5. Attribution & Citation Alignment": 0.3305062690008576,
+ "T6. Aggregation & Clustering": 0.41269190069190054,
+ "T7. Consistency & Compliance Checking": 0.24063695524418569,
+ "T8. Structured & Numeric Reasoning": 0.14953703703703705,
+ "T9. Version & Code Diff Analysis": 0.4968728427963255,
+ "T10. Rule Induction & In-Context Learning": 0.43041666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
+ },
+ "language": {
+ "Chinese": 0.42183857405387454,
+ "English": 0.4117451642299312
+ }
+ },
+ "pass@2": 0.164,
+ "BoN-3": {
+ "overall_metric": 0.42795282912937055,
+ "token_length": {
+ "8k": 0.45584907332800734,
+ "16k": 0.4930808335792047,
+ "32k": 0.45565921785915897,
+ "64k": 0.41883977659018873,
+ "128k": 0.4041598036532658,
+ "256k": 0.34012826976639843
+ },
+ "contextual_requirement": {
+ "Full": 0.3885912574254956,
+ "Partial": 0.47804937493430194
+ },
+ "difficulty": {
+ "Easy": 0.554784793655902,
+ "Moderate": 0.37124888788597155,
+ "Hard": 0.37318051863597795,
+ "Extreme": 0.36223380606151817
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7412124373820532,
+ "T2. Sequencing & Structure Reconstruction": 0.6988453213453213,
+ "T3. Evidence-Grounded QA": 0.48333333333333334,
+ "T4. Summarization & Synthesis": 0.538030953852775,
+ "T5. Attribution & Citation Alignment": 0.34888079906957237,
+ "T6. Aggregation & Clustering": 0.4339707150850415,
+ "T7. Consistency & Compliance Checking": 0.24529018187024293,
+ "T8. Structured & Numeric Reasoning": 0.15046296296296297,
+ "T9. Version & Code Diff Analysis": 0.5141050625900392,
+ "T10. Rule Induction & In-Context Learning": 0.44708333333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333
+ },
+ "language": {
+ "Chinese": 0.43352032726356865,
+ "English": 0.42238533099517206
+ }
+ },
+ "pass@3": 0.16933333333333334
+}
\ No newline at end of file
diff --git a/results/Ministral-3-14B-Instruct-2512/thinking_context-224000_bon-3_summary.json b/results/Ministral-3-14B-Instruct-2512/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..d11436d053696ca6931612d429ddedbfb43c15e3
--- /dev/null
+++ b/results/Ministral-3-14B-Instruct-2512/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.45799174186842606,
+ "inference_iteration_1_overall_metric": 0.45848259675544495,
+ "inference_iteration_2_overall_metric": 0.46152550901732137,
+ "inference_iteration_3_overall_metric": 0.453967119832514,
+ "average_token_length_metric": {
+ "8k": 0.5187827600069922,
+ "16k": 0.48518599796025474,
+ "32k": 0.48745678732020276,
+ "64k": 0.4570416898883375,
+ "128k": 0.42361656559174016,
+ "256k": 0.3758666504430352
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.4248803053301027,
+ "Partial": 0.500133570189931
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6756101555622144,
+ "Moderate": 0.39349242818800056,
+ "Hard": 0.37479147914388156,
+ "Extreme": 0.31661242864258926
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7608265794743194,
+ "T2. Sequencing & Structure Reconstruction": 0.6976467766457445,
+ "T3. Evidence-Grounded QA": 0.4138888888888888,
+ "T4. Summarization & Synthesis": 0.49943611989270814,
+ "T5. Attribution & Citation Alignment": 0.3460922902510361,
+ "T6. Aggregation & Clustering": 0.45809776319010986,
+ "T7. Consistency & Compliance Checking": 0.2280021697407482,
+ "T8. Structured & Numeric Reasoning": 0.41836419753086435,
+ "T9. Version & Code Diff Analysis": 0.5354951076952419,
+ "T10. Rule Induction & In-Context Learning": 0.462037037037037,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35277777777777775
+ },
+ "average_language_metric": {
+ "Chinese": 0.43851283791556633,
+ "English": 0.4774706458212879
+ },
+ "BoN-1": {
+ "overall_metric": 0.45848259675544495,
+ "token_length": {
+ "8k": 0.538436412123529,
+ "16k": 0.47816429220213824,
+ "32k": 0.4805712657434,
+ "64k": 0.44911476875649875,
+ "128k": 0.4390480988508506,
+ "256k": 0.3655607428562557
+ },
+ "contextual_requirement": {
+ "Full": 0.42758275308934174,
+ "Partial": 0.49780967051230435
+ },
+ "difficulty": {
+ "Easy": 0.6788797898422243,
+ "Moderate": 0.39582478863697335,
+ "Hard": 0.37581391782555706,
+ "Extreme": 0.3125005687762375
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7602730263789274,
+ "T2. Sequencing & Structure Reconstruction": 0.7038347568610727,
+ "T3. Evidence-Grounded QA": 0.4583333333333333,
+ "T4. Summarization & Synthesis": 0.4990357866977353,
+ "T5. Attribution & Citation Alignment": 0.35405443033443207,
+ "T6. Aggregation & Clustering": 0.43771712805012347,
+ "T7. Consistency & Compliance Checking": 0.21275716732831734,
+ "T8. Structured & Numeric Reasoning": 0.42546296296296293,
+ "T9. Version & Code Diff Analysis": 0.5013174605476809,
+ "T10. Rule Induction & In-Context Learning": 0.48194444444444445,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334
+ },
+ "language": {
+ "Chinese": 0.43269825076497226,
+ "English": 0.48426694274591864
+ }
+ },
+ "pass@1": 0.22733333333333333,
+ "BoN-2": {
+ "overall_metric": 0.5227530650530186,
+ "token_length": {
+ "8k": 0.5815903903274882,
+ "16k": 0.5495634888346752,
+ "32k": 0.5446963722215565,
+ "64k": 0.5299811967182008,
+ "128k": 0.4888075896458958,
+ "256k": 0.4418793525702981
+ },
+ "contextual_requirement": {
+ "Full": 0.489360650067773,
+ "Partial": 0.565252502306969
+ },
+ "difficulty": {
+ "Easy": 0.7604456302850787,
+ "Moderate": 0.463797787486306,
+ "Hard": 0.43728803588921916,
+ "Extreme": 0.35722954733316814
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8036702370810763,
+ "T2. Sequencing & Structure Reconstruction": 0.7488194240400123,
+ "T3. Evidence-Grounded QA": 0.5083333333333333,
+ "T4. Summarization & Synthesis": 0.5120697680827194,
+ "T5. Attribution & Citation Alignment": 0.4029499793922618,
+ "T6. Aggregation & Clustering": 0.5304624812643679,
+ "T7. Consistency & Compliance Checking": 0.27354479208047294,
+ "T8. Structured & Numeric Reasoning": 0.4962962962962963,
+ "T9. Version & Code Diff Analysis": 0.6272818834382945,
+ "T10. Rule Induction & In-Context Learning": 0.5725,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
+ },
+ "language": {
+ "Chinese": 0.5068736065410564,
+ "English": 0.5386325235649823
+ }
+ },
+ "pass@2": 0.2833333333333333,
+ "BoN-3": {
+ "overall_metric": 0.5542260073321058,
+ "token_length": {
+ "8k": 0.6154421507239413,
+ "16k": 0.5842317006106588,
+ "32k": 0.5699342902198496,
+ "64k": 0.5576866952391514,
+ "128k": 0.5184072720538686,
+ "256k": 0.4796539351451706
+ },
+ "contextual_requirement": {
+ "Full": 0.5204393104129562,
+ "Partial": 0.5972272579564804
+ },
+ "difficulty": {
+ "Easy": 0.7905996585472251,
+ "Moderate": 0.5012307381116559,
+ "Hard": 0.46914010908714937,
+ "Extreme": 0.3859836380407792
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8214474184096282,
+ "T2. Sequencing & Structure Reconstruction": 0.7666511293717175,
+ "T3. Evidence-Grounded QA": 0.525,
+ "T4. Summarization & Synthesis": 0.5197606048087388,
+ "T5. Attribution & Citation Alignment": 0.4423027060091871,
+ "T6. Aggregation & Clustering": 0.5772430200792206,
+ "T7. Consistency & Compliance Checking": 0.3040682257365206,
+ "T8. Structured & Numeric Reasoning": 0.5402777777777777,
+ "T9. Version & Code Diff Analysis": 0.661946364328448,
+ "T10. Rule Induction & In-Context Learning": 0.5916666666666668,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667
+ },
+ "language": {
+ "Chinese": 0.5366930860176997,
+ "English": 0.5717589286465138
+ }
+ },
+ "pass@3": 0.31133333333333335
+}
\ No newline at end of file
diff --git a/results/Ministral-3-3B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json b/results/Ministral-3-3B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..32c3d32fcf911b08d21f1b583c0511c193295456
--- /dev/null
+++ b/results/Ministral-3-3B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.30179346535967666,
+ "inference_iteration_1_overall_metric": 0.3029527040012084,
+ "inference_iteration_2_overall_metric": 0.30315184463083855,
+ "inference_iteration_3_overall_metric": 0.2992758474469829,
+ "average_token_length_metric": {
+ "8k": 0.31721543930238405,
+ "16k": 0.3412774869133842,
+ "32k": 0.31691876705456545,
+ "64k": 0.2687371996940676,
+ "128k": 0.30453000021054333,
+ "256k": 0.2620818989831162
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.2701610943483024,
+ "Partial": 0.34205284664688057
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.38811634301315323,
+ "Moderate": 0.23804107928640247,
+ "Hard": 0.2856786739337558,
+ "Extreme": 0.2596888909694311
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.5494421813257797,
+ "T2. Sequencing & Structure Reconstruction": 0.5158060495268492,
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
+ "T4. Summarization & Synthesis": 0.5079974835401471,
+ "T5. Attribution & Citation Alignment": 0.1926176241888911,
+ "T6. Aggregation & Clustering": 0.30095602813594396,
+ "T7. Consistency & Compliance Checking": 0.11630381954916796,
+ "T8. Structured & Numeric Reasoning": 0.0558641975308642,
+ "T9. Version & Code Diff Analysis": 0.27038742910884633,
+ "T10. Rule Induction & In-Context Learning": 0.332037037037037,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2611111111111111
+ },
+ "average_language_metric": {
+ "Chinese": 0.32613475528214075,
+ "English": 0.2774521754372128
+ },
+ "BoN-1": {
+ "overall_metric": 0.3029527040012084,
+ "token_length": {
+ "8k": 0.3191767566971557,
+ "16k": 0.3423054126585435,
+ "32k": 0.31611056842908253,
+ "64k": 0.26522388398511487,
+ "128k": 0.31148016857049676,
+ "256k": 0.26341943366685827
+ },
+ "contextual_requirement": {
+ "Full": 0.27081993053812903,
+ "Partial": 0.3438489611360375
+ },
+ "difficulty": {
+ "Easy": 0.3900871254265116,
+ "Moderate": 0.24072949435669488,
+ "Hard": 0.2869463348423525,
+ "Extreme": 0.25888351569934015
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5387044717426223,
+ "T2. Sequencing & Structure Reconstruction": 0.5228428722670637,
+ "T3. Evidence-Grounded QA": 0.425,
+ "T4. Summarization & Synthesis": 0.5081133322337062,
+ "T5. Attribution & Citation Alignment": 0.19946236214954846,
+ "T6. Aggregation & Clustering": 0.300068633862112,
+ "T7. Consistency & Compliance Checking": 0.12608033855092354,
+ "T8. Structured & Numeric Reasoning": 0.05092592592592593,
+ "T9. Version & Code Diff Analysis": 0.2735623030026126,
+ "T10. Rule Induction & In-Context Learning": 0.33694444444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.26666666666666666
+ },
+ "language": {
+ "Chinese": 0.3278788873616267,
+ "English": 0.2780265206407908
+ }
+ },
+ "pass@1": 0.09933333333333333,
+ "BoN-2": {
+ "overall_metric": 0.3235664407635165,
+ "token_length": {
+ "8k": 0.33752116035721835,
+ "16k": 0.3644601713974257,
+ "32k": 0.34581977441150585,
+ "64k": 0.2903196448924103,
+ "128k": 0.3272049759144316,
+ "256k": 0.2760729176081059
+ },
+ "contextual_requirement": {
+ "Full": 0.2920011834968168,
+ "Partial": 0.3637404045574977
+ },
+ "difficulty": {
+ "Easy": 0.4147958762035026,
+ "Moderate": 0.2544155279477453,
+ "Hard": 0.3096540579882773,
+ "Extreme": 0.2781862238174447
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5653125926955125,
+ "T2. Sequencing & Structure Reconstruction": 0.5686493732875312,
+ "T3. Evidence-Grounded QA": 0.4583333333333333,
+ "T4. Summarization & Synthesis": 0.5187643397494163,
+ "T5. Attribution & Citation Alignment": 0.2137291166026424,
+ "T6. Aggregation & Clustering": 0.32250141360204826,
+ "T7. Consistency & Compliance Checking": 0.1323008909170473,
+ "T8. Structured & Numeric Reasoning": 0.057870370370370364,
+ "T9. Version & Code Diff Analysis": 0.3024494082079856,
+ "T10. Rule Induction & In-Context Learning": 0.35666666666666663,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667
+ },
+ "language": {
+ "Chinese": 0.34580247197246466,
+ "English": 0.3013304095545682
+ }
+ },
+ "pass@2": 0.108,
+ "BoN-3": {
+ "overall_metric": 0.3337187991769299,
+ "token_length": {
+ "8k": 0.3545410484752056,
+ "16k": 0.37343595068728874,
+ "32k": 0.3540675331326773,
+ "64k": 0.3014677883245934,
+ "128k": 0.3329694407689707,
+ "256k": 0.2858310336728446
+ },
+ "contextual_requirement": {
+ "Full": 0.30091145355832494,
+ "Partial": 0.3754736026915186
+ },
+ "difficulty": {
+ "Easy": 0.4222694522861904,
+ "Moderate": 0.2710401461853368,
+ "Hard": 0.31823781489879627,
+ "Extreme": 0.28805225456849304
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5911236100282776,
+ "T2. Sequencing & Structure Reconstruction": 0.5777883824265405,
+ "T3. Evidence-Grounded QA": 0.4583333333333333,
+ "T4. Summarization & Synthesis": 0.5244656227142894,
+ "T5. Attribution & Citation Alignment": 0.22343915037802925,
+ "T6. Aggregation & Clustering": 0.3402498647457522,
+ "T7. Consistency & Compliance Checking": 0.14046319763373694,
+ "T8. Structured & Numeric Reasoning": 0.07175925925925926,
+ "T9. Version & Code Diff Analysis": 0.3067930750396991,
+ "T10. Rule Induction & In-Context Learning": 0.3608333333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3
+ },
+ "language": {
+ "Chinese": 0.35283833596795733,
+ "English": 0.3145992623859028
+ }
+ },
+ "pass@3": 0.112
+}
\ No newline at end of file
diff --git a/results/Ministral-3-3B-Instruct-2512/thinking_context-224000_bon-3_summary.json b/results/Ministral-3-3B-Instruct-2512/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..148c001169a70f28357640aeaa567a5480f2c298
--- /dev/null
+++ b/results/Ministral-3-3B-Instruct-2512/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3454036931179677,
+ "inference_iteration_1_overall_metric": 0.34873430118319537,
+ "inference_iteration_2_overall_metric": 0.34879316780005654,
+ "inference_iteration_3_overall_metric": 0.338683610370652,
+ "average_token_length_metric": {
+ "8k": 0.3692200735727051,
+ "16k": 0.40119881531813145,
+ "32k": 0.3781626650292353,
+ "64k": 0.32612186138726695,
+ "128k": 0.3193179750507245,
+ "256k": 0.27840076834974453
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.31819246274226237,
+ "Partial": 0.3800361681415936
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.4964533039214179,
+ "Moderate": 0.255962770404739,
+ "Hard": 0.30232089813471313,
+ "Extreme": 0.2669796976793795
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.5858449524514547,
+ "T2. Sequencing & Structure Reconstruction": 0.586324154256027,
+ "T3. Evidence-Grounded QA": 0.37777777777777766,
+ "T4. Summarization & Synthesis": 0.49033515640007624,
+ "T5. Attribution & Citation Alignment": 0.21401668096493356,
+ "T6. Aggregation & Clustering": 0.3516741155045124,
+ "T7. Consistency & Compliance Checking": 0.154158877596512,
+ "T8. Structured & Numeric Reasoning": 0.24166666666666664,
+ "T9. Version & Code Diff Analysis": 0.352322026546868,
+ "T10. Rule Induction & In-Context Learning": 0.32856481481481475,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2611111111111111
+ },
+ "average_language_metric": {
+ "Chinese": 0.32661149524563937,
+ "English": 0.364195890990297
+ },
+ "BoN-1": {
+ "overall_metric": 0.34873430118319537,
+ "token_length": {
+ "8k": 0.3656122243049499,
+ "16k": 0.3958940853914839,
+ "32k": 0.3703173794807147,
+ "64k": 0.3317196346770501,
+ "128k": 0.34303779448458255,
+ "256k": 0.2858246887603903
+ },
+ "contextual_requirement": {
+ "Full": 0.32665848057093605,
+ "Partial": 0.3768308001442518
+ },
+ "difficulty": {
+ "Easy": 0.50262211292123,
+ "Moderate": 0.25361014093585454,
+ "Hard": 0.310150509372198,
+ "Extreme": 0.2679790122903014
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.58240433072649,
+ "T2. Sequencing & Structure Reconstruction": 0.6066101012652736,
+ "T3. Evidence-Grounded QA": 0.36666666666666664,
+ "T4. Summarization & Synthesis": 0.4928112466906771,
+ "T5. Attribution & Citation Alignment": 0.2064491285924213,
+ "T6. Aggregation & Clustering": 0.3455987518896014,
+ "T7. Consistency & Compliance Checking": 0.15983398189125114,
+ "T8. Structured & Numeric Reasoning": 0.2453703703703704,
+ "T9. Version & Code Diff Analysis": 0.37553263462157643,
+ "T10. Rule Induction & In-Context Learning": 0.31083333333333335,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667
+ },
+ "language": {
+ "Chinese": 0.3267683933919365,
+ "English": 0.3707002089744536
+ }
+ },
+ "pass@1": 0.13466666666666666,
+ "BoN-2": {
+ "overall_metric": 0.4230141245251924,
+ "token_length": {
+ "8k": 0.46908993057301923,
+ "16k": 0.4637780067071494,
+ "32k": 0.4631374771588592,
+ "64k": 0.4023395380855012,
+ "128k": 0.38963984120856304,
+ "256k": 0.3500999534180651
+ },
+ "contextual_requirement": {
+ "Full": 0.38841564042851795,
+ "Partial": 0.4670485588300523
+ },
+ "difficulty": {
+ "Easy": 0.6182575201716307,
+ "Moderate": 0.3194064855847735,
+ "Hard": 0.36683106532741566,
+ "Extreme": 0.3140872951169107
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6586830692904562,
+ "T2. Sequencing & Structure Reconstruction": 0.6861384911384911,
+ "T3. Evidence-Grounded QA": 0.5,
+ "T4. Summarization & Synthesis": 0.5047315613131492,
+ "T5. Attribution & Citation Alignment": 0.2972254153066172,
+ "T6. Aggregation & Clustering": 0.42732208121097004,
+ "T7. Consistency & Compliance Checking": 0.19799483338543603,
+ "T8. Structured & Numeric Reasoning": 0.3277777777777777,
+ "T9. Version & Code Diff Analysis": 0.4437559809549206,
+ "T10. Rule Induction & In-Context Learning": 0.41750000000000004,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35
+ },
+ "language": {
+ "Chinese": 0.40188885021867327,
+ "English": 0.4441393988317126
+ }
+ },
+ "pass@2": 0.184,
+ "BoN-3": {
+ "overall_metric": 0.45091674287019584,
+ "token_length": {
+ "8k": 0.4890912273204133,
+ "16k": 0.5060340159767878,
+ "32k": 0.48168220208134793,
+ "64k": 0.43952399968187594,
+ "128k": 0.41252202308139446,
+ "256k": 0.3766469890793584
+ },
+ "contextual_requirement": {
+ "Full": 0.4168658522047825,
+ "Partial": 0.4942542400807231
+ },
+ "difficulty": {
+ "Easy": 0.6450588018724265,
+ "Moderate": 0.3584312989173043,
+ "Hard": 0.39189079094493173,
+ "Extreme": 0.3377583814377241
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6863503216486717,
+ "T2. Sequencing & Structure Reconstruction": 0.7174943112443114,
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
+ "T4. Summarization & Synthesis": 0.5097159356890285,
+ "T5. Attribution & Citation Alignment": 0.3359084101583427,
+ "T6. Aggregation & Clustering": 0.46349171404726974,
+ "T7. Consistency & Compliance Checking": 0.2201801789041147,
+ "T8. Structured & Numeric Reasoning": 0.34444444444444444,
+ "T9. Version & Code Diff Analysis": 0.4909269121544662,
+ "T10. Rule Induction & In-Context Learning": 0.44555555555555554,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664
+ },
+ "language": {
+ "Chinese": 0.41909228907998713,
+ "English": 0.4827411966604057
+ }
+ },
+ "pass@3": 0.202
+}
\ No newline at end of file
diff --git a/results/Ministral-3-8B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json b/results/Ministral-3-8B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..66fe280b380dd2e9bd48276327777d7025fdaa01
--- /dev/null
+++ b/results/Ministral-3-8B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3780437992177988,
+ "inference_iteration_1_overall_metric": 0.37746736206286746,
+ "inference_iteration_2_overall_metric": 0.3812187781897308,
+ "inference_iteration_3_overall_metric": 0.3754452574007977,
+ "average_token_length_metric": {
+ "8k": 0.40747091961293536,
+ "16k": 0.4217586689370988,
+ "32k": 0.4100622794449128,
+ "64k": 0.3509822308146611,
+ "128k": 0.3761773667981329,
+ "256k": 0.30181132969905095
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3448423936300968,
+ "Partial": 0.4203001336021466
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5026909853492837,
+ "Moderate": 0.3172643692616012,
+ "Hard": 0.3210729729273563,
+ "Extreme": 0.3188389818084875
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6788284158016221,
+ "T2. Sequencing & Structure Reconstruction": 0.6236659328325996,
+ "T3. Evidence-Grounded QA": 0.47777777777777775,
+ "T4. Summarization & Synthesis": 0.5139559182541029,
+ "T5. Attribution & Citation Alignment": 0.2898615675534933,
+ "T6. Aggregation & Clustering": 0.38892311924203327,
+ "T7. Consistency & Compliance Checking": 0.2090272807257386,
+ "T8. Structured & Numeric Reasoning": 0.10648148148148148,
+ "T9. Version & Code Diff Analysis": 0.4152267224956728,
+ "T10. Rule Induction & In-Context Learning": 0.3640277777777777,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3055555555555555
+ },
+ "average_language_metric": {
+ "Chinese": 0.3899531126999766,
+ "English": 0.36613448573562096
+ },
+ "BoN-1": {
+ "overall_metric": 0.37746736206286746,
+ "token_length": {
+ "8k": 0.4069937376708668,
+ "16k": 0.41722807754864144,
+ "32k": 0.40789581779808437,
+ "64k": 0.34921996994765486,
+ "128k": 0.37545000489943714,
+ "256k": 0.3080165645125228
+ },
+ "contextual_requirement": {
+ "Full": 0.3453649180362828,
+ "Partial": 0.4183250180967035
+ },
+ "difficulty": {
+ "Easy": 0.5044527526099465,
+ "Moderate": 0.3125637979759982,
+ "Hard": 0.3178750307733104,
+ "Extreme": 0.32013180642672173
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6846023385004407,
+ "T2. Sequencing & Structure Reconstruction": 0.6255770155770156,
+ "T3. Evidence-Grounded QA": 0.475,
+ "T4. Summarization & Synthesis": 0.5150710795645007,
+ "T5. Attribution & Citation Alignment": 0.2866964120580854,
+ "T6. Aggregation & Clustering": 0.38933176968982663,
+ "T7. Consistency & Compliance Checking": 0.20492564574674327,
+ "T8. Structured & Numeric Reasoning": 0.10555555555555556,
+ "T9. Version & Code Diff Analysis": 0.42320350137539475,
+ "T10. Rule Induction & In-Context Learning": 0.3668055555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667
+ },
+ "language": {
+ "Chinese": 0.3917885072903086,
+ "English": 0.3631462168354272
+ }
+ },
+ "pass@1": 0.14066666666666666,
+ "BoN-2": {
+ "overall_metric": 0.3960720661735587,
+ "token_length": {
+ "8k": 0.4267853319337846,
+ "16k": 0.4487497839538122,
+ "32k": 0.42026835310130006,
+ "64k": 0.37076824135007136,
+ "128k": 0.3903702458059971,
+ "256k": 0.3194904408963875
+ },
+ "contextual_requirement": {
+ "Full": 0.3617541651771871,
+ "Partial": 0.43974939471439556
+ },
+ "difficulty": {
+ "Easy": 0.5236613839420188,
+ "Moderate": 0.32881680957334175,
+ "Hard": 0.3378863045377575,
+ "Extreme": 0.3386957095958586
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6941496673211041,
+ "T2. Sequencing & Structure Reconstruction": 0.6537227587227588,
+ "T3. Evidence-Grounded QA": 0.5,
+ "T4. Summarization & Synthesis": 0.5266833803950245,
+ "T5. Attribution & Citation Alignment": 0.30914312627907664,
+ "T6. Aggregation & Clustering": 0.4099625037688756,
+ "T7. Consistency & Compliance Checking": 0.21406107873077526,
+ "T8. Structured & Numeric Reasoning": 0.11296296296296296,
+ "T9. Version & Code Diff Analysis": 0.43908318736871094,
+ "T10. Rule Induction & In-Context Learning": 0.3893055555555555,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333
+ },
+ "language": {
+ "Chinese": 0.41022855768773003,
+ "English": 0.38191557465938764
+ }
+ },
+ "pass@2": 0.15333333333333332,
+ "BoN-3": {
+ "overall_metric": 0.40392076628937795,
+ "token_length": {
+ "8k": 0.44011246245033525,
+ "16k": 0.45248242271387923,
+ "32k": 0.4364757916450588,
+ "64k": 0.3739727159425725,
+ "128k": 0.3968686251866505,
+ "256k": 0.3236125797977717
+ },
+ "contextual_requirement": {
+ "Full": 0.36776720734616364,
+ "Partial": 0.4499343867625605
+ },
+ "difficulty": {
+ "Easy": 0.5291795040463627,
+ "Moderate": 0.3398411270992665,
+ "Hard": 0.3454895416328106,
+ "Extreme": 0.3471728755003404
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6957621908179982,
+ "T2. Sequencing & Structure Reconstruction": 0.6586220261220265,
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
+ "T4. Summarization & Synthesis": 0.5308128403233062,
+ "T5. Attribution & Citation Alignment": 0.3225311270119217,
+ "T6. Aggregation & Clustering": 0.4233804016791107,
+ "T7. Consistency & Compliance Checking": 0.22394925367429327,
+ "T8. Structured & Numeric Reasoning": 0.11851851851851854,
+ "T9. Version & Code Diff Analysis": 0.4448702446451986,
+ "T10. Rule Induction & In-Context Learning": 0.3893055555555555,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
+ },
+ "language": {
+ "Chinese": 0.4204961295794779,
+ "English": 0.3873454029992785
+ }
+ },
+ "pass@3": 0.15933333333333333
+}
\ No newline at end of file
diff --git a/results/Ministral-3-8B-Instruct-2512/thinking_context-224000_bon-3_summary.json b/results/Ministral-3-8B-Instruct-2512/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a0d612493f5f689a1c1d1d43fa3d387509f2e3e
--- /dev/null
+++ b/results/Ministral-3-8B-Instruct-2512/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.4445697906316412,
+ "inference_iteration_1_overall_metric": 0.4414354535926015,
+ "inference_iteration_2_overall_metric": 0.44685007554136563,
+ "inference_iteration_3_overall_metric": 0.44542384276095776,
+ "average_token_length_metric": {
+ "8k": 0.5011776417934968,
+ "16k": 0.48676163299638525,
+ "32k": 0.4768790712429065,
+ "64k": 0.43057655714960325,
+ "128k": 0.41136598921457657,
+ "256k": 0.36065785139288176
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.406935522390496,
+ "Partial": 0.49246795021128126
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6714266695708587,
+ "Moderate": 0.3525997582691683,
+ "Hard": 0.3499422819715659,
+ "Extreme": 0.3186190490562193
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7086003520112202,
+ "T2. Sequencing & Structure Reconstruction": 0.6809300421800422,
+ "T3. Evidence-Grounded QA": 0.43888888888888894,
+ "T4. Summarization & Synthesis": 0.4966730523372876,
+ "T5. Attribution & Citation Alignment": 0.3928781055792416,
+ "T6. Aggregation & Clustering": 0.4235524897993562,
+ "T7. Consistency & Compliance Checking": 0.2048019407256743,
+ "T8. Structured & Numeric Reasoning": 0.39459876543209865,
+ "T9. Version & Code Diff Analysis": 0.5072684442594425,
+ "T10. Rule Induction & In-Context Learning": 0.46134259259259264,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3361111111111112
+ },
+ "average_language_metric": {
+ "Chinese": 0.4274680156614393,
+ "English": 0.4616715656018438
+ },
+ "BoN-1": {
+ "overall_metric": 0.4414354535926015,
+ "token_length": {
+ "8k": 0.49046479436781104,
+ "16k": 0.467031355142411,
+ "32k": 0.48202229982107525,
+ "64k": 0.4432590731600766,
+ "128k": 0.42424252636864085,
+ "256k": 0.34159267269559335
+ },
+ "contextual_requirement": {
+ "Full": 0.4063419548203588,
+ "Partial": 0.4860999065754557
+ },
+ "difficulty": {
+ "Easy": 0.6551940293849869,
+ "Moderate": 0.3509192356404266,
+ "Hard": 0.3599913200493228,
+ "Extreme": 0.32021933552500187
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7053507564972957,
+ "T2. Sequencing & Structure Reconstruction": 0.6859980759980755,
+ "T3. Evidence-Grounded QA": 0.44166666666666665,
+ "T4. Summarization & Synthesis": 0.49921805286585674,
+ "T5. Attribution & Citation Alignment": 0.3949316390801899,
+ "T6. Aggregation & Clustering": 0.41861071554912704,
+ "T7. Consistency & Compliance Checking": 0.2063855467243845,
+ "T8. Structured & Numeric Reasoning": 0.37129629629629635,
+ "T9. Version & Code Diff Analysis": 0.5077280298336089,
+ "T10. Rule Induction & In-Context Learning": 0.4469444444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
+ },
+ "language": {
+ "Chinese": 0.42410938192847564,
+ "English": 0.4587615252567272
+ }
+ },
+ "pass@1": 0.204,
+ "BoN-2": {
+ "overall_metric": 0.5073666298083515,
+ "token_length": {
+ "8k": 0.5709490486184731,
+ "16k": 0.5511354485905285,
+ "32k": 0.5387154833941795,
+ "64k": 0.4998902412554904,
+ "128k": 0.46094293028745753,
+ "256k": 0.422566626703988
+ },
+ "contextual_requirement": {
+ "Full": 0.47415575005443095,
+ "Partial": 0.549635022222436
+ },
+ "difficulty": {
+ "Easy": 0.7571129884853747,
+ "Moderate": 0.4236864876869295,
+ "Hard": 0.4106963399857809,
+ "Extreme": 0.3522382887844671
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7586612129799296,
+ "T2. Sequencing & Structure Reconstruction": 0.7268826543826544,
+ "T3. Evidence-Grounded QA": 0.525,
+ "T4. Summarization & Synthesis": 0.5087847643179455,
+ "T5. Attribution & Citation Alignment": 0.4821472159010296,
+ "T6. Aggregation & Clustering": 0.5023386543115935,
+ "T7. Consistency & Compliance Checking": 0.2538128132470806,
+ "T8. Structured & Numeric Reasoning": 0.45879629629629626,
+ "T9. Version & Code Diff Analysis": 0.5837409347959492,
+ "T10. Rule Induction & In-Context Learning": 0.5177777777777777,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
+ },
+ "language": {
+ "Chinese": 0.48982515153338074,
+ "English": 0.5249081080833256
+ }
+ },
+ "pass@2": 0.26,
+ "BoN-3": {
+ "overall_metric": 0.54231740926941,
+ "token_length": {
+ "8k": 0.6068057358810921,
+ "16k": 0.604506768606215,
+ "32k": 0.5734498234618363,
+ "64k": 0.5181523235316485,
+ "128k": 0.4882018386364437,
+ "256k": 0.4627879654992305
+ },
+ "contextual_requirement": {
+ "Full": 0.5027116133357081,
+ "Partial": 0.592724785912306
+ },
+ "difficulty": {
+ "Easy": 0.8109806331655524,
+ "Moderate": 0.44576401021132206,
+ "Hard": 0.43505232412248046,
+ "Extreme": 0.3818807456723898
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8065060178436674,
+ "T2. Sequencing & Structure Reconstruction": 0.761479215229215,
+ "T3. Evidence-Grounded QA": 0.55,
+ "T4. Summarization & Synthesis": 0.512142930512617,
+ "T5. Attribution & Citation Alignment": 0.5013791499027004,
+ "T6. Aggregation & Clustering": 0.5419289746475281,
+ "T7. Consistency & Compliance Checking": 0.27906109745150515,
+ "T8. Structured & Numeric Reasoning": 0.5125,
+ "T9. Version & Code Diff Analysis": 0.6030585275642201,
+ "T10. Rule Induction & In-Context Learning": 0.6025000000000001,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665
+ },
+ "language": {
+ "Chinese": 0.5225371072526017,
+ "English": 0.5620977112862204
+ }
+ },
+ "pass@3": 0.292
+}
\ No newline at end of file
diff --git a/results/Ministral-8B-Instruct-2410/nonthinking_context-120000_bon-3_summary.json b/results/Ministral-8B-Instruct-2410/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..e514863f61844da0b44070579cb8da37777ac545
--- /dev/null
+++ b/results/Ministral-8B-Instruct-2410/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.17559419950360972,
+ "inference_iteration_1_overall_metric": 0.1798365624509928,
+ "inference_iteration_2_overall_metric": 0.17628623970116333,
+ "inference_iteration_3_overall_metric": 0.17065979635867265,
+ "average_token_length_metric": {
+ "8k": 0.23670773015182286,
+ "16k": 0.2225568752590229,
+ "32k": 0.18016652444641598,
+ "64k": 0.1429897738729207,
+ "128k": 0.13700010614467598,
+ "256k": 0.13414418714679874
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.16475960378398763,
+ "Partial": 0.1893836849649465
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.1985800924396908,
+ "Moderate": 0.12257638309875012,
+ "Hard": 0.18613560242292748,
+ "Extreme": 0.1783134792811784
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.39248676390172876,
+ "T2. Sequencing & Structure Reconstruction": 0.41139201610622733,
+ "T3. Evidence-Grounded QA": 0.16388888888888892,
+ "T4. Summarization & Synthesis": 0.3808268181260766,
+ "T5. Attribution & Citation Alignment": 0.07226655367312615,
+ "T6. Aggregation & Clustering": 0.1829192046779024,
+ "T7. Consistency & Compliance Checking": 0.0756354691154679,
+ "T8. Structured & Numeric Reasoning": 0.01790123456790123,
+ "T9. Version & Code Diff Analysis": 0.10878073870531223,
+ "T10. Rule Induction & In-Context Learning": 0.16171296296296298,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.08888888888888888
+ },
+ "average_language_metric": {
+ "Chinese": 0.16472451449115244,
+ "English": 0.1864638845160666
+ },
+ "BoN-1": {
+ "overall_metric": 0.1798365624509928,
+ "token_length": {
+ "8k": 0.24346526021614548,
+ "16k": 0.22653552572263724,
+ "32k": 0.1770017940649281,
+ "64k": 0.15054614321487733,
+ "128k": 0.15816316532578398,
+ "256k": 0.12330748616158495
+ },
+ "contextual_requirement": {
+ "Full": 0.17138999029023816,
+ "Partial": 0.19058674520104432
+ },
+ "difficulty": {
+ "Easy": 0.20041414087611367,
+ "Moderate": 0.12656096910842649,
+ "Hard": 0.20970773721313615,
+ "Extreme": 0.17266720828925194
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.39139525725719654,
+ "T2. Sequencing & Structure Reconstruction": 0.43729511229511225,
+ "T3. Evidence-Grounded QA": 0.19166666666666668,
+ "T4. Summarization & Synthesis": 0.3753264284837986,
+ "T5. Attribution & Citation Alignment": 0.07301644262170577,
+ "T6. Aggregation & Clustering": 0.18246822005155336,
+ "T7. Consistency & Compliance Checking": 0.07435907091608848,
+ "T8. Structured & Numeric Reasoning": 0.016666666666666666,
+ "T9. Version & Code Diff Analysis": 0.1033217424170237,
+ "T10. Rule Induction & In-Context Learning": 0.16569444444444445,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.1
+ },
+ "language": {
+ "Chinese": 0.17157094623226812,
+ "English": 0.18810217866971762
+ }
+ },
+ "pass@1": 0.042,
+ "BoN-2": {
+ "overall_metric": 0.24632458802491347,
+ "token_length": {
+ "8k": 0.3310742150705802,
+ "16k": 0.2986080788714333,
+ "32k": 0.2667237213160877,
+ "64k": 0.1950849880242041,
+ "128k": 0.20119049419756752,
+ "256k": 0.1852660306696089
+ },
+ "contextual_requirement": {
+ "Full": 0.22948115089254192,
+ "Partial": 0.2677616898297509
+ },
+ "difficulty": {
+ "Easy": 0.2985290634429019,
+ "Moderate": 0.17216911619732905,
+ "Hard": 0.26824836302970206,
+ "Extreme": 0.22344368627563585
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.49278911557042887,
+ "T2. Sequencing & Structure Reconstruction": 0.54627701002701,
+ "T3. Evidence-Grounded QA": 0.2833333333333333,
+ "T4. Summarization & Synthesis": 0.41486609223778365,
+ "T5. Attribution & Citation Alignment": 0.13174847956194702,
+ "T6. Aggregation & Clustering": 0.28143118609785267,
+ "T7. Consistency & Compliance Checking": 0.11797004643635843,
+ "T8. Structured & Numeric Reasoning": 0.027314814814814816,
+ "T9. Version & Code Diff Analysis": 0.16858035966848986,
+ "T10. Rule Induction & In-Context Learning": 0.2597222222222222,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
+ },
+ "language": {
+ "Chinese": 0.23254504310620405,
+ "English": 0.2601041329436236
+ }
+ },
+ "pass@2": 0.06733333333333333,
+ "BoN-3": {
+ "overall_metric": 0.28484680397856615,
+ "token_length": {
+ "8k": 0.3735683555982749,
+ "16k": 0.34629859523101947,
+ "32k": 0.31456400077002905,
+ "64k": 0.22697550391574145,
+ "128k": 0.22085267461030403,
+ "256k": 0.2268216937460278
+ },
+ "contextual_requirement": {
+ "Full": 0.265505541689755,
+ "Partial": 0.3094629559825076
+ },
+ "difficulty": {
+ "Easy": 0.3518525654113404,
+ "Moderate": 0.20247492584084434,
+ "Hard": 0.2991035774243336,
+ "Extreme": 0.2561842772635152
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.5450181829888936,
+ "T2. Sequencing & Structure Reconstruction": 0.5925035612535613,
+ "T3. Evidence-Grounded QA": 0.38333333333333336,
+ "T4. Summarization & Synthesis": 0.4350813325596142,
+ "T5. Attribution & Citation Alignment": 0.16950641606988356,
+ "T6. Aggregation & Clustering": 0.3244020623625887,
+ "T7. Consistency & Compliance Checking": 0.1368650403313524,
+ "T8. Structured & Numeric Reasoning": 0.04398148148148148,
+ "T9. Version & Code Diff Analysis": 0.20838045837476707,
+ "T10. Rule Induction & In-Context Learning": 0.27722222222222215,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
+ },
+ "language": {
+ "Chinese": 0.2733872958268853,
+ "English": 0.29630631213024705
+ }
+ },
+ "pass@3": 0.08666666666666667
+}
\ No newline at end of file
diff --git a/results/Ministral-8B-Instruct-2410/thinking_context-120000_bon-3_summary.json b/results/Ministral-8B-Instruct-2410/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..d5ecf959c2b2f40b926ab225c2de149c27a0568e
--- /dev/null
+++ b/results/Ministral-8B-Instruct-2410/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.14430959523166526,
+ "inference_iteration_1_overall_metric": 0.13849588125287102,
+ "inference_iteration_2_overall_metric": 0.14293536519106903,
+ "inference_iteration_3_overall_metric": 0.15149753925105558,
+ "average_token_length_metric": {
+ "8k": 0.21115060917104964,
+ "16k": 0.19398692652809305,
+ "32k": 0.12654145463127237,
+ "64k": 0.10693470763289076,
+ "128k": 0.11058360115846737,
+ "256k": 0.11666027226821786
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.13689831594533386,
+ "Partial": 0.15374213250517774
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.16844291277154172,
+ "Moderate": 0.09886695623599538,
+ "Hard": 0.13984977399541215,
+ "Extreme": 0.15064948603349507
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.2973074399585583,
+ "T2. Sequencing & Structure Reconstruction": 0.3370251608769255,
+ "T3. Evidence-Grounded QA": 0.11944444444444446,
+ "T4. Summarization & Synthesis": 0.38152337418528626,
+ "T5. Attribution & Citation Alignment": 0.04010115798342492,
+ "T6. Aggregation & Clustering": 0.13501345810105547,
+ "T7. Consistency & Compliance Checking": 0.05419569047101407,
+ "T8. Structured & Numeric Reasoning": 0.047839506172839504,
+ "T9. Version & Code Diff Analysis": 0.10414538082981198,
+ "T10. Rule Induction & In-Context Learning": 0.11319444444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.055555555555555546
+ },
+ "average_language_metric": {
+ "Chinese": 0.1233481918215467,
+ "English": 0.16527099864178366
+ },
+ "BoN-1": {
+ "overall_metric": 0.13849588125287102,
+ "token_length": {
+ "8k": 0.21715146588540263,
+ "16k": 0.17919497907671647,
+ "32k": 0.13758692284538804,
+ "64k": 0.09292136118327776,
+ "128k": 0.10222628424195211,
+ "256k": 0.10189427428448873
+ },
+ "contextual_requirement": {
+ "Full": 0.13174868671272114,
+ "Partial": 0.14708321975851618
+ },
+ "difficulty": {
+ "Easy": 0.14908853505231587,
+ "Moderate": 0.09146123280540662,
+ "Hard": 0.14088385418342952,
+ "Extreme": 0.15621685873708296
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.25192356457401754,
+ "T2. Sequencing & Structure Reconstruction": 0.3288852758782787,
+ "T3. Evidence-Grounded QA": 0.09166666666666666,
+ "T4. Summarization & Synthesis": 0.3927474127585449,
+ "T5. Attribution & Citation Alignment": 0.04019875024260989,
+ "T6. Aggregation & Clustering": 0.1463177717344383,
+ "T7. Consistency & Compliance Checking": 0.051579081014564904,
+ "T8. Structured & Numeric Reasoning": 0.05324074074074074,
+ "T9. Version & Code Diff Analysis": 0.07976489975059771,
+ "T10. Rule Induction & In-Context Learning": 0.11930555555555555,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.05
+ },
+ "language": {
+ "Chinese": 0.11950758971285085,
+ "English": 0.15748417279289106
+ }
+ },
+ "pass@1": 0.03133333333333333,
+ "BoN-2": {
+ "overall_metric": 0.20014764756254877,
+ "token_length": {
+ "8k": 0.28346565871948276,
+ "16k": 0.2485354487812065,
+ "32k": 0.1891519780196866,
+ "64k": 0.1488424509609456,
+ "128k": 0.16144012015437917,
+ "256k": 0.1694502287395916
+ },
+ "contextual_requirement": {
+ "Full": 0.19478513206751483,
+ "Partial": 0.20697266728350117
+ },
+ "difficulty": {
+ "Easy": 0.25273031996725753,
+ "Moderate": 0.13899098269897925,
+ "Hard": 0.1936738222234887,
+ "Extreme": 0.18696665567502654
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.39483140448121057,
+ "T2. Sequencing & Structure Reconstruction": 0.4611114324443421,
+ "T3. Evidence-Grounded QA": 0.18333333333333332,
+ "T4. Summarization & Synthesis": 0.42108537584236744,
+ "T5. Attribution & Citation Alignment": 0.06676507134840469,
+ "T6. Aggregation & Clustering": 0.1877574508713916,
+ "T7. Consistency & Compliance Checking": 0.08623797150678873,
+ "T8. Structured & Numeric Reasoning": 0.08194444444444446,
+ "T9. Version & Code Diff Analysis": 0.16317028795937463,
+ "T10. Rule Induction & In-Context Learning": 0.17763888888888887,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.1
+ },
+ "language": {
+ "Chinese": 0.17263611058145747,
+ "English": 0.22765918454364006
+ }
+ },
+ "pass@2": 0.058666666666666666,
+ "BoN-3": {
+ "overall_metric": 0.24816325726475538,
+ "token_length": {
+ "8k": 0.34035307822346317,
+ "16k": 0.3192701911329572,
+ "32k": 0.2250254030817654,
+ "64k": 0.2025517709430857,
+ "128k": 0.1946876666118698,
+ "256k": 0.20709143359539076
+ },
+ "contextual_requirement": {
+ "Full": 0.24205586173316382,
+ "Partial": 0.25593630612314483
+ },
+ "difficulty": {
+ "Easy": 0.32112947294679217,
+ "Moderate": 0.17708357883824535,
+ "Hard": 0.24279933271248433,
+ "Extreme": 0.21844163295140512
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.4707134866823802,
+ "T2. Sequencing & Structure Reconstruction": 0.5820753654025713,
+ "T3. Evidence-Grounded QA": 0.2833333333333333,
+ "T4. Summarization & Synthesis": 0.43262446200126664,
+ "T5. Attribution & Citation Alignment": 0.08657988616321947,
+ "T6. Aggregation & Clustering": 0.24002098313209438,
+ "T7. Consistency & Compliance Checking": 0.09483216743431798,
+ "T8. Structured & Numeric Reasoning": 0.11666666666666667,
+ "T9. Version & Code Diff Analysis": 0.20318445637705182,
+ "T10. Rule Induction & In-Context Learning": 0.23291666666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.13333333333333333
+ },
+ "language": {
+ "Chinese": 0.22445802277836707,
+ "English": 0.27186849175114364
+ }
+ },
+ "pass@3": 0.08
+}
\ No newline at end of file
diff --git a/results/Mistral-Large-Instruct-2411/nonthinking_context-120000_bon-3_summary.json b/results/Mistral-Large-Instruct-2411/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e3720adfc00c7a910f75ab8638257196e8fafb5
--- /dev/null
+++ b/results/Mistral-Large-Instruct-2411/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3169030003205045,
+ "inference_iteration_1_overall_metric": 0.31907671414212213,
+ "inference_iteration_2_overall_metric": 0.31793129606674814,
+ "inference_iteration_3_overall_metric": 0.3137009907526448,
+ "average_token_length_metric": {
+ "8k": 0.4275967733792583,
+ "16k": 0.3967521615717436,
+ "32k": 0.3684760987014902,
+ "64k": 0.2785203784059932,
+ "128k": 0.230059702582881,
+ "256k": 0.2000128872816638
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.296348339160902,
+ "Partial": 0.3430634781600003
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.4165851756517525,
+ "Moderate": 0.23424308281587738,
+ "Hard": 0.29878920863003516,
+ "Extreme": 0.27389571270169133
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.5986650899154622,
+ "T2. Sequencing & Structure Reconstruction": 0.595066968754912,
+ "T3. Evidence-Grounded QA": 0.3194444444444444,
+ "T4. Summarization & Synthesis": 0.5064060211674865,
+ "T5. Attribution & Citation Alignment": 0.2495290285755284,
+ "T6. Aggregation & Clustering": 0.33225024606080356,
+ "T7. Consistency & Compliance Checking": 0.15459568382089198,
+ "T8. Structured & Numeric Reasoning": 0.07623456790123456,
+ "T9. Version & Code Diff Analysis": 0.307786685955565,
+ "T10. Rule Induction & In-Context Learning": 0.32310185185185186,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666665
+ },
+ "average_language_metric": {
+ "Chinese": 0.3028142034775456,
+ "English": 0.3309917971634648
+ },
+ "BoN-1": {
+ "overall_metric": 0.31907671414212213,
+ "token_length": {
+ "8k": 0.42705539380437574,
+ "16k": 0.40444023819384445,
+ "32k": 0.3490132720940125,
+ "64k": 0.2906900596810724,
+ "128k": 0.2317028482688344,
+ "256k": 0.21155847281059137
+ },
+ "contextual_requirement": {
+ "Full": 0.29415523855405945,
+ "Partial": 0.3507949557996556
+ },
+ "difficulty": {
+ "Easy": 0.4170748170246479,
+ "Moderate": 0.23410587578788378,
+ "Hard": 0.2992371396825697,
+ "Extreme": 0.2805656316735027
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6048650518041011,
+ "T2. Sequencing & Structure Reconstruction": 0.58605531755393,
+ "T3. Evidence-Grounded QA": 0.3333333333333333,
+ "T4. Summarization & Synthesis": 0.5119167501762161,
+ "T5. Attribution & Citation Alignment": 0.2473640440223312,
+ "T6. Aggregation & Clustering": 0.3368335027354634,
+ "T7. Consistency & Compliance Checking": 0.16052542025800243,
+ "T8. Structured & Numeric Reasoning": 0.06111111111111111,
+ "T9. Version & Code Diff Analysis": 0.3099971565075231,
+ "T10. Rule Induction & In-Context Learning": 0.3488888888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
+ },
+ "language": {
+ "Chinese": 0.29912831688952574,
+ "English": 0.3390251113947178
+ }
+ },
+ "pass@1": 0.11466666666666667,
+ "BoN-2": {
+ "overall_metric": 0.38594527565749853,
+ "token_length": {
+ "8k": 0.5042831481575721,
+ "16k": 0.475227330555721,
+ "32k": 0.43467889359878076,
+ "64k": 0.3349905766445296,
+ "128k": 0.31010981403738264,
+ "256k": 0.2563818909510073
+ },
+ "contextual_requirement": {
+ "Full": 0.3530504764849349,
+ "Partial": 0.4278113836953078
+ },
+ "difficulty": {
+ "Easy": 0.5179857542164504,
+ "Moderate": 0.29460590189658764,
+ "Hard": 0.35300286039674555,
+ "Extreme": 0.3229314719344226
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6679073055509605,
+ "T2. Sequencing & Structure Reconstruction": 0.6741524216524215,
+ "T3. Evidence-Grounded QA": 0.4166666666666667,
+ "T4. Summarization & Synthesis": 0.5302045683446602,
+ "T5. Attribution & Citation Alignment": 0.3062092936556243,
+ "T6. Aggregation & Clustering": 0.4082451589804528,
+ "T7. Consistency & Compliance Checking": 0.2092540186651841,
+ "T8. Structured & Numeric Reasoning": 0.11203703703703705,
+ "T9. Version & Code Diff Analysis": 0.36639914560216913,
+ "T10. Rule Induction & In-Context Learning": 0.4518055555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
+ },
+ "language": {
+ "Chinese": 0.3686971114406643,
+ "English": 0.4031934398743337
+ }
+ },
+ "pass@2": 0.154,
+ "BoN-3": {
+ "overall_metric": 0.421953191488323,
+ "token_length": {
+ "8k": 0.5418256257423728,
+ "16k": 0.514596656363469,
+ "32k": 0.47852902965649924,
+ "64k": 0.37391520136163825,
+ "128k": 0.34070102557165205,
+ "256k": 0.2821516102343087
+ },
+ "contextual_requirement": {
+ "Full": 0.3875133170087601,
+ "Partial": 0.4657857590077671
+ },
+ "difficulty": {
+ "Easy": 0.5664227594780717,
+ "Moderate": 0.3257686393502546,
+ "Hard": 0.38999210046214894,
+ "Extreme": 0.34786173490515865
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6882095518311481,
+ "T2. Sequencing & Structure Reconstruction": 0.7038069338069337,
+ "T3. Evidence-Grounded QA": 0.4666666666666667,
+ "T4. Summarization & Synthesis": 0.5370106443627681,
+ "T5. Attribution & Citation Alignment": 0.3586021289313467,
+ "T6. Aggregation & Clustering": 0.45934845005229746,
+ "T7. Consistency & Compliance Checking": 0.22554787508833687,
+ "T8. Structured & Numeric Reasoning": 0.14537037037037037,
+ "T9. Version & Code Diff Analysis": 0.42569114696089216,
+ "T10. Rule Induction & In-Context Learning": 0.5073611111111112,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
+ },
+ "language": {
+ "Chinese": 0.40247669761625265,
+ "English": 0.44142968536039356
+ }
+ },
+ "pass@3": 0.17933333333333334
+}
\ No newline at end of file
diff --git a/results/Mistral-Large-Instruct-2411/thinking_context-120000_bon-3_summary.json b/results/Mistral-Large-Instruct-2411/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..d64abb01910ad5356ee22fc1003223b07633ce25
--- /dev/null
+++ b/results/Mistral-Large-Instruct-2411/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3624706912806339,
+ "inference_iteration_1_overall_metric": 0.3518340119783078,
+ "inference_iteration_2_overall_metric": 0.36714050613217963,
+ "inference_iteration_3_overall_metric": 0.36843755573141435,
+ "average_token_length_metric": {
+ "8k": 0.519368817474254,
+ "16k": 0.46878642904173495,
+ "32k": 0.4367602706000035,
+ "64k": 0.2820635855060585,
+ "128k": 0.2391844493104579,
+ "256k": 0.22866059575129521
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3358652015644663,
+ "Partial": 0.39633222364666526
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.536500103394983,
+ "Moderate": 0.2561851997238422,
+ "Hard": 0.29421165901910773,
+ "Extreme": 0.28647339751785583
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6270246927945843,
+ "T2. Sequencing & Structure Reconstruction": 0.5930747647414312,
+ "T3. Evidence-Grounded QA": 0.35000000000000014,
+ "T4. Summarization & Synthesis": 0.4891097262506477,
+ "T5. Attribution & Citation Alignment": 0.27219561539555,
+ "T6. Aggregation & Clustering": 0.3929461864831539,
+ "T7. Consistency & Compliance Checking": 0.17406259434511873,
+ "T8. Structured & Numeric Reasoning": 0.2887345679012346,
+ "T9. Version & Code Diff Analysis": 0.3055397446573754,
+ "T10. Rule Induction & In-Context Learning": 0.37699074074074074,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333325
+ },
+ "average_language_metric": {
+ "Chinese": 0.3335769489143806,
+ "English": 0.3913644336468867
+ },
+ "BoN-1": {
+ "overall_metric": 0.3518340119783078,
+ "token_length": {
+ "8k": 0.4978924620415691,
+ "16k": 0.4660407575784769,
+ "32k": 0.44439300101192125,
+ "64k": 0.2504015769837324,
+ "128k": 0.22569830445064326,
+ "256k": 0.22657796980350628
+ },
+ "contextual_requirement": {
+ "Full": 0.3296943530149145,
+ "Partial": 0.3800117597499008
+ },
+ "difficulty": {
+ "Easy": 0.5117779578352326,
+ "Moderate": 0.25778003157579815,
+ "Hard": 0.27490568840940766,
+ "Extreme": 0.28892697548899077
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.640262949096431,
+ "T2. Sequencing & Structure Reconstruction": 0.5669311244311241,
+ "T3. Evidence-Grounded QA": 0.3416666666666667,
+ "T4. Summarization & Synthesis": 0.4918201268608611,
+ "T5. Attribution & Citation Alignment": 0.23208406359722156,
+ "T6. Aggregation & Clustering": 0.36478858378368206,
+ "T7. Consistency & Compliance Checking": 0.1661247455189573,
+ "T8. Structured & Numeric Reasoning": 0.2925925925925926,
+ "T9. Version & Code Diff Analysis": 0.32795689178925624,
+ "T10. Rule Induction & In-Context Learning": 0.3702777777777778,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
+ },
+ "language": {
+ "Chinese": 0.3268255224599945,
+ "English": 0.3768425014966223
+ }
+ },
+ "pass@1": 0.15066666666666667,
+ "BoN-2": {
+ "overall_metric": 0.43679829295529543,
+ "token_length": {
+ "8k": 0.5911437236371854,
+ "16k": 0.5703298590207265,
+ "32k": 0.5326982597861109,
+ "64k": 0.346664895131139,
+ "128k": 0.3012300084386134,
+ "256k": 0.2787230117179987
+ },
+ "contextual_requirement": {
+ "Full": 0.407647673394358,
+ "Partial": 0.47389908148739823
+ },
+ "difficulty": {
+ "Easy": 0.6480374309185871,
+ "Moderate": 0.3165832098159298,
+ "Hard": 0.36022054379586954,
+ "Extreme": 0.3346525294008514
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7044693578616591,
+ "T2. Sequencing & Structure Reconstruction": 0.6615363340363338,
+ "T3. Evidence-Grounded QA": 0.45,
+ "T4. Summarization & Synthesis": 0.5100444551229819,
+ "T5. Attribution & Citation Alignment": 0.3337101477566481,
+ "T6. Aggregation & Clustering": 0.47858076006885547,
+ "T7. Consistency & Compliance Checking": 0.23101969311309353,
+ "T8. Structured & Numeric Reasoning": 0.3634259259259259,
+ "T9. Version & Code Diff Analysis": 0.3826232429462048,
+ "T10. Rule Induction & In-Context Learning": 0.46638888888888885,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
+ },
+ "language": {
+ "Chinese": 0.40374567113223503,
+ "English": 0.4698509147783562
+ }
+ },
+ "pass@2": 0.206,
+ "BoN-3": {
+ "overall_metric": 0.48020123308951057,
+ "token_length": {
+ "8k": 0.6542419600875183,
+ "16k": 0.6183938644898413,
+ "32k": 0.571965197774906,
+ "64k": 0.3866173301264082,
+ "128k": 0.33388551488861024,
+ "256k": 0.3161035311697812
+ },
+ "contextual_requirement": {
+ "Full": 0.4442943380691916,
+ "Partial": 0.5259009176608265
+ },
+ "difficulty": {
+ "Easy": 0.7130100233942409,
+ "Moderate": 0.35462311357794873,
+ "Hard": 0.39367151583097126,
+ "Extreme": 0.3644860692251465
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7280998354477696,
+ "T2. Sequencing & Structure Reconstruction": 0.7144397731897729,
+ "T3. Evidence-Grounded QA": 0.5083333333333333,
+ "T4. Summarization & Synthesis": 0.5171556422011707,
+ "T5. Attribution & Citation Alignment": 0.37747723607793277,
+ "T6. Aggregation & Clustering": 0.5266240833565394,
+ "T7. Consistency & Compliance Checking": 0.25904571820731975,
+ "T8. Structured & Numeric Reasoning": 0.41620370370370363,
+ "T9. Version & Code Diff Analysis": 0.42031044657867184,
+ "T10. Rule Induction & In-Context Learning": 0.5422222222222222,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.39166666666666666
+ },
+ "language": {
+ "Chinese": 0.4469064954573681,
+ "English": 0.5134959707216539
+ }
+ },
+ "pass@3": 0.23866666666666667
+}
\ No newline at end of file
diff --git a/results/Mistral-Small-3.2-24B-Instruct-2506/nonthinking_context-120000_bon-3_summary.json b/results/Mistral-Small-3.2-24B-Instruct-2506/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..54ba21c89251ed84dca9b107b657f75048706b66
--- /dev/null
+++ b/results/Mistral-Small-3.2-24B-Instruct-2506/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3731986116770262,
+ "inference_iteration_1_overall_metric": 0.3764430164692507,
+ "inference_iteration_2_overall_metric": 0.37295344128487523,
+ "inference_iteration_3_overall_metric": 0.37019937727695235,
+ "average_token_length_metric": {
+ "8k": 0.47652440065044005,
+ "16k": 0.449888659714999,
+ "32k": 0.4078506319915549,
+ "64k": 0.3156364049708699,
+ "128k": 0.3188001493625175,
+ "256k": 0.2704914233717748
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3426671160616572,
+ "Partial": 0.4120568788238591
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5045346001029442,
+ "Moderate": 0.274199216283861,
+ "Hard": 0.33731265993779014,
+ "Extreme": 0.3179211546315068
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6709523819471523,
+ "T2. Sequencing & Structure Reconstruction": 0.6500401579022548,
+ "T3. Evidence-Grounded QA": 0.4277777777777778,
+ "T4. Summarization & Synthesis": 0.5256573469543858,
+ "T5. Attribution & Citation Alignment": 0.3048343833139541,
+ "T6. Aggregation & Clustering": 0.3439710542602968,
+ "T7. Consistency & Compliance Checking": 0.18369196767650903,
+ "T8. Structured & Numeric Reasoning": 0.14243827160493827,
+ "T9. Version & Code Diff Analysis": 0.392976065162092,
+ "T10. Rule Induction & In-Context Learning": 0.43203703703703705,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2555555555555555
+ },
+ "average_language_metric": {
+ "Chinese": 0.3541108289818725,
+ "English": 0.3922863943721798
+ },
+ "BoN-1": {
+ "overall_metric": 0.3764430164692507,
+ "token_length": {
+ "8k": 0.47293579301173544,
+ "16k": 0.4529323453367711,
+ "32k": 0.4036888939118912,
+ "64k": 0.3200427598731278,
+ "128k": 0.32499778534152896,
+ "256k": 0.2840605213404498
+ },
+ "contextual_requirement": {
+ "Full": 0.3477950533618385,
+ "Partial": 0.41290406042413935
+ },
+ "difficulty": {
+ "Easy": 0.5103779259997739,
+ "Moderate": 0.27686100242496664,
+ "Hard": 0.3386830547019045,
+ "Extreme": 0.31993211786890735
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6798326459221492,
+ "T2. Sequencing & Structure Reconstruction": 0.6539941668562641,
+ "T3. Evidence-Grounded QA": 0.425,
+ "T4. Summarization & Synthesis": 0.5249912472397621,
+ "T5. Attribution & Citation Alignment": 0.3021910633281516,
+ "T6. Aggregation & Clustering": 0.34743215354780066,
+ "T7. Consistency & Compliance Checking": 0.18127201803220644,
+ "T8. Structured & Numeric Reasoning": 0.1486111111111111,
+ "T9. Version & Code Diff Analysis": 0.39855565848262914,
+ "T10. Rule Induction & In-Context Learning": 0.4466666666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336
+ },
+ "language": {
+ "Chinese": 0.35803273408066133,
+ "English": 0.3948532988578403
+ }
+ },
+ "pass@1": 0.14533333333333334,
+ "BoN-2": {
+ "overall_metric": 0.39540993027593374,
+ "token_length": {
+ "8k": 0.5041800135019555,
+ "16k": 0.476249230788589,
+ "32k": 0.4259536786274836,
+ "64k": 0.33796643925845926,
+ "128k": 0.338211260649731,
+ "256k": 0.28989895882938543
+ },
+ "contextual_requirement": {
+ "Full": 0.36338165608597955,
+ "Partial": 0.4361731883358756
+ },
+ "difficulty": {
+ "Easy": 0.5278149129484206,
+ "Moderate": 0.30819954499388696,
+ "Hard": 0.35088879158138625,
+ "Extreme": 0.3368899502329733
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6887146593492802,
+ "T2. Sequencing & Structure Reconstruction": 0.6648539552160522,
+ "T3. Evidence-Grounded QA": 0.4583333333333333,
+ "T4. Summarization & Synthesis": 0.5373243786993875,
+ "T5. Attribution & Citation Alignment": 0.34748354292868555,
+ "T6. Aggregation & Clustering": 0.37668161380086723,
+ "T7. Consistency & Compliance Checking": 0.19122702647405698,
+ "T8. Structured & Numeric Reasoning": 0.1597222222222222,
+ "T9. Version & Code Diff Analysis": 0.4286346318433818,
+ "T10. Rule Induction & In-Context Learning": 0.45916666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.26666666666666666
+ },
+ "language": {
+ "Chinese": 0.37724327797419344,
+ "English": 0.41357658257767443
+ }
+ },
+ "pass@2": 0.16066666666666668,
+ "BoN-3": {
+ "overall_metric": 0.406738281428378,
+ "token_length": {
+ "8k": 0.5137058259549282,
+ "16k": 0.4815649687345498,
+ "32k": 0.44638935934429075,
+ "64k": 0.3486156261221766,
+ "128k": 0.34787432869425106,
+ "256k": 0.30227957972006925
+ },
+ "contextual_requirement": {
+ "Full": 0.37465484620173795,
+ "Partial": 0.44757174444410075
+ },
+ "difficulty": {
+ "Easy": 0.5448834015964552,
+ "Moderate": 0.31056675024038943,
+ "Hard": 0.3651326226193961,
+ "Extreme": 0.34590114503772057
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6949591749410688,
+ "T2. Sequencing & Structure Reconstruction": 0.6751648017768989,
+ "T3. Evidence-Grounded QA": 0.475,
+ "T4. Summarization & Synthesis": 0.5442519483294492,
+ "T5. Attribution & Citation Alignment": 0.35765608366828805,
+ "T6. Aggregation & Clustering": 0.3948528763892126,
+ "T7. Consistency & Compliance Checking": 0.19845858401758282,
+ "T8. Structured & Numeric Reasoning": 0.17083333333333334,
+ "T9. Version & Code Diff Analysis": 0.43181265186215534,
+ "T10. Rule Induction & In-Context Learning": 0.4675,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667
+ },
+ "language": {
+ "Chinese": 0.3869496550877674,
+ "English": 0.4265269077689878
+ }
+ },
+ "pass@3": 0.168
+}
\ No newline at end of file
diff --git a/results/Mistral-Small-3.2-24B-Instruct-2506/thinking_context-120000_bon-3_summary.json b/results/Mistral-Small-3.2-24B-Instruct-2506/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..651a3d9332182d499c82750556491bf2214d27b7
--- /dev/null
+++ b/results/Mistral-Small-3.2-24B-Instruct-2506/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3987037370686582,
+ "inference_iteration_1_overall_metric": 0.4005466639339314,
+ "inference_iteration_2_overall_metric": 0.39333333377476243,
+ "inference_iteration_3_overall_metric": 0.40223121349728147,
+ "average_token_length_metric": {
+ "8k": 0.5424778112458041,
+ "16k": 0.48688051040801983,
+ "32k": 0.4403148811203121,
+ "64k": 0.3401436546897792,
+ "128k": 0.30659117482257736,
+ "256k": 0.27581439012545783
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3659155932981649,
+ "Partial": 0.44043410186746873
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6121824426461749,
+ "Moderate": 0.27744531940705713,
+ "Hard": 0.317638215137331,
+ "Extreme": 0.29773756082773445
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6558837112656615,
+ "T2. Sequencing & Structure Reconstruction": 0.6233137078970414,
+ "T3. Evidence-Grounded QA": 0.3972222222222222,
+ "T4. Summarization & Synthesis": 0.4982728933846063,
+ "T5. Attribution & Citation Alignment": 0.3406895449155636,
+ "T6. Aggregation & Clustering": 0.3784497613515588,
+ "T7. Consistency & Compliance Checking": 0.1850253553759489,
+ "T8. Structured & Numeric Reasoning": 0.3645061728395062,
+ "T9. Version & Code Diff Analysis": 0.4149149215448353,
+ "T10. Rule Induction & In-Context Learning": 0.44208333333333344,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.21944444444444447
+ },
+ "average_language_metric": {
+ "Chinese": 0.37181071338153865,
+ "English": 0.42559676075577846
+ },
+ "BoN-1": {
+ "overall_metric": 0.4005466639339314,
+ "token_length": {
+ "8k": 0.549936123733468,
+ "16k": 0.5059723107057712,
+ "32k": 0.436500147965248,
+ "64k": 0.3263663588238478,
+ "128k": 0.3118538606332236,
+ "256k": 0.27265118174202957
+ },
+ "contextual_requirement": {
+ "Full": 0.3706804581354897,
+ "Partial": 0.43855819858649375
+ },
+ "difficulty": {
+ "Easy": 0.6146432894264784,
+ "Moderate": 0.27885587375626963,
+ "Hard": 0.3246283820077309,
+ "Extreme": 0.29580677382167836
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6418078252086361,
+ "T2. Sequencing & Structure Reconstruction": 0.6254830724830722,
+ "T3. Evidence-Grounded QA": 0.4083333333333333,
+ "T4. Summarization & Synthesis": 0.4926333263044629,
+ "T5. Attribution & Citation Alignment": 0.34851784289183024,
+ "T6. Aggregation & Clustering": 0.38872032735842527,
+ "T7. Consistency & Compliance Checking": 0.19546994720957953,
+ "T8. Structured & Numeric Reasoning": 0.3689814814814814,
+ "T9. Version & Code Diff Analysis": 0.41307804265635545,
+ "T10. Rule Induction & In-Context Learning": 0.4305555555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666667
+ },
+ "language": {
+ "Chinese": 0.372583682740498,
+ "English": 0.4285096451273649
+ }
+ },
+ "pass@1": 0.17933333333333334,
+ "BoN-2": {
+ "overall_metric": 0.44657411540443975,
+ "token_length": {
+ "8k": 0.5764947600450254,
+ "16k": 0.5474122233968183,
+ "32k": 0.5034002505230384,
+ "64k": 0.3860776417526754,
+ "128k": 0.36789187708581184,
+ "256k": 0.29816793962327126
+ },
+ "contextual_requirement": {
+ "Full": 0.4116834851076065,
+ "Partial": 0.49098037214586493
+ },
+ "difficulty": {
+ "Easy": 0.6920753978961921,
+ "Moderate": 0.310198508060507,
+ "Hard": 0.35436801382293803,
+ "Extreme": 0.32777524203722713
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6903770310691527,
+ "T2. Sequencing & Structure Reconstruction": 0.6839748214748216,
+ "T3. Evidence-Grounded QA": 0.4666666666666667,
+ "T4. Summarization & Synthesis": 0.5134804778640072,
+ "T5. Attribution & Citation Alignment": 0.38386685118201486,
+ "T6. Aggregation & Clustering": 0.4380455926805801,
+ "T7. Consistency & Compliance Checking": 0.21770656422693052,
+ "T8. Structured & Numeric Reasoning": 0.4087962962962963,
+ "T9. Version & Code Diff Analysis": 0.4801823589375706,
+ "T10. Rule Induction & In-Context Learning": 0.5001388888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.26666666666666666
+ },
+ "language": {
+ "Chinese": 0.420870958024232,
+ "English": 0.47227727278464793
+ }
+ },
+ "pass@2": 0.22,
+ "BoN-3": {
+ "overall_metric": 0.4796780923764449,
+ "token_length": {
+ "8k": 0.6046783631215611,
+ "16k": 0.5809303286628532,
+ "32k": 0.5430608854898975,
+ "64k": 0.4279434742821682,
+ "128k": 0.3893995220031886,
+ "256k": 0.3320559806990021
+ },
+ "contextual_requirement": {
+ "Full": 0.43765149013675825,
+ "Partial": 0.5331664952269561
+ },
+ "difficulty": {
+ "Easy": 0.7316590781086564,
+ "Moderate": 0.35837276698741044,
+ "Hard": 0.3776680447598276,
+ "Extreme": 0.35032197345759886
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7260943331172737,
+ "T2. Sequencing & Structure Reconstruction": 0.7150282125282126,
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
+ "T4. Summarization & Synthesis": 0.5186475702115565,
+ "T5. Attribution & Citation Alignment": 0.4078869630146266,
+ "T6. Aggregation & Clustering": 0.4834113173877446,
+ "T7. Consistency & Compliance Checking": 0.24236854182796289,
+ "T8. Structured & Numeric Reasoning": 0.42546296296296293,
+ "T9. Version & Code Diff Analysis": 0.5163159536769999,
+ "T10. Rule Induction & In-Context Learning": 0.5601388888888891,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667
+ },
+ "language": {
+ "Chinese": 0.4547335554957914,
+ "English": 0.5046226292570991
+ }
+ },
+ "pass@3": 0.24533333333333332
+}
\ No newline at end of file
diff --git a/results/Qwen2.5-72B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Qwen2.5-72B-Instruct/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..4011179399d5c41234deecb520b50ecee044f395
--- /dev/null
+++ b/results/Qwen2.5-72B-Instruct/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.39637256479971833,
+ "inference_iteration_1_overall_metric": 0.3937967514186296,
+ "inference_iteration_2_overall_metric": 0.3964337111478629,
+ "inference_iteration_3_overall_metric": 0.3988872318326605,
+ "average_token_length_metric": {
+ "8k": 0.47921715442818713,
+ "16k": 0.3991398791513683,
+ "32k": 0.44041622927702145,
+ "64k": 0.36697110801414906,
+ "128k": 0.37859902889300584,
+ "256k": 0.3138919890345743
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.38099477047000957,
+ "Partial": 0.41594430303752816
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.534784488551614,
+ "Moderate": 0.31363336895476984,
+ "Hard": 0.35900431211097755,
+ "Extreme": 0.323637349749313
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6846177524490639,
+ "T2. Sequencing & Structure Reconstruction": 0.657116734090758,
+ "T3. Evidence-Grounded QA": 0.4638888888888889,
+ "T4. Summarization & Synthesis": 0.5223335075531164,
+ "T5. Attribution & Citation Alignment": 0.35501048685955816,
+ "T6. Aggregation & Clustering": 0.3527926553465598,
+ "T7. Consistency & Compliance Checking": 0.22711088737402554,
+ "T8. Structured & Numeric Reasoning": 0.1410493827160494,
+ "T9. Version & Code Diff Analysis": 0.3933158575556893,
+ "T10. Rule Induction & In-Context Learning": 0.5025,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2944444444444444
+ },
+ "average_language_metric": {
+ "Chinese": 0.39791297636758555,
+ "English": 0.39483215323184995
+ },
+ "BoN-1": {
+ "overall_metric": 0.3937967514186296,
+ "token_length": {
+ "8k": 0.48150911063795604,
+ "16k": 0.3955134086911857,
+ "32k": 0.44051865480915253,
+ "64k": 0.35605224184998097,
+ "128k": 0.3735426660157483,
+ "256k": 0.31564442650775115
+ },
+ "contextual_requirement": {
+ "Full": 0.3790246648535571,
+ "Partial": 0.4125975888650856
+ },
+ "difficulty": {
+ "Easy": 0.5289899613446293,
+ "Moderate": 0.31455221378896076,
+ "Hard": 0.35328095705198703,
+ "Extreme": 0.3243594986063606
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6838538961825568,
+ "T2. Sequencing & Structure Reconstruction": 0.6576533446275381,
+ "T3. Evidence-Grounded QA": 0.4666666666666667,
+ "T4. Summarization & Synthesis": 0.5233466170189328,
+ "T5. Attribution & Citation Alignment": 0.34332894764623034,
+ "T6. Aggregation & Clustering": 0.35214311990772595,
+ "T7. Consistency & Compliance Checking": 0.22465424456172434,
+ "T8. Structured & Numeric Reasoning": 0.14120370370370372,
+ "T9. Version & Code Diff Analysis": 0.3972749849978754,
+ "T10. Rule Induction & In-Context Learning": 0.49000000000000005,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2833333333333333
+ },
+ "language": {
+ "Chinese": 0.39463375393747463,
+ "English": 0.3929597488997841
+ }
+ },
+ "pass@1": 0.158,
+ "BoN-2": {
+ "overall_metric": 0.4266968766552369,
+ "token_length": {
+ "8k": 0.5081210460935147,
+ "16k": 0.433884061206856,
+ "32k": 0.4739562311618829,
+ "64k": 0.3969295417300637,
+ "128k": 0.4063722243032158,
+ "256k": 0.3409181554358875
+ },
+ "contextual_requirement": {
+ "Full": 0.40567420103364404,
+ "Partial": 0.45345300926453685
+ },
+ "difficulty": {
+ "Easy": 0.5611962030845826,
+ "Moderate": 0.34459684961888387,
+ "Hard": 0.3903222118897577,
+ "Extreme": 0.35717530981838436
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7162339766047533,
+ "T2. Sequencing & Structure Reconstruction": 0.6883400190642125,
+ "T3. Evidence-Grounded QA": 0.5,
+ "T4. Summarization & Synthesis": 0.538190912835796,
+ "T5. Attribution & Citation Alignment": 0.40236792686670547,
+ "T6. Aggregation & Clustering": 0.3971289905185964,
+ "T7. Consistency & Compliance Checking": 0.2560060835156213,
+ "T8. Structured & Numeric Reasoning": 0.1550925925925926,
+ "T9. Version & Code Diff Analysis": 0.44331995621210946,
+ "T10. Rule Induction & In-Context Learning": 0.51625,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
+ },
+ "language": {
+ "Chinese": 0.4315697660767929,
+ "English": 0.42182398723368103
+ }
+ },
+ "pass@2": 0.17066666666666666,
+ "BoN-3": {
+ "overall_metric": 0.44674650669663196,
+ "token_length": {
+ "8k": 0.5262573822920757,
+ "16k": 0.4582783757995498,
+ "32k": 0.49570076375833805,
+ "64k": 0.4144687055290633,
+ "128k": 0.4299894189350878,
+ "256k": 0.3557843938656765
+ },
+ "contextual_requirement": {
+ "Full": 0.4233162526040225,
+ "Partial": 0.4765668300872253
+ },
+ "difficulty": {
+ "Easy": 0.5769237639050746,
+ "Moderate": 0.3742638517279143,
+ "Hard": 0.4173834394346273,
+ "Extreme": 0.37103736067438053
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7291777705678423,
+ "T2. Sequencing & Structure Reconstruction": 0.7155669062905904,
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
+ "T4. Summarization & Synthesis": 0.5442979937116319,
+ "T5. Attribution & Citation Alignment": 0.42622684636162084,
+ "T6. Aggregation & Clustering": 0.4288727477882729,
+ "T7. Consistency & Compliance Checking": 0.27916215760547175,
+ "T8. Structured & Numeric Reasoning": 0.16203703703703703,
+ "T9. Version & Code Diff Analysis": 0.4872872364633724,
+ "T10. Rule Induction & In-Context Learning": 0.535,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.325
+ },
+ "language": {
+ "Chinese": 0.45016861158798044,
+ "English": 0.44332440180528293
+ }
+ },
+ "pass@3": 0.18466666666666667
+}
\ No newline at end of file
diff --git a/results/Qwen2.5-72B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Qwen2.5-72B-Instruct/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..06866dcb22e077c418c27a0dbd121a1e1b05b7f7
--- /dev/null
+++ b/results/Qwen2.5-72B-Instruct/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.4408902584645425,
+ "inference_iteration_1_overall_metric": 0.44376702120969425,
+ "inference_iteration_2_overall_metric": 0.44036154482238754,
+ "inference_iteration_3_overall_metric": 0.43854220936154753,
+ "average_token_length_metric": {
+ "8k": 0.5285612004903454,
+ "16k": 0.47472561982724465,
+ "32k": 0.46043302796304997,
+ "64k": 0.4189652464720481,
+ "128k": 0.3917337204907261,
+ "256k": 0.37092273554384453
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.41330115115036786,
+ "Partial": 0.47600366777349373
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6779648379508855,
+ "Moderate": 0.31025727820561944,
+ "Hard": 0.36454303136241756,
+ "Extreme": 0.3171341962257389
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6877440761283994,
+ "T2. Sequencing & Structure Reconstruction": 0.6590772291315768,
+ "T3. Evidence-Grounded QA": 0.45555555555555544,
+ "T4. Summarization & Synthesis": 0.5129812732828513,
+ "T5. Attribution & Citation Alignment": 0.34764194837967977,
+ "T6. Aggregation & Clustering": 0.4267544160945399,
+ "T7. Consistency & Compliance Checking": 0.24278940543780603,
+ "T8. Structured & Numeric Reasoning": 0.4026234567901234,
+ "T9. Version & Code Diff Analysis": 0.49529389751168806,
+ "T10. Rule Induction & In-Context Learning": 0.43902777777777774,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.30555555555555564
+ },
+ "average_language_metric": {
+ "Chinese": 0.4318377484964527,
+ "English": 0.44994276843263364
+ },
+ "BoN-1": {
+ "overall_metric": 0.44376702120969425,
+ "token_length": {
+ "8k": 0.5322737731717848,
+ "16k": 0.4756587605561328,
+ "32k": 0.4739240524772759,
+ "64k": 0.4301284112089289,
+ "128k": 0.3925013454757625,
+ "256k": 0.3581157843682803
+ },
+ "contextual_requirement": {
+ "Full": 0.4190008048268543,
+ "Partial": 0.4752876602424002
+ },
+ "difficulty": {
+ "Easy": 0.6809264760303465,
+ "Moderate": 0.3123838002275438,
+ "Hard": 0.38330482067932314,
+ "Extreme": 0.30997717937688596
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6951653783701364,
+ "T2. Sequencing & Structure Reconstruction": 0.6538150476737434,
+ "T3. Evidence-Grounded QA": 0.4666666666666667,
+ "T4. Summarization & Synthesis": 0.5148026353764281,
+ "T5. Attribution & Citation Alignment": 0.3465569726210779,
+ "T6. Aggregation & Clustering": 0.4294465777942789,
+ "T7. Consistency & Compliance Checking": 0.24971806893075102,
+ "T8. Structured & Numeric Reasoning": 0.4111111111111112,
+ "T9. Version & Code Diff Analysis": 0.5131396498811356,
+ "T10. Rule Induction & In-Context Learning": 0.4131944444444445,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.30833333333333335
+ },
+ "language": {
+ "Chinese": 0.4290108132095998,
+ "English": 0.4585232292097895
+ }
+ },
+ "pass@1": 0.20933333333333334,
+ "BoN-2": {
+ "overall_metric": 0.5111971513374702,
+ "token_length": {
+ "8k": 0.608869344107263,
+ "16k": 0.5580527030947412,
+ "32k": 0.5259126977020612,
+ "64k": 0.49288230504222735,
+ "128k": 0.46074971792635044,
+ "256k": 0.42071614015218295
+ },
+ "contextual_requirement": {
+ "Full": 0.48375250466177205,
+ "Partial": 0.5461267016519972
+ },
+ "difficulty": {
+ "Easy": 0.7817754799095641,
+ "Moderate": 0.3628317914878497,
+ "Hard": 0.43323803533283267,
+ "Extreme": 0.36344492213322593
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7590349530612852,
+ "T2. Sequencing & Structure Reconstruction": 0.7222824598911556,
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
+ "T4. Summarization & Synthesis": 0.5277624952357224,
+ "T5. Attribution & Citation Alignment": 0.42967259054859913,
+ "T6. Aggregation & Clustering": 0.5148559581215818,
+ "T7. Consistency & Compliance Checking": 0.29132433156438,
+ "T8. Structured & Numeric Reasoning": 0.49027777777777776,
+ "T9. Version & Code Diff Analysis": 0.5752747917860145,
+ "T10. Rule Induction & In-Context Learning": 0.5229166666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.375
+ },
+ "language": {
+ "Chinese": 0.49799451206849277,
+ "English": 0.5243997906064495
+ }
+ },
+ "pass@2": 0.26466666666666666,
+ "BoN-3": {
+ "overall_metric": 0.5467736842178992,
+ "token_length": {
+ "8k": 0.6412013485454756,
+ "16k": 0.5997169544343945,
+ "32k": 0.5670917206414194,
+ "64k": 0.5191991662550954,
+ "128k": 0.4920959206351818,
+ "256k": 0.4613369947958361
+ },
+ "contextual_requirement": {
+ "Full": 0.5191971033346234,
+ "Partial": 0.5818711507966168
+ },
+ "difficulty": {
+ "Easy": 0.8347260400113382,
+ "Moderate": 0.3850286249254218,
+ "Hard": 0.46842929282635226,
+ "Extreme": 0.3890300836616431
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7891907338254459,
+ "T2. Sequencing & Structure Reconstruction": 0.7449808987308988,
+ "T3. Evidence-Grounded QA": 0.575,
+ "T4. Summarization & Synthesis": 0.533886440771335,
+ "T5. Attribution & Citation Alignment": 0.45243256205857046,
+ "T6. Aggregation & Clustering": 0.5676630345451723,
+ "T7. Consistency & Compliance Checking": 0.3275556384256395,
+ "T8. Structured & Numeric Reasoning": 0.5166666666666667,
+ "T9. Version & Code Diff Analysis": 0.6088524078812863,
+ "T10. Rule Induction & In-Context Learning": 0.5875,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
+ },
+ "language": {
+ "Chinese": 0.5347761594749554,
+ "English": 0.5587712089608458
+ }
+ },
+ "pass@3": 0.29933333333333334
+}
\ No newline at end of file
diff --git a/results/Qwen3-14B/nonthinking_context-120000_bon-3_summary.json b/results/Qwen3-14B/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..47747dd77eb13f5abb8245c9e776e27a6d41674f
--- /dev/null
+++ b/results/Qwen3-14B/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.371082468941152,
+ "inference_iteration_1_overall_metric": 0.3702373257200545,
+ "inference_iteration_2_overall_metric": 0.3726984925368399,
+ "inference_iteration_3_overall_metric": 0.37031158856656204,
+ "average_token_length_metric": {
+ "8k": 0.43147751060158823,
+ "16k": 0.4033401435644649,
+ "32k": 0.38388990895019404,
+ "64k": 0.35197621981023713,
+ "128k": 0.3411527865206871,
+ "256k": 0.3146582441997405
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.34812497724125846,
+ "Partial": 0.4003010947410168
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.4843882954180315,
+ "Moderate": 0.29072923835932707,
+ "Hard": 0.35344272393957227,
+ "Extreme": 0.31132465435421575
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6723638502162732,
+ "T2. Sequencing & Structure Reconstruction": 0.6447956689096103,
+ "T3. Evidence-Grounded QA": 0.4888888888888889,
+ "T4. Summarization & Synthesis": 0.5233374417827854,
+ "T5. Attribution & Citation Alignment": 0.28649007131147347,
+ "T6. Aggregation & Clustering": 0.36581903676825483,
+ "T7. Consistency & Compliance Checking": 0.2131249616899122,
+ "T8. Structured & Numeric Reasoning": 0.08441358024691357,
+ "T9. Version & Code Diff Analysis": 0.354238942968118,
+ "T10. Rule Induction & In-Context Learning": 0.43171296296296297,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
+ },
+ "average_language_metric": {
+ "Chinese": 0.3760505297242663,
+ "English": 0.36611440815803775
+ },
+ "BoN-1": {
+ "overall_metric": 0.3702373257200545,
+ "token_length": {
+ "8k": 0.4365488155340593,
+ "16k": 0.40901874466031907,
+ "32k": 0.3764681078768914,
+ "64k": 0.3558737887299017,
+ "128k": 0.337849918010455,
+ "256k": 0.30566457950869874
+ },
+ "contextual_requirement": {
+ "Full": 0.34789877784316464,
+ "Partial": 0.39866820483609555
+ },
+ "difficulty": {
+ "Easy": 0.47550918682117166,
+ "Moderate": 0.2976353828419508,
+ "Hard": 0.3595591768355849,
+ "Extreme": 0.3096166654215201
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6693651539947573,
+ "T2. Sequencing & Structure Reconstruction": 0.643656093318885,
+ "T3. Evidence-Grounded QA": 0.49166666666666664,
+ "T4. Summarization & Synthesis": 0.5246008307406761,
+ "T5. Attribution & Citation Alignment": 0.28080851312128013,
+ "T6. Aggregation & Clustering": 0.3666330157824754,
+ "T7. Consistency & Compliance Checking": 0.219629388990068,
+ "T8. Structured & Numeric Reasoning": 0.07824074074074075,
+ "T9. Version & Code Diff Analysis": 0.3648645953884854,
+ "T10. Rule Induction & In-Context Learning": 0.41458333333333336,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
+ },
+ "language": {
+ "Chinese": 0.37263158698072796,
+ "English": 0.36784306445938036
+ }
+ },
+ "pass@1": 0.136,
+ "BoN-2": {
+ "overall_metric": 0.39747790211532286,
+ "token_length": {
+ "8k": 0.4586397357227334,
+ "16k": 0.4346767556872062,
+ "32k": 0.40654133431254835,
+ "64k": 0.3846278376492738,
+ "128k": 0.366368675809838,
+ "256k": 0.334013073510339
+ },
+ "contextual_requirement": {
+ "Full": 0.3762242907679459,
+ "Partial": 0.4245279529210758
+ },
+ "difficulty": {
+ "Easy": 0.5137672201120023,
+ "Moderate": 0.3211245068584676,
+ "Hard": 0.37927074472552935,
+ "Extreme": 0.33219733038914645
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6959264095016384,
+ "T2. Sequencing & Structure Reconstruction": 0.6706657296788379,
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
+ "T4. Summarization & Synthesis": 0.5372603569538371,
+ "T5. Attribution & Citation Alignment": 0.30622924870075346,
+ "T6. Aggregation & Clustering": 0.394716262565116,
+ "T7. Consistency & Compliance Checking": 0.23852214824749268,
+ "T8. Structured & Numeric Reasoning": 0.10046296296296296,
+ "T9. Version & Code Diff Analysis": 0.389089970943113,
+ "T10. Rule Induction & In-Context Learning": 0.46875,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.26666666666666666
+ },
+ "language": {
+ "Chinese": 0.39990507125254376,
+ "English": 0.39505073297810234
+ }
+ },
+ "pass@2": 0.152,
+ "BoN-3": {
+ "overall_metric": 0.4076945334257239,
+ "token_length": {
+ "8k": 0.47272207856160825,
+ "16k": 0.44496379279331655,
+ "32k": 0.4150202640804616,
+ "64k": 0.39561603549517055,
+ "128k": 0.3755358328446131,
+ "256k": 0.3423091967791733
+ },
+ "contextual_requirement": {
+ "Full": 0.38567226658065773,
+ "Partial": 0.435722873046717
+ },
+ "difficulty": {
+ "Easy": 0.5205174207555429,
+ "Moderate": 0.3404168914689614,
+ "Hard": 0.3867590355007207,
+ "Extreme": 0.34204218281858184
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7035836621732522,
+ "T2. Sequencing & Structure Reconstruction": 0.6789876392507479,
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
+ "T4. Summarization & Synthesis": 0.5440147604159721,
+ "T5. Attribution & Citation Alignment": 0.3324233311147572,
+ "T6. Aggregation & Clustering": 0.41234554444538063,
+ "T7. Consistency & Compliance Checking": 0.24338937917661474,
+ "T8. Structured & Numeric Reasoning": 0.10046296296296298,
+ "T9. Version & Code Diff Analysis": 0.41745877832271355,
+ "T10. Rule Induction & In-Context Learning": 0.46874999999999994,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.275
+ },
+ "language": {
+ "Chinese": 0.41071052140088854,
+ "English": 0.4046785454505592
+ }
+ },
+ "pass@3": 0.15933333333333333
+}
\ No newline at end of file
diff --git a/results/Qwen3-14B/thinking_context-120000_bon-3_summary.json b/results/Qwen3-14B/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..d38d8acb001e92d7cf2a4bd2d46ab6091fafc040
--- /dev/null
+++ b/results/Qwen3-14B/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 19,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.47140163871696883,
+ "inference_iteration_1_overall_metric": 0.4702465037493788,
+ "inference_iteration_2_overall_metric": 0.4731184728103188,
+ "inference_iteration_3_overall_metric": 0.4708399395912094,
+ "average_token_length_metric": {
+ "8k": 0.5782430822549752,
+ "16k": 0.5324144227767025,
+ "32k": 0.497499382093641,
+ "64k": 0.4573465836705664,
+ "128k": 0.3964337558937265,
+ "256k": 0.36647260561220363
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.434181327100322,
+ "Partial": 0.5187729444108845
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6954869771942226,
+ "Moderate": 0.39026206272645725,
+ "Hard": 0.3840987663314641,
+ "Extreme": 0.3365623987956807
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7452599144109371,
+ "T2. Sequencing & Structure Reconstruction": 0.690021191602685,
+ "T3. Evidence-Grounded QA": 0.46388888888888874,
+ "T4. Summarization & Synthesis": 0.5079917038055713,
+ "T5. Attribution & Citation Alignment": 0.4011786521228004,
+ "T6. Aggregation & Clustering": 0.456013454446653,
+ "T7. Consistency & Compliance Checking": 0.26448705908382575,
+ "T8. Structured & Numeric Reasoning": 0.4464506172839507,
+ "T9. Version & Code Diff Analysis": 0.5003275109836627,
+ "T10. Rule Induction & In-Context Learning": 0.4528703703703705,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3805555555555556
+ },
+ "average_language_metric": {
+ "Chinese": 0.43752943274977507,
+ "English": 0.5052738446841633
+ },
+ "BoN-1": {
+ "overall_metric": 0.4702465037493788,
+ "token_length": {
+ "8k": 0.5913655956747179,
+ "16k": 0.5095272462031484,
+ "32k": 0.5071120818375107,
+ "64k": 0.44784910454974686,
+ "128k": 0.41043482344109344,
+ "256k": 0.35519017079005766
+ },
+ "contextual_requirement": {
+ "Full": 0.43466831346143797,
+ "Partial": 0.5155278368431229
+ },
+ "difficulty": {
+ "Easy": 0.6903049055473668,
+ "Moderate": 0.4015483173366612,
+ "Hard": 0.38434152744866484,
+ "Extreme": 0.330728695471088
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7350489226932438,
+ "T2. Sequencing & Structure Reconstruction": 0.7006655082390375,
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
+ "T4. Summarization & Synthesis": 0.5066399530046457,
+ "T5. Attribution & Citation Alignment": 0.4030762255317208,
+ "T6. Aggregation & Clustering": 0.44151556236290046,
+ "T7. Consistency & Compliance Checking": 0.2711344235218425,
+ "T8. Structured & Numeric Reasoning": 0.4435185185185186,
+ "T9. Version & Code Diff Analysis": 0.5056201530159209,
+ "T10. Rule Induction & In-Context Learning": 0.45111111111111113,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
+ },
+ "language": {
+ "Chinese": 0.43645831615890296,
+ "English": 0.5040346913398556
+ }
+ },
+ "pass@1": 0.23333333333333334,
+ "BoN-2": {
+ "overall_metric": 0.5468849991087089,
+ "token_length": {
+ "8k": 0.6615753489640523,
+ "16k": 0.6155193707982429,
+ "32k": 0.5758535655924854,
+ "64k": 0.540724309445691,
+ "128k": 0.4626871186035022,
+ "256k": 0.4249502812482792
+ },
+ "contextual_requirement": {
+ "Full": 0.5080630736758954,
+ "Partial": 0.5962947223868355
+ },
+ "difficulty": {
+ "Easy": 0.7925721699736568,
+ "Moderate": 0.479093636665099,
+ "Hard": 0.4528494469332917,
+ "Extreme": 0.38403673085414924
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8163493863260063,
+ "T2. Sequencing & Structure Reconstruction": 0.7442577955813249,
+ "T3. Evidence-Grounded QA": 0.5833333333333334,
+ "T4. Summarization & Synthesis": 0.5201693233821205,
+ "T5. Attribution & Citation Alignment": 0.5067233304435315,
+ "T6. Aggregation & Clustering": 0.5291314359023842,
+ "T7. Consistency & Compliance Checking": 0.31988194373207235,
+ "T8. Structured & Numeric Reasoning": 0.5319444444444444,
+ "T9. Version & Code Diff Analysis": 0.5907370281186358,
+ "T10. Rule Induction & In-Context Learning": 0.5363888888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667
+ },
+ "language": {
+ "Chinese": 0.5179409589397771,
+ "English": 0.5758290392776407
+ }
+ },
+ "pass@2": 0.3006666666666667,
+ "BoN-3": {
+ "overall_metric": 0.5824444918225498,
+ "token_length": {
+ "8k": 0.6960404076989396,
+ "16k": 0.6595983514236444,
+ "32k": 0.6169744918377427,
+ "64k": 0.5731325003589102,
+ "128k": 0.48953898539680313,
+ "256k": 0.4593822142192594
+ },
+ "contextual_requirement": {
+ "Full": 0.5444884918996935,
+ "Partial": 0.6307521280880043
+ },
+ "difficulty": {
+ "Easy": 0.8395318293521158,
+ "Moderate": 0.5135257846150404,
+ "Hard": 0.4826106375194937,
+ "Extreme": 0.4116567499755111
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8331279763695215,
+ "T2. Sequencing & Structure Reconstruction": 0.7661840637575932,
+ "T3. Evidence-Grounded QA": 0.6416666666666667,
+ "T4. Summarization & Synthesis": 0.5304996896358936,
+ "T5. Attribution & Citation Alignment": 0.5410298862500872,
+ "T6. Aggregation & Clustering": 0.5653969713210634,
+ "T7. Consistency & Compliance Checking": 0.35530101198058683,
+ "T8. Structured & Numeric Reasoning": 0.5680555555555555,
+ "T9. Version & Code Diff Analysis": 0.6135286679274133,
+ "T10. Rule Induction & In-Context Learning": 0.6047222222222223,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5166666666666667
+ },
+ "language": {
+ "Chinese": 0.5531328379155088,
+ "English": 0.6117561457295911
+ }
+ },
+ "pass@3": 0.338
+}
\ No newline at end of file
diff --git a/results/Qwen3-235B-A22B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json b/results/Qwen3-235B-A22B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..343e9426996716d7c5daa5071aa88d27eb89b172
--- /dev/null
+++ b/results/Qwen3-235B-A22B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5250792648341495,
+ "inference_iteration_1_overall_metric": 0.5304344950538739,
+ "inference_iteration_2_overall_metric": 0.5198766479638736,
+ "inference_iteration_3_overall_metric": 0.5249266514847013,
+ "average_token_length_metric": {
+ "8k": 0.5437004868383555,
+ "16k": 0.5685851303356437,
+ "32k": 0.5571646658594525,
+ "64k": 0.4927762749794244,
+ "128k": 0.5219020146975271,
+ "256k": 0.46634701629449443
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.48053236591813503,
+ "Partial": 0.5817753179999874
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.617560842340432,
+ "Moderate": 0.5314310614765573,
+ "Hard": 0.5233520312908267,
+ "Extreme": 0.4207324032576421
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.794390974707769,
+ "T2. Sequencing & Structure Reconstruction": 0.7891406587239914,
+ "T3. Evidence-Grounded QA": 0.5694444444444444,
+ "T4. Summarization & Synthesis": 0.548404897537323,
+ "T5. Attribution & Citation Alignment": 0.7090811316885252,
+ "T6. Aggregation & Clustering": 0.4833303018822092,
+ "T7. Consistency & Compliance Checking": 0.36054654248613494,
+ "T8. Structured & Numeric Reasoning": 0.17916666666666667,
+ "T9. Version & Code Diff Analysis": 0.639574547883412,
+ "T10. Rule Induction & In-Context Learning": 0.5316666666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4472222222222222
+ },
+ "average_language_metric": {
+ "Chinese": 0.5279929441888006,
+ "English": 0.5221655854795001
+ },
+ "BoN-1": {
+ "overall_metric": 0.5304344950538739,
+ "token_length": {
+ "8k": 0.5382631577044388,
+ "16k": 0.575757250526112,
+ "32k": 0.5564174429144133,
+ "64k": 0.49882324949171003,
+ "128k": 0.5360876545520631,
+ "256k": 0.47725821513450517
+ },
+ "contextual_requirement": {
+ "Full": 0.48609242430882327,
+ "Partial": 0.5868698578203024
+ },
+ "difficulty": {
+ "Easy": 0.621987238841124,
+ "Moderate": 0.5450178207268627,
+ "Hard": 0.5368976043475531,
+ "Extreme": 0.41631894457473306
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8032396811710955,
+ "T2. Sequencing & Structure Reconstruction": 0.7962516650016649,
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
+ "T4. Summarization & Synthesis": 0.5449456903648067,
+ "T5. Attribution & Citation Alignment": 0.7007693812727995,
+ "T6. Aggregation & Clustering": 0.4931518214295991,
+ "T7. Consistency & Compliance Checking": 0.35790360694105,
+ "T8. Structured & Numeric Reasoning": 0.18935185185185183,
+ "T9. Version & Code Diff Analysis": 0.6460027389181906,
+ "T10. Rule Induction & In-Context Learning": 0.528611111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.43333333333333335
+ },
+ "language": {
+ "Chinese": 0.5351277676036212,
+ "English": 0.5257412225041267
+ }
+ },
+ "pass@1": 0.25,
+ "BoN-2": {
+ "overall_metric": 0.559631618514268,
+ "token_length": {
+ "8k": 0.5759701055353683,
+ "16k": 0.5915870050315596,
+ "32k": 0.5811019083400786,
+ "64k": 0.5340616992071157,
+ "128k": 0.566738039323504,
+ "256k": 0.5083309536479829
+ },
+ "contextual_requirement": {
+ "Full": 0.5121620583385146,
+ "Partial": 0.6200474223743184
+ },
+ "difficulty": {
+ "Easy": 0.6503756016170681,
+ "Moderate": 0.5630560246396688,
+ "Hard": 0.5739183320047223,
+ "Extreme": 0.4485927243676055
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8282631375366962,
+ "T2. Sequencing & Structure Reconstruction": 0.8215701428201425,
+ "T3. Evidence-Grounded QA": 0.625,
+ "T4. Summarization & Synthesis": 0.5605335358825536,
+ "T5. Attribution & Citation Alignment": 0.7353432686427397,
+ "T6. Aggregation & Clustering": 0.5159697175359719,
+ "T7. Consistency & Compliance Checking": 0.39569633402766907,
+ "T8. Structured & Numeric Reasoning": 0.2037037037037037,
+ "T9. Version & Code Diff Analysis": 0.6716305136452022,
+ "T10. Rule Induction & In-Context Learning": 0.5966666666666668,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
+ },
+ "language": {
+ "Chinese": 0.5625026563498647,
+ "English": 0.5567605806786718
+ }
+ },
+ "pass@2": 0.27666666666666667,
+ "BoN-3": {
+ "overall_metric": 0.5808888120058974,
+ "token_length": {
+ "8k": 0.585393137716361,
+ "16k": 0.6092994228323365,
+ "32k": 0.6038173310381142,
+ "64k": 0.5645208259955651,
+ "128k": 0.5986706621283983,
+ "256k": 0.5236314923246123
+ },
+ "contextual_requirement": {
+ "Full": 0.5307941999072111,
+ "Partial": 0.6446455910405908
+ },
+ "difficulty": {
+ "Easy": 0.6680957906246583,
+ "Moderate": 0.5949187228015328,
+ "Hard": 0.5923336661549259,
+ "Extreme": 0.46862524208942063
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8359951247705989,
+ "T2. Sequencing & Structure Reconstruction": 0.8325952750952746,
+ "T3. Evidence-Grounded QA": 0.6416666666666667,
+ "T4. Summarization & Synthesis": 0.5676340892602741,
+ "T5. Attribution & Citation Alignment": 0.7621749447875176,
+ "T6. Aggregation & Clustering": 0.5352498549729722,
+ "T7. Consistency & Compliance Checking": 0.4260968481164947,
+ "T8. Structured & Numeric Reasoning": 0.22037037037037036,
+ "T9. Version & Code Diff Analysis": 0.6990239948591902,
+ "T10. Rule Induction & In-Context Learning": 0.6077777777777779,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
+ },
+ "language": {
+ "Chinese": 0.5829028144055859,
+ "English": 0.5788748096062107
+ }
+ },
+ "pass@3": 0.2946666666666667
+}
\ No newline at end of file
diff --git a/results/Qwen3-235B-A22B-Instruct-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-235B-A22B-Instruct-2507/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..711ccbe3317e5d2a36f003a043e4ec093eca7a2c
--- /dev/null
+++ b/results/Qwen3-235B-A22B-Instruct-2507/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.6376818411345996,
+ "inference_iteration_1_overall_metric": 0.6339596775760088,
+ "inference_iteration_2_overall_metric": 0.6394834532022874,
+ "inference_iteration_3_overall_metric": 0.6396023926255001,
+ "average_token_length_metric": {
+ "8k": 0.6908518806680625,
+ "16k": 0.710583866590834,
+ "32k": 0.6732110240389698,
+ "64k": 0.621313683819941,
+ "128k": 0.601721218415662,
+ "256k": 0.5284093732741275
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5907324430049904,
+ "Partial": 0.6974356205722853
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8298420157208769,
+ "Moderate": 0.6814554333905586,
+ "Hard": 0.5860331219068426,
+ "Extreme": 0.43235163100792684
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8583982650560713,
+ "T2. Sequencing & Structure Reconstruction": 0.848515913099246,
+ "T3. Evidence-Grounded QA": 0.6,
+ "T4. Summarization & Synthesis": 0.5177498916968352,
+ "T5. Attribution & Citation Alignment": 0.77066193723968,
+ "T6. Aggregation & Clustering": 0.6117211713831538,
+ "T7. Consistency & Compliance Checking": 0.42049824163434013,
+ "T8. Structured & Numeric Reasoning": 0.6337962962962962,
+ "T9. Version & Code Diff Analysis": 0.7293123320088637,
+ "T10. Rule Induction & In-Context Learning": 0.6056944444444443,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666664
+ },
+ "average_language_metric": {
+ "Chinese": 0.636515950238815,
+ "English": 0.6388477320303853
+ },
+ "BoN-1": {
+ "overall_metric": 0.6339596775760088,
+ "token_length": {
+ "8k": 0.6855442294551932,
+ "16k": 0.7000813231569181,
+ "32k": 0.6747494576040707,
+ "64k": 0.6226259564636208,
+ "128k": 0.5873958179866461,
+ "256k": 0.533361280789608
+ },
+ "contextual_requirement": {
+ "Full": 0.5879826501710673,
+ "Partial": 0.6924758942732089
+ },
+ "difficulty": {
+ "Easy": 0.82231148583884,
+ "Moderate": 0.6884371521646265,
+ "Hard": 0.5762430652636876,
+ "Extreme": 0.42975635762025166
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8670882121182245,
+ "T2. Sequencing & Structure Reconstruction": 0.834568857068857,
+ "T3. Evidence-Grounded QA": 0.5916666666666667,
+ "T4. Summarization & Synthesis": 0.5199050816665388,
+ "T5. Attribution & Citation Alignment": 0.7523417979282552,
+ "T6. Aggregation & Clustering": 0.6062619024899728,
+ "T7. Consistency & Compliance Checking": 0.4101310403597984,
+ "T8. Structured & Numeric Reasoning": 0.6231481481481482,
+ "T9. Version & Code Diff Analysis": 0.732669273310252,
+ "T10. Rule Induction & In-Context Learning": 0.6336111111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
+ },
+ "language": {
+ "Chinese": 0.6266859734445107,
+ "English": 0.6412333817075085
+ }
+ },
+ "pass@1": 0.38466666666666666,
+ "BoN-2": {
+ "overall_metric": 0.6974027072595513,
+ "token_length": {
+ "8k": 0.7323866259678374,
+ "16k": 0.7745752522150099,
+ "32k": 0.7261213539879122,
+ "64k": 0.6928752138123638,
+ "128k": 0.6670434516287179,
+ "256k": 0.5914143459454726
+ },
+ "contextual_requirement": {
+ "Full": 0.6549789400684759,
+ "Partial": 0.751396592775468
+ },
+ "difficulty": {
+ "Easy": 0.8841904588128215,
+ "Moderate": 0.7743090948629835,
+ "Hard": 0.6459134355083986,
+ "Extreme": 0.47609079196413456
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8835231545032977,
+ "T2. Sequencing & Structure Reconstruction": 0.8910442335442333,
+ "T3. Evidence-Grounded QA": 0.6833333333333333,
+ "T4. Summarization & Synthesis": 0.5311781500383242,
+ "T5. Attribution & Citation Alignment": 0.8245467421256093,
+ "T6. Aggregation & Clustering": 0.6739689017829934,
+ "T7. Consistency & Compliance Checking": 0.48373798320712474,
+ "T8. Structured & Numeric Reasoning": 0.7101851851851851,
+ "T9. Version & Code Diff Analysis": 0.779292344158872,
+ "T10. Rule Induction & In-Context Learning": 0.706111111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6166666666666667
+ },
+ "language": {
+ "Chinese": 0.6932540877553197,
+ "English": 0.7015513267637853
+ }
+ },
+ "pass@2": 0.46,
+ "BoN-3": {
+ "overall_metric": 0.7313269007238051,
+ "token_length": {
+ "8k": 0.7568418794712882,
+ "16k": 0.7976309127316717,
+ "32k": 0.7546566659314653,
+ "64k": 0.7371399894465867,
+ "128k": 0.7061444864165639,
+ "256k": 0.6355474703452554
+ },
+ "contextual_requirement": {
+ "Full": 0.6842404276773796,
+ "Partial": 0.7912551391465288
+ },
+ "difficulty": {
+ "Easy": 0.9130888695564184,
+ "Moderate": 0.8144404231670943,
+ "Hard": 0.6852010111307348,
+ "Extreme": 0.5079207760169129
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9040064038436306,
+ "T2. Sequencing & Structure Reconstruction": 0.9156552244052241,
+ "T3. Evidence-Grounded QA": 0.7166666666666667,
+ "T4. Summarization & Synthesis": 0.5373815939695279,
+ "T5. Attribution & Citation Alignment": 0.8401844251202388,
+ "T6. Aggregation & Clustering": 0.7030048302464991,
+ "T7. Consistency & Compliance Checking": 0.5353694302280319,
+ "T8. Structured & Numeric Reasoning": 0.7574074074074073,
+ "T9. Version & Code Diff Analysis": 0.7970749987749245,
+ "T10. Rule Induction & In-Context Learning": 0.7536111111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6833333333333333
+ },
+ "language": {
+ "Chinese": 0.7336257997533113,
+ "English": 0.7290280016942995
+ }
+ },
+ "pass@3": 0.5053333333333333
+}
\ No newline at end of file
diff --git a/results/Qwen3-235B-A22B-Thinking-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-235B-A22B-Thinking-2507/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..db2d87509160fd257d0b0ae98bc1382489e5727e
--- /dev/null
+++ b/results/Qwen3-235B-A22B-Thinking-2507/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 13,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.6697401744849466,
+ "inference_iteration_1_overall_metric": 0.6645867118007756,
+ "inference_iteration_2_overall_metric": 0.6751104806595288,
+ "inference_iteration_3_overall_metric": 0.6695233309945317,
+ "average_token_length_metric": {
+ "8k": 0.7205839084021065,
+ "16k": 0.7043002698220667,
+ "32k": 0.6969133410539584,
+ "64k": 0.6685079197146846,
+ "128k": 0.6404675702828265,
+ "256k": 0.5876680376340346
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.634705106653694,
+ "Partial": 0.7143302608156324
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8354877129168713,
+ "Moderate": 0.7511550276701573,
+ "Hard": 0.6709614325576964,
+ "Extreme": 0.43389437976281486
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8772550883476504,
+ "T2. Sequencing & Structure Reconstruction": 0.8623538806872134,
+ "T3. Evidence-Grounded QA": 0.6250000000000003,
+ "T4. Summarization & Synthesis": 0.5401699896092569,
+ "T5. Attribution & Citation Alignment": 0.7728207746125852,
+ "T6. Aggregation & Clustering": 0.6408623505152069,
+ "T7. Consistency & Compliance Checking": 0.46406727840672235,
+ "T8. Structured & Numeric Reasoning": 0.7117283950617284,
+ "T9. Version & Code Diff Analysis": 0.789350597014821,
+ "T10. Rule Induction & In-Context Learning": 0.6548148148148148,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
+ },
+ "average_language_metric": {
+ "Chinese": 0.6712139930169294,
+ "English": 0.668266355952964
+ },
+ "BoN-1": {
+ "overall_metric": 0.6645867118007756,
+ "token_length": {
+ "8k": 0.7097656837729444,
+ "16k": 0.7010367810682975,
+ "32k": 0.6918335677824285,
+ "64k": 0.6771373662881649,
+ "128k": 0.6229921235210027,
+ "256k": 0.584754748371821
+ },
+ "contextual_requirement": {
+ "Full": 0.6291093737088903,
+ "Partial": 0.7097396875540872
+ },
+ "difficulty": {
+ "Easy": 0.8231290446647995,
+ "Moderate": 0.7591822484146827,
+ "Hard": 0.6449304811041314,
+ "Extreme": 0.4416892939586231
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8690587447518973,
+ "T2. Sequencing & Structure Reconstruction": 0.8768765956265957,
+ "T3. Evidence-Grounded QA": 0.6416666666666667,
+ "T4. Summarization & Synthesis": 0.5380854111531137,
+ "T5. Attribution & Citation Alignment": 0.7488408410161091,
+ "T6. Aggregation & Clustering": 0.6513683909314774,
+ "T7. Consistency & Compliance Checking": 0.46848122327638125,
+ "T8. Structured & Numeric Reasoning": 0.6837962962962962,
+ "T9. Version & Code Diff Analysis": 0.7940034392057601,
+ "T10. Rule Induction & In-Context Learning": 0.6166666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5166666666666667
+ },
+ "language": {
+ "Chinese": 0.6644345913196646,
+ "English": 0.6647388322818897
+ }
+ },
+ "pass@1": 0.4266666666666667,
+ "BoN-2": {
+ "overall_metric": 0.7478859807771899,
+ "token_length": {
+ "8k": 0.7974609291497671,
+ "16k": 0.7833069738720557,
+ "32k": 0.7539511540579624,
+ "64k": 0.7523052610595647,
+ "128k": 0.7233847771912111,
+ "256k": 0.6769067893325807
+ },
+ "contextual_requirement": {
+ "Full": 0.709048497560702,
+ "Partial": 0.797315504870903
+ },
+ "difficulty": {
+ "Easy": 0.9183290544233588,
+ "Moderate": 0.8402480622734665,
+ "Hard": 0.7656967810020093,
+ "Extreme": 0.4888097891525321
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8996374413195756,
+ "T2. Sequencing & Structure Reconstruction": 0.9140728715728713,
+ "T3. Evidence-Grounded QA": 0.7416666666666667,
+ "T4. Summarization & Synthesis": 0.5510532445357571,
+ "T5. Attribution & Citation Alignment": 0.8553010122231844,
+ "T6. Aggregation & Clustering": 0.72259791476162,
+ "T7. Consistency & Compliance Checking": 0.550051783712619,
+ "T8. Structured & Numeric Reasoning": 0.7916666666666666,
+ "T9. Version & Code Diff Analysis": 0.8487023090187992,
+ "T10. Rule Induction & In-Context Learning": 0.7916666666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.65
+ },
+ "language": {
+ "Chinese": 0.7479449056610918,
+ "English": 0.7478270558932893
+ }
+ },
+ "pass@2": 0.524,
+ "BoN-3": {
+ "overall_metric": 0.7822004920631344,
+ "token_length": {
+ "8k": 0.8225828352477704,
+ "16k": 0.8267205325130447,
+ "32k": 0.8024853199503054,
+ "64k": 0.7952104500545913,
+ "128k": 0.7471444685103535,
+ "256k": 0.6990593461027415
+ },
+ "contextual_requirement": {
+ "Full": 0.7488719518681644,
+ "Partial": 0.824618634129461
+ },
+ "difficulty": {
+ "Easy": 0.9465988423811745,
+ "Moderate": 0.8961057909699204,
+ "Hard": 0.8084893169461448,
+ "Extreme": 0.5100275224528017
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9298480703943819,
+ "T2. Sequencing & Structure Reconstruction": 0.9221497946497943,
+ "T3. Evidence-Grounded QA": 0.8166666666666667,
+ "T4. Summarization & Synthesis": 0.5589114384792794,
+ "T5. Attribution & Citation Alignment": 0.8835244862799918,
+ "T6. Aggregation & Clustering": 0.7476293375871608,
+ "T7. Consistency & Compliance Checking": 0.5892510818578476,
+ "T8. Structured & Numeric Reasoning": 0.8444444444444444,
+ "T9. Version & Code Diff Analysis": 0.8566406207071109,
+ "T10. Rule Induction & In-Context Learning": 0.8377777777777777,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7
+ },
+ "language": {
+ "Chinese": 0.7822230658547494,
+ "English": 0.78217791827152
+ }
+ },
+ "pass@3": 0.572
+}
\ No newline at end of file
diff --git a/results/Qwen3-30B-A3B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json b/results/Qwen3-30B-A3B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..f50c8e1229dae86ceba53ca63ef69e550bf2af1a
--- /dev/null
+++ b/results/Qwen3-30B-A3B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.43835174236080204,
+ "inference_iteration_1_overall_metric": 0.4347938238206688,
+ "inference_iteration_2_overall_metric": 0.43739224653302283,
+ "inference_iteration_3_overall_metric": 0.4428691567287152,
+ "average_token_length_metric": {
+ "8k": 0.4640531253197961,
+ "16k": 0.4647383786815656,
+ "32k": 0.4769242453969347,
+ "64k": 0.4179394455075989,
+ "128k": 0.41351252229299035,
+ "256k": 0.39294273696592813
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.41095809725832383,
+ "Partial": 0.4732163815821386
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5588711611175687,
+ "Moderate": 0.39037585922610546,
+ "Hard": 0.41317943830896897,
+ "Extreme": 0.3543732565793154
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7127770907550041,
+ "T2. Sequencing & Structure Reconstruction": 0.6851617075892437,
+ "T3. Evidence-Grounded QA": 0.5611111111111112,
+ "T4. Summarization & Synthesis": 0.5287940406323086,
+ "T5. Attribution & Citation Alignment": 0.5540826057234692,
+ "T6. Aggregation & Clustering": 0.3922538101756423,
+ "T7. Consistency & Compliance Checking": 0.25219880289608915,
+ "T8. Structured & Numeric Reasoning": 0.15478395061728395,
+ "T9. Version & Code Diff Analysis": 0.4794950077949977,
+ "T10. Rule Induction & In-Context Learning": 0.4563425925925925,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3027777777777778
+ },
+ "average_language_metric": {
+ "Chinese": 0.4449592008603477,
+ "English": 0.43174428386125685
+ },
+ "BoN-1": {
+ "overall_metric": 0.4347938238206688,
+ "token_length": {
+ "8k": 0.45780843124261406,
+ "16k": 0.4544510486962234,
+ "32k": 0.48190834806509436,
+ "64k": 0.41649388756464034,
+ "128k": 0.4188417047217461,
+ "256k": 0.3792595226336961
+ },
+ "contextual_requirement": {
+ "Full": 0.4119506732449559,
+ "Partial": 0.4638669245533959
+ },
+ "difficulty": {
+ "Easy": 0.5484313078217694,
+ "Moderate": 0.3819799640213123,
+ "Hard": 0.40444373140916484,
+ "Extreme": 0.3649327213215975
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7075508946315197,
+ "T2. Sequencing & Structure Reconstruction": 0.6841748991748985,
+ "T3. Evidence-Grounded QA": 0.5583333333333333,
+ "T4. Summarization & Synthesis": 0.5338944636170653,
+ "T5. Attribution & Citation Alignment": 0.5472533802884338,
+ "T6. Aggregation & Clustering": 0.4012455230126335,
+ "T7. Consistency & Compliance Checking": 0.2457091900873631,
+ "T8. Structured & Numeric Reasoning": 0.15416666666666667,
+ "T9. Version & Code Diff Analysis": 0.46883931261867207,
+ "T10. Rule Induction & In-Context Learning": 0.4331944444444443,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3
+ },
+ "language": {
+ "Chinese": 0.44364717413939403,
+ "English": 0.4259404735019449
+ }
+ },
+ "pass@1": 0.17733333333333334,
+ "BoN-2": {
+ "overall_metric": 0.46640045487719345,
+ "token_length": {
+ "8k": 0.48850623477567573,
+ "16k": 0.48941657724686394,
+ "32k": 0.5072169193313956,
+ "64k": 0.4523087841468838,
+ "128k": 0.44368419626075345,
+ "256k": 0.41727001750158577
+ },
+ "contextual_requirement": {
+ "Full": 0.44061393965405443,
+ "Partial": 0.4992196560702788
+ },
+ "difficulty": {
+ "Easy": 0.5800147754645664,
+ "Moderate": 0.42352739928412814,
+ "Hard": 0.44183822997113786,
+ "Extreme": 0.3862338492888558
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.74209693890316,
+ "T2. Sequencing & Structure Reconstruction": 0.7157272357272357,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5450289133980314,
+ "T5. Attribution & Citation Alignment": 0.5961599581640015,
+ "T6. Aggregation & Clustering": 0.4338491861878003,
+ "T7. Consistency & Compliance Checking": 0.27109254578114006,
+ "T8. Structured & Numeric Reasoning": 0.18379629629629632,
+ "T9. Version & Code Diff Analysis": 0.5158022640412946,
+ "T10. Rule Induction & In-Context Learning": 0.49041666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.325
+ },
+ "language": {
+ "Chinese": 0.4708259876166834,
+ "English": 0.4619749221377029
+ }
+ },
+ "pass@2": 0.198,
+ "BoN-3": {
+ "overall_metric": 0.48455280677595725,
+ "token_length": {
+ "8k": 0.5068277344354145,
+ "16k": 0.5063031851071311,
+ "32k": 0.5248706991014918,
+ "64k": 0.462267170875692,
+ "128k": 0.4631788587622459,
+ "256k": 0.44386919237376954
+ },
+ "contextual_requirement": {
+ "Full": 0.45623565272994365,
+ "Partial": 0.5205928210163393
+ },
+ "difficulty": {
+ "Easy": 0.5983262263426448,
+ "Moderate": 0.44016373789846164,
+ "Hard": 0.4662266857524413,
+ "Extreme": 0.40111167416288757
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7549544231445334,
+ "T2. Sequencing & Structure Reconstruction": 0.737453426203426,
+ "T3. Evidence-Grounded QA": 0.5833333333333334,
+ "T4. Summarization & Synthesis": 0.5505766147965558,
+ "T5. Attribution & Citation Alignment": 0.6027932502768333,
+ "T6. Aggregation & Clustering": 0.45641196612410123,
+ "T7. Consistency & Compliance Checking": 0.29641454225252506,
+ "T8. Structured & Numeric Reasoning": 0.1949074074074074,
+ "T9. Version & Code Diff Analysis": 0.546337052157623,
+ "T10. Rule Induction & In-Context Learning": 0.49319444444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664
+ },
+ "language": {
+ "Chinese": 0.48823712145503023,
+ "English": 0.48086849209688537
+ }
+ },
+ "pass@3": 0.20866666666666667
+}
\ No newline at end of file
diff --git a/results/Qwen3-30B-A3B-Instruct-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-30B-A3B-Instruct-2507/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..95bae708c215018b2cac88a4d6ee4e929a0866d7
--- /dev/null
+++ b/results/Qwen3-30B-A3B-Instruct-2507/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5452162786351843,
+ "inference_iteration_1_overall_metric": 0.5475365928256798,
+ "inference_iteration_2_overall_metric": 0.5453410517872768,
+ "inference_iteration_3_overall_metric": 0.5427711912925933,
+ "average_token_length_metric": {
+ "8k": 0.6035050249721831,
+ "16k": 0.5987273703229294,
+ "32k": 0.5599890143683894,
+ "64k": 0.5400303382355833,
+ "128k": 0.49784316183098404,
+ "256k": 0.4712027620810347
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5045780200164052,
+ "Partial": 0.5969376986954488
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7559096468444604,
+ "Moderate": 0.564732272741711,
+ "Hard": 0.4403591043429616,
+ "Extreme": 0.37046490953699124
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7874723394436245,
+ "T2. Sequencing & Structure Reconstruction": 0.747240085843135,
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
+ "T4. Summarization & Synthesis": 0.5127484150702732,
+ "T5. Attribution & Citation Alignment": 0.6032294596505793,
+ "T6. Aggregation & Clustering": 0.5166889927087092,
+ "T7. Consistency & Compliance Checking": 0.31859016422394293,
+ "T8. Structured & Numeric Reasoning": 0.5577160493827158,
+ "T9. Version & Code Diff Analysis": 0.5756222253109884,
+ "T10. Rule Induction & In-Context Learning": 0.5438425925925927,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4138888888888889
+ },
+ "average_language_metric": {
+ "Chinese": 0.534932377457035,
+ "English": 0.5555001798133334
+ },
+ "BoN-1": {
+ "overall_metric": 0.5475365928256798,
+ "token_length": {
+ "8k": 0.6016020197847444,
+ "16k": 0.5955245523768168,
+ "32k": 0.5628331081172674,
+ "64k": 0.5480517546124761,
+ "128k": 0.5064957488766313,
+ "256k": 0.4707123731861472
+ },
+ "contextual_requirement": {
+ "Full": 0.5089038204182644,
+ "Partial": 0.5967055758896659
+ },
+ "difficulty": {
+ "Easy": 0.7601650922395514,
+ "Moderate": 0.5686448581258617,
+ "Hard": 0.44460622355514695,
+ "Extreme": 0.36835398221192234
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7928563896826898,
+ "T2. Sequencing & Structure Reconstruction": 0.7446857472592766,
+ "T3. Evidence-Grounded QA": 0.5583333333333333,
+ "T4. Summarization & Synthesis": 0.5140071321863116,
+ "T5. Attribution & Citation Alignment": 0.5999370151569586,
+ "T6. Aggregation & Clustering": 0.5192504692366862,
+ "T7. Consistency & Compliance Checking": 0.33101895293351313,
+ "T8. Structured & Numeric Reasoning": 0.5685185185185185,
+ "T9. Version & Code Diff Analysis": 0.5777336594471387,
+ "T10. Rule Induction & In-Context Learning": 0.5034722222222221,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
+ },
+ "language": {
+ "Chinese": 0.5279438377377988,
+ "English": 0.5671293479135634
+ }
+ },
+ "pass@1": 0.29733333333333334,
+ "BoN-2": {
+ "overall_metric": 0.6226628481842147,
+ "token_length": {
+ "8k": 0.6834746444616835,
+ "16k": 0.6777780429260933,
+ "32k": 0.6235393400581155,
+ "64k": 0.6117468620961922,
+ "128k": 0.5816405990800028,
+ "256k": 0.5577976004832055
+ },
+ "contextual_requirement": {
+ "Full": 0.5762829279061583,
+ "Partial": 0.6816918376290168
+ },
+ "difficulty": {
+ "Easy": 0.8411762869831911,
+ "Moderate": 0.6700737072149301,
+ "Hard": 0.5094718947245584,
+ "Extreme": 0.4264968704316206
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8378092426392205,
+ "T2. Sequencing & Structure Reconstruction": 0.8185840085840084,
+ "T3. Evidence-Grounded QA": 0.6083333333333333,
+ "T4. Summarization & Synthesis": 0.5304585800985443,
+ "T5. Attribution & Citation Alignment": 0.6911490951783217,
+ "T6. Aggregation & Clustering": 0.5992410622374563,
+ "T7. Consistency & Compliance Checking": 0.39138387898011423,
+ "T8. Structured & Numeric Reasoning": 0.6398148148148147,
+ "T9. Version & Code Diff Analysis": 0.6585139306429096,
+ "T10. Rule Induction & In-Context Learning": 0.6511111111111112,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
+ },
+ "language": {
+ "Chinese": 0.6113970502008556,
+ "English": 0.6339286461675765
+ }
+ },
+ "pass@2": 0.36666666666666664,
+ "BoN-3": {
+ "overall_metric": 0.6598719096201268,
+ "token_length": {
+ "8k": 0.7112228338386369,
+ "16k": 0.704821999670257,
+ "32k": 0.6679623687905272,
+ "64k": 0.6494739136662858,
+ "128k": 0.6304280872767323,
+ "256k": 0.5953222544783268
+ },
+ "contextual_requirement": {
+ "Full": 0.6175176296001476,
+ "Partial": 0.7137773569182847
+ },
+ "difficulty": {
+ "Easy": 0.8737608485181246,
+ "Moderate": 0.7316366823753812,
+ "Hard": 0.5427232232690453,
+ "Extreme": 0.45537527889367313
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.858239586310667,
+ "T2. Sequencing & Structure Reconstruction": 0.8379063991563991,
+ "T3. Evidence-Grounded QA": 0.675,
+ "T4. Summarization & Synthesis": 0.5353639111706652,
+ "T5. Attribution & Citation Alignment": 0.7194455371416155,
+ "T6. Aggregation & Clustering": 0.6373541014616064,
+ "T7. Consistency & Compliance Checking": 0.43430884132356135,
+ "T8. Structured & Numeric Reasoning": 0.6731481481481482,
+ "T9. Version & Code Diff Analysis": 0.699949022294497,
+ "T10. Rule Induction & In-Context Learning": 0.7136111111111112,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5916666666666667
+ },
+ "language": {
+ "Chinese": 0.6502665902311631,
+ "English": 0.6694772290090928
+ }
+ },
+ "pass@3": 0.4053333333333333
+}
\ No newline at end of file
diff --git a/results/Qwen3-30B-A3B-Thinking-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-30B-A3B-Thinking-2507/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..c427f8324376d7e81ba236d91ad4694e7e46f54e
--- /dev/null
+++ b/results/Qwen3-30B-A3B-Thinking-2507/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5967923243935424,
+ "inference_iteration_1_overall_metric": 0.5971485082647852,
+ "inference_iteration_2_overall_metric": 0.5964644421604209,
+ "inference_iteration_3_overall_metric": 0.5967640227554183,
+ "average_token_length_metric": {
+ "8k": 0.6603587329609213,
+ "16k": 0.6544573971618018,
+ "32k": 0.6327664105346437,
+ "64k": 0.5866153064796855,
+ "128k": 0.5279135083169344,
+ "256k": 0.5186425909072657
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5570876315601245,
+ "Partial": 0.6473255698178918
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7963733087366597,
+ "Moderate": 0.6254851987507496,
+ "Hard": 0.5276205174176363,
+ "Extreme": 0.40474772683313665
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8293501125585261,
+ "T2. Sequencing & Structure Reconstruction": 0.8116547209831653,
+ "T3. Evidence-Grounded QA": 0.5750000000000003,
+ "T4. Summarization & Synthesis": 0.5368729714469394,
+ "T5. Attribution & Citation Alignment": 0.6564732160656777,
+ "T6. Aggregation & Clustering": 0.5777371247422506,
+ "T7. Consistency & Compliance Checking": 0.3565708837074654,
+ "T8. Structured & Numeric Reasoning": 0.6209876543209873,
+ "T9. Version & Code Diff Analysis": 0.6645539841533562,
+ "T10. Rule Induction & In-Context Learning": 0.5863888888888888,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.46666666666666673
+ },
+ "average_language_metric": {
+ "Chinese": 0.5922310064298796,
+ "English": 0.6013536423572048
+ },
+ "BoN-1": {
+ "overall_metric": 0.5971485082647852,
+ "token_length": {
+ "8k": 0.6631122039610606,
+ "16k": 0.6695831110367371,
+ "32k": 0.6289403768757488,
+ "64k": 0.5821521427163692,
+ "128k": 0.522724275200541,
+ "256k": 0.5163789397982567
+ },
+ "contextual_requirement": {
+ "Full": 0.5537794331097636,
+ "Partial": 0.6523455130075411
+ },
+ "difficulty": {
+ "Easy": 0.7810131026617345,
+ "Moderate": 0.6463897022556653,
+ "Hard": 0.5345120229683763,
+ "Extreme": 0.4045314734647145
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8311430041412194,
+ "T2. Sequencing & Structure Reconstruction": 0.8249045861545858,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5383277195885704,
+ "T5. Attribution & Citation Alignment": 0.6451460821905192,
+ "T6. Aggregation & Clustering": 0.5737635479840616,
+ "T7. Consistency & Compliance Checking": 0.37517972023255514,
+ "T8. Structured & Numeric Reasoning": 0.6055555555555555,
+ "T9. Version & Code Diff Analysis": 0.6730867255766664,
+ "T10. Rule Induction & In-Context Learning": 0.5700000000000001,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
+ },
+ "language": {
+ "Chinese": 0.5907857688067475,
+ "English": 0.603511247722824
+ }
+ },
+ "pass@1": 0.3546666666666667,
+ "BoN-2": {
+ "overall_metric": 0.6705564073004658,
+ "token_length": {
+ "8k": 0.7144249310094072,
+ "16k": 0.7233569898208718,
+ "32k": 0.7203526229389474,
+ "64k": 0.666470085141174,
+ "128k": 0.5975716987083816,
+ "256k": 0.6011621161840196
+ },
+ "contextual_requirement": {
+ "Full": 0.6242851454438455,
+ "Partial": 0.7294471042088951
+ },
+ "difficulty": {
+ "Easy": 0.8669888777194616,
+ "Moderate": 0.7348384499045617,
+ "Hard": 0.6052202090307908,
+ "Extreme": 0.45606595331273436
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8629693587046573,
+ "T2. Sequencing & Structure Reconstruction": 0.8585263810263809,
+ "T3. Evidence-Grounded QA": 0.6833333333333333,
+ "T4. Summarization & Synthesis": 0.5520973617668322,
+ "T5. Attribution & Citation Alignment": 0.7329873034344809,
+ "T6. Aggregation & Clustering": 0.647246392300239,
+ "T7. Consistency & Compliance Checking": 0.4479448894336623,
+ "T8. Structured & Numeric Reasoning": 0.7083333333333334,
+ "T9. Version & Code Diff Analysis": 0.7361988748337432,
+ "T10. Rule Induction & In-Context Learning": 0.683888888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667
+ },
+ "language": {
+ "Chinese": 0.6628481693891412,
+ "English": 0.6782646452117937
+ }
+ },
+ "pass@2": 0.43266666666666664,
+ "BoN-3": {
+ "overall_metric": 0.7073514436172486,
+ "token_length": {
+ "8k": 0.7517926459779344,
+ "16k": 0.7722619232731515,
+ "32k": 0.7511237446632382,
+ "64k": 0.6976847688527813,
+ "128k": 0.629184440307289,
+ "256k": 0.6420611386291009
+ },
+ "contextual_requirement": {
+ "Full": 0.6691986411009105,
+ "Partial": 0.7559095559107719
+ },
+ "difficulty": {
+ "Easy": 0.915791957089584,
+ "Moderate": 0.766786734359907,
+ "Hard": 0.6353007083282902,
+ "Extreme": 0.48730038902683037
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8832882758900559,
+ "T2. Sequencing & Structure Reconstruction": 0.8749057424057421,
+ "T3. Evidence-Grounded QA": 0.75,
+ "T4. Summarization & Synthesis": 0.55766162876669,
+ "T5. Attribution & Citation Alignment": 0.7834137982359759,
+ "T6. Aggregation & Clustering": 0.6805855792952568,
+ "T7. Consistency & Compliance Checking": 0.4766259604534431,
+ "T8. Structured & Numeric Reasoning": 0.7546296296296295,
+ "T9. Version & Code Diff Analysis": 0.7651396236274333,
+ "T10. Rule Induction & In-Context Learning": 0.7513888888888889,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6083333333333333
+ },
+ "language": {
+ "Chinese": 0.703943401540441,
+ "English": 0.7107594856940571
+ }
+ },
+ "pass@3": 0.478
+}
\ No newline at end of file
diff --git a/results/Qwen3-32B/nonthinking_context-120000_bon-3_summary.json b/results/Qwen3-32B/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..87137b4c067878070d19a926904b2e0d3640d9b1
--- /dev/null
+++ b/results/Qwen3-32B/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.40276958737882135,
+ "inference_iteration_1_overall_metric": 0.4016952023158214,
+ "inference_iteration_2_overall_metric": 0.4048631802018712,
+ "inference_iteration_3_overall_metric": 0.40175037961877114,
+ "average_token_length_metric": {
+ "8k": 0.4617524313525191,
+ "16k": 0.45515708355028794,
+ "32k": 0.4245573275344764,
+ "64k": 0.38812942658699545,
+ "128k": 0.37207065716673743,
+ "256k": 0.3149505980819123
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.37062296245474236,
+ "Partial": 0.4436834736458306
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5190221632028355,
+ "Moderate": 0.3430474305314189,
+ "Hard": 0.3861354100882252,
+ "Extreme": 0.32557240333042736
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6783903124505437,
+ "T2. Sequencing & Structure Reconstruction": 0.691038080579805,
+ "T3. Evidence-Grounded QA": 0.5361111111111111,
+ "T4. Summarization & Synthesis": 0.5235103467599385,
+ "T5. Attribution & Citation Alignment": 0.36734324660318,
+ "T6. Aggregation & Clustering": 0.3859827152631366,
+ "T7. Consistency & Compliance Checking": 0.22774134858655815,
+ "T8. Structured & Numeric Reasoning": 0.09135802469135802,
+ "T9. Version & Code Diff Analysis": 0.4480573156228135,
+ "T10. Rule Induction & In-Context Learning": 0.39643518518518506,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3361111111111111
+ },
+ "average_language_metric": {
+ "Chinese": 0.40772752509755494,
+ "English": 0.3978116496600876
+ },
+ "BoN-1": {
+ "overall_metric": 0.4016952023158214,
+ "token_length": {
+ "8k": 0.46663668330951064,
+ "16k": 0.4538107088644924,
+ "32k": 0.4171039185976235,
+ "64k": 0.38935415644800225,
+ "128k": 0.3718552714760765,
+ "256k": 0.31141047519922166
+ },
+ "contextual_requirement": {
+ "Full": 0.3669026836885647,
+ "Partial": 0.445976589659603
+ },
+ "difficulty": {
+ "Easy": 0.5194117663381852,
+ "Moderate": 0.3405231428124576,
+ "Hard": 0.3855439159391187,
+ "Extreme": 0.32352943663527506
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6715192705862865,
+ "T2. Sequencing & Structure Reconstruction": 0.6997405107762323,
+ "T3. Evidence-Grounded QA": 0.55,
+ "T4. Summarization & Synthesis": 0.5220646003288852,
+ "T5. Attribution & Citation Alignment": 0.3550270822278831,
+ "T6. Aggregation & Clustering": 0.37946082280852406,
+ "T7. Consistency & Compliance Checking": 0.23788006601247816,
+ "T8. Structured & Numeric Reasoning": 0.09490740740740743,
+ "T9. Version & Code Diff Analysis": 0.4582161206858629,
+ "T10. Rule Induction & In-Context Learning": 0.4045833333333333,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667
+ },
+ "language": {
+ "Chinese": 0.4096699821210686,
+ "English": 0.3937204225105745
+ }
+ },
+ "pass@1": 0.15666666666666668,
+ "BoN-2": {
+ "overall_metric": 0.4458899241198126,
+ "token_length": {
+ "8k": 0.5107956286409391,
+ "16k": 0.4940196458333606,
+ "32k": 0.46285124603401756,
+ "64k": 0.4356914401730522,
+ "128k": 0.4150838110618531,
+ "256k": 0.3568977729756547
+ },
+ "contextual_requirement": {
+ "Full": 0.41105292891627904,
+ "Partial": 0.4902279180152205
+ },
+ "difficulty": {
+ "Easy": 0.5623795180533137,
+ "Moderate": 0.38380867566198085,
+ "Hard": 0.4305011006295078,
+ "Extreme": 0.36916462074950435
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7155725341000191,
+ "T2. Sequencing & Structure Reconstruction": 0.7385771329569484,
+ "T3. Evidence-Grounded QA": 0.5583333333333333,
+ "T4. Summarization & Synthesis": 0.5363931015769866,
+ "T5. Attribution & Citation Alignment": 0.40577037385073444,
+ "T6. Aggregation & Clustering": 0.4381787255377262,
+ "T7. Consistency & Compliance Checking": 0.2654075018995459,
+ "T8. Structured & Numeric Reasoning": 0.12083333333333335,
+ "T9. Version & Code Diff Analysis": 0.5102649011903974,
+ "T10. Rule Induction & In-Context Learning": 0.45541666666666664,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
+ },
+ "language": {
+ "Chinese": 0.4499576438501969,
+ "English": 0.4418222043894293
+ }
+ },
+ "pass@2": 0.184,
+ "BoN-3": {
+ "overall_metric": 0.46707342636344645,
+ "token_length": {
+ "8k": 0.5223679274291421,
+ "16k": 0.5196118628123836,
+ "32k": 0.48374718677985445,
+ "64k": 0.45430831830027746,
+ "128k": 0.44035677780297794,
+ "256k": 0.3820484850560422
+ },
+ "contextual_requirement": {
+ "Full": 0.4329127928337618,
+ "Partial": 0.5105505963103175
+ },
+ "difficulty": {
+ "Easy": 0.5794161679541103,
+ "Moderate": 0.41412466934190495,
+ "Hard": 0.45291372678066194,
+ "Extreme": 0.3880851138906052
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7343093370344386,
+ "T2. Sequencing & Structure Reconstruction": 0.7559608376898355,
+ "T3. Evidence-Grounded QA": 0.5833333333333334,
+ "T4. Summarization & Synthesis": 0.5455373041006337,
+ "T5. Attribution & Citation Alignment": 0.44072256206800897,
+ "T6. Aggregation & Clustering": 0.47661016797778905,
+ "T7. Consistency & Compliance Checking": 0.2859756712512137,
+ "T8. Structured & Numeric Reasoning": 0.13425925925925924,
+ "T9. Version & Code Diff Analysis": 0.5528701409177667,
+ "T10. Rule Induction & In-Context Learning": 0.45541666666666664,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
+ },
+ "language": {
+ "Chinese": 0.4707375089112187,
+ "English": 0.4634093438156739
+ }
+ },
+ "pass@3": 0.19666666666666666
+}
\ No newline at end of file
diff --git a/results/Qwen3-32B/thinking_context-120000_bon-3_summary.json b/results/Qwen3-32B/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..c8ae14db2066963f00f74888025051ae990a3cd9
--- /dev/null
+++ b/results/Qwen3-32B/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.511230077368557,
+ "inference_iteration_1_overall_metric": 0.5125551266359154,
+ "inference_iteration_2_overall_metric": 0.5093558836789825,
+ "inference_iteration_3_overall_metric": 0.5117792217907733,
+ "average_token_length_metric": {
+ "8k": 0.6093750682953369,
+ "16k": 0.5606336434386148,
+ "32k": 0.5407757802194578,
+ "64k": 0.49507317974870746,
+ "128k": 0.4498486791562606,
+ "256k": 0.4116741133529667
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.47415262721658286,
+ "Partial": 0.5584195593801601
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7280370376711341,
+ "Moderate": 0.46181032202581557,
+ "Hard": 0.42241569201251666,
+ "Extreme": 0.36452260417788923
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.770604191490206,
+ "T2. Sequencing & Structure Reconstruction": 0.7391521133187797,
+ "T3. Evidence-Grounded QA": 0.47222222222222227,
+ "T4. Summarization & Synthesis": 0.5173657920719582,
+ "T5. Attribution & Citation Alignment": 0.4466370952378391,
+ "T6. Aggregation & Clustering": 0.4909460945890284,
+ "T7. Consistency & Compliance Checking": 0.2977388131082807,
+ "T8. Structured & Numeric Reasoning": 0.5168209876543208,
+ "T9. Version & Code Diff Analysis": 0.5534968208496264,
+ "T10. Rule Induction & In-Context Learning": 0.49097222222222225,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4416666666666667
+ },
+ "average_language_metric": {
+ "Chinese": 0.5000696493273706,
+ "English": 0.5223905054097431
+ },
+ "BoN-1": {
+ "overall_metric": 0.5125551266359154,
+ "token_length": {
+ "8k": 0.6199505611114019,
+ "16k": 0.5449906257247921,
+ "32k": 0.5573838747951294,
+ "64k": 0.48881589783475765,
+ "128k": 0.4414701560105687,
+ "256k": 0.4227196443388436
+ },
+ "contextual_requirement": {
+ "Full": 0.4840737513137435,
+ "Partial": 0.5488041497732257
+ },
+ "difficulty": {
+ "Easy": 0.7237636460264806,
+ "Moderate": 0.45634319610108454,
+ "Hard": 0.435386696673209,
+ "Extreme": 0.368792439903043
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7683888534222428,
+ "T2. Sequencing & Structure Reconstruction": 0.7350878750878753,
+ "T3. Evidence-Grounded QA": 0.475,
+ "T4. Summarization & Synthesis": 0.5164166688813505,
+ "T5. Attribution & Citation Alignment": 0.4327235065619549,
+ "T6. Aggregation & Clustering": 0.48097372489084406,
+ "T7. Consistency & Compliance Checking": 0.28774132062670765,
+ "T8. Structured & Numeric Reasoning": 0.5685185185185185,
+ "T9. Version & Code Diff Analysis": 0.552360721830303,
+ "T10. Rule Induction & In-Context Learning": 0.4877777777777777,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.43333333333333335
+ },
+ "language": {
+ "Chinese": 0.5036040074009573,
+ "English": 0.5215062458708739
+ }
+ },
+ "pass@1": 0.26,
+ "BoN-2": {
+ "overall_metric": 0.5889072134846955,
+ "token_length": {
+ "8k": 0.6825707673405994,
+ "16k": 0.6560634814652357,
+ "32k": 0.6151197445549184,
+ "64k": 0.5702241454706968,
+ "128k": 0.5232780250539345,
+ "256k": 0.4861871170227882
+ },
+ "contextual_requirement": {
+ "Full": 0.5510603442388599,
+ "Partial": 0.6370759561612137
+ },
+ "difficulty": {
+ "Easy": 0.8070382684727199,
+ "Moderate": 0.5619198253755202,
+ "Hard": 0.5090922692146654,
+ "Extreme": 0.42010427156051966
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8345700718573785,
+ "T2. Sequencing & Structure Reconstruction": 0.7945044307544308,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5346021319695609,
+ "T5. Attribution & Citation Alignment": 0.5351280430956195,
+ "T6. Aggregation & Clustering": 0.5706144095580495,
+ "T7. Consistency & Compliance Checking": 0.38333556049784645,
+ "T8. Structured & Numeric Reasoning": 0.6296296296296297,
+ "T9. Version & Code Diff Analysis": 0.6241105357978589,
+ "T10. Rule Induction & In-Context Learning": 0.5713888888888888,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
+ },
+ "language": {
+ "Chinese": 0.5813115070957934,
+ "English": 0.5965029198735978
+ }
+ },
+ "pass@2": 0.3393333333333333,
+ "BoN-3": {
+ "overall_metric": 0.6236268197943012,
+ "token_length": {
+ "8k": 0.7099773255313729,
+ "16k": 0.6915751481616828,
+ "32k": 0.6387773010277447,
+ "64k": 0.615062805287137,
+ "128k": 0.5761902661960858,
+ "256k": 0.5101780725617869
+ },
+ "contextual_requirement": {
+ "Full": 0.5835815761216835,
+ "Partial": 0.6745934935594524
+ },
+ "difficulty": {
+ "Easy": 0.8441219878850473,
+ "Moderate": 0.6121934384709582,
+ "Hard": 0.5394052983997921,
+ "Extreme": 0.4449122649436655
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8757613870336376,
+ "T2. Sequencing & Structure Reconstruction": 0.8133404095904098,
+ "T3. Evidence-Grounded QA": 0.6083333333333333,
+ "T4. Summarization & Synthesis": 0.5410211462556259,
+ "T5. Attribution & Citation Alignment": 0.5882233881768194,
+ "T6. Aggregation & Clustering": 0.6075727032146355,
+ "T7. Consistency & Compliance Checking": 0.41240156108775533,
+ "T8. Structured & Numeric Reasoning": 0.6592592592592592,
+ "T9. Version & Code Diff Analysis": 0.658971964363135,
+ "T10. Rule Induction & In-Context Learning": 0.6241666666666666,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667
+ },
+ "language": {
+ "Chinese": 0.6173730284036301,
+ "English": 0.6298806111849737
+ }
+ },
+ "pass@3": 0.37466666666666665
+}
\ No newline at end of file
diff --git a/results/Qwen3-4B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json b/results/Qwen3-4B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..b67d7cb77cf75fceb16f0f95696236fa3c9c645b
--- /dev/null
+++ b/results/Qwen3-4B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.36776939054307617,
+ "inference_iteration_1_overall_metric": 0.3687031785251978,
+ "inference_iteration_2_overall_metric": 0.36661913973941934,
+ "inference_iteration_3_overall_metric": 0.3679858533646109,
+ "average_token_length_metric": {
+ "8k": 0.44810945279428716,
+ "16k": 0.4111037532728985,
+ "32k": 0.3865217233693066,
+ "64k": 0.34256931704639887,
+ "128k": 0.3342606193069893,
+ "256k": 0.28405147746857656
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3471098106864211,
+ "Partial": 0.39406340126972855
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.4833480360927418,
+ "Moderate": 0.3021440333762109,
+ "Hard": 0.3393622074968013,
+ "Extreme": 0.3029205200127554
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6636265683033888,
+ "T2. Sequencing & Structure Reconstruction": 0.5914089584991948,
+ "T3. Evidence-Grounded QA": 0.5027777777777778,
+ "T4. Summarization & Synthesis": 0.5180185643786743,
+ "T5. Attribution & Citation Alignment": 0.3538642246619649,
+ "T6. Aggregation & Clustering": 0.3312771882807734,
+ "T7. Consistency & Compliance Checking": 0.1987781645586334,
+ "T8. Structured & Numeric Reasoning": 0.09861111111111111,
+ "T9. Version & Code Diff Analysis": 0.38761603668611966,
+ "T10. Rule Induction & In-Context Learning": 0.39236111111111105,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.24444444444444444
+ },
+ "average_language_metric": {
+ "Chinese": 0.3784916959047209,
+ "English": 0.35704708518143186
+ },
+ "BoN-1": {
+ "overall_metric": 0.3687031785251978,
+ "token_length": {
+ "8k": 0.45858099952533665,
+ "16k": 0.4197668091574054,
+ "32k": 0.3847492794129437,
+ "64k": 0.3409058296099612,
+ "128k": 0.32898091620375153,
+ "256k": 0.2792352372417906
+ },
+ "contextual_requirement": {
+ "Full": 0.34899758846399653,
+ "Partial": 0.3937830204212733
+ },
+ "difficulty": {
+ "Easy": 0.48834649579871187,
+ "Moderate": 0.30447624121232425,
+ "Hard": 0.3355461000287082,
+ "Extreme": 0.30160295498672696
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6577542622394902,
+ "T2. Sequencing & Structure Reconstruction": 0.5875901640003564,
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
+ "T4. Summarization & Synthesis": 0.5182254030588427,
+ "T5. Attribution & Citation Alignment": 0.3501215130410147,
+ "T6. Aggregation & Clustering": 0.32672419090342486,
+ "T7. Consistency & Compliance Checking": 0.19951249587965042,
+ "T8. Structured & Numeric Reasoning": 0.10509259259259258,
+ "T9. Version & Code Diff Analysis": 0.3982433590506608,
+ "T10. Rule Induction & In-Context Learning": 0.3831944444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
+ },
+ "language": {
+ "Chinese": 0.3804916349954618,
+ "English": 0.35691472205493485
+ }
+ },
+ "pass@1": 0.13733333333333334,
+ "BoN-2": {
+ "overall_metric": 0.3840460410863504,
+ "token_length": {
+ "8k": 0.46700683938890614,
+ "16k": 0.4284116254260062,
+ "32k": 0.40307059075358426,
+ "64k": 0.36117723090379694,
+ "128k": 0.34663017301008414,
+ "256k": 0.29797978703572725
+ },
+ "contextual_requirement": {
+ "Full": 0.3640278472961897,
+ "Partial": 0.40952374227382876
+ },
+ "difficulty": {
+ "Easy": 0.49937652616615,
+ "Moderate": 0.3169552390958882,
+ "Hard": 0.3549456654310522,
+ "Extreme": 0.32088685138444606
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6767487727424361,
+ "T2. Sequencing & Structure Reconstruction": 0.6066056866188131,
+ "T3. Evidence-Grounded QA": 0.525,
+ "T4. Summarization & Synthesis": 0.5276694633153738,
+ "T5. Attribution & Citation Alignment": 0.3727088921038869,
+ "T6. Aggregation & Clustering": 0.3527528047911058,
+ "T7. Consistency & Compliance Checking": 0.20981326336377637,
+ "T8. Structured & Numeric Reasoning": 0.11620370370370371,
+ "T9. Version & Code Diff Analysis": 0.41827137434433015,
+ "T10. Rule Induction & In-Context Learning": 0.39708333333333334,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336
+ },
+ "language": {
+ "Chinese": 0.39280983345013615,
+ "English": 0.3752822487225656
+ }
+ },
+ "pass@2": 0.14733333333333334,
+ "BoN-3": {
+ "overall_metric": 0.39322455891947206,
+ "token_length": {
+ "8k": 0.4701089334853473,
+ "16k": 0.43225890052967636,
+ "32k": 0.4184755108081086,
+ "64k": 0.3714644148022562,
+ "128k": 0.35734202547412225,
+ "256k": 0.3096975684173234
+ },
+ "contextual_requirement": {
+ "Full": 0.371530129813719,
+ "Partial": 0.4208356505086133
+ },
+ "difficulty": {
+ "Easy": 0.5099419962893179,
+ "Moderate": 0.3245920787447158,
+ "Hard": 0.36491737661035256,
+ "Extreme": 0.32903764447759715
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6847508372365279,
+ "T2. Sequencing & Structure Reconstruction": 0.6160001081301075,
+ "T3. Evidence-Grounded QA": 0.525,
+ "T4. Summarization & Synthesis": 0.5341098799748382,
+ "T5. Attribution & Citation Alignment": 0.3970649159521875,
+ "T6. Aggregation & Clustering": 0.3694259879861735,
+ "T7. Consistency & Compliance Checking": 0.21278092728735123,
+ "T8. Structured & Numeric Reasoning": 0.11712962962962964,
+ "T9. Version & Code Diff Analysis": 0.42895976117834506,
+ "T10. Rule Induction & In-Context Learning": 0.41375,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.26666666666666666
+ },
+ "language": {
+ "Chinese": 0.4014939630827822,
+ "English": 0.38495515475616277
+ }
+ },
+ "pass@3": 0.15333333333333332
+}
\ No newline at end of file
diff --git a/results/Qwen3-4B-Instruct-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-4B-Instruct-2507/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..be968e8d160a17c157e44d4d98626ce327db1d59
--- /dev/null
+++ b/results/Qwen3-4B-Instruct-2507/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.45680682461664274,
+ "inference_iteration_1_overall_metric": 0.46832062566231697,
+ "inference_iteration_2_overall_metric": 0.4448543197576111,
+ "inference_iteration_3_overall_metric": 0.4572455284300009,
+ "average_token_length_metric": {
+ "8k": 0.5361902897675453,
+ "16k": 0.5387135746745756,
+ "32k": 0.49630855084750314,
+ "64k": 0.4349221423372099,
+ "128k": 0.4097160751427924,
+ "256k": 0.32499031493023295
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.42152028749347914,
+ "Partial": 0.5017169627733982
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6782361744837745,
+ "Moderate": 0.39685509696404564,
+ "Hard": 0.36961618406661007,
+ "Extreme": 0.31088681955892267
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7107158896812488,
+ "T2. Sequencing & Structure Reconstruction": 0.6878841679901458,
+ "T3. Evidence-Grounded QA": 0.4972222222222221,
+ "T4. Summarization & Synthesis": 0.49954693637602554,
+ "T5. Attribution & Citation Alignment": 0.4031404238145398,
+ "T6. Aggregation & Clustering": 0.45473413170860827,
+ "T7. Consistency & Compliance Checking": 0.22366919601340218,
+ "T8. Structured & Numeric Reasoning": 0.39320987654321,
+ "T9. Version & Code Diff Analysis": 0.48054475011491815,
+ "T10. Rule Induction & In-Context Learning": 0.47361111111111115,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35
+ },
+ "average_language_metric": {
+ "Chinese": 0.45095633442828276,
+ "English": 0.46265731480500444
+ },
+ "BoN-1": {
+ "overall_metric": 0.46832062566231697,
+ "token_length": {
+ "8k": 0.5556419203082478,
+ "16k": 0.5568269490713018,
+ "32k": 0.496345489575371,
+ "64k": 0.43294552638521866,
+ "128k": 0.40700473005528304,
+ "256k": 0.36115913857848075
+ },
+ "contextual_requirement": {
+ "Full": 0.4357580294575037,
+ "Partial": 0.5097639299229886
+ },
+ "difficulty": {
+ "Easy": 0.6918638477497621,
+ "Moderate": 0.4331357742500079,
+ "Hard": 0.3807852518328578,
+ "Extreme": 0.3040440621366871
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7207239810971894,
+ "T2. Sequencing & Structure Reconstruction": 0.6905517621196958,
+ "T3. Evidence-Grounded QA": 0.49166666666666664,
+ "T4. Summarization & Synthesis": 0.5029601464767286,
+ "T5. Attribution & Citation Alignment": 0.4001875429352411,
+ "T6. Aggregation & Clustering": 0.47488345973500956,
+ "T7. Consistency & Compliance Checking": 0.23593295101774395,
+ "T8. Structured & Numeric Reasoning": 0.42916666666666664,
+ "T9. Version & Code Diff Analysis": 0.48280421646542415,
+ "T10. Rule Induction & In-Context Learning": 0.4718055555555555,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.38333333333333336
+ },
+ "language": {
+ "Chinese": 0.4569671050679203,
+ "English": 0.4796741462567137
+ }
+ },
+ "pass@1": 0.22333333333333333,
+ "BoN-2": {
+ "overall_metric": 0.5212696813472717,
+ "token_length": {
+ "8k": 0.5986675362212036,
+ "16k": 0.5973077359959412,
+ "32k": 0.5622230583396595,
+ "64k": 0.4981746316724617,
+ "128k": 0.4716644530251574,
+ "256k": 0.3995806728292114
+ },
+ "contextual_requirement": {
+ "Full": 0.48512808383126105,
+ "Partial": 0.5672680781858331
+ },
+ "difficulty": {
+ "Easy": 0.7545680056315479,
+ "Moderate": 0.47986119164396557,
+ "Hard": 0.4295378743414953,
+ "Extreme": 0.35315093917432855
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7512588469355942,
+ "T2. Sequencing & Structure Reconstruction": 0.7467928367928364,
+ "T3. Evidence-Grounded QA": 0.55,
+ "T4. Summarization & Synthesis": 0.51430009825831,
+ "T5. Attribution & Citation Alignment": 0.4706079149193554,
+ "T6. Aggregation & Clustering": 0.5469885223100209,
+ "T7. Consistency & Compliance Checking": 0.2738455117300439,
+ "T8. Structured & Numeric Reasoning": 0.4750000000000001,
+ "T9. Version & Code Diff Analysis": 0.5577713799858229,
+ "T10. Rule Induction & In-Context Learning": 0.5730555555555554,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
+ },
+ "language": {
+ "Chinese": 0.5125792853133213,
+ "English": 0.5299600773812242
+ }
+ },
+ "pass@2": 0.266,
+ "BoN-3": {
+ "overall_metric": 0.555104746077484,
+ "token_length": {
+ "8k": 0.6338196004796416,
+ "16k": 0.6317696475388848,
+ "32k": 0.5989131419373114,
+ "64k": 0.5293428230237414,
+ "128k": 0.5149355297222864,
+ "256k": 0.421847733763042
+ },
+ "contextual_requirement": {
+ "Full": 0.5193420147209636,
+ "Partial": 0.6006209496221481
+ },
+ "difficulty": {
+ "Easy": 0.78804822617357,
+ "Moderate": 0.5160928935941635,
+ "Hard": 0.46600874198821185,
+ "Extreme": 0.3840693214028631
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.789852791947241,
+ "T2. Sequencing & Structure Reconstruction": 0.7723075073075069,
+ "T3. Evidence-Grounded QA": 0.5833333333333334,
+ "T4. Summarization & Synthesis": 0.5189479579160801,
+ "T5. Attribution & Citation Alignment": 0.5055110625297212,
+ "T6. Aggregation & Clustering": 0.5940840810722463,
+ "T7. Consistency & Compliance Checking": 0.30869842588741486,
+ "T8. Structured & Numeric Reasoning": 0.5157407407407408,
+ "T9. Version & Code Diff Analysis": 0.5819051347174037,
+ "T10. Rule Induction & In-Context Learning": 0.6091666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.45
+ },
+ "language": {
+ "Chinese": 0.544214789512689,
+ "English": 0.5659947026422806
+ }
+ },
+ "pass@3": 0.29933333333333334
+}
\ No newline at end of file
diff --git a/results/Qwen3-4B-Thinking-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-4B-Thinking-2507/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cb8d09e7a395ff28dd04c41d686c202bd0d5141
--- /dev/null
+++ b/results/Qwen3-4B-Thinking-2507/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5009859553721181,
+ "inference_iteration_1_overall_metric": 0.5048548848776019,
+ "inference_iteration_2_overall_metric": 0.5024790497866946,
+ "inference_iteration_3_overall_metric": 0.49562393145205963,
+ "average_token_length_metric": {
+ "8k": 0.632687775526695,
+ "16k": 0.5918320458309317,
+ "32k": 0.5823146846057027,
+ "64k": 0.46822470780697634,
+ "128k": 0.41206384144125535,
+ "256k": 0.31879267702115177
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.4729409452430084,
+ "Partial": 0.5366796046273523
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.7052606065866089,
+ "Moderate": 0.47658986127708997,
+ "Hard": 0.40985193332208675,
+ "Extreme": 0.3530946410097064
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7734823342182894,
+ "T2. Sequencing & Structure Reconstruction": 0.7182270412956683,
+ "T3. Evidence-Grounded QA": 0.5111111111111112,
+ "T4. Summarization & Synthesis": 0.5256473893029667,
+ "T5. Attribution & Citation Alignment": 0.453174269183897,
+ "T6. Aggregation & Clustering": 0.5035281189490585,
+ "T7. Consistency & Compliance Checking": 0.28246589806288247,
+ "T8. Structured & Numeric Reasoning": 0.46820987654321,
+ "T9. Version & Code Diff Analysis": 0.5332190492994193,
+ "T10. Rule Induction & In-Context Learning": 0.49949074074074074,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664
+ },
+ "average_language_metric": {
+ "Chinese": 0.5038793951541226,
+ "English": 0.49809251559011675
+ },
+ "BoN-1": {
+ "overall_metric": 0.5048548848776019,
+ "token_length": {
+ "8k": 0.6415649649637651,
+ "16k": 0.5932697726935569,
+ "32k": 0.5781773434671414,
+ "64k": 0.47574852291888636,
+ "128k": 0.42188240734810956,
+ "256k": 0.3184862978741517
+ },
+ "contextual_requirement": {
+ "Full": 0.48018245228514805,
+ "Partial": 0.5362561627225435
+ },
+ "difficulty": {
+ "Easy": 0.7166716175061497,
+ "Moderate": 0.478876954837501,
+ "Hard": 0.4052133058691331,
+ "Extreme": 0.3553284598505033
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7738486865765832,
+ "T2. Sequencing & Structure Reconstruction": 0.7040430674989494,
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
+ "T4. Summarization & Synthesis": 0.5245700102853508,
+ "T5. Attribution & Citation Alignment": 0.46396062856589165,
+ "T6. Aggregation & Clustering": 0.5033233762229643,
+ "T7. Consistency & Compliance Checking": 0.2898251412665043,
+ "T8. Structured & Numeric Reasoning": 0.4486111111111112,
+ "T9. Version & Code Diff Analysis": 0.5448186695868223,
+ "T10. Rule Induction & In-Context Learning": 0.5034722222222222,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.39166666666666666
+ },
+ "language": {
+ "Chinese": 0.5010388581951326,
+ "English": 0.5086709115600716
+ }
+ },
+ "pass@1": 0.2693333333333333,
+ "BoN-2": {
+ "overall_metric": 0.5815730990020301,
+ "token_length": {
+ "8k": 0.7031921430610256,
+ "16k": 0.6820413887129978,
+ "32k": 0.6633230661301664,
+ "64k": 0.5513587902856957,
+ "128k": 0.4989138280568502,
+ "256k": 0.390609377765447
+ },
+ "contextual_requirement": {
+ "Full": 0.5521045772934127,
+ "Partial": 0.6190784902675451
+ },
+ "difficulty": {
+ "Easy": 0.8002564788590355,
+ "Moderate": 0.5756263992124815,
+ "Hard": 0.4952494575695323,
+ "Extreme": 0.40262023427951893
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8347164772660534,
+ "T2. Sequencing & Structure Reconstruction": 0.7779337329337325,
+ "T3. Evidence-Grounded QA": 0.625,
+ "T4. Summarization & Synthesis": 0.5385917063825295,
+ "T5. Attribution & Citation Alignment": 0.5486563132286816,
+ "T6. Aggregation & Clustering": 0.5970895337084138,
+ "T7. Consistency & Compliance Checking": 0.34483711167791187,
+ "T8. Structured & Numeric Reasoning": 0.563425925925926,
+ "T9. Version & Code Diff Analysis": 0.629792206301561,
+ "T10. Rule Induction & In-Context Learning": 0.6152777777777777,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665
+ },
+ "language": {
+ "Chinese": 0.5864920535980012,
+ "English": 0.5766541444060607
+ }
+ },
+ "pass@2": 0.3353333333333333,
+ "BoN-3": {
+ "overall_metric": 0.6109433025170551,
+ "token_length": {
+ "8k": 0.7379808139270423,
+ "16k": 0.7090640257842541,
+ "32k": 0.6929789126030277,
+ "64k": 0.5972496482938813,
+ "128k": 0.5202041288295823,
+ "256k": 0.4081822856645453
+ },
+ "contextual_requirement": {
+ "Full": 0.587829494832041,
+ "Partial": 0.6403608759343477
+ },
+ "difficulty": {
+ "Easy": 0.8231196664846697,
+ "Moderate": 0.624752372815826,
+ "Hard": 0.5190752370733982,
+ "Extreme": 0.429784307436333
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8563273820048136,
+ "T2. Sequencing & Structure Reconstruction": 0.8023217060717058,
+ "T3. Evidence-Grounded QA": 0.625,
+ "T4. Summarization & Synthesis": 0.545280602732678,
+ "T5. Attribution & Citation Alignment": 0.5707889491113174,
+ "T6. Aggregation & Clustering": 0.625445879237853,
+ "T7. Consistency & Compliance Checking": 0.39806047878615286,
+ "T8. Structured & Numeric Reasoning": 0.6027777777777777,
+ "T9. Version & Code Diff Analysis": 0.6498686600622252,
+ "T10. Rule Induction & In-Context Learning": 0.6477777777777778,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
+ },
+ "language": {
+ "Chinese": 0.6146496797050219,
+ "English": 0.60723692532909
+ }
+ },
+ "pass@3": 0.36466666666666664
+}
\ No newline at end of file
diff --git a/results/Qwen3-4B/nonthinking_context-120000_bon-3_summary.json b/results/Qwen3-4B/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..7038193def03bbdb5dc71c2fe624024211773e5f
--- /dev/null
+++ b/results/Qwen3-4B/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3126302932284987,
+ "inference_iteration_1_overall_metric": 0.31121626849914974,
+ "inference_iteration_2_overall_metric": 0.314752838275754,
+ "inference_iteration_3_overall_metric": 0.31192177291059353,
+ "average_token_length_metric": {
+ "8k": 0.3741643840311737,
+ "16k": 0.31633468288420485,
+ "32k": 0.3451526313630571,
+ "64k": 0.28786229377190237,
+ "128k": 0.28715220052628043,
+ "256k": 0.26511556679437487
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.30107503486959114,
+ "Partial": 0.3273369856852913
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.404275621826777,
+ "Moderate": 0.23333879206992347,
+ "Hard": 0.30100444227102663,
+ "Extreme": 0.27195317135842567
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6097768436246711,
+ "T2. Sequencing & Structure Reconstruction": 0.5604218483789652,
+ "T3. Evidence-Grounded QA": 0.4305555555555555,
+ "T4. Summarization & Synthesis": 0.5071899925945303,
+ "T5. Attribution & Citation Alignment": 0.2003512792090914,
+ "T6. Aggregation & Clustering": 0.31485071719679697,
+ "T7. Consistency & Compliance Checking": 0.15277263547227143,
+ "T8. Structured & Numeric Reasoning": 0.07777777777777777,
+ "T9. Version & Code Diff Analysis": 0.258888857730561,
+ "T10. Rule Induction & In-Context Learning": 0.3309259259259259,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
+ },
+ "average_language_metric": {
+ "Chinese": 0.30923682893499727,
+ "English": 0.3160237575220013
+ },
+ "BoN-1": {
+ "overall_metric": 0.31121626849914974,
+ "token_length": {
+ "8k": 0.36915288725425155,
+ "16k": 0.3139436770180253,
+ "32k": 0.35672840494575175,
+ "64k": 0.2822737049825353,
+ "128k": 0.2867451109679906,
+ "256k": 0.25845382582634246
+ },
+ "contextual_requirement": {
+ "Full": 0.2946313817206056,
+ "Partial": 0.33232430621729636
+ },
+ "difficulty": {
+ "Easy": 0.40768136684677136,
+ "Moderate": 0.22509851498282202,
+ "Hard": 0.3024334030525033,
+ "Extreme": 0.26787559003720773
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6106663615330956,
+ "T2. Sequencing & Structure Reconstruction": 0.5653203286176459,
+ "T3. Evidence-Grounded QA": 0.4166666666666667,
+ "T4. Summarization & Synthesis": 0.5026988695288178,
+ "T5. Attribution & Citation Alignment": 0.1941123340186669,
+ "T6. Aggregation & Clustering": 0.30605814169863926,
+ "T7. Consistency & Compliance Checking": 0.153482524546797,
+ "T8. Structured & Numeric Reasoning": 0.08148148148148147,
+ "T9. Version & Code Diff Analysis": 0.2592055742840997,
+ "T10. Rule Induction & In-Context Learning": 0.33,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2
+ },
+ "language": {
+ "Chinese": 0.30874607113427527,
+ "English": 0.31368646586402377
+ }
+ },
+ "pass@1": 0.10266666666666667,
+ "BoN-2": {
+ "overall_metric": 0.33311542718948595,
+ "token_length": {
+ "8k": 0.390054030159741,
+ "16k": 0.32615430525928374,
+ "32k": 0.3817681434674135,
+ "64k": 0.3127897381386726,
+ "128k": 0.3077977855264991,
+ "256k": 0.28012856058530294
+ },
+ "contextual_requirement": {
+ "Full": 0.31637653425658996,
+ "Partial": 0.3544194727404438
+ },
+ "difficulty": {
+ "Easy": 0.4246995397112217,
+ "Moderate": 0.25227259034564015,
+ "Hard": 0.32260424775606733,
+ "Extreme": 0.29279217370914995
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6411601457898036,
+ "T2. Sequencing & Structure Reconstruction": 0.5895421729520939,
+ "T3. Evidence-Grounded QA": 0.44166666666666665,
+ "T4. Summarization & Synthesis": 0.5176163934525975,
+ "T5. Attribution & Citation Alignment": 0.22133852710308327,
+ "T6. Aggregation & Clustering": 0.34506363663570866,
+ "T7. Consistency & Compliance Checking": 0.17423574705889155,
+ "T8. Structured & Numeric Reasoning": 0.08148148148148147,
+ "T9. Version & Code Diff Analysis": 0.299919858362423,
+ "T10. Rule Induction & In-Context Learning": 0.3515277777777778,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2
+ },
+ "language": {
+ "Chinese": 0.33158121218981956,
+ "English": 0.33464964218915183
+ }
+ },
+ "pass@2": 0.10933333333333334,
+ "BoN-3": {
+ "overall_metric": 0.3459735709442073,
+ "token_length": {
+ "8k": 0.4018096372567551,
+ "16k": 0.34146668092259574,
+ "32k": 0.39270536315408494,
+ "64k": 0.3308531716059115,
+ "128k": 0.31620040712911074,
+ "256k": 0.29280616559678363
+ },
+ "contextual_requirement": {
+ "Full": 0.3294609526166743,
+ "Partial": 0.3669896306337945
+ },
+ "difficulty": {
+ "Easy": 0.43779012322510574,
+ "Moderate": 0.27067727963707533,
+ "Hard": 0.33120650215495034,
+ "Extreme": 0.3045479638270778
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.646933160923283,
+ "T2. Sequencing & Structure Reconstruction": 0.606818823765589,
+ "T3. Evidence-Grounded QA": 0.45,
+ "T4. Summarization & Synthesis": 0.5261451995899935,
+ "T5. Attribution & Citation Alignment": 0.23085472354458936,
+ "T6. Aggregation & Clustering": 0.3665833344388899,
+ "T7. Consistency & Compliance Checking": 0.18478135703392412,
+ "T8. Structured & Numeric Reasoning": 0.08703703703703704,
+ "T9. Version & Code Diff Analysis": 0.31728735843657757,
+ "T10. Rule Induction & In-Context Learning": 0.36402777777777773,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.225
+ },
+ "language": {
+ "Chinese": 0.3427214477984433,
+ "English": 0.3492256940899709
+ }
+ },
+ "pass@3": 0.11466666666666667
+}
\ No newline at end of file
diff --git a/results/Qwen3-4B/thinking_context-120000_bon-3_summary.json b/results/Qwen3-4B/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae0db6f0903c73a0aabaa0e44ca5ad47ad804067
--- /dev/null
+++ b/results/Qwen3-4B/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 17,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.40820377372546796,
+ "inference_iteration_1_overall_metric": 0.40990347673882405,
+ "inference_iteration_2_overall_metric": 0.39860881288038574,
+ "inference_iteration_3_overall_metric": 0.41609903155719674,
+ "average_token_length_metric": {
+ "8k": 0.518261214308827,
+ "16k": 0.45885281352828067,
+ "32k": 0.45766504375868816,
+ "64k": 0.34332890432195795,
+ "128k": 0.3677707675199754,
+ "256k": 0.30334389891508334
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.38531613330536113,
+ "Partial": 0.4373334978965144
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.5984772560988485,
+ "Moderate": 0.3126595325360281,
+ "Hard": 0.34066117314091837,
+ "Extreme": 0.3068862254725463
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6569147883782315,
+ "T2. Sequencing & Structure Reconstruction": 0.6466218735859872,
+ "T3. Evidence-Grounded QA": 0.45,
+ "T4. Summarization & Synthesis": 0.5084591641143323,
+ "T5. Attribution & Citation Alignment": 0.33382002543718364,
+ "T6. Aggregation & Clustering": 0.4088451076318483,
+ "T7. Consistency & Compliance Checking": 0.19607244984784905,
+ "T8. Structured & Numeric Reasoning": 0.35447530864197535,
+ "T9. Version & Code Diff Analysis": 0.412734613462707,
+ "T10. Rule Induction & In-Context Learning": 0.41046296296296286,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2444444444444444
+ },
+ "average_language_metric": {
+ "Chinese": 0.3969713362374892,
+ "English": 0.41943621121344793
+ },
+ "BoN-1": {
+ "overall_metric": 0.40990347673882405,
+ "token_length": {
+ "8k": 0.5073935877667868,
+ "16k": 0.4625290937943797,
+ "32k": 0.46311499347751756,
+ "64k": 0.3585619661835362,
+ "128k": 0.3674845287723082,
+ "256k": 0.30033669043841327
+ },
+ "contextual_requirement": {
+ "Full": 0.3859794000653783,
+ "Partial": 0.4403523015959356
+ },
+ "difficulty": {
+ "Easy": 0.5960294211438244,
+ "Moderate": 0.3327872754241752,
+ "Hard": 0.3381491070256522,
+ "Extreme": 0.30379186270207253
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.663842843941683,
+ "T2. Sequencing & Structure Reconstruction": 0.6545791245791246,
+ "T3. Evidence-Grounded QA": 0.475,
+ "T4. Summarization & Synthesis": 0.5062086959994923,
+ "T5. Attribution & Citation Alignment": 0.3264498285141694,
+ "T6. Aggregation & Clustering": 0.405516164076872,
+ "T7. Consistency & Compliance Checking": 0.19553549353623603,
+ "T8. Structured & Numeric Reasoning": 0.36342592592592593,
+ "T9. Version & Code Diff Analysis": 0.4201632575589423,
+ "T10. Rule Induction & In-Context Learning": 0.3725,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336
+ },
+ "language": {
+ "Chinese": 0.3981684556698,
+ "English": 0.42163849780784673
+ }
+ },
+ "pass@1": 0.18,
+ "BoN-2": {
+ "overall_metric": 0.46589831041740126,
+ "token_length": {
+ "8k": 0.5699928269386603,
+ "16k": 0.5343488480787704,
+ "32k": 0.5264237759640333,
+ "64k": 0.39895039817299616,
+ "128k": 0.40892582534920624,
+ "256k": 0.3567481880007445
+ },
+ "contextual_requirement": {
+ "Full": 0.43584342446033286,
+ "Partial": 0.5041499834536717
+ },
+ "difficulty": {
+ "Easy": 0.6647834735405885,
+ "Moderate": 0.37447270682167993,
+ "Hard": 0.39477207070516285,
+ "Extreme": 0.3547956606233666
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7038416454068085,
+ "T2. Sequencing & Structure Reconstruction": 0.7210729085729087,
+ "T3. Evidence-Grounded QA": 0.5,
+ "T4. Summarization & Synthesis": 0.5215419635980021,
+ "T5. Attribution & Citation Alignment": 0.40511713894417817,
+ "T6. Aggregation & Clustering": 0.4739903039012416,
+ "T7. Consistency & Compliance Checking": 0.24396925466217162,
+ "T8. Structured & Numeric Reasoning": 0.4175925925925927,
+ "T9. Version & Code Diff Analysis": 0.48896588585050343,
+ "T10. Rule Induction & In-Context Learning": 0.44652777777777775,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333
+ },
+ "language": {
+ "Chinese": 0.46255576530710224,
+ "English": 0.4692408555277019
+ }
+ },
+ "pass@2": 0.216,
+ "BoN-3": {
+ "overall_metric": 0.504738390514233,
+ "token_length": {
+ "8k": 0.6204219657283213,
+ "16k": 0.5577361733074144,
+ "32k": 0.5696181517494681,
+ "64k": 0.43160873083607526,
+ "128k": 0.464000536292567,
+ "256k": 0.3850447851715533
+ },
+ "contextual_requirement": {
+ "Full": 0.47762538119520404,
+ "Partial": 0.5392458569202704
+ },
+ "difficulty": {
+ "Easy": 0.7265559994106623,
+ "Moderate": 0.4074392024047854,
+ "Hard": 0.4294021958454043,
+ "Extreme": 0.37513734082069433
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7403624370535461,
+ "T2. Sequencing & Structure Reconstruction": 0.7454373866873865,
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
+ "T4. Summarization & Synthesis": 0.5277914424080392,
+ "T5. Attribution & Citation Alignment": 0.4422798028876217,
+ "T6. Aggregation & Clustering": 0.5201921178442834,
+ "T7. Consistency & Compliance Checking": 0.26324271269098753,
+ "T8. Structured & Numeric Reasoning": 0.4777777777777778,
+ "T9. Version & Code Diff Analysis": 0.5316787888106357,
+ "T10. Rule Induction & In-Context Learning": 0.5215277777777778,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.375
+ },
+ "language": {
+ "Chinese": 0.49773088107965896,
+ "English": 0.5117458999488075
+ }
+ },
+ "pass@3": 0.24666666666666667
+}
\ No newline at end of file
diff --git a/results/Qwen3-8B/nonthinking_context-120000_bon-3_summary.json b/results/Qwen3-8B/nonthinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..0b30b38eab6700cbd762d670ff679b683fc6a281
--- /dev/null
+++ b/results/Qwen3-8B/nonthinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.3341263002850569,
+ "inference_iteration_1_overall_metric": 0.33209862396824386,
+ "inference_iteration_2_overall_metric": 0.33541946267252254,
+ "inference_iteration_3_overall_metric": 0.3348608142144041,
+ "average_token_length_metric": {
+ "8k": 0.3784198847034242,
+ "16k": 0.37953608827547447,
+ "32k": 0.36499771721065843,
+ "64k": 0.28008358934905153,
+ "128k": 0.3132115847486681,
+ "256k": 0.2885089374230642
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.3137072166929107,
+ "Partial": 0.3601142248568793
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.428559130222621,
+ "Moderate": 0.25195385511657276,
+ "Hard": 0.31092306311632556,
+ "Extreme": 0.2998920915703961
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.6412648134912415,
+ "T2. Sequencing & Structure Reconstruction": 0.6145308824378115,
+ "T3. Evidence-Grounded QA": 0.46388888888888896,
+ "T4. Summarization & Synthesis": 0.5163026571908423,
+ "T5. Attribution & Citation Alignment": 0.2523030174294647,
+ "T6. Aggregation & Clustering": 0.3470095028864795,
+ "T7. Consistency & Compliance Checking": 0.1606584732901947,
+ "T8. Structured & Numeric Reasoning": 0.07453703703703704,
+ "T9. Version & Code Diff Analysis": 0.2941939372673569,
+ "T10. Rule Induction & In-Context Learning": 0.30689814814814814,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2138888888888889
+ },
+ "average_language_metric": {
+ "Chinese": 0.33785469604432067,
+ "English": 0.3303979045257931
+ },
+ "BoN-1": {
+ "overall_metric": 0.33209862396824386,
+ "token_length": {
+ "8k": 0.37642064292707555,
+ "16k": 0.3763334563411937,
+ "32k": 0.3747061763092641,
+ "64k": 0.26888181342117645,
+ "128k": 0.2968232384609332,
+ "256k": 0.2994264163498193
+ },
+ "contextual_requirement": {
+ "Full": 0.31079609686291426,
+ "Partial": 0.35921093119320874
+ },
+ "difficulty": {
+ "Easy": 0.42792762230121,
+ "Moderate": 0.25490326136513564,
+ "Hard": 0.30852284836466365,
+ "Extreme": 0.2933106279347054
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6344283740437071,
+ "T2. Sequencing & Structure Reconstruction": 0.627808712385359,
+ "T3. Evidence-Grounded QA": 0.45,
+ "T4. Summarization & Synthesis": 0.5165407101035675,
+ "T5. Attribution & Citation Alignment": 0.2452312681950123,
+ "T6. Aggregation & Clustering": 0.3367099858810609,
+ "T7. Consistency & Compliance Checking": 0.15639989872237942,
+ "T8. Structured & Numeric Reasoning": 0.08287037037037037,
+ "T9. Version & Code Diff Analysis": 0.30783668574801865,
+ "T10. Rule Induction & In-Context Learning": 0.29708333333333337,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
+ },
+ "language": {
+ "Chinese": 0.3358334164162984,
+ "English": 0.3283638315201894
+ }
+ },
+ "pass@1": 0.11333333333333333,
+ "BoN-2": {
+ "overall_metric": 0.3675196408761814,
+ "token_length": {
+ "8k": 0.3886475538954706,
+ "16k": 0.41257816091326605,
+ "32k": 0.41159385509736474,
+ "64k": 0.31664455480081405,
+ "128k": 0.35128941591866764,
+ "256k": 0.32436430463150734
+ },
+ "contextual_requirement": {
+ "Full": 0.34706677159026683,
+ "Partial": 0.39355056542189176
+ },
+ "difficulty": {
+ "Easy": 0.4703351630161875,
+ "Moderate": 0.29345326870943717,
+ "Hard": 0.341763065112941,
+ "Extreme": 0.32045543696773376
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6724018408268643,
+ "T2. Sequencing & Structure Reconstruction": 0.6527223172989636,
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
+ "T4. Summarization & Synthesis": 0.5284447990610109,
+ "T5. Attribution & Citation Alignment": 0.2918155891981263,
+ "T6. Aggregation & Clustering": 0.3951103080046706,
+ "T7. Consistency & Compliance Checking": 0.17493867106904193,
+ "T8. Structured & Numeric Reasoning": 0.09398148148148147,
+ "T9. Version & Code Diff Analysis": 0.33853749595673677,
+ "T10. Rule Induction & In-Context Learning": 0.33902777777777776,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336
+ },
+ "language": {
+ "Chinese": 0.36969778846765494,
+ "English": 0.36534149328470883
+ }
+ },
+ "pass@2": 0.13133333333333333,
+ "BoN-3": {
+ "overall_metric": 0.3871899208655039,
+ "token_length": {
+ "8k": 0.40912701437278987,
+ "16k": 0.4254527731119934,
+ "32k": 0.43307631053998147,
+ "64k": 0.3370765423947205,
+ "128k": 0.3753522812020257,
+ "256k": 0.3430546035715128
+ },
+ "contextual_requirement": {
+ "Full": 0.36557204234325574,
+ "Partial": 0.4147035844392751
+ },
+ "difficulty": {
+ "Easy": 0.48861463017463347,
+ "Moderate": 0.30875721939034645,
+ "Hard": 0.3637658481982982,
+ "Extreme": 0.3429851432294641
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6857312280559049,
+ "T2. Sequencing & Structure Reconstruction": 0.6763734162000631,
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
+ "T4. Summarization & Synthesis": 0.5345601158557589,
+ "T5. Attribution & Citation Alignment": 0.32057216845982056,
+ "T6. Aggregation & Clustering": 0.4271227501408007,
+ "T7. Consistency & Compliance Checking": 0.19336910203847105,
+ "T8. Structured & Numeric Reasoning": 0.11620370370370368,
+ "T9. Version & Code Diff Analysis": 0.3618993039783438,
+ "T10. Rule Induction & In-Context Learning": 0.3556944444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336
+ },
+ "language": {
+ "Chinese": 0.3912716913429664,
+ "English": 0.38310815038804175
+ }
+ },
+ "pass@3": 0.14133333333333334
+}
\ No newline at end of file
diff --git a/results/Qwen3-8B/thinking_context-120000_bon-3_summary.json b/results/Qwen3-8B/thinking_context-120000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6b23d23be76ce6d63a792692c4cd059287da569
--- /dev/null
+++ b/results/Qwen3-8B/thinking_context-120000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 17,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.4433948911173604,
+ "inference_iteration_1_overall_metric": 0.4399572436291822,
+ "inference_iteration_2_overall_metric": 0.44495624974340414,
+ "inference_iteration_3_overall_metric": 0.4452711799794956,
+ "average_token_length_metric": {
+ "8k": 0.5383434714089156,
+ "16k": 0.49806741885943173,
+ "32k": 0.5048813586036265,
+ "64k": 0.3824635395778342,
+ "128k": 0.38213602550979975,
+ "256k": 0.3544775327445564
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.4169048892199721,
+ "Partial": 0.47710943898676395
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.670801806919622,
+ "Moderate": 0.3015890712418365,
+ "Hard": 0.3710263419639324,
+ "Extreme": 0.334954571191627
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.691601826560709,
+ "T2. Sequencing & Structure Reconstruction": 0.6929147615138178,
+ "T3. Evidence-Grounded QA": 0.46944444444444433,
+ "T4. Summarization & Synthesis": 0.5153536178825013,
+ "T5. Attribution & Citation Alignment": 0.343764413672508,
+ "T6. Aggregation & Clustering": 0.4465373768548264,
+ "T7. Consistency & Compliance Checking": 0.21215633369348813,
+ "T8. Structured & Numeric Reasoning": 0.43070987654320986,
+ "T9. Version & Code Diff Analysis": 0.48094613870018443,
+ "T10. Rule Induction & In-Context Learning": 0.3948611111111111,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3194444444444445
+ },
+ "average_language_metric": {
+ "Chinese": 0.440767848226732,
+ "English": 0.4460219340079889
+ },
+ "BoN-1": {
+ "overall_metric": 0.4399572436291822,
+ "token_length": {
+ "8k": 0.5103007254314137,
+ "16k": 0.47828585706153987,
+ "32k": 0.5290020762560991,
+ "64k": 0.3888098595475681,
+ "128k": 0.3734417981216912,
+ "256k": 0.3599031453567786
+ },
+ "contextual_requirement": {
+ "Full": 0.41724803011166967,
+ "Partial": 0.46885987901510623
+ },
+ "difficulty": {
+ "Easy": 0.6690339330094581,
+ "Moderate": 0.2916811720097684,
+ "Hard": 0.36205062647700836,
+ "Extreme": 0.33757504538758054
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.6812085604476644,
+ "T2. Sequencing & Structure Reconstruction": 0.6967399267399262,
+ "T3. Evidence-Grounded QA": 0.475,
+ "T4. Summarization & Synthesis": 0.5146666315316712,
+ "T5. Attribution & Citation Alignment": 0.3410958151689159,
+ "T6. Aggregation & Clustering": 0.44159343486389535,
+ "T7. Consistency & Compliance Checking": 0.19800047364593318,
+ "T8. Structured & Numeric Reasoning": 0.43009259259259264,
+ "T9. Version & Code Diff Analysis": 0.4938637487118504,
+ "T10. Rule Induction & In-Context Learning": 0.3506944444444444,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
+ },
+ "language": {
+ "Chinese": 0.4330192432359592,
+ "English": 0.4468952440224042
+ }
+ },
+ "pass@1": 0.20066666666666666,
+ "BoN-2": {
+ "overall_metric": 0.5164964536465051,
+ "token_length": {
+ "8k": 0.5863358411112154,
+ "16k": 0.566411710517629,
+ "32k": 0.5835338435058043,
+ "64k": 0.4844792024621684,
+ "128k": 0.45010561125903437,
+ "256k": 0.42811251302317804
+ },
+ "contextual_requirement": {
+ "Full": 0.4885834016176216,
+ "Partial": 0.5520221562287199
+ },
+ "difficulty": {
+ "Easy": 0.779988512571463,
+ "Moderate": 0.36284391366821606,
+ "Hard": 0.43271065222226107,
+ "Extreme": 0.3838066996999193
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7478708806546476,
+ "T2. Sequencing & Structure Reconstruction": 0.7636960261960256,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5296118538392725,
+ "T5. Attribution & Citation Alignment": 0.4258014929372265,
+ "T6. Aggregation & Clustering": 0.5129489242363339,
+ "T7. Consistency & Compliance Checking": 0.2736301164132571,
+ "T8. Structured & Numeric Reasoning": 0.5393518518518519,
+ "T9. Version & Code Diff Analysis": 0.5663013004241957,
+ "T10. Rule Induction & In-Context Learning": 0.48402777777777767,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.38333333333333336
+ },
+ "language": {
+ "Chinese": 0.5161513606389965,
+ "English": 0.5168415466540133
+ }
+ },
+ "pass@2": 0.25733333333333336,
+ "BoN-3": {
+ "overall_metric": 0.5567263766803745,
+ "token_length": {
+ "8k": 0.6362828635230333,
+ "16k": 0.6080554567904433,
+ "32k": 0.6268068570169055,
+ "64k": 0.52544445261354,
+ "128k": 0.4829862086238965,
+ "256k": 0.46078242151443083
+ },
+ "contextual_requirement": {
+ "Full": 0.5197805205774073,
+ "Partial": 0.603748375356879
+ },
+ "difficulty": {
+ "Easy": 0.8403187597200309,
+ "Moderate": 0.39459843208319806,
+ "Hard": 0.46151769338036713,
+ "Extreme": 0.4150871876739478
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.785936662865175,
+ "T2. Sequencing & Structure Reconstruction": 0.779274429274429,
+ "T3. Evidence-Grounded QA": 0.625,
+ "T4. Summarization & Synthesis": 0.5363098621370127,
+ "T5. Attribution & Citation Alignment": 0.4698549989248756,
+ "T6. Aggregation & Clustering": 0.5494079005598616,
+ "T7. Consistency & Compliance Checking": 0.3203866786545349,
+ "T8. Structured & Numeric Reasoning": 0.5949074074074073,
+ "T9. Version & Code Diff Analysis": 0.5913452198149313,
+ "T10. Rule Induction & In-Context Learning": 0.5493055555555556,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
+ },
+ "language": {
+ "Chinese": 0.5523930398850595,
+ "English": 0.5610597134756903
+ }
+ },
+ "pass@3": 0.29133333333333333
+}
\ No newline at end of file
diff --git a/results/Qwen3-Next-80B-A3B-Instruct/nonthinking_context-224000_bon-3_summary.json b/results/Qwen3-Next-80B-A3B-Instruct/nonthinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed60182cc22b951c5d266b8975bb864780780a32
--- /dev/null
+++ b/results/Qwen3-Next-80B-A3B-Instruct/nonthinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.5153699428141112,
+ "inference_iteration_1_overall_metric": 0.5166303102959966,
+ "inference_iteration_2_overall_metric": 0.5157188374753582,
+ "inference_iteration_3_overall_metric": 0.5137606806709764,
+ "average_token_length_metric": {
+ "8k": 0.5444390450433558,
+ "16k": 0.5325817607161717,
+ "32k": 0.55004125822099,
+ "64k": 0.5168441600849382,
+ "128k": 0.48782896231770906,
+ "256k": 0.4604844705014988
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.4773995367563536,
+ "Partial": 0.5636959141603475
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.6525257555815487,
+ "Moderate": 0.48765154528061855,
+ "Hard": 0.4992512003996272,
+ "Extreme": 0.3939150148392464
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.7836410129145194,
+ "T2. Sequencing & Structure Reconstruction": 0.7601097359430687,
+ "T3. Evidence-Grounded QA": 0.5722222222222223,
+ "T4. Summarization & Synthesis": 0.5429874590199666,
+ "T5. Attribution & Citation Alignment": 0.5936748905924045,
+ "T6. Aggregation & Clustering": 0.47375423753978874,
+ "T7. Consistency & Compliance Checking": 0.3365343156781269,
+ "T8. Structured & Numeric Reasoning": 0.2592592592592593,
+ "T9. Version & Code Diff Analysis": 0.621000579101771,
+ "T10. Rule Induction & In-Context Learning": 0.4863888888888888,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4777777777777778
+ },
+ "average_language_metric": {
+ "Chinese": 0.5268601573059153,
+ "English": 0.5038797283223065
+ },
+ "BoN-1": {
+ "overall_metric": 0.5166303102959966,
+ "token_length": {
+ "8k": 0.5438980115785723,
+ "16k": 0.5379344457831181,
+ "32k": 0.5499773393749967,
+ "64k": 0.5239673563810381,
+ "128k": 0.48407264135846406,
+ "256k": 0.45993206729979164
+ },
+ "contextual_requirement": {
+ "Full": 0.4777093693790403,
+ "Partial": 0.566166053281215
+ },
+ "difficulty": {
+ "Easy": 0.6553713573709563,
+ "Moderate": 0.48623131248023554,
+ "Hard": 0.4957585702580054,
+ "Extreme": 0.398321352454188
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7850605454303755,
+ "T2. Sequencing & Structure Reconstruction": 0.7605132367632368,
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
+ "T4. Summarization & Synthesis": 0.5438750921908875,
+ "T5. Attribution & Citation Alignment": 0.5978797516792373,
+ "T6. Aggregation & Clustering": 0.4781550130242737,
+ "T7. Consistency & Compliance Checking": 0.3404060026855857,
+ "T8. Structured & Numeric Reasoning": 0.2601851851851852,
+ "T9. Version & Code Diff Analysis": 0.6199309512936556,
+ "T10. Rule Induction & In-Context Learning": 0.49083333333333334,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.475
+ },
+ "language": {
+ "Chinese": 0.5240393165444767,
+ "English": 0.5092213040475176
+ }
+ },
+ "pass@1": 0.23933333333333334,
+ "BoN-2": {
+ "overall_metric": 0.5307678792049177,
+ "token_length": {
+ "8k": 0.5566045118760491,
+ "16k": 0.5523528870760122,
+ "32k": 0.565064007850647,
+ "64k": 0.5329215232825427,
+ "128k": 0.5070346589234701,
+ "256k": 0.47062968622078627
+ },
+ "contextual_requirement": {
+ "Full": 0.4936133806201656,
+ "Partial": 0.5780554228582404
+ },
+ "difficulty": {
+ "Easy": 0.6646977492242093,
+ "Moderate": 0.50755257479461,
+ "Hard": 0.5154147759439242,
+ "Extreme": 0.40938622572243505
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.794910051304565,
+ "T2. Sequencing & Structure Reconstruction": 0.7644632682132678,
+ "T3. Evidence-Grounded QA": 0.6,
+ "T4. Summarization & Synthesis": 0.5529787281614388,
+ "T5. Attribution & Citation Alignment": 0.6065695384216199,
+ "T6. Aggregation & Clustering": 0.4918158422307883,
+ "T7. Consistency & Compliance Checking": 0.35643719860134077,
+ "T8. Structured & Numeric Reasoning": 0.2837962962962963,
+ "T9. Version & Code Diff Analysis": 0.6413528982679426,
+ "T10. Rule Induction & In-Context Learning": 0.49291666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
+ },
+ "language": {
+ "Chinese": 0.5373954743254495,
+ "English": 0.5241402840843874
+ }
+ },
+ "pass@2": 0.25133333333333335,
+ "BoN-3": {
+ "overall_metric": 0.5363452114492876,
+ "token_length": {
+ "8k": 0.5662343509403671,
+ "16k": 0.5541053906547336,
+ "32k": 0.5714052653352649,
+ "64k": 0.5365347531934883,
+ "128k": 0.5125057318235955,
+ "256k": 0.477285776748277
+ },
+ "contextual_requirement": {
+ "Full": 0.49978312289665056,
+ "Partial": 0.5828787786980997
+ },
+ "difficulty": {
+ "Easy": 0.6709349653337122,
+ "Moderate": 0.5142353747150851,
+ "Hard": 0.5181865994276285,
+ "Extreme": 0.4153573030814503
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.7957673791297515,
+ "T2. Sequencing & Structure Reconstruction": 0.7681934269434265,
+ "T3. Evidence-Grounded QA": 0.6,
+ "T4. Summarization & Synthesis": 0.5577840086379114,
+ "T5. Attribution & Citation Alignment": 0.6152937974264557,
+ "T6. Aggregation & Clustering": 0.49800879082542016,
+ "T7. Consistency & Compliance Checking": 0.3608232159498475,
+ "T8. Structured & Numeric Reasoning": 0.30046296296296293,
+ "T9. Version & Code Diff Analysis": 0.6437507430378715,
+ "T10. Rule Induction & In-Context Learning": 0.49291666666666667,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.49166666666666664
+ },
+ "language": {
+ "Chinese": 0.5451725427917254,
+ "English": 0.5275178801068509
+ }
+ },
+ "pass@3": 0.25533333333333336
+}
\ No newline at end of file
diff --git a/results/Qwen3-Next-80B-A3B-Instruct/thinking_context-224000_bon-3_summary.json b/results/Qwen3-Next-80B-A3B-Instruct/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..2034ab9a0e6eee1f50f647f9ee32107e3dbf7f66
--- /dev/null
+++ b/results/Qwen3-Next-80B-A3B-Instruct/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 0,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.6075716303534897,
+ "inference_iteration_1_overall_metric": 0.6011630214015442,
+ "inference_iteration_2_overall_metric": 0.6048517302521914,
+ "inference_iteration_3_overall_metric": 0.6167001394067311,
+ "average_token_length_metric": {
+ "8k": 0.6538122003152632,
+ "16k": 0.6386361113417942,
+ "32k": 0.6075792549321678,
+ "64k": 0.6198997893946241,
+ "128k": 0.5589567834361618,
+ "256k": 0.5665456427009256
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5584048701107478,
+ "Partial": 0.6701475070260704
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8084452986372921,
+ "Moderate": 0.6415908415050756,
+ "Hard": 0.5474008359025712,
+ "Extreme": 0.4047004927642043
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8543788758470637,
+ "T2. Sequencing & Structure Reconstruction": 0.805848765432098,
+ "T3. Evidence-Grounded QA": 0.5944444444444446,
+ "T4. Summarization & Synthesis": 0.5231378715129659,
+ "T5. Attribution & Citation Alignment": 0.6489902589376487,
+ "T6. Aggregation & Clustering": 0.5332869283079988,
+ "T7. Consistency & Compliance Checking": 0.3951514391075655,
+ "T8. Structured & Numeric Reasoning": 0.6266975308641974,
+ "T9. Version & Code Diff Analysis": 0.7189839084173466,
+ "T10. Rule Induction & In-Context Learning": 0.5828240740740741,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333334
+ },
+ "average_language_metric": {
+ "Chinese": 0.6121820357866055,
+ "English": 0.6029612249203741
+ },
+ "BoN-1": {
+ "overall_metric": 0.6011630214015442,
+ "token_length": {
+ "8k": 0.6624953563895736,
+ "16k": 0.6300056666980762,
+ "32k": 0.5997782992095927,
+ "64k": 0.6121693133437759,
+ "128k": 0.5635843711638716,
+ "256k": 0.5389451216043789
+ },
+ "contextual_requirement": {
+ "Full": 0.54428292048123,
+ "Partial": 0.67355587711831
+ },
+ "difficulty": {
+ "Easy": 0.800956148011297,
+ "Moderate": 0.6440797168615464,
+ "Hard": 0.5307293137505391,
+ "Extreme": 0.4003724066226809
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8631061933334538,
+ "T2. Sequencing & Structure Reconstruction": 0.7992355329855325,
+ "T3. Evidence-Grounded QA": 0.625,
+ "T4. Summarization & Synthesis": 0.5237860950102718,
+ "T5. Attribution & Citation Alignment": 0.6286606136418816,
+ "T6. Aggregation & Clustering": 0.523201899147464,
+ "T7. Consistency & Compliance Checking": 0.4046354997828942,
+ "T8. Structured & Numeric Reasoning": 0.625,
+ "T9. Version & Code Diff Analysis": 0.6992432341526317,
+ "T10. Rule Induction & In-Context Learning": 0.52125,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
+ },
+ "language": {
+ "Chinese": 0.6058889746353506,
+ "English": 0.5964370681677398
+ }
+ },
+ "pass@1": 0.3546666666666667,
+ "BoN-2": {
+ "overall_metric": 0.6610666781489654,
+ "token_length": {
+ "8k": 0.7056825593069123,
+ "16k": 0.7025120747104457,
+ "32k": 0.6769789260877618,
+ "64k": 0.6679511379675653,
+ "128k": 0.5958338915375376,
+ "256k": 0.6174414792835736
+ },
+ "contextual_requirement": {
+ "Full": 0.6082650541075186,
+ "Partial": 0.7282687451108096
+ },
+ "difficulty": {
+ "Easy": 0.8495933138250492,
+ "Moderate": 0.7152672047846778,
+ "Hard": 0.6173703518012653,
+ "Extreme": 0.44764496842372237
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9026227557629393,
+ "T2. Sequencing & Structure Reconstruction": 0.8607230269730269,
+ "T3. Evidence-Grounded QA": 0.6833333333333333,
+ "T4. Summarization & Synthesis": 0.5332664489935818,
+ "T5. Attribution & Citation Alignment": 0.708058003290001,
+ "T6. Aggregation & Clustering": 0.6005714039047372,
+ "T7. Consistency & Compliance Checking": 0.4659994086764647,
+ "T8. Structured & Numeric Reasoning": 0.6902777777777778,
+ "T9. Version & Code Diff Analysis": 0.7525570229707225,
+ "T10. Rule Induction & In-Context Learning": 0.6125,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.575
+ },
+ "language": {
+ "Chinese": 0.6606066535667863,
+ "English": 0.6615267027311469
+ }
+ },
+ "pass@2": 0.4246666666666667,
+ "BoN-3": {
+ "overall_metric": 0.7007864030989018,
+ "token_length": {
+ "8k": 0.7348618958536445,
+ "16k": 0.7307241128589752,
+ "32k": 0.7095991520281644,
+ "64k": 0.7158858832025538,
+ "128k": 0.6481921445010119,
+ "256k": 0.6654552301490679
+ },
+ "contextual_requirement": {
+ "Full": 0.6551104064886832,
+ "Partial": 0.7589194896937296
+ },
+ "difficulty": {
+ "Easy": 0.8851870155041537,
+ "Moderate": 0.7611742624890716,
+ "Hard": 0.6700836828309551,
+ "Extreme": 0.47928617494969256
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9126235121225394,
+ "T2. Sequencing & Structure Reconstruction": 0.8791730029230027,
+ "T3. Evidence-Grounded QA": 0.7166666666666667,
+ "T4. Summarization & Synthesis": 0.5379060712223888,
+ "T5. Attribution & Citation Alignment": 0.7258679135396275,
+ "T6. Aggregation & Clustering": 0.6480317213650545,
+ "T7. Consistency & Compliance Checking": 0.4953056552621231,
+ "T8. Structured & Numeric Reasoning": 0.7416666666666667,
+ "T9. Version & Code Diff Analysis": 0.7806423628768492,
+ "T10. Rule Induction & In-Context Learning": 0.7211111111111113,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333
+ },
+ "language": {
+ "Chinese": 0.706235211076793,
+ "English": 0.6953375951210139
+ }
+ },
+ "pass@3": 0.47533333333333333
+}
\ No newline at end of file
diff --git a/results/Qwen3-Next-80B-A3B-Thinking/thinking_context-224000_bon-3_summary.json b/results/Qwen3-Next-80B-A3B-Thinking/thinking_context-224000_bon-3_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..ddc3c1067ea62d9dd515db8be2211b587e71d714
--- /dev/null
+++ b/results/Qwen3-Next-80B-A3B-Thinking/thinking_context-224000_bon-3_summary.json
@@ -0,0 +1,164 @@
+{
+ "date": "2025-12-08",
+ "total_questions_num": 1500,
+ "inference_iterations": 3,
+ "total_samples_num": 4500,
+ "fail_samples_num": 34,
+ "inference_inconsistent_samples_num": 0,
+ "average_overall_metric": 0.6395225139139809,
+ "inference_iteration_1_overall_metric": 0.6341195678747302,
+ "inference_iteration_2_overall_metric": 0.6408716243063108,
+ "inference_iteration_3_overall_metric": 0.6435763495608982,
+ "average_token_length_metric": {
+ "8k": 0.7086149367852088,
+ "16k": 0.6851138026926965,
+ "32k": 0.6504005097985216,
+ "64k": 0.6399676067396364,
+ "128k": 0.6081903722855004,
+ "256k": 0.544847855182321
+ },
+ "average_contextual_requirement_metric": {
+ "Full": 0.5847832398021344,
+ "Partial": 0.7091906809654226
+ },
+ "average_difficulty_metric": {
+ "Easy": 0.8189651329867498,
+ "Moderate": 0.6922832177824265,
+ "Hard": 0.6146117453677709,
+ "Extreme": 0.4246589373203432
+ },
+ "average_primary_task_metric": {
+ "T1. Retrieval & Ranking": 0.8753831884628448,
+ "T2. Sequencing & Structure Reconstruction": 0.836079250742492,
+ "T3. Evidence-Grounded QA": 0.6472222222222221,
+ "T4. Summarization & Synthesis": 0.5559024180310073,
+ "T5. Attribution & Citation Alignment": 0.6831202809594916,
+ "T6. Aggregation & Clustering": 0.6046103225117172,
+ "T7. Consistency & Compliance Checking": 0.4200843891775127,
+ "T8. Structured & Numeric Reasoning": 0.6665123456790124,
+ "T9. Version & Code Diff Analysis": 0.7482171811580411,
+ "T10. Rule Induction & In-Context Learning": 0.5751851851851851,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5361111111111112
+ },
+ "average_language_metric": {
+ "Chinese": 0.6498950648240791,
+ "English": 0.6291499630038833
+ },
+ "BoN-1": {
+ "overall_metric": 0.6341195678747302,
+ "token_length": {
+ "8k": 0.7020231017646056,
+ "16k": 0.6839496608825781,
+ "32k": 0.6481596388780655,
+ "64k": 0.6264029281930145,
+ "128k": 0.6089383419761589,
+ "256k": 0.5352437355539651
+ },
+ "contextual_requirement": {
+ "Full": 0.5805803773748881,
+ "Partial": 0.702260355783623
+ },
+ "difficulty": {
+ "Easy": 0.8169438449940154,
+ "Moderate": 0.7044054869931783,
+ "Hard": 0.5782945616258349,
+ "Extreme": 0.42434569198656086
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.8803664149043899,
+ "T2. Sequencing & Structure Reconstruction": 0.8312176889574149,
+ "T3. Evidence-Grounded QA": 0.6333333333333333,
+ "T4. Summarization & Synthesis": 0.5547574749584397,
+ "T5. Attribution & Citation Alignment": 0.6921710523314937,
+ "T6. Aggregation & Clustering": 0.5879316250623552,
+ "T7. Consistency & Compliance Checking": 0.4025441003215575,
+ "T8. Structured & Numeric Reasoning": 0.6638888888888889,
+ "T9. Version & Code Diff Analysis": 0.7462961569843115,
+ "T10. Rule Induction & In-Context Learning": 0.5484722222222224,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5583333333333333
+ },
+ "language": {
+ "Chinese": 0.6383248368091883,
+ "English": 0.6299142989402748
+ }
+ },
+ "pass@1": 0.4033333333333333,
+ "BoN-2": {
+ "overall_metric": 0.7030204024240833,
+ "token_length": {
+ "8k": 0.7639776186212258,
+ "16k": 0.7486638197028843,
+ "32k": 0.7095546783100515,
+ "64k": 0.6979238211892997,
+ "128k": 0.6806853813914886,
+ "256k": 0.6173170953295508
+ },
+ "contextual_requirement": {
+ "Full": 0.6539478610781624,
+ "Partial": 0.7654763641370751
+ },
+ "difficulty": {
+ "Easy": 0.888641690030574,
+ "Moderate": 0.7693477973677636,
+ "Hard": 0.6905310978675843,
+ "Extreme": 0.46431888254175707
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9014160909768955,
+ "T2. Sequencing & Structure Reconstruction": 0.8695868020868018,
+ "T3. Evidence-Grounded QA": 0.75,
+ "T4. Summarization & Synthesis": 0.5700264981260397,
+ "T5. Attribution & Citation Alignment": 0.7558486553637924,
+ "T6. Aggregation & Clustering": 0.6562172241338908,
+ "T7. Consistency & Compliance Checking": 0.4824541236458827,
+ "T8. Structured & Numeric Reasoning": 0.7527777777777778,
+ "T9. Version & Code Diff Analysis": 0.8007866287445203,
+ "T10. Rule Induction & In-Context Learning": 0.6445833333333334,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333
+ },
+ "language": {
+ "Chinese": 0.7121930712441974,
+ "English": 0.6938477336039707
+ }
+ },
+ "pass@2": 0.4826666666666667,
+ "BoN-3": {
+ "overall_metric": 0.7357638591230633,
+ "token_length": {
+ "8k": 0.7947994120787286,
+ "16k": 0.7776820176641183,
+ "32k": 0.7381243144177431,
+ "64k": 0.7311339212477369,
+ "128k": 0.7186000776011717,
+ "256k": 0.654243411728885
+ },
+ "contextual_requirement": {
+ "Full": 0.682832657763219,
+ "Partial": 0.8031308426719579
+ },
+ "difficulty": {
+ "Easy": 0.9249743747143141,
+ "Moderate": 0.7994375898491137,
+ "Hard": 0.7200534393057935,
+ "Extreme": 0.496989165105746
+ },
+ "primary_task": {
+ "T1. Retrieval & Ranking": 0.9130805871242748,
+ "T2. Sequencing & Structure Reconstruction": 0.8928626466126465,
+ "T3. Evidence-Grounded QA": 0.7916666666666666,
+ "T4. Summarization & Synthesis": 0.5775148472726891,
+ "T5. Attribution & Citation Alignment": 0.7740008061816878,
+ "T6. Aggregation & Clustering": 0.7032171864847305,
+ "T7. Consistency & Compliance Checking": 0.5232353490903346,
+ "T8. Structured & Numeric Reasoning": 0.7805555555555556,
+ "T9. Version & Code Diff Analysis": 0.8344938818177356,
+ "T10. Rule Induction & In-Context Learning": 0.7195833333333334,
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6833333333333333
+ },
+ "language": {
+ "Chinese": 0.7443997677249928,
+ "English": 0.7271279505211355
+ }
+ },
+ "pass@3": 0.528
+}
\ No newline at end of file
diff --git a/results/model_info.json b/results/model_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff1994a45f947e4df3b69912a3e8ebfa3c3a0f7a
--- /dev/null
+++ b/results/model_info.json
@@ -0,0 +1,232 @@
+{
+ "Gemini-2.5-Pro": {
+ "type": "Thinking",
+ "context_length": "1M",
+ "url": "https://ai.google.dev/gemini-api/docs/models?hl=zh-cn#gemini-2.5-pro"
+ },
+ "Gemini-2.5-Flash": {
+ "type": "Mixed",
+ "context_length": "1M",
+ "url": "https://ai.google.dev/gemini-api/docs/models?hl=zh-cn#gemini-2.5-flash"
+ },
+ "Gemma-3-27B-It": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/google/gemma-3-27b-it"
+ },
+ "Gemma-3-12B-It": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/google/gemma-3-12b-it"
+ },
+ "Gemma-3-4B-It": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/google/gemma-3-4b-it"
+ },
+ "GPT-5": {
+ "type": "Thinking",
+ "context_length": "272k",
+ "url": "https://platform.openai.com/docs/models/gpt-5"
+ },
+ "GPT-4o": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://platform.openai.com/docs/models/gpt-4o"
+ },
+ "GPT-OSS-120B": {
+ "type": "Thinking",
+ "context_length": "128k",
+ "url": "https://huggingface.co/openai/gpt-oss-120b"
+ },
+ "GPT-OSS-20B": {
+ "type": "Thinking",
+ "context_length": "128k",
+ "url": "https://huggingface.co/openai/gpt-oss-20b"
+ },
+ "Claude-4-Sonnet": {
+ "type": "Mixed",
+ "context_length": "1M",
+ "url": "https://www.anthropic.com/news/claude-4"
+ },
+ "Claude-3.7-Sonnet": {
+ "type": "Mixed",
+ "context_length": "200k",
+ "url": "https://www.anthropic.com/news/claude-3-7-sonnet"
+ },
+ "DeepSeek-V3.2": {
+ "type": "Mixed",
+ "context_length": "160k",
+ "url": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2"
+ },
+ "DeepSeek-V3.1": {
+ "type": "Mixed",
+ "context_length": "128k",
+ "url": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1"
+ },
+ "DeepSeek-R1-0528": {
+ "type": "Thinking",
+ "context_length": "128k",
+ "url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528"
+ },
+ "DeepSeek-R1": {
+ "type": "Thinking",
+ "context_length": "128k",
+ "url": "https://huggingface.co/deepseek-ai/DeepSeek-R1"
+ },
+ "DeepSeek-V3-0324": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324"
+ },
+ "Qwen3-235B-A22B-Thinking-2507": {
+ "type": "Thinking",
+ "context_length": "256k",
+ "url": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507"
+ },
+ "Qwen3-235B-A22B-Instruct-2507": {
+ "type": "Instruct",
+ "context_length": "256k",
+ "url": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507"
+ },
+ "Qwen3-Next-80B-A3B-Thinking": {
+ "type": "Thinking",
+ "context_length": "256k",
+ "url": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking"
+ },
+ "Qwen3-Next-80B-A3B-Instruct": {
+ "type": "Instruct",
+ "context_length": "256k",
+ "url": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct"
+ },
+ "Qwen3-30B-A3B-Thinking-2507": {
+ "type": "Thinking",
+ "context_length": "256k",
+ "url": "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507"
+ },
+ "Qwen3-30B-A3B-Instruct-2507": {
+ "type": "Instruct",
+ "context_length": "256k",
+ "url": "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507"
+ },
+ "Qwen3-4B-Thinking-2507": {
+ "type": "Thinking",
+ "context_length": "256k",
+ "url": "https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507"
+ },
+ "Qwen3-4B-Instruct-2507": {
+ "type": "Instruct",
+ "context_length": "256k",
+ "url": "https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507"
+ },
+ "Qwen3-32B": {
+ "type": "Mixed",
+ "context_length": "128k",
+ "url": "https://huggingface.co/Qwen/Qwen3-32B"
+ },
+ "Qwen3-14B": {
+ "type": "Mixed",
+ "context_length": "128k",
+ "url": "https://huggingface.co/Qwen/Qwen3-14B"
+ },
+ "Qwen3-8B": {
+ "type": "Mixed",
+ "context_length": "128k",
+ "url": "https://huggingface.co/Qwen/Qwen3-8B"
+ },
+ "Qwen3-4B": {
+ "type": "Mixed",
+ "context_length": "128k",
+ "url": "https://huggingface.co/Qwen/Qwen3-4B"
+ },
+ "Qwen2.5-72B-Instruct": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct"
+ },
+ "GLM-4.6": {
+ "type": "Mixed",
+ "context_length": "198k",
+ "url": "https://huggingface.co/zai-org/GLM-4.6"
+ },
+ "GLM-4.5": {
+ "type": "Mixed",
+ "context_length": "128k",
+ "url": "https://huggingface.co/zai-org/GLM-4.5"
+ },
+ "Kimi-K2-Instruct-0905": {
+ "type": "Instruct",
+ "context_length": "256k",
+ "url": "https://huggingface.co/moonshotai/Kimi-K2-Instruct-0905"
+ },
+ "MiniMax-M2": {
+ "type": "Thinking",
+ "context_length": "192k",
+ "url": "https://huggingface.co/MiniMaxAI/MiniMax-M2"
+ },
+ "MiniMax-Text-01": {
+ "type": "Instruct",
+ "context_length": "4M",
+ "url": "https://huggingface.co/MiniMaxAI/MiniMax-Text-01"
+ },
+ "Ministral-3-14B-Instruct-2512": {
+ "type": "Instruct",
+ "context_length": "256k",
+ "url": "https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512"
+ },
+ "Ministral-3-8B-Instruct-2512": {
+ "type": "Instruct",
+ "context_length": "256k",
+ "url": "https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512"
+ },
+ "Ministral-3-3B-Instruct-2512": {
+ "type": "Instruct",
+ "context_length": "256k",
+ "url": "https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512"
+ },
+ "Magistral-Small-2509": {
+ "type": "Thinking",
+ "context_length": "128k",
+ "url": "https://huggingface.co/mistralai/Magistral-Small-2509"
+ },
+ "Mistral-Small-3.2-24B-Instruct-2506": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506"
+ },
+ "Mistral-Large-Instruct-2411": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2411"
+ },
+ "Ministral-8B-Instruct-2410": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410"
+ },
+ "Llama-3.1-405B-Instruct": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct"
+ },
+ "Llama-3.3-70B-Instruct": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct"
+ },
+ "Llama-3.1-70B-Instruct": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct"
+ },
+ "Llama-3.1-8B-Instruct": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct"
+ },
+ "Llama-3.2-3B-Instruct": {
+ "type": "Instruct",
+ "context_length": "128k",
+ "url": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct"
+ }
+}
\ No newline at end of file