diff --git a/README.md b/README.md index 8cf69bc2df1860fce1000109ea502ff1ce62bfa4..2af69967b32a311992ef1b99b8434760e307e860 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,4 @@ app_file: app.py pinned: false license: apache-2.0 short_description: Realistic and Comprehensive Bilingual Long-Context Benchmark ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference \ No newline at end of file +--- \ No newline at end of file diff --git a/app.py b/app.py index cbffdf1ba490e3ae1fb244c10909cccfa7652993..45df31031d17acbf8aeaaf420b343b8cd685e430 100644 --- a/app.py +++ b/app.py @@ -1,7 +1,880 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +LongBenchmark 结果可视化 +""" + +import json +import re +import pandas as pd +from pathlib import Path import gradio as gr +import plotly.graph_objects as go + +with open('./results/model_info.json', 'r', encoding='utf-8') as f: + MODLE_INFO_DICT = json.load(f) + +def get_color(index): + """基于索引生成颜色,使用黄金角度确保颜色分布均匀且无限""" + # 黄金角度约 137.508 度,确保颜色在色环上分布均匀 + hue = (index * 137.508) % 360 + # 固定饱和度为70%,亮度为60%,确保颜色既鲜艳又不刺眼 + return f"hsl({hue}, 70%, 60%)" + +class ResultParser: + def __init__(self, output_dir: str): + self.output_dir = Path(output_dir) + self.results = [] + + def parse_filename(self, filename: str): + """解析文件名,提取context长度和是否包含thinking或nonthinking""" + # 提取context长度 + context_match = re.search(r'context-(\d+)', filename) + context_length = int(context_match.group(1)) if context_match else 0 + + filename_lower = filename.lower() + # 检查是否包含nonthinking(优先检查,因为nonthinking也包含thinking) + has_nonthinking = 'nonthinking' in filename_lower + # 检查是否包含thinking(但不包含nonthinking) + has_thinking = 'thinking' in filename_lower and not has_nonthinking + + return context_length, has_thinking, has_nonthinking + + def parse_result_file(self, model_name: str, file_path: Path): + """解析单个结果文件""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + context_length, has_thinking, has_nonthinking = self.parse_filename(file_path.name) + # 使用JSON文件中的date字段作为评估日期 + eval_date = data.get('date', "未知") + + # 提取BoN数据 + bon_data = {} + for bon_key in ['BoN-1', 'BoN-2', 'BoN-3']: + if bon_key in data and 'overall_metric' in data[bon_key]: + bon_data[bon_key] = data[bon_key]['overall_metric'] + + result = { + 'model_name': model_name, + 'eval_date': eval_date, + 'context_length': context_length, + 'has_thinking': has_thinking, + 'has_nonthinking': has_nonthinking, + 'overall_metric': data.get('average_overall_metric', 0.0), + 'token_length_metrics': data.get('average_token_length_metric', {}), + 'contextual_requirement': data.get('average_contextual_requirement_metric', {}), + 'difficulty': data.get('average_difficulty_metric', {}), + 'primary_task': data.get('average_primary_task_metric', {}), + 'language': data.get('average_language_metric', {}), + 'bon_data': bon_data, # 存储BoN-1, BoN-2, BoN-3的overall_metric + 'pass_at_k': { + 'pass@1': data.get('pass@1'), + 'pass@2': data.get('pass@2'), + 'pass@3': data.get('pass@3') + } + } + + return result + + except Exception as e: + print(f"解析文件 {file_path} 时出错: {e}") + return None + + def scan_all_results(self): + """扫描所有模型的结果文件""" + self.results = [] + + if not self.output_dir.exists(): + print(f"输出目录不存在: {self.output_dir}") + return + + # 遍历所有模型目录 + for model_dir in self.output_dir.iterdir(): + if not model_dir.is_dir(): + continue + + model_name = model_dir.name + print(f"扫描模型: {model_name}") + + # 查找该模型下的所有_summary.json文件 + for file_path in model_dir.glob("*_summary.json"): + print(f" 解析文件: {file_path.name}") + result = self.parse_result_file(model_name, file_path) + if result: + self.results.append(result) + + print(f"总共解析了 {len(self.results)} 个结果文件") + + def get_leaderboard_data(self): + """获取排行榜数据""" + if not self.results: + return pd.DataFrame() + + # 按模型名称聚合数据 + model_groups = {} + for result in self.results: + model_name = result['model_name'] + if model_name not in model_groups: + model_groups[model_name] = { + 'dates': [], + 'contexts': [], + 'thinking_scores': [], + 'non_thinking_scores': [] + } + + group = model_groups[model_name] + group['dates'].append(result['eval_date']) + group['contexts'].append(result['context_length']) + + score = result['overall_metric'] + if result['has_thinking']: + group['thinking_scores'].append(score) + else: + group['non_thinking_scores'].append(score) + + leaderboard_data = [] + for model_name, group in model_groups.items(): + # 获取最新日期 + valid_dates = [d for d in group['dates'] if d != "未知"] + latest_date = max(valid_dates) if valid_dates else "未知" + + # 获取最大Context Window + max_context = max(group['contexts']) if group['contexts'] else 0 + + # 格式化截断长度 + if max_context >= 1000000: + context_str = f"{max_context/1000000:.0f}M" if max_context % 1000000 == 0 else f"{max_context/1000000:.1f}M" + elif max_context >= 1000: + context_str = f"{max_context/1000:.0f}k" if max_context % 1000 == 0 else f"{max_context/1000:.1f}k" + else: + context_str = str(max_context) + + # 获取模型类型和上下文长度 + model_context = "-" + model_url = "" + if model_name in MODLE_INFO_DICT: + model_info = MODLE_INFO_DICT[model_name] + if isinstance(model_info, dict): + model_type = model_info.get("type", "Unknown") + model_context = model_info.get("context_length", "-") + model_url = model_info.get("url", "") + else: + model_type = str(model_info) + else: + model_type = "Unknown" + + # 处理模型名称链接和图标 + display_model_name = model_name + + if model_url: + display_model_name = f"[{display_model_name}]({model_url})" + + # 计算平均分 + nt_score_val = 0 + nt_score_str = "-" + if group['non_thinking_scores']: + nt_score_val = sum(group['non_thinking_scores']) / len(group['non_thinking_scores']) + nt_score_str = f"{nt_score_val * 100:.2f}" + + t_score_val = 0 + t_score_str = "-" + if group['thinking_scores']: + t_score_val = sum(group['thinking_scores']) / len(group['thinking_scores']) + t_score_str = f"{t_score_val * 100:.2f}" + + leaderboard_data.append({ + '模型名称': display_model_name, + '模型类型': model_type, + '上下文长度': model_context, + '截断长度': context_str, + '非思考得分': nt_score_str, + '思考得分': t_score_str, + '_sort_score': max(nt_score_val, t_score_val) + }) + + df = pd.DataFrame(leaderboard_data) + # 按最高分降序排列 + if not df.empty: + df = df.sort_values('_sort_score', ascending=False).drop(columns=['_sort_score']).reset_index(drop=True) + + return df + +def get_display_name_for_result(result): + """获取模型的显示名称(根据是否包含thinking或nonthinking添加后缀)""" + if result.get('has_nonthinking'): + return f"{result['model_name']}_nonthinking" + elif result.get('has_thinking'): + return f"{result['model_name']}_thinking" + else: + return result['model_name'] + +def get_model_color_index(model_name, all_models): + """获取模型在颜色列表中的索引""" + try: + return all_models.index(model_name) + except ValueError: + return 0 + +def create_contextual_requirement_chart(results, selected_models): + """创建上下文需求对比柱状图""" + if not selected_models: + return go.Figure() + + # 收集数据 - 直接使用summary中的值,不需要计算平均值 + chart_data = {} + + for result in results: + display_name = get_display_name_for_result(result) + if display_name in selected_models: + model_name = display_name + contextual_requirement = result['contextual_requirement'] + + # 直接存储每个模型的结果,不需要计算平均值 + if model_name not in chart_data: + chart_data[model_name] = {} + + for req_type, score in contextual_requirement.items(): + chart_data[model_name][req_type] = score * 100 # 乘以100 + + # 创建图表 + fig = go.Figure() + + # 获取所有需求类型 - 保持原始顺序,不排序 + all_req_types = [] + for result in results: + display_name = get_display_name_for_result(result) + if display_name in selected_models: + contextual_requirement = result['contextual_requirement'] + for req_type in contextual_requirement.keys(): + if req_type not in all_req_types: + all_req_types.append(req_type) + + for model_name in selected_models: + if model_name in chart_data: + scores = [chart_data[model_name].get(req_type, 0) for req_type in all_req_types] + color_index = get_model_color_index(model_name, selected_models) + + fig.add_trace(go.Bar( + name=model_name, + x=all_req_types, + y=scores, + marker_color=get_color(color_index), + text=[f"{score:.2f}" for score in scores], # 保留2位小数 + textposition='auto' + )) + + fig.update_layout( + title='模型在不同上下文需求上的性能对比', + xaxis_title='上下文需求类型', + yaxis_title='平均得分', + barmode='group', + autosize=True, # 自动调整大小 + legend=dict( + orientation="h", + yanchor="top", + y=-0.25, # 调整到更下方 + xanchor="center", + x=0.5 + ), + margin=dict(b=100) # 增加底部边距 + ) + + return fig + +def create_primary_task_radar_chart(results, selected_models): + """创建主要任务雷达图(按任务前缀聚合,使用'.'前缀,绘制最多11个任务)""" + if not selected_models: + return go.Figure() + + # 收集所有模型下的任务前缀,保持出现顺序 + prefix_order = [] + # 为每个模型构建 前缀 -> [scores] 的映射 + model_prefix_scores = {} + + for result in results: + display_name = get_display_name_for_result(result) + if display_name not in selected_models: + continue + primary_task = result.get('primary_task', {}) + if display_name not in model_prefix_scores: + model_prefix_scores[display_name] = {} + for task_key, score in primary_task.items(): + prefix = task_key.split('.')[0].strip() if isinstance(task_key, str) else str(task_key) + if prefix not in prefix_order: + prefix_order.append(prefix) + if prefix not in model_prefix_scores[display_name]: + model_prefix_scores[display_name][prefix] = [] + model_prefix_scores[display_name][prefix].append(score * 100) + + # 只取前11个前缀用于绘制 + categories = prefix_order[:11] + + # 创建雷达图 + fig = go.Figure() + + for model_name in selected_models: + if model_name not in model_prefix_scores: + continue + # 对每个前缀做均值聚合;缺失则为0 + values = [] + for prefix in categories: + scores = model_prefix_scores[model_name].get(prefix, []) + if scores: + values.append(sum(scores) / len(scores)) + else: + values.append(0) + # 闭合多边形 + r_values = values + ([values[0]] if values else []) + theta_values = categories + ([categories[0]] if categories else []) + color_index = get_model_color_index(model_name, selected_models) + fig.add_trace(go.Scatterpolar( + r=r_values, + theta=theta_values, + mode='lines+markers', + name=model_name, + line=dict(color=get_color(color_index), width=3), + marker=dict(size=6), + fill='toself' + )) + + fig.update_layout( + title='模型在不同主要任务上的性能对比', + polar=dict( + radialaxis=dict(visible=True, range=[0, 100]) + ), + legend=dict( + orientation="h", + yanchor="top", + y=-0.2, + xanchor="center", + x=0.5 + ), + margin=dict(b=100) + ) + + return fig + +def create_language_chart(results, selected_models): + """创建语言对比柱状图""" + if not selected_models: + return go.Figure() + + # 收集数据 - 直接使用summary中的值,不需要计算平均值 + chart_data = {} + + for result in results: + display_name = get_display_name_for_result(result) + if display_name in selected_models: + model_name = display_name + language = result['language'] + + # 直接存储每个模型的结果,不需要计算平均值 + if model_name not in chart_data: + chart_data[model_name] = {} + + for lang_type, score in language.items(): + chart_data[model_name][lang_type] = score * 100 # 乘以100 + + # 创建图表 + fig = go.Figure() + + # 获取所有语言类型 - 保持原始顺序,不排序 + all_lang_types = [] + for result in results: + display_name = get_display_name_for_result(result) + if display_name in selected_models: + language = result['language'] + for lang_type in language.keys(): + if lang_type not in all_lang_types: + all_lang_types.append(lang_type) + + for model_name in selected_models: + if model_name in chart_data: + scores = [chart_data[model_name].get(lang_type, 0) for lang_type in all_lang_types] + color_index = get_model_color_index(model_name, selected_models) + + fig.add_trace(go.Bar( + name=model_name, + x=all_lang_types, + y=scores, + marker_color=get_color(color_index), + text=[f"{score:.2f}" for score in scores], # 保留2位小数 + textposition='auto' + )) + + fig.update_layout( + title='模型在不同语言上的性能对比', + xaxis_title='语言类型', + yaxis_title='平均得分', + barmode='group', + autosize=True, # 自动调整大小 + legend=dict( + orientation="h", + yanchor="top", + y=-0.25, # 调整到更下方 + xanchor="center", + x=0.5 + ), + margin=dict(b=100) # 增加底部边距 + ) + + return fig + +def create_difficulty_chart(results, selected_models): + """创建难度对比柱状图""" + if not selected_models: + return go.Figure() + + # 收集数据 - 直接使用summary中的值,不需要计算平均值 + chart_data = {} + + for result in results: + display_name = get_display_name_for_result(result) + if display_name in selected_models: + model_name = display_name + difficulty = result['difficulty'] + + # 直接存储每个模型的结果,不需要计算平均值 + if model_name not in chart_data: + chart_data[model_name] = {} + + for diff_type, score in difficulty.items(): + chart_data[model_name][diff_type] = score * 100 # 乘以100 + + # 创建图表 + fig = go.Figure() + + # 获取所有难度类型 - 保持原始顺序,不排序 + all_diff_types = [] + for result in results: + display_name = get_display_name_for_result(result) + if display_name in selected_models: + difficulty = result['difficulty'] + for diff_type in difficulty.keys(): + if diff_type not in all_diff_types: + all_diff_types.append(diff_type) + + for model_name in selected_models: + if model_name in chart_data: + scores = [chart_data[model_name].get(diff_type, 0) for diff_type in all_diff_types] + color_index = get_model_color_index(model_name, selected_models) + + fig.add_trace(go.Bar( + name=model_name, + x=all_diff_types, + y=scores, + marker_color=get_color(color_index), + text=[f"{score:.2f}" for score in scores], # 保留2位小数 + textposition='auto' + )) + + fig.update_layout( + title='模型在不同难度上的性能对比', + xaxis_title='难度类型', + yaxis_title='平均得分', + barmode='group', + autosize=True, # 自动调整大小 + legend=dict( + orientation="h", + yanchor="top", + y=-0.25, # 调整到更下方 + xanchor="center", + x=0.5 + ), + margin=dict(b=100) # 增加底部边距 + ) + + return fig + +def create_length_heatmap(results, selected_models): + """创建长度热力图:横坐标为长度,纵坐标为模型""" + if not selected_models: + return go.Figure() + + # 定义标准的context长度范围:8k, 16k, 32k, 64k, 128k, 256k + standard_lengths = [8000, 16000, 32000, 64000, 128000, 256000] + standard_length_keys = ['8k', '16k', '32k', '64k', '128k', '256k'] + + # 准备热力图数据 + heatmap_data = [] + model_names = [] + + for result in results: + display_name = get_display_name_for_result(result) + if display_name in selected_models: + model_names.append(display_name) + + # 从token_length_metrics中获取数据 + token_length_metrics = result.get('token_length_metrics', {}) + row_data = [] + + for key in standard_length_keys: + if key in token_length_metrics: + row_data.append(token_length_metrics[key] * 100) # 乘以100转换为百分比 + else: + row_data.append(None) # 没有数据点 + + heatmap_data.append(row_data) + + # 创建热力图 + fig = go.Figure(data=go.Heatmap( + z=heatmap_data, + x=[f"{length//1000}k" for length in standard_lengths], # x轴标签 + y=model_names, # y轴标签 + colorscale='RdYlBu_r', # 颜色映射:红色表示低分,蓝色表示高分 + showscale=True, + text=[[f"{val:.2f}" if val is not None else "N/A" for val in row] for row in heatmap_data], # 显示数值 + texttemplate="%{text}", + textfont={"size": 10}, + hoverongaps=False + )) + + fig.update_layout( + title='模型在不同Context长度上的性能热力图', + xaxis_title='Context长度 (tokens)', + yaxis_title='模型名称', + autosize=True, + height=max(400, len(model_names) * 50), # 根据模型数量调整高度 + margin=dict(l=150, r=50, t=80, b=80) # 调整边距,左侧留更多空间给模型名称 + ) + + return fig + +def create_bon_chart(results, selected_models): + """创建BoN 1-3折线图,显示overall_metric""" + if not selected_models: + return go.Figure() + + # BoN 标签 + bon_labels = ['BoN-1', 'BoN-2', 'BoN-3'] + bon_indices = [1, 2, 3] + + # 为每个模型准备数据 + model_data = {} + for result in results: + display_name = get_display_name_for_result(result) + if display_name in selected_models: + if display_name not in model_data: + model_data[display_name] = {} + + # 从bon_data中获取数据 + bon_data = result.get('bon_data', {}) + for bon_key in bon_labels: + if bon_key in bon_data: + bon_index = bon_labels.index(bon_key) + 1 + model_data[display_name][bon_index] = bon_data[bon_key] * 100 # 乘以100转换为百分比 + + # 创建图表 + fig = go.Figure() + + for model_name, data in model_data.items(): + if not data: + continue + + # 为每个BoN准备数据 + x_values = [] + y_values = [] + text_values = [] + + for bon_index in bon_indices: + x_values.append(bon_index) + if bon_index in data: + y_values.append(data[bon_index]) + text_values.append(f"{data[bon_index]:.2f}") + else: + y_values.append(None) + text_values.append("") + + # 获取模型颜色索引 + color_index = get_model_color_index(model_name, selected_models) + + fig.add_trace(go.Scatter( + x=x_values, + y=y_values, + mode='lines+markers', + name=model_name, + line=dict(color=get_color(color_index), width=3), + marker=dict(size=10), + text=text_values, + textposition='top center', + connectgaps=False + )) + + # 设置x轴 + fig.update_layout( + title='模型在不同Best-of-N下的对比', + xaxis_title='N', + yaxis_title='平均得分', + autosize=True, + xaxis=dict( + tickmode='array', + tickvals=bon_indices, + ticktext=bon_labels, + tickangle=0 + ), + legend=dict( + orientation="h", + yanchor="top", + y=-0.25, + xanchor="center", + x=0.5 + ), + margin=dict(b=100) + ) + + return fig + +def create_pass_k_chart(results, selected_models): + """创建Pass@N 折线图""" + if not selected_models: + return go.Figure() + + # Pass@K 标签 + k_labels = ['pass@1', 'pass@2', 'pass@3'] + k_indices = [1, 2, 3] + + # 为每个模型准备数据 + model_data = {} + for result in results: + display_name = get_display_name_for_result(result) + if display_name in selected_models: + if display_name not in model_data: + model_data[display_name] = {} + + # 从pass_at_k中获取数据 + pass_data = result.get('pass_at_k', {}) + for i, k_key in enumerate(k_labels): + val = pass_data.get(k_key) + if val is not None: + k_index = k_indices[i] + model_data[display_name][k_index] = val * 100 # 乘以100转换为百分比 + + # 创建图表 + fig = go.Figure() + + for model_name, data in model_data.items(): + if not data: + continue + + # 为每个Pass@K准备数据 + x_values = [] + y_values = [] + text_values = [] + + for k_index in k_indices: + x_values.append(k_index) + if k_index in data: + y_values.append(data[k_index]) + text_values.append(f"{data[k_index]:.2f}") + else: + y_values.append(None) + text_values.append("") + + # 获取模型颜色索引 + color_index = get_model_color_index(model_name, selected_models) + + fig.add_trace(go.Scatter( + x=x_values, + y=y_values, + mode='lines+markers', + name=model_name, + line=dict(color=get_color(color_index), width=3), + marker=dict(size=10), + text=text_values, + textposition='top center', + connectgaps=False + )) + + # 设置x轴 + fig.update_layout( + title='模型在不同Pass@N下的对比', + xaxis_title='N', + yaxis_title='Pass@N (%)', + autosize=True, + xaxis=dict( + tickmode='array', + tickvals=k_indices, + ticktext=k_labels, + tickangle=0 + ), + legend=dict( + orientation="h", + yanchor="top", + y=-0.25, + xanchor="center", + x=0.5 + ), + margin=dict(b=100) + ) + + return fig + +def create_gradio_interface(parser: ResultParser): + """创建Gradio界面""" + + def refresh_data(): + """刷新数据""" + parser.scan_all_results() + return parser.get_leaderboard_data() + + def get_model_choices(): + """获取模型选择列表(按是否包含Thinking或NonThinking区分,以相应后缀标识)""" + if not parser.results: + return [] + display_names = set() + for r in parser.results: + name = get_display_name_for_result(r) + display_names.add(name) + models = sorted(list(display_names)) + return models + + def update_charts(selected_models): + """更新所有图表""" + if not selected_models: + return None, None, None, None, None, None, None + + length_heatmap = create_length_heatmap(parser.results, selected_models) + contextual_chart = create_contextual_requirement_chart(parser.results, selected_models) + primary_task_radar_chart = create_primary_task_radar_chart(parser.results, selected_models) + language_chart = create_language_chart(parser.results, selected_models) + difficulty_chart = create_difficulty_chart(parser.results, selected_models) + bon_chart = create_bon_chart(parser.results, selected_models) + pass_k_chart = create_pass_k_chart(parser.results, selected_models) + + return length_heatmap, contextual_chart, primary_task_radar_chart, language_chart, difficulty_chart, bon_chart, pass_k_chart + + # 自定义CSS: + # 1. 强制所有表头居中(包括内部的按钮或文本容器) + # 2. 除了第一列(模型名称),其他列内容居中 + custom_css = """ + /* 强制标题居中 */ + h1 { + text-align: center; + display: block; + } + + /* 表头居中 */ + #leaderboard_table th, + #leaderboard_table th button, + #leaderboard_table th span { + text-align: center !important; + justify-content: center !important; + } + + /* 内容列居中:从第3列开始(跳过行号和模型名称) */ + #leaderboard_table td:nth-child(n+3) { + text-align: center !important; + } + """ + + # 创建界面 + with gr.Blocks(title="LongBench Pro 结果可视化", theme=gr.themes.Soft(), css=custom_css) as demo: + gr.Markdown("# LongBench Pro 结果可视化") + + gr.HTML(""" +
+ HF Dataset + Github Code + Leaderboard + Paper +
+ """) + + # 排行榜区域 + gr.Markdown("## 🏆 总体性能排行榜") + gr.Markdown(""" + - *思考模型和混合思考模型的思考得分,使用本身的思考能力(Non-Thinking Prompt)* + - *指令模型的思考得分,使用思考提示获得(Thinking Prompt)* + """) + leaderboard_df = gr.Dataframe( + headers=["模型名称", "模型类型", "上下文长度", "截断长度", "非思考得分", "思考得分"], + datatype=["markdown", "str", "str", "str", "str", "str"], + interactive=False, + wrap=True, + show_row_numbers=True, + show_search="filter", + show_fullscreen_button=True, + max_height=800, + column_widths=["250px", "100px", "100px", "100px", "120px", "120px"], + elem_id="leaderboard_table" + ) + + # 模型筛选和图表区域 + gr.HTML("
") + gr.Markdown("## 📊 特定维度对比") + with gr.Row(): + with gr.Column(scale=4): + model_selector = gr.Dropdown( + choices=[], + label="选择模型", + value=[], + multiselect=True, + interactive=True + ) + with gr.Column(scale=1): + update_charts_btn = gr.Button("更新图表", variant="primary", size="lg") + + with gr.Tabs(): + with gr.TabItem("语言维度"): + language_plot = gr.Plot() + + with gr.TabItem("难度维度"): + difficulty_plot = gr.Plot() + + with gr.TabItem("长度维度"): + length_heatmap = gr.Plot() + + with gr.TabItem("主要任务维度"): + primary_task_radar_plot = gr.Plot() + + with gr.TabItem("上下文需求维度"): + contextual_plot = gr.Plot() + + with gr.TabItem("BoN维度"): + bon_plot = gr.Plot() + + with gr.TabItem("Pass@N维度"): + pass_k_plot = gr.Plot() + + # 事件处理 + def update_model_choices(): + models = get_model_choices() + return gr.Dropdown(choices=models, value=[]) + + update_charts_btn.click( + fn=update_charts, + inputs=[model_selector], + outputs=[length_heatmap, contextual_plot, primary_task_radar_plot, language_plot, difficulty_plot, bon_plot, pass_k_plot] + ) + + # 初始化 - 页面加载时自动刷新数据 + demo.load( + fn=refresh_data, + outputs=[leaderboard_df] + ).then( + fn=update_model_choices, + outputs=[model_selector] + ) + + return demo -def greet(name): - return "Hello " + name + "!!" +def main(): + """主函数""" + output_dir = "./results" + + print("初始化结果解析器...") + parser = ResultParser(output_dir) + + print("扫描结果文件...") + parser.scan_all_results() + + print("创建Gradio界面...") + demo = create_gradio_interface(parser) + + print("启动服务器...") + demo.launch() -demo = gr.Interface(fn=greet, inputs="text", outputs="text") -demo.launch() \ No newline at end of file +if __name__ == "__main__": + main() diff --git a/results/Claude-3.7-Sonnet/nonthinking_context-120000_bon-3_summary.json b/results/Claude-3.7-Sonnet/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..14c2164381f9749a23ed31700de4b037cae30702 --- /dev/null +++ b/results/Claude-3.7-Sonnet/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5144730485997339, + "inference_iteration_1_overall_metric": 0.5192628714494713, + "inference_iteration_2_overall_metric": 0.5090899475543829, + "inference_iteration_3_overall_metric": 0.515066326795347, + "average_token_length_metric": { + "8k": 0.5927607589235461, + "16k": 0.5922491004183165, + "32k": 0.5555486925170308, + "64k": 0.4991997081584744, + "128k": 0.45285894052515324, + "256k": 0.39422109105588254 + }, + "average_contextual_requirement_metric": { + "Full": 0.47584256909012274, + "Partial": 0.5636391134301498 + }, + "average_difficulty_metric": { + "Easy": 0.6868572950582806, + "Moderate": 0.48375113564429373, + "Hard": 0.4728683670759167, + "Extreme": 0.3731393645349295 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7586982224527067, + "T2. Sequencing & Structure Reconstruction": 0.7545327049493711, + "T3. Evidence-Grounded QA": 0.5277777777777779, + "T4. Summarization & Synthesis": 0.5250996637138268, + "T5. Attribution & Citation Alignment": 0.5254132304220211, + "T6. Aggregation & Clustering": 0.47394883159992857, + "T7. Consistency & Compliance Checking": 0.3040021982475052, + "T8. Structured & Numeric Reasoning": 0.41188271604938276, + "T9. Version & Code Diff Analysis": 0.6042705189653765, + "T10. Rule Induction & In-Context Learning": 0.5114814814814815, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.43888888888888894 + }, + "average_language_metric": { + "Chinese": 0.5100199196277736, + "English": 0.5189261775716956 + }, + "BoN-1": { + "overall_metric": 0.5192628714494713, + "token_length": { + "8k": 0.5937761874854375, + "16k": 0.606154781504802, + "32k": 0.5701163293545726, + "64k": 0.49747085680734393, + "128k": 0.4476635155122931, + "256k": 0.40039555803238175 + }, + "contextual_requirement": { + "Full": 0.47604900489990065, + "Partial": 0.5742623379671082 + }, + "difficulty": { + "Easy": 0.6993477849390543, + "Moderate": 0.4824572359285609, + "Hard": 0.47426941765067004, + "Extreme": 0.37571516352087697 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7482072090157142, + "T2. Sequencing & Structure Reconstruction": 0.7523087560587559, + "T3. Evidence-Grounded QA": 0.5333333333333333, + "T4. Summarization & Synthesis": 0.5249609995597003, + "T5. Attribution & Citation Alignment": 0.5307787048666445, + "T6. Aggregation & Clustering": 0.47337460590728553, + "T7. Consistency & Compliance Checking": 0.30808530916861365, + "T8. Structured & Numeric Reasoning": 0.40740740740740744, + "T9. Version & Code Diff Analysis": 0.6209514621148434, + "T10. Rule Induction & In-Context Learning": 0.5469444444444443, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.45 + }, + "language": { + "Chinese": 0.5151862466463957, + "English": 0.5233394962525484 + } + }, + "pass@1": 0.2673333333333333, + "BoN-2": { + "overall_metric": 0.5638452937577435, + "token_length": { + "8k": 0.6334745299899752, + "16k": 0.6535009894669588, + "32k": 0.6109603205298609, + "64k": 0.5566414838337063, + "128k": 0.5030500434216845, + "256k": 0.4254443953042751 + }, + "contextual_requirement": { + "Full": 0.523183868231744, + "Partial": 0.6155961989726518 + }, + "difficulty": { + "Easy": 0.7554808778953406, + "Moderate": 0.5292531239954449, + "Hard": 0.5124984336029716, + "Extreme": 0.41036353942072434 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8018540164669292, + "T2. Sequencing & Structure Reconstruction": 0.7873358123358121, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5413926274234249, + "T5. Attribution & Citation Alignment": 0.5675265408978069, + "T6. Aggregation & Clustering": 0.5409163851157314, + "T7. Consistency & Compliance Checking": 0.34558583778191654, + "T8. Structured & Numeric Reasoning": 0.4685185185185185, + "T9. Version & Code Diff Analysis": 0.6508982849457906, + "T10. Rule Induction & In-Context Learning": 0.6081944444444445, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.49166666666666664 + }, + "language": { + "Chinese": 0.5545031574164562, + "English": 0.5731874300990306 + } + }, + "pass@2": 0.30533333333333335, + "BoN-3": { + "overall_metric": 0.5987860690936202, + "token_length": { + "8k": 0.6767030215651172, + "16k": 0.6801366595488965, + "32k": 0.6345247903374839, + "64k": 0.6039272497657204, + "128k": 0.5233376257525678, + "256k": 0.47408706759193875 + }, + "contextual_requirement": { + "Full": 0.5614328850984434, + "Partial": 0.6463264850874838 + }, + "difficulty": { + "Easy": 0.7882244354415668, + "Moderate": 0.5766160107480841, + "Hard": 0.5655081578600745, + "Extreme": 0.42768418415872295 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8178727620501064, + "T2. Sequencing & Structure Reconstruction": 0.8166698116698113, + "T3. Evidence-Grounded QA": 0.6166666666666667, + "T4. Summarization & Synthesis": 0.5469270547891242, + "T5. Attribution & Citation Alignment": 0.581425502230592, + "T6. Aggregation & Clustering": 0.5699768544212985, + "T7. Consistency & Compliance Checking": 0.3875679491876082, + "T8. Structured & Numeric Reasoning": 0.5462962962962963, + "T9. Version & Code Diff Analysis": 0.6792246386283735, + "T10. Rule Induction & In-Context Learning": 0.6452777777777778, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.525 + }, + "language": { + "Chinese": 0.5867683815613326, + "English": 0.6108037566259096 + } + }, + "pass@3": 0.3433333333333333 +} \ No newline at end of file diff --git a/results/Claude-3.7-Sonnet/thinking_context-120000_bon-3_summary.json b/results/Claude-3.7-Sonnet/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..23a24c84a7e4bd03fb7073fed7fd86e496ac58f6 --- /dev/null +++ b/results/Claude-3.7-Sonnet/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5966078087059191, + "inference_iteration_1_overall_metric": 0.5938171820634314, + "inference_iteration_2_overall_metric": 0.5955816438384393, + "inference_iteration_3_overall_metric": 0.6004246002158852, + "average_token_length_metric": { + "8k": 0.6997135645823386, + "16k": 0.6577212798228894, + "32k": 0.6419035800281319, + "64k": 0.6238264957040918, + "128k": 0.523846643485212, + "256k": 0.43263528861285133 + }, + "average_contextual_requirement_metric": { + "Full": 0.5527640561663963, + "Partial": 0.652408948301675 + }, + "average_difficulty_metric": { + "Easy": 0.7825568834883242, + "Moderate": 0.6155843766907921, + "Hard": 0.5658238514809286, + "Extreme": 0.4006643574451805 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8486700113628681, + "T2. Sequencing & Structure Reconstruction": 0.8000456983629063, + "T3. Evidence-Grounded QA": 0.5027777777777777, + "T4. Summarization & Synthesis": 0.5309981037882555, + "T5. Attribution & Citation Alignment": 0.5878801280848494, + "T6. Aggregation & Clustering": 0.5732629573374424, + "T7. Consistency & Compliance Checking": 0.3939611740106759, + "T8. Structured & Numeric Reasoning": 0.6212962962962962, + "T9. Version & Code Diff Analysis": 0.69342672946219, + "T10. Rule Induction & In-Context Learning": 0.5610185185185187, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5499999999999997 + }, + "average_language_metric": { + "Chinese": 0.588350975552306, + "English": 0.6048646418595319 + }, + "BoN-1": { + "overall_metric": 0.5938171820634314, + "token_length": { + "8k": 0.7191856133745022, + "16k": 0.6402554520543442, + "32k": 0.6341044882273853, + "64k": 0.6259136300211012, + "128k": 0.5177437687626877, + "256k": 0.4257001399405685 + }, + "contextual_requirement": { + "Full": 0.5550251903269423, + "Partial": 0.6431888079098727 + }, + "difficulty": { + "Easy": 0.7745826629942724, + "Moderate": 0.6051939662024207, + "Hard": 0.5720458613452599, + "Extreme": 0.40262413493324634 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8535130690589529, + "T2. Sequencing & Structure Reconstruction": 0.7808294450361011, + "T3. Evidence-Grounded QA": 0.5, + "T4. Summarization & Synthesis": 0.5297721568932299, + "T5. Attribution & Citation Alignment": 0.5849862779597039, + "T6. Aggregation & Clustering": 0.5655938313957183, + "T7. Consistency & Compliance Checking": 0.3878929800328972, + "T8. Structured & Numeric Reasoning": 0.6314814814814815, + "T9. Version & Code Diff Analysis": 0.6772724985908704, + "T10. Rule Induction & In-Context Learning": 0.5855555555555555, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333 + }, + "language": { + "Chinese": 0.5905648959587577, + "English": 0.5970694681681056 + } + }, + "pass@1": 0.37266666666666665, + "BoN-2": { + "overall_metric": 0.6528466974974912, + "token_length": { + "8k": 0.7638003125719123, + "16k": 0.7101654799888735, + "32k": 0.7028814358570394, + "64k": 0.6870948174265773, + "128k": 0.565071084237125, + "256k": 0.4880670549034259 + }, + "contextual_requirement": { + "Full": 0.6177398094156152, + "Partial": 0.6975281914198818 + }, + "difficulty": { + "Easy": 0.8281250142111442, + "Moderate": 0.6919969423858446, + "Hard": 0.6370194614390577, + "Extreme": 0.4455182924797047 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8893518121473104, + "T2. Sequencing & Structure Reconstruction": 0.8424985248669454, + "T3. Evidence-Grounded QA": 0.6, + "T4. Summarization & Synthesis": 0.544816527949887, + "T5. Attribution & Citation Alignment": 0.6380851790586051, + "T6. Aggregation & Clustering": 0.6286177167059521, + "T7. Consistency & Compliance Checking": 0.45873422235423905, + "T8. Structured & Numeric Reasoning": 0.6578703703703704, + "T9. Version & Code Diff Analysis": 0.7510537661056169, + "T10. Rule Induction & In-Context Learning": 0.618611111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333 + }, + "language": { + "Chinese": 0.6376749344767387, + "English": 0.6680184605182464 + } + }, + "pass@2": 0.44333333333333336, + "BoN-3": { + "overall_metric": 0.6830970097115105, + "token_length": { + "8k": 0.7837282981657816, + "16k": 0.7434736805597991, + "32k": 0.7293201301488285, + "64k": 0.7233152422607191, + "128k": 0.6019561639441493, + "256k": 0.5167885431897875 + }, + "contextual_requirement": { + "Full": 0.6474666039819896, + "Partial": 0.7284447988218108 + }, + "difficulty": { + "Easy": 0.8607457235279341, + "Moderate": 0.7303808913214545, + "Hard": 0.6594975210053281, + "Extreme": 0.47293457878264067 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9066883296818435, + "T2. Sequencing & Structure Reconstruction": 0.8643315529499735, + "T3. Evidence-Grounded QA": 0.6166666666666667, + "T4. Summarization & Synthesis": 0.5522881837833883, + "T5. Attribution & Citation Alignment": 0.6507666087400348, + "T6. Aggregation & Clustering": 0.6674614963830652, + "T7. Consistency & Compliance Checking": 0.48674980053348876, + "T8. Structured & Numeric Reasoning": 0.7092592592592593, + "T9. Version & Code Diff Analysis": 0.7716543341971472, + "T10. Rule Induction & In-Context Learning": 0.6727777777777777, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334 + }, + "language": { + "Chinese": 0.6635013624534305, + "English": 0.7026926569695916 + } + }, + "pass@3": 0.4826666666666667 +} \ No newline at end of file diff --git a/results/Claude-4-Sonnet/nonthinking_context-1000000_bon-3_summary.json b/results/Claude-4-Sonnet/nonthinking_context-1000000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f9b4f9410b13418916d6f5d5f814cc03fde28168 --- /dev/null +++ b/results/Claude-4-Sonnet/nonthinking_context-1000000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5606565628619046, + "inference_iteration_1_overall_metric": 0.5620036050651629, + "inference_iteration_2_overall_metric": 0.5631248059457928, + "inference_iteration_3_overall_metric": 0.5568412775747574, + "average_token_length_metric": { + "8k": 0.6071528197414233, + "16k": 0.5816154959256097, + "32k": 0.5612446325027117, + "64k": 0.5254403501645888, + "128k": 0.5465188702735214, + "256k": 0.541967208563576 + }, + "average_contextual_requirement_metric": { + "Full": 0.5264359183584719, + "Partial": 0.6042101104117302 + }, + "average_difficulty_metric": { + "Easy": 0.6842286640015465, + "Moderate": 0.5396282888053806, + "Hard": 0.5757154269645611, + "Extreme": 0.4292097599695439 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8091570544332052, + "T2. Sequencing & Structure Reconstruction": 0.8207229190562522, + "T3. Evidence-Grounded QA": 0.5027777777777778, + "T4. Summarization & Synthesis": 0.5379205858992453, + "T5. Attribution & Citation Alignment": 0.6634794615844591, + "T6. Aggregation & Clustering": 0.5193980953038955, + "T7. Consistency & Compliance Checking": 0.41333574812040713, + "T8. Structured & Numeric Reasoning": 0.3430555555555555, + "T9. Version & Code Diff Analysis": 0.7423632867012375, + "T10. Rule Induction & In-Context Learning": 0.5125462962962964, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5055555555555556 + }, + "average_language_metric": { + "Chinese": 0.5499191967861178, + "English": 0.5713939289376934 + }, + "BoN-1": { + "overall_metric": 0.5620036050651629, + "token_length": { + "8k": 0.6116650341526972, + "16k": 0.5789009200875381, + "32k": 0.564884526756138, + "64k": 0.5150575277083638, + "128k": 0.5512053153279016, + "256k": 0.5503083063583438 + }, + "contextual_requirement": { + "Full": 0.5327035707633319, + "Partial": 0.5992945578129499 + }, + "difficulty": { + "Easy": 0.6903796852415736, + "Moderate": 0.5418819297458669, + "Hard": 0.5712079185155163, + "Extreme": 0.4285441662812896 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8022069774152061, + "T2. Sequencing & Structure Reconstruction": 0.8176220076220074, + "T3. Evidence-Grounded QA": 0.5333333333333333, + "T4. Summarization & Synthesis": 0.5377531684167628, + "T5. Attribution & Citation Alignment": 0.6702134910265403, + "T6. Aggregation & Clustering": 0.509165168922684, + "T7. Consistency & Compliance Checking": 0.4132211721134369, + "T8. Structured & Numeric Reasoning": 0.336574074074074, + "T9. Version & Code Diff Analysis": 0.755058796168736, + "T10. Rule Induction & In-Context Learning": 0.5204166666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5 + }, + "language": { + "Chinese": 0.5525156622096248, + "English": 0.5714915479207029 + } + }, + "pass@1": 0.2986666666666667, + "BoN-2": { + "overall_metric": 0.6070413152649775, + "token_length": { + "8k": 0.648428309386666, + "16k": 0.6138312363156012, + "32k": 0.6050527198490672, + "64k": 0.5709459527531863, + "128k": 0.6043903989020145, + "256k": 0.5995992743833368 + }, + "contextual_requirement": { + "Full": 0.5725088578790201, + "Partial": 0.6509917155743812 + }, + "difficulty": { + "Easy": 0.7433988294310949, + "Moderate": 0.5959678783418261, + "Hard": 0.6127141026903381, + "Extreme": 0.461215101348602 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8352274736404431, + "T2. Sequencing & Structure Reconstruction": 0.8427542827542828, + "T3. Evidence-Grounded QA": 0.6, + "T4. Summarization & Synthesis": 0.5511890444295781, + "T5. Attribution & Citation Alignment": 0.709250434569062, + "T6. Aggregation & Clustering": 0.5590519848415796, + "T7. Consistency & Compliance Checking": 0.4538880045271121, + "T8. Structured & Numeric Reasoning": 0.400462962962963, + "T9. Version & Code Diff Analysis": 0.7896296658102724, + "T10. Rule Induction & In-Context Learning": 0.5898611111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.55 + }, + "language": { + "Chinese": 0.6018498726474579, + "English": 0.6122327578825001 + } + }, + "pass@2": 0.3486666666666667, + "BoN-3": { + "overall_metric": 0.6359674546066088, + "token_length": { + "8k": 0.6791089276748106, + "16k": 0.6344789904129863, + "32k": 0.6382754639404674, + "64k": 0.6155932927177382, + "128k": 0.6246025171745627, + "256k": 0.6237455357190941 + }, + "contextual_requirement": { + "Full": 0.5997037562420825, + "Partial": 0.6821212525251004 + }, + "difficulty": { + "Easy": 0.7810114340069705, + "Moderate": 0.6237269051463531, + "Hard": 0.6375808018524918, + "Extreme": 0.4840585077179298 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8531556276526557, + "T2. Sequencing & Structure Reconstruction": 0.8630080105080103, + "T3. Evidence-Grounded QA": 0.6416666666666667, + "T4. Summarization & Synthesis": 0.5557327849332047, + "T5. Attribution & Citation Alignment": 0.7262293044645985, + "T6. Aggregation & Clustering": 0.5966415249311195, + "T7. Consistency & Compliance Checking": 0.47966164058768634, + "T8. Structured & Numeric Reasoning": 0.4393518518518518, + "T9. Version & Code Diff Analysis": 0.7950682623015005, + "T10. Rule Induction & In-Context Learning": 0.6329166666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6083333333333333 + }, + "language": { + "Chinese": 0.6245809400158543, + "English": 0.6473539691973664 + } + }, + "pass@3": 0.38 +} \ No newline at end of file diff --git a/results/Claude-4-Sonnet/thinking_context-1000000_bon-3_summary.json b/results/Claude-4-Sonnet/thinking_context-1000000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..56c05dee9caa23c7aa666e1e62f40e3fb8b47e9a --- /dev/null +++ b/results/Claude-4-Sonnet/thinking_context-1000000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 3, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.6987364832054667, + "inference_iteration_1_overall_metric": 0.7019992434982991, + "inference_iteration_2_overall_metric": 0.6978899327024527, + "inference_iteration_3_overall_metric": 0.6963202734156487, + "average_token_length_metric": { + "8k": 0.7273068305229948, + "16k": 0.7148161402734813, + "32k": 0.7282156837997693, + "64k": 0.7051754330841736, + "128k": 0.6642984844940268, + "256k": 0.6526063270583587 + }, + "average_contextual_requirement_metric": { + "Full": 0.6617044440872328, + "Partial": 0.7458681693559481 + }, + "average_difficulty_metric": { + "Easy": 0.8377531760390221, + "Moderate": 0.7658446956684767, + "Hard": 0.7472224806628969, + "Extreme": 0.4705256363582413 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.907681462321177, + "T2. Sequencing & Structure Reconstruction": 0.8890326186159514, + "T3. Evidence-Grounded QA": 0.661111111111111, + "T4. Summarization & Synthesis": 0.5383660231848545, + "T5. Attribution & Citation Alignment": 0.7860152219301051, + "T6. Aggregation & Clustering": 0.6671470819716809, + "T7. Consistency & Compliance Checking": 0.5518199768375653, + "T8. Structured & Numeric Reasoning": 0.6859567901234568, + "T9. Version & Code Diff Analysis": 0.8575767924690506, + "T10. Rule Induction & In-Context Learning": 0.6481481481481483, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5888888888888888 + }, + "average_language_metric": { + "Chinese": 0.6865315143381795, + "English": 0.7109414520727555 + }, + "BoN-1": { + "overall_metric": 0.7019992434982991, + "token_length": { + "8k": 0.7253942984634015, + "16k": 0.7347686831241128, + "32k": 0.7405843026072749, + "64k": 0.6852109109698611, + "128k": 0.6670707753146399, + "256k": 0.6589664905105064 + }, + "contextual_requirement": { + "Full": 0.6664641511659741, + "Partial": 0.747225724648532 + }, + "difficulty": { + "Easy": 0.8574789158116471, + "Moderate": 0.7537775763318981, + "Hard": 0.7422849474268556, + "Extreme": 0.47120899649989867 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9235851079801015, + "T2. Sequencing & Structure Reconstruction": 0.8866411828911827, + "T3. Evidence-Grounded QA": 0.7166666666666667, + "T4. Summarization & Synthesis": 0.5397488846450174, + "T5. Attribution & Citation Alignment": 0.7968571187727533, + "T6. Aggregation & Clustering": 0.6625413639156731, + "T7. Consistency & Compliance Checking": 0.5348173324914223, + "T8. Structured & Numeric Reasoning": 0.7013888888888888, + "T9. Version & Code Diff Analysis": 0.8605924270512658, + "T10. Rule Induction & In-Context Learning": 0.6277777777777778, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.575 + }, + "language": { + "Chinese": 0.7011338968631923, + "English": 0.7028645901334066 + } + }, + "pass@1": 0.4786666666666667, + "BoN-2": { + "overall_metric": 0.7662921811826703, + "token_length": { + "8k": 0.7864626566034867, + "16k": 0.7903224578099031, + "32k": 0.7829418518359753, + "64k": 0.776573418015718, + "128k": 0.7369784925906059, + "256k": 0.724474210240336 + }, + "contextual_requirement": { + "Full": 0.7387739445109663, + "Partial": 0.8013153914921128 + }, + "difficulty": { + "Easy": 0.9044185942106279, + "Moderate": 0.8489744455895216, + "Hard": 0.8300963355442819, + "Extreme": 0.5187660309473054 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9337455622019767, + "T2. Sequencing & Structure Reconstruction": 0.920133061383061, + "T3. Evidence-Grounded QA": 0.8, + "T4. Summarization & Synthesis": 0.5512587507775879, + "T5. Attribution & Citation Alignment": 0.851460921546061, + "T6. Aggregation & Clustering": 0.7263341037175177, + "T7. Consistency & Compliance Checking": 0.6403373459035246, + "T8. Structured & Numeric Reasoning": 0.7763888888888889, + "T9. Version & Code Diff Analysis": 0.8952412388875778, + "T10. Rule Induction & In-Context Learning": 0.7372222222222222, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.675 + }, + "language": { + "Chinese": 0.7559027655426326, + "English": 0.7766815968227087 + } + }, + "pass@2": 0.5586666666666666, + "BoN-3": { + "overall_metric": 0.7907893209368455, + "token_length": { + "8k": 0.8018918816218435, + "16k": 0.8122398647423218, + "32k": 0.8052123235968958, + "64k": 0.8059913515132954, + "128k": 0.7658095463175019, + "256k": 0.7535909578292158 + }, + "contextual_requirement": { + "Full": 0.7602540929686584, + "Partial": 0.8296523383509014 + }, + "difficulty": { + "Easy": 0.9273787695935313, + "Moderate": 0.8740334338695857, + "Hard": 0.8656496547603084, + "Extreme": 0.5373159132889705 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9422092659363656, + "T2. Sequencing & Structure Reconstruction": 0.9295053557553554, + "T3. Evidence-Grounded QA": 0.8416666666666667, + "T4. Summarization & Synthesis": 0.5573876341160701, + "T5. Attribution & Citation Alignment": 0.8621875560226953, + "T6. Aggregation & Clustering": 0.7544804518953127, + "T7. Consistency & Compliance Checking": 0.6657566567798248, + "T8. Structured & Numeric Reasoning": 0.8055555555555556, + "T9. Version & Code Diff Analysis": 0.9043321479784867, + "T10. Rule Induction & In-Context Learning": 0.7838888888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.725 + }, + "language": { + "Chinese": 0.7791329047815275, + "English": 0.8024457370921636 + } + }, + "pass@3": 0.598 +} \ No newline at end of file diff --git a/results/DeepSeek-R1-0528/thinking_context-120000_bon-3_summary.json b/results/DeepSeek-R1-0528/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..46a5fb503fa4d93cb24b38f9f1f1a1d872fd58d0 --- /dev/null +++ b/results/DeepSeek-R1-0528/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 1, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.61893761586453, + "inference_iteration_1_overall_metric": 0.6270481753191374, + "inference_iteration_2_overall_metric": 0.6115350419668117, + "inference_iteration_3_overall_metric": 0.6182296303076407, + "average_token_length_metric": { + "8k": 0.7165288754873151, + "16k": 0.6828199674990499, + "32k": 0.6181133860648209, + "64k": 0.6286866574208946, + "128k": 0.5812085902020846, + "256k": 0.48626821851301744 + }, + "average_contextual_requirement_metric": { + "Full": 0.5840521215265608, + "Partial": 0.6633373359310379 + }, + "average_difficulty_metric": { + "Easy": 0.8266775116602355, + "Moderate": 0.6653109531944901, + "Hard": 0.5367579918507135, + "Extreme": 0.41488622286022364 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8218247532961501, + "T2. Sequencing & Structure Reconstruction": 0.8170893828393826, + "T3. Evidence-Grounded QA": 0.5638888888888887, + "T4. Summarization & Synthesis": 0.5505430321723027, + "T5. Attribution & Citation Alignment": 0.5766419140172199, + "T6. Aggregation & Clustering": 0.5593864809441226, + "T7. Consistency & Compliance Checking": 0.44029922393263027, + "T8. Structured & Numeric Reasoning": 0.6998456790123458, + "T9. Version & Code Diff Analysis": 0.7073888549627423, + "T10. Rule Induction & In-Context Learning": 0.613935185185185, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5361111111111111 + }, + "average_language_metric": { + "Chinese": 0.6388577912805221, + "English": 0.5990174404485396 + }, + "BoN-1": { + "overall_metric": 0.6270481753191374, + "token_length": { + "8k": 0.7085495640791158, + "16k": 0.6852891854029057, + "32k": 0.6523714114738722, + "64k": 0.6349276261566131, + "128k": 0.5990851877476868, + "256k": 0.48206607705463184 + }, + "contextual_requirement": { + "Full": 0.5954038691856903, + "Partial": 0.6673227467617081 + }, + "difficulty": { + "Easy": 0.842688379979359, + "Moderate": 0.6856872320332753, + "Hard": 0.5371806890772243, + "Extreme": 0.4113355332448204 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8085658637642787, + "T2. Sequencing & Structure Reconstruction": 0.8118172105672106, + "T3. Evidence-Grounded QA": 0.6083333333333333, + "T4. Summarization & Synthesis": 0.5488292098363808, + "T5. Attribution & Citation Alignment": 0.5873852854613878, + "T6. Aggregation & Clustering": 0.5544843141923278, + "T7. Consistency & Compliance Checking": 0.4479896544786345, + "T8. Structured & Numeric Reasoning": 0.7194444444444444, + "T9. Version & Code Diff Analysis": 0.704738113297963, + "T10. Rule Induction & In-Context Learning": 0.6438888888888888, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666 + }, + "language": { + "Chinese": 0.6382004484945908, + "English": 0.6158959021436853 + } + }, + "pass@1": 0.39066666666666666, + "BoN-2": { + "overall_metric": 0.6908377344426165, + "token_length": { + "8k": 0.7595240418657973, + "16k": 0.7570136230136708, + "32k": 0.7096605299566262, + "64k": 0.7135170329369038, + "128k": 0.6539368768633997, + "256k": 0.5513743020193068 + }, + "contextual_requirement": { + "Full": 0.6581544826620869, + "Partial": 0.7324346003451124 + }, + "difficulty": { + "Easy": 0.900422588090864, + "Moderate": 0.7645982045756979, + "Hard": 0.6173528116491919, + "Extreme": 0.46106606935258343 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8656746491642889, + "T2. Sequencing & Structure Reconstruction": 0.8850374162874161, + "T3. Evidence-Grounded QA": 0.6833333333333333, + "T4. Summarization & Synthesis": 0.5661734768963829, + "T5. Attribution & Citation Alignment": 0.6630436055268355, + "T6. Aggregation & Clustering": 0.6284207499424888, + "T7. Consistency & Compliance Checking": 0.520982919531347, + "T8. Structured & Numeric Reasoning": 0.7888888888888889, + "T9. Version & Code Diff Analysis": 0.7536592506692632, + "T10. Rule Induction & In-Context Learning": 0.7194444444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5916666666666667 + }, + "language": { + "Chinese": 0.6996429460445758, + "English": 0.6820325228406605 + } + }, + "pass@2": 0.4533333333333333, + "BoN-3": { + "overall_metric": 0.7254771584689355, + "token_length": { + "8k": 0.7994316297691582, + "16k": 0.7937454228456009, + "32k": 0.7297465435731235, + "64k": 0.7326783552830499, + "128k": 0.6951047141031044, + "256k": 0.6021562852395752 + }, + "contextual_requirement": { + "Full": 0.6913631461546793, + "Partial": 0.7688949923234434 + }, + "difficulty": { + "Easy": 0.9337047416686951, + "Moderate": 0.8141591847806626, + "Hard": 0.6485103013588345, + "Extreme": 0.4896785698290395 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8968239339375051, + "T2. Sequencing & Structure Reconstruction": 0.8988072575572574, + "T3. Evidence-Grounded QA": 0.7583333333333333, + "T4. Summarization & Synthesis": 0.5734497230075248, + "T5. Attribution & Citation Alignment": 0.6928069332150858, + "T6. Aggregation & Clustering": 0.6670419649341216, + "T7. Consistency & Compliance Checking": 0.5624760870987693, + "T8. Structured & Numeric Reasoning": 0.8055555555555556, + "T9. Version & Code Diff Analysis": 0.7817439995394256, + "T10. Rule Induction & In-Context Learning": 0.7638888888888888, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.65 + }, + "language": { + "Chinese": 0.7321739274049238, + "English": 0.7187803895329472 + } + }, + "pass@3": 0.49666666666666665 +} \ No newline at end of file diff --git a/results/DeepSeek-R1/thinking_context-120000_bon-3_summary.json b/results/DeepSeek-R1/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..37ba1159b595642c379f48be640b3e6204da9346 --- /dev/null +++ b/results/DeepSeek-R1/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.6006714049681133, + "inference_iteration_1_overall_metric": 0.6007584621917721, + "inference_iteration_2_overall_metric": 0.5960043654782469, + "inference_iteration_3_overall_metric": 0.6052513872343173, + "average_token_length_metric": { + "8k": 0.6896237775198697, + "16k": 0.66847824761939, + "32k": 0.6242811862728697, + "64k": 0.5907117819226532, + "128k": 0.526720556197483, + "256k": 0.5042128802764103 + }, + "average_contextual_requirement_metric": { + "Full": 0.5734808170096616, + "Partial": 0.6352776078243236 + }, + "average_difficulty_metric": { + "Easy": 0.8244195631460464, + "Moderate": 0.5882837964508552, + "Hard": 0.5338546774181954, + "Extreme": 0.4075883160627708 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8460279171139484, + "T2. Sequencing & Structure Reconstruction": 0.7927840387644306, + "T3. Evidence-Grounded QA": 0.5666666666666665, + "T4. Summarization & Synthesis": 0.5315482688091906, + "T5. Attribution & Citation Alignment": 0.46763122932017526, + "T6. Aggregation & Clustering": 0.5661396588973091, + "T7. Consistency & Compliance Checking": 0.4411785360364781, + "T8. Structured & Numeric Reasoning": 0.6290123456790124, + "T9. Version & Code Diff Analysis": 0.7118775193966861, + "T10. Rule Induction & In-Context Learning": 0.6290277777777776, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5083333333333333 + }, + "average_language_metric": { + "Chinese": 0.5813030699136731, + "English": 0.6200397400225522 + }, + "BoN-1": { + "overall_metric": 0.6007584621917721, + "token_length": { + "8k": 0.6794004597378533, + "16k": 0.6605152745514365, + "32k": 0.637696287010787, + "64k": 0.6015965809771497, + "128k": 0.5259184504039809, + "256k": 0.49942372046943184 + }, + "contextual_requirement": { + "Full": 0.5724096385120696, + "Partial": 0.6368387832386698 + }, + "difficulty": { + "Easy": 0.8239541273708798, + "Moderate": 0.5859117167110014, + "Hard": 0.541012801830159, + "Extreme": 0.40525140462840953 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8346607474979665, + "T2. Sequencing & Structure Reconstruction": 0.7960157843246078, + "T3. Evidence-Grounded QA": 0.5916666666666667, + "T4. Summarization & Synthesis": 0.5314348105743746, + "T5. Attribution & Citation Alignment": 0.46439938615714244, + "T6. Aggregation & Clustering": 0.5590113115895492, + "T7. Consistency & Compliance Checking": 0.4443221207730568, + "T8. Structured & Numeric Reasoning": 0.612962962962963, + "T9. Version & Code Diff Analysis": 0.7031087891880523, + "T10. Rule Induction & In-Context Learning": 0.6470833333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5166666666666667 + }, + "language": { + "Chinese": 0.5732559357352887, + "English": 0.6282609886482589 + } + }, + "pass@1": 0.3433333333333333, + "BoN-2": { + "overall_metric": 0.6591761776037405, + "token_length": { + "8k": 0.7392791599059808, + "16k": 0.7298238663653037, + "32k": 0.6777532203191601, + "64k": 0.6787515982515117, + "128k": 0.5821796500623371, + "256k": 0.5472695707181553 + }, + "contextual_requirement": { + "Full": 0.6319274531975012, + "Partial": 0.6938563723025926 + }, + "difficulty": { + "Easy": 0.8783150831551243, + "Moderate": 0.678502573566839, + "Hard": 0.5963536523109759, + "Extreme": 0.4476885160139848 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8861689034562782, + "T2. Sequencing & Structure Reconstruction": 0.8374043195366726, + "T3. Evidence-Grounded QA": 0.625, + "T4. Summarization & Synthesis": 0.5455058096240588, + "T5. Attribution & Citation Alignment": 0.5369499475317325, + "T6. Aggregation & Clustering": 0.6381104251141014, + "T7. Consistency & Compliance Checking": 0.5019132623573087, + "T8. Structured & Numeric Reasoning": 0.699537037037037, + "T9. Version & Code Diff Analysis": 0.7605821531353517, + "T10. Rule Induction & In-Context Learning": 0.7220833333333334, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667 + }, + "language": { + "Chinese": 0.6390711207410246, + "English": 0.6792812344664584 + } + }, + "pass@2": 0.4093333333333333, + "BoN-3": { + "overall_metric": 0.6982292810132508, + "token_length": { + "8k": 0.783866781971667, + "16k": 0.7689573555512769, + "32k": 0.7150318293064207, + "64k": 0.707631405403529, + "128k": 0.6291537694328186, + "256k": 0.5847345444137952 + }, + "contextual_requirement": { + "Full": 0.6747138288729008, + "Partial": 0.7281580382827888 + }, + "difficulty": { + "Easy": 0.9164006912153285, + "Moderate": 0.7293351521236777, + "Hard": 0.6332713918179128, + "Extreme": 0.48146703898856563 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9025424539049985, + "T2. Sequencing & Structure Reconstruction": 0.8813529526029528, + "T3. Evidence-Grounded QA": 0.6666666666666666, + "T4. Summarization & Synthesis": 0.5516760401143904, + "T5. Attribution & Citation Alignment": 0.5770877507616411, + "T6. Aggregation & Clustering": 0.6710035196738627, + "T7. Consistency & Compliance Checking": 0.5581333121650413, + "T8. Structured & Numeric Reasoning": 0.7560185185185184, + "T9. Version & Code Diff Analysis": 0.8001126786344129, + "T10. Rule Induction & In-Context Learning": 0.7456944444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.625 + }, + "language": { + "Chinese": 0.6861725652789407, + "English": 0.7102859967475628 + } + }, + "pass@3": 0.45866666666666667 +} \ No newline at end of file diff --git a/results/DeepSeek-V3-0324/nonthinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3-0324/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..332e56ff1213d74b488f7947e01df363fea2b2f3 --- /dev/null +++ b/results/DeepSeek-V3-0324/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5169762636111047, + "inference_iteration_1_overall_metric": 0.5181528065498966, + "inference_iteration_2_overall_metric": 0.5148683077997773, + "inference_iteration_3_overall_metric": 0.5179076764836414, + "average_token_length_metric": { + "8k": 0.5649502252093578, + "16k": 0.5347800008319371, + "32k": 0.5556420045489457, + "64k": 0.5214603320658495, + "128k": 0.4864319755387441, + "256k": 0.4385930434717982 + }, + "average_contextual_requirement_metric": { + "Full": 0.477916384525216, + "Partial": 0.566688836993147 + }, + "average_difficulty_metric": { + "Easy": 0.6526369307346341, + "Moderate": 0.4967551723461267, + "Hard": 0.48299903154456436, + "Extreme": 0.4039646133594433 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7877679677233363, + "T2. Sequencing & Structure Reconstruction": 0.7655772360963239, + "T3. Evidence-Grounded QA": 0.5416666666666666, + "T4. Summarization & Synthesis": 0.5446412810076181, + "T5. Attribution & Citation Alignment": 0.5358172435200781, + "T6. Aggregation & Clustering": 0.4889882988114357, + "T7. Consistency & Compliance Checking": 0.3557205172395749, + "T8. Structured & Numeric Reasoning": 0.24367283950617283, + "T9. Version & Code Diff Analysis": 0.6358733797519817, + "T10. Rule Induction & In-Context Learning": 0.6043981481481482, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4138888888888889 + }, + "average_language_metric": { + "Chinese": 0.5177833454262655, + "English": 0.5161691817959461 + }, + "BoN-1": { + "overall_metric": 0.5181528065498966, + "token_length": { + "8k": 0.5625899513794126, + "16k": 0.5301053854931655, + "32k": 0.5507157770563107, + "64k": 0.5301772699202785, + "128k": 0.4898481902149404, + "256k": 0.44548026523527584 + }, + "contextual_requirement": { + "Full": 0.4761374399130544, + "Partial": 0.571626909542243 + }, + "difficulty": { + "Easy": 0.6469710156877221, + "Moderate": 0.5054097919629513, + "Hard": 0.48366558283696526, + "Extreme": 0.4080599930595185 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7857412502114394, + "T2. Sequencing & Structure Reconstruction": 0.7680012781036988, + "T3. Evidence-Grounded QA": 0.5416666666666666, + "T4. Summarization & Synthesis": 0.5417591122015827, + "T5. Attribution & Citation Alignment": 0.5399240477985636, + "T6. Aggregation & Clustering": 0.4939990937730982, + "T7. Consistency & Compliance Checking": 0.3541605841917901, + "T8. Structured & Numeric Reasoning": 0.24212962962962964, + "T9. Version & Code Diff Analysis": 0.6325782099444311, + "T10. Rule Induction & In-Context Learning": 0.6151388888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667 + }, + "language": { + "Chinese": 0.5160793653421631, + "English": 0.5202262477576317 + } + }, + "pass@1": 0.23666666666666666, + "BoN-2": { + "overall_metric": 0.5442270665213056, + "token_length": { + "8k": 0.594568981140541, + "16k": 0.5576674233487121, + "32k": 0.5818313481616337, + "64k": 0.5446955915354488, + "128k": 0.5245780781334816, + "256k": 0.4620209768080174 + }, + "contextual_requirement": { + "Full": 0.5071422110822568, + "Partial": 0.591425973443732 + }, + "difficulty": { + "Easy": 0.6783617155027842, + "Moderate": 0.5372286722751785, + "Hard": 0.5038179540985808, + "Extreme": 0.4284267679263639 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8015300122926023, + "T2. Sequencing & Structure Reconstruction": 0.787322466174887, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5591478963061529, + "T5. Attribution & Citation Alignment": 0.5635377708713467, + "T6. Aggregation & Clustering": 0.5151496972716835, + "T7. Consistency & Compliance Checking": 0.37875998396209754, + "T8. Structured & Numeric Reasoning": 0.29953703703703705, + "T9. Version & Code Diff Analysis": 0.6509912195762164, + "T10. Rule Induction & In-Context Learning": 0.6334722222222222, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.45 + }, + "language": { + "Chinese": 0.5466945351249929, + "English": 0.5417595979176187 + } + }, + "pass@2": 0.26066666666666666, + "BoN-3": { + "overall_metric": 0.5555999285609496, + "token_length": { + "8k": 0.6111433308773455, + "16k": 0.5701468581090299, + "32k": 0.5948383611006594, + "64k": 0.5601715262733609, + "128k": 0.5307382241597844, + "256k": 0.4665612708455233 + }, + "contextual_requirement": { + "Full": 0.5192835049000674, + "Partial": 0.6018208314020757 + }, + "difficulty": { + "Easy": 0.6899946609782567, + "Moderate": 0.55112703775723, + "Hard": 0.5166174713215149, + "Extreme": 0.4369188707412474 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8147581778505403, + "T2. Sequencing & Structure Reconstruction": 0.8027788153812361, + "T3. Evidence-Grounded QA": 0.5916666666666667, + "T4. Summarization & Synthesis": 0.5641241115145229, + "T5. Attribution & Citation Alignment": 0.5716268051520625, + "T6. Aggregation & Clustering": 0.5308177346459271, + "T7. Consistency & Compliance Checking": 0.3957991644610091, + "T8. Structured & Numeric Reasoning": 0.29953703703703705, + "T9. Version & Code Diff Analysis": 0.6606747373420039, + "T10. Rule Induction & In-Context Learning": 0.6418055555555555, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4583333333333333 + }, + "language": { + "Chinese": 0.5602606340750268, + "English": 0.550939223046875 + } + }, + "pass@3": 0.272 +} \ No newline at end of file diff --git a/results/DeepSeek-V3-0324/thinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3-0324/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1cc7a303d9b6ca379e1b6d87bfbe3e42476de989 --- /dev/null +++ b/results/DeepSeek-V3-0324/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5670800708470047, + "inference_iteration_1_overall_metric": 0.5592880863605422, + "inference_iteration_2_overall_metric": 0.5704394040472569, + "inference_iteration_3_overall_metric": 0.5715127221332137, + "average_token_length_metric": { + "8k": 0.6286936796561847, + "16k": 0.6309027535519853, + "32k": 0.5969011989319307, + "64k": 0.5427727165403452, + "128k": 0.5275644070173147, + "256k": 0.4756456693842662 + }, + "average_contextual_requirement_metric": { + "Full": 0.5368141482212665, + "Partial": 0.605600336007035 + }, + "average_difficulty_metric": { + "Easy": 0.7920271132046242, + "Moderate": 0.5713738824882528, + "Hard": 0.4620098327210685, + "Extreme": 0.3868526000236004 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8062267128065881, + "T2. Sequencing & Structure Reconstruction": 0.7434196920363564, + "T3. Evidence-Grounded QA": 0.5111111111111112, + "T4. Summarization & Synthesis": 0.5146898974811284, + "T5. Attribution & Citation Alignment": 0.537979547816877, + "T6. Aggregation & Clustering": 0.5248106585111237, + "T7. Consistency & Compliance Checking": 0.3681804918962003, + "T8. Structured & Numeric Reasoning": 0.5841049382716048, + "T9. Version & Code Diff Analysis": 0.6466057172430281, + "T10. Rule Induction & In-Context Learning": 0.6183796296296297, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4944444444444444 + }, + "average_language_metric": { + "Chinese": 0.5527161660546936, + "English": 0.5814439756393159 + }, + "BoN-1": { + "overall_metric": 0.5592880863605422, + "token_length": { + "8k": 0.6182266355968743, + "16k": 0.6224911215102044, + "32k": 0.6006190818475612, + "64k": 0.5386968289399401, + "128k": 0.5299641908835836, + "256k": 0.4457306593850912 + }, + "contextual_requirement": { + "Full": 0.530042096148415, + "Partial": 0.5965102557214322 + }, + "difficulty": { + "Easy": 0.7861925110638937, + "Moderate": 0.563744851626512, + "Hard": 0.4535437025457247, + "Extreme": 0.377252152391456 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7951398867629517, + "T2. Sequencing & Structure Reconstruction": 0.751023606023606, + "T3. Evidence-Grounded QA": 0.4666666666666667, + "T4. Summarization & Synthesis": 0.5136736975702932, + "T5. Attribution & Citation Alignment": 0.510566852786565, + "T6. Aggregation & Clustering": 0.5116905358775884, + "T7. Consistency & Compliance Checking": 0.37914696318940605, + "T8. Structured & Numeric Reasoning": 0.5847222222222223, + "T9. Version & Code Diff Analysis": 0.6690241210962063, + "T10. Rule Induction & In-Context Learning": 0.5966666666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.475 + }, + "language": { + "Chinese": 0.5492580351840802, + "English": 0.569318137537005 + } + }, + "pass@1": 0.30666666666666664, + "BoN-2": { + "overall_metric": 0.6262109782348795, + "token_length": { + "8k": 0.6883768372773764, + "16k": 0.7011104482454619, + "32k": 0.647383679165818, + "64k": 0.6092322863406843, + "128k": 0.5965297187489229, + "256k": 0.5146328996310173 + }, + "contextual_requirement": { + "Full": 0.5949295492853987, + "Partial": 0.6660237059887659 + }, + "difficulty": { + "Easy": 0.8509257637159784, + "Moderate": 0.66205670168914, + "Hard": 0.5144444684176784, + "Extreme": 0.42991229790988206 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8583478908910738, + "T2. Sequencing & Structure Reconstruction": 0.7814144189144188, + "T3. Evidence-Grounded QA": 0.5916666666666667, + "T4. Summarization & Synthesis": 0.5277169061289491, + "T5. Attribution & Citation Alignment": 0.5960013921722541, + "T6. Aggregation & Clustering": 0.5992683212004715, + "T7. Consistency & Compliance Checking": 0.4404830107556436, + "T8. Structured & Numeric Reasoning": 0.6166666666666667, + "T9. Version & Code Diff Analysis": 0.7117518441173546, + "T10. Rule Induction & In-Context Learning": 0.7094444444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667 + }, + "language": { + "Chinese": 0.6172660802474611, + "English": 0.6351558762222995 + } + }, + "pass@2": 0.37133333333333335, + "BoN-3": { + "overall_metric": 0.656766091226133, + "token_length": { + "8k": 0.7136143676151803, + "16k": 0.723535463035206, + "32k": 0.676630607848943, + "64k": 0.6371662004749992, + "128k": 0.6230203333837121, + "256k": 0.5666295749987625 + }, + "contextual_requirement": { + "Full": 0.6274188340913682, + "Partial": 0.6941171457612914 + }, + "difficulty": { + "Easy": 0.8771215787680936, + "Moderate": 0.7073341252772278, + "Hard": 0.5445129149531545, + "Extreme": 0.4558925937418165 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8681438231282913, + "T2. Sequencing & Structure Reconstruction": 0.8017822455322453, + "T3. Evidence-Grounded QA": 0.625, + "T4. Summarization & Synthesis": 0.5352432221044823, + "T5. Attribution & Citation Alignment": 0.6492191718976088, + "T6. Aggregation & Clustering": 0.6200836170157672, + "T7. Consistency & Compliance Checking": 0.4675370424175185, + "T8. Structured & Numeric Reasoning": 0.6592592592592593, + "T9. Version & Code Diff Analysis": 0.7376455774030053, + "T10. Rule Induction & In-Context Learning": 0.7722222222222221, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6 + }, + "language": { + "Chinese": 0.6466196520294557, + "English": 0.6669125304228131 + } + }, + "pass@3": 0.39866666666666667 +} \ No newline at end of file diff --git a/results/DeepSeek-V3.1/nonthinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3.1/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f3f059aa81e94cd065a0953bae9f538b4e86e7b8 --- /dev/null +++ b/results/DeepSeek-V3.1/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.513858634133048, + "inference_iteration_1_overall_metric": 0.5123343209652136, + "inference_iteration_2_overall_metric": 0.5169477472023125, + "inference_iteration_3_overall_metric": 0.5122938342316177, + "average_token_length_metric": { + "8k": 0.5798800160519532, + "16k": 0.557162234839459, + "32k": 0.5231647768475723, + "64k": 0.5020895430155518, + "128k": 0.47482295470763564, + "256k": 0.44603227933611866 + }, + "average_contextual_requirement_metric": { + "Full": 0.4799762392602454, + "Partial": 0.556981682152979 + }, + "average_difficulty_metric": { + "Easy": 0.6361477184952151, + "Moderate": 0.4879546435756716, + "Hard": 0.4929175841249406, + "Extreme": 0.4106651751804595 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7899439606505894, + "T2. Sequencing & Structure Reconstruction": 0.7633837790575241, + "T3. Evidence-Grounded QA": 0.5472222222222223, + "T4. Summarization & Synthesis": 0.5504457320532966, + "T5. Attribution & Citation Alignment": 0.5405950417654154, + "T6. Aggregation & Clustering": 0.4826886948879801, + "T7. Consistency & Compliance Checking": 0.3782532668616311, + "T8. Structured & Numeric Reasoning": 0.20864197530864195, + "T9. Version & Code Diff Analysis": 0.6426366556970474, + "T10. Rule Induction & In-Context Learning": 0.5235185185185185, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4611111111111112 + }, + "average_language_metric": { + "Chinese": 0.5242036714829956, + "English": 0.5035135967831006 + }, + "BoN-1": { + "overall_metric": 0.5123343209652136, + "token_length": { + "8k": 0.5775546994411811, + "16k": 0.5639037168302903, + "32k": 0.5253784942631851, + "64k": 0.4921258363031359, + "128k": 0.46961700213634144, + "256k": 0.4454261768171497 + }, + "contextual_requirement": { + "Full": 0.4805828046453364, + "Partial": 0.5527453417359676 + }, + "difficulty": { + "Easy": 0.6289501731523275, + "Moderate": 0.48863452317355005, + "Hard": 0.49859679584956984, + "Extreme": 0.4091764699789028 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7832093443146274, + "T2. Sequencing & Structure Reconstruction": 0.758390884975293, + "T3. Evidence-Grounded QA": 0.525, + "T4. Summarization & Synthesis": 0.5526444826651147, + "T5. Attribution & Citation Alignment": 0.550721576382896, + "T6. Aggregation & Clustering": 0.49314760273206115, + "T7. Consistency & Compliance Checking": 0.36201890631349554, + "T8. Structured & Numeric Reasoning": 0.19351851851851853, + "T9. Version & Code Diff Analysis": 0.6396574046033499, + "T10. Rule Induction & In-Context Learning": 0.5548611111111109, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667 + }, + "language": { + "Chinese": 0.5269577477981102, + "English": 0.49771089413231795 + } + }, + "pass@1": 0.24, + "BoN-2": { + "overall_metric": 0.5575639036526701, + "token_length": { + "8k": 0.6295556911757817, + "16k": 0.600201088330424, + "32k": 0.568079050385207, + "64k": 0.540874671214598, + "128k": 0.5142278846286528, + "256k": 0.4924450361813656 + }, + "contextual_requirement": { + "Full": 0.5270669681600962, + "Partial": 0.596378185188677 + }, + "difficulty": { + "Easy": 0.6861712816104073, + "Moderate": 0.5385181729697447, + "Hard": 0.5320539370618487, + "Extreme": 0.4459453589628649 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7998040423032986, + "T2. Sequencing & Structure Reconstruction": 0.7974421612945821, + "T3. Evidence-Grounded QA": 0.6083333333333333, + "T4. Summarization & Synthesis": 0.5644516725623473, + "T5. Attribution & Citation Alignment": 0.603125221836478, + "T6. Aggregation & Clustering": 0.5375390883957035, + "T7. Consistency & Compliance Checking": 0.42616915873421196, + "T8. Structured & Numeric Reasoning": 0.2569444444444444, + "T9. Version & Code Diff Analysis": 0.6918022158557025, + "T10. Rule Induction & In-Context Learning": 0.5736111111111112, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5 + }, + "language": { + "Chinese": 0.5701747476373757, + "English": 0.5449530596679679 + } + }, + "pass@2": 0.278, + "BoN-3": { + "overall_metric": 0.5783189478755758, + "token_length": { + "8k": 0.6444603085006093, + "16k": 0.6168368410260194, + "32k": 0.591356956659339, + "64k": 0.5741881409846333, + "128k": 0.5329008044819139, + "256k": 0.5101706356009489 + }, + "contextual_requirement": { + "Full": 0.5453502508711524, + "Partial": 0.6202791076993917 + }, + "difficulty": { + "Easy": 0.700691507162887, + "Moderate": 0.5586892040625308, + "Hard": 0.5577221017016739, + "Extreme": 0.47068692726136246 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8323002597882068, + "T2. Sequencing & Structure Reconstruction": 0.8148032724056932, + "T3. Evidence-Grounded QA": 0.625, + "T4. Summarization & Synthesis": 0.5726119582175254, + "T5. Attribution & Citation Alignment": 0.625566004790705, + "T6. Aggregation & Clustering": 0.5576354028999125, + "T7. Consistency & Compliance Checking": 0.4599095603777464, + "T8. Structured & Numeric Reasoning": 0.2736111111111111, + "T9. Version & Code Diff Analysis": 0.7066934638816516, + "T10. Rule Induction & In-Context Learning": 0.5819444444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333 + }, + "language": { + "Chinese": 0.5882921354915299, + "English": 0.5683457602596252 + } + }, + "pass@3": 0.3 +} \ No newline at end of file diff --git a/results/DeepSeek-V3.1/thinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3.1/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ef74a0fdd4a8c72e383463d2456585b9f2f92385 --- /dev/null +++ b/results/DeepSeek-V3.1/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 8, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.6621817899708398, + "inference_iteration_1_overall_metric": 0.6612230154154042, + "inference_iteration_2_overall_metric": 0.6610111426397741, + "inference_iteration_3_overall_metric": 0.6643112118573413, + "average_token_length_metric": { + "8k": 0.7494820895775017, + "16k": 0.7158886748078707, + "32k": 0.668616684861116, + "64k": 0.7028333128738413, + "128k": 0.6150251691532579, + "256k": 0.5212448085514543 + }, + "average_contextual_requirement_metric": { + "Full": 0.6306995662884327, + "Partial": 0.7022500746575411 + }, + "average_difficulty_metric": { + "Easy": 0.8572162949660228, + "Moderate": 0.7353266184513482, + "Hard": 0.622190936275892, + "Extreme": 0.4267542215146938 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8547667520039813, + "T2. Sequencing & Structure Reconstruction": 0.8502990373823708, + "T3. Evidence-Grounded QA": 0.5944444444444446, + "T4. Summarization & Synthesis": 0.5592502973941748, + "T5. Attribution & Citation Alignment": 0.664951753589773, + "T6. Aggregation & Clustering": 0.6143401320362227, + "T7. Consistency & Compliance Checking": 0.5020434872004602, + "T8. Structured & Numeric Reasoning": 0.7200617283950619, + "T9. Version & Code Diff Analysis": 0.7327346609657337, + "T10. Rule Induction & In-Context Learning": 0.6883796296296296, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5777777777777778 + }, + "average_language_metric": { + "Chinese": 0.6626168849921547, + "English": 0.6617466949495263 + }, + "BoN-1": { + "overall_metric": 0.6612230154154042, + "token_length": { + "8k": 0.7464911564030304, + "16k": 0.7250299866543051, + "32k": 0.658322634935698, + "64k": 0.7169507057254954, + "128k": 0.6020278750216188, + "256k": 0.5185157337522811 + }, + "contextual_requirement": { + "Full": 0.6306907002796491, + "Partial": 0.7000823255881854 + }, + "difficulty": { + "Easy": 0.8474131666511834, + "Moderate": 0.7235563816620875, + "Hard": 0.6392066996365914, + "Extreme": 0.4307791961407244 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.858023793743841, + "T2. Sequencing & Structure Reconstruction": 0.8565782365782365, + "T3. Evidence-Grounded QA": 0.6083333333333333, + "T4. Summarization & Synthesis": 0.5579680390729822, + "T5. Attribution & Citation Alignment": 0.6494356501600668, + "T6. Aggregation & Clustering": 0.6299150042042198, + "T7. Consistency & Compliance Checking": 0.5158224119304962, + "T8. Structured & Numeric Reasoning": 0.6930555555555555, + "T9. Version & Code Diff Analysis": 0.7300925156020261, + "T10. Rule Induction & In-Context Learning": 0.6966666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.55 + }, + "language": { + "Chinese": 0.6664246885361451, + "English": 0.6560213422946652 + } + }, + "pass@1": 0.44533333333333336, + "BoN-2": { + "overall_metric": 0.7247597749822192, + "token_length": { + "8k": 0.7863113902541472, + "16k": 0.7674558325218811, + "32k": 0.7281178415241295, + "64k": 0.7821445768686631, + "128k": 0.6970380441541918, + "256k": 0.5874909645703058 + }, + "contextual_requirement": { + "Full": 0.6960065402743163, + "Partial": 0.7613548009740968 + }, + "difficulty": { + "Easy": 0.9116793007094617, + "Moderate": 0.8131496116751571, + "Hard": 0.7011503473822069, + "Extreme": 0.47744898037225214 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8830939332648011, + "T2. Sequencing & Structure Reconstruction": 0.8806919931919931, + "T3. Evidence-Grounded QA": 0.6916666666666667, + "T4. Summarization & Synthesis": 0.5727671170579177, + "T5. Attribution & Citation Alignment": 0.7446276254629812, + "T6. Aggregation & Clustering": 0.683646617964251, + "T7. Consistency & Compliance Checking": 0.5837976903893948, + "T8. Structured & Numeric Reasoning": 0.7902777777777777, + "T9. Version & Code Diff Analysis": 0.7892333891029187, + "T10. Rule Induction & In-Context Learning": 0.7691666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6416666666666667 + }, + "language": { + "Chinese": 0.7206824886096936, + "English": 0.7288370613547459 + } + }, + "pass@2": 0.5166666666666667, + "BoN-3": { + "overall_metric": 0.7535692341250043, + "token_length": { + "8k": 0.8201462848118084, + "16k": 0.7948417017172265, + "32k": 0.7537778254297324, + "64k": 0.8032112869269382, + "128k": 0.7298931033645517, + "256k": 0.6195452024997666 + }, + "contextual_requirement": { + "Full": 0.726098496655499, + "Partial": 0.7885319909043751 + }, + "difficulty": { + "Easy": 0.9344410438349966, + "Moderate": 0.844684360754373, + "Hard": 0.7404763342771397, + "Extreme": 0.5041859708975693 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9022474827372764, + "T2. Sequencing & Structure Reconstruction": 0.8967201779701777, + "T3. Evidence-Grounded QA": 0.7333333333333333, + "T4. Summarization & Synthesis": 0.5812947594742044, + "T5. Attribution & Citation Alignment": 0.7893345155800883, + "T6. Aggregation & Clustering": 0.7043090297107774, + "T7. Consistency & Compliance Checking": 0.6227060379432763, + "T8. Structured & Numeric Reasoning": 0.8129629629629629, + "T9. Version & Code Diff Analysis": 0.8129681115419459, + "T10. Rule Induction & In-Context Learning": 0.79375, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.7 + }, + "language": { + "Chinese": 0.7447219230644647, + "English": 0.7624165451855438 + } + }, + "pass@3": 0.552 +} \ No newline at end of file diff --git a/results/DeepSeek-V3.2/nonthinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3.2/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d2fb18fa953399c601262552941ba97a0d56aa03 --- /dev/null +++ b/results/DeepSeek-V3.2/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5167049903246114, + "inference_iteration_1_overall_metric": 0.5175993160365915, + "inference_iteration_2_overall_metric": 0.5135807596157895, + "inference_iteration_3_overall_metric": 0.5189348953214519, + "average_token_length_metric": { + "8k": 0.5691616296699119, + "16k": 0.5676134549372556, + "32k": 0.5289760437098003, + "64k": 0.5015696259811485, + "128k": 0.4857001095282947, + "256k": 0.44720907812125815 + }, + "average_contextual_requirement_metric": { + "Full": 0.48158748704864085, + "Partial": 0.5613999944940294 + }, + "average_difficulty_metric": { + "Easy": 0.6212240545937193, + "Moderate": 0.5136346834318135, + "Hard": 0.5163049021668649, + "Extreme": 0.4044885248516518 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7713866456415331, + "T2. Sequencing & Structure Reconstruction": 0.762687420604087, + "T3. Evidence-Grounded QA": 0.5333333333333333, + "T4. Summarization & Synthesis": 0.5515992544844097, + "T5. Attribution & Citation Alignment": 0.5944535310423185, + "T6. Aggregation & Clustering": 0.4789878465188747, + "T7. Consistency & Compliance Checking": 0.39465891182344254, + "T8. Structured & Numeric Reasoning": 0.2149691358024691, + "T9. Version & Code Diff Analysis": 0.6451135379199672, + "T10. Rule Induction & In-Context Learning": 0.49787037037037035, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.46944444444444444 + }, + "average_language_metric": { + "Chinese": 0.5273254322066815, + "English": 0.5060845484425422 + }, + "BoN-1": { + "overall_metric": 0.5175993160365915, + "token_length": { + "8k": 0.5773249164903291, + "16k": 0.5604627326667504, + "32k": 0.5374659261246943, + "64k": 0.5074991434398594, + "128k": 0.4702901190904467, + "256k": 0.45255305840747634 + }, + "contextual_requirement": { + "Full": 0.4801143243240927, + "Partial": 0.5653074873070485 + }, + "difficulty": { + "Easy": 0.6321632804118208, + "Moderate": 0.5036215268809261, + "Hard": 0.5164886663122836, + "Extreme": 0.40201006150807705 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7671950408606149, + "T2. Sequencing & Structure Reconstruction": 0.7640499777999779, + "T3. Evidence-Grounded QA": 0.5833333333333334, + "T4. Summarization & Synthesis": 0.5488326711816013, + "T5. Attribution & Citation Alignment": 0.58422686569879, + "T6. Aggregation & Clustering": 0.47446486157270457, + "T7. Consistency & Compliance Checking": 0.3846286380881271, + "T8. Structured & Numeric Reasoning": 0.2226851851851852, + "T9. Version & Code Diff Analysis": 0.6587133120918426, + "T10. Rule Induction & In-Context Learning": 0.5076388888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.43333333333333335 + }, + "language": { + "Chinese": 0.5202198683535451, + "English": 0.5149787637196416 + } + }, + "pass@1": 0.24533333333333332, + "BoN-2": { + "overall_metric": 0.5858993921620037, + "token_length": { + "8k": 0.6332517244583726, + "16k": 0.6310887297252004, + "32k": 0.6114096243459353, + "64k": 0.5723573466459502, + "128k": 0.5523069746695444, + "256k": 0.5149819531270248 + }, + "contextual_requirement": { + "Full": 0.5499254609160144, + "Partial": 0.6316843955659928 + }, + "difficulty": { + "Easy": 0.6997370485006186, + "Moderate": 0.578931807315116, + "Hard": 0.59328992133015, + "Extreme": 0.46091761656187924 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8076556299673503, + "T2. Sequencing & Structure Reconstruction": 0.8115853128353127, + "T3. Evidence-Grounded QA": 0.65, + "T4. Summarization & Synthesis": 0.5673030178914519, + "T5. Attribution & Citation Alignment": 0.6729248677257385, + "T6. Aggregation & Clustering": 0.5463889541830715, + "T7. Consistency & Compliance Checking": 0.4736347042901523, + "T8. Structured & Numeric Reasoning": 0.2773148148148148, + "T9. Version & Code Diff Analysis": 0.7103491970064771, + "T10. Rule Induction & In-Context Learning": 0.5745833333333332, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5833333333333334 + }, + "language": { + "Chinese": 0.5980713707246905, + "English": 0.5737274135993191 + } + }, + "pass@2": 0.312, + "BoN-3": { + "overall_metric": 0.6250975643039252, + "token_length": { + "8k": 0.6624896013016575, + "16k": 0.667371637798531, + "32k": 0.6458484887458542, + "64k": 0.619893995338142, + "128k": 0.5995137452171235, + "256k": 0.5554679174222451 + }, + "contextual_requirement": { + "Full": 0.5850937206249248, + "Partial": 0.6760115471681097 + }, + "difficulty": { + "Easy": 0.7532527253719965, + "Moderate": 0.6215376875723243, + "Hard": 0.6270141217985696, + "Extreme": 0.48578877254181324 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8312391426962816, + "T2. Sequencing & Structure Reconstruction": 0.8401638639138637, + "T3. Evidence-Grounded QA": 0.725, + "T4. Summarization & Synthesis": 0.5737811360033489, + "T5. Attribution & Citation Alignment": 0.7032234702446885, + "T6. Aggregation & Clustering": 0.5739851542792719, + "T7. Consistency & Compliance Checking": 0.5085181572307016, + "T8. Structured & Numeric Reasoning": 0.33425925925925926, + "T9. Version & Code Diff Analysis": 0.7396125292314827, + "T10. Rule Induction & In-Context Learning": 0.6588888888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6166666666666667 + }, + "language": { + "Chinese": 0.6402707582944462, + "English": 0.6099243703134061 + } + }, + "pass@3": 0.3526666666666667 +} \ No newline at end of file diff --git a/results/DeepSeek-V3.2/thinking_context-120000_bon-3_summary.json b/results/DeepSeek-V3.2/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..67a1cc00e9cb43fb03fcc17e28986ce26181ea7a --- /dev/null +++ b/results/DeepSeek-V3.2/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.6782077426413915, + "inference_iteration_1_overall_metric": 0.671629754030229, + "inference_iteration_2_overall_metric": 0.6777556491690084, + "inference_iteration_3_overall_metric": 0.6852378247249357, + "average_token_length_metric": { + "8k": 0.755369154280727, + "16k": 0.7449467265987637, + "32k": 0.6953336880653428, + "64k": 0.6946800210314833, + "128k": 0.6477035080898761, + "256k": 0.5312133577821595 + }, + "average_contextual_requirement_metric": { + "Full": 0.6459619783148297, + "Partial": 0.7192478063297452 + }, + "average_difficulty_metric": { + "Easy": 0.8502179380964533, + "Moderate": 0.7507860067400632, + "Hard": 0.6772551692268365, + "Extreme": 0.4427333362390087 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8628300416379104, + "T2. Sequencing & Structure Reconstruction": 0.8633894500561163, + "T3. Evidence-Grounded QA": 0.6277777777777778, + "T4. Summarization & Synthesis": 0.5645627813985595, + "T5. Attribution & Citation Alignment": 0.7367830500533472, + "T6. Aggregation & Clustering": 0.6168551563610202, + "T7. Consistency & Compliance Checking": 0.5431477714039084, + "T8. Structured & Numeric Reasoning": 0.6640432098765434, + "T9. Version & Code Diff Analysis": 0.7821104015574073, + "T10. Rule Induction & In-Context Learning": 0.6818518518518517, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.622222222222222 + }, + "average_language_metric": { + "Chinese": 0.6775197474946019, + "English": 0.6788957377881832 + }, + "BoN-1": { + "overall_metric": 0.671629754030229, + "token_length": { + "8k": 0.7397920433374541, + "16k": 0.7269924173975423, + "32k": 0.7007145536231846, + "64k": 0.6696695962094932, + "128k": 0.655131428243527, + "256k": 0.5374784853701818 + }, + "contextual_requirement": { + "Full": 0.6384885103680042, + "Partial": 0.7138095186912471 + }, + "difficulty": { + "Easy": 0.8524722619644284, + "Moderate": 0.7391126766592921, + "Hard": 0.6576844523580323, + "Extreme": 0.4383605238465565 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8608914983834914, + "T2. Sequencing & Structure Reconstruction": 0.8649913049913043, + "T3. Evidence-Grounded QA": 0.6166666666666667, + "T4. Summarization & Synthesis": 0.5629979913624931, + "T5. Attribution & Citation Alignment": 0.727192234350903, + "T6. Aggregation & Clustering": 0.6142105299342737, + "T7. Consistency & Compliance Checking": 0.5448247044942024, + "T8. Structured & Numeric Reasoning": 0.6222222222222223, + "T9. Version & Code Diff Analysis": 0.7868571557580843, + "T10. Rule Induction & In-Context Learning": 0.7038888888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6 + }, + "language": { + "Chinese": 0.6695591460858414, + "English": 0.6737003619746216 + } + }, + "pass@1": 0.442, + "BoN-2": { + "overall_metric": 0.739062545668539, + "token_length": { + "8k": 0.7985986496818439, + "16k": 0.8081758798304621, + "32k": 0.7769565912085228, + "64k": 0.7312553059908137, + "128k": 0.7215196949254835, + "256k": 0.5978691523741125 + }, + "contextual_requirement": { + "Full": 0.7083303283149, + "Partial": 0.7781762768459 + }, + "difficulty": { + "Easy": 0.9085052195225997, + "Moderate": 0.8289156864861236, + "Hard": 0.7590532764337315, + "Extreme": 0.4812983463842697 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8977317620746387, + "T2. Sequencing & Structure Reconstruction": 0.9110066322566314, + "T3. Evidence-Grounded QA": 0.725, + "T4. Summarization & Synthesis": 0.5777725397440469, + "T5. Attribution & Citation Alignment": 0.807338444836897, + "T6. Aggregation & Clustering": 0.6747522573464864, + "T7. Consistency & Compliance Checking": 0.6096717826867489, + "T8. Structured & Numeric Reasoning": 0.7398148148148148, + "T9. Version & Code Diff Analysis": 0.8172408263391231, + "T10. Rule Induction & In-Context Learning": 0.7575, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334 + }, + "language": { + "Chinese": 0.7379779669884229, + "English": 0.7401471243486567 + } + }, + "pass@2": 0.5286666666666666, + "BoN-3": { + "overall_metric": 0.769484517884144, + "token_length": { + "8k": 0.8183989834849181, + "16k": 0.8287125912826261, + "32k": 0.8000120047613876, + "64k": 0.7929440904260506, + "128k": 0.761968193898885, + "256k": 0.6148712434509996 + }, + "contextual_requirement": { + "Full": 0.7396412839189731, + "Partial": 0.8074668156579995 + }, + "difficulty": { + "Easy": 0.9315740410553816, + "Moderate": 0.8685767211690967, + "Hard": 0.8013938214601453, + "Extreme": 0.5058786414037999 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.904701887970005, + "T2. Sequencing & Structure Reconstruction": 0.9204745254745246, + "T3. Evidence-Grounded QA": 0.7583333333333333, + "T4. Summarization & Synthesis": 0.5854934073644178, + "T5. Attribution & Citation Alignment": 0.8343370156947629, + "T6. Aggregation & Clustering": 0.6936792642304824, + "T7. Consistency & Compliance Checking": 0.6489698568119598, + "T8. Structured & Numeric Reasoning": 0.789814814814815, + "T9. Version & Code Diff Analysis": 0.8323537332622081, + "T10. Rule Induction & In-Context Learning": 0.8091666666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.775 + }, + "language": { + "Chinese": 0.76992326145198, + "English": 0.7690457743163093 + } + }, + "pass@3": 0.572 +} \ No newline at end of file diff --git a/results/GLM-4.5/nonthinking_context-120000_bon-3_summary.json b/results/GLM-4.5/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..3a211dbb51908ddda1f1dde58e649fe3509fdfa0 --- /dev/null +++ b/results/GLM-4.5/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.43035083788419254, + "inference_iteration_1_overall_metric": 0.4323298899239496, + "inference_iteration_2_overall_metric": 0.42711968234411496, + "inference_iteration_3_overall_metric": 0.43160294138451205, + "average_token_length_metric": { + "8k": 0.5314683958937002, + "16k": 0.49409349830535854, + "32k": 0.5016963643416883, + "64k": 0.41773091498134723, + "128k": 0.3415048455402667, + "256k": 0.2956110082427921 + }, + "average_contextual_requirement_metric": { + "Full": 0.40615033864921796, + "Partial": 0.46115147327415856 + }, + "average_difficulty_metric": { + "Easy": 0.5567819400237102, + "Moderate": 0.36920960557839383, + "Hard": 0.4020802519560651, + "Extreme": 0.3505786202440932 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.698931526590424, + "T2. Sequencing & Structure Reconstruction": 0.6796812769380679, + "T3. Evidence-Grounded QA": 0.4333333333333333, + "T4. Summarization & Synthesis": 0.5382688702946171, + "T5. Attribution & Citation Alignment": 0.46682998936207576, + "T6. Aggregation & Clustering": 0.3861931118799582, + "T7. Consistency & Compliance Checking": 0.2699805385521367, + "T8. Structured & Numeric Reasoning": 0.19089506172839513, + "T9. Version & Code Diff Analysis": 0.5555800013857407, + "T10. Rule Induction & In-Context Learning": 0.4250462962962961, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3111111111111111 + }, + "average_language_metric": { + "Chinese": 0.4302365847397278, + "English": 0.4304650910286559 + }, + "BoN-1": { + "overall_metric": 0.4323298899239496, + "token_length": { + "8k": 0.5493363038285968, + "16k": 0.5003136640414932, + "32k": 0.48767860893851595, + "64k": 0.427526424572054, + "128k": 0.3355553710370271, + "256k": 0.29356896712600833 + }, + "contextual_requirement": { + "Full": 0.4069098702566493, + "Partial": 0.4646826422277853 + }, + "difficulty": { + "Easy": 0.549387464064302, + "Moderate": 0.38289221280334496, + "Hard": 0.4057019361680662, + "Extreme": 0.35405992762316457 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6989719229147292, + "T2. Sequencing & Structure Reconstruction": 0.6906849631849629, + "T3. Evidence-Grounded QA": 0.4583333333333333, + "T4. Summarization & Synthesis": 0.5384461382258066, + "T5. Attribution & Citation Alignment": 0.4818936352277487, + "T6. Aggregation & Clustering": 0.37379481108384327, + "T7. Consistency & Compliance Checking": 0.26307329406050667, + "T8. Structured & Numeric Reasoning": 0.19120370370370374, + "T9. Version & Code Diff Analysis": 0.5459081401129251, + "T10. Rule Induction & In-Context Learning": 0.43111111111111106, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665 + }, + "language": { + "Chinese": 0.42606506993377413, + "English": 0.4385947099141243 + } + }, + "pass@1": 0.188, + "BoN-2": { + "overall_metric": 0.5016598810015709, + "token_length": { + "8k": 0.6177853878601806, + "16k": 0.5606584472775635, + "32k": 0.5705417377303723, + "64k": 0.49224051142500497, + "128k": 0.4148610132238459, + "256k": 0.3538721884924622 + }, + "contextual_requirement": { + "Full": 0.4781661168763403, + "Partial": 0.531561035342776 + }, + "difficulty": { + "Easy": 0.6469329658307705, + "Moderate": 0.4476620656378584, + "Hard": 0.47548119381384835, + "Extreme": 0.39518120452359784 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7612773821912874, + "T2. Sequencing & Structure Reconstruction": 0.7455342342842339, + "T3. Evidence-Grounded QA": 0.5416666666666666, + "T4. Summarization & Synthesis": 0.5515008189706921, + "T5. Attribution & Citation Alignment": 0.5415377453921782, + "T6. Aggregation & Clustering": 0.46179351546743685, + "T7. Consistency & Compliance Checking": 0.33666684704302524, + "T8. Structured & Numeric Reasoning": 0.24537037037037035, + "T9. Version & Code Diff Analysis": 0.6269577879155583, + "T10. Rule Induction & In-Context Learning": 0.5281944444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333 + }, + "language": { + "Chinese": 0.48816204809318214, + "English": 0.5151577139099619 + } + }, + "pass@2": 0.23933333333333334, + "BoN-3": { + "overall_metric": 0.5384969000760513, + "token_length": { + "8k": 0.6356799930117969, + "16k": 0.5929083561253189, + "32k": 0.6076554475255213, + "64k": 0.5450784408185039, + "128k": 0.4563507575799897, + "256k": 0.3933084053951815 + }, + "contextual_requirement": { + "Full": 0.5134959721986003, + "Partial": 0.5703162628291728 + }, + "difficulty": { + "Easy": 0.6991002895184023, + "Moderate": 0.4873593462709992, + "Hard": 0.49915491813734847, + "Extreme": 0.4219917912549902 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7936494394174134, + "T2. Sequencing & Structure Reconstruction": 0.7682061919561918, + "T3. Evidence-Grounded QA": 0.6, + "T4. Summarization & Synthesis": 0.5604787610213925, + "T5. Attribution & Citation Alignment": 0.5757231079007503, + "T6. Aggregation & Clustering": 0.4903242104980166, + "T7. Consistency & Compliance Checking": 0.3811432372555578, + "T8. Structured & Numeric Reasoning": 0.274537037037037, + "T9. Version & Code Diff Analysis": 0.6567859123578721, + "T10. Rule Induction & In-Context Learning": 0.5740277777777778, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334 + }, + "language": { + "Chinese": 0.5274718845754229, + "English": 0.5495219155766814 + } + }, + "pass@3": 0.27466666666666667 +} \ No newline at end of file diff --git a/results/GLM-4.5/thinking_context-120000_bon-3_summary.json b/results/GLM-4.5/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1b3d4b1d4a70ce982dac62d983a27d967fb394cf --- /dev/null +++ b/results/GLM-4.5/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 2, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5547937875815533, + "inference_iteration_1_overall_metric": 0.5516398292433491, + "inference_iteration_2_overall_metric": 0.5535867950098665, + "inference_iteration_3_overall_metric": 0.559154738491441, + "average_token_length_metric": { + "8k": 0.6972820277601768, + "16k": 0.6560112539595868, + "32k": 0.6029656036576351, + "64k": 0.5486294944947675, + "128k": 0.4403706772307823, + "256k": 0.38350366838636923 + }, + "average_contextual_requirement_metric": { + "Full": 0.5245120995377551, + "Partial": 0.593334117819114 + }, + "average_difficulty_metric": { + "Easy": 0.7655351117837857, + "Moderate": 0.5513197070461822, + "Hard": 0.473832729215578, + "Extreme": 0.37939478048385467 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.81334590378291, + "T2. Sequencing & Structure Reconstruction": 0.7505160653892534, + "T3. Evidence-Grounded QA": 0.5027777777777779, + "T4. Summarization & Synthesis": 0.5448942860165678, + "T5. Attribution & Citation Alignment": 0.5301683776855378, + "T6. Aggregation & Clustering": 0.5045905245734419, + "T7. Consistency & Compliance Checking": 0.3519808997100079, + "T8. Structured & Numeric Reasoning": 0.6371913580246912, + "T9. Version & Code Diff Analysis": 0.6439646495440424, + "T10. Rule Induction & In-Context Learning": 0.5808333333333334, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3277777777777777 + }, + "average_language_metric": { + "Chinese": 0.5739046099726642, + "English": 0.5356829651904418 + }, + "BoN-1": { + "overall_metric": 0.5516398292433491, + "token_length": { + "8k": 0.7055429088605052, + "16k": 0.6578481199386829, + "32k": 0.5956492591185892, + "64k": 0.5401522196952867, + "128k": 0.42395107911700697, + "256k": 0.386695388730029 + }, + "contextual_requirement": { + "Full": 0.5177580971479406, + "Partial": 0.5947620337284167 + }, + "difficulty": { + "Easy": 0.752722499925911, + "Moderate": 0.5341124686360985, + "Hard": 0.4868060322041808, + "Extreme": 0.3854592094497618 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.802009002313031, + "T2. Sequencing & Structure Reconstruction": 0.7465347152847152, + "T3. Evidence-Grounded QA": 0.5, + "T4. Summarization & Synthesis": 0.545377281117479, + "T5. Attribution & Citation Alignment": 0.5499549412575729, + "T6. Aggregation & Clustering": 0.5019983393746833, + "T7. Consistency & Compliance Checking": 0.36021434419736664, + "T8. Structured & Numeric Reasoning": 0.6138888888888889, + "T9. Version & Code Diff Analysis": 0.6552473446554453, + "T10. Rule Induction & In-Context Learning": 0.5405555555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667 + }, + "language": { + "Chinese": 0.5760533640670983, + "English": 0.5272262944196019 + } + }, + "pass@1": 0.304, + "BoN-2": { + "overall_metric": 0.634985055308529, + "token_length": { + "8k": 0.7829957270581746, + "16k": 0.720868655519701, + "32k": 0.6955532292535292, + "64k": 0.6191323026708053, + "128k": 0.5371685233314611, + "256k": 0.45419189401750604 + }, + "contextual_requirement": { + "Full": 0.6004623267171497, + "Partial": 0.67892307351574 + }, + "difficulty": { + "Easy": 0.8494656930118193, + "Moderate": 0.6521983830806934, + "Hard": 0.561519594238419, + "Extreme": 0.43697868974062276 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8580534190067889, + "T2. Sequencing & Structure Reconstruction": 0.8009605209605205, + "T3. Evidence-Grounded QA": 0.6166666666666667, + "T4. Summarization & Synthesis": 0.5623513196545108, + "T5. Attribution & Citation Alignment": 0.6128841050494817, + "T6. Aggregation & Clustering": 0.5858431630098295, + "T7. Consistency & Compliance Checking": 0.42985731617797807, + "T8. Structured & Numeric Reasoning": 0.7472222222222222, + "T9. Version & Code Diff Analysis": 0.7035408856813825, + "T10. Rule Induction & In-Context Learning": 0.6718055555555555, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667 + }, + "language": { + "Chinese": 0.6487038966161633, + "English": 0.6212662140008957 + } + }, + "pass@2": 0.4, + "BoN-3": { + "overall_metric": 0.6753829492782434, + "token_length": { + "8k": 0.8185554078642306, + "16k": 0.7599530890518512, + "32k": 0.7311990883460826, + "64k": 0.6754068771726057, + "128k": 0.5730612976856176, + "256k": 0.49412193554907724 + }, + "contextual_requirement": { + "Full": 0.6412352384515387, + "Partial": 0.718843672148596 + }, + "difficulty": { + "Easy": 0.8844767087888782, + "Moderate": 0.717837978263145, + "Hard": 0.600883682680195, + "Extreme": 0.4673774778829585 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.896714960990196, + "T2. Sequencing & Structure Reconstruction": 0.8421179283679281, + "T3. Evidence-Grounded QA": 0.675, + "T4. Summarization & Synthesis": 0.5712469046961466, + "T5. Attribution & Citation Alignment": 0.6300235179241888, + "T6. Aggregation & Clustering": 0.6348800775177585, + "T7. Consistency & Compliance Checking": 0.4736376213352886, + "T8. Structured & Numeric Reasoning": 0.7824074074074073, + "T9. Version & Code Diff Analysis": 0.7424625612755764, + "T10. Rule Induction & In-Context Learning": 0.765, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334 + }, + "language": { + "Chinese": 0.6895150466449506, + "English": 0.6612508519115379 + } + }, + "pass@3": 0.444 +} \ No newline at end of file diff --git a/results/GLM-4.6/nonthinking_context-120000_bon-3_summary.json b/results/GLM-4.6/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..80081a99bad607c4b85bc39af6499da435db9b66 --- /dev/null +++ b/results/GLM-4.6/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.45854238430368943, + "inference_iteration_1_overall_metric": 0.44890397156188516, + "inference_iteration_2_overall_metric": 0.4676756901179884, + "inference_iteration_3_overall_metric": 0.45904749123119587, + "average_token_length_metric": { + "8k": 0.539826456202734, + "16k": 0.49883990878468565, + "32k": 0.5226004279628154, + "64k": 0.4617605114078172, + "128k": 0.3868307627999842, + "256k": 0.34139623866410224 + }, + "average_contextual_requirement_metric": { + "Full": 0.43758451425043066, + "Partial": 0.4852160370987465 + }, + "average_difficulty_metric": { + "Easy": 0.5881815896217503, + "Moderate": 0.40045739164324096, + "Hard": 0.43071501653405486, + "Extreme": 0.3729573279423014 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7391294273870368, + "T2. Sequencing & Structure Reconstruction": 0.7168876379566528, + "T3. Evidence-Grounded QA": 0.48333333333333334, + "T4. Summarization & Synthesis": 0.54430615481068, + "T5. Attribution & Citation Alignment": 0.5036524549754317, + "T6. Aggregation & Clustering": 0.4230515938862694, + "T7. Consistency & Compliance Checking": 0.28024190494681, + "T8. Structured & Numeric Reasoning": 0.204783950617284, + "T9. Version & Code Diff Analysis": 0.5493453618981832, + "T10. Rule Induction & In-Context Learning": 0.48856481481481484, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3444444444444445 + }, + "average_language_metric": { + "Chinese": 0.4607293656684521, + "English": 0.45635540293892746 + }, + "BoN-1": { + "overall_metric": 0.44890397156188516, + "token_length": { + "8k": 0.5369079824065298, + "16k": 0.4950184758232463, + "32k": 0.5119378052749619, + "64k": 0.4359616248204175, + "128k": 0.3900490912931948, + "256k": 0.323548849752962 + }, + "contextual_requirement": { + "Full": 0.42475510461227917, + "Partial": 0.47963889313411256 + }, + "difficulty": { + "Easy": 0.5839417584803529, + "Moderate": 0.38543423470966093, + "Hard": 0.4251833017579716, + "Extreme": 0.3582444584458006 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7552782575667283, + "T2. Sequencing & Structure Reconstruction": 0.72015059015059, + "T3. Evidence-Grounded QA": 0.425, + "T4. Summarization & Synthesis": 0.5415035414147457, + "T5. Attribution & Citation Alignment": 0.49621823831328055, + "T6. Aggregation & Clustering": 0.41038094275648807, + "T7. Consistency & Compliance Checking": 0.27110571290315905, + "T8. Structured & Numeric Reasoning": 0.1953703703703704, + "T9. Version & Code Diff Analysis": 0.5256412558109732, + "T10. Rule Induction & In-Context Learning": 0.4822222222222222, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.35 + }, + "language": { + "Chinese": 0.44671558878002343, + "English": 0.45109235434374795 + } + }, + "pass@1": 0.19266666666666668, + "BoN-2": { + "overall_metric": 0.5266630332287439, + "token_length": { + "8k": 0.604236555945032, + "16k": 0.5564543546519637, + "32k": 0.5870438443448399, + "64k": 0.5387565602861155, + "128k": 0.46692762105066604, + "256k": 0.4065592630938467 + }, + "contextual_requirement": { + "Full": 0.5015019335956029, + "Partial": 0.5586862509436522 + }, + "difficulty": { + "Easy": 0.6773656505276844, + "Moderate": 0.47328676466113107, + "Hard": 0.49837787163941455, + "Extreme": 0.4152118781770787 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7867939033692525, + "T2. Sequencing & Structure Reconstruction": 0.7854105213890441, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5572174880251567, + "T5. Attribution & Citation Alignment": 0.573956288098141, + "T6. Aggregation & Clustering": 0.4915724205045227, + "T7. Consistency & Compliance Checking": 0.33460325146257974, + "T8. Structured & Numeric Reasoning": 0.2578703703703704, + "T9. Version & Code Diff Analysis": 0.6423128731937172, + "T10. Rule Induction & In-Context Learning": 0.6198611111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.425 + }, + "language": { + "Chinese": 0.518790207475404, + "English": 0.5345358589820849 + } + }, + "pass@2": 0.25466666666666665, + "BoN-3": { + "overall_metric": 0.5609610306922399, + "token_length": { + "8k": 0.6386076789983793, + "16k": 0.5860184400135818, + "32k": 0.6210153091497477, + "64k": 0.5871225156541778, + "128k": 0.4913536643924111, + "256k": 0.4416485759451446 + }, + "contextual_requirement": { + "Full": 0.5370910562726694, + "Partial": 0.5913409981353319 + }, + "difficulty": { + "Easy": 0.7100406216614994, + "Moderate": 0.5174437099567866, + "Hard": 0.5265960486680222, + "Extreme": 0.44880562762488274 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.815117939706351, + "T2. Sequencing & Structure Reconstruction": 0.8064498061783285, + "T3. Evidence-Grounded QA": 0.6416666666666667, + "T4. Summarization & Synthesis": 0.5680717589841464, + "T5. Attribution & Citation Alignment": 0.604565984118908, + "T6. Aggregation & Clustering": 0.5283992168363204, + "T7. Consistency & Compliance Checking": 0.35872230397543203, + "T8. Structured & Numeric Reasoning": 0.28935185185185186, + "T9. Version & Code Diff Analysis": 0.6707362245587523, + "T10. Rule Induction & In-Context Learning": 0.6406944444444443, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5 + }, + "language": { + "Chinese": 0.5609588769304155, + "English": 0.5609631844540666 + } + }, + "pass@3": 0.286 +} \ No newline at end of file diff --git a/results/GLM-4.6/thinking_context-120000_bon-3_summary.json b/results/GLM-4.6/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..42a80ff8e04de9b7cdc2bb58d04268e6b7a11256 --- /dev/null +++ b/results/GLM-4.6/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5820993757625644, + "inference_iteration_1_overall_metric": 0.5900318347862288, + "inference_iteration_2_overall_metric": 0.5774825139114689, + "inference_iteration_3_overall_metric": 0.5787837785899949, + "average_token_length_metric": { + "8k": 0.7122784818137915, + "16k": 0.6603518496747058, + "32k": 0.6352743108645184, + "64k": 0.5897286272690893, + "128k": 0.475467875017661, + "256k": 0.4194951099356217 + }, + "average_contextual_requirement_metric": { + "Full": 0.5470143190278319, + "Partial": 0.6267530843340428 + }, + "average_difficulty_metric": { + "Easy": 0.7978473417092227, + "Moderate": 0.6094768922677877, + "Hard": 0.4892370620605133, + "Extreme": 0.3887688912252786 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8197327977970514, + "T2. Sequencing & Structure Reconstruction": 0.8006519321293782, + "T3. Evidence-Grounded QA": 0.538888888888889, + "T4. Summarization & Synthesis": 0.5408566771607968, + "T5. Attribution & Citation Alignment": 0.5337112988588841, + "T6. Aggregation & Clustering": 0.5397680321862239, + "T7. Consistency & Compliance Checking": 0.380513781495624, + "T8. Structured & Numeric Reasoning": 0.6123456790123456, + "T9. Version & Code Diff Analysis": 0.6754501038965057, + "T10. Rule Induction & In-Context Learning": 0.6013425925925924, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.46666666666666673 + }, + "average_language_metric": { + "Chinese": 0.5991918774535788, + "English": 0.5650068740715505 + }, + "BoN-1": { + "overall_metric": 0.5900318347862288, + "token_length": { + "8k": 0.7190910013269595, + "16k": 0.6680291983169964, + "32k": 0.6447298296516131, + "64k": 0.5905857251798682, + "128k": 0.4766512837488421, + "256k": 0.4411039704930917 + }, + "contextual_requirement": { + "Full": 0.5455741223917344, + "Partial": 0.6466143778337665 + }, + "difficulty": { + "Easy": 0.8187580590040169, + "Moderate": 0.6184177733393077, + "Hard": 0.4890540077544932, + "Extreme": 0.38715232500749663 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8264146064969754, + "T2. Sequencing & Structure Reconstruction": 0.8039418272654707, + "T3. Evidence-Grounded QA": 0.55, + "T4. Summarization & Synthesis": 0.5427878753470068, + "T5. Attribution & Citation Alignment": 0.5419657251498339, + "T6. Aggregation & Clustering": 0.5399584229450126, + "T7. Consistency & Compliance Checking": 0.37011974365960826, + "T8. Structured & Numeric Reasoning": 0.6416666666666667, + "T9. Version & Code Diff Analysis": 0.693087317328304, + "T10. Rule Induction & In-Context Learning": 0.6145833333333334, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.475 + }, + "language": { + "Chinese": 0.6075617792305364, + "English": 0.5725018903419208 + } + }, + "pass@1": 0.36466666666666664, + "BoN-2": { + "overall_metric": 0.6659160211594685, + "token_length": { + "8k": 0.7935291454736249, + "16k": 0.7469698033059613, + "32k": 0.7147019641303554, + "64k": 0.6716314717341791, + "128k": 0.5707409059251077, + "256k": 0.497922836387586 + }, + "contextual_requirement": { + "Full": 0.6309312423015924, + "Partial": 0.7104421033422219 + }, + "difficulty": { + "Easy": 0.8923409007483016, + "Moderate": 0.7327459154582487, + "Hard": 0.5632013922542815, + "Extreme": 0.4414476037490934 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.880882952904511, + "T2. Sequencing & Structure Reconstruction": 0.8476685999185997, + "T3. Evidence-Grounded QA": 0.6666666666666666, + "T4. Summarization & Synthesis": 0.5583277247093672, + "T5. Attribution & Citation Alignment": 0.6177785331058078, + "T6. Aggregation & Clustering": 0.6268647075743846, + "T7. Consistency & Compliance Checking": 0.4566279411326888, + "T8. Structured & Numeric Reasoning": 0.7217592592592593, + "T9. Version & Code Diff Analysis": 0.7464145919055787, + "T10. Rule Induction & In-Context Learning": 0.7400000000000001, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5583333333333333 + }, + "language": { + "Chinese": 0.6881369643762367, + "English": 0.6436950779427024 + } + }, + "pass@2": 0.43866666666666665, + "BoN-3": { + "overall_metric": 0.706844154531168, + "token_length": { + "8k": 0.8222844269368486, + "16k": 0.7772591556882481, + "32k": 0.7466099276083229, + "64k": 0.7234900151981559, + "128k": 0.62014724878678, + "256k": 0.551274152968658 + }, + "contextual_requirement": { + "Full": 0.6725119993190923, + "Partial": 0.7505396248010854 + }, + "difficulty": { + "Easy": 0.9285919042851156, + "Moderate": 0.7918869916930842, + "Hard": 0.6026672046571206, + "Extreme": 0.4764972072411785 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9055704999975606, + "T2. Sequencing & Structure Reconstruction": 0.8679013780100734, + "T3. Evidence-Grounded QA": 0.7416666666666667, + "T4. Summarization & Synthesis": 0.5683880603721403, + "T5. Attribution & Citation Alignment": 0.6545164467516165, + "T6. Aggregation & Clustering": 0.6671955091257745, + "T7. Consistency & Compliance Checking": 0.5102082868832639, + "T8. Structured & Numeric Reasoning": 0.7717592592592593, + "T9. Version & Code Diff Analysis": 0.7612642969391072, + "T10. Rule Induction & In-Context Learning": 0.7541666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333 + }, + "language": { + "Chinese": 0.7225009620618484, + "English": 0.6911873470004898 + } + }, + "pass@3": 0.48133333333333334 +} \ No newline at end of file diff --git a/results/GPT-4o/nonthinking_context-120000_bon-3_summary.json b/results/GPT-4o/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f34052712384c2995792bfc0438bdf36c43f10ec --- /dev/null +++ b/results/GPT-4o/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.46665010092977977, + "inference_iteration_1_overall_metric": 0.4658401067882854, + "inference_iteration_2_overall_metric": 0.46753496394327626, + "inference_iteration_3_overall_metric": 0.466575232057776, + "average_token_length_metric": { + "8k": 0.5113488376851383, + "16k": 0.4997009141224516, + "32k": 0.5251055066966325, + "64k": 0.45692433752384126, + "128k": 0.4357776587958875, + "256k": 0.37104335075472816 + }, + "average_contextual_requirement_metric": { + "Full": 0.4340554247509889, + "Partial": 0.5081342342482407 + }, + "average_difficulty_metric": { + "Easy": 0.5937760942513476, + "Moderate": 0.4302951347009006, + "Hard": 0.4487990540053617, + "Extreme": 0.3629928487032055 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7349022081832958, + "T2. Sequencing & Structure Reconstruction": 0.7250800279966945, + "T3. Evidence-Grounded QA": 0.522222222222222, + "T4. Summarization & Synthesis": 0.5082664460738612, + "T5. Attribution & Citation Alignment": 0.5342878338439898, + "T6. Aggregation & Clustering": 0.4265212400920285, + "T7. Consistency & Compliance Checking": 0.27964395302062434, + "T8. Structured & Numeric Reasoning": 0.21157407407407405, + "T9. Version & Code Diff Analysis": 0.5920364002998717, + "T10. Rule Induction & In-Context Learning": 0.4702777777777778, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.36944444444444435 + }, + "average_language_metric": { + "Chinese": 0.4566116864733059, + "English": 0.47668851538625334 + }, + "BoN-1": { + "overall_metric": 0.4658401067882854, + "token_length": { + "8k": 0.49942481947489, + "16k": 0.5075293232399243, + "32k": 0.535471182543319, + "64k": 0.4466006941173288, + "128k": 0.43362259025283534, + "256k": 0.37239203110142105 + }, + "contextual_requirement": { + "Full": 0.434998076864218, + "Partial": 0.5050935994189195 + }, + "difficulty": { + "Easy": 0.5852393493724929, + "Moderate": 0.43887833780663893, + "Hard": 0.4558206214636842, + "Extreme": 0.3593336239903742 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7408543245056541, + "T2. Sequencing & Structure Reconstruction": 0.7499819162319159, + "T3. Evidence-Grounded QA": 0.5166666666666667, + "T4. Summarization & Synthesis": 0.5048929077004661, + "T5. Attribution & Citation Alignment": 0.5359141447270879, + "T6. Aggregation & Clustering": 0.42758922871826105, + "T7. Consistency & Compliance Checking": 0.27436727289246843, + "T8. Structured & Numeric Reasoning": 0.2152777777777778, + "T9. Version & Code Diff Analysis": 0.592312178161249, + "T10. Rule Induction & In-Context Learning": 0.44819444444444445, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334 + }, + "language": { + "Chinese": 0.4424240788239187, + "English": 0.48925613475265434 + } + }, + "pass@1": 0.194, + "BoN-2": { + "overall_metric": 0.5398849816112041, + "token_length": { + "8k": 0.5759687409807404, + "16k": 0.549906030476181, + "32k": 0.6066920241775022, + "64k": 0.5317817512096467, + "128k": 0.5214872601681293, + "256k": 0.4534740826550301 + }, + "contextual_requirement": { + "Full": 0.5083509910328655, + "Partial": 0.5800191514381833 + }, + "difficulty": { + "Easy": 0.6782240284299118, + "Moderate": 0.5120970500900949, + "Hard": 0.5256120585356611, + "Extreme": 0.41596717800169664 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7993838319887435, + "T2. Sequencing & Structure Reconstruction": 0.7934912309912311, + "T3. Evidence-Grounded QA": 0.6666666666666666, + "T4. Summarization & Synthesis": 0.5276777173176291, + "T5. Attribution & Citation Alignment": 0.6042284709538017, + "T6. Aggregation & Clustering": 0.4920141039764696, + "T7. Consistency & Compliance Checking": 0.34899250013259275, + "T8. Structured & Numeric Reasoning": 0.26296296296296295, + "T9. Version & Code Diff Analysis": 0.6671322238361709, + "T10. Rule Induction & In-Context Learning": 0.5840277777777777, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.45 + }, + "language": { + "Chinese": 0.525731927704756, + "English": 0.5540380355176548 + } + }, + "pass@2": 0.25466666666666665, + "BoN-3": { + "overall_metric": 0.5744593750081309, + "token_length": { + "8k": 0.606512147802086, + "16k": 0.5958370007923852, + "32k": 0.6384127504911996, + "64k": 0.5707759455131253, + "128k": 0.5568976109117445, + "256k": 0.4783207945382497 + }, + "contextual_requirement": { + "Full": 0.5391592397427731, + "Partial": 0.6193868198913163 + }, + "difficulty": { + "Easy": 0.7196856800163218, + "Moderate": 0.540664277732673, + "Hard": 0.5601282438717835, + "Extreme": 0.44698074091055134 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8103075764869724, + "T2. Sequencing & Structure Reconstruction": 0.8166357716357717, + "T3. Evidence-Grounded QA": 0.7, + "T4. Summarization & Synthesis": 0.5377648665242384, + "T5. Attribution & Citation Alignment": 0.6423496422537093, + "T6. Aggregation & Clustering": 0.532547329213996, + "T7. Consistency & Compliance Checking": 0.40458496685251083, + "T8. Structured & Numeric Reasoning": 0.30046296296296293, + "T9. Version & Code Diff Analysis": 0.6915969977123051, + "T10. Rule Induction & In-Context Learning": 0.6340277777777777, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.49166666666666664 + }, + "language": { + "Chinese": 0.563099563818372, + "English": 0.5858191861978923 + } + }, + "pass@3": 0.286 +} \ No newline at end of file diff --git a/results/GPT-4o/thinking_context-120000_bon-3_summary.json b/results/GPT-4o/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..09045b4f19a91cc5d02399fa1e1d6a228181932f --- /dev/null +++ b/results/GPT-4o/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.4943586900400841, + "inference_iteration_1_overall_metric": 0.4968481802669354, + "inference_iteration_2_overall_metric": 0.4953906178052376, + "inference_iteration_3_overall_metric": 0.49083727204807814, + "average_token_length_metric": { + "8k": 0.5879811335998006, + "16k": 0.5326400416753286, + "32k": 0.512948102728002, + "64k": 0.4721690409999518, + "128k": 0.44658724759711643, + "256k": 0.41382657364030534 + }, + "average_contextual_requirement_metric": { + "Full": 0.46488316610562114, + "Partial": 0.5318729932294004 + }, + "average_difficulty_metric": { + "Easy": 0.7183848834775743, + "Moderate": 0.43069679620968054, + "Hard": 0.41352044386464876, + "Extreme": 0.34385849736921437 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7592784040430812, + "T2. Sequencing & Structure Reconstruction": 0.6992785706820794, + "T3. Evidence-Grounded QA": 0.41944444444444456, + "T4. Summarization & Synthesis": 0.4904128501144795, + "T5. Attribution & Citation Alignment": 0.5555445495468067, + "T6. Aggregation & Clustering": 0.46463843332938043, + "T7. Consistency & Compliance Checking": 0.26892122251820344, + "T8. Structured & Numeric Reasoning": 0.4810185185185185, + "T9. Version & Code Diff Analysis": 0.5265279154913766, + "T10. Rule Induction & In-Context Learning": 0.48212962962962963, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4250000000000001 + }, + "average_language_metric": { + "Chinese": 0.46257560421163496, + "English": 0.526141775868533 + }, + "BoN-1": { + "overall_metric": 0.4968481802669354, + "token_length": { + "8k": 0.5741867935281576, + "16k": 0.535374756046493, + "32k": 0.5202544808789716, + "64k": 0.47647427072064424, + "128k": 0.4412042204935842, + "256k": 0.4335945599337654 + }, + "contextual_requirement": { + "Full": 0.47247358832994213, + "Partial": 0.5278703881867464 + }, + "difficulty": { + "Easy": 0.7337781908034996, + "Moderate": 0.4336235016656763, + "Hard": 0.39945319018108144, + "Extreme": 0.3428000420213732 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.75748852621513, + "T2. Sequencing & Structure Reconstruction": 0.7128378857984122, + "T3. Evidence-Grounded QA": 0.4166666666666667, + "T4. Summarization & Synthesis": 0.4915611304736425, + "T5. Attribution & Citation Alignment": 0.5573645055278006, + "T6. Aggregation & Clustering": 0.4608094132930736, + "T7. Consistency & Compliance Checking": 0.2337605934000851, + "T8. Structured & Numeric Reasoning": 0.5046296296296297, + "T9. Version & Code Diff Analysis": 0.5617174175041987, + "T10. Rule Induction & In-Context Learning": 0.5141666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4 + }, + "language": { + "Chinese": 0.46378379039245493, + "English": 0.5299125701414172 + } + }, + "pass@1": 0.25266666666666665, + "BoN-2": { + "overall_metric": 0.5733973874130963, + "token_length": { + "8k": 0.6607408168420358, + "16k": 0.6168176180801352, + "32k": 0.6026001586229682, + "64k": 0.5549119793217003, + "128k": 0.5219672618111347, + "256k": 0.4833464898006067 + }, + "contextual_requirement": { + "Full": 0.5468729804164447, + "Partial": 0.6071557235906547 + }, + "difficulty": { + "Easy": 0.832354149230665, + "Moderate": 0.5035087598472016, + "Hard": 0.482911736547395, + "Extreme": 0.39505876757369657 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8194314757440243, + "T2. Sequencing & Structure Reconstruction": 0.7647144522144521, + "T3. Evidence-Grounded QA": 0.5166666666666667, + "T4. Summarization & Synthesis": 0.507756894787762, + "T5. Attribution & Citation Alignment": 0.6431090618973574, + "T6. Aggregation & Clustering": 0.5325685690744169, + "T7. Consistency & Compliance Checking": 0.31928948521783374, + "T8. Structured & Numeric Reasoning": 0.5949074074074073, + "T9. Version & Code Diff Analysis": 0.5981405988039616, + "T10. Rule Induction & In-Context Learning": 0.6058333333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666 + }, + "language": { + "Chinese": 0.5418805636874472, + "English": 0.6049142111387469 + } + }, + "pass@2": 0.33, + "BoN-3": { + "overall_metric": 0.6133026799603023, + "token_length": { + "8k": 0.7092280973251848, + "16k": 0.6573848999983455, + "32k": 0.6374049433271965, + "64k": 0.5884360272634136, + "128k": 0.5736764233126421, + "256k": 0.513685688535035 + }, + "contextual_requirement": { + "Full": 0.585559935676702, + "Partial": 0.6486116272303409 + }, + "difficulty": { + "Easy": 0.8579938866329095, + "Moderate": 0.5531379960724733, + "Hard": 0.5375404413799146, + "Extreme": 0.4345338594537549 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8365910773415853, + "T2. Sequencing & Structure Reconstruction": 0.7977771302771303, + "T3. Evidence-Grounded QA": 0.5916666666666667, + "T4. Summarization & Synthesis": 0.5159806774505266, + "T5. Attribution & Citation Alignment": 0.6935011362804332, + "T6. Aggregation & Clustering": 0.5705653708431486, + "T7. Consistency & Compliance Checking": 0.381137305153889, + "T8. Structured & Numeric Reasoning": 0.6296296296296297, + "T9. Version & Code Diff Analysis": 0.6584905752696673, + "T10. Rule Induction & In-Context Learning": 0.6336111111111112, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667 + }, + "language": { + "Chinese": 0.5780672262010988, + "English": 0.6485381337195074 + } + }, + "pass@3": 0.36666666666666664 +} \ No newline at end of file diff --git a/results/GPT-5/thinking_context-272000_bon-3_summary.json b/results/GPT-5/thinking_context-272000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..184b0204bdfe3a96f1df404699b209378784eb42 --- /dev/null +++ b/results/GPT-5/thinking_context-272000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.726053089253122, + "inference_iteration_1_overall_metric": 0.7242860759291603, + "inference_iteration_2_overall_metric": 0.72436075729001, + "inference_iteration_3_overall_metric": 0.729512434540192, + "average_token_length_metric": { + "8k": 0.7537078410340138, + "16k": 0.7627066310839429, + "32k": 0.7434290864816196, + "64k": 0.7646193918174649, + "128k": 0.6936202889645278, + "256k": 0.638235296137159 + }, + "average_contextual_requirement_metric": { + "Full": 0.6915568234658586, + "Partial": 0.7699574275278195 + }, + "average_difficulty_metric": { + "Easy": 0.8523326045847652, + "Moderate": 0.8231088494697211, + "Hard": 0.787367547123676, + "Extreme": 0.4836991814871219 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.9032376385150938, + "T2. Sequencing & Structure Reconstruction": 0.9075063054229715, + "T3. Evidence-Grounded QA": 0.6666666666666666, + "T4. Summarization & Synthesis": 0.5256066584699448, + "T5. Attribution & Citation Alignment": 0.8116994715897818, + "T6. Aggregation & Clustering": 0.6716265654111317, + "T7. Consistency & Compliance Checking": 0.631179283519898, + "T8. Structured & Numeric Reasoning": 0.7979938271604939, + "T9. Version & Code Diff Analysis": 0.818404768269679, + "T10. Rule Induction & In-Context Learning": 0.6802314814814814, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6111111111111112 + }, + "average_language_metric": { + "Chinese": 0.7196645097291159, + "English": 0.7324416687771269 + }, + "BoN-1": { + "overall_metric": 0.7242860759291603, + "token_length": { + "8k": 0.7638228227994025, + "16k": 0.7511485364018967, + "32k": 0.7397315002658593, + "64k": 0.7648062624572959, + "128k": 0.6947065191324134, + "256k": 0.6315008145180959 + }, + "contextual_requirement": { + "Full": 0.6845638507619599, + "Partial": 0.7748416352328712 + }, + "difficulty": { + "Easy": 0.8419121655420269, + "Moderate": 0.8140896757444649, + "Hard": 0.8018107002313927, + "Extreme": 0.4855278214669571 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9022908711992736, + "T2. Sequencing & Structure Reconstruction": 0.9003492803492802, + "T3. Evidence-Grounded QA": 0.6666666666666666, + "T4. Summarization & Synthesis": 0.525285592483348, + "T5. Attribution & Citation Alignment": 0.8350389199886978, + "T6. Aggregation & Clustering": 0.6728116198035761, + "T7. Consistency & Compliance Checking": 0.6250527729039961, + "T8. Structured & Numeric Reasoning": 0.7824074074074074, + "T9. Version & Code Diff Analysis": 0.8228424738103258, + "T10. Rule Induction & In-Context Learning": 0.6890277777777778, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5916666666666667 + }, + "language": { + "Chinese": 0.7225808137285838, + "English": 0.725991338129738 + } + }, + "pass@1": 0.5033333333333333, + "BoN-2": { + "overall_metric": 0.773365567880672, + "token_length": { + "8k": 0.7988567725267066, + "16k": 0.7953552672252621, + "32k": 0.7853032014648265, + "64k": 0.8171591510524335, + "128k": 0.7387615265550217, + "256k": 0.7047574884597809 + }, + "contextual_requirement": { + "Full": 0.740254405395005, + "Partial": 0.8155070474078848 + }, + "difficulty": { + "Easy": 0.8943471956938479, + "Moderate": 0.8694949682853881, + "Hard": 0.8603608124174508, + "Extreme": 0.5205560974396651 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9235081407527015, + "T2. Sequencing & Structure Reconstruction": 0.9270526695526693, + "T3. Evidence-Grounded QA": 0.7583333333333333, + "T4. Summarization & Synthesis": 0.5388141391367185, + "T5. Attribution & Citation Alignment": 0.8662194687189113, + "T6. Aggregation & Clustering": 0.724952326567939, + "T7. Consistency & Compliance Checking": 0.6769275451403334, + "T8. Structured & Numeric Reasoning": 0.837962962962963, + "T9. Version & Code Diff Analysis": 0.8518498172294341, + "T10. Rule Induction & In-Context Learning": 0.749861111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6916666666666667 + }, + "language": { + "Chinese": 0.7653921632804664, + "English": 0.7813389724808776 + } + }, + "pass@2": 0.5773333333333334, + "BoN-3": { + "overall_metric": 0.7997603117800453, + "token_length": { + "8k": 0.8156058789899132, + "16k": 0.8312258319915683, + "32k": 0.8146647150412942, + "64k": 0.8402343004850696, + "128k": 0.7648319163907665, + "256k": 0.7319992277816549 + }, + "contextual_requirement": { + "Full": 0.7681553658992594, + "Partial": 0.8399847883555894 + }, + "difficulty": { + "Easy": 0.9168344057692764, + "Moderate": 0.9117105202934518, + "Hard": 0.8867394849893248, + "Extreme": 0.5408505285512573 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9362853987173203, + "T2. Sequencing & Structure Reconstruction": 0.9359547859547858, + "T3. Evidence-Grounded QA": 0.7916666666666666, + "T4. Summarization & Synthesis": 0.5458038576746401, + "T5. Attribution & Citation Alignment": 0.8823540286034711, + "T6. Aggregation & Clustering": 0.7446436845926303, + "T7. Consistency & Compliance Checking": 0.6987021524631377, + "T8. Structured & Numeric Reasoning": 0.8824074074074073, + "T9. Version & Code Diff Analysis": 0.8622815151611319, + "T10. Rule Induction & In-Context Learning": 0.8040277777777778, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.75 + }, + "language": { + "Chinese": 0.7871986587353806, + "English": 0.8123219648247083 + } + }, + "pass@3": 0.6106666666666667 +} \ No newline at end of file diff --git a/results/GPT-OSS-120B/thinking_context-120000_bon-3_summary.json b/results/GPT-OSS-120B/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..aa68bddd3ad0e0c1a53cc58998c214b93c7c6405 --- /dev/null +++ b/results/GPT-OSS-120B/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5260760130553013, + "inference_iteration_1_overall_metric": 0.5251990705311491, + "inference_iteration_2_overall_metric": 0.5187040802401437, + "inference_iteration_3_overall_metric": 0.5343248883946079, + "average_token_length_metric": { + "8k": 0.6379995894817992, + "16k": 0.6200629617253591, + "32k": 0.5668769322787303, + "64k": 0.5173492904735919, + "128k": 0.4362186866504548, + "256k": 0.3779486177218682 + }, + "average_contextual_requirement_metric": { + "Full": 0.49116755127749995, + "Partial": 0.5705049644088642 + }, + "average_difficulty_metric": { + "Easy": 0.7406025674065024, + "Moderate": 0.506610347347898, + "Hard": 0.44966953179643426, + "Extreme": 0.3540424932279647 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7441033370277687, + "T2. Sequencing & Structure Reconstruction": 0.7329905896572565, + "T3. Evidence-Grounded QA": 0.5333333333333334, + "T4. Summarization & Synthesis": 0.5106082800845382, + "T5. Attribution & Citation Alignment": 0.46625824816375694, + "T6. Aggregation & Clustering": 0.5279484217060981, + "T7. Consistency & Compliance Checking": 0.31563292534840204, + "T8. Structured & Numeric Reasoning": 0.5515432098765432, + "T9. Version & Code Diff Analysis": 0.5119880580465573, + "T10. Rule Induction & In-Context Learning": 0.5589814814814815, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.425 + }, + "average_language_metric": { + "Chinese": 0.505405756924764, + "English": 0.5467462691858365 + }, + "BoN-1": { + "overall_metric": 0.5251990705311491, + "token_length": { + "8k": 0.6529081123251393, + "16k": 0.6200667957335821, + "32k": 0.5763521514454887, + "64k": 0.49832867440843903, + "128k": 0.4350202675435077, + "256k": 0.36851842173074006 + }, + "contextual_requirement": { + "Full": 0.48686804900920944, + "Partial": 0.5739840070136185 + }, + "difficulty": { + "Easy": 0.7411278216421984, + "Moderate": 0.5064062158524808, + "Hard": 0.4567280838491506, + "Extreme": 0.34597541625321154 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.737237169976606, + "T2. Sequencing & Structure Reconstruction": 0.7325725663225663, + "T3. Evidence-Grounded QA": 0.5166666666666667, + "T4. Summarization & Synthesis": 0.5064758252528795, + "T5. Attribution & Citation Alignment": 0.4582161191244139, + "T6. Aggregation & Clustering": 0.5344712887432478, + "T7. Consistency & Compliance Checking": 0.3124319752506027, + "T8. Structured & Numeric Reasoning": 0.5666666666666667, + "T9. Version & Code Diff Analysis": 0.5286040271943487, + "T10. Rule Induction & In-Context Learning": 0.5565277777777777, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333 + }, + "language": { + "Chinese": 0.5156423632056797, + "English": 0.534755777856619 + } + }, + "pass@1": 0.2833333333333333, + "BoN-2": { + "overall_metric": 0.6024165651661463, + "token_length": { + "8k": 0.7164315027505042, + "16k": 0.700257643310589, + "32k": 0.6507634459310141, + "64k": 0.5868187511846459, + "128k": 0.5172068676740627, + "256k": 0.4430211801460688 + }, + "contextual_requirement": { + "Full": 0.5670686758499749, + "Partial": 0.6474047879321861 + }, + "difficulty": { + "Easy": 0.8314482789657763, + "Moderate": 0.5941446742173757, + "Hard": 0.5272666327730237, + "Extreme": 0.4063157035624808 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8187672440796736, + "T2. Sequencing & Structure Reconstruction": 0.7801595626595622, + "T3. Evidence-Grounded QA": 0.6583333333333333, + "T4. Summarization & Synthesis": 0.5244604747568062, + "T5. Attribution & Citation Alignment": 0.5425411826774184, + "T6. Aggregation & Clustering": 0.5998244613513222, + "T7. Consistency & Compliance Checking": 0.38919737648291697, + "T8. Structured & Numeric Reasoning": 0.6231481481481482, + "T9. Version & Code Diff Analysis": 0.5981069547631331, + "T10. Rule Induction & In-Context Learning": 0.6645833333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.525 + }, + "language": { + "Chinese": 0.5840400269819745, + "English": 0.620793103350321 + } + }, + "pass@2": 0.3506666666666667, + "BoN-3": { + "overall_metric": 0.6337631142743206, + "token_length": { + "8k": 0.7401560588202145, + "16k": 0.7330726977884732, + "32k": 0.6780170211387931, + "64k": 0.6366671272752144, + "128k": 0.543143313800391, + "256k": 0.47152246682284277 + }, + "contextual_requirement": { + "Full": 0.5976534402407667, + "Partial": 0.6797208812261188 + }, + "difficulty": { + "Easy": 0.858812640816625, + "Moderate": 0.6436207584661483, + "Hard": 0.5564637457814393, + "Extreme": 0.43152853820526416 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.841381085118233, + "T2. Sequencing & Structure Reconstruction": 0.8196503034003031, + "T3. Evidence-Grounded QA": 0.6916666666666667, + "T4. Summarization & Synthesis": 0.5298447017215417, + "T5. Attribution & Citation Alignment": 0.5830791662518957, + "T6. Aggregation & Clustering": 0.6214286606830465, + "T7. Consistency & Compliance Checking": 0.41851724869569523, + "T8. Structured & Numeric Reasoning": 0.6564814814814816, + "T9. Version & Code Diff Analysis": 0.626359252313376, + "T10. Rule Induction & In-Context Learning": 0.7104166666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.575 + }, + "language": { + "Chinese": 0.6166297449202884, + "English": 0.6508964836283548 + } + }, + "pass@3": 0.382 +} \ No newline at end of file diff --git a/results/GPT-OSS-20B/thinking_context-120000_bon-3_summary.json b/results/GPT-OSS-20B/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6b91f8679753c4a7cb443194902a9199a89c5481 --- /dev/null +++ b/results/GPT-OSS-20B/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.4466309565832364, + "inference_iteration_1_overall_metric": 0.4454625807266656, + "inference_iteration_2_overall_metric": 0.45246537487177085, + "inference_iteration_3_overall_metric": 0.44196491415127315, + "average_token_length_metric": { + "8k": 0.5748339290561163, + "16k": 0.520513959710621, + "32k": 0.4891012266007553, + "64k": 0.41584677147603494, + "128k": 0.358630149540046, + "256k": 0.3208597031158458 + }, + "average_contextual_requirement_metric": { + "Full": 0.415365323316177, + "Partial": 0.4864235807413135 + }, + "average_difficulty_metric": { + "Easy": 0.650502297368124, + "Moderate": 0.39329906469313236, + "Hard": 0.35893228463928295, + "Extreme": 0.3159306081507978 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7015484192056548, + "T2. Sequencing & Structure Reconstruction": 0.685767192683859, + "T3. Evidence-Grounded QA": 0.45555555555555555, + "T4. Summarization & Synthesis": 0.4908914699997827, + "T5. Attribution & Citation Alignment": 0.36677196742848295, + "T6. Aggregation & Clustering": 0.4730052458390773, + "T7. Consistency & Compliance Checking": 0.20816491065985157, + "T8. Structured & Numeric Reasoning": 0.41743827160493835, + "T9. Version & Code Diff Analysis": 0.447495265816877, + "T10. Rule Induction & In-Context Learning": 0.4786111111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3083333333333334 + }, + "average_language_metric": { + "Chinese": 0.4149338600461589, + "English": 0.4783280531203148 + }, + "BoN-1": { + "overall_metric": 0.4454625807266656, + "token_length": { + "8k": 0.568519720930888, + "16k": 0.5262031339471792, + "32k": 0.4861844372065094, + "64k": 0.41353173269416393, + "128k": 0.3591760759962795, + "256k": 0.3191603835849737 + }, + "contextual_requirement": { + "Full": 0.4087106311919467, + "Partial": 0.49223778922539846 + }, + "difficulty": { + "Easy": 0.6501986428504741, + "Moderate": 0.4020783848645907, + "Hard": 0.35262004275477665, + "Extreme": 0.3106597264865292 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6988816931417082, + "T2. Sequencing & Structure Reconstruction": 0.6855601343101344, + "T3. Evidence-Grounded QA": 0.44166666666666665, + "T4. Summarization & Synthesis": 0.48711155580421095, + "T5. Attribution & Citation Alignment": 0.3737909056226912, + "T6. Aggregation & Clustering": 0.47376675235942955, + "T7. Consistency & Compliance Checking": 0.19543673928650457, + "T8. Structured & Numeric Reasoning": 0.4592592592592593, + "T9. Version & Code Diff Analysis": 0.4319105105134516, + "T10. Rule Induction & In-Context Learning": 0.4483333333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.30833333333333335 + }, + "language": { + "Chinese": 0.4135726150175988, + "English": 0.4773525464357322 + } + }, + "pass@1": 0.22066666666666668, + "BoN-2": { + "overall_metric": 0.5272106840251494, + "token_length": { + "8k": 0.6646954568347576, + "16k": 0.5904928422287453, + "32k": 0.5839842384318158, + "64k": 0.5063967419452635, + "128k": 0.43057993780384596, + "256k": 0.38711488690647056 + }, + "contextual_requirement": { + "Full": 0.49180621097136906, + "Partial": 0.5722709224572338 + }, + "difficulty": { + "Easy": 0.7548882368495022, + "Moderate": 0.4956196438725791, + "Hard": 0.4286236931227538, + "Extreme": 0.3633035715559389 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7734553050177705, + "T2. Sequencing & Structure Reconstruction": 0.7497803122803125, + "T3. Evidence-Grounded QA": 0.5833333333333334, + "T4. Summarization & Synthesis": 0.5045962797540592, + "T5. Attribution & Citation Alignment": 0.4402260724918136, + "T6. Aggregation & Clustering": 0.5442968593297539, + "T7. Consistency & Compliance Checking": 0.271121809138241, + "T8. Structured & Numeric Reasoning": 0.5342592592592592, + "T9. Version & Code Diff Analysis": 0.5242253558461997, + "T10. Rule Induction & In-Context Learning": 0.5733333333333334, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667 + }, + "language": { + "Chinese": 0.5001287057888837, + "English": 0.5542926622614154 + } + }, + "pass@2": 0.288, + "BoN-3": { + "overall_metric": 0.5630326206114822, + "token_length": { + "8k": 0.696307151429279, + "16k": 0.6363828018704142, + "32k": 0.6080757988876709, + "64k": 0.5447446048393001, + "128k": 0.4651876442546681, + "256k": 0.4274977223875639 + }, + "contextual_requirement": { + "Full": 0.5231691902067062, + "Partial": 0.6137678956721072 + }, + "difficulty": { + "Easy": 0.7981617653401918, + "Moderate": 0.5378309814958724, + "Hard": 0.4581227829612033, + "Extreme": 0.3909189138526289 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7979486004255343, + "T2. Sequencing & Structure Reconstruction": 0.787096468346468, + "T3. Evidence-Grounded QA": 0.6416666666666667, + "T4. Summarization & Synthesis": 0.5114673228538231, + "T5. Attribution & Citation Alignment": 0.4791973901175429, + "T6. Aggregation & Clustering": 0.5794154259483205, + "T7. Consistency & Compliance Checking": 0.29817240493883673, + "T8. Structured & Numeric Reasoning": 0.5800925925925926, + "T9. Version & Code Diff Analysis": 0.5598440073472041, + "T10. Rule Induction & In-Context Learning": 0.6325000000000001, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665 + }, + "language": { + "Chinese": 0.543096347456943, + "English": 0.5829688937660222 + } + }, + "pass@3": 0.316 +} \ No newline at end of file diff --git a/results/Gemini-2.5-Flash/nonthinking_context-1000000_bon-3_summary.json b/results/Gemini-2.5-Flash/nonthinking_context-1000000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..bc1d77e02113183f0d8f163734122d86e16697c1 --- /dev/null +++ b/results/Gemini-2.5-Flash/nonthinking_context-1000000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5591861836855936, + "inference_iteration_1_overall_metric": 0.555222408533961, + "inference_iteration_2_overall_metric": 0.5555746542742924, + "inference_iteration_3_overall_metric": 0.5667614882485269, + "average_token_length_metric": { + "8k": 0.5794836437092291, + "16k": 0.585038678968723, + "32k": 0.5764993408909757, + "64k": 0.5298001757287436, + "128k": 0.5583690767328653, + "256k": 0.5259261860830253 + }, + "average_contextual_requirement_metric": { + "Full": 0.5219144924948039, + "Partial": 0.6066228815647814 + }, + "average_difficulty_metric": { + "Easy": 0.6655203056614667, + "Moderate": 0.5398880938056573, + "Hard": 0.5786822600966999, + "Extreme": 0.4425719452767774 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7771168693642845, + "T2. Sequencing & Structure Reconstruction": 0.8032048969548965, + "T3. Evidence-Grounded QA": 0.6055555555555554, + "T4. Summarization & Synthesis": 0.5340660905787081, + "T5. Attribution & Citation Alignment": 0.745788551044203, + "T6. Aggregation & Clustering": 0.5023487328603856, + "T7. Consistency & Compliance Checking": 0.4407587176859518, + "T8. Structured & Numeric Reasoning": 0.2706790123456789, + "T9. Version & Code Diff Analysis": 0.7292026752712853, + "T10. Rule Induction & In-Context Learning": 0.5269907407407407, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.44722222222222224 + }, + "average_language_metric": { + "Chinese": 0.5654483346944664, + "English": 0.5529240326767215 + }, + "BoN-1": { + "overall_metric": 0.555222408533961, + "token_length": { + "8k": 0.5888731703826585, + "16k": 0.5734020363631814, + "32k": 0.5716727282141728, + "64k": 0.5201046130976303, + "128k": 0.5511174700611993, + "256k": 0.5261644330849247 + }, + "contextual_requirement": { + "Full": 0.5109299072772209, + "Partial": 0.6115946828607216 + }, + "difficulty": { + "Easy": 0.6581097400970598, + "Moderate": 0.5327009022057297, + "Hard": 0.5733517044448927, + "Extreme": 0.4453988971639295 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7847565381129994, + "T2. Sequencing & Structure Reconstruction": 0.8209740259740258, + "T3. Evidence-Grounded QA": 0.5833333333333334, + "T4. Summarization & Synthesis": 0.5367973042699341, + "T5. Attribution & Citation Alignment": 0.7270779373385174, + "T6. Aggregation & Clustering": 0.49367147369310305, + "T7. Consistency & Compliance Checking": 0.444842452883972, + "T8. Structured & Numeric Reasoning": 0.27037037037037037, + "T9. Version & Code Diff Analysis": 0.7099867444467561, + "T10. Rule Induction & In-Context Learning": 0.522361111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665 + }, + "language": { + "Chinese": 0.5642007220552895, + "English": 0.5462440950126328 + } + }, + "pass@1": 0.2846666666666667, + "BoN-2": { + "overall_metric": 0.600215470701642, + "token_length": { + "8k": 0.6308006025002828, + "16k": 0.6218713098115006, + "32k": 0.6209629478481974, + "64k": 0.5627834642545474, + "128k": 0.6034759308063585, + "256k": 0.5613985689889686 + }, + "contextual_requirement": { + "Full": 0.5620277169409751, + "Partial": 0.6488180663970381 + }, + "difficulty": { + "Easy": 0.7092035834801983, + "Moderate": 0.562063871268018, + "Hard": 0.6309834472774163, + "Extreme": 0.4856736448985854 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8017289450670718, + "T2. Sequencing & Structure Reconstruction": 0.8388936988936988, + "T3. Evidence-Grounded QA": 0.6833333333333333, + "T4. Summarization & Synthesis": 0.5495701538780565, + "T5. Attribution & Citation Alignment": 0.7927380382693102, + "T6. Aggregation & Clustering": 0.5444118952178385, + "T7. Consistency & Compliance Checking": 0.5074368060401414, + "T8. Structured & Numeric Reasoning": 0.3101851851851852, + "T9. Version & Code Diff Analysis": 0.751572829108757, + "T10. Rule Induction & In-Context Learning": 0.5584722222222223, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334 + }, + "language": { + "Chinese": 0.6052369717987128, + "English": 0.5951939696045724 + } + }, + "pass@2": 0.32666666666666666, + "BoN-3": { + "overall_metric": 0.6297638749448518, + "token_length": { + "8k": 0.6514500993442873, + "16k": 0.6579228544698497, + "32k": 0.6569374765913272, + "64k": 0.5879896058573842, + "128k": 0.6298609557348528, + "256k": 0.5944222576714168 + }, + "contextual_requirement": { + "Full": 0.5933694722646927, + "Partial": 0.6760840238105122 + }, + "difficulty": { + "Easy": 0.7491261498971669, + "Moderate": 0.6049484175447234, + "Hard": 0.6548738677798389, + "Extreme": 0.49881447206374147 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8141979383533716, + "T2. Sequencing & Structure Reconstruction": 0.8490187590187587, + "T3. Evidence-Grounded QA": 0.7083333333333334, + "T4. Summarization & Synthesis": 0.5562735797066128, + "T5. Attribution & Citation Alignment": 0.8231149616655856, + "T6. Aggregation & Clustering": 0.5745805749141115, + "T7. Consistency & Compliance Checking": 0.5414909010178753, + "T8. Structured & Numeric Reasoning": 0.3379629629629629, + "T9. Version & Code Diff Analysis": 0.7854748730572404, + "T10. Rule Induction & In-Context Learning": 0.6129166666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666 + }, + "language": { + "Chinese": 0.6363071790468551, + "English": 0.6232205708428518 + } + }, + "pass@3": 0.3606666666666667 +} \ No newline at end of file diff --git a/results/Gemini-2.5-Flash/thinking_context-1000000_bon-3_summary.json b/results/Gemini-2.5-Flash/thinking_context-1000000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b1ca1664790b9260467fadb6436ca12c6afe917b --- /dev/null +++ b/results/Gemini-2.5-Flash/thinking_context-1000000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.674056323049449, + "inference_iteration_1_overall_metric": 0.6759983703704746, + "inference_iteration_2_overall_metric": 0.6798720821649146, + "inference_iteration_3_overall_metric": 0.6662985166129568, + "average_token_length_metric": { + "8k": 0.7135890855897297, + "16k": 0.6856610849454701, + "32k": 0.6818807100418771, + "64k": 0.7027829296522448, + "128k": 0.6399411623659867, + "256k": 0.6204829657013889 + }, + "average_contextual_requirement_metric": { + "Full": 0.63656677094683, + "Partial": 0.7217702984527835 + }, + "average_difficulty_metric": { + "Easy": 0.7982292314606038, + "Moderate": 0.7239074886730374, + "Hard": 0.7218614768527792, + "Extreme": 0.47388809993909664 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8642581912430608, + "T2. Sequencing & Structure Reconstruction": 0.877125374625374, + "T3. Evidence-Grounded QA": 0.6638888888888889, + "T4. Summarization & Synthesis": 0.545007270213042, + "T5. Attribution & Citation Alignment": 0.7904062945014397, + "T6. Aggregation & Clustering": 0.6528080258554949, + "T7. Consistency & Compliance Checking": 0.5102049505643657, + "T8. Structured & Numeric Reasoning": 0.6658950617283949, + "T9. Version & Code Diff Analysis": 0.8004985540165189, + "T10. Rule Induction & In-Context Learning": 0.605046296296296, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5361111111111112 + }, + "average_language_metric": { + "Chinese": 0.6759423962737819, + "English": 0.6721702498251175 + }, + "BoN-1": { + "overall_metric": 0.6759983703704746, + "token_length": { + "8k": 0.7102710318952188, + "16k": 0.6883832260010031, + "32k": 0.6744900827350674, + "64k": 0.7119047863552107, + "128k": 0.6560068019942544, + "256k": 0.6149342932420981 + }, + "contextual_requirement": { + "Full": 0.6308759268961498, + "Partial": 0.733426934792345 + }, + "difficulty": { + "Easy": 0.8006604241201102, + "Moderate": 0.7377405126394252, + "Hard": 0.7224536765391667, + "Extreme": 0.46837070558456173 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8740026641040577, + "T2. Sequencing & Structure Reconstruction": 0.8621174196174196, + "T3. Evidence-Grounded QA": 0.675, + "T4. Summarization & Synthesis": 0.5455538913362303, + "T5. Attribution & Citation Alignment": 0.8061189254081019, + "T6. Aggregation & Clustering": 0.6554340790288137, + "T7. Consistency & Compliance Checking": 0.5169546620879129, + "T8. Structured & Numeric Reasoning": 0.6708333333333333, + "T9. Version & Code Diff Analysis": 0.7797147286011522, + "T10. Rule Induction & In-Context Learning": 0.5926388888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.55 + }, + "language": { + "Chinese": 0.6768990117658723, + "English": 0.6750977289750792 + } + }, + "pass@1": 0.4493333333333333, + "BoN-2": { + "overall_metric": 0.7479747877310018, + "token_length": { + "8k": 0.7719367863380663, + "16k": 0.7848557527755085, + "32k": 0.7435010923393187, + "64k": 0.7892992664474747, + "128k": 0.7226763130932595, + "256k": 0.6755795153923868 + }, + "contextual_requirement": { + "Full": 0.720714483291413, + "Partial": 0.782669720654116 + }, + "difficulty": { + "Easy": 0.8637385553396211, + "Moderate": 0.8121373016562715, + "Hard": 0.8141424576405533, + "Extreme": 0.535557607922782 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9027833860231022, + "T2. Sequencing & Structure Reconstruction": 0.9054959854959853, + "T3. Evidence-Grounded QA": 0.7666666666666667, + "T4. Summarization & Synthesis": 0.5582670711283242, + "T5. Attribution & Citation Alignment": 0.8550801223307993, + "T6. Aggregation & Clustering": 0.7273381143811253, + "T7. Consistency & Compliance Checking": 0.6357469182183698, + "T8. Structured & Numeric Reasoning": 0.7416666666666667, + "T9. Version & Code Diff Analysis": 0.8585696216489627, + "T10. Rule Induction & In-Context Learning": 0.712361111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6333333333333333 + }, + "language": { + "Chinese": 0.7535034625567544, + "English": 0.74244611290525 + } + }, + "pass@2": 0.5313333333333333, + "BoN-3": { + "overall_metric": 0.7790736538060705, + "token_length": { + "8k": 0.800474143876513, + "16k": 0.8203674165938436, + "32k": 0.767107753488941, + "64k": 0.8124208427670737, + "128k": 0.743086283346713, + "256k": 0.7309854827633406 + }, + "contextual_requirement": { + "Full": 0.7557801236733843, + "Partial": 0.808719964884035 + }, + "difficulty": { + "Easy": 0.8954381880917556, + "Moderate": 0.8399060564631339, + "Hard": 0.848259677254879, + "Extreme": 0.5662031295553962 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9242311389541524, + "T2. Sequencing & Structure Reconstruction": 0.9315938690938689, + "T3. Evidence-Grounded QA": 0.7916666666666666, + "T4. Summarization & Synthesis": 0.5695550592103452, + "T5. Attribution & Citation Alignment": 0.8809133228802023, + "T6. Aggregation & Clustering": 0.7566450728547504, + "T7. Consistency & Compliance Checking": 0.662115306165705, + "T8. Structured & Numeric Reasoning": 0.7824074074074074, + "T9. Version & Code Diff Analysis": 0.8867644916844079, + "T10. Rule Induction & In-Context Learning": 0.7519444444444443, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.7 + }, + "language": { + "Chinese": 0.7890668786410705, + "English": 0.7690804289710705 + } + }, + "pass@3": 0.5786666666666667 +} \ No newline at end of file diff --git a/results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json b/results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..44824d02ab11c2aeb3b9632106705ebdf4b2a670 --- /dev/null +++ b/results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.7342184707317124, + "inference_iteration_1_overall_metric": 0.7402405885346022, + "inference_iteration_2_overall_metric": 0.7288378446496467, + "inference_iteration_3_overall_metric": 0.7335769790108894, + "average_token_length_metric": { + "8k": 0.7449778241967657, + "16k": 0.7478649041506191, + "32k": 0.7530566835243759, + "64k": 0.7417918268320294, + "128k": 0.6999601003776742, + "256k": 0.7176594853088111 + }, + "average_contextual_requirement_metric": { + "Full": 0.7006912685258201, + "Partial": 0.7768894553573948 + }, + "average_difficulty_metric": { + "Easy": 0.8440057387459964, + "Moderate": 0.819848501651939, + "Hard": 0.8102915033262061, + "Extreme": 0.5077419967802616 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.900910721502263, + "T2. Sequencing & Structure Reconstruction": 0.9242053162886497, + "T3. Evidence-Grounded QA": 0.6500000000000001, + "T4. Summarization & Synthesis": 0.5430214244860422, + "T5. Attribution & Citation Alignment": 0.8428063760922413, + "T6. Aggregation & Clustering": 0.7039837824498163, + "T7. Consistency & Compliance Checking": 0.6274987753728497, + "T8. Structured & Numeric Reasoning": 0.7824074074074073, + "T9. Version & Code Diff Analysis": 0.873498394228399, + "T10. Rule Induction & In-Context Learning": 0.683564814814815, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5888888888888888 + }, + "average_language_metric": { + "Chinese": 0.7449054000919034, + "English": 0.7235315413715225 + }, + "BoN-1": { + "overall_metric": 0.7402405885346022, + "token_length": { + "8k": 0.7531611316917413, + "16k": 0.7524897292361332, + "32k": 0.759794274989024, + "64k": 0.7435484076033682, + "128k": 0.6968353298720406, + "256k": 0.7356146578153082 + }, + "contextual_requirement": { + "Full": 0.6997313831727073, + "Partial": 0.7917977589951957 + }, + "difficulty": { + "Easy": 0.8496811955485003, + "Moderate": 0.8519813060901164, + "Hard": 0.7987384600115967, + "Extreme": 0.5085375776002995 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8998469455079252, + "T2. Sequencing & Structure Reconstruction": 0.9226606264106262, + "T3. Evidence-Grounded QA": 0.7083333333333334, + "T4. Summarization & Synthesis": 0.5451740468187636, + "T5. Attribution & Citation Alignment": 0.8298635734563725, + "T6. Aggregation & Clustering": 0.6845419570109973, + "T7. Consistency & Compliance Checking": 0.6246870318062281, + "T8. Structured & Numeric Reasoning": 0.7995370370370369, + "T9. Version & Code Diff Analysis": 0.8931464590407817, + "T10. Rule Induction & In-Context Learning": 0.6825, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6083333333333333 + }, + "language": { + "Chinese": 0.7482902342281601, + "English": 0.7321909428410444 + } + }, + "pass@1": 0.5366666666666666, + "BoN-2": { + "overall_metric": 0.7920257143166666, + "token_length": { + "8k": 0.8033887159307824, + "16k": 0.7898793857734576, + "32k": 0.8117863707215878, + "64k": 0.7964277126734358, + "128k": 0.7607965077861976, + "256k": 0.7898755930145346 + }, + "contextual_requirement": { + "Full": 0.7559933032110803, + "Partial": 0.8378851466328657 + }, + "difficulty": { + "Easy": 0.8992430876416225, + "Moderate": 0.8976673772472249, + "Hard": 0.8677753697095277, + "Extreme": 0.5554328394573533 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9332256652370721, + "T2. Sequencing & Structure Reconstruction": 0.9412599437599435, + "T3. Evidence-Grounded QA": 0.775, + "T4. Summarization & Synthesis": 0.5583020906699263, + "T5. Attribution & Citation Alignment": 0.8863658130468474, + "T6. Aggregation & Clustering": 0.7465324819181888, + "T7. Consistency & Compliance Checking": 0.7073416442539386, + "T8. Structured & Numeric Reasoning": 0.8560185185185186, + "T9. Version & Code Diff Analysis": 0.9071622825418993, + "T10. Rule Induction & In-Context Learning": 0.7341666666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.7 + }, + "language": { + "Chinese": 0.795156398417639, + "English": 0.7888950302156931 + } + }, + "pass@2": 0.6133333333333333, + "BoN-3": { + "overall_metric": 0.8133867657039734, + "token_length": { + "8k": 0.8126699858808082, + "16k": 0.8091582494700531, + "32k": 0.8359194281957039, + "64k": 0.816367429901851, + "128k": 0.7927970026749335, + "256k": 0.8134084981004824 + }, + "contextual_requirement": { + "Full": 0.7838454845628277, + "Partial": 0.85098475988361 + }, + "difficulty": { + "Easy": 0.9143269591461097, + "Moderate": 0.9126631081599108, + "Hard": 0.9014414032827288, + "Extreme": 0.5797689782741141 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9453427087326437, + "T2. Sequencing & Structure Reconstruction": 0.9562455137455134, + "T3. Evidence-Grounded QA": 0.7833333333333333, + "T4. Summarization & Synthesis": 0.5633383564725131, + "T5. Attribution & Citation Alignment": 0.9113749401192192, + "T6. Aggregation & Clustering": 0.7887683935896461, + "T7. Consistency & Compliance Checking": 0.7334094152440016, + "T8. Structured & Numeric Reasoning": 0.8671296296296297, + "T9. Version & Code Diff Analysis": 0.922905227868178, + "T10. Rule Induction & In-Context Learning": 0.7925, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334 + }, + "language": { + "Chinese": 0.8203398785387466, + "English": 0.8064336528691968 + } + }, + "pass@3": 0.6466666666666666 +} \ No newline at end of file diff --git a/results/Gemma-3-12B-It/nonthinking_context-120000_bon-3_summary.json b/results/Gemma-3-12B-It/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c4c1db11176d1956439c3f5e1741429cf59baad --- /dev/null +++ b/results/Gemma-3-12B-It/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3215537579100566, + "inference_iteration_1_overall_metric": 0.32433034575226716, + "inference_iteration_2_overall_metric": 0.31844270123543494, + "inference_iteration_3_overall_metric": 0.3218882267424678, + "average_token_length_metric": { + "8k": 0.3884654162699951, + "16k": 0.3447777819230472, + "32k": 0.34075916239810233, + "64k": 0.2830133627729955, + "128k": 0.2903427416083741, + "256k": 0.2819640824878266 + }, + "average_contextual_requirement_metric": { + "Full": 0.2967079738599621, + "Partial": 0.3531756648829045 + }, + "average_difficulty_metric": { + "Easy": 0.43662085782547294, + "Moderate": 0.23390254455699586, + "Hard": 0.30432253509954527, + "Extreme": 0.26439167130106106 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.5521425232273246, + "T2. Sequencing & Structure Reconstruction": 0.6018784456284454, + "T3. Evidence-Grounded QA": 0.39722222222222225, + "T4. Summarization & Synthesis": 0.4889337252719512, + "T5. Attribution & Citation Alignment": 0.17289417311274363, + "T6. Aggregation & Clustering": 0.31608511775757386, + "T7. Consistency & Compliance Checking": 0.18810666022578687, + "T8. Structured & Numeric Reasoning": 0.06064814814814815, + "T9. Version & Code Diff Analysis": 0.34441506928983295, + "T10. Rule Induction & In-Context Learning": 0.39800925925925923, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666667 + }, + "average_language_metric": { + "Chinese": 0.31275985343364654, + "English": 0.3303476623864672 + }, + "BoN-1": { + "overall_metric": 0.32433034575226716, + "token_length": { + "8k": 0.3906271090949922, + "16k": 0.3561137033219127, + "32k": 0.34838703683305916, + "64k": 0.2733930672728249, + "128k": 0.295394704679806, + "256k": 0.2820664533110089 + }, + "contextual_requirement": { + "Full": 0.299184653750444, + "Partial": 0.35633395375458804 + }, + "difficulty": { + "Easy": 0.4375571061543292, + "Moderate": 0.2456864337252399, + "Hard": 0.3046160927997635, + "Extreme": 0.26489900749156403 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5440288685497278, + "T2. Sequencing & Structure Reconstruction": 0.595296185296185, + "T3. Evidence-Grounded QA": 0.4166666666666667, + "T4. Summarization & Synthesis": 0.4882012721729454, + "T5. Attribution & Citation Alignment": 0.16495676080926253, + "T6. Aggregation & Clustering": 0.3232791449049618, + "T7. Consistency & Compliance Checking": 0.1944817769562575, + "T8. Structured & Numeric Reasoning": 0.0699074074074074, + "T9. Version & Code Diff Analysis": 0.345977074505613, + "T10. Rule Induction & In-Context Learning": 0.4008333333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666667 + }, + "language": { + "Chinese": 0.3144798192828198, + "English": 0.334180872221715 + } + }, + "pass@1": 0.11133333333333334, + "BoN-2": { + "overall_metric": 0.3467523917671754, + "token_length": { + "8k": 0.4142271322547509, + "16k": 0.377736900046965, + "32k": 0.37302728004656277, + "64k": 0.3062912069169447, + "128k": 0.3073157553107305, + "256k": 0.3019160760270996 + }, + "contextual_requirement": { + "Full": 0.32135117390700496, + "Partial": 0.3790812144983023 + }, + "difficulty": { + "Easy": 0.46615306988638716, + "Moderate": 0.26112276416188485, + "Hard": 0.32737280433310617, + "Extreme": 0.28492633788743743 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5650465241334548, + "T2. Sequencing & Structure Reconstruction": 0.6249731287231288, + "T3. Evidence-Grounded QA": 0.43333333333333335, + "T4. Summarization & Synthesis": 0.5032712591907067, + "T5. Attribution & Citation Alignment": 0.20597827766959523, + "T6. Aggregation & Clustering": 0.346419862594183, + "T7. Consistency & Compliance Checking": 0.21497503949007407, + "T8. Structured & Numeric Reasoning": 0.07962962962962963, + "T9. Version & Code Diff Analysis": 0.3894322431353122, + "T10. Rule Induction & In-Context Learning": 0.4174999999999999, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334 + }, + "language": { + "Chinese": 0.3387201257962417, + "English": 0.3547846577381098 + } + }, + "pass@2": 0.122, + "BoN-3": { + "overall_metric": 0.36208312298536466, + "token_length": { + "8k": 0.4291254746141897, + "16k": 0.3819529993839447, + "32k": 0.3886685107360013, + "64k": 0.3331907556566436, + "128k": 0.3232918491379485, + "256k": 0.3162691483834608 + }, + "contextual_requirement": { + "Full": 0.33738791539283736, + "Partial": 0.39351338719403606 + }, + "difficulty": { + "Easy": 0.4781807777959498, + "Moderate": 0.2766679371680117, + "Hard": 0.3532458860700918, + "Extreme": 0.29681012423769865 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5905198790888515, + "T2. Sequencing & Structure Reconstruction": 0.6396154771154773, + "T3. Evidence-Grounded QA": 0.43333333333333335, + "T4. Summarization & Synthesis": 0.5090709574388708, + "T5. Attribution & Citation Alignment": 0.21374662530636385, + "T6. Aggregation & Clustering": 0.3778424215167421, + "T7. Consistency & Compliance Checking": 0.2211746842442865, + "T8. Structured & Numeric Reasoning": 0.09351851851851851, + "T9. Version & Code Diff Analysis": 0.4124771063926183, + "T10. Rule Induction & In-Context Learning": 0.43847222222222215, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25 + }, + "language": { + "Chinese": 0.35324378584530425, + "English": 0.3709224601254252 + } + }, + "pass@3": 0.12733333333333333 +} \ No newline at end of file diff --git a/results/Gemma-3-12B-It/thinking_context-120000_bon-3_summary.json b/results/Gemma-3-12B-It/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d0715104a7ead06c8a3b275d80b3f8fd378066ac --- /dev/null +++ b/results/Gemma-3-12B-It/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3191893284499226, + "inference_iteration_1_overall_metric": 0.3198032509438655, + "inference_iteration_2_overall_metric": 0.3185140430839788, + "inference_iteration_3_overall_metric": 0.3192506913219244, + "average_token_length_metric": { + "8k": 0.39039471003518295, + "16k": 0.36666548253930176, + "32k": 0.3454075697967419, + "64k": 0.2965559300528977, + "128k": 0.2634390883568726, + "256k": 0.25267318991854026 + }, + "average_contextual_requirement_metric": { + "Full": 0.29023833980013936, + "Partial": 0.3560360412769198 + }, + "average_difficulty_metric": { + "Easy": 0.45478741131761163, + "Moderate": 0.2260826308323087, + "Hard": 0.28024203424188526, + "Extreme": 0.2573832550303484 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.5500567032378852, + "T2. Sequencing & Structure Reconstruction": 0.5750085972908512, + "T3. Evidence-Grounded QA": 0.275, + "T4. Summarization & Synthesis": 0.4560070506311603, + "T5. Attribution & Citation Alignment": 0.1618052577444313, + "T6. Aggregation & Clustering": 0.3308854760173438, + "T7. Consistency & Compliance Checking": 0.16988798875914354, + "T8. Structured & Numeric Reasoning": 0.254783950617284, + "T9. Version & Code Diff Analysis": 0.3307362069623843, + "T10. Rule Induction & In-Context Learning": 0.3245833333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.18333333333333338 + }, + "average_language_metric": { + "Chinese": 0.29411514963064544, + "English": 0.3442635072692001 + }, + "BoN-1": { + "overall_metric": 0.3198032509438655, + "token_length": { + "8k": 0.3780018581730685, + "16k": 0.3712141189933461, + "32k": 0.32728165209503113, + "64k": 0.30571589667403015, + "128k": 0.2747567763160667, + "256k": 0.2618492034116515 + }, + "contextual_requirement": { + "Full": 0.29216027329955657, + "Partial": 0.3549852224911681 + }, + "difficulty": { + "Easy": 0.45511408988195684, + "Moderate": 0.23953366795757702, + "Hard": 0.27643841537644914, + "Extreme": 0.25278171138445865 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5667485461728832, + "T2. Sequencing & Structure Reconstruction": 0.562965737965738, + "T3. Evidence-Grounded QA": 0.25, + "T4. Summarization & Synthesis": 0.4542471570812729, + "T5. Attribution & Citation Alignment": 0.17077813789682644, + "T6. Aggregation & Clustering": 0.331058456491463, + "T7. Consistency & Compliance Checking": 0.17425550039116783, + "T8. Structured & Numeric Reasoning": 0.25555555555555554, + "T9. Version & Code Diff Analysis": 0.34802456680209815, + "T10. Rule Induction & In-Context Learning": 0.32847222222222217, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.175 + }, + "language": { + "Chinese": 0.29038552736401346, + "English": 0.34922097452371764 + } + }, + "pass@1": 0.11466666666666667, + "BoN-2": { + "overall_metric": 0.38355638163031974, + "token_length": { + "8k": 0.4753389925367892, + "16k": 0.42098100793684207, + "32k": 0.4193716441650546, + "64k": 0.35186094157335357, + "128k": 0.3173777879671563, + "256k": 0.3164079156027232 + }, + "contextual_requirement": { + "Full": 0.3475360182865405, + "Partial": 0.4294004804314932 + }, + "difficulty": { + "Easy": 0.5523314770092703, + "Moderate": 0.2771664695259602, + "Hard": 0.3369385202392175, + "Extreme": 0.29916922382926153 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6227581461675343, + "T2. Sequencing & Structure Reconstruction": 0.6340152902652901, + "T3. Evidence-Grounded QA": 0.35, + "T4. Summarization & Synthesis": 0.47044783692403785, + "T5. Attribution & Citation Alignment": 0.21419149989714506, + "T6. Aggregation & Clustering": 0.40641717668678456, + "T7. Consistency & Compliance Checking": 0.22278173964383363, + "T8. Structured & Numeric Reasoning": 0.31851851851851853, + "T9. Version & Code Diff Analysis": 0.41479917818461753, + "T10. Rule Induction & In-Context Learning": 0.4166666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25 + }, + "language": { + "Chinese": 0.35221914805982085, + "English": 0.41489361520081863 + } + }, + "pass@2": 0.154, + "BoN-3": { + "overall_metric": 0.41783270358538355, + "token_length": { + "8k": 0.5097501627706587, + "16k": 0.4551451198013833, + "32k": 0.4565584105132001, + "64k": 0.39691938688026035, + "128k": 0.34453029473355373, + "256k": 0.3440928468132463 + }, + "contextual_requirement": { + "Full": 0.37691399280812576, + "Partial": 0.46991106275643846 + }, + "difficulty": { + "Easy": 0.6081488352407031, + "Moderate": 0.3030799752726802, + "Hard": 0.35694868313688616, + "Extreme": 0.32471144207202707 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6492859989190713, + "T2. Sequencing & Structure Reconstruction": 0.673747440830774, + "T3. Evidence-Grounded QA": 0.4083333333333333, + "T4. Summarization & Synthesis": 0.47953981981487215, + "T5. Attribution & Citation Alignment": 0.23614408266148504, + "T6. Aggregation & Clustering": 0.44385472207531035, + "T7. Consistency & Compliance Checking": 0.24389274305473457, + "T8. Structured & Numeric Reasoning": 0.3634259259259259, + "T9. Version & Code Diff Analysis": 0.4490980326738026, + "T10. Rule Induction & In-Context Learning": 0.4583333333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667 + }, + "language": { + "Chinese": 0.3946655531503397, + "English": 0.440999854020427 + } + }, + "pass@3": 0.174 +} \ No newline at end of file diff --git a/results/Gemma-3-27B-It/nonthinking_context-120000_bon-3_summary.json b/results/Gemma-3-27B-It/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..5399cedea6ec59a9ac2f1b923301c979c57d75f9 --- /dev/null +++ b/results/Gemma-3-27B-It/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3613898319544999, + "inference_iteration_1_overall_metric": 0.35966281601422384, + "inference_iteration_2_overall_metric": 0.3610482900444428, + "inference_iteration_3_overall_metric": 0.3634583898048339, + "average_token_length_metric": { + "8k": 0.43644157643949566, + "16k": 0.3804621509069283, + "32k": 0.39249485549033103, + "64k": 0.3508346036478247, + "128k": 0.30224089028156714, + "256k": 0.30586491496085444 + }, + "average_contextual_requirement_metric": { + "Full": 0.33698292855454914, + "Partial": 0.3924531635544373 + }, + "average_difficulty_metric": { + "Easy": 0.49956833239118315, + "Moderate": 0.2504305317277634, + "Hard": 0.3319623187177785, + "Extreme": 0.30223017713736006 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6186817382099091, + "T2. Sequencing & Structure Reconstruction": 0.6828640898869156, + "T3. Evidence-Grounded QA": 0.475, + "T4. Summarization & Synthesis": 0.4863291170840516, + "T5. Attribution & Citation Alignment": 0.20718550263597674, + "T6. Aggregation & Clustering": 0.35878702296933646, + "T7. Consistency & Compliance Checking": 0.19534470405785112, + "T8. Structured & Numeric Reasoning": 0.11558641975308641, + "T9. Version & Code Diff Analysis": 0.3981518981106548, + "T10. Rule Induction & In-Context Learning": 0.37236111111111114, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.27222222222222225 + }, + "average_language_metric": { + "Chinese": 0.3481327108034932, + "English": 0.3746469531055068 + }, + "BoN-1": { + "overall_metric": 0.35966281601422384, + "token_length": { + "8k": 0.43768168643426314, + "16k": 0.37280230082410293, + "32k": 0.39131018375907956, + "64k": 0.3439379322622389, + "128k": 0.30914303161909973, + "256k": 0.30310176118655724 + }, + "contextual_requirement": { + "Full": 0.33431999039428895, + "Partial": 0.39191732134868573 + }, + "difficulty": { + "Easy": 0.49522975795181967, + "Moderate": 0.24738817424396956, + "Hard": 0.33414580218141465, + "Extreme": 0.30165945795823235 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6149553491444276, + "T2. Sequencing & Structure Reconstruction": 0.6777184081350744, + "T3. Evidence-Grounded QA": 0.4666666666666667, + "T4. Summarization & Synthesis": 0.48496676951441003, + "T5. Attribution & Citation Alignment": 0.2117639703302691, + "T6. Aggregation & Clustering": 0.3687808030960292, + "T7. Consistency & Compliance Checking": 0.19168084686723558, + "T8. Structured & Numeric Reasoning": 0.1189814814814815, + "T9. Version & Code Diff Analysis": 0.3861048947753823, + "T10. Rule Induction & In-Context Learning": 0.35944444444444446, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.275 + }, + "language": { + "Chinese": 0.34703709850451614, + "English": 0.3722885335239309 + } + }, + "pass@1": 0.13066666666666665, + "BoN-2": { + "overall_metric": 0.38518154623771866, + "token_length": { + "8k": 0.45699608191277485, + "16k": 0.3970070268223878, + "32k": 0.4155514974235712, + "64k": 0.3795777424428599, + "128k": 0.3342775351649673, + "256k": 0.32767939365975096 + }, + "contextual_requirement": { + "Full": 0.36193530889736725, + "Partial": 0.4147676664890755 + }, + "difficulty": { + "Easy": 0.5260736620896884, + "Moderate": 0.2746052566038148, + "Hard": 0.3586645680646715, + "Extreme": 0.32088598840944504 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6391494300203041, + "T2. Sequencing & Structure Reconstruction": 0.6911707890874557, + "T3. Evidence-Grounded QA": 0.49166666666666664, + "T4. Summarization & Synthesis": 0.4979131525726192, + "T5. Attribution & Citation Alignment": 0.24413521215425465, + "T6. Aggregation & Clustering": 0.3897620242575365, + "T7. Consistency & Compliance Checking": 0.2135542070257249, + "T8. Structured & Numeric Reasoning": 0.14675925925925926, + "T9. Version & Code Diff Analysis": 0.4478430638786239, + "T10. Rule Induction & In-Context Learning": 0.39444444444444443, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2833333333333333 + }, + "language": { + "Chinese": 0.37305464561658386, + "English": 0.3973084468588535 + } + }, + "pass@2": 0.144, + "BoN-3": { + "overall_metric": 0.4011866600471999, + "token_length": { + "8k": 0.4633867051191647, + "16k": 0.42281662150014376, + "32k": 0.44096928608538427, + "64k": 0.39220811203028316, + "128k": 0.34412857203690767, + "256k": 0.34361066351131797 + }, + "contextual_requirement": { + "Full": 0.37845160215956763, + "Partial": 0.4301221882678235 + }, + "difficulty": { + "Easy": 0.5455848419176232, + "Moderate": 0.2912498230785689, + "Hard": 0.36825828865136306, + "Extreme": 0.33684148176489936 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.654875485368947, + "T2. Sequencing & Structure Reconstruction": 0.7061728240894906, + "T3. Evidence-Grounded QA": 0.5166666666666667, + "T4. Summarization & Synthesis": 0.5049097313487289, + "T5. Attribution & Citation Alignment": 0.2565335955530918, + "T6. Aggregation & Clustering": 0.4143461915069394, + "T7. Consistency & Compliance Checking": 0.22258770429716687, + "T8. Structured & Numeric Reasoning": 0.175462962962963, + "T9. Version & Code Diff Analysis": 0.4681907705235851, + "T10. Rule Induction & In-Context Learning": 0.40555555555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2833333333333333 + }, + "language": { + "Chinese": 0.3916620761944417, + "English": 0.410711243899959 + } + }, + "pass@3": 0.15533333333333332 +} \ No newline at end of file diff --git a/results/Gemma-3-27B-It/thinking_context-120000_bon-3_summary.json b/results/Gemma-3-27B-It/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..01fcada94b951d98f1f3bac4624de962402c1239 --- /dev/null +++ b/results/Gemma-3-27B-It/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.37338415281793874, + "inference_iteration_1_overall_metric": 0.3756110685938797, + "inference_iteration_2_overall_metric": 0.3716321861397887, + "inference_iteration_3_overall_metric": 0.3729092037201496, + "average_token_length_metric": { + "8k": 0.44812577930836095, + "16k": 0.4266217475899872, + "32k": 0.4074453646105579, + "64k": 0.35662526806956907, + "128k": 0.2952141304786102, + "256k": 0.3062726268505501 + }, + "average_contextual_requirement_metric": { + "Full": 0.3443713298159222, + "Partial": 0.4103095639114165 + }, + "average_difficulty_metric": { + "Easy": 0.5780767692142667, + "Moderate": 0.24533089723723267, + "Hard": 0.3056384367420397, + "Extreme": 0.27775702033096106 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6037064651554387, + "T2. Sequencing & Structure Reconstruction": 0.6278458897510606, + "T3. Evidence-Grounded QA": 0.3361111111111111, + "T4. Summarization & Synthesis": 0.45719209902963875, + "T5. Attribution & Citation Alignment": 0.23234121031762375, + "T6. Aggregation & Clustering": 0.38387242742350736, + "T7. Consistency & Compliance Checking": 0.18133975282134737, + "T8. Structured & Numeric Reasoning": 0.3114197530864198, + "T9. Version & Code Diff Analysis": 0.44895353115875314, + "T10. Rule Induction & In-Context Learning": 0.35731481481481475, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.28888888888888886 + }, + "average_language_metric": { + "Chinese": 0.3378195347928154, + "English": 0.40894877084306375 + }, + "BoN-1": { + "overall_metric": 0.3756110685938797, + "token_length": { + "8k": 0.4545820286350837, + "16k": 0.4352872653228386, + "32k": 0.3950079365533934, + "64k": 0.35311614333477187, + "128k": 0.3114655730243775, + "256k": 0.3042074646928149 + }, + "contextual_requirement": { + "Full": 0.35375594987355935, + "Partial": 0.4034266742379251 + }, + "difficulty": { + "Easy": 0.5844518111059833, + "Moderate": 0.23662855463799365, + "Hard": 0.3096811226362121, + "Extreme": 0.281425757285206 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6068670183757786, + "T2. Sequencing & Structure Reconstruction": 0.6163485749654042, + "T3. Evidence-Grounded QA": 0.36666666666666664, + "T4. Summarization & Synthesis": 0.4560619845050425, + "T5. Attribution & Citation Alignment": 0.22005868934189987, + "T6. Aggregation & Clustering": 0.39305351710005704, + "T7. Consistency & Compliance Checking": 0.1715251657008684, + "T8. Structured & Numeric Reasoning": 0.33888888888888885, + "T9. Version & Code Diff Analysis": 0.44004517714509755, + "T10. Rule Induction & In-Context Learning": 0.35888888888888887, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.275 + }, + "language": { + "Chinese": 0.34160224855312066, + "English": 0.40961988863463966 + } + }, + "pass@1": 0.15533333333333332, + "BoN-2": { + "overall_metric": 0.4363206399942735, + "token_length": { + "8k": 0.5234602086730173, + "16k": 0.5018004637617909, + "32k": 0.46794475339715547, + "64k": 0.4185517955545234, + "128k": 0.3516627837551589, + "256k": 0.354503834823997 + }, + "contextual_requirement": { + "Full": 0.4063870314951086, + "Partial": 0.4744179599023025 + }, + "difficulty": { + "Easy": 0.6923421245761895, + "Moderate": 0.2843297701377906, + "Hard": 0.35026505438423683, + "Extreme": 0.31221398104277637 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6567331296259676, + "T2. Sequencing & Structure Reconstruction": 0.6726985976985973, + "T3. Evidence-Grounded QA": 0.43333333333333335, + "T4. Summarization & Synthesis": 0.4711965663860271, + "T5. Attribution & Citation Alignment": 0.27489752350046465, + "T6. Aggregation & Clustering": 0.44484952313131765, + "T7. Consistency & Compliance Checking": 0.22198381203531814, + "T8. Structured & Numeric Reasoning": 0.40925925925925927, + "T9. Version & Code Diff Analysis": 0.5240655133007459, + "T10. Rule Induction & In-Context Learning": 0.4486111111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334 + }, + "language": { + "Chinese": 0.3992628958468014, + "English": 0.47337838414174643 + } + }, + "pass@2": 0.19933333333333333, + "BoN-3": { + "overall_metric": 0.4678951184844386, + "token_length": { + "8k": 0.5595081038285018, + "16k": 0.5288508385707865, + "32k": 0.5128049001652387, + "64k": 0.45281870495664023, + "128k": 0.37498216355710723, + "256k": 0.3784059998283628 + }, + "contextual_requirement": { + "Full": 0.4378657173135933, + "Partial": 0.5061143563382442 + }, + "difficulty": { + "Easy": 0.7390417273273124, + "Moderate": 0.31763207699520324, + "Hard": 0.37625789112044933, + "Extreme": 0.3297508281124345 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6917909626535441, + "T2. Sequencing & Structure Reconstruction": 0.7037670200170199, + "T3. Evidence-Grounded QA": 0.475, + "T4. Summarization & Synthesis": 0.4763487195551507, + "T5. Attribution & Citation Alignment": 0.30687631368745927, + "T6. Aggregation & Clustering": 0.4772314892310187, + "T7. Consistency & Compliance Checking": 0.24817243122652502, + "T8. Structured & Numeric Reasoning": 0.4217592592592592, + "T9. Version & Code Diff Analysis": 0.5559667511226697, + "T10. Rule Induction & In-Context Learning": 0.5098611111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333 + }, + "language": { + "Chinese": 0.4238794242578244, + "English": 0.511910812711055 + } + }, + "pass@3": 0.22333333333333333 +} \ No newline at end of file diff --git a/results/Gemma-3-4B-It/nonthinking_context-120000_bon-3_summary.json b/results/Gemma-3-4B-It/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..58b773f2e218e0914f011b011369ec92b670647a --- /dev/null +++ b/results/Gemma-3-4B-It/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.2175748220994214, + "inference_iteration_1_overall_metric": 0.21852109706784154, + "inference_iteration_2_overall_metric": 0.2163322668515703, + "inference_iteration_3_overall_metric": 0.21787110237885274, + "average_token_length_metric": { + "8k": 0.24656523663590132, + "16k": 0.2205604877341683, + "32k": 0.23963284248634728, + "64k": 0.21111513028758372, + "128k": 0.19343593120899555, + "256k": 0.19413930424353257 + }, + "average_contextual_requirement_metric": { + "Full": 0.21444597727329182, + "Partial": 0.22155698824176823 + }, + "average_difficulty_metric": { + "Easy": 0.28179521332096, + "Moderate": 0.15821172453341914, + "Hard": 0.20704875252188354, + "Extreme": 0.19312877831692504 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.4236351950773965, + "T2. Sequencing & Structure Reconstruction": 0.45359587924627603, + "T3. Evidence-Grounded QA": 0.3194444444444444, + "T4. Summarization & Synthesis": 0.443681734303759, + "T5. Attribution & Citation Alignment": 0.042786026910229404, + "T6. Aggregation & Clustering": 0.19637703429305803, + "T7. Consistency & Compliance Checking": 0.09142599749178396, + "T8. Structured & Numeric Reasoning": 0.02438271604938272, + "T9. Version & Code Diff Analysis": 0.1616430041389551, + "T10. Rule Induction & In-Context Learning": 0.22050925925925927, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.18611111111111114 + }, + "average_language_metric": { + "Chinese": 0.20889372279815283, + "English": 0.22625592140069015 + }, + "BoN-1": { + "overall_metric": 0.21852109706784154, + "token_length": { + "8k": 0.2476866042969573, + "16k": 0.21620176526480195, + "32k": 0.24753627633784483, + "64k": 0.21130221395252485, + "128k": 0.19685176433002144, + "256k": 0.19154795822489784 + }, + "contextual_requirement": { + "Full": 0.21588524494794425, + "Partial": 0.22187581794771047 + }, + "difficulty": { + "Easy": 0.28399652990881596, + "Moderate": 0.1580374343562769, + "Hard": 0.20857790675358978, + "Extreme": 0.19305337410218493 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.4309006915611223, + "T2. Sequencing & Structure Reconstruction": 0.4447710584938352, + "T3. Evidence-Grounded QA": 0.3333333333333333, + "T4. Summarization & Synthesis": 0.441308668504869, + "T5. Attribution & Citation Alignment": 0.03940722221903115, + "T6. Aggregation & Clustering": 0.20337572553598166, + "T7. Consistency & Compliance Checking": 0.08406090392051238, + "T8. Structured & Numeric Reasoning": 0.02361111111111111, + "T9. Version & Code Diff Analysis": 0.16341557282886326, + "T10. Rule Induction & In-Context Learning": 0.22013888888888886, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668 + }, + "language": { + "Chinese": 0.2120533839166827, + "English": 0.22498881021900005 + } + }, + "pass@1": 0.06733333333333333, + "BoN-2": { + "overall_metric": 0.23271347684478086, + "token_length": { + "8k": 0.2631313909667352, + "16k": 0.2312324535272521, + "32k": 0.25978977734748787, + "64k": 0.23161875208541116, + "128k": 0.20537856042145283, + "256k": 0.20512992672034427 + }, + "contextual_requirement": { + "Full": 0.2292600969188946, + "Partial": 0.23710868765954476 + }, + "difficulty": { + "Easy": 0.2946012095946107, + "Moderate": 0.1735404680186527, + "Hard": 0.22252519869057716, + "Extreme": 0.21047603309909335 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.4536230189628364, + "T2. Sequencing & Structure Reconstruction": 0.4791046371928724, + "T3. Evidence-Grounded QA": 0.3333333333333333, + "T4. Summarization & Synthesis": 0.4555393638488407, + "T5. Attribution & Citation Alignment": 0.04458180082240233, + "T6. Aggregation & Clustering": 0.22301925498410874, + "T7. Consistency & Compliance Checking": 0.10512521699205389, + "T8. Structured & Numeric Reasoning": 0.029166666666666667, + "T9. Version & Code Diff Analysis": 0.18246404287967255, + "T10. Rule Induction & In-Context Learning": 0.23263888888888887, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668 + }, + "language": { + "Chinese": 0.22667394776839053, + "English": 0.2387530059211707 + } + }, + "pass@2": 0.06866666666666667, + "BoN-3": { + "overall_metric": 0.2401600102211412, + "token_length": { + "8k": 0.26882845096050667, + "16k": 0.24012842113745536, + "32k": 0.2657505147595882, + "64k": 0.237575165952952, + "128k": 0.21797743491880378, + "256k": 0.21070007359754095 + }, + "contextual_requirement": { + "Full": 0.23644784183701267, + "Partial": 0.24488458816457748 + }, + "difficulty": { + "Easy": 0.3019876106242685, + "Moderate": 0.17780290261890433, + "Hard": 0.22967603723970634, + "Extreme": 0.22027403043562677 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.4596998605405389, + "T2. Sequencing & Structure Reconstruction": 0.5016641610023963, + "T3. Evidence-Grounded QA": 0.3333333333333333, + "T4. Summarization & Synthesis": 0.4669275398584054, + "T5. Attribution & Citation Alignment": 0.051513017753619265, + "T6. Aggregation & Clustering": 0.23601198455578568, + "T7. Consistency & Compliance Checking": 0.11322244187918541, + "T8. Structured & Numeric Reasoning": 0.03148148148148148, + "T9. Version & Code Diff Analysis": 0.19070502006795878, + "T10. Rule Induction & In-Context Learning": 0.23541666666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668 + }, + "language": { + "Chinese": 0.23450699509250633, + "English": 0.24581302534977606 + } + }, + "pass@3": 0.07 +} \ No newline at end of file diff --git a/results/Gemma-3-4B-It/thinking_context-120000_bon-3_summary.json b/results/Gemma-3-4B-It/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..3e659459e0da902fd55fd9f70dd35ea2331e62a4 --- /dev/null +++ b/results/Gemma-3-4B-It/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.2119885030064203, + "inference_iteration_1_overall_metric": 0.21437249924782262, + "inference_iteration_2_overall_metric": 0.21469684223951344, + "inference_iteration_3_overall_metric": 0.20689616753192464, + "average_token_length_metric": { + "8k": 0.24366425705090342, + "16k": 0.2312288563166909, + "32k": 0.24934489050979397, + "64k": 0.17455165550407764, + "128k": 0.18287496802077124, + "256k": 0.19026639063628464 + }, + "average_contextual_requirement_metric": { + "Full": 0.20450724123760522, + "Partial": 0.221510108894003 + }, + "average_difficulty_metric": { + "Easy": 0.28661876230483024, + "Moderate": 0.13848484214026374, + "Hard": 0.1987191002737751, + "Extreme": 0.18722857209328425 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.4122355140839005, + "T2. Sequencing & Structure Reconstruction": 0.4238909558076955, + "T3. Evidence-Grounded QA": 0.2361111111111111, + "T4. Summarization & Synthesis": 0.4241502347645632, + "T5. Attribution & Citation Alignment": 0.04263401505989564, + "T6. Aggregation & Clustering": 0.23701628551906095, + "T7. Consistency & Compliance Checking": 0.07531749025982754, + "T8. Structured & Numeric Reasoning": 0.10648148148148145, + "T9. Version & Code Diff Analysis": 0.15631527456623664, + "T10. Rule Induction & In-Context Learning": 0.20962962962962953, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.11666666666666664 + }, + "average_language_metric": { + "Chinese": 0.19120313153416005, + "English": 0.23277387447868045 + }, + "BoN-1": { + "overall_metric": 0.21437249924782262, + "token_length": { + "8k": 0.260899815427902, + "16k": 0.2556087799887465, + "32k": 0.23432924859855547, + "64k": 0.1786455712537597, + "128k": 0.1740181124720787, + "256k": 0.18273346774589402 + }, + "contextual_requirement": { + "Full": 0.2082323257071287, + "Partial": 0.22218726557234228 + }, + "difficulty": { + "Easy": 0.28518994612968396, + "Moderate": 0.1458906399979874, + "Hard": 0.19668309266803527, + "Extreme": 0.19339405931078624 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.44068453349253894, + "T2. Sequencing & Structure Reconstruction": 0.42484177261436434, + "T3. Evidence-Grounded QA": 0.25, + "T4. Summarization & Synthesis": 0.42183521824061226, + "T5. Attribution & Citation Alignment": 0.03826565166477447, + "T6. Aggregation & Clustering": 0.25040130008880007, + "T7. Consistency & Compliance Checking": 0.08179456674127736, + "T8. Structured & Numeric Reasoning": 0.08703703703703704, + "T9. Version & Code Diff Analysis": 0.16976304211815604, + "T10. Rule Induction & In-Context Learning": 0.19708333333333336, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.10833333333333334 + }, + "language": { + "Chinese": 0.19621205437007752, + "English": 0.23253294412556783 + } + }, + "pass@1": 0.06333333333333334, + "BoN-2": { + "overall_metric": 0.2636576810412479, + "token_length": { + "8k": 0.3079135549113116, + "16k": 0.28734590159692097, + "32k": 0.30413774794877135, + "64k": 0.21523254007725492, + "128k": 0.22804447155407118, + "256k": 0.23927187015915638 + }, + "contextual_requirement": { + "Full": 0.250251500957686, + "Partial": 0.28072009205669 + }, + "difficulty": { + "Easy": 0.364020302669296, + "Moderate": 0.1823512513190994, + "Hard": 0.23524288009416136, + "Extreme": 0.22578189022008766 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5017988282740397, + "T2. Sequencing & Structure Reconstruction": 0.5025411389881287, + "T3. Evidence-Grounded QA": 0.275, + "T4. Summarization & Synthesis": 0.44235758157965793, + "T5. Attribution & Citation Alignment": 0.07123185213097494, + "T6. Aggregation & Clustering": 0.3076327414869081, + "T7. Consistency & Compliance Checking": 0.11226263632567846, + "T8. Structured & Numeric Reasoning": 0.14583333333333334, + "T9. Version & Code Diff Analysis": 0.2253096564350266, + "T10. Rule Induction & In-Context Learning": 0.26222222222222225, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.16666666666666666 + }, + "language": { + "Chinese": 0.24369127122121384, + "English": 0.28362409086128176 + } + }, + "pass@2": 0.086, + "BoN-3": { + "overall_metric": 0.29217739130674947, + "token_length": { + "8k": 0.3390735712300384, + "16k": 0.3084943145981662, + "32k": 0.331141630537573, + "64k": 0.24932737796255805, + "128k": 0.25196083809503234, + "256k": 0.27306661541712973 + }, + "contextual_requirement": { + "Full": 0.2733927975152606, + "Partial": 0.3160850561322812 + }, + "difficulty": { + "Easy": 0.4041733526262008, + "Moderate": 0.2194670736422517, + "Hard": 0.2580927963130063, + "Extreme": 0.23963574676642313 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5332538419210622, + "T2. Sequencing & Structure Reconstruction": 0.5533578363062058, + "T3. Evidence-Grounded QA": 0.3416666666666667, + "T4. Summarization & Synthesis": 0.4493150256833664, + "T5. Attribution & Citation Alignment": 0.09079534419446701, + "T6. Aggregation & Clustering": 0.3434824981209785, + "T7. Consistency & Compliance Checking": 0.12257916371054065, + "T8. Structured & Numeric Reasoning": 0.16805555555555557, + "T9. Version & Code Diff Analysis": 0.24931951714865616, + "T10. Rule Induction & In-Context Learning": 0.30833333333333335, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.175 + }, + "language": { + "Chinese": 0.27088663804629004, + "English": 0.3134681445672092 + } + }, + "pass@3": 0.102 +} \ No newline at end of file diff --git a/results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json b/results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..38091a0c0cac6468902915e449a15a7abc238bb9 --- /dev/null +++ b/results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 67, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5009443422920304, + "inference_iteration_1_overall_metric": 0.5011015308802983, + "inference_iteration_2_overall_metric": 0.49751406897312744, + "inference_iteration_3_overall_metric": 0.5042174270226657, + "average_token_length_metric": { + "8k": 0.5193469215810047, + "16k": 0.5532046525085649, + "32k": 0.5393076869166767, + "64k": 0.45954315717941974, + "128k": 0.4753071835553842, + "256k": 0.4589564520111373 + }, + "average_contextual_requirement_metric": { + "Full": 0.45716785858755954, + "Partial": 0.5566598670068132 + }, + "average_difficulty_metric": { + "Easy": 0.6491770551060967, + "Moderate": 0.4905460752618018, + "Hard": 0.43431147544571097, + "Extreme": 0.3960536599333797 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7424451818013483, + "T2. Sequencing & Structure Reconstruction": 0.7428685203685202, + "T3. Evidence-Grounded QA": 0.5472222222222223, + "T4. Summarization & Synthesis": 0.5115863734748296, + "T5. Attribution & Citation Alignment": 0.5310286936858898, + "T6. Aggregation & Clustering": 0.481867796853936, + "T7. Consistency & Compliance Checking": 0.36661627375742456, + "T8. Structured & Numeric Reasoning": 0.24089506172839517, + "T9. Version & Code Diff Analysis": 0.607908662662019, + "T10. Rule Induction & In-Context Learning": 0.5085648148148147, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.43611111111111117 + }, + "average_language_metric": { + "Chinese": 0.502898779221276, + "English": 0.4989899053627864 + }, + "BoN-1": { + "overall_metric": 0.5011015308802983, + "token_length": { + "8k": 0.5266204875905753, + "16k": 0.5431095505457598, + "32k": 0.5392474254502099, + "64k": 0.46556965866611255, + "128k": 0.48161604654304363, + "256k": 0.45044601648609367 + }, + "contextual_requirement": { + "Full": 0.45533929299371073, + "Partial": 0.5593443790995934 + }, + "difficulty": { + "Easy": 0.6458118886638174, + "Moderate": 0.4926159658473186, + "Hard": 0.42892012991547307, + "Extreme": 0.4021536227747844 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.734408014297419, + "T2. Sequencing & Structure Reconstruction": 0.7401580826580821, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5077956681433179, + "T5. Attribution & Citation Alignment": 0.5196800787679438, + "T6. Aggregation & Clustering": 0.4941189417411527, + "T7. Consistency & Compliance Checking": 0.3706991980056276, + "T8. Structured & Numeric Reasoning": 0.22268518518518515, + "T9. Version & Code Diff Analysis": 0.6228334158501364, + "T10. Rule Induction & In-Context Learning": 0.5243055555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667 + }, + "language": { + "Chinese": 0.49773496712013804, + "English": 0.5044680946404603 + } + }, + "pass@1": 0.23, + "BoN-2": { + "overall_metric": 0.5483962332490746, + "token_length": { + "8k": 0.5725846797858735, + "16k": 0.6070506360109902, + "32k": 0.5785761030801342, + "64k": 0.5092135264066221, + "128k": 0.5131800223023555, + "256k": 0.5097724319084751 + }, + "contextual_requirement": { + "Full": 0.49887561173193595, + "Partial": 0.6114224788163434 + }, + "difficulty": { + "Easy": 0.7046240276873015, + "Moderate": 0.5462638174512109, + "Hard": 0.4707831873116643, + "Extreme": 0.4364568187575384 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7623386988134085, + "T2. Sequencing & Structure Reconstruction": 0.7700949513449513, + "T3. Evidence-Grounded QA": 0.6, + "T4. Summarization & Synthesis": 0.5249959487118497, + "T5. Attribution & Citation Alignment": 0.5953546052259285, + "T6. Aggregation & Clustering": 0.5412045721601162, + "T7. Consistency & Compliance Checking": 0.41849900851913713, + "T8. Structured & Numeric Reasoning": 0.28935185185185186, + "T9. Version & Code Diff Analysis": 0.6584466738317519, + "T10. Rule Induction & In-Context Learning": 0.5701388888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5 + }, + "language": { + "Chinese": 0.5438314702987085, + "English": 0.552960996199442 + } + }, + "pass@2": 0.27666666666666667, + "BoN-3": { + "overall_metric": 0.5729921291255787, + "token_length": { + "8k": 0.5918227174634976, + "16k": 0.6198567950677695, + "32k": 0.6115768945303457, + "64k": 0.5284625404433138, + "128k": 0.558149430962295, + "256k": 0.528084396286257 + }, + "contextual_requirement": { + "Full": 0.524120078602167, + "Partial": 0.6351929207008328 + }, + "difficulty": { + "Easy": 0.728216785648972, + "Moderate": 0.5755547819370206, + "Hard": 0.5066701999617829, + "Extreme": 0.45151519711603216 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7860047126136753, + "T2. Sequencing & Structure Reconstruction": 0.7853528878528878, + "T3. Evidence-Grounded QA": 0.625, + "T4. Summarization & Synthesis": 0.5322270993764735, + "T5. Attribution & Citation Alignment": 0.6208079019292253, + "T6. Aggregation & Clustering": 0.5652450687006129, + "T7. Consistency & Compliance Checking": 0.44884599567602773, + "T8. Structured & Numeric Reasoning": 0.31157407407407406, + "T9. Version & Code Diff Analysis": 0.6765946379547446, + "T10. Rule Induction & In-Context Learning": 0.5895833333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5583333333333333 + }, + "language": { + "Chinese": 0.5755180331103958, + "English": 0.5704662251407642 + } + }, + "pass@3": 0.3 +} \ No newline at end of file diff --git a/results/Kimi-K2-Instruct-0905/thinking_context-224000_bon-3_summary.json b/results/Kimi-K2-Instruct-0905/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..26db8474a6b75109bab5053a74aa49a112ec2fd2 --- /dev/null +++ b/results/Kimi-K2-Instruct-0905/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 69, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5553060678788313, + "inference_iteration_1_overall_metric": 0.558917739810572, + "inference_iteration_2_overall_metric": 0.5552262066724464, + "inference_iteration_3_overall_metric": 0.5517742571534756, + "average_token_length_metric": { + "8k": 0.5978532013581613, + "16k": 0.5816609532803436, + "32k": 0.5872894997726004, + "64k": 0.5360933501085343, + "128k": 0.522886026665569, + "256k": 0.5060533760877814 + }, + "average_contextual_requirement_metric": { + "Full": 0.5076499747745465, + "Partial": 0.6159592772842868 + }, + "average_difficulty_metric": { + "Easy": 0.7729188134795828, + "Moderate": 0.5733088402612271, + "Hard": 0.4375074213815535, + "Extreme": 0.38246894115073926 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.758377055109974, + "T2. Sequencing & Structure Reconstruction": 0.7423694091857211, + "T3. Evidence-Grounded QA": 0.4861111111111112, + "T4. Summarization & Synthesis": 0.5011656658098056, + "T5. Attribution & Citation Alignment": 0.6197584764672828, + "T6. Aggregation & Clustering": 0.5164556923382113, + "T7. Consistency & Compliance Checking": 0.3547519606397262, + "T8. Structured & Numeric Reasoning": 0.5962962962962963, + "T9. Version & Code Diff Analysis": 0.6299270957790389, + "T10. Rule Induction & In-Context Learning": 0.5606944444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666654 + }, + "average_language_metric": { + "Chinese": 0.5409680916435047, + "English": 0.5696440441141594 + }, + "BoN-1": { + "overall_metric": 0.558917739810572, + "token_length": { + "8k": 0.6032704706738693, + "16k": 0.5808397170448323, + "32k": 0.5927696772272222, + "64k": 0.5485389223926213, + "128k": 0.5293584568340762, + "256k": 0.49872919469081073 + }, + "contextual_requirement": { + "Full": 0.5058707162683681, + "Partial": 0.6264321334097424 + }, + "difficulty": { + "Easy": 0.7833558783440772, + "Moderate": 0.5768808845916201, + "Hard": 0.4381739958885578, + "Extreme": 0.3805641270346401 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7528903747933631, + "T2. Sequencing & Structure Reconstruction": 0.7690795456301782, + "T3. Evidence-Grounded QA": 0.525, + "T4. Summarization & Synthesis": 0.5001505956406523, + "T5. Attribution & Citation Alignment": 0.6016113058386018, + "T6. Aggregation & Clustering": 0.5066860396628982, + "T7. Consistency & Compliance Checking": 0.37783095340307493, + "T8. Structured & Numeric Reasoning": 0.5875000000000001, + "T9. Version & Code Diff Analysis": 0.6056866583526187, + "T10. Rule Induction & In-Context Learning": 0.5740277777777777, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.45 + }, + "language": { + "Chinese": 0.5344377355706056, + "English": 0.5833977440505398 + } + }, + "pass@1": 0.30733333333333335, + "BoN-2": { + "overall_metric": 0.6315444476656525, + "token_length": { + "8k": 0.6765293955798586, + "16k": 0.6653958948787116, + "32k": 0.6614274068144226, + "64k": 0.6099720675525434, + "128k": 0.5887552831784975, + "256k": 0.5871866379898839 + }, + "contextual_requirement": { + "Full": 0.5906087497944075, + "Partial": 0.6836444267745122 + }, + "difficulty": { + "Easy": 0.8546630775122501, + "Moderate": 0.6784971908255039, + "Hard": 0.5110824387985737, + "Extreme": 0.4354103526732191 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7926072573330768, + "T2. Sequencing & Structure Reconstruction": 0.814456423206423, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5156065007848037, + "T5. Attribution & Citation Alignment": 0.705413217864198, + "T6. Aggregation & Clustering": 0.6176306197741449, + "T7. Consistency & Compliance Checking": 0.42951044567324476, + "T8. Structured & Numeric Reasoning": 0.6861111111111111, + "T9. Version & Code Diff Analysis": 0.7066217095721875, + "T10. Rule Induction & In-Context Learning": 0.6680555555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.525 + }, + "language": { + "Chinese": 0.617638270540212, + "English": 0.6454506247910954 + } + }, + "pass@2": 0.37066666666666664, + "BoN-3": { + "overall_metric": 0.6596591269299542, + "token_length": { + "8k": 0.6994761842119754, + "16k": 0.6880103332979634, + "32k": 0.6831628614701334, + "64k": 0.6414281237947528, + "128k": 0.6223896794908902, + "256k": 0.6234875793140215 + }, + "contextual_requirement": { + "Full": 0.6186943779973499, + "Partial": 0.7117960801169108 + }, + "difficulty": { + "Easy": 0.865163305432102, + "Moderate": 0.7223592192252923, + "Hard": 0.5480741090923853, + "Extreme": 0.4666471483928459 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8175679720885881, + "T2. Sequencing & Structure Reconstruction": 0.8413347300847297, + "T3. Evidence-Grounded QA": 0.6, + "T4. Summarization & Synthesis": 0.5241839616339089, + "T5. Attribution & Citation Alignment": 0.7286969407202896, + "T6. Aggregation & Clustering": 0.6482898910584165, + "T7. Consistency & Compliance Checking": 0.47389920059132556, + "T8. Structured & Numeric Reasoning": 0.7092592592592593, + "T9. Version & Code Diff Analysis": 0.7448385112889893, + "T10. Rule Induction & In-Context Learning": 0.7002777777777779, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666 + }, + "language": { + "Chinese": 0.646426154128056, + "English": 0.6728920997318576 + } + }, + "pass@3": 0.4013333333333333 +} \ No newline at end of file diff --git a/results/Llama-3.1-405B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Llama-3.1-405B-Instruct/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..813cd67a14dd030568e5c15988acd0c513604750 --- /dev/null +++ b/results/Llama-3.1-405B-Instruct/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.4006972406362581, + "inference_iteration_1_overall_metric": 0.4033767470362484, + "inference_iteration_2_overall_metric": 0.4037400979033875, + "inference_iteration_3_overall_metric": 0.3949748769691391, + "average_token_length_metric": { + "8k": 0.495611810737427, + "16k": 0.47999448108480186, + "32k": 0.4902670612376324, + "64k": 0.40596651011726403, + "128k": 0.2929513776114342, + "256k": 0.23939220302898997 + }, + "average_contextual_requirement_metric": { + "Full": 0.3708303649326616, + "Partial": 0.4387096278953802 + }, + "average_difficulty_metric": { + "Easy": 0.5544944910188809, + "Moderate": 0.2906684222492303, + "Hard": 0.35510805093665054, + "Extreme": 0.33443208075583397 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7010722319989281, + "T2. Sequencing & Structure Reconstruction": 0.6852361434861434, + "T3. Evidence-Grounded QA": 0.39722222222222225, + "T4. Summarization & Synthesis": 0.49860889459519425, + "T5. Attribution & Citation Alignment": 0.33280185377001925, + "T6. Aggregation & Clustering": 0.4288546372402087, + "T7. Consistency & Compliance Checking": 0.22662521432331084, + "T8. Structured & Numeric Reasoning": 0.1651234567901234, + "T9. Version & Code Diff Analysis": 0.4037951252761809, + "T10. Rule Induction & In-Context Learning": 0.43962962962962965, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3194444444444445 + }, + "average_language_metric": { + "Chinese": 0.38105191405127925, + "English": 0.42034256722123686 + }, + "BoN-1": { + "overall_metric": 0.4033767470362484, + "token_length": { + "8k": 0.5193261381744707, + "16k": 0.47458776670121805, + "32k": 0.4893823156329057, + "64k": 0.41443272045065593, + "128k": 0.28371722198812693, + "256k": 0.2388143192701119 + }, + "contextual_requirement": { + "Full": 0.3743223052410306, + "Partial": 0.44035512750288913 + }, + "difficulty": { + "Easy": 0.5734527544487493, + "Moderate": 0.2799762087674356, + "Hard": 0.35650549807929216, + "Extreme": 0.3289038173440243 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6946832479137959, + "T2. Sequencing & Structure Reconstruction": 0.6867736892736891, + "T3. Evidence-Grounded QA": 0.4083333333333333, + "T4. Summarization & Synthesis": 0.5005294746426308, + "T5. Attribution & Citation Alignment": 0.3263529995384955, + "T6. Aggregation & Clustering": 0.4192752041935995, + "T7. Consistency & Compliance Checking": 0.23609519970144652, + "T8. Structured & Numeric Reasoning": 0.18101851851851852, + "T9. Version & Code Diff Analysis": 0.391786542964143, + "T10. Rule Induction & In-Context Learning": 0.47083333333333327, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.30833333333333335 + }, + "language": { + "Chinese": 0.3889859691853749, + "English": 0.41776752488712193 + } + }, + "pass@1": 0.16266666666666665, + "BoN-2": { + "overall_metric": 0.44522764896517386, + "token_length": { + "8k": 0.558048061407528, + "16k": 0.5158119021911939, + "32k": 0.530639639804942, + "64k": 0.45350021661205264, + "128k": 0.3440588360232635, + "256k": 0.26930723775206405 + }, + "contextual_requirement": { + "Full": 0.40918538567791846, + "Partial": 0.49109962042168165 + }, + "difficulty": { + "Easy": 0.6186902100237465, + "Moderate": 0.32736405593203205, + "Hard": 0.39034570869637836, + "Extreme": 0.36866970508796515 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7336012615268711, + "T2. Sequencing & Structure Reconstruction": 0.737358012358012, + "T3. Evidence-Grounded QA": 0.45, + "T4. Summarization & Synthesis": 0.5147616333746151, + "T5. Attribution & Citation Alignment": 0.3674427224531845, + "T6. Aggregation & Clustering": 0.4743354751511784, + "T7. Consistency & Compliance Checking": 0.26985986799607303, + "T8. Structured & Numeric Reasoning": 0.20462962962962963, + "T9. Version & Code Diff Analysis": 0.44727785652000346, + "T10. Rule Induction & In-Context Learning": 0.5249999999999999, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664 + }, + "language": { + "Chinese": 0.433626244820898, + "English": 0.4568290531094507 + } + }, + "pass@2": 0.19, + "BoN-3": { + "overall_metric": 0.4623473432019363, + "token_length": { + "8k": 0.5703117586774046, + "16k": 0.531567048206503, + "32k": 0.5504750677715584, + "64k": 0.47066633396240054, + "128k": 0.3715170610120738, + "256k": 0.27954678958167684 + }, + "contextual_requirement": { + "Full": 0.42433070289568703, + "Partial": 0.510732158137163 + }, + "difficulty": { + "Easy": 0.6297066688011407, + "Moderate": 0.34570653296360615, + "Hard": 0.4161126844074583, + "Extreme": 0.3859923786829926 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7453735974615648, + "T2. Sequencing & Structure Reconstruction": 0.7443156843156841, + "T3. Evidence-Grounded QA": 0.45, + "T4. Summarization & Synthesis": 0.52546578139621, + "T5. Attribution & Citation Alignment": 0.4112102181587475, + "T6. Aggregation & Clustering": 0.5005878653860251, + "T7. Consistency & Compliance Checking": 0.2880921279803352, + "T8. Structured & Numeric Reasoning": 0.22685185185185183, + "T9. Version & Code Diff Analysis": 0.4880120741980102, + "T10. Rule Induction & In-Context Learning": 0.5249999999999999, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664 + }, + "language": { + "Chinese": 0.4464982468640505, + "English": 0.47819643953982244 + } + }, + "pass@3": 0.20266666666666666 +} \ No newline at end of file diff --git a/results/Llama-3.1-405B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Llama-3.1-405B-Instruct/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b0dac0fb76a0adce17e358a97f522b4efb811204 --- /dev/null +++ b/results/Llama-3.1-405B-Instruct/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.40659333298471173, + "inference_iteration_1_overall_metric": 0.3990307184980547, + "inference_iteration_2_overall_metric": 0.40746318832781453, + "inference_iteration_3_overall_metric": 0.4132860921282645, + "average_token_length_metric": { + "8k": 0.5237859741060642, + "16k": 0.517961066700275, + "32k": 0.4641044292483723, + "64k": 0.4182090511525332, + "128k": 0.260073356838944, + "256k": 0.25542611986208036 + }, + "average_contextual_requirement_metric": { + "Full": 0.3729971231698154, + "Partial": 0.44935214547639707 + }, + "average_difficulty_metric": { + "Easy": 0.613595833316379, + "Moderate": 0.2921863382975652, + "Hard": 0.3409469495726698, + "Extreme": 0.29809383550926016 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6642543814799384, + "T2. Sequencing & Structure Reconstruction": 0.660868776285443, + "T3. Evidence-Grounded QA": 0.3, + "T4. Summarization & Synthesis": 0.4699204609681677, + "T5. Attribution & Citation Alignment": 0.35415278369391223, + "T6. Aggregation & Clustering": 0.44523374235335905, + "T7. Consistency & Compliance Checking": 0.2145859605426568, + "T8. Structured & Numeric Reasoning": 0.36867283950617286, + "T9. Version & Code Diff Analysis": 0.43511107590777826, + "T10. Rule Induction & In-Context Learning": 0.37759259259259265, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2777777777777778 + }, + "average_language_metric": { + "Chinese": 0.36859109108368215, + "English": 0.4445955748857405 + }, + "BoN-1": { + "overall_metric": 0.3990307184980547, + "token_length": { + "8k": 0.5319602144193065, + "16k": 0.495294502488435, + "32k": 0.44900241142401065, + "64k": 0.4104061454120568, + "128k": 0.24974039710926457, + "256k": 0.2577806401352564 + }, + "contextual_requirement": { + "Full": 0.3663953815207406, + "Partial": 0.4405666019237288 + }, + "difficulty": { + "Easy": 0.6016370051267634, + "Moderate": 0.26872089549030526, + "Hard": 0.34181177303603094, + "Extreme": 0.3002570456178896 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6857190555998647, + "T2. Sequencing & Structure Reconstruction": 0.6476817164317161, + "T3. Evidence-Grounded QA": 0.2916666666666667, + "T4. Summarization & Synthesis": 0.4722890782260215, + "T5. Attribution & Citation Alignment": 0.3720362720390496, + "T6. Aggregation & Clustering": 0.4527252876757789, + "T7. Consistency & Compliance Checking": 0.19573002717715302, + "T8. Structured & Numeric Reasoning": 0.3111111111111111, + "T9. Version & Code Diff Analysis": 0.4434471088718592, + "T10. Rule Induction & In-Context Learning": 0.39402777777777775, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667 + }, + "language": { + "Chinese": 0.3646393289213864, + "English": 0.43342210807472437 + } + }, + "pass@1": 0.17866666666666667, + "BoN-2": { + "overall_metric": 0.47482444558969183, + "token_length": { + "8k": 0.6017952145070679, + "16k": 0.591088648529682, + "32k": 0.5375509398549342, + "64k": 0.4892562129103632, + "128k": 0.3164459575452793, + "256k": 0.3128097001908253 + }, + "contextual_requirement": { + "Full": 0.43173242380596516, + "Partial": 0.5296688369507991 + }, + "difficulty": { + "Easy": 0.7232714101187123, + "Moderate": 0.3321291637725583, + "Hard": 0.3952399665178113, + "Extreme": 0.3486594773940952 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7289955546187882, + "T2. Sequencing & Structure Reconstruction": 0.7412526825026818, + "T3. Evidence-Grounded QA": 0.4166666666666667, + "T4. Summarization & Synthesis": 0.49118408476251574, + "T5. Attribution & Citation Alignment": 0.40752656877934634, + "T6. Aggregation & Clustering": 0.5058127366024506, + "T7. Consistency & Compliance Checking": 0.2513867378923854, + "T8. Structured & Numeric Reasoning": 0.4481481481481482, + "T9. Version & Code Diff Analysis": 0.5031863563544513, + "T10. Rule Induction & In-Context Learning": 0.48847222222222225, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.35 + }, + "language": { + "Chinese": 0.43768030437389926, + "English": 0.511968586805485 + } + }, + "pass@2": 0.24, + "BoN-3": { + "overall_metric": 0.5112966568518927, + "token_length": { + "8k": 0.638281295176006, + "16k": 0.6423408107009556, + "32k": 0.5654314097480697, + "64k": 0.5253595874826821, + "128k": 0.3530891462917609, + "256k": 0.3432776917118821 + }, + "contextual_requirement": { + "Full": 0.4709592628636537, + "Partial": 0.5626351582914695 + }, + "difficulty": { + "Easy": 0.7672241772316003, + "Moderate": 0.375515223666526, + "Hard": 0.43497012537117896, + "Extreme": 0.3702560590461619 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7536336494527094, + "T2. Sequencing & Structure Reconstruction": 0.7626347726347721, + "T3. Evidence-Grounded QA": 0.43333333333333335, + "T4. Summarization & Synthesis": 0.5004237684828063, + "T5. Attribution & Citation Alignment": 0.44213397827806017, + "T6. Aggregation & Clustering": 0.5445203343972627, + "T7. Consistency & Compliance Checking": 0.2924260189234064, + "T8. Structured & Numeric Reasoning": 0.5148148148148148, + "T9. Version & Code Diff Analysis": 0.5454347340415279, + "T10. Rule Induction & In-Context Learning": 0.5093055555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667 + }, + "language": { + "Chinese": 0.47723166126529054, + "English": 0.545361652438495 + } + }, + "pass@3": 0.27466666666666667 +} \ No newline at end of file diff --git a/results/Llama-3.1-70B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Llama-3.1-70B-Instruct/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..8e84d47ed865a21f77a21e18b459481428432968 --- /dev/null +++ b/results/Llama-3.1-70B-Instruct/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.31531891526483563, + "inference_iteration_1_overall_metric": 0.3187840899451787, + "inference_iteration_2_overall_metric": 0.3181153572604232, + "inference_iteration_3_overall_metric": 0.3090572985889053, + "average_token_length_metric": { + "8k": 0.44072420920233435, + "16k": 0.4154170524608382, + "32k": 0.39938052404397517, + "64k": 0.3038357678876172, + "128k": 0.16668959617065898, + "256k": 0.16586634182358947 + }, + "average_contextual_requirement_metric": { + "Full": 0.28629550819966637, + "Partial": 0.3522577969841428 + }, + "average_difficulty_metric": { + "Easy": 0.4401547192469389, + "Moderate": 0.21463302825178993, + "Hard": 0.2886301431775311, + "Extreme": 0.26222895835717064 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.5528929885667343, + "T2. Sequencing & Structure Reconstruction": 0.5351228410297036, + "T3. Evidence-Grounded QA": 0.34722222222222227, + "T4. Summarization & Synthesis": 0.48251270202602564, + "T5. Attribution & Citation Alignment": 0.2383585369805011, + "T6. Aggregation & Clustering": 0.3094103218555832, + "T7. Consistency & Compliance Checking": 0.18317006381802675, + "T8. Structured & Numeric Reasoning": 0.11111111111111113, + "T9. Version & Code Diff Analysis": 0.35298805295632424, + "T10. Rule Induction & In-Context Learning": 0.3324074074074074, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.1944444444444444 + }, + "average_language_metric": { + "Chinese": 0.27963370499725654, + "English": 0.3510041255324155 + }, + "BoN-1": { + "overall_metric": 0.3187840899451787, + "token_length": { + "8k": 0.45086107163290673, + "16k": 0.42082616936389317, + "32k": 0.4024958091173256, + "64k": 0.31412697939763573, + "128k": 0.16271506723441695, + "256k": 0.16167944292489206 + }, + "contextual_requirement": { + "Full": 0.28863047320759827, + "Partial": 0.35716142033846177 + }, + "difficulty": { + "Easy": 0.4450445268436371, + "Moderate": 0.23675467668036354, + "Hard": 0.2806267700317323, + "Extreme": 0.25941235199849716 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5578290894275468, + "T2. Sequencing & Structure Reconstruction": 0.5340221352721353, + "T3. Evidence-Grounded QA": 0.36666666666666664, + "T4. Summarization & Synthesis": 0.4824250532281757, + "T5. Attribution & Citation Alignment": 0.22435495508076153, + "T6. Aggregation & Clustering": 0.3143695347862014, + "T7. Consistency & Compliance Checking": 0.1819753578820025, + "T8. Structured & Numeric Reasoning": 0.11342592592592592, + "T9. Version & Code Diff Analysis": 0.33790255230380384, + "T10. Rule Induction & In-Context Learning": 0.3586111111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334 + }, + "language": { + "Chinese": 0.29265028040862706, + "English": 0.34491789948172935 + } + }, + "pass@1": 0.116, + "BoN-2": { + "overall_metric": 0.3704962078912661, + "token_length": { + "8k": 0.5007140188951568, + "16k": 0.471793141596409, + "32k": 0.4656246883405684, + "64k": 0.3555228262932919, + "128k": 0.22278285337549444, + "256k": 0.20653971884667408 + }, + "contextual_requirement": { + "Full": 0.33766430263043345, + "Partial": 0.41228226913232546 + }, + "difficulty": { + "Easy": 0.505817704967975, + "Moderate": 0.26539475589230027, + "Hard": 0.33902165222515473, + "Extreme": 0.3119632638554326 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6325785537327107, + "T2. Sequencing & Structure Reconstruction": 0.6024494976700858, + "T3. Evidence-Grounded QA": 0.4083333333333333, + "T4. Summarization & Synthesis": 0.5068628671369674, + "T5. Attribution & Citation Alignment": 0.27736426767676775, + "T6. Aggregation & Clustering": 0.39468559218559207, + "T7. Consistency & Compliance Checking": 0.23698781644898675, + "T8. Structured & Numeric Reasoning": 0.1412037037037037, + "T9. Version & Code Diff Analysis": 0.42207618836131144, + "T10. Rule Induction & In-Context Learning": 0.3888888888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334 + }, + "language": { + "Chinese": 0.3386454170505693, + "English": 0.4023469987319626 + } + }, + "pass@2": 0.144, + "BoN-3": { + "overall_metric": 0.39272719549993146, + "token_length": { + "8k": 0.5352185196666011, + "16k": 0.48774731079355566, + "32k": 0.4914359262892438, + "64k": 0.38089490889752375, + "128k": 0.23106162164056493, + "256k": 0.23000488571210054 + }, + "contextual_requirement": { + "Full": 0.3601448943540025, + "Partial": 0.434195578776569 + }, + "difficulty": { + "Easy": 0.5296491139243411, + "Moderate": 0.28398341296374197, + "Hard": 0.3678982241467773, + "Extreme": 0.3304684709396022 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6534337905352087, + "T2. Sequencing & Structure Reconstruction": 0.6269316794316794, + "T3. Evidence-Grounded QA": 0.43333333333333335, + "T4. Summarization & Synthesis": 0.5138761303847846, + "T5. Attribution & Citation Alignment": 0.30993542257550877, + "T6. Aggregation & Clustering": 0.4248105135605135, + "T7. Consistency & Compliance Checking": 0.25284892696838046, + "T8. Structured & Numeric Reasoning": 0.1523148148148148, + "T9. Version & Code Diff Analysis": 0.4435626489175109, + "T10. Rule Induction & In-Context Learning": 0.4330555555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25 + }, + "language": { + "Chinese": 0.3526743750393581, + "English": 0.4327800159605054 + } + }, + "pass@3": 0.16266666666666665 +} \ No newline at end of file diff --git a/results/Llama-3.1-70B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Llama-3.1-70B-Instruct/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..a114871cfe65a4d1acbb6687755e515db1a58be0 --- /dev/null +++ b/results/Llama-3.1-70B-Instruct/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3212355454655496, + "inference_iteration_1_overall_metric": 0.3168848382877898, + "inference_iteration_2_overall_metric": 0.3235694833261471, + "inference_iteration_3_overall_metric": 0.32325231478271244, + "average_token_length_metric": { + "8k": 0.44607201777886324, + "16k": 0.43551597572008266, + "32k": 0.40532179664339874, + "64k": 0.3372735574136524, + "128k": 0.14963906519016765, + "256k": 0.15359086004713354 + }, + "average_contextual_requirement_metric": { + "Full": 0.2920710323592253, + "Partial": 0.3583540166917813 + }, + "average_difficulty_metric": { + "Easy": 0.4845503461095368, + "Moderate": 0.21437000317705404, + "Hard": 0.28040007010006374, + "Extreme": 0.2393443186282747 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.5203183043430563, + "T2. Sequencing & Structure Reconstruction": 0.5048829192634423, + "T3. Evidence-Grounded QA": 0.2972222222222221, + "T4. Summarization & Synthesis": 0.46869982746667466, + "T5. Attribution & Citation Alignment": 0.24051072181402314, + "T6. Aggregation & Clustering": 0.3352447566072413, + "T7. Consistency & Compliance Checking": 0.16831334519236535, + "T8. Structured & Numeric Reasoning": 0.24182098765432103, + "T9. Version & Code Diff Analysis": 0.3504639112512836, + "T10. Rule Induction & In-Context Learning": 0.3041666666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.21111111111111108 + }, + "average_language_metric": { + "Chinese": 0.27396983223922683, + "English": 0.368501258691873 + }, + "BoN-1": { + "overall_metric": 0.3168848382877898, + "token_length": { + "8k": 0.4288512287020653, + "16k": 0.431350020549353, + "32k": 0.41805229286031653, + "64k": 0.3101016124828498, + "128k": 0.16200619520020057, + "256k": 0.1509476799319523 + }, + "contextual_requirement": { + "Full": 0.28780546371048366, + "Partial": 0.3538949513861792 + }, + "difficulty": { + "Easy": 0.48546407013873055, + "Moderate": 0.1945019101312392, + "Hard": 0.2709484383728626, + "Extreme": 0.24276914751620637 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.48095324443543325, + "T2. Sequencing & Structure Reconstruction": 0.4873936033102699, + "T3. Evidence-Grounded QA": 0.275, + "T4. Summarization & Synthesis": 0.46817349027530025, + "T5. Attribution & Citation Alignment": 0.21679364691461467, + "T6. Aggregation & Clustering": 0.3425653712663516, + "T7. Consistency & Compliance Checking": 0.17647214053474497, + "T8. Structured & Numeric Reasoning": 0.25462962962962965, + "T9. Version & Code Diff Analysis": 0.347662448182329, + "T10. Rule Induction & In-Context Learning": 0.31625, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334 + }, + "language": { + "Chinese": 0.2661805594268294, + "English": 0.36758911714875 + } + }, + "pass@1": 0.13733333333333334, + "BoN-2": { + "overall_metric": 0.39714621701263747, + "token_length": { + "8k": 0.5101215226314286, + "16k": 0.5452446091546882, + "32k": 0.5033835517180962, + "64k": 0.40629425415229514, + "128k": 0.20367087402793838, + "256k": 0.21416249039138233 + }, + "contextual_requirement": { + "Full": 0.3628015637327161, + "Partial": 0.4408575939143573 + }, + "difficulty": { + "Easy": 0.6103509331261675, + "Moderate": 0.2739929015343184, + "Hard": 0.33264890818640663, + "Extreme": 0.2868424835064885 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6190525866570429, + "T2. Sequencing & Structure Reconstruction": 0.5863848211097236, + "T3. Evidence-Grounded QA": 0.4, + "T4. Summarization & Synthesis": 0.48619725203488445, + "T5. Attribution & Citation Alignment": 0.290019754922306, + "T6. Aggregation & Clustering": 0.4354357416367219, + "T7. Consistency & Compliance Checking": 0.21250441110110105, + "T8. Structured & Numeric Reasoning": 0.3388888888888889, + "T9. Version & Code Diff Analysis": 0.4174297354939513, + "T10. Rule Induction & In-Context Learning": 0.42666666666666675, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336 + }, + "language": { + "Chinese": 0.3417787747514596, + "English": 0.45251365927381704 + } + }, + "pass@2": 0.19133333333333333, + "BoN-3": { + "overall_metric": 0.43806378317359107, + "token_length": { + "8k": 0.5718190971548373, + "16k": 0.5932030702540269, + "32k": 0.5401634596906353, + "64k": 0.45712267932316264, + "128k": 0.2296143794444798, + "256k": 0.23646001317440798 + }, + "contextual_requirement": { + "Full": 0.39971488113838277, + "Partial": 0.48687147667294856 + }, + "difficulty": { + "Easy": 0.6591788734633496, + "Moderate": 0.3103483415629278, + "Hard": 0.38512512860737047, + "Extreme": 0.3144991474359921 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6739172902122323, + "T2. Sequencing & Structure Reconstruction": 0.6553062801836215, + "T3. Evidence-Grounded QA": 0.475, + "T4. Summarization & Synthesis": 0.5017174053624821, + "T5. Attribution & Citation Alignment": 0.31632987998243095, + "T6. Aggregation & Clustering": 0.4726401785278596, + "T7. Consistency & Compliance Checking": 0.24365040952524628, + "T8. Structured & Numeric Reasoning": 0.36944444444444446, + "T9. Version & Code Diff Analysis": 0.44797944073835727, + "T10. Rule Induction & In-Context Learning": 0.46027777777777784, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665 + }, + "language": { + "Chinese": 0.386238080047615, + "English": 0.48988948629956824 + } + }, + "pass@3": 0.21933333333333332 +} \ No newline at end of file diff --git a/results/Llama-3.1-8B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Llama-3.1-8B-Instruct/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..dbbed086dc03a451c3ba5d51dc42498caa85232d --- /dev/null +++ b/results/Llama-3.1-8B-Instruct/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.21094590782574696, + "inference_iteration_1_overall_metric": 0.20814425242445228, + "inference_iteration_2_overall_metric": 0.213015185500322, + "inference_iteration_3_overall_metric": 0.21167828555246626, + "average_token_length_metric": { + "8k": 0.24549737122739362, + "16k": 0.2608710428868677, + "32k": 0.2249354240045269, + "64k": 0.18691854981764278, + "128k": 0.18010527298228765, + "256k": 0.16734778603576234 + }, + "average_contextual_requirement_metric": { + "Full": 0.1813007104244737, + "Partial": 0.24867615906373072 + }, + "average_difficulty_metric": { + "Easy": 0.254652823450579, + "Moderate": 0.13823151671179162, + "Hard": 0.21215047305696197, + "Extreme": 0.21003592225516218 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.4667231034949644, + "T2. Sequencing & Structure Reconstruction": 0.4213756913607651, + "T3. Evidence-Grounded QA": 0.15000000000000002, + "T4. Summarization & Synthesis": 0.47465443881044483, + "T5. Attribution & Citation Alignment": 0.08095709533952888, + "T6. Aggregation & Clustering": 0.1895252817222955, + "T7. Consistency & Compliance Checking": 0.12098051997071714, + "T8. Structured & Numeric Reasoning": 0.04969135802469136, + "T9. Version & Code Diff Analysis": 0.21578074220253873, + "T10. Rule Induction & In-Context Learning": 0.16759259259259265, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.11944444444444445 + }, + "average_language_metric": { + "Chinese": 0.17905397547001678, + "English": 0.24283784018147686 + }, + "BoN-1": { + "overall_metric": 0.20814425242445228, + "token_length": { + "8k": 0.22656456269661684, + "16k": 0.25006742264480875, + "32k": 0.22983820975916858, + "64k": 0.17867648708372652, + "128k": 0.1893868580863639, + "256k": 0.17433197427602828 + }, + "contextual_requirement": { + "Full": 0.1770456924597648, + "Partial": 0.24772423783405423 + }, + "difficulty": { + "Easy": 0.2566570988180275, + "Moderate": 0.1324196374719661, + "Hard": 0.20899042479807298, + "Extreme": 0.2041821568416995 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.4630702505342932, + "T2. Sequencing & Structure Reconstruction": 0.4120432796938637, + "T3. Evidence-Grounded QA": 0.13333333333333333, + "T4. Summarization & Synthesis": 0.47400084179308405, + "T5. Attribution & Citation Alignment": 0.0690637373143065, + "T6. Aggregation & Clustering": 0.19812149190741143, + "T7. Consistency & Compliance Checking": 0.11267079352137302, + "T8. Structured & Numeric Reasoning": 0.04490740740740741, + "T9. Version & Code Diff Analysis": 0.19979772893803915, + "T10. Rule Induction & In-Context Learning": 0.18361111111111109, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.13333333333333333 + }, + "language": { + "Chinese": 0.17869850347586846, + "English": 0.2375900013730359 + } + }, + "pass@1": 0.052, + "BoN-2": { + "overall_metric": 0.24847856382430364, + "token_length": { + "8k": 0.28288621820265775, + "16k": 0.30147254873111595, + "32k": 0.2728297990630907, + "64k": 0.21066076283394117, + "128k": 0.21820825855623124, + "256k": 0.2048137955587849 + }, + "contextual_requirement": { + "Full": 0.2142227788917067, + "Partial": 0.29207683555669967 + }, + "difficulty": { + "Easy": 0.3015279708931293, + "Moderate": 0.1575465527579005, + "Hard": 0.2601383702201079, + "Extreme": 0.24243277505755167 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5278328421453239, + "T2. Sequencing & Structure Reconstruction": 0.4858897020853544, + "T3. Evidence-Grounded QA": 0.18333333333333332, + "T4. Summarization & Synthesis": 0.4923937781072418, + "T5. Attribution & Citation Alignment": 0.10321693015669395, + "T6. Aggregation & Clustering": 0.24202529361366723, + "T7. Consistency & Compliance Checking": 0.1496372306528247, + "T8. Structured & Numeric Reasoning": 0.06435185185185185, + "T9. Version & Code Diff Analysis": 0.2640161200205543, + "T10. Rule Induction & In-Context Learning": 0.22361111111111112, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666 + }, + "language": { + "Chinese": 0.21521719416329613, + "English": 0.2817399334853111 + } + }, + "pass@2": 0.068, + "BoN-3": { + "overall_metric": 0.27478587280873845, + "token_length": { + "8k": 0.32764512992755657, + "16k": 0.3226696445637056, + "32k": 0.29882237152612906, + "64k": 0.24198009489674097, + "128k": 0.23991327950411545, + "256k": 0.21768471643418316 + }, + "contextual_requirement": { + "Full": 0.2383436635644805, + "Partial": 0.32116686639233955 + }, + "difficulty": { + "Easy": 0.33550759893050514, + "Moderate": 0.18287651202405897, + "Hard": 0.28514599318022166, + "Extreme": 0.26183100573765283 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5538072480868509, + "T2. Sequencing & Structure Reconstruction": 0.5290554353054353, + "T3. Evidence-Grounded QA": 0.2, + "T4. Summarization & Synthesis": 0.5032056223860172, + "T5. Attribution & Citation Alignment": 0.1333833560986498, + "T6. Aggregation & Clustering": 0.2737365591929987, + "T7. Consistency & Compliance Checking": 0.17502387399050007, + "T8. Structured & Numeric Reasoning": 0.08750000000000001, + "T9. Version & Code Diff Analysis": 0.3068144317903629, + "T10. Rule Induction & In-Context Learning": 0.2625, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666 + }, + "language": { + "Chinese": 0.23247692481372026, + "English": 0.3170948208037568 + } + }, + "pass@3": 0.078 +} \ No newline at end of file diff --git a/results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..8f58089e21405a416305ba97c005165e2257a2d9 --- /dev/null +++ b/results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.20055372622856252, + "inference_iteration_1_overall_metric": 0.20930536348826667, + "inference_iteration_2_overall_metric": 0.19814963328771615, + "inference_iteration_3_overall_metric": 0.19420618190970523, + "average_token_length_metric": { + "8k": 0.25813666494087695, + "16k": 0.2584728735658432, + "32k": 0.22849707778354275, + "64k": 0.18730191383793596, + "128k": 0.12903377597359986, + "256k": 0.14188005126957745 + }, + "average_contextual_requirement_metric": { + "Full": 0.1871659292157323, + "Partial": 0.21759274060852876 + }, + "average_difficulty_metric": { + "Easy": 0.2628031197661334, + "Moderate": 0.12316795604778738, + "Hard": 0.17987117557385604, + "Extreme": 0.19677540131116525 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.4057709287198289, + "T2. Sequencing & Structure Reconstruction": 0.34219177348181007, + "T3. Evidence-Grounded QA": 0.12222222222222225, + "T4. Summarization & Synthesis": 0.4554008546339488, + "T5. Attribution & Citation Alignment": 0.09150573083223558, + "T6. Aggregation & Clustering": 0.2205869310535933, + "T7. Consistency & Compliance Checking": 0.10384155329285091, + "T8. Structured & Numeric Reasoning": 0.10123456790123456, + "T9. Version & Code Diff Analysis": 0.17017808218806194, + "T10. Rule Induction & In-Context Learning": 0.1756018518518519, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.10555555555555557 + }, + "average_language_metric": { + "Chinese": 0.14712355199329824, + "English": 0.2539839004638273 + }, + "BoN-1": { + "overall_metric": 0.20930536348826667, + "token_length": { + "8k": 0.2793229214860049, + "16k": 0.271683549702249, + "32k": 0.23348902534053415, + "64k": 0.19344554869301367, + "128k": 0.12028061678065591, + "256k": 0.15761051892714206 + }, + "contextual_requirement": { + "Full": 0.18602878507502096, + "Partial": 0.2389300996505795 + }, + "difficulty": { + "Easy": 0.28424670669624624, + "Moderate": 0.1253791912220988, + "Hard": 0.17541923812572474, + "Extreme": 0.20459178755292767 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.4385523704155171, + "T2. Sequencing & Structure Reconstruction": 0.33931459481923876, + "T3. Evidence-Grounded QA": 0.11666666666666667, + "T4. Summarization & Synthesis": 0.45953789724020144, + "T5. Attribution & Citation Alignment": 0.0722496761788766, + "T6. Aggregation & Clustering": 0.21044140806468803, + "T7. Consistency & Compliance Checking": 0.1283895768466609, + "T8. Structured & Numeric Reasoning": 0.1125, + "T9. Version & Code Diff Analysis": 0.2038326942491424, + "T10. Rule Induction & In-Context Learning": 0.19249999999999998, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.11666666666666667 + }, + "language": { + "Chinese": 0.15245924739914177, + "English": 0.26615147957739155 + } + }, + "pass@1": 0.06, + "BoN-2": { + "overall_metric": 0.2668017232451993, + "token_length": { + "8k": 0.3461725655993255, + "16k": 0.34802165706779187, + "32k": 0.31037609052110204, + "64k": 0.2389649239073623, + "128k": 0.1748998439051164, + "256k": 0.18237525847049635 + }, + "contextual_requirement": { + "Full": 0.2499435538255567, + "Partial": 0.28825757523383516 + }, + "difficulty": { + "Easy": 0.3588173402056394, + "Moderate": 0.16793760263800153, + "Hard": 0.24236535579298768, + "Extreme": 0.246988799777556 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5197176048343898, + "T2. Sequencing & Structure Reconstruction": 0.48746135503643234, + "T3. Evidence-Grounded QA": 0.16666666666666666, + "T4. Summarization & Synthesis": 0.4794299302901568, + "T5. Attribution & Citation Alignment": 0.15065800813584843, + "T6. Aggregation & Clustering": 0.3052547184784026, + "T7. Consistency & Compliance Checking": 0.15302354691124467, + "T8. Structured & Numeric Reasoning": 0.14444444444444443, + "T9. Version & Code Diff Analysis": 0.2507539108503574, + "T10. Rule Induction & In-Context Learning": 0.23458333333333334, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666 + }, + "language": { + "Chinese": 0.19465748810477185, + "English": 0.33894595838562674 + } + }, + "pass@2": 0.082, + "BoN-3": { + "overall_metric": 0.29957877493017654, + "token_length": { + "8k": 0.3888483252966101, + "16k": 0.39060272417227027, + "32k": 0.34430797611932984, + "64k": 0.2620211212858359, + "128k": 0.19544431047747687, + "256k": 0.21624819222953548 + }, + "contextual_requirement": { + "Full": 0.2794795135612338, + "Partial": 0.3251596530361036 + }, + "difficulty": { + "Easy": 0.4075586546242195, + "Moderate": 0.19423764623204026, + "Hard": 0.27460228265341846, + "Extreme": 0.26688670776930307 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5783680297471054, + "T2. Sequencing & Structure Reconstruction": 0.5321006964107361, + "T3. Evidence-Grounded QA": 0.20833333333333334, + "T4. Summarization & Synthesis": 0.4907425557642246, + "T5. Attribution & Citation Alignment": 0.1724297780962502, + "T6. Aggregation & Clustering": 0.3362168666306595, + "T7. Consistency & Compliance Checking": 0.16743722857143514, + "T8. Structured & Numeric Reasoning": 0.1921296296296296, + "T9. Version & Code Diff Analysis": 0.2933902615835246, + "T10. Rule Induction & In-Context Learning": 0.2673611111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.15833333333333333 + }, + "language": { + "Chinese": 0.22893100795326488, + "English": 0.37022654190708804 + } + }, + "pass@3": 0.09866666666666667 +} \ No newline at end of file diff --git a/results/Llama-3.2-3B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Llama-3.2-3B-Instruct/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..11639dccb9160ccc28868b95572f9c9314dbd63f --- /dev/null +++ b/results/Llama-3.2-3B-Instruct/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.15708345836639478, + "inference_iteration_1_overall_metric": 0.1522499746285183, + "inference_iteration_2_overall_metric": 0.16292071303675215, + "inference_iteration_3_overall_metric": 0.15607968743391326, + "average_token_length_metric": { + "8k": 0.19074751544937305, + "16k": 0.18311448739692587, + "32k": 0.15849934191444578, + "64k": 0.13711682904337855, + "128k": 0.13633492271322553, + "256k": 0.13668765368101904 + }, + "average_contextual_requirement_metric": { + "Full": 0.15215903834853456, + "Partial": 0.16335090202548921 + }, + "average_difficulty_metric": { + "Easy": 0.18487754128195516, + "Moderate": 0.10373568516116806, + "Hard": 0.15009434290092064, + "Extreme": 0.16626666941305895 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.3024747670482544, + "T2. Sequencing & Structure Reconstruction": 0.30977286822552064, + "T3. Evidence-Grounded QA": 0.11388888888888889, + "T4. Summarization & Synthesis": 0.4341408375280332, + "T5. Attribution & Citation Alignment": 0.0344779091920434, + "T6. Aggregation & Clustering": 0.15408667402382106, + "T7. Consistency & Compliance Checking": 0.07795789080091817, + "T8. Structured & Numeric Reasoning": 0.04398148148148148, + "T9. Version & Code Diff Analysis": 0.12905444479341674, + "T10. Rule Induction & In-Context Learning": 0.1479166666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.07777777777777778 + }, + "average_language_metric": { + "Chinese": 0.10513615313151659, + "English": 0.20903076360127287 + }, + "BoN-1": { + "overall_metric": 0.1522499746285183, + "token_length": { + "8k": 0.18381886404580258, + "16k": 0.1763640779318065, + "32k": 0.1530403977981516, + "64k": 0.1336449793468121, + "128k": 0.13665833409911293, + "256k": 0.1299731945494239 + }, + "contextual_requirement": { + "Full": 0.14616662224309246, + "Partial": 0.15999242311906023 + }, + "difficulty": { + "Easy": 0.17269103520136797, + "Moderate": 0.10010586621394478, + "Hard": 0.1522959770717054, + "Extreme": 0.16407670515037534 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.31616711715222934, + "T2. Sequencing & Structure Reconstruction": 0.30997657033436127, + "T3. Evidence-Grounded QA": 0.075, + "T4. Summarization & Synthesis": 0.433290104470351, + "T5. Attribution & Citation Alignment": 0.040719448989792455, + "T6. Aggregation & Clustering": 0.1420105345821374, + "T7. Consistency & Compliance Checking": 0.0818116476937141, + "T8. Structured & Numeric Reasoning": 0.04583333333333333, + "T9. Version & Code Diff Analysis": 0.12251594627374464, + "T10. Rule Induction & In-Context Learning": 0.12597222222222224, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.075 + }, + "language": { + "Chinese": 0.10313580635298143, + "English": 0.2013641429040551 + } + }, + "pass@1": 0.03333333333333333, + "BoN-2": { + "overall_metric": 0.1921634438744948, + "token_length": { + "8k": 0.21983364549684287, + "16k": 0.21955667723795985, + "32k": 0.18995358273392637, + "64k": 0.16240795470998307, + "128k": 0.17591239432067907, + "256k": 0.18531640874757638 + }, + "contextual_requirement": { + "Full": 0.183991826715688, + "Partial": 0.20256368389479382 + }, + "difficulty": { + "Easy": 0.23050124787481288, + "Moderate": 0.13024657792029828, + "Hard": 0.18670800635922485, + "Extreme": 0.19441747608931817 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.3586662331587295, + "T2. Sequencing & Structure Reconstruction": 0.3847515389630534, + "T3. Evidence-Grounded QA": 0.16666666666666666, + "T4. Summarization & Synthesis": 0.4623366120534231, + "T5. Attribution & Citation Alignment": 0.04912393153232278, + "T6. Aggregation & Clustering": 0.19169925535926752, + "T7. Consistency & Compliance Checking": 0.09784140674649551, + "T8. Structured & Numeric Reasoning": 0.06805555555555555, + "T9. Version & Code Diff Analysis": 0.17896485067612064, + "T10. Rule Induction & In-Context Learning": 0.17347222222222225, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666 + }, + "language": { + "Chinese": 0.1340947957448097, + "English": 0.25023209200417945 + } + }, + "pass@2": 0.052, + "BoN-3": { + "overall_metric": 0.2126068902708674, + "token_length": { + "8k": 0.24600389478810633, + "16k": 0.23558970928733117, + "32k": 0.21547322231985894, + "64k": 0.18904442314365716, + "128k": 0.19236619778234643, + "256k": 0.19716389430390446 + }, + "contextual_requirement": { + "Full": 0.20471895219678654, + "Partial": 0.22264608418333381 + }, + "difficulty": { + "Easy": 0.2591916862859481, + "Moderate": 0.1495495342063559, + "Hard": 0.20304294133612527, + "Extreme": 0.20927435723794804 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.3790383678432267, + "T2. Sequencing & Structure Reconstruction": 0.41972832869160254, + "T3. Evidence-Grounded QA": 0.19166666666666668, + "T4. Summarization & Synthesis": 0.47495636922941775, + "T5. Attribution & Citation Alignment": 0.058968622858899346, + "T6. Aggregation & Clustering": 0.22320851430069694, + "T7. Consistency & Compliance Checking": 0.10913808800759871, + "T8. Structured & Numeric Reasoning": 0.08194444444444444, + "T9. Version & Code Diff Analysis": 0.19665231407803055, + "T10. Rule Induction & In-Context Learning": 0.21513888888888888, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.1 + }, + "language": { + "Chinese": 0.1489641004694211, + "English": 0.27624968007231393 + } + }, + "pass@3": 0.059333333333333335 +} \ No newline at end of file diff --git a/results/Llama-3.2-3B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Llama-3.2-3B-Instruct/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..55789f7fa33a798169bfbac5733132f51548c16e --- /dev/null +++ b/results/Llama-3.2-3B-Instruct/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.12579532534277046, + "inference_iteration_1_overall_metric": 0.12189144574406009, + "inference_iteration_2_overall_metric": 0.1293014670146224, + "inference_iteration_3_overall_metric": 0.12619306326962915, + "average_token_length_metric": { + "8k": 0.15520842809468816, + "16k": 0.1484137730398096, + "32k": 0.13566213672791996, + "64k": 0.10339908100452032, + "128k": 0.11523413736015264, + "256k": 0.09685439582953186 + }, + "average_contextual_requirement_metric": { + "Full": 0.11497129858976732, + "Partial": 0.13957135939204723 + }, + "average_difficulty_metric": { + "Easy": 0.14352977294197386, + "Moderate": 0.07165328990150331, + "Hard": 0.10476288088788434, + "Extreme": 0.15574400931361643 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.18027953901797078, + "T2. Sequencing & Structure Reconstruction": 0.2032463362205395, + "T3. Evidence-Grounded QA": 0.11944444444444445, + "T4. Summarization & Synthesis": 0.42455853122133613, + "T5. Attribution & Citation Alignment": 0.03812352880578544, + "T6. Aggregation & Clustering": 0.12499067187507265, + "T7. Consistency & Compliance Checking": 0.06067661860338902, + "T8. Structured & Numeric Reasoning": 0.029629629629629627, + "T9. Version & Code Diff Analysis": 0.09370491802352847, + "T10. Rule Induction & In-Context Learning": 0.09847222222222225, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666 + }, + "average_language_metric": { + "Chinese": 0.08710157372538106, + "English": 0.16448907696016007 + }, + "BoN-1": { + "overall_metric": 0.12189144574406009, + "token_length": { + "8k": 0.14926024363329887, + "16k": 0.14140607600695068, + "32k": 0.12814754734467546, + "64k": 0.09111044280283606, + "128k": 0.1221228193452808, + "256k": 0.09930154533131832 + }, + "contextual_requirement": { + "Full": 0.11202506506940242, + "Partial": 0.13444865751180604 + }, + "difficulty": { + "Easy": 0.14039643942766825, + "Moderate": 0.056507474186827354, + "Hard": 0.10349493747731114, + "Extreme": 0.15664860872958616 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.16389165825733903, + "T2. Sequencing & Structure Reconstruction": 0.1787449973585681, + "T3. Evidence-Grounded QA": 0.09166666666666666, + "T4. Summarization & Synthesis": 0.42251209889763974, + "T5. Attribution & Citation Alignment": 0.029073247426826414, + "T6. Aggregation & Clustering": 0.13197902480468204, + "T7. Consistency & Compliance Checking": 0.06122238755841022, + "T8. Structured & Numeric Reasoning": 0.044444444444444446, + "T9. Version & Code Diff Analysis": 0.08781339576018317, + "T10. Rule Induction & In-Context Learning": 0.10180555555555555, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666 + }, + "language": { + "Chinese": 0.07947872192042414, + "English": 0.16430416956769595 + } + }, + "pass@1": 0.03333333333333333, + "BoN-2": { + "overall_metric": 0.16673174742075972, + "token_length": { + "8k": 0.21235596929835865, + "16k": 0.19139324741892721, + "32k": 0.17847356237746767, + "64k": 0.1323473405869264, + "128k": 0.15654182190882684, + "256k": 0.12927854293405058 + }, + "contextual_requirement": { + "Full": 0.15641485621192497, + "Partial": 0.1798623362320038 + }, + "difficulty": { + "Easy": 0.20556729410789223, + "Moderate": 0.09843476269293053, + "Hard": 0.13991984386404396, + "Extreme": 0.1866584183549311 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.23716632921131517, + "T2. Sequencing & Structure Reconstruction": 0.283700678823595, + "T3. Evidence-Grounded QA": 0.18333333333333332, + "T4. Summarization & Synthesis": 0.44817949561782316, + "T5. Attribution & Citation Alignment": 0.07044117047262258, + "T6. Aggregation & Clustering": 0.1774625349060259, + "T7. Consistency & Compliance Checking": 0.0801161795021524, + "T8. Structured & Numeric Reasoning": 0.05, + "T9. Version & Code Diff Analysis": 0.14648554146631595, + "T10. Rule Induction & In-Context Learning": 0.14513888888888887, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.10833333333333334 + }, + "language": { + "Chinese": 0.11086445603583714, + "English": 0.22259903880568205 + } + }, + "pass@2": 0.048, + "BoN-3": { + "overall_metric": 0.19563257554771482, + "token_length": { + "8k": 0.2530151029327433, + "16k": 0.2145335526961687, + "32k": 0.21507949798738746, + "64k": 0.15928462128906565, + "128k": 0.18536894288047534, + "256k": 0.14651373550044794 + }, + "contextual_requirement": { + "Full": 0.18678441171314086, + "Partial": 0.20689387497353642 + }, + "difficulty": { + "Easy": 0.24756743178790308, + "Moderate": 0.1259095390094924, + "Hard": 0.1628721973860393, + "Extreme": 0.20605327132157794 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.3006781841143776, + "T2. Sequencing & Structure Reconstruction": 0.34291202176503843, + "T3. Evidence-Grounded QA": 0.20833333333333334, + "T4. Summarization & Synthesis": 0.46018159312540347, + "T5. Attribution & Citation Alignment": 0.0815602367002352, + "T6. Aggregation & Clustering": 0.20248617019109905, + "T7. Consistency & Compliance Checking": 0.08636604532829192, + "T8. Structured & Numeric Reasoning": 0.07777777777777778, + "T9. Version & Code Diff Analysis": 0.17054683536229345, + "T10. Rule Induction & In-Context Learning": 0.18958333333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666 + }, + "language": { + "Chinese": 0.13358536850999, + "English": 0.25767978258543983 + } + }, + "pass@3": 0.06133333333333333 +} \ No newline at end of file diff --git a/results/Llama-3.3-70B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Llama-3.3-70B-Instruct/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..62ef2beeaf886aba533300cd870538dbf5265a5f --- /dev/null +++ b/results/Llama-3.3-70B-Instruct/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3189017909858673, + "inference_iteration_1_overall_metric": 0.3156125985423413, + "inference_iteration_2_overall_metric": 0.32123074093436993, + "inference_iteration_3_overall_metric": 0.31986203348089143, + "average_token_length_metric": { + "8k": 0.4593043661575621, + "16k": 0.4357005279195819, + "32k": 0.4042163423898818, + "64k": 0.33700892278371447, + "128k": 0.1425979596199625, + "256k": 0.13458262704450225 + }, + "average_contextual_requirement_metric": { + "Full": 0.291318303255093, + "Partial": 0.35400804809776254 + }, + "average_difficulty_metric": { + "Easy": 0.440396417724264, + "Moderate": 0.22606271262921054, + "Hard": 0.29071041097108385, + "Extreme": 0.2653049554891388 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.5698993177933688, + "T2. Sequencing & Structure Reconstruction": 0.5207791047484032, + "T3. Evidence-Grounded QA": 0.33055555555555555, + "T4. Summarization & Synthesis": 0.4889794661436827, + "T5. Attribution & Citation Alignment": 0.24622344831110268, + "T6. Aggregation & Clustering": 0.31517757482240366, + "T7. Consistency & Compliance Checking": 0.19554447151545906, + "T8. Structured & Numeric Reasoning": 0.11126543209876544, + "T9. Version & Code Diff Analysis": 0.3449561289681397, + "T10. Rule Induction & In-Context Learning": 0.3185648148148148, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334 + }, + "average_language_metric": { + "Chinese": 0.28656733073968105, + "English": 0.35123625123205404 + }, + "BoN-1": { + "overall_metric": 0.3156125985423413, + "token_length": { + "8k": 0.4598857003901339, + "16k": 0.43280589089049554, + "32k": 0.4031552748630893, + "64k": 0.32858688094254546, + "128k": 0.13499648051535806, + "256k": 0.1342453636524253 + }, + "contextual_requirement": { + "Full": 0.28665010278500924, + "Partial": 0.3524739567789461 + }, + "difficulty": { + "Easy": 0.4365832341037067, + "Moderate": 0.2165303315022285, + "Hard": 0.2928446211124342, + "Extreme": 0.26312822197701774 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5433299618590155, + "T2. Sequencing & Structure Reconstruction": 0.526070226070226, + "T3. Evidence-Grounded QA": 0.325, + "T4. Summarization & Synthesis": 0.49127697260080533, + "T5. Attribution & Citation Alignment": 0.24102740431192388, + "T6. Aggregation & Clustering": 0.31258403804363555, + "T7. Consistency & Compliance Checking": 0.20132235403207502, + "T8. Structured & Numeric Reasoning": 0.11481481481481483, + "T9. Version & Code Diff Analysis": 0.34245443993484, + "T10. Rule Induction & In-Context Learning": 0.30791666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.225 + }, + "language": { + "Chinese": 0.28075822912737913, + "English": 0.3504669679573037 + } + }, + "pass@1": 0.11266666666666666, + "BoN-2": { + "overall_metric": 0.34649569529069124, + "token_length": { + "8k": 0.47647116314268717, + "16k": 0.4687523820747641, + "32k": 0.4305852114495849, + "64k": 0.3618707734054544, + "128k": 0.17753523030724935, + "256k": 0.16375941136440833 + }, + "contextual_requirement": { + "Full": 0.3168861362630559, + "Partial": 0.3841805885985913 + }, + "difficulty": { + "Easy": 0.4708835305116446, + "Moderate": 0.2536880874829281, + "Hard": 0.31598267575873745, + "Extreme": 0.29123370602859766 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.613795052914502, + "T2. Sequencing & Structure Reconstruction": 0.572449494949495, + "T3. Evidence-Grounded QA": 0.35833333333333334, + "T4. Summarization & Synthesis": 0.5065243852070945, + "T5. Attribution & Citation Alignment": 0.26010872017700837, + "T6. Aggregation & Clustering": 0.3543572676296878, + "T7. Consistency & Compliance Checking": 0.2180957885923682, + "T8. Structured & Numeric Reasoning": 0.1287037037037037, + "T9. Version & Code Diff Analysis": 0.3871389535524576, + "T10. Rule Induction & In-Context Learning": 0.3311111111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25 + }, + "language": { + "Chinese": 0.3170227577667401, + "English": 0.3759686328146428 + } + }, + "pass@2": 0.126, + "BoN-3": { + "overall_metric": 0.358435386899099, + "token_length": { + "8k": 0.48710556132647576, + "16k": 0.4795308481526823, + "32k": 0.4449722178774269, + "64k": 0.37149023172449164, + "128k": 0.19457817433016786, + "256k": 0.17293528798335014 + }, + "contextual_requirement": { + "Full": 0.33005566769629513, + "Partial": 0.39455502952084975 + }, + "difficulty": { + "Easy": 0.48736321739782795, + "Moderate": 0.2624427701465103, + "Hard": 0.3272030727162918, + "Extreme": 0.30076445676260405 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6366814621432787, + "T2. Sequencing & Structure Reconstruction": 0.5821049783549782, + "T3. Evidence-Grounded QA": 0.36666666666666664, + "T4. Summarization & Synthesis": 0.5152102198399457, + "T5. Attribution & Citation Alignment": 0.27428748176856066, + "T6. Aggregation & Clustering": 0.36977067074988085, + "T7. Consistency & Compliance Checking": 0.22440889882489728, + "T8. Structured & Numeric Reasoning": 0.1300925925925926, + "T9. Version & Code Diff Analysis": 0.39630550643647333, + "T10. Rule Induction & In-Context Learning": 0.37277777777777776, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25 + }, + "language": { + "Chinese": 0.33234090111179226, + "English": 0.3845298726864061 + } + }, + "pass@3": 0.13466666666666666 +} \ No newline at end of file diff --git a/results/Llama-3.3-70B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Llama-3.3-70B-Instruct/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..4d5b8a496fc89154ed9b99552066db4e6375dc58 --- /dev/null +++ b/results/Llama-3.3-70B-Instruct/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3368788983987977, + "inference_iteration_1_overall_metric": 0.3346445205602255, + "inference_iteration_2_overall_metric": 0.34105124981338825, + "inference_iteration_3_overall_metric": 0.3349409248227798, + "average_token_length_metric": { + "8k": 0.48257436937624887, + "16k": 0.4570891611420083, + "32k": 0.43164967032208246, + "64k": 0.37974005621997625, + "128k": 0.14494662029982153, + "256k": 0.1252735130326494 + }, + "average_contextual_requirement_metric": { + "Full": 0.3148605467957358, + "Partial": 0.3649022549845135 + }, + "average_difficulty_metric": { + "Easy": 0.5193942390521589, + "Moderate": 0.22606077851307846, + "Hard": 0.2859477654450278, + "Extreme": 0.2431814890253722 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.5716856431399633, + "T2. Sequencing & Structure Reconstruction": 0.5180708023900146, + "T3. Evidence-Grounded QA": 0.3222222222222222, + "T4. Summarization & Synthesis": 0.4675755720039648, + "T5. Attribution & Citation Alignment": 0.2472281333613823, + "T6. Aggregation & Clustering": 0.36222233277682875, + "T7. Consistency & Compliance Checking": 0.16955304749357702, + "T8. Structured & Numeric Reasoning": 0.2820987654320987, + "T9. Version & Code Diff Analysis": 0.34705004572107623, + "T10. Rule Induction & In-Context Learning": 0.31356481481481485, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2027777777777778 + }, + "average_language_metric": { + "Chinese": 0.28230462501190684, + "English": 0.39145317178568884 + }, + "BoN-1": { + "overall_metric": 0.3346445205602255, + "token_length": { + "8k": 0.4542953722813034, + "16k": 0.44790102117035924, + "32k": 0.4589206338089406, + "64k": 0.3903508423882444, + "128k": 0.1390327640597696, + "256k": 0.11736648965273745 + }, + "contextual_requirement": { + "Full": 0.3229561312874356, + "Partial": 0.34952065236195895 + }, + "difficulty": { + "Easy": 0.5259881421298087, + "Moderate": 0.21937379810223762, + "Hard": 0.272924266331999, + "Extreme": 0.24128717207335554 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5894507286480519, + "T2. Sequencing & Structure Reconstruction": 0.49666058774955946, + "T3. Evidence-Grounded QA": 0.31666666666666665, + "T4. Summarization & Synthesis": 0.4602747124060712, + "T5. Attribution & Citation Alignment": 0.2717574105274334, + "T6. Aggregation & Clustering": 0.34752011368589375, + "T7. Consistency & Compliance Checking": 0.15669004787219404, + "T8. Structured & Numeric Reasoning": 0.3101851851851852, + "T9. Version & Code Diff Analysis": 0.3494311586679074, + "T10. Rule Induction & In-Context Learning": 0.2688888888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334 + }, + "language": { + "Chinese": 0.28273488892882487, + "English": 0.3865541521916267 + } + }, + "pass@1": 0.14066666666666666, + "BoN-2": { + "overall_metric": 0.3958348563743987, + "token_length": { + "8k": 0.5375237165063974, + "16k": 0.5279796876480686, + "32k": 0.5128361604334021, + "64k": 0.44897938061236586, + "128k": 0.18481857856663617, + "256k": 0.16287161447952125 + }, + "contextual_requirement": { + "Full": 0.37597657573399246, + "Partial": 0.4211090317349159 + }, + "difficulty": { + "Easy": 0.6024719774255488, + "Moderate": 0.2783382985783699, + "Hard": 0.3365596754926361, + "Extreme": 0.28558017485446086 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6560921957788185, + "T2. Sequencing & Structure Reconstruction": 0.590698368130755, + "T3. Evidence-Grounded QA": 0.38333333333333336, + "T4. Summarization & Synthesis": 0.4863977172263582, + "T5. Attribution & Citation Alignment": 0.29159793863898964, + "T6. Aggregation & Clustering": 0.424325611415665, + "T7. Consistency & Compliance Checking": 0.20595551459420022, + "T8. Structured & Numeric Reasoning": 0.3527777777777778, + "T9. Version & Code Diff Analysis": 0.3998111292235962, + "T10. Rule Induction & In-Context Learning": 0.42375, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667 + }, + "language": { + "Chinese": 0.3369511703916512, + "English": 0.4547185423571466 + } + }, + "pass@2": 0.18533333333333332, + "BoN-3": { + "overall_metric": 0.4231932102533526, + "token_length": { + "8k": 0.5825111026223746, + "16k": 0.5470777230462098, + "32k": 0.5441468137547846, + "64k": 0.476981081390983, + "128k": 0.19851254776601635, + "256k": 0.189929992939747 + }, + "contextual_requirement": { + "Full": 0.4008399164475989, + "Partial": 0.4516428569152216 + }, + "difficulty": { + "Easy": 0.6383633989735824, + "Moderate": 0.29622937081220263, + "Hard": 0.3631324161592812, + "Extreme": 0.31032522872728074 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6823167917941978, + "T2. Sequencing & Structure Reconstruction": 0.6268177888335086, + "T3. Evidence-Grounded QA": 0.38333333333333336, + "T4. Summarization & Synthesis": 0.49909323423974894, + "T5. Attribution & Citation Alignment": 0.30308485751721276, + "T6. Aggregation & Clustering": 0.4702913178424183, + "T7. Consistency & Compliance Checking": 0.22495072081956943, + "T8. Structured & Numeric Reasoning": 0.38055555555555554, + "T9. Version & Code Diff Analysis": 0.4307393977892574, + "T10. Rule Induction & In-Context Learning": 0.47583333333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.275 + }, + "language": { + "Chinese": 0.36349427675262524, + "English": 0.48289214375408057 + } + }, + "pass@3": 0.20866666666666667 +} \ No newline at end of file diff --git a/results/Magistral-Small-2509/thinking_context-120000_bon-3_summary.json b/results/Magistral-Small-2509/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..e75935e36612f86d6bb7a6e93be99e5989405b6f --- /dev/null +++ b/results/Magistral-Small-2509/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.38398116897357343, + "inference_iteration_1_overall_metric": 0.3775125226933459, + "inference_iteration_2_overall_metric": 0.3891788972494985, + "inference_iteration_3_overall_metric": 0.38525208697787594, + "average_token_length_metric": { + "8k": 0.5449912961150511, + "16k": 0.47444243155128724, + "32k": 0.4264163670766384, + "64k": 0.3029575907045873, + "128k": 0.302750216644183, + "256k": 0.2523291117496938 + }, + "average_contextual_requirement_metric": { + "Full": 0.3475865533347475, + "Partial": 0.4303015888775337 + }, + "average_difficulty_metric": { + "Easy": 0.5425168562606535, + "Moderate": 0.2943739375717643, + "Hard": 0.3291735952793175, + "Extreme": 0.30516679942900543 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6519835369865774, + "T2. Sequencing & Structure Reconstruction": 0.6259667241346109, + "T3. Evidence-Grounded QA": 0.438888888888889, + "T4. Summarization & Synthesis": 0.5181402784690045, + "T5. Attribution & Citation Alignment": 0.2964646210831104, + "T6. Aggregation & Clustering": 0.3430731095683128, + "T7. Consistency & Compliance Checking": 0.1940507194662843, + "T8. Structured & Numeric Reasoning": 0.23070987654320987, + "T9. Version & Code Diff Analysis": 0.40948667090743257, + "T10. Rule Induction & In-Context Learning": 0.4654166666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2416666666666666 + }, + "average_language_metric": { + "Chinese": 0.3639911930284364, + "English": 0.40397114491871056 + }, + "BoN-1": { + "overall_metric": 0.3775125226933459, + "token_length": { + "8k": 0.5435818039060978, + "16k": 0.4750140116790471, + "32k": 0.40129268121677775, + "64k": 0.28670320465634697, + "128k": 0.3074867892397562, + "256k": 0.25099664546205136 + }, + "contextual_requirement": { + "Full": 0.33866755109330426, + "Partial": 0.42695157745703605 + }, + "difficulty": { + "Easy": 0.5415570936201182, + "Moderate": 0.2821803430686346, + "Hard": 0.32058466511584804, + "Extreme": 0.29781631261319746 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6532513121925126, + "T2. Sequencing & Structure Reconstruction": 0.6111094803277807, + "T3. Evidence-Grounded QA": 0.425, + "T4. Summarization & Synthesis": 0.5176378460686947, + "T5. Attribution & Citation Alignment": 0.2913492653600891, + "T6. Aggregation & Clustering": 0.33400917389507784, + "T7. Consistency & Compliance Checking": 0.1887200677483396, + "T8. Structured & Numeric Reasoning": 0.22129629629629627, + "T9. Version & Code Diff Analysis": 0.41785365614151276, + "T10. Rule Induction & In-Context Learning": 0.445, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667 + }, + "language": { + "Chinese": 0.34935070830064097, + "English": 0.4056743370860514 + } + }, + "pass@1": 0.15666666666666668, + "BoN-2": { + "overall_metric": 0.4410818050309274, + "token_length": { + "8k": 0.6043547908072043, + "16k": 0.5353872646463821, + "32k": 0.50503501278225, + "64k": 0.35415051008140713, + "128k": 0.3590897028976117, + "256k": 0.28847354897071054 + }, + "contextual_requirement": { + "Full": 0.4052447124894285, + "Partial": 0.48669265008374507 + }, + "difficulty": { + "Easy": 0.6224120794085322, + "Moderate": 0.35760055550726977, + "Hard": 0.3739866973836827, + "Extreme": 0.3413440208772583 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7183527826830406, + "T2. Sequencing & Structure Reconstruction": 0.6865933482781307, + "T3. Evidence-Grounded QA": 0.5333333333333333, + "T4. Summarization & Synthesis": 0.5317396047677334, + "T5. Attribution & Citation Alignment": 0.34712734561624925, + "T6. Aggregation & Clustering": 0.4084056110628732, + "T7. Consistency & Compliance Checking": 0.2396263809618489, + "T8. Structured & Numeric Reasoning": 0.28148148148148144, + "T9. Version & Code Diff Analysis": 0.4526614935043571, + "T10. Rule Induction & In-Context Learning": 0.5494444444444445, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3 + }, + "language": { + "Chinese": 0.4165337844683723, + "English": 0.4656298255934835 + } + }, + "pass@2": 0.202, + "BoN-3": { + "overall_metric": 0.47447373045805247, + "token_length": { + "8k": 0.6472581172783606, + "16k": 0.572013026736313, + "32k": 0.5323134358536711, + "64k": 0.38848449233451177, + "128k": 0.38802278615487296, + "256k": 0.31875052439058804 + }, + "contextual_requirement": { + "Full": 0.43884732486807027, + "Partial": 0.5198164284816681 + }, + "difficulty": { + "Easy": 0.6584366917924486, + "Moderate": 0.38635877266139323, + "Hard": 0.40375289537189885, + "Extreme": 0.3772769049579495 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7364126543991428, + "T2. Sequencing & Structure Reconstruction": 0.718547042731825, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5424760798885467, + "T5. Attribution & Citation Alignment": 0.3887278739210803, + "T6. Aggregation & Clustering": 0.45972240993615016, + "T7. Consistency & Compliance Checking": 0.258469847158553, + "T8. Structured & Numeric Reasoning": 0.32592592592592595, + "T9. Version & Code Diff Analysis": 0.48746959414301055, + "T10. Rule Induction & In-Context Learning": 0.5911111111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333 + }, + "language": { + "Chinese": 0.45726250895870557, + "English": 0.4916849519574011 + } + }, + "pass@3": 0.22866666666666666 +} \ No newline at end of file diff --git a/results/MiniMax-M2/thinking_context-1000000_bon-3_summary.json b/results/MiniMax-M2/thinking_context-1000000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..3c813b1a43cd4c125b6b414abea6d24710902e82 --- /dev/null +++ b/results/MiniMax-M2/thinking_context-1000000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5320685707653132, + "inference_iteration_1_overall_metric": 0.535180398833494, + "inference_iteration_2_overall_metric": 0.5311849506804371, + "inference_iteration_3_overall_metric": 0.5298403627820072, + "average_token_length_metric": { + "8k": 0.654795970947119, + "16k": 0.5832041701523042, + "32k": 0.5830505446766833, + "64k": 0.5201561955794758, + "128k": 0.5060838591020447, + "256k": 0.3451206841342513 + }, + "average_contextual_requirement_metric": { + "Full": 0.4938467607068266, + "Partial": 0.5807145108397509 + }, + "average_difficulty_metric": { + "Easy": 0.7219874781362817, + "Moderate": 0.599199335465557, + "Hard": 0.4257653962693645, + "Extreme": 0.34975019139747615 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.767571983047427, + "T2. Sequencing & Structure Reconstruction": 0.7186696477094124, + "T3. Evidence-Grounded QA": 0.4972222222222222, + "T4. Summarization & Synthesis": 0.4696599254603241, + "T5. Attribution & Citation Alignment": 0.54344042963745, + "T6. Aggregation & Clustering": 0.5123089198769455, + "T7. Consistency & Compliance Checking": 0.31381086481875964, + "T8. Structured & Numeric Reasoning": 0.6038580246913581, + "T9. Version & Code Diff Analysis": 0.5619188050015754, + "T10. Rule Induction & In-Context Learning": 0.5529629629629632, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.39444444444444454 + }, + "average_language_metric": { + "Chinese": 0.5354797348772966, + "English": 0.5286574066533296 + }, + "BoN-1": { + "overall_metric": 0.535180398833494, + "token_length": { + "8k": 0.6571274187960493, + "16k": 0.5855452098864022, + "32k": 0.6094638772285274, + "64k": 0.5094373867375244, + "128k": 0.5028727484199556, + "256k": 0.34663575193250185 + }, + "contextual_requirement": { + "Full": 0.4983496461876354, + "Partial": 0.5820559022009494 + }, + "difficulty": { + "Easy": 0.7348029222520711, + "Moderate": 0.6076522249303262, + "Hard": 0.4165082385065274, + "Extreme": 0.3468482177079349 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7483964432006224, + "T2. Sequencing & Structure Reconstruction": 0.6994320017261199, + "T3. Evidence-Grounded QA": 0.5, + "T4. Summarization & Synthesis": 0.46659438196842223, + "T5. Attribution & Citation Alignment": 0.5466093432829364, + "T6. Aggregation & Clustering": 0.5244645023077399, + "T7. Consistency & Compliance Checking": 0.32026132009110975, + "T8. Structured & Numeric Reasoning": 0.6097222222222223, + "T9. Version & Code Diff Analysis": 0.5581618594200692, + "T10. Rule Induction & In-Context Learning": 0.5638888888888888, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.425 + }, + "language": { + "Chinese": 0.5363273796225, + "English": 0.5340334180444871 + } + }, + "pass@1": 0.30133333333333334, + "BoN-2": { + "overall_metric": 0.631079137318095, + "token_length": { + "8k": 0.7607802022833232, + "16k": 0.6922523863155777, + "32k": 0.696226877019834, + "64k": 0.6061494664574102, + "128k": 0.5803070617163435, + "256k": 0.45075883011608214 + }, + "contextual_requirement": { + "Full": 0.5863061857618213, + "Partial": 0.6880628938442622 + }, + "difficulty": { + "Easy": 0.8429412379045521, + "Moderate": 0.7260325873797594, + "Hard": 0.5037591992235261, + "Extreme": 0.42025273404272534 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.854967268320159, + "T2. Sequencing & Structure Reconstruction": 0.7950413527388575, + "T3. Evidence-Grounded QA": 0.6333333333333333, + "T4. Summarization & Synthesis": 0.48790929716965253, + "T5. Attribution & Citation Alignment": 0.6628232709674524, + "T6. Aggregation & Clustering": 0.6070531962911038, + "T7. Consistency & Compliance Checking": 0.4273910542891768, + "T8. Structured & Numeric Reasoning": 0.6976851851851852, + "T9. Version & Code Diff Analysis": 0.6589983180763119, + "T10. Rule Induction & In-Context Learning": 0.663888888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333 + }, + "language": { + "Chinese": 0.6243713115466586, + "English": 0.6377869630895318 + } + }, + "pass@2": 0.38133333333333336, + "BoN-3": { + "overall_metric": 0.6838483190204042, + "token_length": { + "8k": 0.8056053063071357, + "16k": 0.754121676530954, + "32k": 0.7309373525434467, + "64k": 0.6802921620132278, + "128k": 0.6428076963616377, + "256k": 0.48932572036602695 + }, + "contextual_requirement": { + "Full": 0.6418563038987366, + "Partial": 0.7372927019025284 + }, + "difficulty": { + "Easy": 0.8932565923719491, + "Moderate": 0.7958991372015439, + "Hard": 0.5571346872299623, + "Extreme": 0.46408187669686923 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8856810957992655, + "T2. Sequencing & Structure Reconstruction": 0.8257039309014353, + "T3. Evidence-Grounded QA": 0.7083333333333334, + "T4. Summarization & Synthesis": 0.5020042367803014, + "T5. Attribution & Citation Alignment": 0.729994986816228, + "T6. Aggregation & Clustering": 0.6651081533166491, + "T7. Consistency & Compliance Checking": 0.49051401515979876, + "T8. Structured & Numeric Reasoning": 0.7680555555555556, + "T9. Version & Code Diff Analysis": 0.7036475958542688, + "T10. Rule Induction & In-Context Learning": 0.7072222222222223, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6 + }, + "language": { + "Chinese": 0.6727473750313149, + "English": 0.694949263009495 + } + }, + "pass@3": 0.444 +} \ No newline at end of file diff --git a/results/MiniMax-Text-01/nonthinking_context-1000000_bon-3_summary.json b/results/MiniMax-Text-01/nonthinking_context-1000000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f73a7335187aa418a6d75eeed9e1ea7dd3d2f56e --- /dev/null +++ b/results/MiniMax-Text-01/nonthinking_context-1000000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.4113523778378889, + "inference_iteration_1_overall_metric": 0.4026546189679395, + "inference_iteration_2_overall_metric": 0.41422198000018023, + "inference_iteration_3_overall_metric": 0.41718053454554826, + "average_token_length_metric": { + "8k": 0.45750122785552744, + "16k": 0.40648581074103435, + "32k": 0.41953181726499883, + "64k": 0.3963813527019971, + "128k": 0.41323756281622565, + "256k": 0.3749764956475515 + }, + "average_contextual_requirement_metric": { + "Full": 0.37732447212646125, + "Partial": 0.45466062147061553 + }, + "average_difficulty_metric": { + "Easy": 0.5125950929989945, + "Moderate": 0.38228847113922254, + "Hard": 0.3867421547849868, + "Extreme": 0.33569972963459577 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7019344870456294, + "T2. Sequencing & Structure Reconstruction": 0.6935557265385518, + "T3. Evidence-Grounded QA": 0.5000000000000001, + "T4. Summarization & Synthesis": 0.525289467915154, + "T5. Attribution & Citation Alignment": 0.40960389859884994, + "T6. Aggregation & Clustering": 0.3855189408594916, + "T7. Consistency & Compliance Checking": 0.2570183735053335, + "T8. Structured & Numeric Reasoning": 0.16126543209876543, + "T9. Version & Code Diff Analysis": 0.3763262824393013, + "T10. Rule Induction & In-Context Learning": 0.3850462962962962, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.34444444444444444 + }, + "average_language_metric": { + "Chinese": 0.4206201869029405, + "English": 0.40208456877283766 + }, + "BoN-1": { + "overall_metric": 0.4026546189679395, + "token_length": { + "8k": 0.4484815946744958, + "16k": 0.40023341947584756, + "32k": 0.39365195091822286, + "64k": 0.4050265329266902, + "128k": 0.40626760527764794, + "256k": 0.3622666105347326 + }, + "contextual_requirement": { + "Full": 0.3618050193216267, + "Partial": 0.45464501851779227 + }, + "difficulty": { + "Easy": 0.49189201078601713, + "Moderate": 0.3847984308236515, + "Hard": 0.39113612973801154, + "Extreme": 0.32419293466074633 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7071317552591362, + "T2. Sequencing & Structure Reconstruction": 0.692217342415818, + "T3. Evidence-Grounded QA": 0.475, + "T4. Summarization & Synthesis": 0.5252872492452957, + "T5. Attribution & Citation Alignment": 0.3965042482839467, + "T6. Aggregation & Clustering": 0.38900319686695384, + "T7. Consistency & Compliance Checking": 0.24881818692821855, + "T8. Structured & Numeric Reasoning": 0.1462962962962963, + "T9. Version & Code Diff Analysis": 0.35572673286895423, + "T10. Rule Induction & In-Context Learning": 0.36347222222222214, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667 + }, + "language": { + "Chinese": 0.4101250015901318, + "English": 0.3951842363457473 + } + }, + "pass@1": 0.15533333333333332, + "BoN-2": { + "overall_metric": 0.4807038949944852, + "token_length": { + "8k": 0.5303281501019884, + "16k": 0.4819497908714715, + "32k": 0.47954691765928337, + "64k": 0.48083012165065453, + "128k": 0.465293133114307, + "256k": 0.44627525656921 + }, + "contextual_requirement": { + "Full": 0.44656912126099607, + "Partial": 0.5241481524734732 + }, + "difficulty": { + "Easy": 0.596920990471646, + "Moderate": 0.4603818463137054, + "Hard": 0.4590470067460482, + "Extreme": 0.3809658785230146 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7449388025193358, + "T2. Sequencing & Structure Reconstruction": 0.7399444536944532, + "T3. Evidence-Grounded QA": 0.6166666666666667, + "T4. Summarization & Synthesis": 0.54417036696111, + "T5. Attribution & Citation Alignment": 0.5088222013004289, + "T6. Aggregation & Clustering": 0.4705462063266301, + "T7. Consistency & Compliance Checking": 0.3211976903039678, + "T8. Structured & Numeric Reasoning": 0.2083333333333333, + "T9. Version & Code Diff Analysis": 0.4528903513431796, + "T10. Rule Induction & In-Context Learning": 0.45958333333333334, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665 + }, + "language": { + "Chinese": 0.48759226495133245, + "English": 0.47381552503763963 + } + }, + "pass@2": 0.206, + "BoN-3": { + "overall_metric": 0.5286875532565248, + "token_length": { + "8k": 0.5760868208214227, + "16k": 0.5315447995369911, + "32k": 0.5297979513353553, + "64k": 0.5139951126923608, + "128k": 0.5285145377275431, + "256k": 0.49218609742548064 + }, + "contextual_requirement": { + "Full": 0.4922563052647642, + "Partial": 0.5750545961551319 + }, + "difficulty": { + "Easy": 0.6571166886876132, + "Moderate": 0.5008690293131257, + "Hard": 0.5070400734318661, + "Extreme": 0.42048944373649116 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7851733453547192, + "T2. Sequencing & Structure Reconstruction": 0.7758457283457282, + "T3. Evidence-Grounded QA": 0.7083333333333334, + "T4. Summarization & Synthesis": 0.5534698517064113, + "T5. Attribution & Citation Alignment": 0.5639720868179612, + "T6. Aggregation & Clustering": 0.503918026189678, + "T7. Consistency & Compliance Checking": 0.34945026972752397, + "T8. Structured & Numeric Reasoning": 0.25277777777777777, + "T9. Version & Code Diff Analysis": 0.5104976262726122, + "T10. Rule Induction & In-Context Learning": 0.5270833333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.525 + }, + "language": { + "Chinese": 0.5330279709661675, + "English": 0.5243471355468845 + } + }, + "pass@3": 0.24266666666666667 +} \ No newline at end of file diff --git a/results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json b/results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..984f604d36398edfbdb83e4dc1374752d6d0b1fd --- /dev/null +++ b/results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.4499528005964066, + "inference_iteration_1_overall_metric": 0.4519835462001885, + "inference_iteration_2_overall_metric": 0.4481755772504262, + "inference_iteration_3_overall_metric": 0.4496992783386054, + "average_token_length_metric": { + "8k": 0.485225729559654, + "16k": 0.4524723240855649, + "32k": 0.46920448352940436, + "64k": 0.44046374240515457, + "128k": 0.4133092627171987, + "256k": 0.43904126128146514 + }, + "average_contextual_requirement_metric": { + "Full": 0.4116545212336913, + "Partial": 0.49869606523986354 + }, + "average_difficulty_metric": { + "Easy": 0.6191934548978654, + "Moderate": 0.4082147550465631, + "Hard": 0.3801988071084879, + "Extreme": 0.33778735493415807 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6830330861399296, + "T2. Sequencing & Structure Reconstruction": 0.6403219944448011, + "T3. Evidence-Grounded QA": 0.4833333333333333, + "T4. Summarization & Synthesis": 0.5086176566073063, + "T5. Attribution & Citation Alignment": 0.416914270509611, + "T6. Aggregation & Clustering": 0.4334853794839026, + "T7. Consistency & Compliance Checking": 0.27119391146489646, + "T8. Structured & Numeric Reasoning": 0.38966049382716056, + "T9. Version & Code Diff Analysis": 0.4348929522191275, + "T10. Rule Induction & In-Context Learning": 0.41300925925925924, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4027777777777778 + }, + "average_language_metric": { + "Chinese": 0.45819903421860664, + "English": 0.4417065669742075 + }, + "BoN-1": { + "overall_metric": 0.4519835462001885, + "token_length": { + "8k": 0.4879779124929164, + "16k": 0.4554840853531918, + "32k": 0.4648286187996774, + "64k": 0.42985632449506034, + "128k": 0.4307020670264534, + "256k": 0.443052269033835 + }, + "contextual_requirement": { + "Full": 0.41228070711895354, + "Partial": 0.5025144323035801 + }, + "difficulty": { + "Easy": 0.6285595261886431, + "Moderate": 0.4057015689049336, + "Hard": 0.37791019658117175, + "Extreme": 0.33760415329971205 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6904671153390595, + "T2. Sequencing & Structure Reconstruction": 0.6319390331890332, + "T3. Evidence-Grounded QA": 0.44166666666666665, + "T4. Summarization & Synthesis": 0.5079368349605524, + "T5. Attribution & Citation Alignment": 0.3963567606333699, + "T6. Aggregation & Clustering": 0.4315669444489273, + "T7. Consistency & Compliance Checking": 0.26717481095169254, + "T8. Structured & Numeric Reasoning": 0.40648148148148144, + "T9. Version & Code Diff Analysis": 0.4533152836127507, + "T10. Rule Induction & In-Context Learning": 0.4119444444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4583333333333333 + }, + "language": { + "Chinese": 0.47043321599568005, + "English": 0.43353387640469826 + } + }, + "pass@1": 0.21, + "BoN-2": { + "overall_metric": 0.5523435379453717, + "token_length": { + "8k": 0.6041368153338821, + "16k": 0.553143416205592, + "32k": 0.5547357356840433, + "64k": 0.5474714891955119, + "128k": 0.5080001305944092, + "256k": 0.5465736406587951 + }, + "contextual_requirement": { + "Full": 0.5094377873914124, + "Partial": 0.6069508568322307 + }, + "difficulty": { + "Easy": 0.7582302423860908, + "Moderate": 0.5069058318579235, + "Hard": 0.47636905813527697, + "Extreme": 0.40654974290892704 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7753422134076285, + "T2. Sequencing & Structure Reconstruction": 0.7188864376364378, + "T3. Evidence-Grounded QA": 0.6083333333333333, + "T4. Summarization & Synthesis": 0.5276281423571613, + "T5. Attribution & Citation Alignment": 0.573374443874177, + "T6. Aggregation & Clustering": 0.5278895685136558, + "T7. Consistency & Compliance Checking": 0.35338346649949204, + "T8. Structured & Numeric Reasoning": 0.5027777777777779, + "T9. Version & Code Diff Analysis": 0.552570101188694, + "T10. Rule Induction & In-Context Learning": 0.5220833333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.55 + }, + "language": { + "Chinese": 0.5649919855932303, + "English": 0.5396950902975145 + } + }, + "pass@2": 0.2753333333333333, + "BoN-3": { + "overall_metric": 0.5997056103547938, + "token_length": { + "8k": 0.6457585156659336, + "16k": 0.6123141997231359, + "32k": 0.6242961953070552, + "64k": 0.5876928890236057, + "128k": 0.5497742714361217, + "256k": 0.5783975909729129 + }, + "contextual_requirement": { + "Full": 0.5540426758661396, + "Partial": 0.6578220724312633 + }, + "difficulty": { + "Easy": 0.8056166900447767, + "Moderate": 0.5614066990728871, + "Hard": 0.5178805893116146, + "Extreme": 0.45303896497156343 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8083334868935526, + "T2. Sequencing & Structure Reconstruction": 0.7593085155585156, + "T3. Evidence-Grounded QA": 0.6833333333333333, + "T4. Summarization & Synthesis": 0.5344316475303361, + "T5. Attribution & Citation Alignment": 0.6383957562170883, + "T6. Aggregation & Clustering": 0.5743997782942697, + "T7. Consistency & Compliance Checking": 0.39861351698347697, + "T8. Structured & Numeric Reasoning": 0.5527777777777778, + "T9. Version & Code Diff Analysis": 0.5853585580965909, + "T10. Rule Induction & In-Context Learning": 0.5984722222222222, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6 + }, + "language": { + "Chinese": 0.6144058958264887, + "English": 0.5850053248830991 + } + }, + "pass@3": 0.31933333333333336 +} \ No newline at end of file diff --git a/results/Ministral-3-14B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json b/results/Ministral-3-14B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..034d947740b5b771402360bda5c3d57661ebc5a3 --- /dev/null +++ b/results/Ministral-3-14B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.40137741846169367, + "inference_iteration_1_overall_metric": 0.40238223426918757, + "inference_iteration_2_overall_metric": 0.4002952301959775, + "inference_iteration_3_overall_metric": 0.4014547909199151, + "average_token_length_metric": { + "8k": 0.43375517745982745, + "16k": 0.4624624276537502, + "32k": 0.4225675952668474, + "64k": 0.39702430034744873, + "128k": 0.3821065286583368, + "256k": 0.3103484813839513 + }, + "average_contextual_requirement_metric": { + "Full": 0.3639638275660222, + "Partial": 0.4489947159652748 + }, + "average_difficulty_metric": { + "Easy": 0.5260224045368944, + "Moderate": 0.3401707697729334, + "Hard": 0.35036961025265867, + "Extreme": 0.33853899745082133 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7189334347722878, + "T2. Sequencing & Structure Reconstruction": 0.6668259209925876, + "T3. Evidence-Grounded QA": 0.4583333333333333, + "T4. Summarization & Synthesis": 0.5221962563770883, + "T5. Attribution & Citation Alignment": 0.3136369167899536, + "T6. Aggregation & Clustering": 0.39798286937745314, + "T7. Consistency & Compliance Checking": 0.21946000799150195, + "T8. Structured & Numeric Reasoning": 0.14151234567901233, + "T9. Version & Code Diff Analysis": 0.47709977467470854, + "T10. Rule Induction & In-Context Learning": 0.4217592592592592, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.30000000000000004 + }, + "average_language_metric": { + "Chinese": 0.40560935669062276, + "English": 0.397145480232764 + }, + "BoN-1": { + "overall_metric": 0.40238223426918757, + "token_length": { + "8k": 0.433884589163934, + "16k": 0.46516909415802493, + "32k": 0.42676245820656594, + "64k": 0.3945954502711027, + "128k": 0.38087905781855047, + "256k": 0.31300275599695 + }, + "contextual_requirement": { + "Full": 0.36618501990567515, + "Partial": 0.44845141618638706 + }, + "difficulty": { + "Easy": 0.5313007708587812, + "Moderate": 0.33905574344991224, + "Hard": 0.35279460774167126, + "Extreme": 0.33532188262609514 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7231148921645373, + "T2. Sequencing & Structure Reconstruction": 0.6661998186998181, + "T3. Evidence-Grounded QA": 0.4666666666666667, + "T4. Summarization & Synthesis": 0.5230622050479962, + "T5. Attribution & Citation Alignment": 0.30176586689960616, + "T6. Aggregation & Clustering": 0.3921100544588917, + "T7. Consistency & Compliance Checking": 0.219946553696774, + "T8. Structured & Numeric Reasoning": 0.14675925925925926, + "T9. Version & Code Diff Analysis": 0.4747724555416158, + "T10. Rule Induction & In-Context Learning": 0.4193055555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665 + }, + "language": { + "Chinese": 0.40159850236109407, + "English": 0.4031659661772829 + } + }, + "pass@1": 0.158, + "BoN-2": { + "overall_metric": 0.4167918691419028, + "token_length": { + "8k": 0.4448864222174602, + "16k": 0.481483520242527, + "32k": 0.43929332248789354, + "64k": 0.4125546400739453, + "128k": 0.39499054247940135, + "256k": 0.3275427673501904 + }, + "contextual_requirement": { + "Full": 0.37697160991132156, + "Partial": 0.4674721990717337 + }, + "difficulty": { + "Easy": 0.543547780022054, + "Moderate": 0.3573487107807427, + "Hard": 0.3645996932831712, + "Extreme": 0.3512606476539851 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7338149078373205, + "T2. Sequencing & Structure Reconstruction": 0.6898572261072259, + "T3. Evidence-Grounded QA": 0.475, + "T4. Summarization & Synthesis": 0.5324649457390374, + "T5. Attribution & Citation Alignment": 0.3305062690008576, + "T6. Aggregation & Clustering": 0.41269190069190054, + "T7. Consistency & Compliance Checking": 0.24063695524418569, + "T8. Structured & Numeric Reasoning": 0.14953703703703705, + "T9. Version & Code Diff Analysis": 0.4968728427963255, + "T10. Rule Induction & In-Context Learning": 0.43041666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665 + }, + "language": { + "Chinese": 0.42183857405387454, + "English": 0.4117451642299312 + } + }, + "pass@2": 0.164, + "BoN-3": { + "overall_metric": 0.42795282912937055, + "token_length": { + "8k": 0.45584907332800734, + "16k": 0.4930808335792047, + "32k": 0.45565921785915897, + "64k": 0.41883977659018873, + "128k": 0.4041598036532658, + "256k": 0.34012826976639843 + }, + "contextual_requirement": { + "Full": 0.3885912574254956, + "Partial": 0.47804937493430194 + }, + "difficulty": { + "Easy": 0.554784793655902, + "Moderate": 0.37124888788597155, + "Hard": 0.37318051863597795, + "Extreme": 0.36223380606151817 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7412124373820532, + "T2. Sequencing & Structure Reconstruction": 0.6988453213453213, + "T3. Evidence-Grounded QA": 0.48333333333333334, + "T4. Summarization & Synthesis": 0.538030953852775, + "T5. Attribution & Citation Alignment": 0.34888079906957237, + "T6. Aggregation & Clustering": 0.4339707150850415, + "T7. Consistency & Compliance Checking": 0.24529018187024293, + "T8. Structured & Numeric Reasoning": 0.15046296296296297, + "T9. Version & Code Diff Analysis": 0.5141050625900392, + "T10. Rule Induction & In-Context Learning": 0.44708333333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333 + }, + "language": { + "Chinese": 0.43352032726356865, + "English": 0.42238533099517206 + } + }, + "pass@3": 0.16933333333333334 +} \ No newline at end of file diff --git a/results/Ministral-3-14B-Instruct-2512/thinking_context-224000_bon-3_summary.json b/results/Ministral-3-14B-Instruct-2512/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d11436d053696ca6931612d429ddedbfb43c15e3 --- /dev/null +++ b/results/Ministral-3-14B-Instruct-2512/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.45799174186842606, + "inference_iteration_1_overall_metric": 0.45848259675544495, + "inference_iteration_2_overall_metric": 0.46152550901732137, + "inference_iteration_3_overall_metric": 0.453967119832514, + "average_token_length_metric": { + "8k": 0.5187827600069922, + "16k": 0.48518599796025474, + "32k": 0.48745678732020276, + "64k": 0.4570416898883375, + "128k": 0.42361656559174016, + "256k": 0.3758666504430352 + }, + "average_contextual_requirement_metric": { + "Full": 0.4248803053301027, + "Partial": 0.500133570189931 + }, + "average_difficulty_metric": { + "Easy": 0.6756101555622144, + "Moderate": 0.39349242818800056, + "Hard": 0.37479147914388156, + "Extreme": 0.31661242864258926 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7608265794743194, + "T2. Sequencing & Structure Reconstruction": 0.6976467766457445, + "T3. Evidence-Grounded QA": 0.4138888888888888, + "T4. Summarization & Synthesis": 0.49943611989270814, + "T5. Attribution & Citation Alignment": 0.3460922902510361, + "T6. Aggregation & Clustering": 0.45809776319010986, + "T7. Consistency & Compliance Checking": 0.2280021697407482, + "T8. Structured & Numeric Reasoning": 0.41836419753086435, + "T9. Version & Code Diff Analysis": 0.5354951076952419, + "T10. Rule Induction & In-Context Learning": 0.462037037037037, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.35277777777777775 + }, + "average_language_metric": { + "Chinese": 0.43851283791556633, + "English": 0.4774706458212879 + }, + "BoN-1": { + "overall_metric": 0.45848259675544495, + "token_length": { + "8k": 0.538436412123529, + "16k": 0.47816429220213824, + "32k": 0.4805712657434, + "64k": 0.44911476875649875, + "128k": 0.4390480988508506, + "256k": 0.3655607428562557 + }, + "contextual_requirement": { + "Full": 0.42758275308934174, + "Partial": 0.49780967051230435 + }, + "difficulty": { + "Easy": 0.6788797898422243, + "Moderate": 0.39582478863697335, + "Hard": 0.37581391782555706, + "Extreme": 0.3125005687762375 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7602730263789274, + "T2. Sequencing & Structure Reconstruction": 0.7038347568610727, + "T3. Evidence-Grounded QA": 0.4583333333333333, + "T4. Summarization & Synthesis": 0.4990357866977353, + "T5. Attribution & Citation Alignment": 0.35405443033443207, + "T6. Aggregation & Clustering": 0.43771712805012347, + "T7. Consistency & Compliance Checking": 0.21275716732831734, + "T8. Structured & Numeric Reasoning": 0.42546296296296293, + "T9. Version & Code Diff Analysis": 0.5013174605476809, + "T10. Rule Induction & In-Context Learning": 0.48194444444444445, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334 + }, + "language": { + "Chinese": 0.43269825076497226, + "English": 0.48426694274591864 + } + }, + "pass@1": 0.22733333333333333, + "BoN-2": { + "overall_metric": 0.5227530650530186, + "token_length": { + "8k": 0.5815903903274882, + "16k": 0.5495634888346752, + "32k": 0.5446963722215565, + "64k": 0.5299811967182008, + "128k": 0.4888075896458958, + "256k": 0.4418793525702981 + }, + "contextual_requirement": { + "Full": 0.489360650067773, + "Partial": 0.565252502306969 + }, + "difficulty": { + "Easy": 0.7604456302850787, + "Moderate": 0.463797787486306, + "Hard": 0.43728803588921916, + "Extreme": 0.35722954733316814 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8036702370810763, + "T2. Sequencing & Structure Reconstruction": 0.7488194240400123, + "T3. Evidence-Grounded QA": 0.5083333333333333, + "T4. Summarization & Synthesis": 0.5120697680827194, + "T5. Attribution & Citation Alignment": 0.4029499793922618, + "T6. Aggregation & Clustering": 0.5304624812643679, + "T7. Consistency & Compliance Checking": 0.27354479208047294, + "T8. Structured & Numeric Reasoning": 0.4962962962962963, + "T9. Version & Code Diff Analysis": 0.6272818834382945, + "T10. Rule Induction & In-Context Learning": 0.5725, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333 + }, + "language": { + "Chinese": 0.5068736065410564, + "English": 0.5386325235649823 + } + }, + "pass@2": 0.2833333333333333, + "BoN-3": { + "overall_metric": 0.5542260073321058, + "token_length": { + "8k": 0.6154421507239413, + "16k": 0.5842317006106588, + "32k": 0.5699342902198496, + "64k": 0.5576866952391514, + "128k": 0.5184072720538686, + "256k": 0.4796539351451706 + }, + "contextual_requirement": { + "Full": 0.5204393104129562, + "Partial": 0.5972272579564804 + }, + "difficulty": { + "Easy": 0.7905996585472251, + "Moderate": 0.5012307381116559, + "Hard": 0.46914010908714937, + "Extreme": 0.3859836380407792 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8214474184096282, + "T2. Sequencing & Structure Reconstruction": 0.7666511293717175, + "T3. Evidence-Grounded QA": 0.525, + "T4. Summarization & Synthesis": 0.5197606048087388, + "T5. Attribution & Citation Alignment": 0.4423027060091871, + "T6. Aggregation & Clustering": 0.5772430200792206, + "T7. Consistency & Compliance Checking": 0.3040682257365206, + "T8. Structured & Numeric Reasoning": 0.5402777777777777, + "T9. Version & Code Diff Analysis": 0.661946364328448, + "T10. Rule Induction & In-Context Learning": 0.5916666666666668, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667 + }, + "language": { + "Chinese": 0.5366930860176997, + "English": 0.5717589286465138 + } + }, + "pass@3": 0.31133333333333335 +} \ No newline at end of file diff --git a/results/Ministral-3-3B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json b/results/Ministral-3-3B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..32c3d32fcf911b08d21f1b583c0511c193295456 --- /dev/null +++ b/results/Ministral-3-3B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.30179346535967666, + "inference_iteration_1_overall_metric": 0.3029527040012084, + "inference_iteration_2_overall_metric": 0.30315184463083855, + "inference_iteration_3_overall_metric": 0.2992758474469829, + "average_token_length_metric": { + "8k": 0.31721543930238405, + "16k": 0.3412774869133842, + "32k": 0.31691876705456545, + "64k": 0.2687371996940676, + "128k": 0.30453000021054333, + "256k": 0.2620818989831162 + }, + "average_contextual_requirement_metric": { + "Full": 0.2701610943483024, + "Partial": 0.34205284664688057 + }, + "average_difficulty_metric": { + "Easy": 0.38811634301315323, + "Moderate": 0.23804107928640247, + "Hard": 0.2856786739337558, + "Extreme": 0.2596888909694311 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.5494421813257797, + "T2. Sequencing & Structure Reconstruction": 0.5158060495268492, + "T3. Evidence-Grounded QA": 0.43333333333333335, + "T4. Summarization & Synthesis": 0.5079974835401471, + "T5. Attribution & Citation Alignment": 0.1926176241888911, + "T6. Aggregation & Clustering": 0.30095602813594396, + "T7. Consistency & Compliance Checking": 0.11630381954916796, + "T8. Structured & Numeric Reasoning": 0.0558641975308642, + "T9. Version & Code Diff Analysis": 0.27038742910884633, + "T10. Rule Induction & In-Context Learning": 0.332037037037037, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2611111111111111 + }, + "average_language_metric": { + "Chinese": 0.32613475528214075, + "English": 0.2774521754372128 + }, + "BoN-1": { + "overall_metric": 0.3029527040012084, + "token_length": { + "8k": 0.3191767566971557, + "16k": 0.3423054126585435, + "32k": 0.31611056842908253, + "64k": 0.26522388398511487, + "128k": 0.31148016857049676, + "256k": 0.26341943366685827 + }, + "contextual_requirement": { + "Full": 0.27081993053812903, + "Partial": 0.3438489611360375 + }, + "difficulty": { + "Easy": 0.3900871254265116, + "Moderate": 0.24072949435669488, + "Hard": 0.2869463348423525, + "Extreme": 0.25888351569934015 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5387044717426223, + "T2. Sequencing & Structure Reconstruction": 0.5228428722670637, + "T3. Evidence-Grounded QA": 0.425, + "T4. Summarization & Synthesis": 0.5081133322337062, + "T5. Attribution & Citation Alignment": 0.19946236214954846, + "T6. Aggregation & Clustering": 0.300068633862112, + "T7. Consistency & Compliance Checking": 0.12608033855092354, + "T8. Structured & Numeric Reasoning": 0.05092592592592593, + "T9. Version & Code Diff Analysis": 0.2735623030026126, + "T10. Rule Induction & In-Context Learning": 0.33694444444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.26666666666666666 + }, + "language": { + "Chinese": 0.3278788873616267, + "English": 0.2780265206407908 + } + }, + "pass@1": 0.09933333333333333, + "BoN-2": { + "overall_metric": 0.3235664407635165, + "token_length": { + "8k": 0.33752116035721835, + "16k": 0.3644601713974257, + "32k": 0.34581977441150585, + "64k": 0.2903196448924103, + "128k": 0.3272049759144316, + "256k": 0.2760729176081059 + }, + "contextual_requirement": { + "Full": 0.2920011834968168, + "Partial": 0.3637404045574977 + }, + "difficulty": { + "Easy": 0.4147958762035026, + "Moderate": 0.2544155279477453, + "Hard": 0.3096540579882773, + "Extreme": 0.2781862238174447 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5653125926955125, + "T2. Sequencing & Structure Reconstruction": 0.5686493732875312, + "T3. Evidence-Grounded QA": 0.4583333333333333, + "T4. Summarization & Synthesis": 0.5187643397494163, + "T5. Attribution & Citation Alignment": 0.2137291166026424, + "T6. Aggregation & Clustering": 0.32250141360204826, + "T7. Consistency & Compliance Checking": 0.1323008909170473, + "T8. Structured & Numeric Reasoning": 0.057870370370370364, + "T9. Version & Code Diff Analysis": 0.3024494082079856, + "T10. Rule Induction & In-Context Learning": 0.35666666666666663, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667 + }, + "language": { + "Chinese": 0.34580247197246466, + "English": 0.3013304095545682 + } + }, + "pass@2": 0.108, + "BoN-3": { + "overall_metric": 0.3337187991769299, + "token_length": { + "8k": 0.3545410484752056, + "16k": 0.37343595068728874, + "32k": 0.3540675331326773, + "64k": 0.3014677883245934, + "128k": 0.3329694407689707, + "256k": 0.2858310336728446 + }, + "contextual_requirement": { + "Full": 0.30091145355832494, + "Partial": 0.3754736026915186 + }, + "difficulty": { + "Easy": 0.4222694522861904, + "Moderate": 0.2710401461853368, + "Hard": 0.31823781489879627, + "Extreme": 0.28805225456849304 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5911236100282776, + "T2. Sequencing & Structure Reconstruction": 0.5777883824265405, + "T3. Evidence-Grounded QA": 0.4583333333333333, + "T4. Summarization & Synthesis": 0.5244656227142894, + "T5. Attribution & Citation Alignment": 0.22343915037802925, + "T6. Aggregation & Clustering": 0.3402498647457522, + "T7. Consistency & Compliance Checking": 0.14046319763373694, + "T8. Structured & Numeric Reasoning": 0.07175925925925926, + "T9. Version & Code Diff Analysis": 0.3067930750396991, + "T10. Rule Induction & In-Context Learning": 0.3608333333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3 + }, + "language": { + "Chinese": 0.35283833596795733, + "English": 0.3145992623859028 + } + }, + "pass@3": 0.112 +} \ No newline at end of file diff --git a/results/Ministral-3-3B-Instruct-2512/thinking_context-224000_bon-3_summary.json b/results/Ministral-3-3B-Instruct-2512/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..148c001169a70f28357640aeaa567a5480f2c298 --- /dev/null +++ b/results/Ministral-3-3B-Instruct-2512/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3454036931179677, + "inference_iteration_1_overall_metric": 0.34873430118319537, + "inference_iteration_2_overall_metric": 0.34879316780005654, + "inference_iteration_3_overall_metric": 0.338683610370652, + "average_token_length_metric": { + "8k": 0.3692200735727051, + "16k": 0.40119881531813145, + "32k": 0.3781626650292353, + "64k": 0.32612186138726695, + "128k": 0.3193179750507245, + "256k": 0.27840076834974453 + }, + "average_contextual_requirement_metric": { + "Full": 0.31819246274226237, + "Partial": 0.3800361681415936 + }, + "average_difficulty_metric": { + "Easy": 0.4964533039214179, + "Moderate": 0.255962770404739, + "Hard": 0.30232089813471313, + "Extreme": 0.2669796976793795 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.5858449524514547, + "T2. Sequencing & Structure Reconstruction": 0.586324154256027, + "T3. Evidence-Grounded QA": 0.37777777777777766, + "T4. Summarization & Synthesis": 0.49033515640007624, + "T5. Attribution & Citation Alignment": 0.21401668096493356, + "T6. Aggregation & Clustering": 0.3516741155045124, + "T7. Consistency & Compliance Checking": 0.154158877596512, + "T8. Structured & Numeric Reasoning": 0.24166666666666664, + "T9. Version & Code Diff Analysis": 0.352322026546868, + "T10. Rule Induction & In-Context Learning": 0.32856481481481475, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2611111111111111 + }, + "average_language_metric": { + "Chinese": 0.32661149524563937, + "English": 0.364195890990297 + }, + "BoN-1": { + "overall_metric": 0.34873430118319537, + "token_length": { + "8k": 0.3656122243049499, + "16k": 0.3958940853914839, + "32k": 0.3703173794807147, + "64k": 0.3317196346770501, + "128k": 0.34303779448458255, + "256k": 0.2858246887603903 + }, + "contextual_requirement": { + "Full": 0.32665848057093605, + "Partial": 0.3768308001442518 + }, + "difficulty": { + "Easy": 0.50262211292123, + "Moderate": 0.25361014093585454, + "Hard": 0.310150509372198, + "Extreme": 0.2679790122903014 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.58240433072649, + "T2. Sequencing & Structure Reconstruction": 0.6066101012652736, + "T3. Evidence-Grounded QA": 0.36666666666666664, + "T4. Summarization & Synthesis": 0.4928112466906771, + "T5. Attribution & Citation Alignment": 0.2064491285924213, + "T6. Aggregation & Clustering": 0.3455987518896014, + "T7. Consistency & Compliance Checking": 0.15983398189125114, + "T8. Structured & Numeric Reasoning": 0.2453703703703704, + "T9. Version & Code Diff Analysis": 0.37553263462157643, + "T10. Rule Induction & In-Context Learning": 0.31083333333333335, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667 + }, + "language": { + "Chinese": 0.3267683933919365, + "English": 0.3707002089744536 + } + }, + "pass@1": 0.13466666666666666, + "BoN-2": { + "overall_metric": 0.4230141245251924, + "token_length": { + "8k": 0.46908993057301923, + "16k": 0.4637780067071494, + "32k": 0.4631374771588592, + "64k": 0.4023395380855012, + "128k": 0.38963984120856304, + "256k": 0.3500999534180651 + }, + "contextual_requirement": { + "Full": 0.38841564042851795, + "Partial": 0.4670485588300523 + }, + "difficulty": { + "Easy": 0.6182575201716307, + "Moderate": 0.3194064855847735, + "Hard": 0.36683106532741566, + "Extreme": 0.3140872951169107 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6586830692904562, + "T2. Sequencing & Structure Reconstruction": 0.6861384911384911, + "T3. Evidence-Grounded QA": 0.5, + "T4. Summarization & Synthesis": 0.5047315613131492, + "T5. Attribution & Citation Alignment": 0.2972254153066172, + "T6. Aggregation & Clustering": 0.42732208121097004, + "T7. Consistency & Compliance Checking": 0.19799483338543603, + "T8. Structured & Numeric Reasoning": 0.3277777777777777, + "T9. Version & Code Diff Analysis": 0.4437559809549206, + "T10. Rule Induction & In-Context Learning": 0.41750000000000004, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.35 + }, + "language": { + "Chinese": 0.40188885021867327, + "English": 0.4441393988317126 + } + }, + "pass@2": 0.184, + "BoN-3": { + "overall_metric": 0.45091674287019584, + "token_length": { + "8k": 0.4890912273204133, + "16k": 0.5060340159767878, + "32k": 0.48168220208134793, + "64k": 0.43952399968187594, + "128k": 0.41252202308139446, + "256k": 0.3766469890793584 + }, + "contextual_requirement": { + "Full": 0.4168658522047825, + "Partial": 0.4942542400807231 + }, + "difficulty": { + "Easy": 0.6450588018724265, + "Moderate": 0.3584312989173043, + "Hard": 0.39189079094493173, + "Extreme": 0.3377583814377241 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6863503216486717, + "T2. Sequencing & Structure Reconstruction": 0.7174943112443114, + "T3. Evidence-Grounded QA": 0.5416666666666666, + "T4. Summarization & Synthesis": 0.5097159356890285, + "T5. Attribution & Citation Alignment": 0.3359084101583427, + "T6. Aggregation & Clustering": 0.46349171404726974, + "T7. Consistency & Compliance Checking": 0.2201801789041147, + "T8. Structured & Numeric Reasoning": 0.34444444444444444, + "T9. Version & Code Diff Analysis": 0.4909269121544662, + "T10. Rule Induction & In-Context Learning": 0.44555555555555554, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664 + }, + "language": { + "Chinese": 0.41909228907998713, + "English": 0.4827411966604057 + } + }, + "pass@3": 0.202 +} \ No newline at end of file diff --git a/results/Ministral-3-8B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json b/results/Ministral-3-8B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..66fe280b380dd2e9bd48276327777d7025fdaa01 --- /dev/null +++ b/results/Ministral-3-8B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3780437992177988, + "inference_iteration_1_overall_metric": 0.37746736206286746, + "inference_iteration_2_overall_metric": 0.3812187781897308, + "inference_iteration_3_overall_metric": 0.3754452574007977, + "average_token_length_metric": { + "8k": 0.40747091961293536, + "16k": 0.4217586689370988, + "32k": 0.4100622794449128, + "64k": 0.3509822308146611, + "128k": 0.3761773667981329, + "256k": 0.30181132969905095 + }, + "average_contextual_requirement_metric": { + "Full": 0.3448423936300968, + "Partial": 0.4203001336021466 + }, + "average_difficulty_metric": { + "Easy": 0.5026909853492837, + "Moderate": 0.3172643692616012, + "Hard": 0.3210729729273563, + "Extreme": 0.3188389818084875 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6788284158016221, + "T2. Sequencing & Structure Reconstruction": 0.6236659328325996, + "T3. Evidence-Grounded QA": 0.47777777777777775, + "T4. Summarization & Synthesis": 0.5139559182541029, + "T5. Attribution & Citation Alignment": 0.2898615675534933, + "T6. Aggregation & Clustering": 0.38892311924203327, + "T7. Consistency & Compliance Checking": 0.2090272807257386, + "T8. Structured & Numeric Reasoning": 0.10648148148148148, + "T9. Version & Code Diff Analysis": 0.4152267224956728, + "T10. Rule Induction & In-Context Learning": 0.3640277777777777, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3055555555555555 + }, + "average_language_metric": { + "Chinese": 0.3899531126999766, + "English": 0.36613448573562096 + }, + "BoN-1": { + "overall_metric": 0.37746736206286746, + "token_length": { + "8k": 0.4069937376708668, + "16k": 0.41722807754864144, + "32k": 0.40789581779808437, + "64k": 0.34921996994765486, + "128k": 0.37545000489943714, + "256k": 0.3080165645125228 + }, + "contextual_requirement": { + "Full": 0.3453649180362828, + "Partial": 0.4183250180967035 + }, + "difficulty": { + "Easy": 0.5044527526099465, + "Moderate": 0.3125637979759982, + "Hard": 0.3178750307733104, + "Extreme": 0.32013180642672173 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6846023385004407, + "T2. Sequencing & Structure Reconstruction": 0.6255770155770156, + "T3. Evidence-Grounded QA": 0.475, + "T4. Summarization & Synthesis": 0.5150710795645007, + "T5. Attribution & Citation Alignment": 0.2866964120580854, + "T6. Aggregation & Clustering": 0.38933176968982663, + "T7. Consistency & Compliance Checking": 0.20492564574674327, + "T8. Structured & Numeric Reasoning": 0.10555555555555556, + "T9. Version & Code Diff Analysis": 0.42320350137539475, + "T10. Rule Induction & In-Context Learning": 0.3668055555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667 + }, + "language": { + "Chinese": 0.3917885072903086, + "English": 0.3631462168354272 + } + }, + "pass@1": 0.14066666666666666, + "BoN-2": { + "overall_metric": 0.3960720661735587, + "token_length": { + "8k": 0.4267853319337846, + "16k": 0.4487497839538122, + "32k": 0.42026835310130006, + "64k": 0.37076824135007136, + "128k": 0.3903702458059971, + "256k": 0.3194904408963875 + }, + "contextual_requirement": { + "Full": 0.3617541651771871, + "Partial": 0.43974939471439556 + }, + "difficulty": { + "Easy": 0.5236613839420188, + "Moderate": 0.32881680957334175, + "Hard": 0.3378863045377575, + "Extreme": 0.3386957095958586 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6941496673211041, + "T2. Sequencing & Structure Reconstruction": 0.6537227587227588, + "T3. Evidence-Grounded QA": 0.5, + "T4. Summarization & Synthesis": 0.5266833803950245, + "T5. Attribution & Citation Alignment": 0.30914312627907664, + "T6. Aggregation & Clustering": 0.4099625037688756, + "T7. Consistency & Compliance Checking": 0.21406107873077526, + "T8. Structured & Numeric Reasoning": 0.11296296296296296, + "T9. Version & Code Diff Analysis": 0.43908318736871094, + "T10. Rule Induction & In-Context Learning": 0.3893055555555555, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333 + }, + "language": { + "Chinese": 0.41022855768773003, + "English": 0.38191557465938764 + } + }, + "pass@2": 0.15333333333333332, + "BoN-3": { + "overall_metric": 0.40392076628937795, + "token_length": { + "8k": 0.44011246245033525, + "16k": 0.45248242271387923, + "32k": 0.4364757916450588, + "64k": 0.3739727159425725, + "128k": 0.3968686251866505, + "256k": 0.3236125797977717 + }, + "contextual_requirement": { + "Full": 0.36776720734616364, + "Partial": 0.4499343867625605 + }, + "difficulty": { + "Easy": 0.5291795040463627, + "Moderate": 0.3398411270992665, + "Hard": 0.3454895416328106, + "Extreme": 0.3471728755003404 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6957621908179982, + "T2. Sequencing & Structure Reconstruction": 0.6586220261220265, + "T3. Evidence-Grounded QA": 0.5166666666666667, + "T4. Summarization & Synthesis": 0.5308128403233062, + "T5. Attribution & Citation Alignment": 0.3225311270119217, + "T6. Aggregation & Clustering": 0.4233804016791107, + "T7. Consistency & Compliance Checking": 0.22394925367429327, + "T8. Structured & Numeric Reasoning": 0.11851851851851854, + "T9. Version & Code Diff Analysis": 0.4448702446451986, + "T10. Rule Induction & In-Context Learning": 0.3893055555555555, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667 + }, + "language": { + "Chinese": 0.4204961295794779, + "English": 0.3873454029992785 + } + }, + "pass@3": 0.15933333333333333 +} \ No newline at end of file diff --git a/results/Ministral-3-8B-Instruct-2512/thinking_context-224000_bon-3_summary.json b/results/Ministral-3-8B-Instruct-2512/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..3a0d612493f5f689a1c1d1d43fa3d387509f2e3e --- /dev/null +++ b/results/Ministral-3-8B-Instruct-2512/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.4445697906316412, + "inference_iteration_1_overall_metric": 0.4414354535926015, + "inference_iteration_2_overall_metric": 0.44685007554136563, + "inference_iteration_3_overall_metric": 0.44542384276095776, + "average_token_length_metric": { + "8k": 0.5011776417934968, + "16k": 0.48676163299638525, + "32k": 0.4768790712429065, + "64k": 0.43057655714960325, + "128k": 0.41136598921457657, + "256k": 0.36065785139288176 + }, + "average_contextual_requirement_metric": { + "Full": 0.406935522390496, + "Partial": 0.49246795021128126 + }, + "average_difficulty_metric": { + "Easy": 0.6714266695708587, + "Moderate": 0.3525997582691683, + "Hard": 0.3499422819715659, + "Extreme": 0.3186190490562193 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7086003520112202, + "T2. Sequencing & Structure Reconstruction": 0.6809300421800422, + "T3. Evidence-Grounded QA": 0.43888888888888894, + "T4. Summarization & Synthesis": 0.4966730523372876, + "T5. Attribution & Citation Alignment": 0.3928781055792416, + "T6. Aggregation & Clustering": 0.4235524897993562, + "T7. Consistency & Compliance Checking": 0.2048019407256743, + "T8. Structured & Numeric Reasoning": 0.39459876543209865, + "T9. Version & Code Diff Analysis": 0.5072684442594425, + "T10. Rule Induction & In-Context Learning": 0.46134259259259264, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3361111111111112 + }, + "average_language_metric": { + "Chinese": 0.4274680156614393, + "English": 0.4616715656018438 + }, + "BoN-1": { + "overall_metric": 0.4414354535926015, + "token_length": { + "8k": 0.49046479436781104, + "16k": 0.467031355142411, + "32k": 0.48202229982107525, + "64k": 0.4432590731600766, + "128k": 0.42424252636864085, + "256k": 0.34159267269559335 + }, + "contextual_requirement": { + "Full": 0.4063419548203588, + "Partial": 0.4860999065754557 + }, + "difficulty": { + "Easy": 0.6551940293849869, + "Moderate": 0.3509192356404266, + "Hard": 0.3599913200493228, + "Extreme": 0.32021933552500187 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7053507564972957, + "T2. Sequencing & Structure Reconstruction": 0.6859980759980755, + "T3. Evidence-Grounded QA": 0.44166666666666665, + "T4. Summarization & Synthesis": 0.49921805286585674, + "T5. Attribution & Citation Alignment": 0.3949316390801899, + "T6. Aggregation & Clustering": 0.41861071554912704, + "T7. Consistency & Compliance Checking": 0.2063855467243845, + "T8. Structured & Numeric Reasoning": 0.37129629629629635, + "T9. Version & Code Diff Analysis": 0.5077280298336089, + "T10. Rule Induction & In-Context Learning": 0.4469444444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667 + }, + "language": { + "Chinese": 0.42410938192847564, + "English": 0.4587615252567272 + } + }, + "pass@1": 0.204, + "BoN-2": { + "overall_metric": 0.5073666298083515, + "token_length": { + "8k": 0.5709490486184731, + "16k": 0.5511354485905285, + "32k": 0.5387154833941795, + "64k": 0.4998902412554904, + "128k": 0.46094293028745753, + "256k": 0.422566626703988 + }, + "contextual_requirement": { + "Full": 0.47415575005443095, + "Partial": 0.549635022222436 + }, + "difficulty": { + "Easy": 0.7571129884853747, + "Moderate": 0.4236864876869295, + "Hard": 0.4106963399857809, + "Extreme": 0.3522382887844671 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7586612129799296, + "T2. Sequencing & Structure Reconstruction": 0.7268826543826544, + "T3. Evidence-Grounded QA": 0.525, + "T4. Summarization & Synthesis": 0.5087847643179455, + "T5. Attribution & Citation Alignment": 0.4821472159010296, + "T6. Aggregation & Clustering": 0.5023386543115935, + "T7. Consistency & Compliance Checking": 0.2538128132470806, + "T8. Structured & Numeric Reasoning": 0.45879629629629626, + "T9. Version & Code Diff Analysis": 0.5837409347959492, + "T10. Rule Induction & In-Context Learning": 0.5177777777777777, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667 + }, + "language": { + "Chinese": 0.48982515153338074, + "English": 0.5249081080833256 + } + }, + "pass@2": 0.26, + "BoN-3": { + "overall_metric": 0.54231740926941, + "token_length": { + "8k": 0.6068057358810921, + "16k": 0.604506768606215, + "32k": 0.5734498234618363, + "64k": 0.5181523235316485, + "128k": 0.4882018386364437, + "256k": 0.4627879654992305 + }, + "contextual_requirement": { + "Full": 0.5027116133357081, + "Partial": 0.592724785912306 + }, + "difficulty": { + "Easy": 0.8109806331655524, + "Moderate": 0.44576401021132206, + "Hard": 0.43505232412248046, + "Extreme": 0.3818807456723898 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8065060178436674, + "T2. Sequencing & Structure Reconstruction": 0.761479215229215, + "T3. Evidence-Grounded QA": 0.55, + "T4. Summarization & Synthesis": 0.512142930512617, + "T5. Attribution & Citation Alignment": 0.5013791499027004, + "T6. Aggregation & Clustering": 0.5419289746475281, + "T7. Consistency & Compliance Checking": 0.27906109745150515, + "T8. Structured & Numeric Reasoning": 0.5125, + "T9. Version & Code Diff Analysis": 0.6030585275642201, + "T10. Rule Induction & In-Context Learning": 0.6025000000000001, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665 + }, + "language": { + "Chinese": 0.5225371072526017, + "English": 0.5620977112862204 + } + }, + "pass@3": 0.292 +} \ No newline at end of file diff --git a/results/Ministral-8B-Instruct-2410/nonthinking_context-120000_bon-3_summary.json b/results/Ministral-8B-Instruct-2410/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..e514863f61844da0b44070579cb8da37777ac545 --- /dev/null +++ b/results/Ministral-8B-Instruct-2410/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.17559419950360972, + "inference_iteration_1_overall_metric": 0.1798365624509928, + "inference_iteration_2_overall_metric": 0.17628623970116333, + "inference_iteration_3_overall_metric": 0.17065979635867265, + "average_token_length_metric": { + "8k": 0.23670773015182286, + "16k": 0.2225568752590229, + "32k": 0.18016652444641598, + "64k": 0.1429897738729207, + "128k": 0.13700010614467598, + "256k": 0.13414418714679874 + }, + "average_contextual_requirement_metric": { + "Full": 0.16475960378398763, + "Partial": 0.1893836849649465 + }, + "average_difficulty_metric": { + "Easy": 0.1985800924396908, + "Moderate": 0.12257638309875012, + "Hard": 0.18613560242292748, + "Extreme": 0.1783134792811784 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.39248676390172876, + "T2. Sequencing & Structure Reconstruction": 0.41139201610622733, + "T3. Evidence-Grounded QA": 0.16388888888888892, + "T4. Summarization & Synthesis": 0.3808268181260766, + "T5. Attribution & Citation Alignment": 0.07226655367312615, + "T6. Aggregation & Clustering": 0.1829192046779024, + "T7. Consistency & Compliance Checking": 0.0756354691154679, + "T8. Structured & Numeric Reasoning": 0.01790123456790123, + "T9. Version & Code Diff Analysis": 0.10878073870531223, + "T10. Rule Induction & In-Context Learning": 0.16171296296296298, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.08888888888888888 + }, + "average_language_metric": { + "Chinese": 0.16472451449115244, + "English": 0.1864638845160666 + }, + "BoN-1": { + "overall_metric": 0.1798365624509928, + "token_length": { + "8k": 0.24346526021614548, + "16k": 0.22653552572263724, + "32k": 0.1770017940649281, + "64k": 0.15054614321487733, + "128k": 0.15816316532578398, + "256k": 0.12330748616158495 + }, + "contextual_requirement": { + "Full": 0.17138999029023816, + "Partial": 0.19058674520104432 + }, + "difficulty": { + "Easy": 0.20041414087611367, + "Moderate": 0.12656096910842649, + "Hard": 0.20970773721313615, + "Extreme": 0.17266720828925194 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.39139525725719654, + "T2. Sequencing & Structure Reconstruction": 0.43729511229511225, + "T3. Evidence-Grounded QA": 0.19166666666666668, + "T4. Summarization & Synthesis": 0.3753264284837986, + "T5. Attribution & Citation Alignment": 0.07301644262170577, + "T6. Aggregation & Clustering": 0.18246822005155336, + "T7. Consistency & Compliance Checking": 0.07435907091608848, + "T8. Structured & Numeric Reasoning": 0.016666666666666666, + "T9. Version & Code Diff Analysis": 0.1033217424170237, + "T10. Rule Induction & In-Context Learning": 0.16569444444444445, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.1 + }, + "language": { + "Chinese": 0.17157094623226812, + "English": 0.18810217866971762 + } + }, + "pass@1": 0.042, + "BoN-2": { + "overall_metric": 0.24632458802491347, + "token_length": { + "8k": 0.3310742150705802, + "16k": 0.2986080788714333, + "32k": 0.2667237213160877, + "64k": 0.1950849880242041, + "128k": 0.20119049419756752, + "256k": 0.1852660306696089 + }, + "contextual_requirement": { + "Full": 0.22948115089254192, + "Partial": 0.2677616898297509 + }, + "difficulty": { + "Easy": 0.2985290634429019, + "Moderate": 0.17216911619732905, + "Hard": 0.26824836302970206, + "Extreme": 0.22344368627563585 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.49278911557042887, + "T2. Sequencing & Structure Reconstruction": 0.54627701002701, + "T3. Evidence-Grounded QA": 0.2833333333333333, + "T4. Summarization & Synthesis": 0.41486609223778365, + "T5. Attribution & Citation Alignment": 0.13174847956194702, + "T6. Aggregation & Clustering": 0.28143118609785267, + "T7. Consistency & Compliance Checking": 0.11797004643635843, + "T8. Structured & Numeric Reasoning": 0.027314814814814816, + "T9. Version & Code Diff Analysis": 0.16858035966848986, + "T10. Rule Induction & In-Context Learning": 0.2597222222222222, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666 + }, + "language": { + "Chinese": 0.23254504310620405, + "English": 0.2601041329436236 + } + }, + "pass@2": 0.06733333333333333, + "BoN-3": { + "overall_metric": 0.28484680397856615, + "token_length": { + "8k": 0.3735683555982749, + "16k": 0.34629859523101947, + "32k": 0.31456400077002905, + "64k": 0.22697550391574145, + "128k": 0.22085267461030403, + "256k": 0.2268216937460278 + }, + "contextual_requirement": { + "Full": 0.265505541689755, + "Partial": 0.3094629559825076 + }, + "difficulty": { + "Easy": 0.3518525654113404, + "Moderate": 0.20247492584084434, + "Hard": 0.2991035774243336, + "Extreme": 0.2561842772635152 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.5450181829888936, + "T2. Sequencing & Structure Reconstruction": 0.5925035612535613, + "T3. Evidence-Grounded QA": 0.38333333333333336, + "T4. Summarization & Synthesis": 0.4350813325596142, + "T5. Attribution & Citation Alignment": 0.16950641606988356, + "T6. Aggregation & Clustering": 0.3244020623625887, + "T7. Consistency & Compliance Checking": 0.1368650403313524, + "T8. Structured & Numeric Reasoning": 0.04398148148148148, + "T9. Version & Code Diff Analysis": 0.20838045837476707, + "T10. Rule Induction & In-Context Learning": 0.27722222222222215, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668 + }, + "language": { + "Chinese": 0.2733872958268853, + "English": 0.29630631213024705 + } + }, + "pass@3": 0.08666666666666667 +} \ No newline at end of file diff --git a/results/Ministral-8B-Instruct-2410/thinking_context-120000_bon-3_summary.json b/results/Ministral-8B-Instruct-2410/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d5ecf959c2b2f40b926ab225c2de149c27a0568e --- /dev/null +++ b/results/Ministral-8B-Instruct-2410/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.14430959523166526, + "inference_iteration_1_overall_metric": 0.13849588125287102, + "inference_iteration_2_overall_metric": 0.14293536519106903, + "inference_iteration_3_overall_metric": 0.15149753925105558, + "average_token_length_metric": { + "8k": 0.21115060917104964, + "16k": 0.19398692652809305, + "32k": 0.12654145463127237, + "64k": 0.10693470763289076, + "128k": 0.11058360115846737, + "256k": 0.11666027226821786 + }, + "average_contextual_requirement_metric": { + "Full": 0.13689831594533386, + "Partial": 0.15374213250517774 + }, + "average_difficulty_metric": { + "Easy": 0.16844291277154172, + "Moderate": 0.09886695623599538, + "Hard": 0.13984977399541215, + "Extreme": 0.15064948603349507 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.2973074399585583, + "T2. Sequencing & Structure Reconstruction": 0.3370251608769255, + "T3. Evidence-Grounded QA": 0.11944444444444446, + "T4. Summarization & Synthesis": 0.38152337418528626, + "T5. Attribution & Citation Alignment": 0.04010115798342492, + "T6. Aggregation & Clustering": 0.13501345810105547, + "T7. Consistency & Compliance Checking": 0.05419569047101407, + "T8. Structured & Numeric Reasoning": 0.047839506172839504, + "T9. Version & Code Diff Analysis": 0.10414538082981198, + "T10. Rule Induction & In-Context Learning": 0.11319444444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.055555555555555546 + }, + "average_language_metric": { + "Chinese": 0.1233481918215467, + "English": 0.16527099864178366 + }, + "BoN-1": { + "overall_metric": 0.13849588125287102, + "token_length": { + "8k": 0.21715146588540263, + "16k": 0.17919497907671647, + "32k": 0.13758692284538804, + "64k": 0.09292136118327776, + "128k": 0.10222628424195211, + "256k": 0.10189427428448873 + }, + "contextual_requirement": { + "Full": 0.13174868671272114, + "Partial": 0.14708321975851618 + }, + "difficulty": { + "Easy": 0.14908853505231587, + "Moderate": 0.09146123280540662, + "Hard": 0.14088385418342952, + "Extreme": 0.15621685873708296 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.25192356457401754, + "T2. Sequencing & Structure Reconstruction": 0.3288852758782787, + "T3. Evidence-Grounded QA": 0.09166666666666666, + "T4. Summarization & Synthesis": 0.3927474127585449, + "T5. Attribution & Citation Alignment": 0.04019875024260989, + "T6. Aggregation & Clustering": 0.1463177717344383, + "T7. Consistency & Compliance Checking": 0.051579081014564904, + "T8. Structured & Numeric Reasoning": 0.05324074074074074, + "T9. Version & Code Diff Analysis": 0.07976489975059771, + "T10. Rule Induction & In-Context Learning": 0.11930555555555555, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.05 + }, + "language": { + "Chinese": 0.11950758971285085, + "English": 0.15748417279289106 + } + }, + "pass@1": 0.03133333333333333, + "BoN-2": { + "overall_metric": 0.20014764756254877, + "token_length": { + "8k": 0.28346565871948276, + "16k": 0.2485354487812065, + "32k": 0.1891519780196866, + "64k": 0.1488424509609456, + "128k": 0.16144012015437917, + "256k": 0.1694502287395916 + }, + "contextual_requirement": { + "Full": 0.19478513206751483, + "Partial": 0.20697266728350117 + }, + "difficulty": { + "Easy": 0.25273031996725753, + "Moderate": 0.13899098269897925, + "Hard": 0.1936738222234887, + "Extreme": 0.18696665567502654 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.39483140448121057, + "T2. Sequencing & Structure Reconstruction": 0.4611114324443421, + "T3. Evidence-Grounded QA": 0.18333333333333332, + "T4. Summarization & Synthesis": 0.42108537584236744, + "T5. Attribution & Citation Alignment": 0.06676507134840469, + "T6. Aggregation & Clustering": 0.1877574508713916, + "T7. Consistency & Compliance Checking": 0.08623797150678873, + "T8. Structured & Numeric Reasoning": 0.08194444444444446, + "T9. Version & Code Diff Analysis": 0.16317028795937463, + "T10. Rule Induction & In-Context Learning": 0.17763888888888887, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.1 + }, + "language": { + "Chinese": 0.17263611058145747, + "English": 0.22765918454364006 + } + }, + "pass@2": 0.058666666666666666, + "BoN-3": { + "overall_metric": 0.24816325726475538, + "token_length": { + "8k": 0.34035307822346317, + "16k": 0.3192701911329572, + "32k": 0.2250254030817654, + "64k": 0.2025517709430857, + "128k": 0.1946876666118698, + "256k": 0.20709143359539076 + }, + "contextual_requirement": { + "Full": 0.24205586173316382, + "Partial": 0.25593630612314483 + }, + "difficulty": { + "Easy": 0.32112947294679217, + "Moderate": 0.17708357883824535, + "Hard": 0.24279933271248433, + "Extreme": 0.21844163295140512 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.4707134866823802, + "T2. Sequencing & Structure Reconstruction": 0.5820753654025713, + "T3. Evidence-Grounded QA": 0.2833333333333333, + "T4. Summarization & Synthesis": 0.43262446200126664, + "T5. Attribution & Citation Alignment": 0.08657988616321947, + "T6. Aggregation & Clustering": 0.24002098313209438, + "T7. Consistency & Compliance Checking": 0.09483216743431798, + "T8. Structured & Numeric Reasoning": 0.11666666666666667, + "T9. Version & Code Diff Analysis": 0.20318445637705182, + "T10. Rule Induction & In-Context Learning": 0.23291666666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.13333333333333333 + }, + "language": { + "Chinese": 0.22445802277836707, + "English": 0.27186849175114364 + } + }, + "pass@3": 0.08 +} \ No newline at end of file diff --git a/results/Mistral-Large-Instruct-2411/nonthinking_context-120000_bon-3_summary.json b/results/Mistral-Large-Instruct-2411/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6e3720adfc00c7a910f75ab8638257196e8fafb5 --- /dev/null +++ b/results/Mistral-Large-Instruct-2411/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3169030003205045, + "inference_iteration_1_overall_metric": 0.31907671414212213, + "inference_iteration_2_overall_metric": 0.31793129606674814, + "inference_iteration_3_overall_metric": 0.3137009907526448, + "average_token_length_metric": { + "8k": 0.4275967733792583, + "16k": 0.3967521615717436, + "32k": 0.3684760987014902, + "64k": 0.2785203784059932, + "128k": 0.230059702582881, + "256k": 0.2000128872816638 + }, + "average_contextual_requirement_metric": { + "Full": 0.296348339160902, + "Partial": 0.3430634781600003 + }, + "average_difficulty_metric": { + "Easy": 0.4165851756517525, + "Moderate": 0.23424308281587738, + "Hard": 0.29878920863003516, + "Extreme": 0.27389571270169133 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.5986650899154622, + "T2. Sequencing & Structure Reconstruction": 0.595066968754912, + "T3. Evidence-Grounded QA": 0.3194444444444444, + "T4. Summarization & Synthesis": 0.5064060211674865, + "T5. Attribution & Citation Alignment": 0.2495290285755284, + "T6. Aggregation & Clustering": 0.33225024606080356, + "T7. Consistency & Compliance Checking": 0.15459568382089198, + "T8. Structured & Numeric Reasoning": 0.07623456790123456, + "T9. Version & Code Diff Analysis": 0.307786685955565, + "T10. Rule Induction & In-Context Learning": 0.32310185185185186, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666665 + }, + "average_language_metric": { + "Chinese": 0.3028142034775456, + "English": 0.3309917971634648 + }, + "BoN-1": { + "overall_metric": 0.31907671414212213, + "token_length": { + "8k": 0.42705539380437574, + "16k": 0.40444023819384445, + "32k": 0.3490132720940125, + "64k": 0.2906900596810724, + "128k": 0.2317028482688344, + "256k": 0.21155847281059137 + }, + "contextual_requirement": { + "Full": 0.29415523855405945, + "Partial": 0.3507949557996556 + }, + "difficulty": { + "Easy": 0.4170748170246479, + "Moderate": 0.23410587578788378, + "Hard": 0.2992371396825697, + "Extreme": 0.2805656316735027 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6048650518041011, + "T2. Sequencing & Structure Reconstruction": 0.58605531755393, + "T3. Evidence-Grounded QA": 0.3333333333333333, + "T4. Summarization & Synthesis": 0.5119167501762161, + "T5. Attribution & Citation Alignment": 0.2473640440223312, + "T6. Aggregation & Clustering": 0.3368335027354634, + "T7. Consistency & Compliance Checking": 0.16052542025800243, + "T8. Structured & Numeric Reasoning": 0.06111111111111111, + "T9. Version & Code Diff Analysis": 0.3099971565075231, + "T10. Rule Induction & In-Context Learning": 0.3488888888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334 + }, + "language": { + "Chinese": 0.29912831688952574, + "English": 0.3390251113947178 + } + }, + "pass@1": 0.11466666666666667, + "BoN-2": { + "overall_metric": 0.38594527565749853, + "token_length": { + "8k": 0.5042831481575721, + "16k": 0.475227330555721, + "32k": 0.43467889359878076, + "64k": 0.3349905766445296, + "128k": 0.31010981403738264, + "256k": 0.2563818909510073 + }, + "contextual_requirement": { + "Full": 0.3530504764849349, + "Partial": 0.4278113836953078 + }, + "difficulty": { + "Easy": 0.5179857542164504, + "Moderate": 0.29460590189658764, + "Hard": 0.35300286039674555, + "Extreme": 0.3229314719344226 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6679073055509605, + "T2. Sequencing & Structure Reconstruction": 0.6741524216524215, + "T3. Evidence-Grounded QA": 0.4166666666666667, + "T4. Summarization & Synthesis": 0.5302045683446602, + "T5. Attribution & Citation Alignment": 0.3062092936556243, + "T6. Aggregation & Clustering": 0.4082451589804528, + "T7. Consistency & Compliance Checking": 0.2092540186651841, + "T8. Structured & Numeric Reasoning": 0.11203703703703705, + "T9. Version & Code Diff Analysis": 0.36639914560216913, + "T10. Rule Induction & In-Context Learning": 0.4518055555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665 + }, + "language": { + "Chinese": 0.3686971114406643, + "English": 0.4031934398743337 + } + }, + "pass@2": 0.154, + "BoN-3": { + "overall_metric": 0.421953191488323, + "token_length": { + "8k": 0.5418256257423728, + "16k": 0.514596656363469, + "32k": 0.47852902965649924, + "64k": 0.37391520136163825, + "128k": 0.34070102557165205, + "256k": 0.2821516102343087 + }, + "contextual_requirement": { + "Full": 0.3875133170087601, + "Partial": 0.4657857590077671 + }, + "difficulty": { + "Easy": 0.5664227594780717, + "Moderate": 0.3257686393502546, + "Hard": 0.38999210046214894, + "Extreme": 0.34786173490515865 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6882095518311481, + "T2. Sequencing & Structure Reconstruction": 0.7038069338069337, + "T3. Evidence-Grounded QA": 0.4666666666666667, + "T4. Summarization & Synthesis": 0.5370106443627681, + "T5. Attribution & Citation Alignment": 0.3586021289313467, + "T6. Aggregation & Clustering": 0.45934845005229746, + "T7. Consistency & Compliance Checking": 0.22554787508833687, + "T8. Structured & Numeric Reasoning": 0.14537037037037037, + "T9. Version & Code Diff Analysis": 0.42569114696089216, + "T10. Rule Induction & In-Context Learning": 0.5073611111111112, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667 + }, + "language": { + "Chinese": 0.40247669761625265, + "English": 0.44142968536039356 + } + }, + "pass@3": 0.17933333333333334 +} \ No newline at end of file diff --git a/results/Mistral-Large-Instruct-2411/thinking_context-120000_bon-3_summary.json b/results/Mistral-Large-Instruct-2411/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d64abb01910ad5356ee22fc1003223b07633ce25 --- /dev/null +++ b/results/Mistral-Large-Instruct-2411/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3624706912806339, + "inference_iteration_1_overall_metric": 0.3518340119783078, + "inference_iteration_2_overall_metric": 0.36714050613217963, + "inference_iteration_3_overall_metric": 0.36843755573141435, + "average_token_length_metric": { + "8k": 0.519368817474254, + "16k": 0.46878642904173495, + "32k": 0.4367602706000035, + "64k": 0.2820635855060585, + "128k": 0.2391844493104579, + "256k": 0.22866059575129521 + }, + "average_contextual_requirement_metric": { + "Full": 0.3358652015644663, + "Partial": 0.39633222364666526 + }, + "average_difficulty_metric": { + "Easy": 0.536500103394983, + "Moderate": 0.2561851997238422, + "Hard": 0.29421165901910773, + "Extreme": 0.28647339751785583 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6270246927945843, + "T2. Sequencing & Structure Reconstruction": 0.5930747647414312, + "T3. Evidence-Grounded QA": 0.35000000000000014, + "T4. Summarization & Synthesis": 0.4891097262506477, + "T5. Attribution & Citation Alignment": 0.27219561539555, + "T6. Aggregation & Clustering": 0.3929461864831539, + "T7. Consistency & Compliance Checking": 0.17406259434511873, + "T8. Structured & Numeric Reasoning": 0.2887345679012346, + "T9. Version & Code Diff Analysis": 0.3055397446573754, + "T10. Rule Induction & In-Context Learning": 0.37699074074074074, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333325 + }, + "average_language_metric": { + "Chinese": 0.3335769489143806, + "English": 0.3913644336468867 + }, + "BoN-1": { + "overall_metric": 0.3518340119783078, + "token_length": { + "8k": 0.4978924620415691, + "16k": 0.4660407575784769, + "32k": 0.44439300101192125, + "64k": 0.2504015769837324, + "128k": 0.22569830445064326, + "256k": 0.22657796980350628 + }, + "contextual_requirement": { + "Full": 0.3296943530149145, + "Partial": 0.3800117597499008 + }, + "difficulty": { + "Easy": 0.5117779578352326, + "Moderate": 0.25778003157579815, + "Hard": 0.27490568840940766, + "Extreme": 0.28892697548899077 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.640262949096431, + "T2. Sequencing & Structure Reconstruction": 0.5669311244311241, + "T3. Evidence-Grounded QA": 0.3416666666666667, + "T4. Summarization & Synthesis": 0.4918201268608611, + "T5. Attribution & Citation Alignment": 0.23208406359722156, + "T6. Aggregation & Clustering": 0.36478858378368206, + "T7. Consistency & Compliance Checking": 0.1661247455189573, + "T8. Structured & Numeric Reasoning": 0.2925925925925926, + "T9. Version & Code Diff Analysis": 0.32795689178925624, + "T10. Rule Induction & In-Context Learning": 0.3702777777777778, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668 + }, + "language": { + "Chinese": 0.3268255224599945, + "English": 0.3768425014966223 + } + }, + "pass@1": 0.15066666666666667, + "BoN-2": { + "overall_metric": 0.43679829295529543, + "token_length": { + "8k": 0.5911437236371854, + "16k": 0.5703298590207265, + "32k": 0.5326982597861109, + "64k": 0.346664895131139, + "128k": 0.3012300084386134, + "256k": 0.2787230117179987 + }, + "contextual_requirement": { + "Full": 0.407647673394358, + "Partial": 0.47389908148739823 + }, + "difficulty": { + "Easy": 0.6480374309185871, + "Moderate": 0.3165832098159298, + "Hard": 0.36022054379586954, + "Extreme": 0.3346525294008514 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7044693578616591, + "T2. Sequencing & Structure Reconstruction": 0.6615363340363338, + "T3. Evidence-Grounded QA": 0.45, + "T4. Summarization & Synthesis": 0.5100444551229819, + "T5. Attribution & Citation Alignment": 0.3337101477566481, + "T6. Aggregation & Clustering": 0.47858076006885547, + "T7. Consistency & Compliance Checking": 0.23101969311309353, + "T8. Structured & Numeric Reasoning": 0.3634259259259259, + "T9. Version & Code Diff Analysis": 0.3826232429462048, + "T10. Rule Induction & In-Context Learning": 0.46638888888888885, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667 + }, + "language": { + "Chinese": 0.40374567113223503, + "English": 0.4698509147783562 + } + }, + "pass@2": 0.206, + "BoN-3": { + "overall_metric": 0.48020123308951057, + "token_length": { + "8k": 0.6542419600875183, + "16k": 0.6183938644898413, + "32k": 0.571965197774906, + "64k": 0.3866173301264082, + "128k": 0.33388551488861024, + "256k": 0.3161035311697812 + }, + "contextual_requirement": { + "Full": 0.4442943380691916, + "Partial": 0.5259009176608265 + }, + "difficulty": { + "Easy": 0.7130100233942409, + "Moderate": 0.35462311357794873, + "Hard": 0.39367151583097126, + "Extreme": 0.3644860692251465 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7280998354477696, + "T2. Sequencing & Structure Reconstruction": 0.7144397731897729, + "T3. Evidence-Grounded QA": 0.5083333333333333, + "T4. Summarization & Synthesis": 0.5171556422011707, + "T5. Attribution & Citation Alignment": 0.37747723607793277, + "T6. Aggregation & Clustering": 0.5266240833565394, + "T7. Consistency & Compliance Checking": 0.25904571820731975, + "T8. Structured & Numeric Reasoning": 0.41620370370370363, + "T9. Version & Code Diff Analysis": 0.42031044657867184, + "T10. Rule Induction & In-Context Learning": 0.5422222222222222, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.39166666666666666 + }, + "language": { + "Chinese": 0.4469064954573681, + "English": 0.5134959707216539 + } + }, + "pass@3": 0.23866666666666667 +} \ No newline at end of file diff --git a/results/Mistral-Small-3.2-24B-Instruct-2506/nonthinking_context-120000_bon-3_summary.json b/results/Mistral-Small-3.2-24B-Instruct-2506/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..54ba21c89251ed84dca9b107b657f75048706b66 --- /dev/null +++ b/results/Mistral-Small-3.2-24B-Instruct-2506/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3731986116770262, + "inference_iteration_1_overall_metric": 0.3764430164692507, + "inference_iteration_2_overall_metric": 0.37295344128487523, + "inference_iteration_3_overall_metric": 0.37019937727695235, + "average_token_length_metric": { + "8k": 0.47652440065044005, + "16k": 0.449888659714999, + "32k": 0.4078506319915549, + "64k": 0.3156364049708699, + "128k": 0.3188001493625175, + "256k": 0.2704914233717748 + }, + "average_contextual_requirement_metric": { + "Full": 0.3426671160616572, + "Partial": 0.4120568788238591 + }, + "average_difficulty_metric": { + "Easy": 0.5045346001029442, + "Moderate": 0.274199216283861, + "Hard": 0.33731265993779014, + "Extreme": 0.3179211546315068 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6709523819471523, + "T2. Sequencing & Structure Reconstruction": 0.6500401579022548, + "T3. Evidence-Grounded QA": 0.4277777777777778, + "T4. Summarization & Synthesis": 0.5256573469543858, + "T5. Attribution & Citation Alignment": 0.3048343833139541, + "T6. Aggregation & Clustering": 0.3439710542602968, + "T7. Consistency & Compliance Checking": 0.18369196767650903, + "T8. Structured & Numeric Reasoning": 0.14243827160493827, + "T9. Version & Code Diff Analysis": 0.392976065162092, + "T10. Rule Induction & In-Context Learning": 0.43203703703703705, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2555555555555555 + }, + "average_language_metric": { + "Chinese": 0.3541108289818725, + "English": 0.3922863943721798 + }, + "BoN-1": { + "overall_metric": 0.3764430164692507, + "token_length": { + "8k": 0.47293579301173544, + "16k": 0.4529323453367711, + "32k": 0.4036888939118912, + "64k": 0.3200427598731278, + "128k": 0.32499778534152896, + "256k": 0.2840605213404498 + }, + "contextual_requirement": { + "Full": 0.3477950533618385, + "Partial": 0.41290406042413935 + }, + "difficulty": { + "Easy": 0.5103779259997739, + "Moderate": 0.27686100242496664, + "Hard": 0.3386830547019045, + "Extreme": 0.31993211786890735 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6798326459221492, + "T2. Sequencing & Structure Reconstruction": 0.6539941668562641, + "T3. Evidence-Grounded QA": 0.425, + "T4. Summarization & Synthesis": 0.5249912472397621, + "T5. Attribution & Citation Alignment": 0.3021910633281516, + "T6. Aggregation & Clustering": 0.34743215354780066, + "T7. Consistency & Compliance Checking": 0.18127201803220644, + "T8. Structured & Numeric Reasoning": 0.1486111111111111, + "T9. Version & Code Diff Analysis": 0.39855565848262914, + "T10. Rule Induction & In-Context Learning": 0.4466666666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336 + }, + "language": { + "Chinese": 0.35803273408066133, + "English": 0.3948532988578403 + } + }, + "pass@1": 0.14533333333333334, + "BoN-2": { + "overall_metric": 0.39540993027593374, + "token_length": { + "8k": 0.5041800135019555, + "16k": 0.476249230788589, + "32k": 0.4259536786274836, + "64k": 0.33796643925845926, + "128k": 0.338211260649731, + "256k": 0.28989895882938543 + }, + "contextual_requirement": { + "Full": 0.36338165608597955, + "Partial": 0.4361731883358756 + }, + "difficulty": { + "Easy": 0.5278149129484206, + "Moderate": 0.30819954499388696, + "Hard": 0.35088879158138625, + "Extreme": 0.3368899502329733 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6887146593492802, + "T2. Sequencing & Structure Reconstruction": 0.6648539552160522, + "T3. Evidence-Grounded QA": 0.4583333333333333, + "T4. Summarization & Synthesis": 0.5373243786993875, + "T5. Attribution & Citation Alignment": 0.34748354292868555, + "T6. Aggregation & Clustering": 0.37668161380086723, + "T7. Consistency & Compliance Checking": 0.19122702647405698, + "T8. Structured & Numeric Reasoning": 0.1597222222222222, + "T9. Version & Code Diff Analysis": 0.4286346318433818, + "T10. Rule Induction & In-Context Learning": 0.45916666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.26666666666666666 + }, + "language": { + "Chinese": 0.37724327797419344, + "English": 0.41357658257767443 + } + }, + "pass@2": 0.16066666666666668, + "BoN-3": { + "overall_metric": 0.406738281428378, + "token_length": { + "8k": 0.5137058259549282, + "16k": 0.4815649687345498, + "32k": 0.44638935934429075, + "64k": 0.3486156261221766, + "128k": 0.34787432869425106, + "256k": 0.30227957972006925 + }, + "contextual_requirement": { + "Full": 0.37465484620173795, + "Partial": 0.44757174444410075 + }, + "difficulty": { + "Easy": 0.5448834015964552, + "Moderate": 0.31056675024038943, + "Hard": 0.3651326226193961, + "Extreme": 0.34590114503772057 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6949591749410688, + "T2. Sequencing & Structure Reconstruction": 0.6751648017768989, + "T3. Evidence-Grounded QA": 0.475, + "T4. Summarization & Synthesis": 0.5442519483294492, + "T5. Attribution & Citation Alignment": 0.35765608366828805, + "T6. Aggregation & Clustering": 0.3948528763892126, + "T7. Consistency & Compliance Checking": 0.19845858401758282, + "T8. Structured & Numeric Reasoning": 0.17083333333333334, + "T9. Version & Code Diff Analysis": 0.43181265186215534, + "T10. Rule Induction & In-Context Learning": 0.4675, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667 + }, + "language": { + "Chinese": 0.3869496550877674, + "English": 0.4265269077689878 + } + }, + "pass@3": 0.168 +} \ No newline at end of file diff --git a/results/Mistral-Small-3.2-24B-Instruct-2506/thinking_context-120000_bon-3_summary.json b/results/Mistral-Small-3.2-24B-Instruct-2506/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..651a3d9332182d499c82750556491bf2214d27b7 --- /dev/null +++ b/results/Mistral-Small-3.2-24B-Instruct-2506/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3987037370686582, + "inference_iteration_1_overall_metric": 0.4005466639339314, + "inference_iteration_2_overall_metric": 0.39333333377476243, + "inference_iteration_3_overall_metric": 0.40223121349728147, + "average_token_length_metric": { + "8k": 0.5424778112458041, + "16k": 0.48688051040801983, + "32k": 0.4403148811203121, + "64k": 0.3401436546897792, + "128k": 0.30659117482257736, + "256k": 0.27581439012545783 + }, + "average_contextual_requirement_metric": { + "Full": 0.3659155932981649, + "Partial": 0.44043410186746873 + }, + "average_difficulty_metric": { + "Easy": 0.6121824426461749, + "Moderate": 0.27744531940705713, + "Hard": 0.317638215137331, + "Extreme": 0.29773756082773445 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6558837112656615, + "T2. Sequencing & Structure Reconstruction": 0.6233137078970414, + "T3. Evidence-Grounded QA": 0.3972222222222222, + "T4. Summarization & Synthesis": 0.4982728933846063, + "T5. Attribution & Citation Alignment": 0.3406895449155636, + "T6. Aggregation & Clustering": 0.3784497613515588, + "T7. Consistency & Compliance Checking": 0.1850253553759489, + "T8. Structured & Numeric Reasoning": 0.3645061728395062, + "T9. Version & Code Diff Analysis": 0.4149149215448353, + "T10. Rule Induction & In-Context Learning": 0.44208333333333344, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.21944444444444447 + }, + "average_language_metric": { + "Chinese": 0.37181071338153865, + "English": 0.42559676075577846 + }, + "BoN-1": { + "overall_metric": 0.4005466639339314, + "token_length": { + "8k": 0.549936123733468, + "16k": 0.5059723107057712, + "32k": 0.436500147965248, + "64k": 0.3263663588238478, + "128k": 0.3118538606332236, + "256k": 0.27265118174202957 + }, + "contextual_requirement": { + "Full": 0.3706804581354897, + "Partial": 0.43855819858649375 + }, + "difficulty": { + "Easy": 0.6146432894264784, + "Moderate": 0.27885587375626963, + "Hard": 0.3246283820077309, + "Extreme": 0.29580677382167836 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6418078252086361, + "T2. Sequencing & Structure Reconstruction": 0.6254830724830722, + "T3. Evidence-Grounded QA": 0.4083333333333333, + "T4. Summarization & Synthesis": 0.4926333263044629, + "T5. Attribution & Citation Alignment": 0.34851784289183024, + "T6. Aggregation & Clustering": 0.38872032735842527, + "T7. Consistency & Compliance Checking": 0.19546994720957953, + "T8. Structured & Numeric Reasoning": 0.3689814814814814, + "T9. Version & Code Diff Analysis": 0.41307804265635545, + "T10. Rule Induction & In-Context Learning": 0.4305555555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666667 + }, + "language": { + "Chinese": 0.372583682740498, + "English": 0.4285096451273649 + } + }, + "pass@1": 0.17933333333333334, + "BoN-2": { + "overall_metric": 0.44657411540443975, + "token_length": { + "8k": 0.5764947600450254, + "16k": 0.5474122233968183, + "32k": 0.5034002505230384, + "64k": 0.3860776417526754, + "128k": 0.36789187708581184, + "256k": 0.29816793962327126 + }, + "contextual_requirement": { + "Full": 0.4116834851076065, + "Partial": 0.49098037214586493 + }, + "difficulty": { + "Easy": 0.6920753978961921, + "Moderate": 0.310198508060507, + "Hard": 0.35436801382293803, + "Extreme": 0.32777524203722713 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6903770310691527, + "T2. Sequencing & Structure Reconstruction": 0.6839748214748216, + "T3. Evidence-Grounded QA": 0.4666666666666667, + "T4. Summarization & Synthesis": 0.5134804778640072, + "T5. Attribution & Citation Alignment": 0.38386685118201486, + "T6. Aggregation & Clustering": 0.4380455926805801, + "T7. Consistency & Compliance Checking": 0.21770656422693052, + "T8. Structured & Numeric Reasoning": 0.4087962962962963, + "T9. Version & Code Diff Analysis": 0.4801823589375706, + "T10. Rule Induction & In-Context Learning": 0.5001388888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.26666666666666666 + }, + "language": { + "Chinese": 0.420870958024232, + "English": 0.47227727278464793 + } + }, + "pass@2": 0.22, + "BoN-3": { + "overall_metric": 0.4796780923764449, + "token_length": { + "8k": 0.6046783631215611, + "16k": 0.5809303286628532, + "32k": 0.5430608854898975, + "64k": 0.4279434742821682, + "128k": 0.3893995220031886, + "256k": 0.3320559806990021 + }, + "contextual_requirement": { + "Full": 0.43765149013675825, + "Partial": 0.5331664952269561 + }, + "difficulty": { + "Easy": 0.7316590781086564, + "Moderate": 0.35837276698741044, + "Hard": 0.3776680447598276, + "Extreme": 0.35032197345759886 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7260943331172737, + "T2. Sequencing & Structure Reconstruction": 0.7150282125282126, + "T3. Evidence-Grounded QA": 0.5333333333333333, + "T4. Summarization & Synthesis": 0.5186475702115565, + "T5. Attribution & Citation Alignment": 0.4078869630146266, + "T6. Aggregation & Clustering": 0.4834113173877446, + "T7. Consistency & Compliance Checking": 0.24236854182796289, + "T8. Structured & Numeric Reasoning": 0.42546296296296293, + "T9. Version & Code Diff Analysis": 0.5163159536769999, + "T10. Rule Induction & In-Context Learning": 0.5601388888888891, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667 + }, + "language": { + "Chinese": 0.4547335554957914, + "English": 0.5046226292570991 + } + }, + "pass@3": 0.24533333333333332 +} \ No newline at end of file diff --git a/results/Qwen2.5-72B-Instruct/nonthinking_context-120000_bon-3_summary.json b/results/Qwen2.5-72B-Instruct/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..4011179399d5c41234deecb520b50ecee044f395 --- /dev/null +++ b/results/Qwen2.5-72B-Instruct/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.39637256479971833, + "inference_iteration_1_overall_metric": 0.3937967514186296, + "inference_iteration_2_overall_metric": 0.3964337111478629, + "inference_iteration_3_overall_metric": 0.3988872318326605, + "average_token_length_metric": { + "8k": 0.47921715442818713, + "16k": 0.3991398791513683, + "32k": 0.44041622927702145, + "64k": 0.36697110801414906, + "128k": 0.37859902889300584, + "256k": 0.3138919890345743 + }, + "average_contextual_requirement_metric": { + "Full": 0.38099477047000957, + "Partial": 0.41594430303752816 + }, + "average_difficulty_metric": { + "Easy": 0.534784488551614, + "Moderate": 0.31363336895476984, + "Hard": 0.35900431211097755, + "Extreme": 0.323637349749313 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6846177524490639, + "T2. Sequencing & Structure Reconstruction": 0.657116734090758, + "T3. Evidence-Grounded QA": 0.4638888888888889, + "T4. Summarization & Synthesis": 0.5223335075531164, + "T5. Attribution & Citation Alignment": 0.35501048685955816, + "T6. Aggregation & Clustering": 0.3527926553465598, + "T7. Consistency & Compliance Checking": 0.22711088737402554, + "T8. Structured & Numeric Reasoning": 0.1410493827160494, + "T9. Version & Code Diff Analysis": 0.3933158575556893, + "T10. Rule Induction & In-Context Learning": 0.5025, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2944444444444444 + }, + "average_language_metric": { + "Chinese": 0.39791297636758555, + "English": 0.39483215323184995 + }, + "BoN-1": { + "overall_metric": 0.3937967514186296, + "token_length": { + "8k": 0.48150911063795604, + "16k": 0.3955134086911857, + "32k": 0.44051865480915253, + "64k": 0.35605224184998097, + "128k": 0.3735426660157483, + "256k": 0.31564442650775115 + }, + "contextual_requirement": { + "Full": 0.3790246648535571, + "Partial": 0.4125975888650856 + }, + "difficulty": { + "Easy": 0.5289899613446293, + "Moderate": 0.31455221378896076, + "Hard": 0.35328095705198703, + "Extreme": 0.3243594986063606 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6838538961825568, + "T2. Sequencing & Structure Reconstruction": 0.6576533446275381, + "T3. Evidence-Grounded QA": 0.4666666666666667, + "T4. Summarization & Synthesis": 0.5233466170189328, + "T5. Attribution & Citation Alignment": 0.34332894764623034, + "T6. Aggregation & Clustering": 0.35214311990772595, + "T7. Consistency & Compliance Checking": 0.22465424456172434, + "T8. Structured & Numeric Reasoning": 0.14120370370370372, + "T9. Version & Code Diff Analysis": 0.3972749849978754, + "T10. Rule Induction & In-Context Learning": 0.49000000000000005, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2833333333333333 + }, + "language": { + "Chinese": 0.39463375393747463, + "English": 0.3929597488997841 + } + }, + "pass@1": 0.158, + "BoN-2": { + "overall_metric": 0.4266968766552369, + "token_length": { + "8k": 0.5081210460935147, + "16k": 0.433884061206856, + "32k": 0.4739562311618829, + "64k": 0.3969295417300637, + "128k": 0.4063722243032158, + "256k": 0.3409181554358875 + }, + "contextual_requirement": { + "Full": 0.40567420103364404, + "Partial": 0.45345300926453685 + }, + "difficulty": { + "Easy": 0.5611962030845826, + "Moderate": 0.34459684961888387, + "Hard": 0.3903222118897577, + "Extreme": 0.35717530981838436 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7162339766047533, + "T2. Sequencing & Structure Reconstruction": 0.6883400190642125, + "T3. Evidence-Grounded QA": 0.5, + "T4. Summarization & Synthesis": 0.538190912835796, + "T5. Attribution & Citation Alignment": 0.40236792686670547, + "T6. Aggregation & Clustering": 0.3971289905185964, + "T7. Consistency & Compliance Checking": 0.2560060835156213, + "T8. Structured & Numeric Reasoning": 0.1550925925925926, + "T9. Version & Code Diff Analysis": 0.44331995621210946, + "T10. Rule Induction & In-Context Learning": 0.51625, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665 + }, + "language": { + "Chinese": 0.4315697660767929, + "English": 0.42182398723368103 + } + }, + "pass@2": 0.17066666666666666, + "BoN-3": { + "overall_metric": 0.44674650669663196, + "token_length": { + "8k": 0.5262573822920757, + "16k": 0.4582783757995498, + "32k": 0.49570076375833805, + "64k": 0.4144687055290633, + "128k": 0.4299894189350878, + "256k": 0.3557843938656765 + }, + "contextual_requirement": { + "Full": 0.4233162526040225, + "Partial": 0.4765668300872253 + }, + "difficulty": { + "Easy": 0.5769237639050746, + "Moderate": 0.3742638517279143, + "Hard": 0.4173834394346273, + "Extreme": 0.37103736067438053 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7291777705678423, + "T2. Sequencing & Structure Reconstruction": 0.7155669062905904, + "T3. Evidence-Grounded QA": 0.5166666666666667, + "T4. Summarization & Synthesis": 0.5442979937116319, + "T5. Attribution & Citation Alignment": 0.42622684636162084, + "T6. Aggregation & Clustering": 0.4288727477882729, + "T7. Consistency & Compliance Checking": 0.27916215760547175, + "T8. Structured & Numeric Reasoning": 0.16203703703703703, + "T9. Version & Code Diff Analysis": 0.4872872364633724, + "T10. Rule Induction & In-Context Learning": 0.535, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.325 + }, + "language": { + "Chinese": 0.45016861158798044, + "English": 0.44332440180528293 + } + }, + "pass@3": 0.18466666666666667 +} \ No newline at end of file diff --git a/results/Qwen2.5-72B-Instruct/thinking_context-120000_bon-3_summary.json b/results/Qwen2.5-72B-Instruct/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..06866dcb22e077c418c27a0dbd121a1e1b05b7f7 --- /dev/null +++ b/results/Qwen2.5-72B-Instruct/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.4408902584645425, + "inference_iteration_1_overall_metric": 0.44376702120969425, + "inference_iteration_2_overall_metric": 0.44036154482238754, + "inference_iteration_3_overall_metric": 0.43854220936154753, + "average_token_length_metric": { + "8k": 0.5285612004903454, + "16k": 0.47472561982724465, + "32k": 0.46043302796304997, + "64k": 0.4189652464720481, + "128k": 0.3917337204907261, + "256k": 0.37092273554384453 + }, + "average_contextual_requirement_metric": { + "Full": 0.41330115115036786, + "Partial": 0.47600366777349373 + }, + "average_difficulty_metric": { + "Easy": 0.6779648379508855, + "Moderate": 0.31025727820561944, + "Hard": 0.36454303136241756, + "Extreme": 0.3171341962257389 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6877440761283994, + "T2. Sequencing & Structure Reconstruction": 0.6590772291315768, + "T3. Evidence-Grounded QA": 0.45555555555555544, + "T4. Summarization & Synthesis": 0.5129812732828513, + "T5. Attribution & Citation Alignment": 0.34764194837967977, + "T6. Aggregation & Clustering": 0.4267544160945399, + "T7. Consistency & Compliance Checking": 0.24278940543780603, + "T8. Structured & Numeric Reasoning": 0.4026234567901234, + "T9. Version & Code Diff Analysis": 0.49529389751168806, + "T10. Rule Induction & In-Context Learning": 0.43902777777777774, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.30555555555555564 + }, + "average_language_metric": { + "Chinese": 0.4318377484964527, + "English": 0.44994276843263364 + }, + "BoN-1": { + "overall_metric": 0.44376702120969425, + "token_length": { + "8k": 0.5322737731717848, + "16k": 0.4756587605561328, + "32k": 0.4739240524772759, + "64k": 0.4301284112089289, + "128k": 0.3925013454757625, + "256k": 0.3581157843682803 + }, + "contextual_requirement": { + "Full": 0.4190008048268543, + "Partial": 0.4752876602424002 + }, + "difficulty": { + "Easy": 0.6809264760303465, + "Moderate": 0.3123838002275438, + "Hard": 0.38330482067932314, + "Extreme": 0.30997717937688596 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6951653783701364, + "T2. Sequencing & Structure Reconstruction": 0.6538150476737434, + "T3. Evidence-Grounded QA": 0.4666666666666667, + "T4. Summarization & Synthesis": 0.5148026353764281, + "T5. Attribution & Citation Alignment": 0.3465569726210779, + "T6. Aggregation & Clustering": 0.4294465777942789, + "T7. Consistency & Compliance Checking": 0.24971806893075102, + "T8. Structured & Numeric Reasoning": 0.4111111111111112, + "T9. Version & Code Diff Analysis": 0.5131396498811356, + "T10. Rule Induction & In-Context Learning": 0.4131944444444445, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.30833333333333335 + }, + "language": { + "Chinese": 0.4290108132095998, + "English": 0.4585232292097895 + } + }, + "pass@1": 0.20933333333333334, + "BoN-2": { + "overall_metric": 0.5111971513374702, + "token_length": { + "8k": 0.608869344107263, + "16k": 0.5580527030947412, + "32k": 0.5259126977020612, + "64k": 0.49288230504222735, + "128k": 0.46074971792635044, + "256k": 0.42071614015218295 + }, + "contextual_requirement": { + "Full": 0.48375250466177205, + "Partial": 0.5461267016519972 + }, + "difficulty": { + "Easy": 0.7817754799095641, + "Moderate": 0.3628317914878497, + "Hard": 0.43323803533283267, + "Extreme": 0.36344492213322593 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7590349530612852, + "T2. Sequencing & Structure Reconstruction": 0.7222824598911556, + "T3. Evidence-Grounded QA": 0.5333333333333333, + "T4. Summarization & Synthesis": 0.5277624952357224, + "T5. Attribution & Citation Alignment": 0.42967259054859913, + "T6. Aggregation & Clustering": 0.5148559581215818, + "T7. Consistency & Compliance Checking": 0.29132433156438, + "T8. Structured & Numeric Reasoning": 0.49027777777777776, + "T9. Version & Code Diff Analysis": 0.5752747917860145, + "T10. Rule Induction & In-Context Learning": 0.5229166666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.375 + }, + "language": { + "Chinese": 0.49799451206849277, + "English": 0.5243997906064495 + } + }, + "pass@2": 0.26466666666666666, + "BoN-3": { + "overall_metric": 0.5467736842178992, + "token_length": { + "8k": 0.6412013485454756, + "16k": 0.5997169544343945, + "32k": 0.5670917206414194, + "64k": 0.5191991662550954, + "128k": 0.4920959206351818, + "256k": 0.4613369947958361 + }, + "contextual_requirement": { + "Full": 0.5191971033346234, + "Partial": 0.5818711507966168 + }, + "difficulty": { + "Easy": 0.8347260400113382, + "Moderate": 0.3850286249254218, + "Hard": 0.46842929282635226, + "Extreme": 0.3890300836616431 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7891907338254459, + "T2. Sequencing & Structure Reconstruction": 0.7449808987308988, + "T3. Evidence-Grounded QA": 0.575, + "T4. Summarization & Synthesis": 0.533886440771335, + "T5. Attribution & Citation Alignment": 0.45243256205857046, + "T6. Aggregation & Clustering": 0.5676630345451723, + "T7. Consistency & Compliance Checking": 0.3275556384256395, + "T8. Structured & Numeric Reasoning": 0.5166666666666667, + "T9. Version & Code Diff Analysis": 0.6088524078812863, + "T10. Rule Induction & In-Context Learning": 0.5875, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.425 + }, + "language": { + "Chinese": 0.5347761594749554, + "English": 0.5587712089608458 + } + }, + "pass@3": 0.29933333333333334 +} \ No newline at end of file diff --git a/results/Qwen3-14B/nonthinking_context-120000_bon-3_summary.json b/results/Qwen3-14B/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..47747dd77eb13f5abb8245c9e776e27a6d41674f --- /dev/null +++ b/results/Qwen3-14B/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.371082468941152, + "inference_iteration_1_overall_metric": 0.3702373257200545, + "inference_iteration_2_overall_metric": 0.3726984925368399, + "inference_iteration_3_overall_metric": 0.37031158856656204, + "average_token_length_metric": { + "8k": 0.43147751060158823, + "16k": 0.4033401435644649, + "32k": 0.38388990895019404, + "64k": 0.35197621981023713, + "128k": 0.3411527865206871, + "256k": 0.3146582441997405 + }, + "average_contextual_requirement_metric": { + "Full": 0.34812497724125846, + "Partial": 0.4003010947410168 + }, + "average_difficulty_metric": { + "Easy": 0.4843882954180315, + "Moderate": 0.29072923835932707, + "Hard": 0.35344272393957227, + "Extreme": 0.31132465435421575 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6723638502162732, + "T2. Sequencing & Structure Reconstruction": 0.6447956689096103, + "T3. Evidence-Grounded QA": 0.4888888888888889, + "T4. Summarization & Synthesis": 0.5233374417827854, + "T5. Attribution & Citation Alignment": 0.28649007131147347, + "T6. Aggregation & Clustering": 0.36581903676825483, + "T7. Consistency & Compliance Checking": 0.2131249616899122, + "T8. Structured & Numeric Reasoning": 0.08441358024691357, + "T9. Version & Code Diff Analysis": 0.354238942968118, + "T10. Rule Induction & In-Context Learning": 0.43171296296296297, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667 + }, + "average_language_metric": { + "Chinese": 0.3760505297242663, + "English": 0.36611440815803775 + }, + "BoN-1": { + "overall_metric": 0.3702373257200545, + "token_length": { + "8k": 0.4365488155340593, + "16k": 0.40901874466031907, + "32k": 0.3764681078768914, + "64k": 0.3558737887299017, + "128k": 0.337849918010455, + "256k": 0.30566457950869874 + }, + "contextual_requirement": { + "Full": 0.34789877784316464, + "Partial": 0.39866820483609555 + }, + "difficulty": { + "Easy": 0.47550918682117166, + "Moderate": 0.2976353828419508, + "Hard": 0.3595591768355849, + "Extreme": 0.3096166654215201 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6693651539947573, + "T2. Sequencing & Structure Reconstruction": 0.643656093318885, + "T3. Evidence-Grounded QA": 0.49166666666666664, + "T4. Summarization & Synthesis": 0.5246008307406761, + "T5. Attribution & Citation Alignment": 0.28080851312128013, + "T6. Aggregation & Clustering": 0.3666330157824754, + "T7. Consistency & Compliance Checking": 0.219629388990068, + "T8. Structured & Numeric Reasoning": 0.07824074074074075, + "T9. Version & Code Diff Analysis": 0.3648645953884854, + "T10. Rule Induction & In-Context Learning": 0.41458333333333336, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667 + }, + "language": { + "Chinese": 0.37263158698072796, + "English": 0.36784306445938036 + } + }, + "pass@1": 0.136, + "BoN-2": { + "overall_metric": 0.39747790211532286, + "token_length": { + "8k": 0.4586397357227334, + "16k": 0.4346767556872062, + "32k": 0.40654133431254835, + "64k": 0.3846278376492738, + "128k": 0.366368675809838, + "256k": 0.334013073510339 + }, + "contextual_requirement": { + "Full": 0.3762242907679459, + "Partial": 0.4245279529210758 + }, + "difficulty": { + "Easy": 0.5137672201120023, + "Moderate": 0.3211245068584676, + "Hard": 0.37927074472552935, + "Extreme": 0.33219733038914645 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6959264095016384, + "T2. Sequencing & Structure Reconstruction": 0.6706657296788379, + "T3. Evidence-Grounded QA": 0.5333333333333333, + "T4. Summarization & Synthesis": 0.5372603569538371, + "T5. Attribution & Citation Alignment": 0.30622924870075346, + "T6. Aggregation & Clustering": 0.394716262565116, + "T7. Consistency & Compliance Checking": 0.23852214824749268, + "T8. Structured & Numeric Reasoning": 0.10046296296296296, + "T9. Version & Code Diff Analysis": 0.389089970943113, + "T10. Rule Induction & In-Context Learning": 0.46875, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.26666666666666666 + }, + "language": { + "Chinese": 0.39990507125254376, + "English": 0.39505073297810234 + } + }, + "pass@2": 0.152, + "BoN-3": { + "overall_metric": 0.4076945334257239, + "token_length": { + "8k": 0.47272207856160825, + "16k": 0.44496379279331655, + "32k": 0.4150202640804616, + "64k": 0.39561603549517055, + "128k": 0.3755358328446131, + "256k": 0.3423091967791733 + }, + "contextual_requirement": { + "Full": 0.38567226658065773, + "Partial": 0.435722873046717 + }, + "difficulty": { + "Easy": 0.5205174207555429, + "Moderate": 0.3404168914689614, + "Hard": 0.3867590355007207, + "Extreme": 0.34204218281858184 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7035836621732522, + "T2. Sequencing & Structure Reconstruction": 0.6789876392507479, + "T3. Evidence-Grounded QA": 0.5416666666666666, + "T4. Summarization & Synthesis": 0.5440147604159721, + "T5. Attribution & Citation Alignment": 0.3324233311147572, + "T6. Aggregation & Clustering": 0.41234554444538063, + "T7. Consistency & Compliance Checking": 0.24338937917661474, + "T8. Structured & Numeric Reasoning": 0.10046296296296298, + "T9. Version & Code Diff Analysis": 0.41745877832271355, + "T10. Rule Induction & In-Context Learning": 0.46874999999999994, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.275 + }, + "language": { + "Chinese": 0.41071052140088854, + "English": 0.4046785454505592 + } + }, + "pass@3": 0.15933333333333333 +} \ No newline at end of file diff --git a/results/Qwen3-14B/thinking_context-120000_bon-3_summary.json b/results/Qwen3-14B/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d38d8acb001e92d7cf2a4bd2d46ab6091fafc040 --- /dev/null +++ b/results/Qwen3-14B/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 19, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.47140163871696883, + "inference_iteration_1_overall_metric": 0.4702465037493788, + "inference_iteration_2_overall_metric": 0.4731184728103188, + "inference_iteration_3_overall_metric": 0.4708399395912094, + "average_token_length_metric": { + "8k": 0.5782430822549752, + "16k": 0.5324144227767025, + "32k": 0.497499382093641, + "64k": 0.4573465836705664, + "128k": 0.3964337558937265, + "256k": 0.36647260561220363 + }, + "average_contextual_requirement_metric": { + "Full": 0.434181327100322, + "Partial": 0.5187729444108845 + }, + "average_difficulty_metric": { + "Easy": 0.6954869771942226, + "Moderate": 0.39026206272645725, + "Hard": 0.3840987663314641, + "Extreme": 0.3365623987956807 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7452599144109371, + "T2. Sequencing & Structure Reconstruction": 0.690021191602685, + "T3. Evidence-Grounded QA": 0.46388888888888874, + "T4. Summarization & Synthesis": 0.5079917038055713, + "T5. Attribution & Citation Alignment": 0.4011786521228004, + "T6. Aggregation & Clustering": 0.456013454446653, + "T7. Consistency & Compliance Checking": 0.26448705908382575, + "T8. Structured & Numeric Reasoning": 0.4464506172839507, + "T9. Version & Code Diff Analysis": 0.5003275109836627, + "T10. Rule Induction & In-Context Learning": 0.4528703703703705, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3805555555555556 + }, + "average_language_metric": { + "Chinese": 0.43752943274977507, + "English": 0.5052738446841633 + }, + "BoN-1": { + "overall_metric": 0.4702465037493788, + "token_length": { + "8k": 0.5913655956747179, + "16k": 0.5095272462031484, + "32k": 0.5071120818375107, + "64k": 0.44784910454974686, + "128k": 0.41043482344109344, + "256k": 0.35519017079005766 + }, + "contextual_requirement": { + "Full": 0.43466831346143797, + "Partial": 0.5155278368431229 + }, + "difficulty": { + "Easy": 0.6903049055473668, + "Moderate": 0.4015483173366612, + "Hard": 0.38434152744866484, + "Extreme": 0.330728695471088 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7350489226932438, + "T2. Sequencing & Structure Reconstruction": 0.7006655082390375, + "T3. Evidence-Grounded QA": 0.43333333333333335, + "T4. Summarization & Synthesis": 0.5066399530046457, + "T5. Attribution & Citation Alignment": 0.4030762255317208, + "T6. Aggregation & Clustering": 0.44151556236290046, + "T7. Consistency & Compliance Checking": 0.2711344235218425, + "T8. Structured & Numeric Reasoning": 0.4435185185185186, + "T9. Version & Code Diff Analysis": 0.5056201530159209, + "T10. Rule Induction & In-Context Learning": 0.45111111111111113, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333 + }, + "language": { + "Chinese": 0.43645831615890296, + "English": 0.5040346913398556 + } + }, + "pass@1": 0.23333333333333334, + "BoN-2": { + "overall_metric": 0.5468849991087089, + "token_length": { + "8k": 0.6615753489640523, + "16k": 0.6155193707982429, + "32k": 0.5758535655924854, + "64k": 0.540724309445691, + "128k": 0.4626871186035022, + "256k": 0.4249502812482792 + }, + "contextual_requirement": { + "Full": 0.5080630736758954, + "Partial": 0.5962947223868355 + }, + "difficulty": { + "Easy": 0.7925721699736568, + "Moderate": 0.479093636665099, + "Hard": 0.4528494469332917, + "Extreme": 0.38403673085414924 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8163493863260063, + "T2. Sequencing & Structure Reconstruction": 0.7442577955813249, + "T3. Evidence-Grounded QA": 0.5833333333333334, + "T4. Summarization & Synthesis": 0.5201693233821205, + "T5. Attribution & Citation Alignment": 0.5067233304435315, + "T6. Aggregation & Clustering": 0.5291314359023842, + "T7. Consistency & Compliance Checking": 0.31988194373207235, + "T8. Structured & Numeric Reasoning": 0.5319444444444444, + "T9. Version & Code Diff Analysis": 0.5907370281186358, + "T10. Rule Induction & In-Context Learning": 0.5363888888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667 + }, + "language": { + "Chinese": 0.5179409589397771, + "English": 0.5758290392776407 + } + }, + "pass@2": 0.3006666666666667, + "BoN-3": { + "overall_metric": 0.5824444918225498, + "token_length": { + "8k": 0.6960404076989396, + "16k": 0.6595983514236444, + "32k": 0.6169744918377427, + "64k": 0.5731325003589102, + "128k": 0.48953898539680313, + "256k": 0.4593822142192594 + }, + "contextual_requirement": { + "Full": 0.5444884918996935, + "Partial": 0.6307521280880043 + }, + "difficulty": { + "Easy": 0.8395318293521158, + "Moderate": 0.5135257846150404, + "Hard": 0.4826106375194937, + "Extreme": 0.4116567499755111 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8331279763695215, + "T2. Sequencing & Structure Reconstruction": 0.7661840637575932, + "T3. Evidence-Grounded QA": 0.6416666666666667, + "T4. Summarization & Synthesis": 0.5304996896358936, + "T5. Attribution & Citation Alignment": 0.5410298862500872, + "T6. Aggregation & Clustering": 0.5653969713210634, + "T7. Consistency & Compliance Checking": 0.35530101198058683, + "T8. Structured & Numeric Reasoning": 0.5680555555555555, + "T9. Version & Code Diff Analysis": 0.6135286679274133, + "T10. Rule Induction & In-Context Learning": 0.6047222222222223, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5166666666666667 + }, + "language": { + "Chinese": 0.5531328379155088, + "English": 0.6117561457295911 + } + }, + "pass@3": 0.338 +} \ No newline at end of file diff --git a/results/Qwen3-235B-A22B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json b/results/Qwen3-235B-A22B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..343e9426996716d7c5daa5071aa88d27eb89b172 --- /dev/null +++ b/results/Qwen3-235B-A22B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5250792648341495, + "inference_iteration_1_overall_metric": 0.5304344950538739, + "inference_iteration_2_overall_metric": 0.5198766479638736, + "inference_iteration_3_overall_metric": 0.5249266514847013, + "average_token_length_metric": { + "8k": 0.5437004868383555, + "16k": 0.5685851303356437, + "32k": 0.5571646658594525, + "64k": 0.4927762749794244, + "128k": 0.5219020146975271, + "256k": 0.46634701629449443 + }, + "average_contextual_requirement_metric": { + "Full": 0.48053236591813503, + "Partial": 0.5817753179999874 + }, + "average_difficulty_metric": { + "Easy": 0.617560842340432, + "Moderate": 0.5314310614765573, + "Hard": 0.5233520312908267, + "Extreme": 0.4207324032576421 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.794390974707769, + "T2. Sequencing & Structure Reconstruction": 0.7891406587239914, + "T3. Evidence-Grounded QA": 0.5694444444444444, + "T4. Summarization & Synthesis": 0.548404897537323, + "T5. Attribution & Citation Alignment": 0.7090811316885252, + "T6. Aggregation & Clustering": 0.4833303018822092, + "T7. Consistency & Compliance Checking": 0.36054654248613494, + "T8. Structured & Numeric Reasoning": 0.17916666666666667, + "T9. Version & Code Diff Analysis": 0.639574547883412, + "T10. Rule Induction & In-Context Learning": 0.5316666666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4472222222222222 + }, + "average_language_metric": { + "Chinese": 0.5279929441888006, + "English": 0.5221655854795001 + }, + "BoN-1": { + "overall_metric": 0.5304344950538739, + "token_length": { + "8k": 0.5382631577044388, + "16k": 0.575757250526112, + "32k": 0.5564174429144133, + "64k": 0.49882324949171003, + "128k": 0.5360876545520631, + "256k": 0.47725821513450517 + }, + "contextual_requirement": { + "Full": 0.48609242430882327, + "Partial": 0.5868698578203024 + }, + "difficulty": { + "Easy": 0.621987238841124, + "Moderate": 0.5450178207268627, + "Hard": 0.5368976043475531, + "Extreme": 0.41631894457473306 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8032396811710955, + "T2. Sequencing & Structure Reconstruction": 0.7962516650016649, + "T3. Evidence-Grounded QA": 0.6166666666666667, + "T4. Summarization & Synthesis": 0.5449456903648067, + "T5. Attribution & Citation Alignment": 0.7007693812727995, + "T6. Aggregation & Clustering": 0.4931518214295991, + "T7. Consistency & Compliance Checking": 0.35790360694105, + "T8. Structured & Numeric Reasoning": 0.18935185185185183, + "T9. Version & Code Diff Analysis": 0.6460027389181906, + "T10. Rule Induction & In-Context Learning": 0.528611111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.43333333333333335 + }, + "language": { + "Chinese": 0.5351277676036212, + "English": 0.5257412225041267 + } + }, + "pass@1": 0.25, + "BoN-2": { + "overall_metric": 0.559631618514268, + "token_length": { + "8k": 0.5759701055353683, + "16k": 0.5915870050315596, + "32k": 0.5811019083400786, + "64k": 0.5340616992071157, + "128k": 0.566738039323504, + "256k": 0.5083309536479829 + }, + "contextual_requirement": { + "Full": 0.5121620583385146, + "Partial": 0.6200474223743184 + }, + "difficulty": { + "Easy": 0.6503756016170681, + "Moderate": 0.5630560246396688, + "Hard": 0.5739183320047223, + "Extreme": 0.4485927243676055 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8282631375366962, + "T2. Sequencing & Structure Reconstruction": 0.8215701428201425, + "T3. Evidence-Grounded QA": 0.625, + "T4. Summarization & Synthesis": 0.5605335358825536, + "T5. Attribution & Citation Alignment": 0.7353432686427397, + "T6. Aggregation & Clustering": 0.5159697175359719, + "T7. Consistency & Compliance Checking": 0.39569633402766907, + "T8. Structured & Numeric Reasoning": 0.2037037037037037, + "T9. Version & Code Diff Analysis": 0.6716305136452022, + "T10. Rule Induction & In-Context Learning": 0.5966666666666668, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334 + }, + "language": { + "Chinese": 0.5625026563498647, + "English": 0.5567605806786718 + } + }, + "pass@2": 0.27666666666666667, + "BoN-3": { + "overall_metric": 0.5808888120058974, + "token_length": { + "8k": 0.585393137716361, + "16k": 0.6092994228323365, + "32k": 0.6038173310381142, + "64k": 0.5645208259955651, + "128k": 0.5986706621283983, + "256k": 0.5236314923246123 + }, + "contextual_requirement": { + "Full": 0.5307941999072111, + "Partial": 0.6446455910405908 + }, + "difficulty": { + "Easy": 0.6680957906246583, + "Moderate": 0.5949187228015328, + "Hard": 0.5923336661549259, + "Extreme": 0.46862524208942063 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8359951247705989, + "T2. Sequencing & Structure Reconstruction": 0.8325952750952746, + "T3. Evidence-Grounded QA": 0.6416666666666667, + "T4. Summarization & Synthesis": 0.5676340892602741, + "T5. Attribution & Citation Alignment": 0.7621749447875176, + "T6. Aggregation & Clustering": 0.5352498549729722, + "T7. Consistency & Compliance Checking": 0.4260968481164947, + "T8. Structured & Numeric Reasoning": 0.22037037037037036, + "T9. Version & Code Diff Analysis": 0.6990239948591902, + "T10. Rule Induction & In-Context Learning": 0.6077777777777779, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666 + }, + "language": { + "Chinese": 0.5829028144055859, + "English": 0.5788748096062107 + } + }, + "pass@3": 0.2946666666666667 +} \ No newline at end of file diff --git a/results/Qwen3-235B-A22B-Instruct-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-235B-A22B-Instruct-2507/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..711ccbe3317e5d2a36f003a043e4ec093eca7a2c --- /dev/null +++ b/results/Qwen3-235B-A22B-Instruct-2507/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.6376818411345996, + "inference_iteration_1_overall_metric": 0.6339596775760088, + "inference_iteration_2_overall_metric": 0.6394834532022874, + "inference_iteration_3_overall_metric": 0.6396023926255001, + "average_token_length_metric": { + "8k": 0.6908518806680625, + "16k": 0.710583866590834, + "32k": 0.6732110240389698, + "64k": 0.621313683819941, + "128k": 0.601721218415662, + "256k": 0.5284093732741275 + }, + "average_contextual_requirement_metric": { + "Full": 0.5907324430049904, + "Partial": 0.6974356205722853 + }, + "average_difficulty_metric": { + "Easy": 0.8298420157208769, + "Moderate": 0.6814554333905586, + "Hard": 0.5860331219068426, + "Extreme": 0.43235163100792684 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8583982650560713, + "T2. Sequencing & Structure Reconstruction": 0.848515913099246, + "T3. Evidence-Grounded QA": 0.6, + "T4. Summarization & Synthesis": 0.5177498916968352, + "T5. Attribution & Citation Alignment": 0.77066193723968, + "T6. Aggregation & Clustering": 0.6117211713831538, + "T7. Consistency & Compliance Checking": 0.42049824163434013, + "T8. Structured & Numeric Reasoning": 0.6337962962962962, + "T9. Version & Code Diff Analysis": 0.7293123320088637, + "T10. Rule Induction & In-Context Learning": 0.6056944444444443, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666664 + }, + "average_language_metric": { + "Chinese": 0.636515950238815, + "English": 0.6388477320303853 + }, + "BoN-1": { + "overall_metric": 0.6339596775760088, + "token_length": { + "8k": 0.6855442294551932, + "16k": 0.7000813231569181, + "32k": 0.6747494576040707, + "64k": 0.6226259564636208, + "128k": 0.5873958179866461, + "256k": 0.533361280789608 + }, + "contextual_requirement": { + "Full": 0.5879826501710673, + "Partial": 0.6924758942732089 + }, + "difficulty": { + "Easy": 0.82231148583884, + "Moderate": 0.6884371521646265, + "Hard": 0.5762430652636876, + "Extreme": 0.42975635762025166 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8670882121182245, + "T2. Sequencing & Structure Reconstruction": 0.834568857068857, + "T3. Evidence-Grounded QA": 0.5916666666666667, + "T4. Summarization & Synthesis": 0.5199050816665388, + "T5. Attribution & Citation Alignment": 0.7523417979282552, + "T6. Aggregation & Clustering": 0.6062619024899728, + "T7. Consistency & Compliance Checking": 0.4101310403597984, + "T8. Structured & Numeric Reasoning": 0.6231481481481482, + "T9. Version & Code Diff Analysis": 0.732669273310252, + "T10. Rule Induction & In-Context Learning": 0.6336111111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333 + }, + "language": { + "Chinese": 0.6266859734445107, + "English": 0.6412333817075085 + } + }, + "pass@1": 0.38466666666666666, + "BoN-2": { + "overall_metric": 0.6974027072595513, + "token_length": { + "8k": 0.7323866259678374, + "16k": 0.7745752522150099, + "32k": 0.7261213539879122, + "64k": 0.6928752138123638, + "128k": 0.6670434516287179, + "256k": 0.5914143459454726 + }, + "contextual_requirement": { + "Full": 0.6549789400684759, + "Partial": 0.751396592775468 + }, + "difficulty": { + "Easy": 0.8841904588128215, + "Moderate": 0.7743090948629835, + "Hard": 0.6459134355083986, + "Extreme": 0.47609079196413456 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8835231545032977, + "T2. Sequencing & Structure Reconstruction": 0.8910442335442333, + "T3. Evidence-Grounded QA": 0.6833333333333333, + "T4. Summarization & Synthesis": 0.5311781500383242, + "T5. Attribution & Citation Alignment": 0.8245467421256093, + "T6. Aggregation & Clustering": 0.6739689017829934, + "T7. Consistency & Compliance Checking": 0.48373798320712474, + "T8. Structured & Numeric Reasoning": 0.7101851851851851, + "T9. Version & Code Diff Analysis": 0.779292344158872, + "T10. Rule Induction & In-Context Learning": 0.706111111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6166666666666667 + }, + "language": { + "Chinese": 0.6932540877553197, + "English": 0.7015513267637853 + } + }, + "pass@2": 0.46, + "BoN-3": { + "overall_metric": 0.7313269007238051, + "token_length": { + "8k": 0.7568418794712882, + "16k": 0.7976309127316717, + "32k": 0.7546566659314653, + "64k": 0.7371399894465867, + "128k": 0.7061444864165639, + "256k": 0.6355474703452554 + }, + "contextual_requirement": { + "Full": 0.6842404276773796, + "Partial": 0.7912551391465288 + }, + "difficulty": { + "Easy": 0.9130888695564184, + "Moderate": 0.8144404231670943, + "Hard": 0.6852010111307348, + "Extreme": 0.5079207760169129 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9040064038436306, + "T2. Sequencing & Structure Reconstruction": 0.9156552244052241, + "T3. Evidence-Grounded QA": 0.7166666666666667, + "T4. Summarization & Synthesis": 0.5373815939695279, + "T5. Attribution & Citation Alignment": 0.8401844251202388, + "T6. Aggregation & Clustering": 0.7030048302464991, + "T7. Consistency & Compliance Checking": 0.5353694302280319, + "T8. Structured & Numeric Reasoning": 0.7574074074074073, + "T9. Version & Code Diff Analysis": 0.7970749987749245, + "T10. Rule Induction & In-Context Learning": 0.7536111111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6833333333333333 + }, + "language": { + "Chinese": 0.7336257997533113, + "English": 0.7290280016942995 + } + }, + "pass@3": 0.5053333333333333 +} \ No newline at end of file diff --git a/results/Qwen3-235B-A22B-Thinking-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-235B-A22B-Thinking-2507/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..db2d87509160fd257d0b0ae98bc1382489e5727e --- /dev/null +++ b/results/Qwen3-235B-A22B-Thinking-2507/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 13, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.6697401744849466, + "inference_iteration_1_overall_metric": 0.6645867118007756, + "inference_iteration_2_overall_metric": 0.6751104806595288, + "inference_iteration_3_overall_metric": 0.6695233309945317, + "average_token_length_metric": { + "8k": 0.7205839084021065, + "16k": 0.7043002698220667, + "32k": 0.6969133410539584, + "64k": 0.6685079197146846, + "128k": 0.6404675702828265, + "256k": 0.5876680376340346 + }, + "average_contextual_requirement_metric": { + "Full": 0.634705106653694, + "Partial": 0.7143302608156324 + }, + "average_difficulty_metric": { + "Easy": 0.8354877129168713, + "Moderate": 0.7511550276701573, + "Hard": 0.6709614325576964, + "Extreme": 0.43389437976281486 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8772550883476504, + "T2. Sequencing & Structure Reconstruction": 0.8623538806872134, + "T3. Evidence-Grounded QA": 0.6250000000000003, + "T4. Summarization & Synthesis": 0.5401699896092569, + "T5. Attribution & Citation Alignment": 0.7728207746125852, + "T6. Aggregation & Clustering": 0.6408623505152069, + "T7. Consistency & Compliance Checking": 0.46406727840672235, + "T8. Structured & Numeric Reasoning": 0.7117283950617284, + "T9. Version & Code Diff Analysis": 0.789350597014821, + "T10. Rule Induction & In-Context Learning": 0.6548148148148148, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.525 + }, + "average_language_metric": { + "Chinese": 0.6712139930169294, + "English": 0.668266355952964 + }, + "BoN-1": { + "overall_metric": 0.6645867118007756, + "token_length": { + "8k": 0.7097656837729444, + "16k": 0.7010367810682975, + "32k": 0.6918335677824285, + "64k": 0.6771373662881649, + "128k": 0.6229921235210027, + "256k": 0.584754748371821 + }, + "contextual_requirement": { + "Full": 0.6291093737088903, + "Partial": 0.7097396875540872 + }, + "difficulty": { + "Easy": 0.8231290446647995, + "Moderate": 0.7591822484146827, + "Hard": 0.6449304811041314, + "Extreme": 0.4416892939586231 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8690587447518973, + "T2. Sequencing & Structure Reconstruction": 0.8768765956265957, + "T3. Evidence-Grounded QA": 0.6416666666666667, + "T4. Summarization & Synthesis": 0.5380854111531137, + "T5. Attribution & Citation Alignment": 0.7488408410161091, + "T6. Aggregation & Clustering": 0.6513683909314774, + "T7. Consistency & Compliance Checking": 0.46848122327638125, + "T8. Structured & Numeric Reasoning": 0.6837962962962962, + "T9. Version & Code Diff Analysis": 0.7940034392057601, + "T10. Rule Induction & In-Context Learning": 0.6166666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5166666666666667 + }, + "language": { + "Chinese": 0.6644345913196646, + "English": 0.6647388322818897 + } + }, + "pass@1": 0.4266666666666667, + "BoN-2": { + "overall_metric": 0.7478859807771899, + "token_length": { + "8k": 0.7974609291497671, + "16k": 0.7833069738720557, + "32k": 0.7539511540579624, + "64k": 0.7523052610595647, + "128k": 0.7233847771912111, + "256k": 0.6769067893325807 + }, + "contextual_requirement": { + "Full": 0.709048497560702, + "Partial": 0.797315504870903 + }, + "difficulty": { + "Easy": 0.9183290544233588, + "Moderate": 0.8402480622734665, + "Hard": 0.7656967810020093, + "Extreme": 0.4888097891525321 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8996374413195756, + "T2. Sequencing & Structure Reconstruction": 0.9140728715728713, + "T3. Evidence-Grounded QA": 0.7416666666666667, + "T4. Summarization & Synthesis": 0.5510532445357571, + "T5. Attribution & Citation Alignment": 0.8553010122231844, + "T6. Aggregation & Clustering": 0.72259791476162, + "T7. Consistency & Compliance Checking": 0.550051783712619, + "T8. Structured & Numeric Reasoning": 0.7916666666666666, + "T9. Version & Code Diff Analysis": 0.8487023090187992, + "T10. Rule Induction & In-Context Learning": 0.7916666666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.65 + }, + "language": { + "Chinese": 0.7479449056610918, + "English": 0.7478270558932893 + } + }, + "pass@2": 0.524, + "BoN-3": { + "overall_metric": 0.7822004920631344, + "token_length": { + "8k": 0.8225828352477704, + "16k": 0.8267205325130447, + "32k": 0.8024853199503054, + "64k": 0.7952104500545913, + "128k": 0.7471444685103535, + "256k": 0.6990593461027415 + }, + "contextual_requirement": { + "Full": 0.7488719518681644, + "Partial": 0.824618634129461 + }, + "difficulty": { + "Easy": 0.9465988423811745, + "Moderate": 0.8961057909699204, + "Hard": 0.8084893169461448, + "Extreme": 0.5100275224528017 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9298480703943819, + "T2. Sequencing & Structure Reconstruction": 0.9221497946497943, + "T3. Evidence-Grounded QA": 0.8166666666666667, + "T4. Summarization & Synthesis": 0.5589114384792794, + "T5. Attribution & Citation Alignment": 0.8835244862799918, + "T6. Aggregation & Clustering": 0.7476293375871608, + "T7. Consistency & Compliance Checking": 0.5892510818578476, + "T8. Structured & Numeric Reasoning": 0.8444444444444444, + "T9. Version & Code Diff Analysis": 0.8566406207071109, + "T10. Rule Induction & In-Context Learning": 0.8377777777777777, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.7 + }, + "language": { + "Chinese": 0.7822230658547494, + "English": 0.78217791827152 + } + }, + "pass@3": 0.572 +} \ No newline at end of file diff --git a/results/Qwen3-30B-A3B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json b/results/Qwen3-30B-A3B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f50c8e1229dae86ceba53ca63ef69e550bf2af1a --- /dev/null +++ b/results/Qwen3-30B-A3B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.43835174236080204, + "inference_iteration_1_overall_metric": 0.4347938238206688, + "inference_iteration_2_overall_metric": 0.43739224653302283, + "inference_iteration_3_overall_metric": 0.4428691567287152, + "average_token_length_metric": { + "8k": 0.4640531253197961, + "16k": 0.4647383786815656, + "32k": 0.4769242453969347, + "64k": 0.4179394455075989, + "128k": 0.41351252229299035, + "256k": 0.39294273696592813 + }, + "average_contextual_requirement_metric": { + "Full": 0.41095809725832383, + "Partial": 0.4732163815821386 + }, + "average_difficulty_metric": { + "Easy": 0.5588711611175687, + "Moderate": 0.39037585922610546, + "Hard": 0.41317943830896897, + "Extreme": 0.3543732565793154 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7127770907550041, + "T2. Sequencing & Structure Reconstruction": 0.6851617075892437, + "T3. Evidence-Grounded QA": 0.5611111111111112, + "T4. Summarization & Synthesis": 0.5287940406323086, + "T5. Attribution & Citation Alignment": 0.5540826057234692, + "T6. Aggregation & Clustering": 0.3922538101756423, + "T7. Consistency & Compliance Checking": 0.25219880289608915, + "T8. Structured & Numeric Reasoning": 0.15478395061728395, + "T9. Version & Code Diff Analysis": 0.4794950077949977, + "T10. Rule Induction & In-Context Learning": 0.4563425925925925, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3027777777777778 + }, + "average_language_metric": { + "Chinese": 0.4449592008603477, + "English": 0.43174428386125685 + }, + "BoN-1": { + "overall_metric": 0.4347938238206688, + "token_length": { + "8k": 0.45780843124261406, + "16k": 0.4544510486962234, + "32k": 0.48190834806509436, + "64k": 0.41649388756464034, + "128k": 0.4188417047217461, + "256k": 0.3792595226336961 + }, + "contextual_requirement": { + "Full": 0.4119506732449559, + "Partial": 0.4638669245533959 + }, + "difficulty": { + "Easy": 0.5484313078217694, + "Moderate": 0.3819799640213123, + "Hard": 0.40444373140916484, + "Extreme": 0.3649327213215975 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7075508946315197, + "T2. Sequencing & Structure Reconstruction": 0.6841748991748985, + "T3. Evidence-Grounded QA": 0.5583333333333333, + "T4. Summarization & Synthesis": 0.5338944636170653, + "T5. Attribution & Citation Alignment": 0.5472533802884338, + "T6. Aggregation & Clustering": 0.4012455230126335, + "T7. Consistency & Compliance Checking": 0.2457091900873631, + "T8. Structured & Numeric Reasoning": 0.15416666666666667, + "T9. Version & Code Diff Analysis": 0.46883931261867207, + "T10. Rule Induction & In-Context Learning": 0.4331944444444443, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3 + }, + "language": { + "Chinese": 0.44364717413939403, + "English": 0.4259404735019449 + } + }, + "pass@1": 0.17733333333333334, + "BoN-2": { + "overall_metric": 0.46640045487719345, + "token_length": { + "8k": 0.48850623477567573, + "16k": 0.48941657724686394, + "32k": 0.5072169193313956, + "64k": 0.4523087841468838, + "128k": 0.44368419626075345, + "256k": 0.41727001750158577 + }, + "contextual_requirement": { + "Full": 0.44061393965405443, + "Partial": 0.4992196560702788 + }, + "difficulty": { + "Easy": 0.5800147754645664, + "Moderate": 0.42352739928412814, + "Hard": 0.44183822997113786, + "Extreme": 0.3862338492888558 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.74209693890316, + "T2. Sequencing & Structure Reconstruction": 0.7157272357272357, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5450289133980314, + "T5. Attribution & Citation Alignment": 0.5961599581640015, + "T6. Aggregation & Clustering": 0.4338491861878003, + "T7. Consistency & Compliance Checking": 0.27109254578114006, + "T8. Structured & Numeric Reasoning": 0.18379629629629632, + "T9. Version & Code Diff Analysis": 0.5158022640412946, + "T10. Rule Induction & In-Context Learning": 0.49041666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.325 + }, + "language": { + "Chinese": 0.4708259876166834, + "English": 0.4619749221377029 + } + }, + "pass@2": 0.198, + "BoN-3": { + "overall_metric": 0.48455280677595725, + "token_length": { + "8k": 0.5068277344354145, + "16k": 0.5063031851071311, + "32k": 0.5248706991014918, + "64k": 0.462267170875692, + "128k": 0.4631788587622459, + "256k": 0.44386919237376954 + }, + "contextual_requirement": { + "Full": 0.45623565272994365, + "Partial": 0.5205928210163393 + }, + "difficulty": { + "Easy": 0.5983262263426448, + "Moderate": 0.44016373789846164, + "Hard": 0.4662266857524413, + "Extreme": 0.40111167416288757 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7549544231445334, + "T2. Sequencing & Structure Reconstruction": 0.737453426203426, + "T3. Evidence-Grounded QA": 0.5833333333333334, + "T4. Summarization & Synthesis": 0.5505766147965558, + "T5. Attribution & Citation Alignment": 0.6027932502768333, + "T6. Aggregation & Clustering": 0.45641196612410123, + "T7. Consistency & Compliance Checking": 0.29641454225252506, + "T8. Structured & Numeric Reasoning": 0.1949074074074074, + "T9. Version & Code Diff Analysis": 0.546337052157623, + "T10. Rule Induction & In-Context Learning": 0.49319444444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664 + }, + "language": { + "Chinese": 0.48823712145503023, + "English": 0.48086849209688537 + } + }, + "pass@3": 0.20866666666666667 +} \ No newline at end of file diff --git a/results/Qwen3-30B-A3B-Instruct-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-30B-A3B-Instruct-2507/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..95bae708c215018b2cac88a4d6ee4e929a0866d7 --- /dev/null +++ b/results/Qwen3-30B-A3B-Instruct-2507/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5452162786351843, + "inference_iteration_1_overall_metric": 0.5475365928256798, + "inference_iteration_2_overall_metric": 0.5453410517872768, + "inference_iteration_3_overall_metric": 0.5427711912925933, + "average_token_length_metric": { + "8k": 0.6035050249721831, + "16k": 0.5987273703229294, + "32k": 0.5599890143683894, + "64k": 0.5400303382355833, + "128k": 0.49784316183098404, + "256k": 0.4712027620810347 + }, + "average_contextual_requirement_metric": { + "Full": 0.5045780200164052, + "Partial": 0.5969376986954488 + }, + "average_difficulty_metric": { + "Easy": 0.7559096468444604, + "Moderate": 0.564732272741711, + "Hard": 0.4403591043429616, + "Extreme": 0.37046490953699124 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7874723394436245, + "T2. Sequencing & Structure Reconstruction": 0.747240085843135, + "T3. Evidence-Grounded QA": 0.5416666666666666, + "T4. Summarization & Synthesis": 0.5127484150702732, + "T5. Attribution & Citation Alignment": 0.6032294596505793, + "T6. Aggregation & Clustering": 0.5166889927087092, + "T7. Consistency & Compliance Checking": 0.31859016422394293, + "T8. Structured & Numeric Reasoning": 0.5577160493827158, + "T9. Version & Code Diff Analysis": 0.5756222253109884, + "T10. Rule Induction & In-Context Learning": 0.5438425925925927, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4138888888888889 + }, + "average_language_metric": { + "Chinese": 0.534932377457035, + "English": 0.5555001798133334 + }, + "BoN-1": { + "overall_metric": 0.5475365928256798, + "token_length": { + "8k": 0.6016020197847444, + "16k": 0.5955245523768168, + "32k": 0.5628331081172674, + "64k": 0.5480517546124761, + "128k": 0.5064957488766313, + "256k": 0.4707123731861472 + }, + "contextual_requirement": { + "Full": 0.5089038204182644, + "Partial": 0.5967055758896659 + }, + "difficulty": { + "Easy": 0.7601650922395514, + "Moderate": 0.5686448581258617, + "Hard": 0.44460622355514695, + "Extreme": 0.36835398221192234 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7928563896826898, + "T2. Sequencing & Structure Reconstruction": 0.7446857472592766, + "T3. Evidence-Grounded QA": 0.5583333333333333, + "T4. Summarization & Synthesis": 0.5140071321863116, + "T5. Attribution & Citation Alignment": 0.5999370151569586, + "T6. Aggregation & Clustering": 0.5192504692366862, + "T7. Consistency & Compliance Checking": 0.33101895293351313, + "T8. Structured & Numeric Reasoning": 0.5685185185185185, + "T9. Version & Code Diff Analysis": 0.5777336594471387, + "T10. Rule Induction & In-Context Learning": 0.5034722222222221, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.425 + }, + "language": { + "Chinese": 0.5279438377377988, + "English": 0.5671293479135634 + } + }, + "pass@1": 0.29733333333333334, + "BoN-2": { + "overall_metric": 0.6226628481842147, + "token_length": { + "8k": 0.6834746444616835, + "16k": 0.6777780429260933, + "32k": 0.6235393400581155, + "64k": 0.6117468620961922, + "128k": 0.5816405990800028, + "256k": 0.5577976004832055 + }, + "contextual_requirement": { + "Full": 0.5762829279061583, + "Partial": 0.6816918376290168 + }, + "difficulty": { + "Easy": 0.8411762869831911, + "Moderate": 0.6700737072149301, + "Hard": 0.5094718947245584, + "Extreme": 0.4264968704316206 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8378092426392205, + "T2. Sequencing & Structure Reconstruction": 0.8185840085840084, + "T3. Evidence-Grounded QA": 0.6083333333333333, + "T4. Summarization & Synthesis": 0.5304585800985443, + "T5. Attribution & Citation Alignment": 0.6911490951783217, + "T6. Aggregation & Clustering": 0.5992410622374563, + "T7. Consistency & Compliance Checking": 0.39138387898011423, + "T8. Structured & Numeric Reasoning": 0.6398148148148147, + "T9. Version & Code Diff Analysis": 0.6585139306429096, + "T10. Rule Induction & In-Context Learning": 0.6511111111111112, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666 + }, + "language": { + "Chinese": 0.6113970502008556, + "English": 0.6339286461675765 + } + }, + "pass@2": 0.36666666666666664, + "BoN-3": { + "overall_metric": 0.6598719096201268, + "token_length": { + "8k": 0.7112228338386369, + "16k": 0.704821999670257, + "32k": 0.6679623687905272, + "64k": 0.6494739136662858, + "128k": 0.6304280872767323, + "256k": 0.5953222544783268 + }, + "contextual_requirement": { + "Full": 0.6175176296001476, + "Partial": 0.7137773569182847 + }, + "difficulty": { + "Easy": 0.8737608485181246, + "Moderate": 0.7316366823753812, + "Hard": 0.5427232232690453, + "Extreme": 0.45537527889367313 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.858239586310667, + "T2. Sequencing & Structure Reconstruction": 0.8379063991563991, + "T3. Evidence-Grounded QA": 0.675, + "T4. Summarization & Synthesis": 0.5353639111706652, + "T5. Attribution & Citation Alignment": 0.7194455371416155, + "T6. Aggregation & Clustering": 0.6373541014616064, + "T7. Consistency & Compliance Checking": 0.43430884132356135, + "T8. Structured & Numeric Reasoning": 0.6731481481481482, + "T9. Version & Code Diff Analysis": 0.699949022294497, + "T10. Rule Induction & In-Context Learning": 0.7136111111111112, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5916666666666667 + }, + "language": { + "Chinese": 0.6502665902311631, + "English": 0.6694772290090928 + } + }, + "pass@3": 0.4053333333333333 +} \ No newline at end of file diff --git a/results/Qwen3-30B-A3B-Thinking-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-30B-A3B-Thinking-2507/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c427f8324376d7e81ba236d91ad4694e7e46f54e --- /dev/null +++ b/results/Qwen3-30B-A3B-Thinking-2507/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5967923243935424, + "inference_iteration_1_overall_metric": 0.5971485082647852, + "inference_iteration_2_overall_metric": 0.5964644421604209, + "inference_iteration_3_overall_metric": 0.5967640227554183, + "average_token_length_metric": { + "8k": 0.6603587329609213, + "16k": 0.6544573971618018, + "32k": 0.6327664105346437, + "64k": 0.5866153064796855, + "128k": 0.5279135083169344, + "256k": 0.5186425909072657 + }, + "average_contextual_requirement_metric": { + "Full": 0.5570876315601245, + "Partial": 0.6473255698178918 + }, + "average_difficulty_metric": { + "Easy": 0.7963733087366597, + "Moderate": 0.6254851987507496, + "Hard": 0.5276205174176363, + "Extreme": 0.40474772683313665 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8293501125585261, + "T2. Sequencing & Structure Reconstruction": 0.8116547209831653, + "T3. Evidence-Grounded QA": 0.5750000000000003, + "T4. Summarization & Synthesis": 0.5368729714469394, + "T5. Attribution & Citation Alignment": 0.6564732160656777, + "T6. Aggregation & Clustering": 0.5777371247422506, + "T7. Consistency & Compliance Checking": 0.3565708837074654, + "T8. Structured & Numeric Reasoning": 0.6209876543209873, + "T9. Version & Code Diff Analysis": 0.6645539841533562, + "T10. Rule Induction & In-Context Learning": 0.5863888888888888, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.46666666666666673 + }, + "average_language_metric": { + "Chinese": 0.5922310064298796, + "English": 0.6013536423572048 + }, + "BoN-1": { + "overall_metric": 0.5971485082647852, + "token_length": { + "8k": 0.6631122039610606, + "16k": 0.6695831110367371, + "32k": 0.6289403768757488, + "64k": 0.5821521427163692, + "128k": 0.522724275200541, + "256k": 0.5163789397982567 + }, + "contextual_requirement": { + "Full": 0.5537794331097636, + "Partial": 0.6523455130075411 + }, + "difficulty": { + "Easy": 0.7810131026617345, + "Moderate": 0.6463897022556653, + "Hard": 0.5345120229683763, + "Extreme": 0.4045314734647145 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8311430041412194, + "T2. Sequencing & Structure Reconstruction": 0.8249045861545858, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5383277195885704, + "T5. Attribution & Citation Alignment": 0.6451460821905192, + "T6. Aggregation & Clustering": 0.5737635479840616, + "T7. Consistency & Compliance Checking": 0.37517972023255514, + "T8. Structured & Numeric Reasoning": 0.6055555555555555, + "T9. Version & Code Diff Analysis": 0.6730867255766664, + "T10. Rule Induction & In-Context Learning": 0.5700000000000001, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334 + }, + "language": { + "Chinese": 0.5907857688067475, + "English": 0.603511247722824 + } + }, + "pass@1": 0.3546666666666667, + "BoN-2": { + "overall_metric": 0.6705564073004658, + "token_length": { + "8k": 0.7144249310094072, + "16k": 0.7233569898208718, + "32k": 0.7203526229389474, + "64k": 0.666470085141174, + "128k": 0.5975716987083816, + "256k": 0.6011621161840196 + }, + "contextual_requirement": { + "Full": 0.6242851454438455, + "Partial": 0.7294471042088951 + }, + "difficulty": { + "Easy": 0.8669888777194616, + "Moderate": 0.7348384499045617, + "Hard": 0.6052202090307908, + "Extreme": 0.45606595331273436 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8629693587046573, + "T2. Sequencing & Structure Reconstruction": 0.8585263810263809, + "T3. Evidence-Grounded QA": 0.6833333333333333, + "T4. Summarization & Synthesis": 0.5520973617668322, + "T5. Attribution & Citation Alignment": 0.7329873034344809, + "T6. Aggregation & Clustering": 0.647246392300239, + "T7. Consistency & Compliance Checking": 0.4479448894336623, + "T8. Structured & Numeric Reasoning": 0.7083333333333334, + "T9. Version & Code Diff Analysis": 0.7361988748337432, + "T10. Rule Induction & In-Context Learning": 0.683888888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667 + }, + "language": { + "Chinese": 0.6628481693891412, + "English": 0.6782646452117937 + } + }, + "pass@2": 0.43266666666666664, + "BoN-3": { + "overall_metric": 0.7073514436172486, + "token_length": { + "8k": 0.7517926459779344, + "16k": 0.7722619232731515, + "32k": 0.7511237446632382, + "64k": 0.6976847688527813, + "128k": 0.629184440307289, + "256k": 0.6420611386291009 + }, + "contextual_requirement": { + "Full": 0.6691986411009105, + "Partial": 0.7559095559107719 + }, + "difficulty": { + "Easy": 0.915791957089584, + "Moderate": 0.766786734359907, + "Hard": 0.6353007083282902, + "Extreme": 0.48730038902683037 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8832882758900559, + "T2. Sequencing & Structure Reconstruction": 0.8749057424057421, + "T3. Evidence-Grounded QA": 0.75, + "T4. Summarization & Synthesis": 0.55766162876669, + "T5. Attribution & Citation Alignment": 0.7834137982359759, + "T6. Aggregation & Clustering": 0.6805855792952568, + "T7. Consistency & Compliance Checking": 0.4766259604534431, + "T8. Structured & Numeric Reasoning": 0.7546296296296295, + "T9. Version & Code Diff Analysis": 0.7651396236274333, + "T10. Rule Induction & In-Context Learning": 0.7513888888888889, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6083333333333333 + }, + "language": { + "Chinese": 0.703943401540441, + "English": 0.7107594856940571 + } + }, + "pass@3": 0.478 +} \ No newline at end of file diff --git a/results/Qwen3-32B/nonthinking_context-120000_bon-3_summary.json b/results/Qwen3-32B/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..87137b4c067878070d19a926904b2e0d3640d9b1 --- /dev/null +++ b/results/Qwen3-32B/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.40276958737882135, + "inference_iteration_1_overall_metric": 0.4016952023158214, + "inference_iteration_2_overall_metric": 0.4048631802018712, + "inference_iteration_3_overall_metric": 0.40175037961877114, + "average_token_length_metric": { + "8k": 0.4617524313525191, + "16k": 0.45515708355028794, + "32k": 0.4245573275344764, + "64k": 0.38812942658699545, + "128k": 0.37207065716673743, + "256k": 0.3149505980819123 + }, + "average_contextual_requirement_metric": { + "Full": 0.37062296245474236, + "Partial": 0.4436834736458306 + }, + "average_difficulty_metric": { + "Easy": 0.5190221632028355, + "Moderate": 0.3430474305314189, + "Hard": 0.3861354100882252, + "Extreme": 0.32557240333042736 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6783903124505437, + "T2. Sequencing & Structure Reconstruction": 0.691038080579805, + "T3. Evidence-Grounded QA": 0.5361111111111111, + "T4. Summarization & Synthesis": 0.5235103467599385, + "T5. Attribution & Citation Alignment": 0.36734324660318, + "T6. Aggregation & Clustering": 0.3859827152631366, + "T7. Consistency & Compliance Checking": 0.22774134858655815, + "T8. Structured & Numeric Reasoning": 0.09135802469135802, + "T9. Version & Code Diff Analysis": 0.4480573156228135, + "T10. Rule Induction & In-Context Learning": 0.39643518518518506, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3361111111111111 + }, + "average_language_metric": { + "Chinese": 0.40772752509755494, + "English": 0.3978116496600876 + }, + "BoN-1": { + "overall_metric": 0.4016952023158214, + "token_length": { + "8k": 0.46663668330951064, + "16k": 0.4538107088644924, + "32k": 0.4171039185976235, + "64k": 0.38935415644800225, + "128k": 0.3718552714760765, + "256k": 0.31141047519922166 + }, + "contextual_requirement": { + "Full": 0.3669026836885647, + "Partial": 0.445976589659603 + }, + "difficulty": { + "Easy": 0.5194117663381852, + "Moderate": 0.3405231428124576, + "Hard": 0.3855439159391187, + "Extreme": 0.32352943663527506 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6715192705862865, + "T2. Sequencing & Structure Reconstruction": 0.6997405107762323, + "T3. Evidence-Grounded QA": 0.55, + "T4. Summarization & Synthesis": 0.5220646003288852, + "T5. Attribution & Citation Alignment": 0.3550270822278831, + "T6. Aggregation & Clustering": 0.37946082280852406, + "T7. Consistency & Compliance Checking": 0.23788006601247816, + "T8. Structured & Numeric Reasoning": 0.09490740740740743, + "T9. Version & Code Diff Analysis": 0.4582161206858629, + "T10. Rule Induction & In-Context Learning": 0.4045833333333333, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667 + }, + "language": { + "Chinese": 0.4096699821210686, + "English": 0.3937204225105745 + } + }, + "pass@1": 0.15666666666666668, + "BoN-2": { + "overall_metric": 0.4458899241198126, + "token_length": { + "8k": 0.5107956286409391, + "16k": 0.4940196458333606, + "32k": 0.46285124603401756, + "64k": 0.4356914401730522, + "128k": 0.4150838110618531, + "256k": 0.3568977729756547 + }, + "contextual_requirement": { + "Full": 0.41105292891627904, + "Partial": 0.4902279180152205 + }, + "difficulty": { + "Easy": 0.5623795180533137, + "Moderate": 0.38380867566198085, + "Hard": 0.4305011006295078, + "Extreme": 0.36916462074950435 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7155725341000191, + "T2. Sequencing & Structure Reconstruction": 0.7385771329569484, + "T3. Evidence-Grounded QA": 0.5583333333333333, + "T4. Summarization & Synthesis": 0.5363931015769866, + "T5. Attribution & Citation Alignment": 0.40577037385073444, + "T6. Aggregation & Clustering": 0.4381787255377262, + "T7. Consistency & Compliance Checking": 0.2654075018995459, + "T8. Structured & Numeric Reasoning": 0.12083333333333335, + "T9. Version & Code Diff Analysis": 0.5102649011903974, + "T10. Rule Induction & In-Context Learning": 0.45541666666666664, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667 + }, + "language": { + "Chinese": 0.4499576438501969, + "English": 0.4418222043894293 + } + }, + "pass@2": 0.184, + "BoN-3": { + "overall_metric": 0.46707342636344645, + "token_length": { + "8k": 0.5223679274291421, + "16k": 0.5196118628123836, + "32k": 0.48374718677985445, + "64k": 0.45430831830027746, + "128k": 0.44035677780297794, + "256k": 0.3820484850560422 + }, + "contextual_requirement": { + "Full": 0.4329127928337618, + "Partial": 0.5105505963103175 + }, + "difficulty": { + "Easy": 0.5794161679541103, + "Moderate": 0.41412466934190495, + "Hard": 0.45291372678066194, + "Extreme": 0.3880851138906052 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7343093370344386, + "T2. Sequencing & Structure Reconstruction": 0.7559608376898355, + "T3. Evidence-Grounded QA": 0.5833333333333334, + "T4. Summarization & Synthesis": 0.5455373041006337, + "T5. Attribution & Citation Alignment": 0.44072256206800897, + "T6. Aggregation & Clustering": 0.47661016797778905, + "T7. Consistency & Compliance Checking": 0.2859756712512137, + "T8. Structured & Numeric Reasoning": 0.13425925925925924, + "T9. Version & Code Diff Analysis": 0.5528701409177667, + "T10. Rule Induction & In-Context Learning": 0.45541666666666664, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.425 + }, + "language": { + "Chinese": 0.4707375089112187, + "English": 0.4634093438156739 + } + }, + "pass@3": 0.19666666666666666 +} \ No newline at end of file diff --git a/results/Qwen3-32B/thinking_context-120000_bon-3_summary.json b/results/Qwen3-32B/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c8ae14db2066963f00f74888025051ae990a3cd9 --- /dev/null +++ b/results/Qwen3-32B/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.511230077368557, + "inference_iteration_1_overall_metric": 0.5125551266359154, + "inference_iteration_2_overall_metric": 0.5093558836789825, + "inference_iteration_3_overall_metric": 0.5117792217907733, + "average_token_length_metric": { + "8k": 0.6093750682953369, + "16k": 0.5606336434386148, + "32k": 0.5407757802194578, + "64k": 0.49507317974870746, + "128k": 0.4498486791562606, + "256k": 0.4116741133529667 + }, + "average_contextual_requirement_metric": { + "Full": 0.47415262721658286, + "Partial": 0.5584195593801601 + }, + "average_difficulty_metric": { + "Easy": 0.7280370376711341, + "Moderate": 0.46181032202581557, + "Hard": 0.42241569201251666, + "Extreme": 0.36452260417788923 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.770604191490206, + "T2. Sequencing & Structure Reconstruction": 0.7391521133187797, + "T3. Evidence-Grounded QA": 0.47222222222222227, + "T4. Summarization & Synthesis": 0.5173657920719582, + "T5. Attribution & Citation Alignment": 0.4466370952378391, + "T6. Aggregation & Clustering": 0.4909460945890284, + "T7. Consistency & Compliance Checking": 0.2977388131082807, + "T8. Structured & Numeric Reasoning": 0.5168209876543208, + "T9. Version & Code Diff Analysis": 0.5534968208496264, + "T10. Rule Induction & In-Context Learning": 0.49097222222222225, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4416666666666667 + }, + "average_language_metric": { + "Chinese": 0.5000696493273706, + "English": 0.5223905054097431 + }, + "BoN-1": { + "overall_metric": 0.5125551266359154, + "token_length": { + "8k": 0.6199505611114019, + "16k": 0.5449906257247921, + "32k": 0.5573838747951294, + "64k": 0.48881589783475765, + "128k": 0.4414701560105687, + "256k": 0.4227196443388436 + }, + "contextual_requirement": { + "Full": 0.4840737513137435, + "Partial": 0.5488041497732257 + }, + "difficulty": { + "Easy": 0.7237636460264806, + "Moderate": 0.45634319610108454, + "Hard": 0.435386696673209, + "Extreme": 0.368792439903043 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7683888534222428, + "T2. Sequencing & Structure Reconstruction": 0.7350878750878753, + "T3. Evidence-Grounded QA": 0.475, + "T4. Summarization & Synthesis": 0.5164166688813505, + "T5. Attribution & Citation Alignment": 0.4327235065619549, + "T6. Aggregation & Clustering": 0.48097372489084406, + "T7. Consistency & Compliance Checking": 0.28774132062670765, + "T8. Structured & Numeric Reasoning": 0.5685185185185185, + "T9. Version & Code Diff Analysis": 0.552360721830303, + "T10. Rule Induction & In-Context Learning": 0.4877777777777777, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.43333333333333335 + }, + "language": { + "Chinese": 0.5036040074009573, + "English": 0.5215062458708739 + } + }, + "pass@1": 0.26, + "BoN-2": { + "overall_metric": 0.5889072134846955, + "token_length": { + "8k": 0.6825707673405994, + "16k": 0.6560634814652357, + "32k": 0.6151197445549184, + "64k": 0.5702241454706968, + "128k": 0.5232780250539345, + "256k": 0.4861871170227882 + }, + "contextual_requirement": { + "Full": 0.5510603442388599, + "Partial": 0.6370759561612137 + }, + "difficulty": { + "Easy": 0.8070382684727199, + "Moderate": 0.5619198253755202, + "Hard": 0.5090922692146654, + "Extreme": 0.42010427156051966 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8345700718573785, + "T2. Sequencing & Structure Reconstruction": 0.7945044307544308, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5346021319695609, + "T5. Attribution & Citation Alignment": 0.5351280430956195, + "T6. Aggregation & Clustering": 0.5706144095580495, + "T7. Consistency & Compliance Checking": 0.38333556049784645, + "T8. Structured & Numeric Reasoning": 0.6296296296296297, + "T9. Version & Code Diff Analysis": 0.6241105357978589, + "T10. Rule Induction & In-Context Learning": 0.5713888888888888, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.525 + }, + "language": { + "Chinese": 0.5813115070957934, + "English": 0.5965029198735978 + } + }, + "pass@2": 0.3393333333333333, + "BoN-3": { + "overall_metric": 0.6236268197943012, + "token_length": { + "8k": 0.7099773255313729, + "16k": 0.6915751481616828, + "32k": 0.6387773010277447, + "64k": 0.615062805287137, + "128k": 0.5761902661960858, + "256k": 0.5101780725617869 + }, + "contextual_requirement": { + "Full": 0.5835815761216835, + "Partial": 0.6745934935594524 + }, + "difficulty": { + "Easy": 0.8441219878850473, + "Moderate": 0.6121934384709582, + "Hard": 0.5394052983997921, + "Extreme": 0.4449122649436655 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8757613870336376, + "T2. Sequencing & Structure Reconstruction": 0.8133404095904098, + "T3. Evidence-Grounded QA": 0.6083333333333333, + "T4. Summarization & Synthesis": 0.5410211462556259, + "T5. Attribution & Citation Alignment": 0.5882233881768194, + "T6. Aggregation & Clustering": 0.6075727032146355, + "T7. Consistency & Compliance Checking": 0.41240156108775533, + "T8. Structured & Numeric Reasoning": 0.6592592592592592, + "T9. Version & Code Diff Analysis": 0.658971964363135, + "T10. Rule Induction & In-Context Learning": 0.6241666666666666, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667 + }, + "language": { + "Chinese": 0.6173730284036301, + "English": 0.6298806111849737 + } + }, + "pass@3": 0.37466666666666665 +} \ No newline at end of file diff --git a/results/Qwen3-4B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json b/results/Qwen3-4B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b67d7cb77cf75fceb16f0f95696236fa3c9c645b --- /dev/null +++ b/results/Qwen3-4B-Instruct-2507/nonthinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.36776939054307617, + "inference_iteration_1_overall_metric": 0.3687031785251978, + "inference_iteration_2_overall_metric": 0.36661913973941934, + "inference_iteration_3_overall_metric": 0.3679858533646109, + "average_token_length_metric": { + "8k": 0.44810945279428716, + "16k": 0.4111037532728985, + "32k": 0.3865217233693066, + "64k": 0.34256931704639887, + "128k": 0.3342606193069893, + "256k": 0.28405147746857656 + }, + "average_contextual_requirement_metric": { + "Full": 0.3471098106864211, + "Partial": 0.39406340126972855 + }, + "average_difficulty_metric": { + "Easy": 0.4833480360927418, + "Moderate": 0.3021440333762109, + "Hard": 0.3393622074968013, + "Extreme": 0.3029205200127554 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6636265683033888, + "T2. Sequencing & Structure Reconstruction": 0.5914089584991948, + "T3. Evidence-Grounded QA": 0.5027777777777778, + "T4. Summarization & Synthesis": 0.5180185643786743, + "T5. Attribution & Citation Alignment": 0.3538642246619649, + "T6. Aggregation & Clustering": 0.3312771882807734, + "T7. Consistency & Compliance Checking": 0.1987781645586334, + "T8. Structured & Numeric Reasoning": 0.09861111111111111, + "T9. Version & Code Diff Analysis": 0.38761603668611966, + "T10. Rule Induction & In-Context Learning": 0.39236111111111105, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.24444444444444444 + }, + "average_language_metric": { + "Chinese": 0.3784916959047209, + "English": 0.35704708518143186 + }, + "BoN-1": { + "overall_metric": 0.3687031785251978, + "token_length": { + "8k": 0.45858099952533665, + "16k": 0.4197668091574054, + "32k": 0.3847492794129437, + "64k": 0.3409058296099612, + "128k": 0.32898091620375153, + "256k": 0.2792352372417906 + }, + "contextual_requirement": { + "Full": 0.34899758846399653, + "Partial": 0.3937830204212733 + }, + "difficulty": { + "Easy": 0.48834649579871187, + "Moderate": 0.30447624121232425, + "Hard": 0.3355461000287082, + "Extreme": 0.30160295498672696 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6577542622394902, + "T2. Sequencing & Structure Reconstruction": 0.5875901640003564, + "T3. Evidence-Grounded QA": 0.5166666666666667, + "T4. Summarization & Synthesis": 0.5182254030588427, + "T5. Attribution & Citation Alignment": 0.3501215130410147, + "T6. Aggregation & Clustering": 0.32672419090342486, + "T7. Consistency & Compliance Checking": 0.19951249587965042, + "T8. Structured & Numeric Reasoning": 0.10509259259259258, + "T9. Version & Code Diff Analysis": 0.3982433590506608, + "T10. Rule Induction & In-Context Learning": 0.3831944444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25 + }, + "language": { + "Chinese": 0.3804916349954618, + "English": 0.35691472205493485 + } + }, + "pass@1": 0.13733333333333334, + "BoN-2": { + "overall_metric": 0.3840460410863504, + "token_length": { + "8k": 0.46700683938890614, + "16k": 0.4284116254260062, + "32k": 0.40307059075358426, + "64k": 0.36117723090379694, + "128k": 0.34663017301008414, + "256k": 0.29797978703572725 + }, + "contextual_requirement": { + "Full": 0.3640278472961897, + "Partial": 0.40952374227382876 + }, + "difficulty": { + "Easy": 0.49937652616615, + "Moderate": 0.3169552390958882, + "Hard": 0.3549456654310522, + "Extreme": 0.32088685138444606 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6767487727424361, + "T2. Sequencing & Structure Reconstruction": 0.6066056866188131, + "T3. Evidence-Grounded QA": 0.525, + "T4. Summarization & Synthesis": 0.5276694633153738, + "T5. Attribution & Citation Alignment": 0.3727088921038869, + "T6. Aggregation & Clustering": 0.3527528047911058, + "T7. Consistency & Compliance Checking": 0.20981326336377637, + "T8. Structured & Numeric Reasoning": 0.11620370370370371, + "T9. Version & Code Diff Analysis": 0.41827137434433015, + "T10. Rule Induction & In-Context Learning": 0.39708333333333334, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336 + }, + "language": { + "Chinese": 0.39280983345013615, + "English": 0.3752822487225656 + } + }, + "pass@2": 0.14733333333333334, + "BoN-3": { + "overall_metric": 0.39322455891947206, + "token_length": { + "8k": 0.4701089334853473, + "16k": 0.43225890052967636, + "32k": 0.4184755108081086, + "64k": 0.3714644148022562, + "128k": 0.35734202547412225, + "256k": 0.3096975684173234 + }, + "contextual_requirement": { + "Full": 0.371530129813719, + "Partial": 0.4208356505086133 + }, + "difficulty": { + "Easy": 0.5099419962893179, + "Moderate": 0.3245920787447158, + "Hard": 0.36491737661035256, + "Extreme": 0.32903764447759715 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6847508372365279, + "T2. Sequencing & Structure Reconstruction": 0.6160001081301075, + "T3. Evidence-Grounded QA": 0.525, + "T4. Summarization & Synthesis": 0.5341098799748382, + "T5. Attribution & Citation Alignment": 0.3970649159521875, + "T6. Aggregation & Clustering": 0.3694259879861735, + "T7. Consistency & Compliance Checking": 0.21278092728735123, + "T8. Structured & Numeric Reasoning": 0.11712962962962964, + "T9. Version & Code Diff Analysis": 0.42895976117834506, + "T10. Rule Induction & In-Context Learning": 0.41375, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.26666666666666666 + }, + "language": { + "Chinese": 0.4014939630827822, + "English": 0.38495515475616277 + } + }, + "pass@3": 0.15333333333333332 +} \ No newline at end of file diff --git a/results/Qwen3-4B-Instruct-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-4B-Instruct-2507/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..be968e8d160a17c157e44d4d98626ce327db1d59 --- /dev/null +++ b/results/Qwen3-4B-Instruct-2507/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.45680682461664274, + "inference_iteration_1_overall_metric": 0.46832062566231697, + "inference_iteration_2_overall_metric": 0.4448543197576111, + "inference_iteration_3_overall_metric": 0.4572455284300009, + "average_token_length_metric": { + "8k": 0.5361902897675453, + "16k": 0.5387135746745756, + "32k": 0.49630855084750314, + "64k": 0.4349221423372099, + "128k": 0.4097160751427924, + "256k": 0.32499031493023295 + }, + "average_contextual_requirement_metric": { + "Full": 0.42152028749347914, + "Partial": 0.5017169627733982 + }, + "average_difficulty_metric": { + "Easy": 0.6782361744837745, + "Moderate": 0.39685509696404564, + "Hard": 0.36961618406661007, + "Extreme": 0.31088681955892267 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7107158896812488, + "T2. Sequencing & Structure Reconstruction": 0.6878841679901458, + "T3. Evidence-Grounded QA": 0.4972222222222221, + "T4. Summarization & Synthesis": 0.49954693637602554, + "T5. Attribution & Citation Alignment": 0.4031404238145398, + "T6. Aggregation & Clustering": 0.45473413170860827, + "T7. Consistency & Compliance Checking": 0.22366919601340218, + "T8. Structured & Numeric Reasoning": 0.39320987654321, + "T9. Version & Code Diff Analysis": 0.48054475011491815, + "T10. Rule Induction & In-Context Learning": 0.47361111111111115, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.35 + }, + "average_language_metric": { + "Chinese": 0.45095633442828276, + "English": 0.46265731480500444 + }, + "BoN-1": { + "overall_metric": 0.46832062566231697, + "token_length": { + "8k": 0.5556419203082478, + "16k": 0.5568269490713018, + "32k": 0.496345489575371, + "64k": 0.43294552638521866, + "128k": 0.40700473005528304, + "256k": 0.36115913857848075 + }, + "contextual_requirement": { + "Full": 0.4357580294575037, + "Partial": 0.5097639299229886 + }, + "difficulty": { + "Easy": 0.6918638477497621, + "Moderate": 0.4331357742500079, + "Hard": 0.3807852518328578, + "Extreme": 0.3040440621366871 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7207239810971894, + "T2. Sequencing & Structure Reconstruction": 0.6905517621196958, + "T3. Evidence-Grounded QA": 0.49166666666666664, + "T4. Summarization & Synthesis": 0.5029601464767286, + "T5. Attribution & Citation Alignment": 0.4001875429352411, + "T6. Aggregation & Clustering": 0.47488345973500956, + "T7. Consistency & Compliance Checking": 0.23593295101774395, + "T8. Structured & Numeric Reasoning": 0.42916666666666664, + "T9. Version & Code Diff Analysis": 0.48280421646542415, + "T10. Rule Induction & In-Context Learning": 0.4718055555555555, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.38333333333333336 + }, + "language": { + "Chinese": 0.4569671050679203, + "English": 0.4796741462567137 + } + }, + "pass@1": 0.22333333333333333, + "BoN-2": { + "overall_metric": 0.5212696813472717, + "token_length": { + "8k": 0.5986675362212036, + "16k": 0.5973077359959412, + "32k": 0.5622230583396595, + "64k": 0.4981746316724617, + "128k": 0.4716644530251574, + "256k": 0.3995806728292114 + }, + "contextual_requirement": { + "Full": 0.48512808383126105, + "Partial": 0.5672680781858331 + }, + "difficulty": { + "Easy": 0.7545680056315479, + "Moderate": 0.47986119164396557, + "Hard": 0.4295378743414953, + "Extreme": 0.35315093917432855 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7512588469355942, + "T2. Sequencing & Structure Reconstruction": 0.7467928367928364, + "T3. Evidence-Grounded QA": 0.55, + "T4. Summarization & Synthesis": 0.51430009825831, + "T5. Attribution & Citation Alignment": 0.4706079149193554, + "T6. Aggregation & Clustering": 0.5469885223100209, + "T7. Consistency & Compliance Checking": 0.2738455117300439, + "T8. Structured & Numeric Reasoning": 0.4750000000000001, + "T9. Version & Code Diff Analysis": 0.5577713799858229, + "T10. Rule Induction & In-Context Learning": 0.5730555555555554, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333 + }, + "language": { + "Chinese": 0.5125792853133213, + "English": 0.5299600773812242 + } + }, + "pass@2": 0.266, + "BoN-3": { + "overall_metric": 0.555104746077484, + "token_length": { + "8k": 0.6338196004796416, + "16k": 0.6317696475388848, + "32k": 0.5989131419373114, + "64k": 0.5293428230237414, + "128k": 0.5149355297222864, + "256k": 0.421847733763042 + }, + "contextual_requirement": { + "Full": 0.5193420147209636, + "Partial": 0.6006209496221481 + }, + "difficulty": { + "Easy": 0.78804822617357, + "Moderate": 0.5160928935941635, + "Hard": 0.46600874198821185, + "Extreme": 0.3840693214028631 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.789852791947241, + "T2. Sequencing & Structure Reconstruction": 0.7723075073075069, + "T3. Evidence-Grounded QA": 0.5833333333333334, + "T4. Summarization & Synthesis": 0.5189479579160801, + "T5. Attribution & Citation Alignment": 0.5055110625297212, + "T6. Aggregation & Clustering": 0.5940840810722463, + "T7. Consistency & Compliance Checking": 0.30869842588741486, + "T8. Structured & Numeric Reasoning": 0.5157407407407408, + "T9. Version & Code Diff Analysis": 0.5819051347174037, + "T10. Rule Induction & In-Context Learning": 0.6091666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.45 + }, + "language": { + "Chinese": 0.544214789512689, + "English": 0.5659947026422806 + } + }, + "pass@3": 0.29933333333333334 +} \ No newline at end of file diff --git a/results/Qwen3-4B-Thinking-2507/thinking_context-224000_bon-3_summary.json b/results/Qwen3-4B-Thinking-2507/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1cb8d09e7a395ff28dd04c41d686c202bd0d5141 --- /dev/null +++ b/results/Qwen3-4B-Thinking-2507/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5009859553721181, + "inference_iteration_1_overall_metric": 0.5048548848776019, + "inference_iteration_2_overall_metric": 0.5024790497866946, + "inference_iteration_3_overall_metric": 0.49562393145205963, + "average_token_length_metric": { + "8k": 0.632687775526695, + "16k": 0.5918320458309317, + "32k": 0.5823146846057027, + "64k": 0.46822470780697634, + "128k": 0.41206384144125535, + "256k": 0.31879267702115177 + }, + "average_contextual_requirement_metric": { + "Full": 0.4729409452430084, + "Partial": 0.5366796046273523 + }, + "average_difficulty_metric": { + "Easy": 0.7052606065866089, + "Moderate": 0.47658986127708997, + "Hard": 0.40985193332208675, + "Extreme": 0.3530946410097064 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7734823342182894, + "T2. Sequencing & Structure Reconstruction": 0.7182270412956683, + "T3. Evidence-Grounded QA": 0.5111111111111112, + "T4. Summarization & Synthesis": 0.5256473893029667, + "T5. Attribution & Citation Alignment": 0.453174269183897, + "T6. Aggregation & Clustering": 0.5035281189490585, + "T7. Consistency & Compliance Checking": 0.28246589806288247, + "T8. Structured & Numeric Reasoning": 0.46820987654321, + "T9. Version & Code Diff Analysis": 0.5332190492994193, + "T10. Rule Induction & In-Context Learning": 0.49949074074074074, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664 + }, + "average_language_metric": { + "Chinese": 0.5038793951541226, + "English": 0.49809251559011675 + }, + "BoN-1": { + "overall_metric": 0.5048548848776019, + "token_length": { + "8k": 0.6415649649637651, + "16k": 0.5932697726935569, + "32k": 0.5781773434671414, + "64k": 0.47574852291888636, + "128k": 0.42188240734810956, + "256k": 0.3184862978741517 + }, + "contextual_requirement": { + "Full": 0.48018245228514805, + "Partial": 0.5362561627225435 + }, + "difficulty": { + "Easy": 0.7166716175061497, + "Moderate": 0.478876954837501, + "Hard": 0.4052133058691331, + "Extreme": 0.3553284598505033 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7738486865765832, + "T2. Sequencing & Structure Reconstruction": 0.7040430674989494, + "T3. Evidence-Grounded QA": 0.5416666666666666, + "T4. Summarization & Synthesis": 0.5245700102853508, + "T5. Attribution & Citation Alignment": 0.46396062856589165, + "T6. Aggregation & Clustering": 0.5033233762229643, + "T7. Consistency & Compliance Checking": 0.2898251412665043, + "T8. Structured & Numeric Reasoning": 0.4486111111111112, + "T9. Version & Code Diff Analysis": 0.5448186695868223, + "T10. Rule Induction & In-Context Learning": 0.5034722222222222, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.39166666666666666 + }, + "language": { + "Chinese": 0.5010388581951326, + "English": 0.5086709115600716 + } + }, + "pass@1": 0.2693333333333333, + "BoN-2": { + "overall_metric": 0.5815730990020301, + "token_length": { + "8k": 0.7031921430610256, + "16k": 0.6820413887129978, + "32k": 0.6633230661301664, + "64k": 0.5513587902856957, + "128k": 0.4989138280568502, + "256k": 0.390609377765447 + }, + "contextual_requirement": { + "Full": 0.5521045772934127, + "Partial": 0.6190784902675451 + }, + "difficulty": { + "Easy": 0.8002564788590355, + "Moderate": 0.5756263992124815, + "Hard": 0.4952494575695323, + "Extreme": 0.40262023427951893 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8347164772660534, + "T2. Sequencing & Structure Reconstruction": 0.7779337329337325, + "T3. Evidence-Grounded QA": 0.625, + "T4. Summarization & Synthesis": 0.5385917063825295, + "T5. Attribution & Citation Alignment": 0.5486563132286816, + "T6. Aggregation & Clustering": 0.5970895337084138, + "T7. Consistency & Compliance Checking": 0.34483711167791187, + "T8. Structured & Numeric Reasoning": 0.563425925925926, + "T9. Version & Code Diff Analysis": 0.629792206301561, + "T10. Rule Induction & In-Context Learning": 0.6152777777777777, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665 + }, + "language": { + "Chinese": 0.5864920535980012, + "English": 0.5766541444060607 + } + }, + "pass@2": 0.3353333333333333, + "BoN-3": { + "overall_metric": 0.6109433025170551, + "token_length": { + "8k": 0.7379808139270423, + "16k": 0.7090640257842541, + "32k": 0.6929789126030277, + "64k": 0.5972496482938813, + "128k": 0.5202041288295823, + "256k": 0.4081822856645453 + }, + "contextual_requirement": { + "Full": 0.587829494832041, + "Partial": 0.6403608759343477 + }, + "difficulty": { + "Easy": 0.8231196664846697, + "Moderate": 0.624752372815826, + "Hard": 0.5190752370733982, + "Extreme": 0.429784307436333 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8563273820048136, + "T2. Sequencing & Structure Reconstruction": 0.8023217060717058, + "T3. Evidence-Grounded QA": 0.625, + "T4. Summarization & Synthesis": 0.545280602732678, + "T5. Attribution & Citation Alignment": 0.5707889491113174, + "T6. Aggregation & Clustering": 0.625445879237853, + "T7. Consistency & Compliance Checking": 0.39806047878615286, + "T8. Structured & Numeric Reasoning": 0.6027777777777777, + "T9. Version & Code Diff Analysis": 0.6498686600622252, + "T10. Rule Induction & In-Context Learning": 0.6477777777777778, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5 + }, + "language": { + "Chinese": 0.6146496797050219, + "English": 0.60723692532909 + } + }, + "pass@3": 0.36466666666666664 +} \ No newline at end of file diff --git a/results/Qwen3-4B/nonthinking_context-120000_bon-3_summary.json b/results/Qwen3-4B/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..7038193def03bbdb5dc71c2fe624024211773e5f --- /dev/null +++ b/results/Qwen3-4B/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3126302932284987, + "inference_iteration_1_overall_metric": 0.31121626849914974, + "inference_iteration_2_overall_metric": 0.314752838275754, + "inference_iteration_3_overall_metric": 0.31192177291059353, + "average_token_length_metric": { + "8k": 0.3741643840311737, + "16k": 0.31633468288420485, + "32k": 0.3451526313630571, + "64k": 0.28786229377190237, + "128k": 0.28715220052628043, + "256k": 0.26511556679437487 + }, + "average_contextual_requirement_metric": { + "Full": 0.30107503486959114, + "Partial": 0.3273369856852913 + }, + "average_difficulty_metric": { + "Easy": 0.404275621826777, + "Moderate": 0.23333879206992347, + "Hard": 0.30100444227102663, + "Extreme": 0.27195317135842567 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6097768436246711, + "T2. Sequencing & Structure Reconstruction": 0.5604218483789652, + "T3. Evidence-Grounded QA": 0.4305555555555555, + "T4. Summarization & Synthesis": 0.5071899925945303, + "T5. Attribution & Citation Alignment": 0.2003512792090914, + "T6. Aggregation & Clustering": 0.31485071719679697, + "T7. Consistency & Compliance Checking": 0.15277263547227143, + "T8. Structured & Numeric Reasoning": 0.07777777777777777, + "T9. Version & Code Diff Analysis": 0.258888857730561, + "T10. Rule Induction & In-Context Learning": 0.3309259259259259, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668 + }, + "average_language_metric": { + "Chinese": 0.30923682893499727, + "English": 0.3160237575220013 + }, + "BoN-1": { + "overall_metric": 0.31121626849914974, + "token_length": { + "8k": 0.36915288725425155, + "16k": 0.3139436770180253, + "32k": 0.35672840494575175, + "64k": 0.2822737049825353, + "128k": 0.2867451109679906, + "256k": 0.25845382582634246 + }, + "contextual_requirement": { + "Full": 0.2946313817206056, + "Partial": 0.33232430621729636 + }, + "difficulty": { + "Easy": 0.40768136684677136, + "Moderate": 0.22509851498282202, + "Hard": 0.3024334030525033, + "Extreme": 0.26787559003720773 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6106663615330956, + "T2. Sequencing & Structure Reconstruction": 0.5653203286176459, + "T3. Evidence-Grounded QA": 0.4166666666666667, + "T4. Summarization & Synthesis": 0.5026988695288178, + "T5. Attribution & Citation Alignment": 0.1941123340186669, + "T6. Aggregation & Clustering": 0.30605814169863926, + "T7. Consistency & Compliance Checking": 0.153482524546797, + "T8. Structured & Numeric Reasoning": 0.08148148148148147, + "T9. Version & Code Diff Analysis": 0.2592055742840997, + "T10. Rule Induction & In-Context Learning": 0.33, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2 + }, + "language": { + "Chinese": 0.30874607113427527, + "English": 0.31368646586402377 + } + }, + "pass@1": 0.10266666666666667, + "BoN-2": { + "overall_metric": 0.33311542718948595, + "token_length": { + "8k": 0.390054030159741, + "16k": 0.32615430525928374, + "32k": 0.3817681434674135, + "64k": 0.3127897381386726, + "128k": 0.3077977855264991, + "256k": 0.28012856058530294 + }, + "contextual_requirement": { + "Full": 0.31637653425658996, + "Partial": 0.3544194727404438 + }, + "difficulty": { + "Easy": 0.4246995397112217, + "Moderate": 0.25227259034564015, + "Hard": 0.32260424775606733, + "Extreme": 0.29279217370914995 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6411601457898036, + "T2. Sequencing & Structure Reconstruction": 0.5895421729520939, + "T3. Evidence-Grounded QA": 0.44166666666666665, + "T4. Summarization & Synthesis": 0.5176163934525975, + "T5. Attribution & Citation Alignment": 0.22133852710308327, + "T6. Aggregation & Clustering": 0.34506363663570866, + "T7. Consistency & Compliance Checking": 0.17423574705889155, + "T8. Structured & Numeric Reasoning": 0.08148148148148147, + "T9. Version & Code Diff Analysis": 0.299919858362423, + "T10. Rule Induction & In-Context Learning": 0.3515277777777778, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2 + }, + "language": { + "Chinese": 0.33158121218981956, + "English": 0.33464964218915183 + } + }, + "pass@2": 0.10933333333333334, + "BoN-3": { + "overall_metric": 0.3459735709442073, + "token_length": { + "8k": 0.4018096372567551, + "16k": 0.34146668092259574, + "32k": 0.39270536315408494, + "64k": 0.3308531716059115, + "128k": 0.31620040712911074, + "256k": 0.29280616559678363 + }, + "contextual_requirement": { + "Full": 0.3294609526166743, + "Partial": 0.3669896306337945 + }, + "difficulty": { + "Easy": 0.43779012322510574, + "Moderate": 0.27067727963707533, + "Hard": 0.33120650215495034, + "Extreme": 0.3045479638270778 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.646933160923283, + "T2. Sequencing & Structure Reconstruction": 0.606818823765589, + "T3. Evidence-Grounded QA": 0.45, + "T4. Summarization & Synthesis": 0.5261451995899935, + "T5. Attribution & Citation Alignment": 0.23085472354458936, + "T6. Aggregation & Clustering": 0.3665833344388899, + "T7. Consistency & Compliance Checking": 0.18478135703392412, + "T8. Structured & Numeric Reasoning": 0.08703703703703704, + "T9. Version & Code Diff Analysis": 0.31728735843657757, + "T10. Rule Induction & In-Context Learning": 0.36402777777777773, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.225 + }, + "language": { + "Chinese": 0.3427214477984433, + "English": 0.3492256940899709 + } + }, + "pass@3": 0.11466666666666667 +} \ No newline at end of file diff --git a/results/Qwen3-4B/thinking_context-120000_bon-3_summary.json b/results/Qwen3-4B/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ae0db6f0903c73a0aabaa0e44ca5ad47ad804067 --- /dev/null +++ b/results/Qwen3-4B/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 17, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.40820377372546796, + "inference_iteration_1_overall_metric": 0.40990347673882405, + "inference_iteration_2_overall_metric": 0.39860881288038574, + "inference_iteration_3_overall_metric": 0.41609903155719674, + "average_token_length_metric": { + "8k": 0.518261214308827, + "16k": 0.45885281352828067, + "32k": 0.45766504375868816, + "64k": 0.34332890432195795, + "128k": 0.3677707675199754, + "256k": 0.30334389891508334 + }, + "average_contextual_requirement_metric": { + "Full": 0.38531613330536113, + "Partial": 0.4373334978965144 + }, + "average_difficulty_metric": { + "Easy": 0.5984772560988485, + "Moderate": 0.3126595325360281, + "Hard": 0.34066117314091837, + "Extreme": 0.3068862254725463 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6569147883782315, + "T2. Sequencing & Structure Reconstruction": 0.6466218735859872, + "T3. Evidence-Grounded QA": 0.45, + "T4. Summarization & Synthesis": 0.5084591641143323, + "T5. Attribution & Citation Alignment": 0.33382002543718364, + "T6. Aggregation & Clustering": 0.4088451076318483, + "T7. Consistency & Compliance Checking": 0.19607244984784905, + "T8. Structured & Numeric Reasoning": 0.35447530864197535, + "T9. Version & Code Diff Analysis": 0.412734613462707, + "T10. Rule Induction & In-Context Learning": 0.41046296296296286, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2444444444444444 + }, + "average_language_metric": { + "Chinese": 0.3969713362374892, + "English": 0.41943621121344793 + }, + "BoN-1": { + "overall_metric": 0.40990347673882405, + "token_length": { + "8k": 0.5073935877667868, + "16k": 0.4625290937943797, + "32k": 0.46311499347751756, + "64k": 0.3585619661835362, + "128k": 0.3674845287723082, + "256k": 0.30033669043841327 + }, + "contextual_requirement": { + "Full": 0.3859794000653783, + "Partial": 0.4403523015959356 + }, + "difficulty": { + "Easy": 0.5960294211438244, + "Moderate": 0.3327872754241752, + "Hard": 0.3381491070256522, + "Extreme": 0.30379186270207253 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.663842843941683, + "T2. Sequencing & Structure Reconstruction": 0.6545791245791246, + "T3. Evidence-Grounded QA": 0.475, + "T4. Summarization & Synthesis": 0.5062086959994923, + "T5. Attribution & Citation Alignment": 0.3264498285141694, + "T6. Aggregation & Clustering": 0.405516164076872, + "T7. Consistency & Compliance Checking": 0.19553549353623603, + "T8. Structured & Numeric Reasoning": 0.36342592592592593, + "T9. Version & Code Diff Analysis": 0.4201632575589423, + "T10. Rule Induction & In-Context Learning": 0.3725, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336 + }, + "language": { + "Chinese": 0.3981684556698, + "English": 0.42163849780784673 + } + }, + "pass@1": 0.18, + "BoN-2": { + "overall_metric": 0.46589831041740126, + "token_length": { + "8k": 0.5699928269386603, + "16k": 0.5343488480787704, + "32k": 0.5264237759640333, + "64k": 0.39895039817299616, + "128k": 0.40892582534920624, + "256k": 0.3567481880007445 + }, + "contextual_requirement": { + "Full": 0.43584342446033286, + "Partial": 0.5041499834536717 + }, + "difficulty": { + "Easy": 0.6647834735405885, + "Moderate": 0.37447270682167993, + "Hard": 0.39477207070516285, + "Extreme": 0.3547956606233666 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7038416454068085, + "T2. Sequencing & Structure Reconstruction": 0.7210729085729087, + "T3. Evidence-Grounded QA": 0.5, + "T4. Summarization & Synthesis": 0.5215419635980021, + "T5. Attribution & Citation Alignment": 0.40511713894417817, + "T6. Aggregation & Clustering": 0.4739903039012416, + "T7. Consistency & Compliance Checking": 0.24396925466217162, + "T8. Structured & Numeric Reasoning": 0.4175925925925927, + "T9. Version & Code Diff Analysis": 0.48896588585050343, + "T10. Rule Induction & In-Context Learning": 0.44652777777777775, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333 + }, + "language": { + "Chinese": 0.46255576530710224, + "English": 0.4692408555277019 + } + }, + "pass@2": 0.216, + "BoN-3": { + "overall_metric": 0.504738390514233, + "token_length": { + "8k": 0.6204219657283213, + "16k": 0.5577361733074144, + "32k": 0.5696181517494681, + "64k": 0.43160873083607526, + "128k": 0.464000536292567, + "256k": 0.3850447851715533 + }, + "contextual_requirement": { + "Full": 0.47762538119520404, + "Partial": 0.5392458569202704 + }, + "difficulty": { + "Easy": 0.7265559994106623, + "Moderate": 0.4074392024047854, + "Hard": 0.4294021958454043, + "Extreme": 0.37513734082069433 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7403624370535461, + "T2. Sequencing & Structure Reconstruction": 0.7454373866873865, + "T3. Evidence-Grounded QA": 0.5333333333333333, + "T4. Summarization & Synthesis": 0.5277914424080392, + "T5. Attribution & Citation Alignment": 0.4422798028876217, + "T6. Aggregation & Clustering": 0.5201921178442834, + "T7. Consistency & Compliance Checking": 0.26324271269098753, + "T8. Structured & Numeric Reasoning": 0.4777777777777778, + "T9. Version & Code Diff Analysis": 0.5316787888106357, + "T10. Rule Induction & In-Context Learning": 0.5215277777777778, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.375 + }, + "language": { + "Chinese": 0.49773088107965896, + "English": 0.5117458999488075 + } + }, + "pass@3": 0.24666666666666667 +} \ No newline at end of file diff --git a/results/Qwen3-8B/nonthinking_context-120000_bon-3_summary.json b/results/Qwen3-8B/nonthinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..0b30b38eab6700cbd762d670ff679b683fc6a281 --- /dev/null +++ b/results/Qwen3-8B/nonthinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.3341263002850569, + "inference_iteration_1_overall_metric": 0.33209862396824386, + "inference_iteration_2_overall_metric": 0.33541946267252254, + "inference_iteration_3_overall_metric": 0.3348608142144041, + "average_token_length_metric": { + "8k": 0.3784198847034242, + "16k": 0.37953608827547447, + "32k": 0.36499771721065843, + "64k": 0.28008358934905153, + "128k": 0.3132115847486681, + "256k": 0.2885089374230642 + }, + "average_contextual_requirement_metric": { + "Full": 0.3137072166929107, + "Partial": 0.3601142248568793 + }, + "average_difficulty_metric": { + "Easy": 0.428559130222621, + "Moderate": 0.25195385511657276, + "Hard": 0.31092306311632556, + "Extreme": 0.2998920915703961 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.6412648134912415, + "T2. Sequencing & Structure Reconstruction": 0.6145308824378115, + "T3. Evidence-Grounded QA": 0.46388888888888896, + "T4. Summarization & Synthesis": 0.5163026571908423, + "T5. Attribution & Citation Alignment": 0.2523030174294647, + "T6. Aggregation & Clustering": 0.3470095028864795, + "T7. Consistency & Compliance Checking": 0.1606584732901947, + "T8. Structured & Numeric Reasoning": 0.07453703703703704, + "T9. Version & Code Diff Analysis": 0.2941939372673569, + "T10. Rule Induction & In-Context Learning": 0.30689814814814814, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.2138888888888889 + }, + "average_language_metric": { + "Chinese": 0.33785469604432067, + "English": 0.3303979045257931 + }, + "BoN-1": { + "overall_metric": 0.33209862396824386, + "token_length": { + "8k": 0.37642064292707555, + "16k": 0.3763334563411937, + "32k": 0.3747061763092641, + "64k": 0.26888181342117645, + "128k": 0.2968232384609332, + "256k": 0.2994264163498193 + }, + "contextual_requirement": { + "Full": 0.31079609686291426, + "Partial": 0.35921093119320874 + }, + "difficulty": { + "Easy": 0.42792762230121, + "Moderate": 0.25490326136513564, + "Hard": 0.30852284836466365, + "Extreme": 0.2933106279347054 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6344283740437071, + "T2. Sequencing & Structure Reconstruction": 0.627808712385359, + "T3. Evidence-Grounded QA": 0.45, + "T4. Summarization & Synthesis": 0.5165407101035675, + "T5. Attribution & Citation Alignment": 0.2452312681950123, + "T6. Aggregation & Clustering": 0.3367099858810609, + "T7. Consistency & Compliance Checking": 0.15639989872237942, + "T8. Structured & Numeric Reasoning": 0.08287037037037037, + "T9. Version & Code Diff Analysis": 0.30783668574801865, + "T10. Rule Induction & In-Context Learning": 0.29708333333333337, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334 + }, + "language": { + "Chinese": 0.3358334164162984, + "English": 0.3283638315201894 + } + }, + "pass@1": 0.11333333333333333, + "BoN-2": { + "overall_metric": 0.3675196408761814, + "token_length": { + "8k": 0.3886475538954706, + "16k": 0.41257816091326605, + "32k": 0.41159385509736474, + "64k": 0.31664455480081405, + "128k": 0.35128941591866764, + "256k": 0.32436430463150734 + }, + "contextual_requirement": { + "Full": 0.34706677159026683, + "Partial": 0.39355056542189176 + }, + "difficulty": { + "Easy": 0.4703351630161875, + "Moderate": 0.29345326870943717, + "Hard": 0.341763065112941, + "Extreme": 0.32045543696773376 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6724018408268643, + "T2. Sequencing & Structure Reconstruction": 0.6527223172989636, + "T3. Evidence-Grounded QA": 0.5166666666666667, + "T4. Summarization & Synthesis": 0.5284447990610109, + "T5. Attribution & Citation Alignment": 0.2918155891981263, + "T6. Aggregation & Clustering": 0.3951103080046706, + "T7. Consistency & Compliance Checking": 0.17493867106904193, + "T8. Structured & Numeric Reasoning": 0.09398148148148147, + "T9. Version & Code Diff Analysis": 0.33853749595673677, + "T10. Rule Induction & In-Context Learning": 0.33902777777777776, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336 + }, + "language": { + "Chinese": 0.36969778846765494, + "English": 0.36534149328470883 + } + }, + "pass@2": 0.13133333333333333, + "BoN-3": { + "overall_metric": 0.3871899208655039, + "token_length": { + "8k": 0.40912701437278987, + "16k": 0.4254527731119934, + "32k": 0.43307631053998147, + "64k": 0.3370765423947205, + "128k": 0.3753522812020257, + "256k": 0.3430546035715128 + }, + "contextual_requirement": { + "Full": 0.36557204234325574, + "Partial": 0.4147035844392751 + }, + "difficulty": { + "Easy": 0.48861463017463347, + "Moderate": 0.30875721939034645, + "Hard": 0.3637658481982982, + "Extreme": 0.3429851432294641 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6857312280559049, + "T2. Sequencing & Structure Reconstruction": 0.6763734162000631, + "T3. Evidence-Grounded QA": 0.5416666666666666, + "T4. Summarization & Synthesis": 0.5345601158557589, + "T5. Attribution & Citation Alignment": 0.32057216845982056, + "T6. Aggregation & Clustering": 0.4271227501408007, + "T7. Consistency & Compliance Checking": 0.19336910203847105, + "T8. Structured & Numeric Reasoning": 0.11620370370370368, + "T9. Version & Code Diff Analysis": 0.3618993039783438, + "T10. Rule Induction & In-Context Learning": 0.3556944444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336 + }, + "language": { + "Chinese": 0.3912716913429664, + "English": 0.38310815038804175 + } + }, + "pass@3": 0.14133333333333334 +} \ No newline at end of file diff --git a/results/Qwen3-8B/thinking_context-120000_bon-3_summary.json b/results/Qwen3-8B/thinking_context-120000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f6b23d23be76ce6d63a792692c4cd059287da569 --- /dev/null +++ b/results/Qwen3-8B/thinking_context-120000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 17, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.4433948911173604, + "inference_iteration_1_overall_metric": 0.4399572436291822, + "inference_iteration_2_overall_metric": 0.44495624974340414, + "inference_iteration_3_overall_metric": 0.4452711799794956, + "average_token_length_metric": { + "8k": 0.5383434714089156, + "16k": 0.49806741885943173, + "32k": 0.5048813586036265, + "64k": 0.3824635395778342, + "128k": 0.38213602550979975, + "256k": 0.3544775327445564 + }, + "average_contextual_requirement_metric": { + "Full": 0.4169048892199721, + "Partial": 0.47710943898676395 + }, + "average_difficulty_metric": { + "Easy": 0.670801806919622, + "Moderate": 0.3015890712418365, + "Hard": 0.3710263419639324, + "Extreme": 0.334954571191627 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.691601826560709, + "T2. Sequencing & Structure Reconstruction": 0.6929147615138178, + "T3. Evidence-Grounded QA": 0.46944444444444433, + "T4. Summarization & Synthesis": 0.5153536178825013, + "T5. Attribution & Citation Alignment": 0.343764413672508, + "T6. Aggregation & Clustering": 0.4465373768548264, + "T7. Consistency & Compliance Checking": 0.21215633369348813, + "T8. Structured & Numeric Reasoning": 0.43070987654320986, + "T9. Version & Code Diff Analysis": 0.48094613870018443, + "T10. Rule Induction & In-Context Learning": 0.3948611111111111, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3194444444444445 + }, + "average_language_metric": { + "Chinese": 0.440767848226732, + "English": 0.4460219340079889 + }, + "BoN-1": { + "overall_metric": 0.4399572436291822, + "token_length": { + "8k": 0.5103007254314137, + "16k": 0.47828585706153987, + "32k": 0.5290020762560991, + "64k": 0.3888098595475681, + "128k": 0.3734417981216912, + "256k": 0.3599031453567786 + }, + "contextual_requirement": { + "Full": 0.41724803011166967, + "Partial": 0.46885987901510623 + }, + "difficulty": { + "Easy": 0.6690339330094581, + "Moderate": 0.2916811720097684, + "Hard": 0.36205062647700836, + "Extreme": 0.33757504538758054 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.6812085604476644, + "T2. Sequencing & Structure Reconstruction": 0.6967399267399262, + "T3. Evidence-Grounded QA": 0.475, + "T4. Summarization & Synthesis": 0.5146666315316712, + "T5. Attribution & Citation Alignment": 0.3410958151689159, + "T6. Aggregation & Clustering": 0.44159343486389535, + "T7. Consistency & Compliance Checking": 0.19800047364593318, + "T8. Structured & Numeric Reasoning": 0.43009259259259264, + "T9. Version & Code Diff Analysis": 0.4938637487118504, + "T10. Rule Induction & In-Context Learning": 0.3506944444444444, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667 + }, + "language": { + "Chinese": 0.4330192432359592, + "English": 0.4468952440224042 + } + }, + "pass@1": 0.20066666666666666, + "BoN-2": { + "overall_metric": 0.5164964536465051, + "token_length": { + "8k": 0.5863358411112154, + "16k": 0.566411710517629, + "32k": 0.5835338435058043, + "64k": 0.4844792024621684, + "128k": 0.45010561125903437, + "256k": 0.42811251302317804 + }, + "contextual_requirement": { + "Full": 0.4885834016176216, + "Partial": 0.5520221562287199 + }, + "difficulty": { + "Easy": 0.779988512571463, + "Moderate": 0.36284391366821606, + "Hard": 0.43271065222226107, + "Extreme": 0.3838066996999193 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7478708806546476, + "T2. Sequencing & Structure Reconstruction": 0.7636960261960256, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5296118538392725, + "T5. Attribution & Citation Alignment": 0.4258014929372265, + "T6. Aggregation & Clustering": 0.5129489242363339, + "T7. Consistency & Compliance Checking": 0.2736301164132571, + "T8. Structured & Numeric Reasoning": 0.5393518518518519, + "T9. Version & Code Diff Analysis": 0.5663013004241957, + "T10. Rule Induction & In-Context Learning": 0.48402777777777767, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.38333333333333336 + }, + "language": { + "Chinese": 0.5161513606389965, + "English": 0.5168415466540133 + } + }, + "pass@2": 0.25733333333333336, + "BoN-3": { + "overall_metric": 0.5567263766803745, + "token_length": { + "8k": 0.6362828635230333, + "16k": 0.6080554567904433, + "32k": 0.6268068570169055, + "64k": 0.52544445261354, + "128k": 0.4829862086238965, + "256k": 0.46078242151443083 + }, + "contextual_requirement": { + "Full": 0.5197805205774073, + "Partial": 0.603748375356879 + }, + "difficulty": { + "Easy": 0.8403187597200309, + "Moderate": 0.39459843208319806, + "Hard": 0.46151769338036713, + "Extreme": 0.4150871876739478 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.785936662865175, + "T2. Sequencing & Structure Reconstruction": 0.779274429274429, + "T3. Evidence-Grounded QA": 0.625, + "T4. Summarization & Synthesis": 0.5363098621370127, + "T5. Attribution & Citation Alignment": 0.4698549989248756, + "T6. Aggregation & Clustering": 0.5494079005598616, + "T7. Consistency & Compliance Checking": 0.3203866786545349, + "T8. Structured & Numeric Reasoning": 0.5949074074074073, + "T9. Version & Code Diff Analysis": 0.5913452198149313, + "T10. Rule Induction & In-Context Learning": 0.5493055555555556, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.425 + }, + "language": { + "Chinese": 0.5523930398850595, + "English": 0.5610597134756903 + } + }, + "pass@3": 0.29133333333333333 +} \ No newline at end of file diff --git a/results/Qwen3-Next-80B-A3B-Instruct/nonthinking_context-224000_bon-3_summary.json b/results/Qwen3-Next-80B-A3B-Instruct/nonthinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ed60182cc22b951c5d266b8975bb864780780a32 --- /dev/null +++ b/results/Qwen3-Next-80B-A3B-Instruct/nonthinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.5153699428141112, + "inference_iteration_1_overall_metric": 0.5166303102959966, + "inference_iteration_2_overall_metric": 0.5157188374753582, + "inference_iteration_3_overall_metric": 0.5137606806709764, + "average_token_length_metric": { + "8k": 0.5444390450433558, + "16k": 0.5325817607161717, + "32k": 0.55004125822099, + "64k": 0.5168441600849382, + "128k": 0.48782896231770906, + "256k": 0.4604844705014988 + }, + "average_contextual_requirement_metric": { + "Full": 0.4773995367563536, + "Partial": 0.5636959141603475 + }, + "average_difficulty_metric": { + "Easy": 0.6525257555815487, + "Moderate": 0.48765154528061855, + "Hard": 0.4992512003996272, + "Extreme": 0.3939150148392464 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.7836410129145194, + "T2. Sequencing & Structure Reconstruction": 0.7601097359430687, + "T3. Evidence-Grounded QA": 0.5722222222222223, + "T4. Summarization & Synthesis": 0.5429874590199666, + "T5. Attribution & Citation Alignment": 0.5936748905924045, + "T6. Aggregation & Clustering": 0.47375423753978874, + "T7. Consistency & Compliance Checking": 0.3365343156781269, + "T8. Structured & Numeric Reasoning": 0.2592592592592593, + "T9. Version & Code Diff Analysis": 0.621000579101771, + "T10. Rule Induction & In-Context Learning": 0.4863888888888888, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.4777777777777778 + }, + "average_language_metric": { + "Chinese": 0.5268601573059153, + "English": 0.5038797283223065 + }, + "BoN-1": { + "overall_metric": 0.5166303102959966, + "token_length": { + "8k": 0.5438980115785723, + "16k": 0.5379344457831181, + "32k": 0.5499773393749967, + "64k": 0.5239673563810381, + "128k": 0.48407264135846406, + "256k": 0.45993206729979164 + }, + "contextual_requirement": { + "Full": 0.4777093693790403, + "Partial": 0.566166053281215 + }, + "difficulty": { + "Easy": 0.6553713573709563, + "Moderate": 0.48623131248023554, + "Hard": 0.4957585702580054, + "Extreme": 0.398321352454188 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7850605454303755, + "T2. Sequencing & Structure Reconstruction": 0.7605132367632368, + "T3. Evidence-Grounded QA": 0.5666666666666667, + "T4. Summarization & Synthesis": 0.5438750921908875, + "T5. Attribution & Citation Alignment": 0.5978797516792373, + "T6. Aggregation & Clustering": 0.4781550130242737, + "T7. Consistency & Compliance Checking": 0.3404060026855857, + "T8. Structured & Numeric Reasoning": 0.2601851851851852, + "T9. Version & Code Diff Analysis": 0.6199309512936556, + "T10. Rule Induction & In-Context Learning": 0.49083333333333334, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.475 + }, + "language": { + "Chinese": 0.5240393165444767, + "English": 0.5092213040475176 + } + }, + "pass@1": 0.23933333333333334, + "BoN-2": { + "overall_metric": 0.5307678792049177, + "token_length": { + "8k": 0.5566045118760491, + "16k": 0.5523528870760122, + "32k": 0.565064007850647, + "64k": 0.5329215232825427, + "128k": 0.5070346589234701, + "256k": 0.47062968622078627 + }, + "contextual_requirement": { + "Full": 0.4936133806201656, + "Partial": 0.5780554228582404 + }, + "difficulty": { + "Easy": 0.6646977492242093, + "Moderate": 0.50755257479461, + "Hard": 0.5154147759439242, + "Extreme": 0.40938622572243505 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.794910051304565, + "T2. Sequencing & Structure Reconstruction": 0.7644632682132678, + "T3. Evidence-Grounded QA": 0.6, + "T4. Summarization & Synthesis": 0.5529787281614388, + "T5. Attribution & Citation Alignment": 0.6065695384216199, + "T6. Aggregation & Clustering": 0.4918158422307883, + "T7. Consistency & Compliance Checking": 0.35643719860134077, + "T8. Structured & Numeric Reasoning": 0.2837962962962963, + "T9. Version & Code Diff Analysis": 0.6413528982679426, + "T10. Rule Induction & In-Context Learning": 0.49291666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334 + }, + "language": { + "Chinese": 0.5373954743254495, + "English": 0.5241402840843874 + } + }, + "pass@2": 0.25133333333333335, + "BoN-3": { + "overall_metric": 0.5363452114492876, + "token_length": { + "8k": 0.5662343509403671, + "16k": 0.5541053906547336, + "32k": 0.5714052653352649, + "64k": 0.5365347531934883, + "128k": 0.5125057318235955, + "256k": 0.477285776748277 + }, + "contextual_requirement": { + "Full": 0.49978312289665056, + "Partial": 0.5828787786980997 + }, + "difficulty": { + "Easy": 0.6709349653337122, + "Moderate": 0.5142353747150851, + "Hard": 0.5181865994276285, + "Extreme": 0.4153573030814503 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.7957673791297515, + "T2. Sequencing & Structure Reconstruction": 0.7681934269434265, + "T3. Evidence-Grounded QA": 0.6, + "T4. Summarization & Synthesis": 0.5577840086379114, + "T5. Attribution & Citation Alignment": 0.6152937974264557, + "T6. Aggregation & Clustering": 0.49800879082542016, + "T7. Consistency & Compliance Checking": 0.3608232159498475, + "T8. Structured & Numeric Reasoning": 0.30046296296296293, + "T9. Version & Code Diff Analysis": 0.6437507430378715, + "T10. Rule Induction & In-Context Learning": 0.49291666666666667, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.49166666666666664 + }, + "language": { + "Chinese": 0.5451725427917254, + "English": 0.5275178801068509 + } + }, + "pass@3": 0.25533333333333336 +} \ No newline at end of file diff --git a/results/Qwen3-Next-80B-A3B-Instruct/thinking_context-224000_bon-3_summary.json b/results/Qwen3-Next-80B-A3B-Instruct/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..2034ab9a0e6eee1f50f647f9ee32107e3dbf7f66 --- /dev/null +++ b/results/Qwen3-Next-80B-A3B-Instruct/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 0, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.6075716303534897, + "inference_iteration_1_overall_metric": 0.6011630214015442, + "inference_iteration_2_overall_metric": 0.6048517302521914, + "inference_iteration_3_overall_metric": 0.6167001394067311, + "average_token_length_metric": { + "8k": 0.6538122003152632, + "16k": 0.6386361113417942, + "32k": 0.6075792549321678, + "64k": 0.6198997893946241, + "128k": 0.5589567834361618, + "256k": 0.5665456427009256 + }, + "average_contextual_requirement_metric": { + "Full": 0.5584048701107478, + "Partial": 0.6701475070260704 + }, + "average_difficulty_metric": { + "Easy": 0.8084452986372921, + "Moderate": 0.6415908415050756, + "Hard": 0.5474008359025712, + "Extreme": 0.4047004927642043 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8543788758470637, + "T2. Sequencing & Structure Reconstruction": 0.805848765432098, + "T3. Evidence-Grounded QA": 0.5944444444444446, + "T4. Summarization & Synthesis": 0.5231378715129659, + "T5. Attribution & Citation Alignment": 0.6489902589376487, + "T6. Aggregation & Clustering": 0.5332869283079988, + "T7. Consistency & Compliance Checking": 0.3951514391075655, + "T8. Structured & Numeric Reasoning": 0.6266975308641974, + "T9. Version & Code Diff Analysis": 0.7189839084173466, + "T10. Rule Induction & In-Context Learning": 0.5828240740740741, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333334 + }, + "average_language_metric": { + "Chinese": 0.6121820357866055, + "English": 0.6029612249203741 + }, + "BoN-1": { + "overall_metric": 0.6011630214015442, + "token_length": { + "8k": 0.6624953563895736, + "16k": 0.6300056666980762, + "32k": 0.5997782992095927, + "64k": 0.6121693133437759, + "128k": 0.5635843711638716, + "256k": 0.5389451216043789 + }, + "contextual_requirement": { + "Full": 0.54428292048123, + "Partial": 0.67355587711831 + }, + "difficulty": { + "Easy": 0.800956148011297, + "Moderate": 0.6440797168615464, + "Hard": 0.5307293137505391, + "Extreme": 0.4003724066226809 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8631061933334538, + "T2. Sequencing & Structure Reconstruction": 0.7992355329855325, + "T3. Evidence-Grounded QA": 0.625, + "T4. Summarization & Synthesis": 0.5237860950102718, + "T5. Attribution & Citation Alignment": 0.6286606136418816, + "T6. Aggregation & Clustering": 0.523201899147464, + "T7. Consistency & Compliance Checking": 0.4046354997828942, + "T8. Structured & Numeric Reasoning": 0.625, + "T9. Version & Code Diff Analysis": 0.6992432341526317, + "T10. Rule Induction & In-Context Learning": 0.52125, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.525 + }, + "language": { + "Chinese": 0.6058889746353506, + "English": 0.5964370681677398 + } + }, + "pass@1": 0.3546666666666667, + "BoN-2": { + "overall_metric": 0.6610666781489654, + "token_length": { + "8k": 0.7056825593069123, + "16k": 0.7025120747104457, + "32k": 0.6769789260877618, + "64k": 0.6679511379675653, + "128k": 0.5958338915375376, + "256k": 0.6174414792835736 + }, + "contextual_requirement": { + "Full": 0.6082650541075186, + "Partial": 0.7282687451108096 + }, + "difficulty": { + "Easy": 0.8495933138250492, + "Moderate": 0.7152672047846778, + "Hard": 0.6173703518012653, + "Extreme": 0.44764496842372237 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9026227557629393, + "T2. Sequencing & Structure Reconstruction": 0.8607230269730269, + "T3. Evidence-Grounded QA": 0.6833333333333333, + "T4. Summarization & Synthesis": 0.5332664489935818, + "T5. Attribution & Citation Alignment": 0.708058003290001, + "T6. Aggregation & Clustering": 0.6005714039047372, + "T7. Consistency & Compliance Checking": 0.4659994086764647, + "T8. Structured & Numeric Reasoning": 0.6902777777777778, + "T9. Version & Code Diff Analysis": 0.7525570229707225, + "T10. Rule Induction & In-Context Learning": 0.6125, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.575 + }, + "language": { + "Chinese": 0.6606066535667863, + "English": 0.6615267027311469 + } + }, + "pass@2": 0.4246666666666667, + "BoN-3": { + "overall_metric": 0.7007864030989018, + "token_length": { + "8k": 0.7348618958536445, + "16k": 0.7307241128589752, + "32k": 0.7095991520281644, + "64k": 0.7158858832025538, + "128k": 0.6481921445010119, + "256k": 0.6654552301490679 + }, + "contextual_requirement": { + "Full": 0.6551104064886832, + "Partial": 0.7589194896937296 + }, + "difficulty": { + "Easy": 0.8851870155041537, + "Moderate": 0.7611742624890716, + "Hard": 0.6700836828309551, + "Extreme": 0.47928617494969256 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9126235121225394, + "T2. Sequencing & Structure Reconstruction": 0.8791730029230027, + "T3. Evidence-Grounded QA": 0.7166666666666667, + "T4. Summarization & Synthesis": 0.5379060712223888, + "T5. Attribution & Citation Alignment": 0.7258679135396275, + "T6. Aggregation & Clustering": 0.6480317213650545, + "T7. Consistency & Compliance Checking": 0.4953056552621231, + "T8. Structured & Numeric Reasoning": 0.7416666666666667, + "T9. Version & Code Diff Analysis": 0.7806423628768492, + "T10. Rule Induction & In-Context Learning": 0.7211111111111113, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333 + }, + "language": { + "Chinese": 0.706235211076793, + "English": 0.6953375951210139 + } + }, + "pass@3": 0.47533333333333333 +} \ No newline at end of file diff --git a/results/Qwen3-Next-80B-A3B-Thinking/thinking_context-224000_bon-3_summary.json b/results/Qwen3-Next-80B-A3B-Thinking/thinking_context-224000_bon-3_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ddc3c1067ea62d9dd515db8be2211b587e71d714 --- /dev/null +++ b/results/Qwen3-Next-80B-A3B-Thinking/thinking_context-224000_bon-3_summary.json @@ -0,0 +1,164 @@ +{ + "date": "2025-12-08", + "total_questions_num": 1500, + "inference_iterations": 3, + "total_samples_num": 4500, + "fail_samples_num": 34, + "inference_inconsistent_samples_num": 0, + "average_overall_metric": 0.6395225139139809, + "inference_iteration_1_overall_metric": 0.6341195678747302, + "inference_iteration_2_overall_metric": 0.6408716243063108, + "inference_iteration_3_overall_metric": 0.6435763495608982, + "average_token_length_metric": { + "8k": 0.7086149367852088, + "16k": 0.6851138026926965, + "32k": 0.6504005097985216, + "64k": 0.6399676067396364, + "128k": 0.6081903722855004, + "256k": 0.544847855182321 + }, + "average_contextual_requirement_metric": { + "Full": 0.5847832398021344, + "Partial": 0.7091906809654226 + }, + "average_difficulty_metric": { + "Easy": 0.8189651329867498, + "Moderate": 0.6922832177824265, + "Hard": 0.6146117453677709, + "Extreme": 0.4246589373203432 + }, + "average_primary_task_metric": { + "T1. Retrieval & Ranking": 0.8753831884628448, + "T2. Sequencing & Structure Reconstruction": 0.836079250742492, + "T3. Evidence-Grounded QA": 0.6472222222222221, + "T4. Summarization & Synthesis": 0.5559024180310073, + "T5. Attribution & Citation Alignment": 0.6831202809594916, + "T6. Aggregation & Clustering": 0.6046103225117172, + "T7. Consistency & Compliance Checking": 0.4200843891775127, + "T8. Structured & Numeric Reasoning": 0.6665123456790124, + "T9. Version & Code Diff Analysis": 0.7482171811580411, + "T10. Rule Induction & In-Context Learning": 0.5751851851851851, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5361111111111112 + }, + "average_language_metric": { + "Chinese": 0.6498950648240791, + "English": 0.6291499630038833 + }, + "BoN-1": { + "overall_metric": 0.6341195678747302, + "token_length": { + "8k": 0.7020231017646056, + "16k": 0.6839496608825781, + "32k": 0.6481596388780655, + "64k": 0.6264029281930145, + "128k": 0.6089383419761589, + "256k": 0.5352437355539651 + }, + "contextual_requirement": { + "Full": 0.5805803773748881, + "Partial": 0.702260355783623 + }, + "difficulty": { + "Easy": 0.8169438449940154, + "Moderate": 0.7044054869931783, + "Hard": 0.5782945616258349, + "Extreme": 0.42434569198656086 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.8803664149043899, + "T2. Sequencing & Structure Reconstruction": 0.8312176889574149, + "T3. Evidence-Grounded QA": 0.6333333333333333, + "T4. Summarization & Synthesis": 0.5547574749584397, + "T5. Attribution & Citation Alignment": 0.6921710523314937, + "T6. Aggregation & Clustering": 0.5879316250623552, + "T7. Consistency & Compliance Checking": 0.4025441003215575, + "T8. Structured & Numeric Reasoning": 0.6638888888888889, + "T9. Version & Code Diff Analysis": 0.7462961569843115, + "T10. Rule Induction & In-Context Learning": 0.5484722222222224, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.5583333333333333 + }, + "language": { + "Chinese": 0.6383248368091883, + "English": 0.6299142989402748 + } + }, + "pass@1": 0.4033333333333333, + "BoN-2": { + "overall_metric": 0.7030204024240833, + "token_length": { + "8k": 0.7639776186212258, + "16k": 0.7486638197028843, + "32k": 0.7095546783100515, + "64k": 0.6979238211892997, + "128k": 0.6806853813914886, + "256k": 0.6173170953295508 + }, + "contextual_requirement": { + "Full": 0.6539478610781624, + "Partial": 0.7654763641370751 + }, + "difficulty": { + "Easy": 0.888641690030574, + "Moderate": 0.7693477973677636, + "Hard": 0.6905310978675843, + "Extreme": 0.46431888254175707 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9014160909768955, + "T2. Sequencing & Structure Reconstruction": 0.8695868020868018, + "T3. Evidence-Grounded QA": 0.75, + "T4. Summarization & Synthesis": 0.5700264981260397, + "T5. Attribution & Citation Alignment": 0.7558486553637924, + "T6. Aggregation & Clustering": 0.6562172241338908, + "T7. Consistency & Compliance Checking": 0.4824541236458827, + "T8. Structured & Numeric Reasoning": 0.7527777777777778, + "T9. Version & Code Diff Analysis": 0.8007866287445203, + "T10. Rule Induction & In-Context Learning": 0.6445833333333334, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333 + }, + "language": { + "Chinese": 0.7121930712441974, + "English": 0.6938477336039707 + } + }, + "pass@2": 0.4826666666666667, + "BoN-3": { + "overall_metric": 0.7357638591230633, + "token_length": { + "8k": 0.7947994120787286, + "16k": 0.7776820176641183, + "32k": 0.7381243144177431, + "64k": 0.7311339212477369, + "128k": 0.7186000776011717, + "256k": 0.654243411728885 + }, + "contextual_requirement": { + "Full": 0.682832657763219, + "Partial": 0.8031308426719579 + }, + "difficulty": { + "Easy": 0.9249743747143141, + "Moderate": 0.7994375898491137, + "Hard": 0.7200534393057935, + "Extreme": 0.496989165105746 + }, + "primary_task": { + "T1. Retrieval & Ranking": 0.9130805871242748, + "T2. Sequencing & Structure Reconstruction": 0.8928626466126465, + "T3. Evidence-Grounded QA": 0.7916666666666666, + "T4. Summarization & Synthesis": 0.5775148472726891, + "T5. Attribution & Citation Alignment": 0.7740008061816878, + "T6. Aggregation & Clustering": 0.7032171864847305, + "T7. Consistency & Compliance Checking": 0.5232353490903346, + "T8. Structured & Numeric Reasoning": 0.7805555555555556, + "T9. Version & Code Diff Analysis": 0.8344938818177356, + "T10. Rule Induction & In-Context Learning": 0.7195833333333334, + "T11. Dialogue Memory & Long-Horizon Tracking": 0.6833333333333333 + }, + "language": { + "Chinese": 0.7443997677249928, + "English": 0.7271279505211355 + } + }, + "pass@3": 0.528 +} \ No newline at end of file diff --git a/results/model_info.json b/results/model_info.json new file mode 100644 index 0000000000000000000000000000000000000000..ff1994a45f947e4df3b69912a3e8ebfa3c3a0f7a --- /dev/null +++ b/results/model_info.json @@ -0,0 +1,232 @@ +{ + "Gemini-2.5-Pro": { + "type": "Thinking", + "context_length": "1M", + "url": "https://ai.google.dev/gemini-api/docs/models?hl=zh-cn#gemini-2.5-pro" + }, + "Gemini-2.5-Flash": { + "type": "Mixed", + "context_length": "1M", + "url": "https://ai.google.dev/gemini-api/docs/models?hl=zh-cn#gemini-2.5-flash" + }, + "Gemma-3-27B-It": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/google/gemma-3-27b-it" + }, + "Gemma-3-12B-It": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/google/gemma-3-12b-it" + }, + "Gemma-3-4B-It": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/google/gemma-3-4b-it" + }, + "GPT-5": { + "type": "Thinking", + "context_length": "272k", + "url": "https://platform.openai.com/docs/models/gpt-5" + }, + "GPT-4o": { + "type": "Instruct", + "context_length": "128k", + "url": "https://platform.openai.com/docs/models/gpt-4o" + }, + "GPT-OSS-120B": { + "type": "Thinking", + "context_length": "128k", + "url": "https://huggingface.co/openai/gpt-oss-120b" + }, + "GPT-OSS-20B": { + "type": "Thinking", + "context_length": "128k", + "url": "https://huggingface.co/openai/gpt-oss-20b" + }, + "Claude-4-Sonnet": { + "type": "Mixed", + "context_length": "1M", + "url": "https://www.anthropic.com/news/claude-4" + }, + "Claude-3.7-Sonnet": { + "type": "Mixed", + "context_length": "200k", + "url": "https://www.anthropic.com/news/claude-3-7-sonnet" + }, + "DeepSeek-V3.2": { + "type": "Mixed", + "context_length": "160k", + "url": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2" + }, + "DeepSeek-V3.1": { + "type": "Mixed", + "context_length": "128k", + "url": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" + }, + "DeepSeek-R1-0528": { + "type": "Thinking", + "context_length": "128k", + "url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" + }, + "DeepSeek-R1": { + "type": "Thinking", + "context_length": "128k", + "url": "https://huggingface.co/deepseek-ai/DeepSeek-R1" + }, + "DeepSeek-V3-0324": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" + }, + "Qwen3-235B-A22B-Thinking-2507": { + "type": "Thinking", + "context_length": "256k", + "url": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" + }, + "Qwen3-235B-A22B-Instruct-2507": { + "type": "Instruct", + "context_length": "256k", + "url": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" + }, + "Qwen3-Next-80B-A3B-Thinking": { + "type": "Thinking", + "context_length": "256k", + "url": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking" + }, + "Qwen3-Next-80B-A3B-Instruct": { + "type": "Instruct", + "context_length": "256k", + "url": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct" + }, + "Qwen3-30B-A3B-Thinking-2507": { + "type": "Thinking", + "context_length": "256k", + "url": "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" + }, + "Qwen3-30B-A3B-Instruct-2507": { + "type": "Instruct", + "context_length": "256k", + "url": "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" + }, + "Qwen3-4B-Thinking-2507": { + "type": "Thinking", + "context_length": "256k", + "url": "https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507" + }, + "Qwen3-4B-Instruct-2507": { + "type": "Instruct", + "context_length": "256k", + "url": "https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507" + }, + "Qwen3-32B": { + "type": "Mixed", + "context_length": "128k", + "url": "https://huggingface.co/Qwen/Qwen3-32B" + }, + "Qwen3-14B": { + "type": "Mixed", + "context_length": "128k", + "url": "https://huggingface.co/Qwen/Qwen3-14B" + }, + "Qwen3-8B": { + "type": "Mixed", + "context_length": "128k", + "url": "https://huggingface.co/Qwen/Qwen3-8B" + }, + "Qwen3-4B": { + "type": "Mixed", + "context_length": "128k", + "url": "https://huggingface.co/Qwen/Qwen3-4B" + }, + "Qwen2.5-72B-Instruct": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct" + }, + "GLM-4.6": { + "type": "Mixed", + "context_length": "198k", + "url": "https://huggingface.co/zai-org/GLM-4.6" + }, + "GLM-4.5": { + "type": "Mixed", + "context_length": "128k", + "url": "https://huggingface.co/zai-org/GLM-4.5" + }, + "Kimi-K2-Instruct-0905": { + "type": "Instruct", + "context_length": "256k", + "url": "https://huggingface.co/moonshotai/Kimi-K2-Instruct-0905" + }, + "MiniMax-M2": { + "type": "Thinking", + "context_length": "192k", + "url": "https://huggingface.co/MiniMaxAI/MiniMax-M2" + }, + "MiniMax-Text-01": { + "type": "Instruct", + "context_length": "4M", + "url": "https://huggingface.co/MiniMaxAI/MiniMax-Text-01" + }, + "Ministral-3-14B-Instruct-2512": { + "type": "Instruct", + "context_length": "256k", + "url": "https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512" + }, + "Ministral-3-8B-Instruct-2512": { + "type": "Instruct", + "context_length": "256k", + "url": "https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512" + }, + "Ministral-3-3B-Instruct-2512": { + "type": "Instruct", + "context_length": "256k", + "url": "https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512" + }, + "Magistral-Small-2509": { + "type": "Thinking", + "context_length": "128k", + "url": "https://huggingface.co/mistralai/Magistral-Small-2509" + }, + "Mistral-Small-3.2-24B-Instruct-2506": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" + }, + "Mistral-Large-Instruct-2411": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2411" + }, + "Ministral-8B-Instruct-2410": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410" + }, + "Llama-3.1-405B-Instruct": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct" + }, + "Llama-3.3-70B-Instruct": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct" + }, + "Llama-3.1-70B-Instruct": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct" + }, + "Llama-3.1-8B-Instruct": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct" + }, + "Llama-3.2-3B-Instruct": { + "type": "Instruct", + "context_length": "128k", + "url": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct" + } +} \ No newline at end of file