zeyuzy commited on
Commit
077b816
·
verified ·
1 Parent(s): 53319dd

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. .DS_Store +0 -0
  2. .gitattributes +2 -0
  3. hard_divide.py +64 -0
  4. simple_cell_acc.py +104 -0
  5. sudoku_cal_hardness3.py +552 -0
  6. test.csv +3 -0
  7. train.csv +3 -0
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.gitattributes CHANGED
@@ -66,3 +66,5 @@ cd5_train.jsonl filter=lfs diff=lfs merge=lfs -text
66
  hard_train.csv filter=lfs diff=lfs merge=lfs -text
67
  path_train.jsonl filter=lfs diff=lfs merge=lfs -text
68
  sudoku_train.csv filter=lfs diff=lfs merge=lfs -text
 
 
 
66
  hard_train.csv filter=lfs diff=lfs merge=lfs -text
67
  path_train.jsonl filter=lfs diff=lfs merge=lfs -text
68
  sudoku_train.csv filter=lfs diff=lfs merge=lfs -text
69
+ test.csv filter=lfs diff=lfs merge=lfs -text
70
+ train.csv filter=lfs diff=lfs merge=lfs -text
hard_divide.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def process_csv(input_file, output_file, sample_size=50000):
    """Convert a raw sudoku CSV into the quizzes/solutions format.

    Steps:
      1. Read the data and replace every '.' in the question column with '0'.
      2. Rename columns (question -> quizzes, answer -> solutions).
      3. Keep only the first ``sample_size`` records.
      4. Save the processed data.

    Parameters:
        input_file (str): path of the input CSV file
        output_file (str): path of the output CSV file
        sample_size (int): number of records to keep, default 50,000

    Raises:
        ValueError: if a required column is missing from the input CSV.
    """
    df = pd.read_csv(input_file)

    # Fail fast if the expected schema is not present.
    required_columns = ['source', 'question', 'answer', 'rating']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"CSV文件中缺少必需的列: {col}")

    # Replace '.' placeholders with '0' (sudoku empty cells).
    # regex=False is essential: with the historical pandas default
    # (regex=True) the pattern '.' matches ANY character, turning every
    # puzzle into all zeros; pandas 2.0 flipped the default, so being
    # explicit also makes the behavior version-independent.
    df['question'] = df['question'].str.replace('.', '0', regex=False)

    df = df.rename(columns={
        'question': 'quizzes',  # use the question column as quizzes
        'answer': 'solutions'
    })

    # Keep only the first N records.  Capture the ORIGINAL row count
    # first: the old code printed len(df) AFTER truncation, so the
    # message always claimed the file had exactly sample_size rows.
    original_len = len(df)
    if original_len > sample_size:
        df = df.head(sample_size)
        print(f"已从{original_len}条记录中保留前{sample_size}条")
    else:
        print(f"警告:文件只有{original_len}条记录,不足{sample_size}条,将保留全部记录")

    # Keep only the columns the downstream training code needs.
    df = df[['quizzes', 'solutions', 'rating']]

    df.to_csv(output_file, index=False)
    print(f"处理完成,结果已保存到: {output_file}")
    print(f"最终记录数: {len(df)}")
    print("\n替换效果验证(前3个示例):")
    print(df[['quizzes']].head(3).to_string(index=False))
50
+
51
if __name__ == "__main__":
    # Convert the raw test split into the "hard" format (top 5000 rows).
    input_csv, output_csv = "data/test.csv", "data/hard_test.csv"
    process_csv(input_csv, output_csv, sample_size=5000)

    # The train split conversion is prepared but left disabled —
    # uncomment the call below to regenerate data/hard_train.csv.
    input_csv, output_csv = "data/train.csv", "data/hard_train.csv"
    # process_csv(input_csv, output_csv, sample_size=100000)
simple_cell_acc.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import matplotlib.pyplot as plt
3
+ import os
4
+ import numpy as np
5
+ from collections import defaultdict
6
+
7
def calculate_accuracy(label, predict):
    """Character-level accuracy of *predict* against *label*.

    Both strings are normalised first: "[PAD]" tokens are stripped and
    "[EOS]" tokens are mapped to "0" so they line up with digit cells.

    Returns a float in [0, 1]; 0.0 when the normalised label is empty
    (the original code raised ZeroDivisionError in that case).
    """
    label = label.replace("[PAD]", "").replace("[EOS]", "0")
    predict = predict.replace("[PAD]", "").replace("[EOS]", "0")
    total_chars = len(label)
    if total_chars == 0:
        return 0.0
    # zip() truncates to the shorter string; callers are expected to
    # pass equal-length pairs (evaluate_jsonl checks this upstream).
    correct_chars = sum(1 for l, p in zip(label, predict) if l == p)
    return correct_chars / total_chars
16
+
17
def evaluate_jsonl(file_path):
    """Average character-level accuracy over a predictions JSONL file.

    Each line must be a JSON object with 'label' and 'predict' keys.
    Lines whose label/predict lengths differ are reported and skipped.

    Returns a float in [0, 1]; 0.0 when no valid sample was found.
    (The original returned a (0, []) tuple in the empty case, an
    inconsistent type that broke callers expecting a scalar; it also
    kept an `individual_accuracies` list that was never used.)
    """
    total_accuracy = 0
    sample_count = 0

    with open(file_path, 'r') as f:
        for line in f:
            data = json.loads(line)
            label = data['label']
            predict = data['predict']

            if len(label) != len(predict):
                print(f"警告: 第{sample_count+1}行长度不一致 (label:{len(label)} vs predict:{len(predict)})")
                continue

            total_accuracy += calculate_accuracy(label, predict)
            sample_count += 1

    if sample_count == 0:
        return 0.0

    return total_accuracy / sample_count
41
+
42
def load_experiment_data(exp_path, dataset_name, T_list, acc_key="predict_acc"):
    """Collect accuracies for one experiment across sampling temperatures.

    For every t in T_list, reads {exp_path}/{dataset_name}_T{t}/all_results.json.
    When *acc_key* is missing there, the accuracy is recomputed from
    generated_predictions.jsonl and written back into the JSON file.
    Percentage strings such as "85%" are converted to floats.

    Returns:
        (results, missing_files): dict mapping t -> accuracy, and the
        list of all_results.json paths that did not exist.
    """
    # (The original also built t_values/accuracies lists that were
    # appended to but never returned — removed as dead code.)
    missing_files = []
    results = {}

    for t in T_list:
        file_path = os.path.join(exp_path, f"{dataset_name}_T{t}")
        filename = os.path.join(file_path, "all_results.json")
        if not os.path.exists(filename):
            missing_files.append(filename)
            continue
        try:
            with open(filename, 'r') as f:
                data = json.load(f)
            if acc_key not in data:
                print(f"警告: (unknown) 中未找到键 '{acc_key}'")
                # Recompute from raw predictions and cache the result.
                src_filename = os.path.join(file_path, "generated_predictions.jsonl")
                data[acc_key] = evaluate_jsonl(src_filename)
                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(data, f, ensure_ascii=False, indent=4)

            acc = data[acc_key]
            if isinstance(acc, str) and acc.endswith('%'):
                acc = float(acc.strip('%')) / 100.0
            results[t] = acc
        except Exception as e:
            # Best-effort: report and keep going with the remaining temperatures.
            print(f"处理文件 (unknown) 时出错: {str(e)}")

    return results, missing_files
74
+
75
+
76
# Experiment roots whose all_results.json should be backfilled with a
# cell-level accuracy entry.
experiments = [
    "output/sudoku/gpt2-model-bs1024-lr1e-3-ep100-20250703-073900",
    "output/sudoku/gpt2-model-bs1024-lr1e-3-ep100-20250703-075910",
    "output/sudoku/gpt2-model-bs1024-lr1e-3-ep300-20250618-082232"
]
dataset_names = ["sudoku_test", "sudoku_test", "sudoku_test"]

acc_key = "cell_acc"

# NOTE(review): every (experiment, dataset) pair is visited; since the
# dataset list repeats "sudoku_test", each experiment dir is touched
# several times — later passes just hit the already-cached key.
for experiment in experiments:
    for dataset_name in dataset_names:
        run_dir = os.path.join(experiment, dataset_name)
        results_path = os.path.join(run_dir, "all_results.json")
        with open(results_path, 'r') as f:
            results = json.load(f)
        if acc_key not in results:
            print(f"警告: (unknown) 中未找到键 '{acc_key}'")
            predictions_path = os.path.join(run_dir, "generated_predictions.jsonl")
            results[acc_key] = evaluate_jsonl(predictions_path)
            with open(results_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=4)
104
+
sudoku_cal_hardness3.py ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Sudoku batch solver + difficulty analytics
4
+ - Bitmask + MRV DFS
5
+ - Multiprocessing
6
+ - Caching (.npz)
7
+ - Plots (hist, CDF, log-scale, quintiles)
8
+ """
9
+
10
+ import os
11
+ import re
12
+ import csv
13
+ import time
14
+ import numpy as np
15
+ import matplotlib
16
+ matplotlib.use('Agg') # 非交互模式(服务器/脚本环境)
17
+ import matplotlib.pyplot as plt
18
+ from matplotlib.ticker import MaxNLocator
19
+ from tqdm import tqdm
20
+ from multiprocessing import Pool
21
+ import sys
22
+
23
# Raise the recursion limit so the DFS on extremely hard puzzles does not
# blow the stack (tune as needed).
sys.setrecursionlimit(10000)

# ==================== Global lookup tables & constants ====================
# popcount lookup for every possible 9-bit candidate mask (0..511)
POPCOUNT = [bin(x).count("1") for x in range(512)]
# precomputed 3x3 box index for each (row, col) cell
CELL_TO_BOX = [[3*(r//3)+(c//3) for c in range(9)] for r in range(9)]
# NOTE(review): RANDOM_SEED is defined but never used in this file —
# confirm whether sampling was meant to be seeded.
RANDOM_SEED = 2025
30
+
31
+
32
+ # ==================== Bitmask Sudoku Solver ====================
33
def initialize_masks(board):
    """Build the row/column/box occupancy bitmasks for *board*.

    Bit (n-1) of a mask is set when digit n is already placed in that
    row, column or 3x3 box; empty cells are encoded as 0 on the board.
    """
    row_mask = [0] * 9
    col_mask = [0] * 9
    box_mask = [0] * 9
    for r, row in enumerate(board):
        for c, num in enumerate(row):
            if num:
                bit = 1 << (num - 1)
                row_mask[r] |= bit
                col_mask[c] |= bit
                box_mask[3 * (r // 3) + (c // 3)] |= bit
    return row_mask, col_mask, box_mask
47
+
48
+
49
def select_mrv_bitmask(board, row_mask, col_mask, box_mask):
    """Pick the next empty cell via MRV (fewest candidates first).

    Returns ((r, c), candidate_mask) for the most constrained empty
    cell, or (None, None) when the board has no empty cell.  The scan
    stops early as soon as a single-candidate cell is found.
    """
    best_count = 10          # larger than any possible candidate count
    best_cell = None
    best_mask = None
    for r in range(9):
        for c in range(9):
            if board[r][c]:
                continue
            used = row_mask[r] | col_mask[c] | box_mask[3 * (r // 3) + (c // 3)]
            candidates = (~used) & 0x1FF
            n = bin(candidates).count("1")
            if n < best_count:
                best_count = n
                best_cell = (r, c)
                best_mask = candidates
                if n == 1:
                    return best_cell, best_mask
    return best_cell, best_mask
67
+
68
+
69
def get_initial_min_remaining(board, row_mask, col_mask, box_mask):
    """Smallest candidate count among the empty cells (MRV baseline).

    Returns 0 for a completely filled board; short-circuits once a cell
    with exactly one candidate is seen (nothing can beat it).
    """
    best = 10
    for r in range(9):
        for c in range(9):
            if board[r][c]:
                continue
            used = row_mask[r] | col_mask[c] | box_mask[3 * (r // 3) + (c // 3)]
            n = bin((~used) & 0x1FF).count("1")
            if n < best:
                best = n
                if n == 1:
                    return best
    return best if best != 10 else 0
83
+
84
+
85
def solve_sudoku_bitmask(board, row_mask, col_mask, box_mask, steps=None):
    """Solve *board* in place via DFS + MRV + bitmasks.

    Args:
        board: 9x9 list of ints, 0 = empty; mutated during the search.
        row_mask/col_mask/box_mask: occupancy bitmasks kept in sync with
            *board* (see initialize_masks).
        steps: optional one-element list ([0]) used as a mutable counter
            of digit placements — the search-effort metric.

    Returns True when a full solution is reached, False for a dead end.
    On failure the board/masks are restored by backtracking.
    """
    cell, mask = select_mrv_bitmask(board, row_mask, col_mask, box_mask)
    if cell is None:
        return True  # no empty cell left: solved
    if mask == 0:
        return False  # empty cell with no candidates: contradiction

    r, c = cell
    box_idx = CELL_TO_BOX[r][c]
    while mask:
        bit = mask & -mask  # lowest set bit = smallest remaining digit
        num = (bit.bit_length() - 1) + 1
        mask -= bit

        # Tentatively place the digit and mark it in all three masks.
        board[r][c] = num
        row_mask[r] |= bit
        col_mask[c] |= bit
        box_mask[box_idx] |= bit

        if steps is not None:
            steps[0] += 1

        if solve_sudoku_bitmask(board, row_mask, col_mask, box_mask, steps):
            return True

        # Backtrack: clear the cell and un-mark the digit in every mask.
        board[r][c] = 0
        row_mask[r] ^= bit
        col_mask[c] ^= bit
        box_mask[box_idx] ^= bit
    return False
117
+
118
+
119
def evaluate_sudoku(board):
    """Difficulty metrics for one puzzle.

    Returns (empty_count, steps_used, initial_mrv_min).  The solver runs
    on a copy, so *board* is left untouched; steps are reported even
    when the puzzle turns out to be unsolvable (useful for statistics).
    """
    empty_count = sum(row.count(0) for row in board)
    work = [list(row) for row in board]
    masks = initialize_masks(work)

    initial_min_remaining = get_initial_min_remaining(work, *masks)

    steps = [0]
    solve_sudoku_bitmask(work, *masks, steps)
    return empty_count, int(steps[0]), int(initial_min_remaining)
135
+
136
+
137
+ # ==================== 并行处理 ====================
138
def process_single_sudoku_optimized(args):
    """Pool worker entry point: unpack (board, idx) and score the board."""
    board, idx = args
    metrics = evaluate_sudoku(board)
    return (idx,) + metrics
142
+
143
+
144
def parallel_solve_optimized(boards, n_workers=4):
    """Score every board in a worker pool, with a progress bar.

    Returns three lists aligned with *boards*: empty-cell counts,
    solver step counts, and initial MRV minimums.
    """
    n = len(boards)
    empty_counts = [0] * n
    steps_counts = [0] * n
    initial_min_remainings = [0] * n
    jobs = [(board, i) for i, board in enumerate(boards)]
    with Pool(processes=n_workers) as pool, \
            tqdm(total=n, desc="Solving Sudoku", unit="puzzle") as pbar:
        # imap preserves submission order; results are slotted by index anyway.
        for idx, empties, steps, mrv_min in pool.imap(
                process_single_sudoku_optimized, jobs, chunksize=100):
            empty_counts[idx] = empties
            steps_counts[idx] = steps
            initial_min_remainings[idx] = mrv_min
            pbar.update(1)
    return empty_counts, steps_counts, initial_min_remainings
158
+
159
+
160
+ # ==================== 性能测试 ====================
161
def benchmark_solver(boards, sample_size=100):
    """Time the parallel solver on (up to) the first *sample_size* boards.

    Returns the same three lists as parallel_solve_optimized for the
    benchmarked slice.
    """
    # Guard against short board lists: the old code divided by
    # sample_size even when fewer puzzles were actually processed,
    # skewing the per-puzzle and throughput numbers.
    n = min(sample_size, len(boards))
    print(f"Benchmarking solver with {n} puzzles...")
    start_time = time.time()
    empty_counts, steps_counts, initial_min_remainings = parallel_solve_optimized(
        boards[:n], n_workers=4)
    total_time = time.time() - start_time
    print(f"Processed {n} puzzles in {total_time:.2f} seconds")
    print(f"Average per puzzle: {total_time / n * 1000:.2f} ms")
    print(f"Puzzles per second: {n / total_time:.1f}")
    return empty_counts, steps_counts, initial_min_remainings
171
+
172
+
173
+ # ==================== 绘图函数 ====================
174
def auto_bins(data):
    """Histogram bin count via the Freedman–Diaconis rule, clamped to [10, 200].

    Non-finite values are dropped first; falls back to 30 bins for empty
    input or zero IQR (no spread to base the rule on).
    """
    arr = np.asarray(data)
    arr = arr[np.isfinite(arr)]
    if arr.size == 0:
        return 30
    q75, q25 = np.percentile(arr, [75, 25])
    iqr = q75 - q25
    width = 2 * iqr * (len(arr) ** (-1 / 3)) if iqr > 0 else 0
    if width <= 0:
        return 30
    n_bins = int((arr.max() - arr.min()) / width)
    return max(10, min(200, n_bins))
187
+
188
+
189
def plot_cdf(data, title, xlabel, save_path=None):
    """Empirical CDF plot of *data*; saved as PNG when *save_path* is given."""
    data_sorted = np.sort(data)
    if len(data_sorted) == 0:
        print(f"[WARN] plot_cdf: empty data for {title}")
        return
    cdf = np.arange(1, len(data_sorted) + 1) / len(data_sorted)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(data_sorted, cdf, linewidth=1)  # default colour cycle

    ax.set_title(title, fontsize=12)
    ax.set_xlabel(xlabel, fontsize=10)
    ax.set_ylabel('CDF', fontsize=10)
    ax.grid(True, linestyle='--', alpha=0.3)

    plt.tight_layout()
    if save_path:
        # `or '.'` keeps bare filenames working: dirname is '' then and
        # os.makedirs('') raises FileNotFoundError.
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to: {save_path}")
    plt.close(fig)
211
+
212
+
213
def plot_histogram_auto(data, title, xlabel, save_path=None):
    """Histogram with automatic bin count and (mostly) integer x ticks."""
    if len(data) == 0:
        print(f"[WARN] plot_histogram_auto: empty data for {title}")
        return
    fig, ax = plt.subplots(figsize=(8, 5))
    bins = auto_bins(data)

    ax.hist(data, bins=bins, alpha=0.7, edgecolor='white', linewidth=0.5)

    ax.xaxis.set_major_locator(MaxNLocator(nbins=15, integer=True))

    ax.set_title(title, fontsize=12)
    ax.set_xlabel(xlabel, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(True, linestyle='--', alpha=0.3)

    plt.tight_layout()
    if save_path:
        # `or '.'` keeps bare filenames working: dirname is '' then and
        # os.makedirs('') raises FileNotFoundError.
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to: {save_path}")
    plt.close(fig)
236
+
237
+
238
def plot_histogram_log(data, title, xlabel, save_path=None):
    """Histogram of *data* with a logarithmic x axis."""
    if len(data) == 0:
        print(f"[WARN] plot_histogram_log: empty data for {title}")
        return
    fig, ax = plt.subplots(figsize=(8, 5))
    bins = auto_bins(data)
    ax.hist(data, bins=bins, alpha=0.7, edgecolor='white', linewidth=0.5)
    ax.set_xscale("log")

    ax.set_title(title, fontsize=12)
    ax.set_xlabel(xlabel + " (log scale)", fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(True, linestyle='--', alpha=0.3)

    plt.tight_layout()
    if save_path:
        # `or '.'` keeps bare filenames working: dirname is '' then and
        # os.makedirs('') raises FileNotFoundError.
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to: {save_path}")
    plt.close(fig)
259
+
260
+
261
def plot_histogram_with_vlines_log(data, vlines, title, xlabel, save_path=None):
    """Log-x histogram with dashed vertical lines marking thresholds."""
    if len(data) == 0:
        print(f"[WARN] plot_histogram_with_vlines_log: empty data for {title}")
        return
    fig, ax = plt.subplots(figsize=(8, 5))
    bins = auto_bins(data)
    ax.hist(data, bins=bins, alpha=0.7, edgecolor='white', linewidth=0.5)
    ax.set_xscale("log")
    for v in vlines:
        ax.axvline(v, linestyle='--', linewidth=1)

    ax.set_title(title, fontsize=12)
    ax.set_xlabel(xlabel + " (log scale)", fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(True, linestyle='--', alpha=0.3)

    plt.tight_layout()
    if save_path:
        # `or '.'` keeps bare filenames working: dirname is '' then and
        # os.makedirs('') raises FileNotFoundError.
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to: {save_path}")
    plt.close(fig)
284
+
285
+
286
def plot_cdf_multiple(datasets, labels, title, xlabel, save_path=None):
    """Overlay the empirical CDFs of several datasets on one axes."""
    fig, ax = plt.subplots(figsize=(8, 5))
    for data, lab in zip(datasets, labels):
        data = np.asarray(data)
        if data.size == 0:
            continue  # silently skip empty groups
        data_sorted = np.sort(data)
        cdf = np.arange(1, len(data_sorted) + 1) / len(data_sorted)
        ax.plot(data_sorted, cdf, linewidth=1, label=lab)
    ax.set_title(title, fontsize=12)
    ax.set_xlabel(xlabel, fontsize=10)
    ax.set_ylabel('CDF', fontsize=10)
    ax.grid(True, linestyle='--', alpha=0.3)
    ax.legend()
    plt.tight_layout()
    if save_path:
        # `or '.'` keeps bare filenames working: dirname is '' then and
        # os.makedirs('') raises FileNotFoundError.
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to: {save_path}")
    plt.close(fig)
307
+
308
+
309
+ # ==================== 数据缓存 ====================
310
def save_data(empty_counts, steps_counts, initial_min_remainings, data_file):
    """Cache the three metric arrays into a compressed-free .npz file."""
    arrays = {
        'empty_counts': np.asarray(empty_counts, dtype=np.int32),
        'steps_counts': np.asarray(steps_counts, dtype=np.int32),
        'initial_min_remainings': np.asarray(initial_min_remainings, dtype=np.int32),
    }
    np.savez(data_file, **arrays)
    print(f"Data saved to: {data_file}")
318
+
319
+
320
def load_data(data_file):
    """Load cached metrics from *data_file* (.npz).

    Returns (empty_counts, steps_counts, initial_min_remainings); the
    third item is None for old caches that predate it, and all three
    are None when the file does not exist.
    """
    if not os.path.exists(data_file):
        return None, None, None
    # Context manager closes the underlying file handle — the original
    # left the NpzFile open (handle leak).  Arrays must be read while
    # the file is still open because NpzFile loads lazily.
    with np.load(data_file) as data:
        empty_counts = data['empty_counts']
        steps_counts = data['steps_counts']
        # Tolerate old caches that lack this key.
        initial_min_remainings = (
            data['initial_min_remainings']
            if 'initial_min_remainings' in data else None
        )
    print(f"Data loaded from: {data_file}")
    return empty_counts, steps_counts, initial_min_remainings
328
+
329
+
330
+ # ==================== CSV Loader ====================
331
def load_sudoku_csv(file_path):
    """Read puzzles from a CSV whose first column is an 81-char grid
    ('.' or '0' marks an empty cell); returns 9x9 integer boards."""
    boards = []
    with open(file_path, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # drop the header row, if any
        for row in tqdm(reader, desc="Loading CSV"):
            cells = re.sub(r'[^0-9.]', '', row[0])
            grid = np.array([0 if ch == '.' else int(ch) for ch in cells]).reshape(9, 9).tolist()
            boards.append(grid)
    return boards
342
+
343
+
344
def load_sudoku_csv_strings(file_path):
    """Load raw 81-char puzzle strings, mapping '.' to '0' (handy for
    writing sampled puzzles back out later)."""
    puzzles = []
    with open(file_path, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # assume a header row and drop it
        for row in reader:
            cleaned = re.sub(r'[^0-9.]', '', row[0])
            puzzles.append(cleaned.replace('.', '0'))
    return puzzles
355
+
356
+
357
+ # ==================== Main ====================
358
if __name__ == "__main__":
    INPUT_CSV_FILES = ['data/hard_test.csv']  # replace with your own file list
    N_WORKERS = 4
    SAMPLE_SIZE = 5000  # max records processed per CSV (quick trial runs)
    BENCHMARK = True  # True -> also time the solver while scoring
    for csv_file in INPUT_CSV_FILES:
        print(f"\nProcessing file: {csv_file}")
        file_prefix = os.path.splitext(os.path.basename(csv_file))[0]
        data_file = f"{file_prefix}_data.npz"

        empty_counts, steps_counts, initial_min_remainings = load_data(data_file)
        # if empty_counts is None or steps_counts is None or initial_min_remainings is None:
        # NOTE(review): the cache check above is disabled — metrics are
        # always recomputed even when the .npz cache exists.
        if True:
            print("Data not found, processing...")
            quizzes = load_sudoku_csv(csv_file)
            print(f"Loaded {len(quizzes)} sudoku puzzles")
            if BENCHMARK:
                empty_counts, steps_counts, initial_min_remainings = benchmark_solver(quizzes, SAMPLE_SIZE)
            else:
                empty_counts, steps_counts, initial_min_remainings = parallel_solve_optimized(
                    quizzes[:SAMPLE_SIZE], n_workers=N_WORKERS)
            save_data(empty_counts, steps_counts, initial_min_remainings, data_file)
        else:
            print("Using cached data")

        # ============ Output directory for the plots ============
        out_dir = os.path.join('figures', file_prefix)
        os.makedirs(out_dir, exist_ok=True)

        # ============ Full-distribution plots ============
        plot_histogram_auto(empty_counts,
                            f'{file_prefix} - Empty Count Distribution',
                            'Empty Count',
                            os.path.join(out_dir, f'{file_prefix}_empty_count_hist.png'))

        plot_histogram_auto(steps_counts,
                            f'{file_prefix} - Steps Count Distribution',
                            'Steps Count',
                            os.path.join(out_dir, f'{file_prefix}_steps_count_hist.png'))

        plot_histogram_log(steps_counts,
                           f'{file_prefix} - Steps Count Distribution (Log X)',
                           'Steps Count',
                           os.path.join(out_dir, f'{file_prefix}_steps_count_hist_log.png'))

        plot_cdf(steps_counts,
                 f'{file_prefix} - Steps Count CDF',
                 'Steps Count',
                 os.path.join(out_dir, f'{file_prefix}_steps_count_cdf.png'))

        plot_cdf(empty_counts,
                 f'{file_prefix} - Empty Count CDF',
                 'Empty Count',
                 os.path.join(out_dir, f'{file_prefix}_empty_count_cdf.png'))

        # ============ Quintile binning + sampled exports ============
        TOTAL_N = 5000
        SAMPLE_PER_BIN = 1000
        CONSIDER_N = int(min(TOTAL_N, len(steps_counts)))
        consider_idx = np.arange(CONSIDER_N)  # NOTE(review): unused — kept for reference

        if CONSIDER_N > 0:
            steps_consider = np.array(steps_counts)[:CONSIDER_N]
            empty_consider = np.array(empty_counts)[:CONSIDER_N]

            # Old logic: value-based quantile binning; duplicate boundary
            # values could make the bins unequal in size.
            # q20, q40, q60, q80 = np.quantile(steps_consider, [0.2, 0.4, 0.6, 0.8])
            # bins = [-np.inf, q20, q40, q60, q80, np.inf]
            # labels = np.digitize(steps_consider, bins)

            # Rank-based split instead: guarantees each bin holds exactly
            # SAMPLE_PER_BIN entries (when feasible).
            sorted_idx = np.argsort(steps_consider, kind='stable')
            # Start/end indices per bin (first four bins get SAMPLE_PER_BIN each).
            bin_size = SAMPLE_PER_BIN
            b1 = sorted_idx[0:bin_size]
            b2 = sorted_idx[bin_size:bin_size*2]
            b3 = sorted_idx[bin_size*2:bin_size*3]
            b4 = sorted_idx[bin_size*3:bin_size*4]
            b5 = sorted_idx[bin_size*4:CONSIDER_N]

            # Histogram with rank-derived thresholds (log x).
            def safe_idx(pos):
                # Clamp a rank position into the valid index range.
                return min(max(pos, 0), CONSIDER_N - 1)
            v20 = steps_consider[sorted_idx[safe_idx(bin_size - 1)]]
            v40 = steps_consider[sorted_idx[safe_idx(bin_size * 2 - 1)]]
            v60 = steps_consider[sorted_idx[safe_idx(bin_size * 3 - 1)]]
            v80 = steps_consider[sorted_idx[safe_idx(bin_size * 4 - 1)]]
            plot_histogram_with_vlines_log(
                steps_consider,
                [v20, v40, v60, v80],
                f'{file_prefix} - Steps Histogram with Quintile Thresholds',
                'Steps Count',
                save_path=os.path.join(out_dir, f'{file_prefix}_steps_hist_with_quintiles.png')
            )

            # Per-bin data & overlaid CDFs (bins come from the rank split).
            bin_data = [
                steps_consider[b1],
                steps_consider[b2],
                steps_consider[b3],
                steps_consider[b4],
                steps_consider[b5],
            ]
            plot_cdf_multiple(
                bin_data,
                [f'Bin{k}' for k in [1, 2, 3, 4, 5]],
                f'{file_prefix} - Steps CDF by Quintile Bins',
                'Steps Count',
                save_path=os.path.join(out_dir, f'{file_prefix}_steps_cdf_quintiles.png')
            )

            # Raw puzzle strings, needed for the CSV exports below.
            puzzles_raw = load_sudoku_csv_strings(csv_file)

            sampled_by_bin = {}
            for target_bin in [1, 3, 5]:
                # Index set of the requested rank-split bin.
                if target_bin == 1:
                    idx_in_bin = b1
                elif target_bin == 3:
                    idx_in_bin = b3
                elif target_bin == 5:
                    idx_in_bin = b5
                else:
                    idx_in_bin = np.array([], dtype=int)

                if len(idx_in_bin) == 0:
                    print(f"Bin {target_bin} has no samples.")
                    sampled_by_bin[target_bin] = np.array([], dtype=int)
                    continue

                # Take the first SAMPLE_PER_BIN rows so export sizes match
                # (export everything when the bin is smaller than that).
                take_n = min(len(idx_in_bin), SAMPLE_PER_BIN)
                sampled_idx = idx_in_bin[:take_n]

                sampled_by_bin[target_bin] = np.array(sampled_idx, dtype=int)

                out_csv = f"{file_prefix}_bin{target_bin}_sample{SAMPLE_PER_BIN}.csv"
                with open(out_csv, 'w', newline='') as wf:
                    writer = csv.writer(wf)
                    writer.writerow(['puzzle_index', 'steps', 'empty_count', 'puzzle'])
                    for i in sampled_idx:
                        writer.writerow([int(i), int(steps_consider[i]), int(empty_consider[i]), puzzles_raw[i]])
                print(f"Saved bin {target_bin} sample to: {out_csv} (count={len(sampled_idx)})")

                # Histogram and CDF for this bin's sample.
                d_steps = steps_consider[sampled_by_bin[target_bin]]
                plot_histogram_auto(
                    d_steps,
                    f'{file_prefix} - Bin{target_bin} Sample Steps Hist',
                    'Steps Count',
                    os.path.join(out_dir, f'{file_prefix}_bin{target_bin}_sample_steps_hist.png')
                )
                plot_cdf(
                    d_steps,
                    f'{file_prefix} - Bin{target_bin} Sample Steps CDF',
                    'Steps Count',
                    os.path.join(out_dir, f'{file_prefix}_bin{target_bin}_sample_steps_cdf.png')
                )

            # Export a mixed sample combining the requested bins.
            def save_mix(mix_bins, out_name):
                mix_idx = np.concatenate([sampled_by_bin.get(b, np.array([], dtype=int)) for b in mix_bins])
                if mix_idx.size == 0:
                    print(f"Mix {out_name} has no samples; skipped.")
                    return
                out_csv = f"{file_prefix}_{out_name}.csv"
                with open(out_csv, 'w', newline='') as wf:
                    writer = csv.writer(wf)
                    writer.writerow(['puzzle_index', 'steps', 'empty_count', 'puzzle'])
                    for i in mix_idx:
                        writer.writerow([int(i), int(steps_consider[i]), int(empty_consider[i]), puzzles_raw[i]])
                print(f"Saved mix {out_name} to: {out_csv} (count={mix_idx.size})")

            # Mix of bins 1+3+5: export plus histogram and CDF.
            mix_name = f"mix_bin1_3_5_sample2000"
            save_mix([1, 3, 5], mix_name)
            mix_idx_all = np.concatenate([sampled_by_bin.get(b, np.array([], dtype=int)) for b in [1, 3, 5]])
            if mix_idx_all.size > 0:
                d_mix = steps_consider[mix_idx_all]
                plot_histogram_auto(
                    d_mix,
                    f'{file_prefix} - Mix(1+3+5) Sample Steps Hist',
                    'Steps Count',
                    os.path.join(out_dir, f'{file_prefix}_{mix_name}_steps_hist.png')
                )
                plot_cdf(
                    d_mix,
                    f'{file_prefix} - Mix(1+3+5) Sample Steps CDF',
                    'Steps Count',
                    os.path.join(out_dir, f'{file_prefix}_{mix_name}_steps_cdf.png')
                )

    print("\nDone.")
test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2fd52aea23d331d5b4ee723c856236e838a9fb9a70e66f4e0e0cf26c338c6a8
3
+ size 79360390
train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64b46674db0148e0d73a16346dadeb2b1c00824d3fca3f85b2ae7037f6b4b38e
3
+ size 718819925