zhangyikai commited on
Commit
89c9672
·
1 Parent(s): 8518eef

Upload V0 model and UI

Browse files
app.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import pandas as pd
import json
import os
import torch
from huggingface_hub import hf_hub_download, snapshot_download  # hub download helpers

# ==========================================
# 0. Model initialisation
# ==========================================
MODEL_REPO_ID = "Now-Join-Us/Generalist-Value-Model-V0"
EMBEDDING_REPO_ID = "Qwen/Qwen3-Embedding-0.6B"

# Global model handle read by predict_performance(); None => "Mock Mode"
# (UI still works, but scores are random placeholders).
v0_model = None

print(">>> Starting V0 App...")

try:
    from v0_core.models.v0 import V0

    print(f">>> Downloading models...")

    # 1. Download the trained V0 checkpoint weights.
    checkpoint_path = hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename="v_0_for_grpo_training.pt"
    )

    # 2. Download the TabPFN classifier-head checkpoint.
    tabpfn_path = hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename="tabpfn-v2.5-classifier-v2.5_default.ckpt"
    )

    # 3. Download the Qwen embedding model (full repo snapshot).
    embedding_path = snapshot_download(
        repo_id=EMBEDDING_REPO_ID
    )

    print(">>> Models downloaded. Initializing V0 class...")

    # device = "cuda" if torch.cuda.is_available() else "cpu"
    device = "cpu"  # forced to CPU (presumably CPU-only hosting hardware — confirm)

    # Assemble the V0 value model from the three downloaded artifacts.
    v0_model = V0.from_pretrained(
        checkpoint_path=checkpoint_path,
        embedding_model_path=embedding_path,
        tabpfn_head_path=tabpfn_path,
        device=device
    )
    print(f">>> V0 Model Loaded Successfully on {device}!")

except Exception as e:
    # Any failure (missing v0_core package, download error, bad checkpoint)
    # leaves v0_model as None so the UI degrades to Mock Mode instead of crashing.
    print(f"Error loading model: {e}")
    print("UI will run in Mock Mode.")
    v0_model = None
58
+
59
# ==========================================
# 1. Core logic
# ==========================================

# Default data (serves as the Context): performance history of the built-in
# reference model. Each entry pairs a prompt with whether that model solved it.
history_default = [
    {"prompt": "Let $d(m)$ denote the number of positive integer divisors of a positive integer $m$. If $r$ is the number of integers $n \\leq 2023$ for which $\\sum_{i=1}^{n} d(i)$ is odd, find the sum of the digits of $r$.", "is_correct": True},
    {"prompt": "设在 $5 \\times 5$ 的方格表的第 $i$ 行第 $j$ 列所填的数为 $a_{i j}\\left(a_{i j} \\in\\{0,1\\}\\right), a_{i j}=a_{j i}(1 \\leqslant i、j \\leqslant 5)$ .则表中共有五个 1 的填表方法总数为 $\\qquad$ (用具体数字作答).", "is_correct": True},
    {"prompt": "Suppose $x, y \\in \\mathbb{Z}$ satisfy the equation:\n\\[\ny^4 + 4y^3 + 28y + 8x^3 + 6y^2 + 32x + 1 = (x^2 - y^2)(x^2 + y^2 + 24).\n\\]\nFind the sum of all possible values of $|xy|$.", "is_correct": False},
    {"prompt": "Three builders are scheduled to build a house in 60 days. However, they procrastinate and do nothing for the first 50 days. To complete the house on time, they decide to hire more workers and work at twice their original speed. If the new workers also work at this doubled rate, how many new workers are needed? Assume each builder works at the same rate and does not interfere with the others.", "is_correct": True},
    {"prompt": "Let $P_0 = (3,1)$ and define $P_{n+1} = (x_n, y_n)$ for $n \\ge 0$ by \\[ x_{n+1} = - \\frac{3x_n - y_n}{2}, \\quad y_{n+1} = - \\frac{x_n + y_n}{2} \\] Find the area of the quadrilateral formed by the points $P_{96}, P_{97}, P_{98}, P_{99}$.", "is_correct": False}
]
71
+
72
def format_model_card(data_list, model_name, is_custom=False):
    """Render a model's performance history as an HTML card.

    Args:
        data_list: list of dicts with 'prompt' (str) and 'is_correct' (bool),
            or None/empty when there is nothing to show.
        model_name: display name placed in the card header.
        is_custom: when True and data_list is empty, render the
            "No Custom Model Uploaded" placeholder instead of nothing.

    Returns:
        An HTML string (empty string for an empty non-custom list).
    """
    import html  # stdlib; used to escape user-supplied prompt text

    if not data_list:
        if is_custom:
            return "<div class='model-card empty'><div class='card-title'>No Custom Model Uploaded</div></div>"
        return ""

    total = len(data_list)
    rows_html = ""
    preview_limit = 3  # only the first few rows are shown; the rest collapse to "+N more"

    for item in data_list[:preview_limit]:
        p_text = item.get('prompt', '')
        if len(p_text) > 64:
            p_text = p_text[:64] + "..."
        # Prompts can come from user uploads: escape them so '<', '&', quotes
        # cannot break the card markup or inject HTML.
        p_text = html.escape(p_text)

        is_acc = item.get('is_correct', False)
        status_class = "status-green" if is_acc else "status-red"
        icon = "✔" if is_acc else "✘"

        rows_html += f"""
        <div class='history-row'>
            <div class='status-box {status_class}'>{icon}</div>
            <div class='prompt-text'>{p_text}</div>
        </div>
        """

    remaining = total - preview_limit
    if remaining > 0:
        rows_html += f"<div class='history-more'>+ {remaining} more items</div>"

    return f"""
    <div class='model-card populated'>
        <div class='card-header'>
            <span class='model-name'>{model_name}</span>
            <span class='acc-badge'>Total Samples: {total}</span>
        </div>
        <div class='card-body'>
            <div class='history-container'>{rows_html}</div>
        </div>
    </div>
    """
114
+
115
def process_upload(file_obj):
    """Parse an uploaded JSONL history file and build its preview card.

    Args:
        file_obj: Gradio file object (has a `.name` path attribute) or None
            when the upload is cleared.

    Returns:
        (data, html) tuple: `data` is the parsed list of records, or None on
        any validation/parse failure; `html` is the preview card / error card.
    """
    if file_obj is None:
        return None, format_model_card(None, "Custom", True)

    content = []
    try:
        with open(file_obj.name, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:  # skip blank lines
                    content.append(json.loads(line))

        if not content:
            return None, "<div class='model-card empty'>File is empty</div>"
        # Schema check on the first record only (cheap sanity check).
        if 'is_correct' not in content[0]:
            return None, "<div class='model-card empty'>Missing 'is_correct' field</div>"

        # Simple validation: a usable context needs both outcome classes.
        has_positive = any(item.get('is_correct') for item in content)
        has_negative = any(not item.get('is_correct') for item in content)

        if not (has_positive and has_negative):
            return None, """
            <div class='model-card empty' style='border-color: var(--fail); color: var(--fail);'>
                <div class='card-title'>Invalid Dataset Distribution</div>
                <div class='card-subtitle'>Please upload at least one positive AND one negative sample.</div>
            </div>
            """

        return content, format_model_card(content, "Custom Model")

    except json.JSONDecodeError:
        return None, "<div class='model-card empty'>Invalid JSONL Format</div>"
    except Exception as e:
        return None, f"<div class='model-card empty'>Error: {str(e)}</div>"
149
+
150
def predict_performance(default_data, custom_data, t1, t2, t3):
    """Predict each model's success probability on the target instructions.

    Uses the loaded V0 model when available; otherwise (Mock Mode) emits
    random placeholder scores.

    Args:
        default_data: history list for the built-in model (or falsy to skip).
        custom_data: history list parsed from a user upload (or falsy to skip).
        t1, t2, t3: target instruction strings from the three textboxes.

    Returns:
        pandas.DataFrame with one row per (model, instruction) pair, or a
        single-row error frame when no target was entered.
    """
    import random  # hoisted out of the per-model loop; only used in Mock Mode

    # Guard against None as well as blank/whitespace-only textbox values.
    targets = [t for t in [t1, t2, t3] if t and t.strip()]
    if not targets:
        return pd.DataFrame([{"Error": "Please enter at least one target prompt."}])

    models_to_run = []
    if default_data:
        models_to_run.append(("Qwen3-4B-Instruct-2507", default_data))
    if custom_data:
        models_to_run.append(("Custom Uploaded Model", custom_data))

    results = []

    for m_name, m_history in models_to_run:
        # The model's history becomes the in-context examples.
        context_prompts = [item['prompt'] for item in m_history]
        context_labels = [1 if item.get('is_correct') else 0 for item in m_history]

        if v0_model:
            try:
                scores = v0_model.predict(
                    context_prompts=context_prompts,
                    context_labels=context_labels,
                    target_prompts=targets
                )
            except Exception as e:
                # Inference failure degrades to all-zero scores rather than crashing the UI.
                print(f"Inference Error: {e}")
                scores = [0.0] * len(targets)
        else:
            # Mock Mode: model failed to load at startup.
            scores = [random.uniform(0.1, 0.9) for _ in targets]

        for t_text, score in zip(targets, scores):
            # predict() may return tensors or plain floats.
            if isinstance(score, torch.Tensor):
                final_score = score.item()
            else:
                final_score = float(score)

            pred_str = "✔ Success" if final_score > 0.5 else "✘ Failure"

            results.append({
                "Model": m_name,
                "Instruction": t_text,
                "Predicted Value Score": round(final_score, 4),
                "Prediction": pred_str
            })

    return pd.DataFrame(results)
208
+
209
# ==========================================
# 2. CSS styles
# ==========================================
# Custom stylesheet injected into the gr.Blocks app below. `:root` defines the
# light palette as CSS variables; `.dark` remaps the same variables for dark
# mode. The string content must stay as-is — the class names are referenced by
# the HTML snippets built in format_model_card() and the UI section.
css = """
/* 全局变量 */
:root {
    --primary: #10b981;
    --primary-light: #ecfdf5;
    --primary-dark: #047857;
    --bg-card: #ffffff;
    --border-sub: #e5e7eb;
    --text-main: #1f2937;
    --text-sub: #6b7280;
    --success: #10b981;
    --fail: #ef4444;
    --popup-bg: #ffffff;
    --popup-text: #1f2937;
    --popup-border: #e5e7eb;
    --popup-shadow: rgba(0,0,0,0.15);
}
.dark {
    --bg-card: #1f2937;
    --border-sub: #374151;
    --text-main: #f3f4f6;
    --text-sub: #9ca3af;
    --popup-bg: #2d2d2d;
    --popup-text: #e5e5e5;
    --popup-border: #4b5563;
    --popup-shadow: rgba(0,0,0,0.4);
}
.label-row { display: flex; align-items: center; margin-bottom: 6px; font-family: 'Source Sans Pro', sans-serif; }
.upload-label-text { font-size: 1rem; color: var(--text-main); margin-right: 8px; }
.format-hint-wrapper { display: inline-block; position: relative; cursor: help; font-size: 0.9rem; color: var(--primary); font-weight: 600; border-bottom: 1px dashed var(--primary); line-height: 1.2; }
.format-popup {
    visibility: hidden; opacity: 0; position: absolute; bottom: 145%; left: -20px; width: 450px;
    background: var(--popup-bg); color: var(--popup-text); border: 1px solid var(--popup-border);
    padding: 16px; border-radius: 8px; box-shadow: 0 10px 30px var(--popup-shadow); z-index: 1000;
    transition: all 0.2s cubic-bezier(0.165, 0.84, 0.44, 1); transform: translateY(10px); pointer-events: none;
    font-size: 0.95rem; line-height: 1.5;
}
.format-hint-wrapper:hover .format-popup { visibility: visible; opacity: 1; transform: translateY(0); }
.format-popup::after {
    content: ""; position: absolute; top: 100%; left: 60px; border-width: 8px; border-style: solid;
    border-color: var(--popup-bg) transparent transparent transparent;
}
.code-snippet {
    display: block; background: #1a1a1a; color: #a7f3d0; font-family: 'Courier New', monospace;
    font-size: 0.85em; padding: 8px; border-radius: 6px; margin-top: 6px; white-space: pre; border: 1px solid #444;
}
.concept-banner {
    background: linear-gradient(135deg, rgba(16, 185, 129, 0.08) 0%, rgba(59, 130, 246, 0.05) 100%);
    border: 1px solid var(--primary-light); border-radius: 12px; padding: 24px; text-align: center; margin-bottom: 30px;
}
.concept-title { font-size: 1.8em; font-weight: 700; color: var(--text-main); margin-bottom: 8px;}
.concept-subtitle { font-size: 1em; color: var(--text-sub); max-width: 600px; margin: 0 auto; line-height: 1.5; }
.equation-box {
    margin-top: 15px; font-family: 'Courier New', monospace; font-weight: bold;
    color: var(--primary); background: var(--bg-card); display: inline-block;
    padding: 8px 16px; border-radius: 8px; border: 1px dashed var(--primary);
    box-shadow: 0 2px 6px rgba(0,0,0,0.05);
}
.step-header { display: flex; align-items: center; margin-bottom: 15px; border-bottom: 2px solid var(--border-sub); padding-bottom: 10px; }
.step-num {
    background: var(--primary); color: white; width: 28px; height: 28px;
    border-radius: 50%; display: flex; align-items: center; justify-content: center;
    font-weight: bold; margin-right: 10px; font-size: 0.9em;
}
.step-title { font-size: 1.2em; font-weight: 600; color: var(--text-main); }
.step-desc { font-size: 0.93em; color: var(--text-sub); margin-left: auto; font-style: italic;}
.model-card {
    background: var(--bg-card); border: 1px solid var(--border-sub);
    border-radius: 10px; padding: 16px; margin-bottom: 15px;
    transition: all 0.2s; position: relative; overflow: hidden;
}
.model-card.populated { border-left: 5px solid var(--primary); box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); }
.model-card.empty { border: 2px dashed var(--border-sub); text-align: center; opacity: 0.7; padding: 30px 16px; }
.card-title { font-weight: bold; color: var(--text-sub); }
.card-subtitle { font-size: 0.8em; color: var(--text-sub); }
.card-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 15px; }
.model-name { font-weight: bold; font-size: 1.1em; color: var(--text-main); }
.acc-badge { background: var(--primary-light); color: var(--primary-dark); font-size: 0.75em; padding: 3px 10px; border-radius: 12px; font-weight: 700; }
.history-container { display: flex; flex-direction: column; gap: 8px; margin-bottom: 15px; }
.history-row { display: flex; align-items: center; background: rgba(0,0,0,0.02); padding: 6px 8px; border-radius: 6px; }
.status-box {
    width: 24px; height: 24px; border-radius: 6px; display: flex; align-items: center; justify-content: center;
    color: white; font-size: 0.8em; font-weight: bold; margin-right: 10px; flex-shrink: 0;
}
.status-green { background-color: var(--success); }
.status-red { background-color: var(--fail); }
.prompt-text {
    font-size: 0.9em; color: var(--text-main); white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
}
.history-more { font-size: 0.95em; color: var(--text-sub); text-align: center; font-style: italic; margin-top: -4px; }
.custom-btn { font-weight: bold !important; font-size: 1.1em !important; }
.paper-link {
    font-size: 0.5em; vertical-align: middle; color: var(--primary); text-decoration: none;
    border: 1px solid var(--primary); padding: 4px 10px; border-radius: 15px; font-weight: normal;
    transition: all 0.2s; background: transparent;
}
.paper-link:hover { background: var(--primary); color: white; }
"""
310
+
311
# ==========================================
# 3. UI construction
# ==========================================
with gr.Blocks(theme=gr.themes.Soft(primary_hue="emerald"), css=css, title="V0 Predictor") as demo:

    # Session state: the default history is fixed; the custom one is filled by upload.
    state_default = gr.State(value=history_default)
    state_custom = gr.State(value=None)

    # Banner: concept explanation + paper/code links.
    gr.HTML("""
    <div class="concept-banner">
        <div class="concept-title">
            V<sub>0</sub> Value Model
            <a href="TBD" target="_blank" class="paper-link">Paper ↗</a>
            <a href="TBD" target="_blank" class="paper-link">Code ↗</a>
        </div>
        <div class="concept-subtitle">
            <span style="color: var(--primary); font-weight: bold;">Function:</span> V<sub>0</sub> uses a model's historical performance to predict<br>
            how it will perform on unseen instructions<br>
            without running the model itself.
        </div>
        <div class="equation-box">
            Historical Perf. + Instruction &rarr; Predicted Perf.
        </div>
    </div>
    """)

    with gr.Row(equal_height=False):

        # --- Step 1: choose/upload the model history (left panel) ---
        with gr.Column(scale=1, variant="panel"):
            gr.HTML("""
            <div class="step-header">
                <div class="step-num">1</div>
                <div class="step-title">Represent Any Model with <span style="color: var(--primary);">Performance-Instruction Pairs</span></div>
            </div>
            """)

            preview_default = gr.HTML(format_model_card(history_default, "Qwen3-4B-Instruct-2507"))

            # Upload label with a hover popup documenting the required JSONL schema.
            gr.HTML("""
            <div class="label-row">
                <span class="upload-label-text"><span style="font-weight: 800;">[Optional]</span> Upload Your Model</span>
                <div class="format-hint-wrapper">
                    Required JSONL Format ⓘ
                    <div class="format-popup">
                        <div style="font-weight: bold; margin-bottom:4px;">File Content Example:</div>
                        <code class="code-snippet">
{"prompt": "Calculate 1+1", "is_correct": true}
{"prompt": "Write a poem", "is_correct": false}
                        </code>
                        <div style="margin-top:6px; font-size:0.9em; opacity: 0.8;">
                            Each line must be a valid JSON object containing <b>'prompt'</b> (string) and <b>'is_correct'</b> (boolean).
                        </div>
                    </div>
                </div>
            </div>
            """)

            upload_btn = gr.File(
                label=None,
                show_label=False,
                file_types=[".jsonl"],
                height=130
            )
            preview_custom = gr.HTML(format_model_card(None, "Custom", True))

        # --- Step 2: target instructions (right panel) ---
        with gr.Column(scale=1, variant="panel"):
            gr.HTML("""
            <div class="step-header" style="margin-top: 80px;">
                <div class="step-num">2</div>
                <div class="step-title">Enter Instructions</div>
                <div class="step-desc">trigger V<sub>0</sub> to predict the expected perf. for each model</div>
            </div>
            """)
            t1 = gr.Textbox(label="Instruction 1", value="What is the largest $n$ such that there exists a non-degenerate convex $n$-gon where each of its angles is an integer number of degrees, and all angles are distinct?", lines=2)
            t2 = gr.Textbox(label="Instruction 2", value="已知四面体 \\(A B C D\\) 内接于球 \\(O\\),且 \\(A D\\) 是球 \\(O\\) 的直径。若 \\(\\triangle A B C\\) 和 \\(\\triangle B C D\\) 都是边长为 1 的等边三角形,则四面体 \\(A B C D\\) 的体积是多少?原始答案的形式为 \\(\\frac{\\sqrt{c}}{b}\\),请给出a+b+c的值。", lines=2)
            t3 = gr.Textbox(label="Instruction 3", placeholder="Your instruction here ...", lines=2)

            gr.HTML("""
            <div style="margin-top: 15px; font-size: 1.05em; color: var(--text-main);">
                <span style="color: var(--primary); font-weight: bold;">Next:</span> Clicking <b>Run V<sub>0</sub> Prediction!</b>
            </div>
            """)

    # Full-width run button below the two panels.
    # NOTE(review): original nesting is ambiguous in the source — this assumes the
    # button/results rows sit at the Blocks top level; confirm against the live layout.
    with gr.Row():
        with gr.Column():
            predict_btn = gr.Button("Run V₀ Prediction", variant="primary", size="lg", elem_classes=["custom-btn"])

    # --- Step 3: results table ---
    gr.HTML("""
    <div class="step-header" style="margin-top: 20px; border-bottom: none;">
        <div class="step-num">3</div>
        <div class="step-title">Results</div>
    </div>
    """)

    output_df = gr.Dataframe(
        headers=["Model Entity", "Unseen Instruction", "Predicted Value Score", "Prediction"],
        datatype=["str", "str", "number", "str"],
        interactive=False,
        column_widths=["20%", "40%", "20%", "20%"]
    )

    # Wiring: file upload -> parse + preview; button -> prediction table.
    upload_btn.change(
        fn=process_upload,
        inputs=[upload_btn],
        outputs=[state_custom, preview_custom]
    )

    predict_btn.click(
        fn=predict_performance,
        inputs=[state_default, state_custom, t1, t2, t3],
        outputs=[output_df]
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ pandas
3
+ einops
4
+ numpy==2.2.6
5
+ scikit-learn==1.7.2
6
+ -e git+https://github.com/PriorLabs/TabPFN.git@2cd2326038e789a26f7a07e70e1ea986ffd040c9#egg=tabpfn
7
+ torch==2.7.1
8
+ tqdm==4.67.1
9
+ transformers==4.55.4
10
+ wandb==0.21.3
v0_core/config/__init__.py ADDED
File without changes
v0_core/config/arguments.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import argparse
3
+
4
# =============================================================================
# Argument parsing
# =============================================================================
def parse_args():
    """Parse command-line arguments for training/evaluating the value model.

    Returns:
        argparse.Namespace in which every ``*_data_paths`` argument and
        ``prompt_dict_path`` has been split from a comma-separated string
        into a list of path strings (empty list when not provided).
    """
    parser = argparse.ArgumentParser(description="Generalist Value Model")

    # --- Paths ---
    parser.add_argument("--time_str", type=str, required=True)
    parser.add_argument("--qwen_path", type=str, required=True, help="Qwen 模型路径")
    parser.add_argument("--tabpfn_checkpoint", type=str, required=True, help="TabPFN Checkpoint 路径")

    # Data path configuration (comma-separated lists).
    parser.add_argument("--context_data_paths", type=str, required=True, help="Context Pool Jsonl路径 (支持多个,逗号分隔)")
    parser.add_argument("--train_data_paths", type=str, default=None, help="Train Query Pool Jsonl路径 (支持多个,逗号分隔)")
    parser.add_argument("--eval_data_paths", type=str, default=None, help="Test Query Pool Jsonl路径 (支持多个,逗号分隔)")
    parser.add_argument("--validity_data_paths", type=str, default=None, help="Validity Test Pool Jsonl路径 (支持多个,逗号分隔)")

    parser.add_argument("--prompt_dict_path", type=str, required=True, help="Prompt 字典 JSON 路径")

    # --- Checkpoint saving ---
    parser.add_argument("--checkpoint_dir", type=str, default=None, help="模型保存目录")
    parser.add_argument("--save_interval", type=int, default=1, help="每隔多少个 Epoch 保存一次模型")
    parser.add_argument("--max_keep_checkpoints", type=int, default=2, help="最多保留多少个最新的 Checkpoint")
    parser.add_argument("--resume", action="store_true", help="是否尝试从 checkpoint_dir 恢复训练")
    parser.add_argument("--resume_from_specific_epoch", type=int, default=None, help="指定要 resume 的 epoch")

    # --- Logging ---
    parser.add_argument("--log_path", type=str, default=None)
    parser.add_argument("--log_interval", type=int, default=10, help="保存间隔")
    parser.add_argument("--metric_path", type=str, default=None)
    parser.add_argument("--wandb_project", type=str, default="context-v", help="Wandb 项目名称")
    parser.add_argument("--wandb_interval", type=int, default=1, help="Wandb 记录间隔")
    parser.add_argument("--wandb_id", type=str, default=None)

    # --- Run mode & strategies ---
    parser.add_argument("--run_mode", type=str, default="eval", choices=["train", "eval"])
    parser.add_argument("--pooling_strategy", type=str, default="dynamic_query",
                        choices=["last_token", "fixed_query", "dynamic_query"],
                        help="Embedding 提取策略")

    parser.add_argument("--label_strategy", type=str, default="binary",
                        choices=["binary", "minmax_norm"],
                        help="Label 处理策略")
    parser.add_argument("--loss_type", type=str, default="ce_hard",
                        choices=["ce_hard", "ce_soft", "kl_div", "pairwise", "combined"],
                        help="Loss 函数类型: combined = pairwise + ce_soft")
    parser.add_argument("--loss_alpha", type=float, default=0.5,
                        help="Combined Loss 中 Pairwise 的权重 (0.0-1.0)。Total = alpha * Pair + (1-alpha) * CE")
    parser.add_argument("--loss_balance", action="store_true", help="是否对正负样本加权")

    parser.add_argument("--kl_temperature", type=float, default=1.0,
                        help="KL 散度或 Softmax 的温度系数 T")

    # --- Dimensionality reduction ---
    parser.add_argument("--reduce_method", type=str, default="none",
                        choices=["none", "avg_pool", "max_pool"])
    parser.add_argument("--target_dim", type=int, default=1024)
    parser.add_argument("--num_heads", type=int, default=4)

    parser.add_argument("--context_clustering", action="store_true", help="是否启用 Support Set 聚类筛选")
    parser.add_argument("--context_num_clusters", type=int, default=128, help="聚类保留的原型数量 (k值)")

    # --- Model hyperparameters ---
    parser.add_argument("--num_queries", type=int, default=10)
    parser.add_argument("--embed_dim", type=int, default=32)
    parser.add_argument("--tabpfn_estimators", type=int, default=4)
    parser.add_argument("--dynamic_query_generator_bottleneck_dim", type=int, default=128)
    parser.add_argument("--dynamic_query_generator_dropout_rate", type=float, default=0.2)

    # --- Training hyperparameters ---
    parser.add_argument("--epochs", type=int, default=5)
    parser.add_argument("--meta_batch_size", type=int, default=1, help="每次forward处理多少个Task(一个Task包含Support+Query)")
    parser.add_argument("--grad_accum_steps", type=int, default=4)

    parser.add_argument("--train_query_batch_size", type=int, default=8, help="每个Task包含多少个Query样本 (必须来自同一个Step)")
    # Help text below contained a mojibake byte in the original ("包含��少个");
    # restored to match the train_query_batch_size help string.
    parser.add_argument("--eval_query_batch_size", type=int, default=8, help="每个Task包含多少个Query样本 (必须来自同一个Step)")
    parser.add_argument("--support_size", type=int, default=256, help="每个Task采样的Context样本数量")

    parser.add_argument("--lr_backbone", type=float, default=1e-5)
    parser.add_argument("--lr_adapter", type=float, default=1e-4)
    parser.add_argument("--lr_tabpfn", type=float, default=1e-5)

    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--warmup_ratio", type=float, default=0.05)
    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
    parser.add_argument("--max_grad_norm", type=float, default=1.0)

    parser.add_argument("--train_embed_bs", type=int, default=4)
    parser.add_argument("--eval_embed_bs", type=int, default=4)

    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")

    args = parser.parse_args()

    def split_paths(path_str):
        # "a.jsonl, b.jsonl" -> ["a.jsonl", "b.jsonl"]; None/"" -> [].
        if not path_str:
            return []
        return [p.strip() for p in path_str.split(',') if p.strip()]

    args.context_data_paths = split_paths(args.context_data_paths)
    args.train_data_paths = split_paths(args.train_data_paths)
    args.eval_data_paths = split_paths(args.eval_data_paths)
    args.validity_data_paths = split_paths(args.validity_data_paths)
    args.prompt_dict_path = split_paths(args.prompt_dict_path)

    return args
109
+
110
def print_elegant_args(args):
    """Pretty-print the parsed argument namespace, one aligned line per key."""
    params = vars(args)
    ordered = sorted(params)
    # Pad key names to the longest one so the colons line up.
    width = max((len(name) for name in ordered), default=10)

    # ANSI colour codes (set these to "" to disable colouring).
    cyan = "\033[36m"    # keys
    yellow = "\033[33m"  # header
    reset = "\033[0m"

    print(f"\n{yellow}Arguments:{reset}")
    for name in ordered:
        # Left-align the key inside the padded field; print the value in full.
        print(f" {cyan}{name:<{width}}{reset} : {params[name]}")
    print()  # trailing blank line
v0_core/data/__init__.py ADDED
File without changes
v0_core/data/collator.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def meta_collate_fn(batch):
    """Collate a list of meta-learning tasks into one flat batch.

    Prompts from every task are concatenated into a single list and labels
    into a single tensor; per-task (start, len) offsets plus the task's
    bookkeeping fields go into `metadata` so tasks can be sliced back out.
    """
    flat_prompts = []
    label_chunks = []
    metadata = []
    offset = 0

    for task in batch:
        count = len(task['prompts'])
        flat_prompts += task['prompts']
        label_chunks.append(task['labels'])
        metadata.append({
            'start': offset,       # index of this task's first prompt in the flat list
            'len': count,
            'split': task['split_idx'],
            'q_ids': task['q_ids'],
            'pair_ids': task['pair_ids'],
            'pair_types': task['pair_types'],
            'key': task['key'],
            'stats': task['stats'],
        })
        offset += count

    return {
        'flat_prompts': flat_prompts,
        'flat_labels': torch.cat(label_chunks),
        'metadata': metadata,
    }
v0_core/data/dataset.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import torch
3
+ import numpy as np
4
+ import random
5
+ from collections import defaultdict
6
+ from torch.utils.data import Dataset
7
+ from v0_core.data.utils import load_jsonl_lines
8
+
9
+ # =============================================================================
10
+ # 数据与日志工具
11
+ # =============================================================================
12
class ValueModelDataset(Dataset):
    """Meta-learning dataset pairing a context (support) pool with query pools.

    Builds an index of context samples keyed by (dataset, model, step), loads
    and groups query samples by the same key, and chunks them into tasks via
    generate_tasks() (defined further down in this class).
    """

    def __init__(self,
                 context_paths,
                 query_paths,
                 prompt_dict_path,
                 label_strategy='binary',
                 query_batch_size=8,
                 support_size=256,
                 mode='train'):
        """
        args:
            context_paths: List of paths to context_pool jsonl files
            query_paths: List of paths to query_pool jsonl files (train/test/validity)
            prompt_dict_path: List of paths to prompt dictionaries
            label_strategy: label processing strategy ('binary' or 'minmax_norm')
            query_batch_size: Number of queries in one forward pass (all from same step)
            support_size: Number of context samples to sample
            mode: 'train' (shuffle queries before chunking) or 'eval' (sequential)
        """
        self.label_strategy = label_strategy
        self.query_batch_size = query_batch_size
        self.support_size = support_size
        self.mode = mode

        # 1. Load Prompt Dictionary (maps "<dataset>_<id>" -> prompt text).
        print(f"Loading prompts from {prompt_dict_path}...")
        self.prompt_map = {}
        for path in prompt_dict_path:
            with open(path, 'r', encoding='utf-8') as f:
                self.prompt_map.update(json.load(f))

        # 2. Load Context Pool and Index it
        # Structure: {(dataset, model, step): [list of sample dicts]}
        # A secondary fallback index keyed by (model, step) is kept for keys
        # with no dataset-specific context.
        print("Loading Context Pool...")
        self.context_pool = defaultdict(list)
        self.context_pool_fallback = defaultdict(list)
        raw_context = load_jsonl_lines(context_paths)
        for item in raw_context:
            key = (item['dataset'], item['model'], item['step'])
            self.context_pool[key].append(item)
            fallback_key = (item['model'], item['step'])
            self.context_pool_fallback[fallback_key].append(item)
        print(f"Loaded Context Pool with {len(self.context_pool)} unique (dataset, model, step) keys.")

        # 3. Load Query Pool
        print(f"Loading Query Pool from {query_paths}...")
        raw_queries = load_jsonl_lines(query_paths)

        # 4. Group Queries by Key
        self.queries_by_key = defaultdict(list)
        print("Grouping Queries...")
        for item in raw_queries:
            key = (item['dataset'], item['model'], item['step'])
            # Pre-fetch prompt text to save time later, if ID exists
            s_id_str = f"{item['dataset']}_{item['id']}"
            # NOTE(review): direct lookup raises KeyError when an id is missing
            # from the prompt dictionary — confirm that is the intended contract.
            item['text'] = self.prompt_map[s_id_str]
            self.queries_by_key[key].append(item)

        # 5. Pre-calculate Class Statistics for Context-Aware Re-weighting
        # Count positives/negatives per context key in the query pool; used to
        # weight the loss per class.
        print("Calculating Global Context Statistics for Re-weighting...")
        self.context_stats = {}
        if mode == 'train':
            for key, items in self.queries_by_key.items():
                # Positive sample definition: score >= 0
                n_pos = sum(1 for x in items if float(x.get('score', -1)) >= 0)
                n_neg = len(items) - n_pos
                self.context_stats[key] = {'n_pos': n_pos, 'n_neg': n_neg}

        # Debug summary of the first 10 keys (table is empty in 'eval' mode,
        # where context_stats is never populated).
        print("\n" + "="*60)
        print(f"Top 10 Steps Statistics ({mode} mode)")
        print(f"{'Dataset':<15} | {'Model':<15} | {'Step':<6} | {'n_pos':<6} | {'n_neg':<6} | {'Total':<6}")
        print("-" * 60)

        sorted_keys = sorted(list(self.context_stats.keys()))

        for i, key in enumerate(sorted_keys[:10]):

            dataset_name, model_name, step_val = key
            stats = self.context_stats[key]
            total = stats['n_pos'] + stats['n_neg']
            print(f"{dataset_name:<15} | {model_name:<15} | {str(step_val):<6} | "
                  f"{stats['n_pos']:<6} | {stats['n_neg']:<6} | {total:<6}")

        print(f"... (Total {len(sorted_keys)} steps loaded)")
        print("="*60 + "\n")

        # 6. Create Tasks (Chunks of Queries)
        self.tasks = []
        self.generate_tasks(shuffle=(self.mode == 'train'))

        print(f"Dataset Initialized. Total Tasks: {len(self.tasks)}")
+
104
def generate_tasks(self, shuffle=True):
    """
    Build the task list (chunks of queries) from ``self.queries_by_key``.

    In 'train' mode, tasks are built pairwise with cyclic oversampling: no
    sample is dropped; the smaller of the positive/negative pools is reused
    cyclically to match the larger one.  In any other mode, samples are
    simply chunked in order.

    Args:
        shuffle: Randomize key order and per-key sample order (train epochs).

    Side effects:
        Replaces ``self.tasks`` with the newly generated task dicts.
    """

    def resolve_context_key(key):
        # Prefer the exact (dataset, model, step) context pool; fall back to
        # the coarser (model, step) pool when the exact one is empty/missing.
        # (This lookup was previously duplicated in both branches below.)
        if self.context_pool.get(key):
            return key
        fallback_key = (key[1], key[2])  # (model, step)
        if self.context_pool_fallback.get(fallback_key):
            return fallback_key
        return None

    new_tasks = []
    keys = sorted(list(self.queries_by_key.keys()))
    if shuffle:
        random.shuffle(keys)

    dropped_steps = 0  # steps skipped because one class is entirely missing
    total_pairs = 0

    for key in keys:
        samples = list(self.queries_by_key[key])

        if self.mode == 'train':
            # 1. Split into positives / negatives by the processed label.
            pos_list = [x for x in samples if self._process_label(x['score']) >= 0.5]
            neg_list = [x for x in samples if self._process_label(x['score']) < 0.5]
            n_pos, n_neg = len(pos_list), len(neg_list)

            # 2. A pair needs one of each; skip steps with only one class.
            if n_pos == 0 or n_neg == 0:
                dropped_steps += 1
                continue

            # 3. Shuffle so the cyclically reused samples differ per epoch.
            if shuffle:
                random.shuffle(pos_list)
                random.shuffle(neg_list)

            # 4. Cyclic oversampling: take max(n_pos, n_neg) pairs so every
            #    sample is used at least once.
            n_pairs = max(n_pos, n_neg)
            paired_samples = []
            for i in range(n_pairs):
                paired_samples.append(pos_list[i % n_pos])
                paired_samples.append(neg_list[i % n_neg])
            total_pairs += n_pairs

            # 5. Chunking; the per-task batch size must be even (and >= 2)
            #    so every chunk holds whole pairs.
            bs = self.query_batch_size
            if bs % 2 != 0:
                bs -= 1
            if bs < 2:
                bs = 2

            for i in range(0, len(paired_samples), bs):
                chunk = paired_samples[i:i + bs]

                # Drop a trailing incomplete pair (odd-length final chunk).
                if len(chunk) % 2 != 0:
                    chunk = chunk[:-1]

                context_key_to_use = resolve_context_key(key)
                if len(chunk) > 0 and context_key_to_use is not None:
                    new_tasks.append({
                        'key': key,
                        'context_key': context_key_to_use,
                        'queries': chunk,
                        'is_pairwise': True
                    })

        else:
            if shuffle:
                random.shuffle(samples)
            for i in range(0, len(samples), self.query_batch_size):
                chunk = samples[i:i + self.query_batch_size]
                context_key_to_use = resolve_context_key(key)
                if context_key_to_use is not None:
                    new_tasks.append({
                        'key': key,
                        'context_key': context_key_to_use,
                        'queries': chunk,
                        'is_pairwise': False
                    })

    self.tasks = new_tasks
    if self.mode == 'train':
        print(f" >>> [Dataset] Generated {len(self.tasks)} tasks from {len(keys)} contexts.")
        print(f" >>> [Pairwise Stats] Total Pairs: {total_pairs} (Using Oversampling). Dropped Steps (0 pos or 0 neg): {dropped_steps}")
207
+
208
def _process_label(self, reward):
    """Map a raw reward to a training label per ``self.label_strategy``."""
    score = float(reward)
    strategy = self.label_strategy
    if strategy == "binary":
        # Non-negative rewards count as the positive class.
        return 1.0 if score >= 0 else 0.0
    if strategy == "minmax_norm":
        # Clamp to [-1, 1], then rescale linearly onto [0, 1].
        return (np.clip(score, -1.0, 1.0) + 1.0) / 2.0
    # Unknown strategy: pass the raw value through unchanged.
    return score
215
+
216
def __len__(self):
    """Number of generated tasks (one task == one dataloader item)."""
    return len(self.tasks)
218
+
219
def __getitem__(self, idx):
    """Assemble one task: a randomly sampled support set followed by the
    task's fixed query chunk, flattened into parallel prompt/label lists."""
    task = self.tasks[idx]
    key = task['key']  # (dataset, model, step)
    query_samples = task['queries']

    # --- 1. Sample the support (context) set --------------------------------
    context_key = task.get('context_key', key)
    if context_key == key:
        available_context = self.context_pool[key]
    else:
        available_context = self.context_pool_fallback[context_key]

    if len(available_context) >= self.support_size:
        support_samples = random.sample(available_context, self.support_size)
    else:
        support_samples = available_context

    # --- 2. Format support + query into flat prompt/label lists -------------
    prompts, labels = [], []

    for item in support_samples:
        s_id_str = f"{item['dataset']}_{item['id']}"
        text = self.prompt_map[s_id_str]
        if text:
            prompts.append(text)
            labels.append(self._process_label(item['score']))

    split_idx = len(prompts)  # boundary between support and query sections

    q_ids, pair_ids, pair_types = [], [], []
    for item in query_samples:
        prompts.append(item['text'])
        labels.append(self._process_label(item['score']))
        q_ids.append(item['id'])
        if 'pair_id' in item:
            pair_ids.append(item['pair_id'])
        if 'pair_type' in item:
            pair_types.append(item['pair_type'])

    # Global pos/neg counts for this context (used for loss re-weighting).
    stats = self.context_stats.get(key, {'n_pos': 0, 'n_neg': 0})

    return {
        "prompts": prompts,
        "labels": torch.tensor(labels, dtype=torch.float),
        "split_idx": split_idx,
        "q_ids": q_ids,
        "pair_ids": pair_ids,
        "pair_types": pair_types,
        "key": key,
        "stats": stats,  # passed through to the collate function
    }
v0_core/data/utils.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
def load_jsonl_lines(paths):
    """Read one or more JSONL files and merge all records into a single list.

    A single path may be passed instead of a list.  Missing paths are skipped
    with a warning; a file that fails mid-read is reported and abandoned, but
    records already collected (from it or earlier files) are kept.
    """
    if not isinstance(paths, list):
        paths = [paths]
    all_lines = []
    for p in paths:
        if not p or not os.path.exists(p):
            print(f"Warning: Path not found {p}")
            continue
        print(f"Loading {p}...")
        try:
            with open(p, 'r', encoding='utf-8') as f:
                for raw in f:
                    raw = raw.strip()
                    if raw:  # skip blank lines
                        all_lines.append(json.loads(raw))
        except Exception as e:
            print(f"Error reading {p}: {e}")
    return all_lines
v0_core/models/__init__.py ADDED
File without changes
v0_core/models/v0.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch import Tensor
5
+ from transformers import AutoTokenizer, AutoModel
6
+
7
# =============================================================================
# TabPFN monkey-patches
# =============================================================================
# TabPFNClassifier is a hard dependency of this module: fail fast at import
# time with an actionable message instead of raising later at call sites.
try:
    from tabpfn import TabPFNClassifier
except ImportError as e:
    print(f"导入 TabPFN 模块失败: {e}")
    print("请确保已安装 tabpfn,并且处于包含 tabpfn 源代码的环境中。")
    exit(1)

from v0_core.utils.tabpfn_patches import fixed_fit, fixed_forward

# Apply patches: replace TabPFN's fit/forward class-wide with fixed versions
# (the patches address fitting with differentiable_input=True; see
# v0_core/utils/tabpfn_patches.py).  This affects every TabPFNClassifier
# instance created after this module is imported.
TabPFNClassifier.fit = fixed_fit
TabPFNClassifier.forward = fixed_forward
# print("Applied final fit/forward fix patches to TabPFNClassifier.")
23
+
24
+ # =============================================================================
25
+ # Qwen Official Pooling
26
+ # =============================================================================
27
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Qwen-official pooling: select each sequence's final *attended* token.

    With left padding every sequence ends at position -1; with right padding
    the last real token sits at (number of attended positions - 1).
    """
    # Left padding iff every row's final position is attended.
    if bool(attention_mask[:, -1].sum() == attention_mask.shape[0]):
        return last_hidden_states[:, -1]
    last_positions = attention_mask.sum(dim=1) - 1
    rows = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[rows, last_positions]
35
+
36
+ # =============================================================================
37
+ # Adapter 策略模块
38
+ # =============================================================================
39
class FixedQueryAdapter(nn.Module):
    """Pools a token sequence via cross-attention from a fixed set of learned
    query vectors; the output is the flattened attention result of shape
    (batch, num_queries * embed_dim)."""

    def __init__(self, input_dim, num_queries=10, embed_dim=32, num_heads=4):
        super().__init__()
        self.proj_kv = nn.Linear(input_dim, embed_dim)
        self.queries = nn.Parameter(torch.randn(1, num_queries, embed_dim))
        self.mha = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, batch_first=True)
        self.ln_q = nn.LayerNorm(embed_dim)
        self.ln_kv = nn.LayerNorm(embed_dim)

    def forward(self, hidden_states, attention_mask=None):
        n = hidden_states.size(0)
        projected = self.proj_kv(hidden_states)
        tiled_queries = self.queries.repeat(n, 1, 1)
        if attention_mask is None:
            padding_mask = None
        else:
            # MultiheadAttention expects True at *padding* positions.
            padding_mask = ~attention_mask.bool()
        pooled, _ = self.mha(
            query=self.ln_q(tiled_queries),
            key=self.ln_kv(projected),
            value=projected,
            key_padding_mask=padding_mask,
        )
        return pooled.reshape(n, -1)
55
+
56
class DynamicQueryAdapter(nn.Module):
    """Like FixedQueryAdapter, but the query vectors are input-conditioned:
    a generator MLP produces a per-sample delta that is added to a set of
    static learned queries.  The generator's final layer is zero-initialized
    so training starts exactly at the static-query behaviour."""

    def __init__(self, input_dim, num_queries=10, embed_dim=32, num_heads=4,
                 generator_bottleneck_dim=128, generator_dropout_rate=0.2):
        super().__init__()
        self.num_queries = num_queries
        self.embed_dim = embed_dim
        self.static_queries = nn.Parameter(torch.randn(1, num_queries, embed_dim))
        self.generator = nn.Sequential(
            nn.Linear(input_dim, generator_bottleneck_dim),
            nn.LayerNorm(generator_bottleneck_dim),
            nn.GELU(),
            nn.Dropout(generator_dropout_rate),
            nn.Linear(generator_bottleneck_dim, num_queries * embed_dim)
        )
        # Zero-init the delta head: at step 0 this adapter reduces to the
        # purely static query pooling.
        nn.init.zeros_(self.generator[-1].weight)
        nn.init.zeros_(self.generator[-1].bias)
        self.proj_kv = nn.Linear(input_dim, embed_dim)
        self.mha = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, batch_first=True)
        self.ln_q = nn.LayerNorm(embed_dim)
        self.ln_kv = nn.LayerNorm(embed_dim)

    def forward(self, hidden_states, attention_mask):
        n = hidden_states.size(0)
        # Condition the queries on a global summary of the sequence.
        summary = last_token_pool(hidden_states, attention_mask)
        delta = self.generator(summary).view(n, self.num_queries, self.embed_dim)
        conditioned = self.static_queries.repeat(n, 1, 1) + delta
        projected = self.proj_kv(hidden_states)
        padding_mask = ~attention_mask.bool() if attention_mask is not None else None
        pooled, _ = self.mha(
            query=self.ln_q(conditioned),
            key=self.ln_kv(projected),
            value=projected,
            key_padding_mask=padding_mask,
        )
        return pooled.reshape(n, -1)
85
+
86
+ # =============================================================================
87
+ # Qwen Embedding 模型封装
88
+ # =============================================================================
89
class QwenEmbeddingModel(nn.Module):
    """Qwen embedding backbone plus a pooling adapter.

    Maps a list of prompt strings to an (N, D) embedding tensor.  The
    pooling adapter is selected by ``pooling_type``:
      - 'last_token':    Qwen's official parameter-free last-token pooling
      - 'fixed_query':   learned cross-attention pooling (static queries)
      - 'dynamic_query': learned pooling with input-conditioned queries
    """

    def __init__(self, model_path, pooling_type='last_token', num_queries=10, embed_dim=32,
                 reduce_method='avg_pool', target_dim=1024, num_heads=4, generator_bottleneck_dim=128, generator_dropout_rate=0.2, device='cuda'):
        super().__init__()
        self.device = device
        self.pooling_type = pooling_type
        self.reduce_method = reduce_method  # 'avg_pool' | 'max_pool'; applied only when wider than target_dim
        self.target_dim = target_dim

        # Left padding so the final position always holds a real token
        # (required by last-token pooling).
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side='left')
        self.backbone = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(device)
        self.backbone.train()

        # no_grad has no effect on a plain config attribute read; kept as-is.
        with torch.no_grad(): hidden_size = self.backbone.config.hidden_size

        if self.pooling_type == 'fixed_query':
            self.adapter_layer = FixedQueryAdapter(input_dim=hidden_size, num_queries=num_queries, embed_dim=embed_dim, num_heads=num_heads).to(device)
        elif self.pooling_type == 'dynamic_query':
            self.adapter_layer = DynamicQueryAdapter(input_dim=hidden_size, num_queries=num_queries, embed_dim=embed_dim, num_heads=num_heads, generator_bottleneck_dim=generator_bottleneck_dim, generator_dropout_rate=generator_dropout_rate).to(device)
        elif self.pooling_type == 'last_token':
            # Parameter-free function, not an nn.Module.
            self.adapter_layer = last_token_pool

    def forward(self, prompts, batch_size=32):
        """Encode ``prompts`` in mini-batches; returns a (len(prompts), D) tensor."""
        embeddings = []
        for i in range(0, len(prompts), batch_size):
            batch_prompts = prompts[i : i + batch_size]
            batch_dict = self.tokenizer(batch_prompts, max_length=2048, padding=True, truncation=True, return_tensors="pt").to(self.device)
            # Backbone features are computed grad-free; the adapter call below
            # sits outside the no_grad scope and stays differentiable.
            # NOTE(review): indentation reconstructed from an indentation-lossy
            # diff -- confirm the intended no_grad scope against the original.
            with torch.no_grad():
                outputs = self.backbone(**batch_dict)
            last_hidden_state = outputs.last_hidden_state
            emb = self.adapter_layer(last_hidden_state, batch_dict['attention_mask'])

            # Shrink over-wide adapter outputs down to target_dim.
            if self.reduce_method == 'avg_pool' and emb.shape[1] > self.target_dim:
                emb = F.adaptive_avg_pool1d(emb.unsqueeze(1), self.target_dim).squeeze(1)
            elif self.reduce_method == 'max_pool' and emb.shape[1] > self.target_dim:
                emb = F.adaptive_max_pool1d(emb.unsqueeze(1), self.target_dim).squeeze(1)
            embeddings.append(emb)
        return torch.cat(embeddings, dim=0)
127
+
128
+
129
class V0:
    """Generalist value model V0.

    Combines a fine-tuned Qwen embedding model with a TabPFN classifier head
    used as an in-context learner: ``predict`` fits TabPFN on an embedded
    support set and then scores embedded query prompts.
    """

    def __init__(self, embedding_model, tabpfn_model, device):
        # embedding_model: QwenEmbeddingModel mapping List[str] -> Tensor
        # tabpfn_model:    TabPFNClassifier used as the scoring head
        self.embedding_model = embedding_model
        self.tabpfn = tabpfn_model
        self.device = device

    @classmethod
    def from_pretrained(cls,
                        checkpoint_path,
                        embedding_model_path,
                        tabpfn_head_path,
                        device="cuda",
                        num_queries=168,
                        embed_dim=6,
                        num_heads=3,
                        bottleneck_dim=128,
                        tabpfn_estimators=4):
        """Construct a V0 from saved weights.

        Args:
            checkpoint_path: .pt file holding 'model_state_dict' for the
                embedding model (adapter and possibly backbone weights).
            embedding_model_path: HF repo/dir of the Qwen embedding backbone.
            tabpfn_head_path: TabPFN checkpoint file path.
            device: torch device string.
            num_queries / embed_dim / num_heads / bottleneck_dim: adapter
                hyper-parameters.  NOTE(review): pooling_type is left at
                QwenEmbeddingModel's 'last_token' default here, in which case
                these adapter hyper-parameters are unused -- confirm intended.
            tabpfn_estimators: TabPFN ensemble size.
        """

        # 1. Initialize Embedding Model (Qwen + Adapter)
        embedding_model = QwenEmbeddingModel(
            model_path=embedding_model_path,
            num_queries=num_queries,
            embed_dim=embed_dim,
            num_heads=num_heads,
            generator_bottleneck_dim=bottleneck_dim,
            generator_dropout_rate=0.0,  # Dropout not needed for inference
            device=device
        )

        # 2. Load Trained Weights (Adapter + potentially Backbone)
        # NOTE(review): torch.load without weights_only=True unpickles
        # arbitrary objects -- only load trusted checkpoints.
        ckpt = torch.load(checkpoint_path, map_location=device)
        state_dict = ckpt['model_state_dict']

        # Clean DDP 'module.' prefix if present
        if list(state_dict.keys())[0].startswith('module.'):
            state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}

        # Load weights (strict=False: the checkpoint may cover only a subset
        # of the model's parameters, e.g. adapter-only training)
        msg = embedding_model.load_state_dict(state_dict, strict=False)

        # 3. Initialize TabPFN
        tabpfn = TabPFNClassifier(
            model_path=tabpfn_head_path,
            device=device,
            n_estimators=tabpfn_estimators,
            inference_precision=torch.float32,
            differentiable_input=True  # As per training script
        )
        # Manual init to ensure weights are loaded (private TabPFN API --
        # relies on the patched fit/forward applied at module import)
        tabpfn._initialize_model_variables()

        return cls(embedding_model, tabpfn, device)

    def predict(self, context_prompts, context_labels, target_prompts, batch_size=32):
        """
        Args:
            context_prompts: List[str] - Support Set Texts
            context_labels: List[float] - Support Set Scores (0.0 to 1.0)
            target_prompts: List[str] - Query Set Texts to be scored
            batch_size: embedding mini-batch size for the support set
        Returns:
            scores: List[float] - Predicted scores (probability of class 1)
        """
        # 1. Encode Context (Support Set)
        X_sup = self.embedding_model(context_prompts, batch_size=batch_size)

        # 2. Process Labels (Training script logic: >= 0.5 is Positive)
        y_sup = torch.tensor(context_labels, device=self.device)
        y_sup_hard = (y_sup >= 0.5).long()  # Convert to class indices 0 or 1

        # 3. Fit TabPFN (In-Context Learning)
        # TabPFN learns from this specific batch of context
        self.tabpfn.fit(X_sup, y_sup_hard)

        # 4. Encode Targets (Query Set)
        # NOTE(review): uses the embedding model's default batch size here,
        # not the `batch_size` argument -- confirm intended.
        X_que = self.embedding_model(target_prompts)

        # 5. Predict
        # use_inference_mode=True as per eval logic in run_epoch
        with torch.no_grad():
            logits = self.tabpfn.forward(X_que, use_inference_mode=True, return_logits=True)
            probs = torch.softmax(logits, dim=1)

        # Return probability of the positive class (class 1)
        # If batch size is 1, output might be squeezed, handling that:
        if probs.dim() == 1:
            return [probs[1].item()]
        else:
            return probs[:, 1].tolist()
v0_core/utils/__init__.py ADDED
File without changes
v0_core/utils/checkpoint.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import glob
4
+ import torch
5
+
6
+ # =============================================================================
7
+ # Checkpoint 管理器
8
+ # =============================================================================
9
class CheckpointManager:
    """Saves, rotates, and restores training checkpoints.

    Only the master rank writes to disk; saving is atomic (write to a temp
    file + fsync + os.replace) so a crash never leaves a truncated
    checkpoint where a valid one is expected.
    """

    def __init__(self, checkpoint_dir, max_keep=2, is_master=False):
        self.checkpoint_dir = checkpoint_dir
        self.max_keep = max_keep      # number of newest checkpoints to retain
        self.is_master = is_master    # only the master rank touches disk
        if self.is_master and self.checkpoint_dir:
            os.makedirs(self.checkpoint_dir, exist_ok=True)

    @staticmethod
    def _extract_epoch(filename):
        """Parse the epoch out of 'checkpoint_epoch_XXXX.pt'; -1 if absent."""
        try:
            match = re.search(r"epoch_(\d+)", filename)
            return int(match.group(1)) if match else -1
        except Exception:
            return -1

    def _list_checkpoints(self):
        """Existing checkpoint files (tmp files excluded), sorted by epoch."""
        files = glob.glob(os.path.join(self.checkpoint_dir, "checkpoint_epoch_*.pt"))
        files = [f for f in files if not f.endswith('.tmp')]
        files.sort(key=self._extract_epoch)
        return files

    def save(self, model, optimizer, scheduler, epoch, args, wandb_run_id=None):
        """Atomically persist the training state for ``epoch``, then rotate
        old checkpoints.  No-op on non-master ranks."""
        if not self.is_master or not self.checkpoint_dir:
            return
        raw_model = model.module if hasattr(model, 'module') else model  # unwrap DDP
        state = {
            'epoch': epoch,
            'model_state_dict': raw_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict() if optimizer else None,
            'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
            'args': vars(args),
            'wandb_run_id': wandb_run_id
        }
        filename = f"checkpoint_epoch_{epoch:04d}.pt"
        filepath = os.path.join(self.checkpoint_dir, filename)
        tmp_filepath = filepath + ".tmp"
        print(f">> Saving Checkpoint to {filepath} (Atomic)...")
        try:
            # 1. Write to a temp file first.
            torch.save(state, tmp_filepath)
            # 2. Force the data to disk before the rename.
            if os.path.exists(tmp_filepath):
                with open(tmp_filepath, 'rb') as f:
                    os.fsync(f.fileno())
            # 3. Atomic rename: before it, the old file survives a crash;
            #    after it, the new one is in effect.
            os.replace(tmp_filepath, filepath)
        except Exception as e:
            print(f"Error saving checkpoint: {e}")
            if os.path.exists(tmp_filepath):
                os.remove(tmp_filepath)
            return
        self._rotate_checkpoints()

    def _rotate_checkpoints(self):
        """Delete all but the newest ``max_keep`` checkpoints."""
        files = self._list_checkpoints()
        if len(files) > self.max_keep:
            for f in files[: -self.max_keep]:
                try:
                    print(f"Removing old checkpoint: {f}")
                    os.remove(f)
                except OSError as e:
                    print(f"Error removing {f}: {e}")

    def find_latest_epoch_num(self):
        """Highest epoch number among saved checkpoints, or 0 if none."""
        if not self.checkpoint_dir or not os.path.exists(self.checkpoint_dir):
            return 0
        files = self._list_checkpoints()
        if not files:
            return 0
        return self._extract_epoch(files[-1])

    def load_specific_epoch(self, target_epoch, model, optimizer, scheduler, device):
        """Restore model/optimizer/scheduler state from ``target_epoch``.

        Returns:
            (next_epoch, wandb_run_id).  BUGFIX: the previous version
            returned a bare int (1) for target_epoch <= 0 while the normal
            path returned a 2-tuple, crashing callers that unpack both
            values; it now consistently returns a 2-tuple.

        Raises:
            FileNotFoundError: if the checkpoint file never appears.
        """
        if target_epoch <= 0:
            return 1, None
        filename = f"checkpoint_epoch_{target_epoch:04d}.pt"
        filepath = os.path.join(self.checkpoint_dir, filename)
        if not os.path.exists(filepath):
            import time
            # Shared filesystems can lag behind the rank that wrote the file.
            print(f">> [Warning] Checkpoint {filepath} not found immediately. Waiting for FS sync...")
            time.sleep(5)
            if not os.path.exists(filepath):
                raise FileNotFoundError(f"Checkpoint {filepath} does not exist.")
        print(f">> Resuming from checkpoint: {filepath}")
        checkpoint = torch.load(filepath, map_location=device)

        state_dict = checkpoint['model_state_dict']
        raw_model = model.module if hasattr(model, 'module') else model

        # Reconcile a possible 'module.' prefix mismatch (saved with DDP but
        # loaded bare, or vice versa).
        model_keys = set(raw_model.state_dict().keys())
        ckpt_keys = set(state_dict.keys())
        if list(model_keys)[0].startswith('module.') and not list(ckpt_keys)[0].startswith('module.'):
            state_dict = {f"module.{k}": v for k, v in state_dict.items()}
        elif not list(model_keys)[0].startswith('module.') and list(ckpt_keys)[0].startswith('module.'):
            state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}

        raw_model.load_state_dict(state_dict)

        if optimizer is not None and 'optimizer_state_dict' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if scheduler is not None and 'scheduler_state_dict' in checkpoint:
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

        start_epoch = checkpoint['epoch'] + 1
        wandb_id = checkpoint.get('wandb_run_id', None)

        print(f"✅ Successfully resumed. Next epoch: {start_epoch}. WandB ID: {wandb_id}")
        return start_epoch, wandb_id
v0_core/utils/metrics.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from collections import defaultdict
4
+ from sklearn.metrics import roc_auc_score, accuracy_score
5
+ import torch.distributed as dist
6
+
7
def append_jsonl(path, data):
    """Append ``data`` as one JSON line to ``path``; errors are logged, not raised."""
    try:
        with open(path, 'a', encoding='utf-8') as fh:
            fh.write(json.dumps(data, ensure_ascii=False) + '\n')
    except Exception as err:
        # Best-effort logging sink: never let metric logging kill training.
        print(f"Error appending to jsonl: {err}")
13
+
14
+ # =============================================================================
15
+ # Global Metrics Calculation & Aggregation
16
+ # =============================================================================
17
def calculate_metrics_by_group(all_results, phase, epoch, is_master=True, output_dir=None, dataset_name_tag="", avg_loss=None):
    """Gather per-sample prediction records from all DDP ranks and compute
    global, pairwise, and per-step metrics; optionally write logs to disk.

    Each record in ``all_results`` is a dict with at least 'label' (float),
    'prob' (positive-class score), 'pred' (hard 0/1 prediction), 'step',
    and optionally 'pair_id' linking a positive/negative sample pair.

    Collective call: every rank must enter this function (all_gather_object);
    non-master ranks return {} right after the gather.  Metrics that are
    undefined (no pairs, single-class AUC) are reported as -1.
    """

    # 1. Gather from all ranks
    world_size = dist.get_world_size()
    gathered_results = [None for _ in range(world_size)]
    dist.all_gather_object(gathered_results, all_results)

    if not is_master:
        return {}

    # Flatten list of lists (one sub-list per rank)
    flat_results = []
    for rank_res in gathered_results:
        flat_results.extend(rank_res)

    print(f"[{phase}] Collected {len(flat_results)} samples for evaluation.")

    if len(flat_results) == 0:
        return {}

    metrics_summary = {"epoch": epoch}
    # =========================================================================
    # Part A: Pair-wise Metrics Calculation (Global)
    # =========================================================================

    # Bucket samples sharing a pair_id into positive/negative halves.
    pair_grouping = defaultdict(lambda: {'pos': [], 'neg': []})

    for r in flat_results:
        pid = r.get('pair_id')
        if pid is not None:
            if r['label'] >= 0.5:
                pair_grouping[pid]['pos'].append(r)
            else:
                pair_grouping[pid]['neg'].append(r)

    valid_pairs = []

    # A pair only counts when exactly one positive and one negative share it.
    for pid, group in pair_grouping.items():
        if len(group['pos']) == 1 and len(group['neg']) == 1:
            valid_pairs.append((group['pos'][0], group['neg'][0]))

    total_valid_pairs = len(valid_pairs)
    strict_pair_correct_count = 0
    rlhf_pair_correct_count = 0

    # 3. Calculate Global Pair Metrics
    for pos_item, neg_item in valid_pairs:
        # Strict: both hard predictions must be right.
        if (pos_item['pred'] == 1) and (neg_item['pred'] == 0):
            strict_pair_correct_count += 1
        # RLHF-style: the positive merely needs a higher score.
        if pos_item['prob'] > neg_item['prob']:
            rlhf_pair_correct_count += 1

    metrics_summary[f"{phase}/global_strict_pair_acc"] = strict_pair_correct_count / total_valid_pairs if total_valid_pairs > 0 else -1
    metrics_summary[f"{phase}/global_rlhf_pair_acc"] = rlhf_pair_correct_count / total_valid_pairs if total_valid_pairs > 0 else -1
    metrics_summary[f"{phase}/num_valid_pairs"] = total_valid_pairs

    # =========================================================================
    # Part B: Standard Global Metrics (Acc / AUC)
    # =========================================================================
    y_true_binary = [1 if r['label'] >= 0.5 else 0 for r in flat_results]
    y_scores = [r['prob'] for r in flat_results]
    y_preds = [r['pred'] for r in flat_results]

    def get_auc_strict(y_t, y_s):
        # AUC is undefined with a single class; report -1 in that case.
        try:
            return roc_auc_score(y_t, y_s) if len(set(y_t)) > 1 else -1
        except:
            return -1

    g_auc = get_auc_strict(y_true_binary, y_scores)
    metrics_summary[f"{phase}/global_acc"] = accuracy_score(y_true_binary, y_preds)
    metrics_summary[f"{phase}/global_auc"] = g_auc

    if avg_loss is not None:
        metrics_summary[f"{phase}/loss"] = avg_loss

    # =========================================================================
    # Part C: Step-wise Metrics
    # =========================================================================
    step_groups = defaultdict(list)
    for r in flat_results:
        step_groups[r['step']].append(r)

    # Only pairs whose two halves belong to the same step count per-step.
    step_valid_pairs = defaultdict(list)
    for pos_item, neg_item in valid_pairs:
        if pos_item['step'] == neg_item['step']:
            step_valid_pairs[pos_item['step']].append((pos_item, neg_item))

    print(f"[{phase}] Calculating metrics for {len(step_groups)} distinct steps...")

    gauc_weighted_sum = 0.0
    gauc_total_weight = 0.0
    valid_gauc_steps = 0

    step_details_list = []

    for step_val, items in step_groups.items():
        s_true = [1 if x['label'] >= 0.5 else 0 for x in items]
        s_scores = [x['prob'] for x in items]
        s_preds = [x['pred'] for x in items]

        # 1. Basic Step Metrics
        step_acc = accuracy_score(s_true, s_preds)
        step_auc = get_auc_strict(s_true, s_scores)  # -1 if only one class present

        step_record = {
            "step": step_val,
            "count": len(items),
            "acc": step_acc,
            "auc": step_auc
        }

        # 2. Accumulate the sample-count-weighted gAUC terms.
        if step_auc != -1:
            weight = len(items)
            gauc_weighted_sum += step_auc * weight
            gauc_total_weight += weight
            valid_gauc_steps += 1

        # 3. Step Pair Metrics
        pairs_in_step = step_valid_pairs.get(step_val, [])
        n_pairs = len(pairs_in_step)

        if n_pairs > 0:
            s_strict_corr = sum(1 for p, n in pairs_in_step if (p['pred'] == 1 and n['pred'] == 0))
            s_rlhf_corr = sum(1 for p, n in pairs_in_step if p['prob'] > n['prob'])

            step_record["pair_count"] = n_pairs
            step_record["strict_pair_acc"] = s_strict_corr / n_pairs
            step_record["rlhf_pair_acc"] = s_rlhf_corr / n_pairs
        else:
            step_record["pair_count"] = 0
            step_record["strict_pair_acc"] = -1
            step_record["rlhf_pair_acc"] = -1

        step_details_list.append(step_record)

    # Calculate Weighted gAUC (weighted by per-step sample counts)
    final_gauc = gauc_weighted_sum / gauc_total_weight if gauc_total_weight > 0 else -1

    metrics_summary[f"{phase}/gAUC"] = final_gauc
    metrics_summary[f"{phase}/gAUC_valid_steps"] = valid_gauc_steps

    print(f"[{phase}] gAUC: {final_gauc:.4f} (Computed over {valid_gauc_steps} valid steps out of {len(step_groups)})")

    # =========================================================================
    # Part D: Save Logs
    # =========================================================================
    if output_dir:
        # 1. Save Raw Predictions (one JSON record per sample)
        log_filename = f"{phase}_predictions_epoch_{epoch}{dataset_name_tag}.jsonl"
        log_path = os.path.join(output_dir, log_filename)
        valid_pair_ids = set(p[0]['pair_id'] for p in valid_pairs)

        print(f"Saving raw predictions to {log_path}...")
        with open(log_path, 'w', encoding='utf-8') as f:
            for item in flat_results:
                # Mark whether this sample belonged to a counted (1 pos + 1 neg) pair.
                item['is_valid_pair_part'] = item.get('pair_id') in valid_pair_ids
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # 2. Save Global Metrics (one summary row appended per call)
        metric_filename = "all_metrics.jsonl"
        metric_path = os.path.join(output_dir, metric_filename)
        append_jsonl(metric_path, metrics_summary)

        # 3. Save Step-wise Details to a separate file
        step_log_filename = f"{phase}_step_metrics_epoch_{epoch}{dataset_name_tag}.jsonl"
        step_log_path = os.path.join(output_dir, step_log_filename)
        print(f"Saving step-wise metrics to {step_log_path}...")

        # Sort by step for readability (non-int steps sort first)
        step_details_list.sort(key=lambda x: x['step'] if isinstance(x['step'], int) else -1)

        with open(step_log_path, 'w', encoding='utf-8') as f:
            for item in step_details_list:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

    return metrics_summary
v0_core/utils/tabpfn_patches.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import logging
3
+ import numpy as np
4
+ try:
5
+ from tabpfn import TabPFNClassifier
6
+ from tabpfn.base import create_inference_engine, determine_precision
7
+ from tabpfn.utils import infer_random_state
8
+ from tabpfn.classifier import _validate_eval_metric
9
+ from tabpfn.inference import InferenceEngineBatchedNoPreprocessing
10
+ except ImportError as e:
11
+ print(f"导入 TabPFN 模块失败: {e}")
12
+ print("请确保已安装 tabpfn,并且处于包含 tabpfn 源代码的环境中。")
13
+ exit(1)
14
+
15
def fixed_fit(self, X, y) -> "TabPFNClassifier":
    """Patched ``TabPFNClassifier.fit``.

    Fixes the upstream bug where ``ensemble_configs`` is left undefined when
    ``differentiable_input=True``: both init paths now run the dataset
    preprocessing step, and the inference engine is created with
    ``inference_mode`` disabled for differentiable inputs so gradients flow.
    """
    self.eval_metric_ = _validate_eval_metric(self.eval_metric)

    # 'batched' fitting is incompatible with this patched path.
    if self.fit_mode == "batched":
        logging.warning("Switching from 'batched' to 'fit_preprocessors' mode...")
        self.fit_mode = "fit_preprocessors"

    # Reuse already-initialized model state only when the model exists AND
    # differentiable input is requested; otherwise do a full (re)initialization.
    reuse_models = hasattr(self, "models_") and self.differentiable_input
    if reuse_models:
        _, rng = infer_random_state(self.random_state)
        _, _, byte_size = determine_precision(self.inference_precision, self.devices_)
    else:
        byte_size, rng = self._initialize_model_variables()

    # Common to both paths — this is the call the original upstream code
    # skipped in the differentiable branch, leaving ensemble_configs unset.
    ensemble_configs, X, y = self._initialize_dataset_preprocessing(X, y, rng)

    self._maybe_calibrate_temperature_and_tune_decision_thresholds(X=X, y=y)

    self.executor_ = create_inference_engine(
        X_train=X,
        y_train=y,
        models=self.models_,
        ensemble_configs=ensemble_configs,
        cat_ix=self.inferred_categorical_indices_,
        fit_mode=self.fit_mode,
        devices_=self.devices_,
        rng=rng,
        n_preprocessing_jobs=self.n_preprocessing_jobs,
        byte_size=byte_size,
        forced_inference_dtype_=self.forced_inference_dtype_,
        memory_saving_mode=self.memory_saving_mode,
        use_autocast_=self.use_autocast_,
        # Disable torch inference_mode when inputs must stay differentiable.
        inference_mode=not self.differentiable_input,
    )
    return self
50
+
51
def fixed_forward(
    self,
    X: list[torch.Tensor] | torch.Tensor,
    *,
    use_inference_mode: bool = False,
    return_logits: bool = False,
    return_raw_logits: bool = False,
) -> torch.Tensor:
    """Patched forward pass that keeps gradients in standard inference.

    Collects per-chunk estimator outputs, undoes each ensemble member's
    class permutation, then averages temperature-scaled logits (or returns
    raw/softmaxed values depending on the flags).
    """
    if return_logits and return_raw_logits:
        raise ValueError("Cannot return both logits and raw logits.")

    # Exactly one of these two execution modes must hold.
    standard_engine = not isinstance(
        self.executor_, InferenceEngineBatchedNoPreprocessing
    )
    batched_grad_pass = (
        not use_inference_mode
        and isinstance(self.executor_, InferenceEngineBatchedNoPreprocessing)
        and isinstance(X, list)
    )
    assert standard_engine or batched_grad_pass, "Invalid forward pass."

    if self.fit_mode in ["fit_preprocessors", "batched"]:
        # Toggle torch inference_mode so gradients survive when requested.
        self.executor_.use_torch_inference_mode(use_inference=use_inference_mode)

    chunk_outputs = []
    for raw, cfg in self.executor_.iter_outputs(X, autocast=self.use_autocast_):
        # A 2-D chunk holds a single estimator; normalize to (Samples, Est, Classes).
        single_estimator = raw.ndim == 2
        estimators = raw.unsqueeze(1) if single_estimator else raw
        cfgs = [cfg] if single_estimator else cfg

        columns = []
        for idx, est_cfg in enumerate(cfgs):
            perm = est_cfg.class_permutation
            if perm is None:
                columns.append(estimators[:, idx, : self.n_classes_])
                continue
            if len(perm) != self.n_classes_:
                # Pad a short permutation with identity indices for the rest.
                padded = np.arange(self.n_classes_)
                padded[: len(perm)] = perm
                perm = padded
            columns.append(estimators[:, idx, perm])
        chunk_outputs.append(torch.stack(columns, dim=1))

    stacked = torch.stack(chunk_outputs)  # (Chunks, Samples, Est, Classes)

    # The two flags are mutually exclusive (checked above), so branch order
    # relative to the original is immaterial.
    if return_raw_logits:
        result = stacked
    elif return_logits:
        result = self._apply_temperature(stacked).mean(dim=(0, 2))
    else:
        averaged = self._apply_temperature(stacked).mean(dim=(0, 2))
        result = torch.nn.functional.softmax(averaged, dim=-1)

    if not use_inference_mode:
        if return_logits and result.ndim == 2:
            return result
        if result.ndim == 2:
            result = result.unsqueeze(0)
        result = result.transpose(0, 1).transpose(1, 2)
    elif result.ndim > 2:
        # use_inference_mode is necessarily True in this branch.
        result = result.squeeze(1) if not return_raw_logits else result.squeeze(2)

    return result