File size: 6,083 Bytes
d8a76be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Add a scoring-prefix (evaluation rubric) to the beginning of each row's `chosen_prompt` in a Parquet dataset.

Features
- Idempotent: won't double-prefix rows that already start with the prefix.
- In-place or write-to-new-file.
- Dry-run mode to see how many rows would change.
- Safe handling of NaN / non-string values.

Usage
------
python add_prefix_to_parquet.py \
    --input /home/data/train_20k_label_v5.0safe.parquet \
    --output /home/data/train_20k_label_v5.0safe.prefixed.parquet

# In-place
python add_prefix_to_parquet.py --input /home/data/train_20k_label_v5.0safe.parquet --in-place

# Dry run (no file written)
python add_prefix_to_parquet.py --input /home/data/train_20k_label_v5.0safe.parquet --dry-run

# Custom column name (if needed)
python add_prefix_to_parquet.py --input your.parquet --column chosen_prompt --output your.prefixed.parquet
"""

import argparse
import sys
from pathlib import Path

import pandas as pd

# -------- Prefix to insert --------
# Chinese-language system instruction that is prepended to every prompt cell.
# The text tells the model to act as a reward-model judge for
# "chosen vs. reject" reply pairs and lists four weighted criteria:
# persona consistency (35%), plot continuity and plausibility (35%),
# writing quality and appeal (25%), and safety/compliance (5%). It also asks
# the model to keep the dataset's existing output format unchanged.
#
# NOTE: already_prefixed() and main() detect earlier runs with an exact
# str.startswith(PREFIX) match, so changing even one character of this literal
# makes previously prefixed rows look unprefixed and they get prefixed again.
PREFIX = '系统指令(请严格遵守;不要更改下文任何现有格式或占位符):\n你是一名针对“候选回复对(chosen vs. reject)”的奖励模型评审器。你的目标是基于以下标准,对更符合要求的回复给出更高分数(保持数据集中原有的输出格式与范围,不新增字段名或改变返回结构)。\n\n评分总原则(重要→次要,括号内为建议权重,可在原有评分区间内体现):\n1) 人设一致性(35%):\n   - 说话口吻、价值观、知识边界与“角色设定/世界观”一致;避免 OOC(Out-of-Character)。\n   - 行为动机合理,前后不自相矛盾;对已给定背景信息有正确引用与延续。\n2) 剧情衔接与合理性(35%):\n   - 与已发生情节顺畅衔接,推动剧情或回应当前情境;避免跳戏、无端转场、重复或无效赘述。\n   - 逻辑因果清晰,伏笔与回收、冲突与缓解自然,避免硬拗设定或忽略既有事实。\n3) 文采与吸引力(25%):\n   - 语言有画面感与细节度,节奏拿捏得当(张力、留白、转折);描写具体而不过度堆砌。\n   - 对话生动不尴尬,叙述有层次,避免口水化/模板化表达。\n4) 安全与合规(5%,若数据集另有统一规范,以其为准):\n   - 不输出违法、露骨、仇恨或隐私泄露等明显违规内容;敏感题材应克制且与剧情需要相称。\n\n优先打更高分给下述特征的回复:\n- 更贴合既定“人设与世界观”的措辞与行为;\n- 更好地延续与推进当前剧情、减少读者困惑;\n- 更有感染力与可读性(细节精准、描写节制且有效)。\n\n降低分数或判为较差的情形:\n- OOC、设定违背、事实矛盾或前后冲突;\n- 情节跳脱、无端信息、堆砌辞藻但无实质推进;\n- 空泛、模板化、缺少细节支撑或明显不合规。\n\n请仅据上列标准进行偏好判断与评分;保持与原任务一致的输出格式(包括字段、取值范围、排序/对齐方式等),不要加入额外解释或多余文本。' + "\n\n"  # keep two newlines between prefix and existing content


def already_prefixed(text: str) -> bool:
    """Report whether *text* is a string that begins with ``PREFIX``.

    Any non-string value (``None``, NaN, numbers, ...) yields ``False``.
    """
    # The isinstance test short-circuits, so startswith only runs on real strings.
    return isinstance(text, str) and text.startswith(PREFIX)


def apply_prefix(text):
    """Return *text* with ``PREFIX`` prepended, unless it is already there.

    Args:
        text: A single cell value. Missing scalars (``None``, float NaN,
            ``pd.NA``, ``pd.NaT``, NumPy NaN scalars) are treated as an empty
            string; any other non-string value is converted with ``str()``.

    Returns:
        A string that starts with ``PREFIX``. Input that already starts with
        ``PREFIX`` is returned unchanged, so repeated calls do not stack
        prefixes.
    """
    # pd.isna returns a single bool only for scalars; the is_scalar guard keeps
    # list/array cells away from it (they would produce an array, not a bool).
    # Checking only `float` NaN would let pd.NA through to str() and turn a
    # missing cell into the literal text "<NA>".
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        text = ""
    elif not isinstance(text, str):
        text = str(text)
    if already_prefixed(text):
        return text
    return PREFIX + text


def main(argv=None):
    """Run the command-line tool and return a process exit code.

    Reads the input Parquet file, prepends ``PREFIX`` to every value of the
    selected column that does not already start with it, and writes the result
    either to ``--output`` or back over the input (``--in-place``). With
    ``--dry-run`` only the number of rows that would change is printed.

    Args:
        argv: Argument list to parse. ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        0 on success (including a dry run); 2 on a usage error, a missing
        input file or column, or a read/write failure. Error details are
        printed to stderr.
    """
    parser = argparse.ArgumentParser(description="Add an evaluation prefix to a Parquet dataset's chosen_prompt column.")
    parser.add_argument("--input", required=True, help="Path to input Parquet file.")
    parser.add_argument("--output", help="Path to output Parquet file (omit when using --in-place).")
    parser.add_argument("--column", default="chosen_prompt", help="Column name to prefix. Default: chosen_prompt")
    parser.add_argument("--in-place", action="store_true", help="Modify the input file in place.")
    parser.add_argument("--dry-run", action="store_true", help="Only report how many rows would change; do not write output.")
    args = parser.parse_args(argv)

    in_path = Path(args.input)
    if not in_path.exists():
        print(f"[ERROR] Input file not found: {in_path}", file=sys.stderr)
        return 2

    if args.in_place and args.output:
        print("[ERROR] Use either --in-place or --output, not both.", file=sys.stderr)
        return 2

    if not args.in_place and not args.output and not args.dry_run:
        print("[ERROR] Must specify --output (or use --in-place / --dry-run).", file=sys.stderr)
        return 2

    # Load dataset
    try:
        df = pd.read_parquet(in_path)
    except Exception as e:
        print(f"[ERROR] Failed to read Parquet: {e}", file=sys.stderr)
        return 2

    if args.column not in df.columns:
        print(f"[ERROR] Column '{args.column}' not found. Available columns: {list(df.columns)}", file=sys.stderr)
        return 2

    col = df[args.column]
    # Rows that need the prefix: missing values count as "" and non-strings are
    # compared via their str() form, so both are always selected.
    to_change = ~col.fillna("").astype(str).str.startswith(PREFIX)
    n_change = int(to_change.sum())
    n_total = len(df)

    if args.dry_run:
        print(f"[DRY-RUN] Rows to update: {n_change} / {n_total}")
        return 0

    # Apply the prefix only to the selected rows; other rows are left as-is.
    df.loc[to_change, args.column] = col.loc[to_change].apply(apply_prefix)

    # Pick the destination
    if args.in_place:
        out_path = in_path
    else:
        out_path = Path(args.output).resolve()

    # Write to a sibling temporary file and rename it over the destination.
    # Writing straight to the destination would leave a truncated Parquet file
    # if the write failed part-way, which for --in-place destroys the input.
    tmp_path = None
    try:
        # Resolve so that an in-place run on a symlink replaces the link target,
        # not the link itself.
        final_path = out_path.resolve()
        tmp_path = final_path.with_name(final_path.name + ".tmp")
        # Engine selection is left to pandas (its default picks an installed one).
        df.to_parquet(tmp_path, index=False)
        tmp_path.replace(final_path)
    except Exception as e:
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                # Best-effort cleanup: the temp file may never have been created.
                pass
        print(f"[ERROR] Failed to write Parquet: {e}", file=sys.stderr)
        return 2

    print(f"[OK] Updated file written to: {out_path}")
    print(f"[OK] Rows updated: {n_change} / {n_total}")
    return 0


if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    sys.exit(main())