# sft_zh_tox / build_train.py
# (Uploaded via huggingface_hub — commit e77cf8d, verified; uploader: p1k0)
import os
import json
from pathlib import Path
def normal():
    """Build the Qwen3 "no-thinking" SFT training file.

    Reads the base detox dataset, pairs each example with its
    dataset-specific system prompt, and writes chat-format records
    (system / user / assistant with an empty <think> block) to
    ``qwen3_normal_train.json``.

    Side effects: reads DATA_FILE and per-dataset prompt files,
    writes OUTPUT_FILE, and prints a summary to stdout.
    """
    # Hard-coded input/output paths for this environment.
    DATA_FILE = "/mnt/data/users/liamding/data/sft_zh_tox/data/train_1000.json"
    PROMPT_DIR = "/mnt/data/users/liamding/data/sft_zh_tox/prompt"
    OUTPUT_FILE = "qwen3_normal_train.json"

    # User-turn template (Chinese: "Input: {sentence}").
    USER_TEMPLATE = "输入:{sentence}"

    # Load the raw training examples.
    with open(DATA_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Cache the system prompt for each dataset
    # (prompt file name convention: "<dataset>_prompt.txt").
    dataset_prompts = {}
    for dataset in {item["dataset"] for item in data}:
        prompt_path = os.path.join(PROMPT_DIR, f"{dataset}_prompt.txt")
        with open(prompt_path, 'r', encoding='utf-8') as f:
            dataset_prompts[dataset] = f.read().strip()

    # Build chat-format messages.
    results = []
    cate = {}  # per-dataset example counts, for the summary printout
    for train_id, item in enumerate(data):
        dataset = item["dataset"]
        cate[dataset] = cate.get(dataset, 0) + 1
        user_input = USER_TEMPLATE.format(sentence=item["toxic"].strip())
        messages = [
            {"role": "system", "content": dataset_prompts[dataset]},
            {"role": "user", "content": user_input},
            {
                "role": "assistant",
                # Empty <think> block: trains Qwen3 to answer without reasoning.
                "content": "<think>\n\n</think>\n\n" + item["neutral"],
            },
        ]
        results.append({
            "train_id": train_id,
            "messages": messages,
            "dataset": dataset,
        })

    # Save as a JSON file; only create a directory when the path has one
    # (os.path.dirname of a bare filename is "").
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        Path(out_dir).mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print(f"构建完成,保存到:{OUTPUT_FILE}")
    print(cate)
def r1():
    """Build the Qwen3 reasoning (R1-distilled) SFT training file.

    Reads the deepseek-r1 annotated dataset, pairs each example with its
    dataset-specific system prompt, and writes chat-format records whose
    assistant turn wraps the reasoning trace in <think>...</think> and the
    rewrite in <answer>...</answer>, saved to ``qwen3_r1_train.json``.

    Side effects: reads DATA_FILE and per-dataset prompt files,
    writes OUTPUT_FILE, and prints a summary to stdout.
    """
    # Hard-coded input/output paths for this environment.
    DATA_FILE = "/mnt/data/users/liamding/data/sft_zh_tox/data/deepseek-r1_train.json"
    PROMPT_DIR = "/mnt/data/users/liamding/data/sft_zh_tox/prompt"
    OUTPUT_FILE = "qwen3_r1_train.json"

    # User-turn template (Chinese: "Input: {sentence}").
    USER_TEMPLATE = "输入:{sentence}"

    # Load the raw training examples.
    with open(DATA_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Cache the system prompt for each dataset
    # (prompt file name convention: "<dataset>_prompt.txt").
    dataset_prompts = {}
    for dataset in {item["dataset"] for item in data}:
        prompt_path = os.path.join(PROMPT_DIR, f"{dataset}_prompt.txt")
        with open(prompt_path, 'r', encoding='utf-8') as f:
            dataset_prompts[dataset] = f.read().strip()

    # Build chat-format messages.
    results = []
    cate = {}  # per-dataset example counts, for the summary printout
    for train_id, item in enumerate(data):
        dataset = item["dataset"]
        cate[dataset] = cate.get(dataset, 0) + 1
        user_input = USER_TEMPLATE.format(sentence=item["toxic"].strip())
        # Qwen3 thinking format: reasoning in <think>, final rewrite in <answer>.
        assistant_content = (
            "<think>\n" + item["reason"] + "\n</think>\n\n"
            + "<answer>" + item["rewritten"] + "</answer>"
        )
        messages = [
            {"role": "system", "content": dataset_prompts[dataset]},
            {"role": "user", "content": user_input},
            {"role": "assistant", "content": assistant_content},
        ]
        results.append({
            "train_id": train_id,
            "messages": messages,
            "dataset": dataset,
        })

    # Save as a JSON file; only create a directory when the path has one
    # (os.path.dirname of a bare filename is "").
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        Path(out_dir).mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print(f"构建完成,保存到:{OUTPUT_FILE}")
    print(cate)
# Build both training files only when run as a script, so importing this
# module for reuse does not trigger filesystem I/O.
if __name__ == "__main__":
    normal()
    r1()