| import os |
| import json |
| from pathlib import Path |
|
|
def normal():
    """Build a no-reasoning Qwen3 SFT dataset from the toxic/neutral pairs.

    Reads the raw training examples, pairs each one with the system prompt of
    its source dataset, and writes chat-format records whose assistant turn is
    an empty ``<think>`` block followed by the neutral rewrite.

    Side effects: reads DATA_FILE and one prompt file per dataset, writes
    OUTPUT_FILE, prints a summary.
    """
    DATA_FILE = "/mnt/data/users/liamding/data/sft_zh_tox/data/train_1000.json"
    PROMPT_DIR = "/mnt/data/users/liamding/data/sft_zh_tox/prompt"
    OUTPUT_FILE = "qwen3_normal_train.json"

    USER_TEMPLATE = "输入:{sentence}"

    with open(DATA_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # One system prompt per source dataset, loaded from "<dataset>_prompt.txt".
    dataset_prompts = {}
    for dataset in {item["dataset"] for item in data}:
        prompt_path = os.path.join(PROMPT_DIR, f"{dataset}_prompt.txt")
        with open(prompt_path, 'r', encoding='utf-8') as f:
            dataset_prompts[dataset] = f.read().strip()

    results = []
    cate = {}  # per-dataset example counts, printed at the end
    for train_id, item in enumerate(data):
        dataset = item["dataset"]
        cate[dataset] = cate.get(dataset, 0) + 1

        messages = [
            {"role": "system", "content": dataset_prompts[dataset]},
            {"role": "user",
             "content": USER_TEMPLATE.format(sentence=item["toxic"].strip())},
            {
                "role": "assistant",
                # Empty think block: trains the model to answer directly,
                # without emitting chain-of-thought.
                "content": "<think>\n\n</think>\n\n" + item["neutral"],
            },
        ]

        results.append({
            "train_id": train_id,
            "messages": messages,
            "dataset": dataset,
        })

    # Only create a directory when the output path actually contains one;
    # os.path.dirname of a bare filename is "" and relied on a Path('') quirk.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        Path(out_dir).mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print(f"构建完成,保存到:{OUTPUT_FILE}")
    print(cate)
|
|
def r1():
    """Build a reasoning (R1-style) Qwen3 SFT dataset from distilled examples.

    Reads the DeepSeek-R1 training examples, pairs each one with the system
    prompt of its source dataset, and writes chat-format records whose
    assistant turn contains the reasoning inside ``<think>`` tags and the
    rewrite inside ``<answer>`` tags.

    Side effects: reads DATA_FILE and one prompt file per dataset, writes
    OUTPUT_FILE, prints a summary.
    """
    DATA_FILE = "/mnt/data/users/liamding/data/sft_zh_tox/data/deepseek-r1_train.json"
    PROMPT_DIR = "/mnt/data/users/liamding/data/sft_zh_tox/prompt"
    OUTPUT_FILE = "qwen3_r1_train.json"

    USER_TEMPLATE = "输入:{sentence}"

    with open(DATA_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # One system prompt per source dataset, loaded from "<dataset>_prompt.txt".
    dataset_prompts = {}
    for dataset in {item["dataset"] for item in data}:
        prompt_path = os.path.join(PROMPT_DIR, f"{dataset}_prompt.txt")
        with open(prompt_path, 'r', encoding='utf-8') as f:
            dataset_prompts[dataset] = f.read().strip()

    results = []
    cate = {}  # per-dataset example counts, printed at the end
    for train_id, item in enumerate(data):
        dataset = item["dataset"]
        cate[dataset] = cate.get(dataset, 0) + 1

        messages = [
            {"role": "system", "content": dataset_prompts[dataset]},
            {"role": "user",
             "content": USER_TEMPLATE.format(sentence=item["toxic"].strip())},
            {
                "role": "assistant",
                # Reasoning trace in <think>, final rewrite in <answer>.
                "content": (
                    "<think>\n" + item["reason"] + "\n</think>\n\n"
                    + "<answer>" + item["rewritten"] + "</answer>"
                ),
            },
        ]

        results.append({
            "train_id": train_id,
            "messages": messages,
            "dataset": dataset,
        })

    # Only create a directory when the output path actually contains one;
    # os.path.dirname of a bare filename is "" and relied on a Path('') quirk.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        Path(out_dir).mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print(f"构建完成,保存到:{OUTPUT_FILE}")
    print(cate)
|
|
# Entry-point guard: build both datasets only when run as a script,
# so importing this module does not trigger the file I/O.
if __name__ == "__main__":
    normal()
    r1()