# Source: GTC-Guard-0 / fix_data.py (uploaded by clarenceleo, commit 7be6323, 796 bytes)
import pandas as pd
# Augment the toxicity dataset with hand-written examples and save it back.
#
# Each example is a (text, label) pair; in the lists below label 0 marks
# safe text and label 1 marks harmful text.

# Safe sentences that nevertheless contain profanity.
safe_with_swear = [
    ("damn this is so cool", 0),
    ("holy shit that's amazing", 0),
    ("what the hell is that", 0),
    ("fucking awesome work", 0),
    ("shit this is good", 0),
]

# Clearly harmful sentences (supplementary).
unsafe_examples = [
    ("i want to rape you", 1),
    ("you should kill yourself", 1),
    ("i'll burn your house", 1),
]


def append_examples(df, examples):
    """Return *df* extended with the (text, label) pairs it does not yet contain.

    Args:
        df: Existing dataset. When it has both a 'text' and a 'label'
            column, pairs already present in it are skipped.
        examples: Iterable of (text, label) tuples to add.

    Returns:
        A ``(merged, added)`` tuple: the resulting DataFrame (freshly
        re-indexed, *df* itself is not modified) and the number of rows
        that were actually appended.
    """
    if {'text', 'label'}.issubset(df.columns):
        seen = set(zip(df['text'], df['label']))
    else:
        seen = set()

    # Skipping already-present pairs keeps the script idempotent: running it
    # twice must not duplicate training examples.
    fresh = []
    for text, label in examples:
        pair = (text, label)
        if pair not in seen:
            seen.add(pair)
            fresh.append(pair)

    if not fresh:
        # Nothing new; also avoids concatenating an empty frame.
        return df.reset_index(drop=True), 0

    new_rows = pd.DataFrame(fresh, columns=['text', 'label'])
    merged = pd.concat([df, new_rows], ignore_index=True)
    return merged, len(new_rows)


def main(csv_path: str = 'toxic_data.csv') -> None:
    """Read *csv_path*, append the missing examples, and write it back in place."""
    # Load the existing data.
    df = pd.read_csv(csv_path)
    df, added = append_examples(df, safe_with_swear + unsafe_examples)
    # Save; the model must then be retrained on the updated file.
    df.to_csv(csv_path, index=False)
    print(f"✅ 添加 {added} 条数据,总数: {len(df)}")
    print("现在运行 python train_guard0.py 重新训练")


if __name__ == "__main__":
    main()