# hr-eval-api-v2/scripts/prepare_sentiment_data.py
# KarenYYH
# Initial commit - HR Evaluation API v2
# c8b1f17
"""
准备情绪分析训练数据
从HR对话数据中生成带情绪标注的训练样本
"""
import json
import random
import re
from collections import Counter
from pathlib import Path
from typing import Dict, List, Optional, Tuple
class SentimentDataPreparer:
    """Build train/validation JSON files for an HR sentiment classifier.

    Samples come from three sources: a fixed hand-labelled list, randomly
    generated synthetic utterances, and (optionally) an existing dialogue
    file whose employee turns are labelled with a keyword heuristic.
    """

    # Keyword lexicons used both for synthetic phrases and for the heuristic
    # labelling in ``_infer_sentiment``. The lists contain a few repeated
    # entries; consumers de-duplicate them before use.
    POSITIVE_KEYWORDS = [
        "好", "满意", "喜欢", "谢谢", "感谢", "可以", "行", "没问题", "支持",
        "理解", "配合", "接受", "认可", "专业", "高效", "合理", "规范",
        "excellent", "good", "ok", "yes", "great", "满意", "帮助", "清楚"
    ]
    NEGATIVE_KEYWORDS = [
        "不", "没", "不行", "不好", "讨厌", "烦", "生气", "愤怒", "不满",
        "投诉", "举报", "错误", "失败", "拒绝", "反对", "抗议", "质疑",
        "不合理", "强制", "拒绝", "no", "not", "bad", "错误", "问题",
        "失望", "糟糕", "差", "难受", "不满意"
    ]

    # (template, label) pairs. Despite the name this list carries both the
    # "positive" and the "neutral" templates; "{topic}" is filled from
    # HR_TOPICS.
    POSITIVE_TEMPLATES = [
        ("好的,我来帮你处理{topic}", "positive"),
        ("感谢你的反馈,我们会尽快处理", "positive"),
        ("没问题,这个申请我可以帮你", "positive"),
        ("非常满意这次的服务", "positive"),
        ("好的,我理解公司的规定", "positive"),
        ("谢谢你的解答,很清楚", "positive"),
        ("支持公司的决定", "positive"),
        ("流程很规范,我很认可", "positive"),
        ("专业高效的回复,感谢", "positive"),
        ("配合完成相关手续", "positive"),
        ("你好,我想咨询一下{topic}", "neutral"),
        ("请问申请{topic}需要什么材料", "neutral"),
        ("我想了解一下关于{topic}的政策", "neutral"),
        ("请问办理{topic}的流程是什么", "neutral"),
        ("{topic}的申请条件是什么", "neutral"),
        ("好的,我知道了", "neutral"),
        ("请问还有什么需要补充的吗", "neutral"),
        ("请提供具体的证明材料", "neutral"),
        ("请问处理时间需要多久", "neutral"),
        ("我想确认一下{topic}的进度", "neutral"),
    ]

    # (template, label) pairs for complaint-style utterances; used verbatim.
    NEGATIVE_TEMPLATES = [
        ("我对这个处理结果很不满意", "negative"),
        ("这个制度太不合理了,我很生气", "negative"),
        ("为什么要强制执行这个规定", "negative"),
        ("你们的做法让我很失望", "negative"),
        ("我要投诉这个处理方式", "negative"),
        ("拒绝接受这个安排", "negative"),
        ("这个规定存在很大问题", "negative"),
        ("我对这个结果很不理解", "negative"),
        ("服务质量太差了,很不满", "negative"),
        ("这个流程太复杂了,很烦", "negative"),
    ]

    # Topics substituted into "{topic}" placeholders.
    HR_TOPICS = [
        "培训申请", "请假申请", "报销申请", "社保查询",
        "公积金查询", "工资条", "劳动合同", "离职手续",
        "入职办理", "证明开具", "福利申请", "考勤异常"
    ]

    def __init__(self, output_dir: str = "./data/processed/sentiment"):
        """Create the preparer and make sure the output directory exists.

        Args:
            output_dir: Directory that will receive ``train.json`` and
                ``val.json``; created (with parents) if missing.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def generate_synthetic_data(self, num_samples: int = 500) -> List[Dict]:
        """Generate random labelled utterances.

        Each round emits one positive, one neutral and one negative template
        sample, one positive and one negative keyword phrase, and one
        positive and one negative "mixed" sentence. The pool is shuffled and
        truncated to ``num_samples``.

        Args:
            num_samples: Number of samples to return.

        Returns:
            A list of ``{"text": ..., "label": ...}`` dicts of length
            ``num_samples`` (empty when ``num_samples`` is not positive).
        """
        print(f"生成 {num_samples} 条合成数据...")

        # POSITIVE_TEMPLATES mixes two labels; split it so every round yields
        # exactly one positive and one neutral template sample.
        positive_templates = [
            text for text, label in self.POSITIVE_TEMPLATES if label == "positive"
        ]
        neutral_templates = [
            text for text, label in self.POSITIVE_TEMPLATES if label == "neutral"
        ]
        # Drop repeated lexicon entries so one phrase never repeats a word.
        positive_words = list(dict.fromkeys(self.POSITIVE_KEYWORDS))
        negative_words = list(dict.fromkeys(self.NEGATIVE_KEYWORDS))

        # Ceiling division: floor division produced zero rounds (and thus no
        # data at all) for requests of one or two samples.
        rounds = max(0, -(-num_samples // 3))

        def fill(template: str) -> str:
            return template.format(topic=random.choice(self.HR_TOPICS))

        data = []

        # Template-based samples.
        for _ in range(rounds):
            if positive_templates:
                data.append(
                    {"text": fill(random.choice(positive_templates)), "label": "positive"}
                )
            if neutral_templates:
                data.append(
                    {"text": fill(random.choice(neutral_templates)), "label": "neutral"}
                )
            template, label = random.choice(self.NEGATIVE_TEMPLATES)
            data.append({"text": template, "label": label})

        # Bare keyword phrases (one to three words each).
        for _ in range(rounds):
            words = random.sample(positive_words, random.randint(1, 3))
            data.append({"text": f"{','.join(words)}!", "label": "positive"})

            words = random.sample(negative_words, random.randint(1, 3))
            data.append({"text": f"{','.join(words)}!", "label": "negative"})

        # Keyword embedded in a topic sentence.
        for _ in range(rounds):
            pos_word = random.choice(self.POSITIVE_KEYWORDS)
            topic = random.choice(self.HR_TOPICS)
            data.append(
                {"text": f"{topic}方面{pos_word},请问具体流程是什么", "label": "positive"}
            )

            neg_word = random.choice(self.NEGATIVE_KEYWORDS)
            topic = random.choice(self.HR_TOPICS)
            data.append(
                {"text": f"关于{topic},我感到很{neg_word},要求重新处理", "label": "negative"}
            )

        random.shuffle(data)
        return data[:num_samples]

    def load_existing_dialogues(self, dialogue_path: str) -> List[Dict]:
        """Load a dialogue JSON file and attach a sentiment label to each turn.

        The file is read as a list of objects with a ``turns`` list; each
        turn supplies ``utterance`` and ``speaker``. Turns spoken by
        ``'Employee'`` are labelled by ``_infer_sentiment``; every other
        speaker is labelled ``'neutral'``. Malformed entries and blank
        utterances are skipped instead of aborting the whole load.

        Args:
            dialogue_path: Path of the JSON file.

        Returns:
            A list of dicts with keys ``text``, ``label``, ``speaker`` and
            ``source``; empty when the file cannot be read or parsed.
        """
        print(f"从 {dialogue_path} 加载对话数据...")

        # Only reading and parsing can fail here; keep the try body minimal.
        try:
            with open(dialogue_path, 'r', encoding='utf-8') as f:
                dialogues = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            print(f"加载对话数据失败: {e}")
            return []

        if not isinstance(dialogues, list):
            print("加载对话数据失败: 顶层JSON结构不是列表")
            return []

        data = []
        for dialogue in dialogues:
            if not isinstance(dialogue, dict):
                continue
            turns = dialogue.get('turns', [])
            if not isinstance(turns, list):
                continue
            for turn in turns:
                if not isinstance(turn, dict):
                    continue
                text = turn.get('utterance', '')
                # A blank utterance carries no signal and would only add
                # noise to the training set.
                if not isinstance(text, str) or not text.strip():
                    continue
                speaker = turn.get('speaker', '')
                if speaker == 'Employee':
                    label = self._infer_sentiment(text)
                else:
                    label = 'neutral'
                data.append({
                    'text': text,
                    'label': label,
                    'speaker': speaker,
                    'source': 'dialogue'
                })
        return data

    def _infer_sentiment(self, text: str) -> str:
        """Label ``text`` by counting distinct lexicon keywords it contains.

        Keywords are matched longest-first and every match is blanked out, so
        a compound such as "没问题" or "不满意" is counted once under its own
        polarity instead of also triggering the shorter keywords inside it.
        Latin keywords only match as whole words ("no" does not match
        "know"). A keyword listed under both polarities is ignored.

        Returns:
            ``'positive'``, ``'negative'`` or ``'neutral'`` (tie or no hit).
        """
        positive = {word.lower() for word in self.POSITIVE_KEYWORDS}
        negative = {word.lower() for word in self.NEGATIVE_KEYWORDS}
        # Longest first; the alphabetical tie-break keeps the scan order
        # deterministic across runs.
        candidates = sorted(positive ^ negative, key=lambda word: (-len(word), word))

        remaining = text.lower()
        positive_count = 0
        negative_count = 0
        for keyword in candidates:
            pattern = re.escape(keyword)
            if re.fullmatch(r"[a-z]+", keyword):
                pattern = rf"(?<![a-z]){pattern}(?![a-z])"
            # Replace hits with a space so shorter keywords cannot re-match
            # inside an already counted span.
            remaining, hits = re.subn(pattern, " ", remaining)
            if not hits:
                continue
            if keyword in positive:
                positive_count += 1
            else:
                negative_count += 1

        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def create_manual_examples(self) -> List[Dict]:
        """Return the fixed hand-labelled examples (8 positive, 10 neutral, 10 negative)."""
        return [
            # Positive
            {"text": "好的,谢谢你的帮助,非常满意!", "label": "positive"},
            {"text": "没问题,我理解公司的规定", "label": "positive"},
            {"text": "太好了,感谢你的解答", "label": "positive"},
            {"text": "流程很规范,我很认可", "label": "positive"},
            {"text": "专业高效的回复,感谢支持", "label": "positive"},
            {"text": "配合完成相关手续,谢谢", "label": "positive"},
            {"text": "我对这次服务很满意", "label": "positive"},
            {"text": "好的,清楚明白了", "label": "positive"},
            # Neutral
            {"text": "您好,请问有什么可以帮您?", "label": "neutral"},
            {"text": "我需要了解培训的具体时间", "label": "neutral"},
            {"text": "请问申请年假需要什么材料", "label": "neutral"},
            {"text": "我想咨询一下社保缴纳问题", "label": "neutral"},
            {"text": "好的,我知道了", "label": "neutral"},
            {"text": "请问还有什么需要补充的吗", "label": "neutral"},
            {"text": "请提供具体的证明材料", "label": "neutral"},
            {"text": "请问处理时间需要多久", "label": "neutral"},
            {"text": "我想确认一下申请进度", "label": "neutral"},
            {"text": "根据公司规定办理即可", "label": "neutral"},
            # Negative
            {"text": "我对这个处理结果很不满意", "label": "negative"},
            {"text": "这个制度太不合理了,我很生气", "label": "negative"},
            {"text": "为什么要强制执行这个规定", "label": "negative"},
            {"text": "你们的做法让我很失望", "label": "negative"},
            {"text": "我要投诉这个处理方式", "label": "negative"},
            {"text": "拒绝接受这个安排", "label": "negative"},
            {"text": "这个规定存在很大问题", "label": "negative"},
            {"text": "我对这个结果很不满,要求重新处理", "label": "negative"},
            {"text": "服务质量太差了", "label": "negative"},
            {"text": "这个流程太复杂了,很烦", "label": "negative"},
        ]

    def prepare_datasets(
        self,
        num_synthetic: int = 300,
        dialogue_path: Optional[str] = None,
        train_ratio: float = 0.8
    ) -> Tuple[List[Dict], List[Dict]]:
        """Assemble all sources, split them and write ``train.json`` / ``val.json``.

        Args:
            num_synthetic: Number of synthetic samples to generate.
            dialogue_path: Optional dialogue JSON file to label and include.
            train_ratio: Fraction of the shuffled data written to the
                training file; the remainder goes to validation.

        Returns:
            ``(train_data, val_data)`` as lists of sample dicts.

        Raises:
            ValueError: If ``train_ratio`` is outside ``[0, 1]``.
        """
        # Reject ratios that would silently produce nonsensical slices
        # (negative indices, or a "split" past the end of the data).
        if not 0.0 <= train_ratio <= 1.0:
            raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")

        print("开始准备情绪分析数据集...")
        all_data = []

        # 1. Hand-labelled examples.
        manual_data = self.create_manual_examples()
        all_data.extend(manual_data)
        print(f"添加 {len(manual_data)} 条手动标注数据")

        # 2. Synthetic samples.
        synthetic_data = self.generate_synthetic_data(num_synthetic)
        all_data.extend(synthetic_data)
        print(f"生成 {len(synthetic_data)} 条合成数据")

        # 3. Existing dialogues, when a path was supplied.
        if dialogue_path:
            if Path(dialogue_path).exists():
                dialogue_data = self.load_existing_dialogues(dialogue_path)
                all_data.extend(dialogue_data)
                print(f"从对话加载 {len(dialogue_data)} 条数据")
            else:
                # Tell the user their input was ignored rather than silently
                # training without it.
                print(f"对话数据文件不存在,已跳过: {dialogue_path}")

        random.shuffle(all_data)

        # Label distribution report.
        label_counts = Counter(item['label'] for item in all_data)
        total = len(all_data)
        print("\n数据集统计:")
        print(f" 总样本数: {total}")
        for label, count in sorted(label_counts.items()):
            print(f" {label}: {count} ({count/total*100:.1f}%)")

        # Train / validation split.
        split_idx = int(total * train_ratio)
        train_data = all_data[:split_idx]
        val_data = all_data[split_idx:]

        train_path = self.output_dir / 'train.json'
        val_path = self.output_dir / 'val.json'
        for path, rows in ((train_path, train_data), (val_path, val_data)):
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(rows, f, ensure_ascii=False, indent=2)

        print("\n数据已保存:")
        print(f" 训练集: {train_path} ({len(train_data)} 条)")
        print(f" 验证集: {val_path} ({len(val_data)} 条)")
        return train_data, val_data
def main():
    """Command-line entry point: parse the options and build the datasets."""
    import argparse

    # (flag, type, default, help) for every supported option.
    option_specs = (
        ("--output_dir", str, "./data/processed/sentiment", "输出目录"),
        ("--num_synthetic", int, 300, "合成数据数量"),
        ("--dialogue_path", str, None, "对话数据路径"),
        ("--train_ratio", float, 0.8, "训练集比例"),
    )

    cli = argparse.ArgumentParser(description="准备情绪分析训练数据")
    for flag, value_type, fallback, help_text in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback, help=help_text)
    options = cli.parse_args()

    SentimentDataPreparer(output_dir=options.output_dir).prepare_datasets(
        num_synthetic=options.num_synthetic,
        dialogue_path=options.dialogue_path,
        train_ratio=options.train_ratio,
    )


if __name__ == "__main__":
    main()