Spaces:
Sleeping
Sleeping
| """ | |
| 准备情绪分析训练数据 | |
| 从HR对话数据中生成带情绪标注的训练样本 | |
| """ | |
import json
import random
from pathlib import Path
from typing import Dict, List, Optional
class SentimentDataPreparer:
    """情绪分析数据准备器 — sentiment-analysis data preparer for HR dialogues.

    Builds a three-class (positive / neutral / negative) text-classification
    dataset from three sources: hand-labelled examples, synthetic
    template/keyword combinations, and (optionally) existing dialogue logs
    whose employee turns are auto-labelled by keyword matching.
    """

    # HR-scenario sentiment lexicons.  Entries must be unique: the lists
    # previously contained duplicates ("满意", "拒绝", "错误"), which made
    # _infer_sentiment() count the same hit twice and skew the vote.
    POSITIVE_KEYWORDS = [
        "好", "满意", "喜欢", "谢谢", "感谢", "可以", "行", "没问题", "支持",
        "理解", "配合", "接受", "认可", "专业", "高效", "合理", "规范",
        "excellent", "good", "ok", "yes", "great", "帮助", "清楚"
    ]
    NEGATIVE_KEYWORDS = [
        "不", "没", "不行", "不好", "讨厌", "烦", "生气", "愤怒", "不满",
        "投诉", "举报", "错误", "失败", "拒绝", "反对", "抗议", "质疑",
        "不合理", "强制", "no", "not", "bad", "问题",
        "失望", "糟糕", "差", "难受", "不满意"
    ]

    # Positive *and* neutral utterance templates; the second tuple element is
    # the gold label.  "{topic}" placeholders are filled from HR_TOPICS.
    POSITIVE_TEMPLATES = [
        ("好的,我来帮你处理{topic}", "positive"),
        ("感谢你的反馈,我们会尽快处理", "positive"),
        ("没问题,这个申请我可以帮你", "positive"),
        ("非常满意这次的服务", "positive"),
        ("好的,我理解公司的规定", "positive"),
        ("谢谢你的解答,很清楚", "positive"),
        ("支持公司的决定", "positive"),
        ("流程很规范,我很认可", "positive"),
        ("专业高效的回复,感谢", "positive"),
        ("配合完成相关手续", "positive"),
        ("你好,我想咨询一下{topic}", "neutral"),
        ("请问申请{topic}需要什么材料", "neutral"),
        ("我想了解一下关于{topic}的政策", "neutral"),
        ("请问办理{topic}的流程是什么", "neutral"),
        ("{topic}的申请条件是什么", "neutral"),
        ("好的,我知道了", "neutral"),
        ("请问还有什么需要补充的吗", "neutral"),
        ("请提供具体的证明材料", "neutral"),
        ("请问处理时间需要多久", "neutral"),
        ("我想确认一下{topic}的进度", "neutral"),
    ]

    # Negative utterance templates (no placeholders).
    NEGATIVE_TEMPLATES = [
        ("我对这个处理结果很不满意", "negative"),
        ("这个制度太不合理了,我很生气", "negative"),
        ("为什么要强制执行这个规定", "negative"),
        ("你们的做法让我很失望", "negative"),
        ("我要投诉这个处理方式", "negative"),
        ("拒绝接受这个安排", "negative"),
        ("这个规定存在很大问题", "negative"),
        ("我对这个结果很不理解", "negative"),
        ("服务质量太差了,很不满", "negative"),
        ("这个流程太复杂了,很烦", "negative"),
    ]

    # HR topics substituted into "{topic}" template placeholders.
    HR_TOPICS = [
        "培训申请", "请假申请", "报销申请", "社保查询",
        "公积金查询", "工资条", "劳动合同", "离职手续",
        "入职办理", "证明开具", "福利申请", "考勤异常"
    ]

    def __init__(self, output_dir: str = "./data/processed/sentiment"):
        """
        Args:
            output_dir: directory where train.json / val.json are written;
                created (with parents) if it does not exist.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def generate_synthetic_data(self, num_samples: int = 500) -> List[Dict]:
        """
        生成合成训练数据

        Args:
            num_samples: target number of samples (result is truncated to
                exactly this many after shuffling).

        Returns:
            Shuffled list of {"text": ..., "label": ...} dicts.
        """
        print(f"生成 {num_samples} 条合成数据...")
        data: List[Dict] = []

        # Split the mixed template list once so each branch draws from the
        # correct label pool.  (Previously the "positive" draw used the mixed
        # pool — so it sometimes emitted a neutral sample — and a neutral
        # sample was only emitted when the second random draw happened to
        # land on a neutral template, unbalancing the classes.)
        positive_pool = [t for t in self.POSITIVE_TEMPLATES if t[1] == "positive"]
        neutral_pool = [t for t in self.POSITIVE_TEMPLATES if t[1] == "neutral"]

        # Template-based samples: one positive, one neutral, one negative
        # per iteration.
        for _ in range(num_samples // 3):
            for pool in (positive_pool, neutral_pool):
                template, label = random.choice(pool)
                topic = random.choice(self.HR_TOPICS)
                # .format is a no-op for templates without "{topic}".
                data.append({"text": template.format(topic=topic), "label": label})
            template, label = random.choice(self.NEGATIVE_TEMPLATES)
            data.append({"text": template, "label": label})

        # Keyword-combination samples: short exclamations of 1-3 keywords.
        for _ in range(num_samples // 3):
            for keywords, label in (
                (self.POSITIVE_KEYWORDS, "positive"),
                (self.NEGATIVE_KEYWORDS, "negative"),
            ):
                words = random.sample(keywords, random.randint(1, 3))
                data.append({"text": f"{','.join(words)}!", "label": label})

        # Mixed-scenario samples: sentiment keyword embedded in an HR request.
        for _ in range(num_samples // 3):
            pos_word = random.choice(self.POSITIVE_KEYWORDS)
            topic = random.choice(self.HR_TOPICS)
            data.append({
                "text": f"{topic}方面{pos_word},请问具体流程是什么",
                "label": "positive",
            })
            neg_word = random.choice(self.NEGATIVE_KEYWORDS)
            topic = random.choice(self.HR_TOPICS)
            data.append({
                "text": f"关于{topic},我感到很{neg_word},要求重新处理",
                "label": "negative",
            })

        random.shuffle(data)
        # Each iteration above appends slightly more than 3 items, so clip
        # to the requested size.
        return data[:num_samples]

    def load_existing_dialogues(self, dialogue_path: str) -> List[Dict]:
        """
        从现有对话数据加载并标注情绪

        Args:
            dialogue_path: path to a JSON file containing a list of dialogue
                dicts, each with a "turns" list of {"utterance", "speaker"}
                entries (assumed schema — confirm against the producer).

        Returns:
            Labelled samples; an empty list on read/parse/shape failure
            (deliberately best-effort, mirroring the original behaviour).
        """
        print(f"从 {dialogue_path} 加载对话数据...")
        data: List[Dict] = []
        try:
            with open(dialogue_path, 'r', encoding='utf-8') as f:
                dialogues = json.load(f)
            for dialogue in dialogues:
                for turn in dialogue.get('turns', []):
                    text = turn.get('utterance', '')
                    speaker = turn.get('speaker', '')
                    if speaker == 'Employee':
                        # Employee turns carry the actual sentiment signal.
                        label = self._infer_sentiment(text)
                    else:
                        # HR replies are treated as neutral by convention.
                        label = 'neutral'
                    data.append({
                        'text': text,
                        'label': label,
                        'speaker': speaker,
                        'source': 'dialogue'
                    })
        # Narrowed from a blanket `except Exception`: OSError covers missing
        # or unreadable files, ValueError covers json.JSONDecodeError, and
        # AttributeError/TypeError cover JSON of an unexpected shape.
        except (OSError, ValueError, AttributeError, TypeError) as e:
            print(f"加载对话数据失败: {e}")
        return data

    def _infer_sentiment(self, text: str) -> str:
        """根据关键词推断情绪 — crude lexicon vote.

        Ties (including zero hits on both sides) fall back to 'neutral'.
        NOTE: plain substring matching cannot see negation — e.g. "可以"
        also matches inside "不可以".
        """
        lowered = text.lower()  # lowercase once, not once per keyword
        positive_count = sum(1 for word in self.POSITIVE_KEYWORDS if word in lowered)
        negative_count = sum(1 for word in self.NEGATIVE_KEYWORDS if word in lowered)
        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def create_manual_examples(self) -> List[Dict]:
        """创建手动标注的高质量示例 (hand-labelled seed examples)."""
        return [
            # positive
            {"text": "好的,谢谢你的帮助,非常满意!", "label": "positive"},
            {"text": "没问题,我理解公司的规定", "label": "positive"},
            {"text": "太好了,感谢你的解答", "label": "positive"},
            {"text": "流程很规范,我很认可", "label": "positive"},
            {"text": "专业高效的回复,感谢支持", "label": "positive"},
            {"text": "配合完成相关手续,谢谢", "label": "positive"},
            {"text": "我对这次服务很满意", "label": "positive"},
            {"text": "好的,清楚明白了", "label": "positive"},
            # neutral
            {"text": "您好,请问有什么可以帮您?", "label": "neutral"},
            {"text": "我需要了解培训的具体时间", "label": "neutral"},
            {"text": "请问申请年假需要什么材料", "label": "neutral"},
            {"text": "我想咨询一下社保缴纳问题", "label": "neutral"},
            {"text": "好的,我知道了", "label": "neutral"},
            {"text": "请问还有什么需要补充的吗", "label": "neutral"},
            {"text": "请提供具体的证明材料", "label": "neutral"},
            {"text": "请问处理时间需要多久", "label": "neutral"},
            {"text": "我想确认一下申请进度", "label": "neutral"},
            {"text": "根据公司规定办理即可", "label": "neutral"},
            # negative
            {"text": "我对这个处理结果很不满意", "label": "negative"},
            {"text": "这个制度太不合理了,我很生气", "label": "negative"},
            {"text": "为什么要强制执行这个规定", "label": "negative"},
            {"text": "你们的做法让我很失望", "label": "negative"},
            {"text": "我要投诉这个处理方式", "label": "negative"},
            {"text": "拒绝接受这个安排", "label": "negative"},
            {"text": "这个规定存在很大问题", "label": "negative"},
            {"text": "我对这个结果很不满,要求重新处理", "label": "negative"},
            {"text": "服务质量太差了", "label": "negative"},
            {"text": "这个流程太复杂了,很烦", "label": "negative"},
        ]

    def prepare_datasets(
        self,
        num_synthetic: int = 300,
        dialogue_path: Optional[str] = None,
        train_ratio: float = 0.8
    ):
        """
        准备训练和验证数据集

        Combines manual, synthetic and (optionally) dialogue-derived samples,
        shuffles, splits, and writes train.json / val.json to output_dir.

        Args:
            num_synthetic: number of synthetic samples to generate.
            dialogue_path: optional path to existing dialogue data.
            train_ratio: fraction of samples assigned to the training split.

        Returns:
            Tuple of (train_data, val_data).
        """
        print("开始准备情绪分析数据集...")
        all_data: List[Dict] = []

        # 1. Hand-labelled examples (highest quality).
        manual_data = self.create_manual_examples()
        all_data.extend(manual_data)
        print(f"添加 {len(manual_data)} 条手动标注数据")

        # 2. Synthetic samples.
        synthetic_data = self.generate_synthetic_data(num_synthetic)
        all_data.extend(synthetic_data)
        print(f"生成 {len(synthetic_data)} 条合成数据")

        # 3. Existing dialogues, if a readable file was supplied.
        if dialogue_path and Path(dialogue_path).exists():
            dialogue_data = self.load_existing_dialogues(dialogue_path)
            all_data.extend(dialogue_data)
            print(f"从对话加载 {len(dialogue_data)} 条数据")

        random.shuffle(all_data)

        # Label-distribution report.
        label_counts: Dict[str, int] = {}
        for item in all_data:
            label = item['label']
            label_counts[label] = label_counts.get(label, 0) + 1
        print(f"\n数据集统计:")
        print(f"  总样本数: {len(all_data)}")
        for label, count in sorted(label_counts.items()):
            print(f"  {label}: {count} ({count/len(all_data)*100:.1f}%)")

        # Train/validation split (simple contiguous cut of shuffled data).
        split_idx = int(len(all_data) * train_ratio)
        train_data = all_data[:split_idx]
        val_data = all_data[split_idx:]

        train_path = self.output_dir / 'train.json'
        val_path = self.output_dir / 'val.json'
        with open(train_path, 'w', encoding='utf-8') as f:
            json.dump(train_data, f, ensure_ascii=False, indent=2)
        with open(val_path, 'w', encoding='utf-8') as f:
            json.dump(val_data, f, ensure_ascii=False, indent=2)

        print(f"\n数据已保存:")
        print(f"  训练集: {train_path} ({len(train_data)} 条)")
        print(f"  验证集: {val_path} ({len(val_data)} 条)")
        return train_data, val_data
def main():
    """CLI entry point: parse arguments and run dataset preparation."""
    import argparse

    parser = argparse.ArgumentParser(description="准备情绪分析训练数据")
    parser.add_argument("--output_dir", type=str,
                        default="./data/processed/sentiment", help="输出目录")
    parser.add_argument("--num_synthetic", type=int, default=300,
                        help="合成数据数量")
    parser.add_argument("--dialogue_path", type=str, default=None,
                        help="对话数据路径")
    parser.add_argument("--train_ratio", type=float, default=0.8,
                        help="训练集比例")
    args = parser.parse_args()

    # Build the preparer and immediately run the full pipeline.
    SentimentDataPreparer(output_dir=args.output_dir).prepare_datasets(
        num_synthetic=args.num_synthetic,
        dialogue_path=args.dialogue_path,
        train_ratio=args.train_ratio,
    )


if __name__ == "__main__":
    main()