File size: 13,170 Bytes
c8b1f17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
"""
准备情绪分析训练数据
从HR对话数据中生成带情绪标注的训练样本
"""
import json
import random
from pathlib import Path
from typing import Dict, List, Optional


class SentimentDataPreparer:
    """Sentiment-analysis data preparer for HR dialogues.

    Builds a labelled text-classification dataset (labels: "positive",
    "neutral", "negative") from three sources:

    1. hand-curated example sentences (highest quality),
    2. synthetic samples generated from templates and a keyword lexicon,
    3. optionally, utterances extracted from existing dialogue transcripts,
       auto-labelled with a keyword heuristic.

    The combined data is shuffled, split into train/validation sets and
    written as JSON files under ``output_dir``.
    """

    # HR-domain sentiment lexicon (Chinese plus a few English tokens).
    # Matching is done by substring containment against lower-cased text,
    # so single characters like "好" match many phrases by design.
    POSITIVE_KEYWORDS = [
        "好", "满意", "喜欢", "谢谢", "感谢", "可以", "行", "没问题", "支持",
        "理解", "配合", "接受", "认可", "专业", "高效", "合理", "规范",
        "excellent", "good", "ok", "yes", "great", "满意", "帮助", "清楚"
    ]

    # NOTE: the original list contained accidental duplicates ("拒绝" and
    # "错误" appeared twice); they are removed so that random.sample and
    # the keyword counting in _infer_sentiment are not skewed.
    NEGATIVE_KEYWORDS = [
        "不", "没", "不行", "不好", "讨厌", "烦", "生气", "愤怒", "不满",
        "投诉", "举报", "错误", "失败", "拒绝", "反对", "抗议", "质疑",
        "不合理", "强制", "no", "not", "bad", "问题",
        "失望", "糟糕", "差", "难受", "不满意"
    ]

    # Templates for positive and neutral HR scenarios, paired with their
    # label. "{topic}" placeholders are filled from HR_TOPICS; templates
    # without a placeholder are unaffected by str.format.
    POSITIVE_TEMPLATES = [
        ("好的,我来帮你处理{topic}", "positive"),
        ("感谢你的反馈,我们会尽快处理", "positive"),
        ("没问题,这个申请我可以帮你", "positive"),
        ("非常满意这次的服务", "positive"),
        ("好的,我理解公司的规定", "positive"),
        ("谢谢你的解答,很清楚", "positive"),
        ("支持公司的决定", "positive"),
        ("流程很规范,我很认可", "positive"),
        ("专业高效的回复,感谢", "positive"),
        ("配合完成相关手续", "positive"),
        ("你好,我想咨询一下{topic}", "neutral"),
        ("请问申请{topic}需要什么材料", "neutral"),
        ("我想了解一下关于{topic}的政策", "neutral"),
        ("请问办理{topic}的流程是什么", "neutral"),
        ("{topic}的申请条件是什么", "neutral"),
        ("好的,我知道了", "neutral"),
        ("请问还有什么需要补充的吗", "neutral"),
        ("请提供具体的证明材料", "neutral"),
        ("请问处理时间需要多久", "neutral"),
        ("我想确认一下{topic}的进度", "neutral"),
    ]

    # Templates for negative HR scenarios (complaints, refusals, anger).
    NEGATIVE_TEMPLATES = [
        ("我对这个处理结果很不满意", "negative"),
        ("这个制度太不合理了,我很生气", "negative"),
        ("为什么要强制执行这个规定", "negative"),
        ("你们的做法让我很失望", "negative"),
        ("我要投诉这个处理方式", "negative"),
        ("拒绝接受这个安排", "negative"),
        ("这个规定存在很大问题", "negative"),
        ("我对这个结果很不理解", "negative"),
        ("服务质量太差了,很不满", "negative"),
        ("这个流程太复杂了,很烦", "negative"),
    ]

    # HR topics used to fill the "{topic}" placeholder.
    HR_TOPICS = [
        "培训申请", "请假申请", "报销申请", "社保查询",
        "公积金查询", "工资条", "劳动合同", "离职手续",
        "入职办理", "证明开具", "福利申请", "考勤异常"
    ]

    def __init__(self, output_dir: str = "./data/processed/sentiment"):
        """
        Args:
            output_dir: Directory where train.json / val.json are written.
                Created (with parents) if it does not exist.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def generate_synthetic_data(self, num_samples: int = 500) -> List[Dict]:
        """
        Generate synthetic training samples.

        Three generation strategies are mixed, each driven for roughly
        num_samples // 3 iterations (the pool is over-generated and then
        truncated to num_samples after shuffling):

        1. template-based samples (positive/neutral and negative),
        2. keyword-combination samples,
        3. mixed-scenario samples (keyword embedded in a topic sentence).

        Args:
            num_samples: Number of samples to return (upper bound).

        Returns:
            A shuffled list of ``{"text": ..., "label": ...}`` dicts of
            length at most ``num_samples``.
        """
        print(f"生成 {num_samples} 条合成数据...")

        data = []

        # 1. Template-based generation.
        for _ in range(num_samples // 3):
            # Positive/neutral sample: the template carries its own label.
            template, label = random.choice(self.POSITIVE_TEMPLATES)
            topic = random.choice(self.HR_TOPICS)
            text = template.format(topic=topic)
            data.append({"text": text, "label": label})

            # Extra neutral sample: re-draw and keep only if neutral, so
            # neutral examples get a boost without being guaranteed.
            template, label = random.choice(self.POSITIVE_TEMPLATES)
            if label == "neutral":
                topic = random.choice(self.HR_TOPICS)
                text = template.format(topic=topic)
                data.append({"text": text, "label": label})

            # Negative sample (negative templates take no topic).
            template, label = random.choice(self.NEGATIVE_TEMPLATES)
            data.append({"text": template, "label": label})

        # 2. Keyword-combination generation.
        for _ in range(num_samples // 3):
            # Join 1-3 positive keywords into a short exclamation.
            num_positive = random.randint(1, 3)
            words = random.sample(self.POSITIVE_KEYWORDS, num_positive)
            text = f"{','.join(words)}!"
            data.append({"text": text, "label": "positive"})

            # Same for negative keywords.
            num_negative = random.randint(1, 3)
            words = random.sample(self.NEGATIVE_KEYWORDS, num_negative)
            text = f"{','.join(words)}!"
            data.append({"text": text, "label": "negative"})

        # 3. Mixed-scenario generation (keyword embedded in a sentence).
        for _ in range(num_samples // 3):
            # Positive keyword + neutral follow-up question.
            pos_word = random.choice(self.POSITIVE_KEYWORDS)
            topic = random.choice(self.HR_TOPICS)
            text = f"{topic}方面{pos_word},请问具体流程是什么"
            data.append({"text": text, "label": "positive"})

            # Negative keyword + complaint / re-processing demand.
            neg_word = random.choice(self.NEGATIVE_KEYWORDS)
            topic = random.choice(self.HR_TOPICS)
            text = f"关于{topic},我感到很{neg_word},要求重新处理"
            data.append({"text": text, "label": "negative"})

        # Shuffle, then truncate the over-generated pool to the target size.
        random.shuffle(data)

        return data[:num_samples]

    def load_existing_dialogues(self, dialogue_path: str) -> List[Dict]:
        """
        Load existing dialogue data and auto-label each turn's sentiment.

        Expected JSON shape: a list of dialogues, each with a "turns" list
        whose items carry "utterance" and "speaker" keys — TODO confirm
        against the producer of this file.

        Args:
            dialogue_path: Path to the dialogue JSON file.

        Returns:
            A list of labelled samples; empty if loading fails (errors are
            printed, not raised — deliberately best-effort).
        """
        print(f"从 {dialogue_path} 加载对话数据...")

        data = []

        try:
            with open(dialogue_path, 'r', encoding='utf-8') as f:
                dialogues = json.load(f)

            for dialogue in dialogues:
                for turn in dialogue.get('turns', []):
                    text = turn.get('utterance', '')
                    speaker = turn.get('speaker', '')

                    # Infer sentiment from speaker role and content.
                    if speaker == 'Employee':
                        # Employee utterances get keyword-based inference.
                        label = self._infer_sentiment(text)
                    else:
                        # HR replies are assumed neutral.
                        label = 'neutral'

                    data.append({
                        'text': text,
                        'label': label,
                        'speaker': speaker,
                        'source': 'dialogue'
                    })

        except Exception as e:
            # Best-effort: report and return whatever was collected.
            print(f"加载对话数据失败: {e}")

        return data

    def _infer_sentiment(self, text: str) -> str:
        """Infer a sentiment label by counting lexicon hits (substring match
        on the lower-cased text); ties fall back to 'neutral'."""
        positive_count = sum(1 for word in self.POSITIVE_KEYWORDS if word in text.lower())
        negative_count = sum(1 for word in self.NEGATIVE_KEYWORDS if word in text.lower())

        if positive_count > negative_count:
            return 'positive'
        elif negative_count > positive_count:
            return 'negative'
        else:
            return 'neutral'

    def create_manual_examples(self) -> List[Dict]:
        """Return the hand-curated, high-quality labelled examples."""
        return [
            # Positive
            {"text": "好的,谢谢你的帮助,非常满意!", "label": "positive"},
            {"text": "没问题,我理解公司的规定", "label": "positive"},
            {"text": "太好了,感谢你的解答", "label": "positive"},
            {"text": "流程很规范,我很认可", "label": "positive"},
            {"text": "专业高效的回复,感谢支持", "label": "positive"},
            {"text": "配合完成相关手续,谢谢", "label": "positive"},
            {"text": "我对这次服务很满意", "label": "positive"},
            {"text": "好的,清楚明白了", "label": "positive"},

            # Neutral
            {"text": "您好,请问有什么可以帮您?", "label": "neutral"},
            {"text": "我需要了解培训的具体时间", "label": "neutral"},
            {"text": "请问申请年假需要什么材料", "label": "neutral"},
            {"text": "我想咨询一下社保缴纳问题", "label": "neutral"},
            {"text": "好的,我知道了", "label": "neutral"},
            {"text": "请问还有什么需要补充的吗", "label": "neutral"},
            {"text": "请提供具体的证明材料", "label": "neutral"},
            {"text": "请问处理时间需要多久", "label": "neutral"},
            {"text": "我想确认一下申请进度", "label": "neutral"},
            {"text": "根据公司规定办理即可", "label": "neutral"},

            # Negative
            {"text": "我对这个处理结果很不满意", "label": "negative"},
            {"text": "这个制度太不合理了,我很生气", "label": "negative"},
            {"text": "为什么要强制执行这个规定", "label": "negative"},
            {"text": "你们的做法让我很失望", "label": "negative"},
            {"text": "我要投诉这个处理方式", "label": "negative"},
            {"text": "拒绝接受这个安排", "label": "negative"},
            {"text": "这个规定存在很大问题", "label": "negative"},
            {"text": "我对这个结果很不满,要求重新处理", "label": "negative"},
            {"text": "服务质量太差了", "label": "negative"},
            {"text": "这个流程太复杂了,很烦", "label": "negative"},
        ]

    def prepare_datasets(
        self,
        num_synthetic: int = 300,
        dialogue_path: Optional[str] = None,
        train_ratio: float = 0.8
    ):
        """
        Prepare and persist the train / validation datasets.

        Combines manual examples, synthetic samples and (if available)
        dialogue-derived samples, shuffles them, prints the label
        distribution, splits by ``train_ratio`` and writes
        ``train.json`` / ``val.json`` into ``self.output_dir``.

        Args:
            num_synthetic: Number of synthetic samples to generate.
            dialogue_path: Optional path to a dialogue JSON file; skipped
                when None or when the file does not exist.
            train_ratio: Fraction of samples assigned to the training set.

        Returns:
            Tuple of (train_data, val_data) lists.
        """
        print("开始准备情绪分析数据集...")

        all_data = []

        # 1. Manually curated examples (highest quality).
        manual_data = self.create_manual_examples()
        all_data.extend(manual_data)
        print(f"添加 {len(manual_data)} 条手动标注数据")

        # 2. Synthetic samples.
        synthetic_data = self.generate_synthetic_data(num_synthetic)
        all_data.extend(synthetic_data)
        print(f"生成 {len(synthetic_data)} 条合成数据")

        # 3. Dialogue-derived samples, if a valid path was given.
        if dialogue_path and Path(dialogue_path).exists():
            dialogue_data = self.load_existing_dialogues(dialogue_path)
            all_data.extend(dialogue_data)
            print(f"从对话加载 {len(dialogue_data)} 条数据")

        # Shuffle before splitting so the split is random.
        random.shuffle(all_data)

        # Label-distribution statistics for a quick sanity check.
        label_counts = {}
        for item in all_data:
            label = item['label']
            label_counts[label] = label_counts.get(label, 0) + 1

        print(f"\n数据集统计:")
        print(f"  总样本数: {len(all_data)}")
        for label, count in sorted(label_counts.items()):
            print(f"  {label}: {count} ({count/len(all_data)*100:.1f}%)")

        # Train / validation split.
        split_idx = int(len(all_data) * train_ratio)
        train_data = all_data[:split_idx]
        val_data = all_data[split_idx:]

        # Persist both splits as human-readable JSON.
        train_path = self.output_dir / 'train.json'
        val_path = self.output_dir / 'val.json'

        with open(train_path, 'w', encoding='utf-8') as f:
            json.dump(train_data, f, ensure_ascii=False, indent=2)

        with open(val_path, 'w', encoding='utf-8') as f:
            json.dump(val_data, f, ensure_ascii=False, indent=2)

        print(f"\n数据已保存:")
        print(f"  训练集: {train_path} ({len(train_data)} 条)")
        print(f"  验证集: {val_path} ({len(val_data)} 条)")

        return train_data, val_data


def main():
    """CLI entry point: parse arguments and build the sentiment dataset."""
    import argparse

    arg_parser = argparse.ArgumentParser(description="准备情绪分析训练数据")
    arg_parser.add_argument("--output_dir", type=str, default="./data/processed/sentiment",
                            help="输出目录")
    arg_parser.add_argument("--num_synthetic", type=int, default=300,
                            help="合成数据数量")
    arg_parser.add_argument("--dialogue_path", type=str, default=None,
                            help="对话数据路径")
    arg_parser.add_argument("--train_ratio", type=float, default=0.8,
                            help="训练集比例")
    opts = arg_parser.parse_args()

    # Build the preparer and run the full preparation pipeline.
    preparer = SentimentDataPreparer(output_dir=opts.output_dir)
    preparer.prepare_datasets(
        num_synthetic=opts.num_synthetic,
        dialogue_path=opts.dialogue_path,
        train_ratio=opts.train_ratio,
    )


if __name__ == "__main__":
    main()