Spaces:
Sleeping
Sleeping
File size: 13,170 Bytes
c8b1f17 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 |
"""
准备情绪分析训练数据
从HR对话数据中生成带情绪标注的训练样本
"""
import json
import random
from pathlib import Path
from typing import Dict, List, Optional
class SentimentDataPreparer:
    """Prepare sentiment-analysis training data for HR chat scenarios.

    Produces ``{"text": ..., "label": ...}`` samples with labels
    ``positive`` / ``neutral`` / ``negative`` from three sources:
    hand-written seed examples, template/keyword-based synthetic text,
    and (optionally) existing dialogue logs labelled with a keyword
    heuristic.  ``prepare_datasets`` combines all three, splits them,
    and writes ``train.json`` / ``val.json``.
    """

    # HR-domain keyword lexicons, used both to synthesize samples and to
    # heuristically label dialogue turns.  English entries are lower-case
    # because _infer_sentiment matches against text.lower().
    # NOTE(review): both lists contain literal duplicates ("满意", "拒绝",
    # "错误"); they are de-duplicated at sampling time below, and in
    # _infer_sentiment a duplicate merely double-counts without ever
    # flipping the verdict.  Kept as-is to preserve the public constants.
    POSITIVE_KEYWORDS = [
        "好", "满意", "喜欢", "谢谢", "感谢", "可以", "行", "没问题", "支持",
        "理解", "配合", "接受", "认可", "专业", "高效", "合理", "规范",
        "excellent", "good", "ok", "yes", "great", "满意", "帮助", "清楚"
    ]
    NEGATIVE_KEYWORDS = [
        "不", "没", "不行", "不好", "讨厌", "烦", "生气", "愤怒", "不满",
        "投诉", "举报", "错误", "失败", "拒绝", "反对", "抗议", "质疑",
        "不合理", "强制", "拒绝", "no", "not", "bad", "错误", "问题",
        "失望", "糟糕", "差", "难受", "不满意"
    ]
    # (template, label) pairs.  Despite the name, this list mixes
    # "positive" and "neutral" templates; generate_synthetic_data splits
    # it by label before drawing.  Templates without a {topic} placeholder
    # are unaffected by .format(topic=...).
    POSITIVE_TEMPLATES = [
        ("好的,我来帮你处理{topic}", "positive"),
        ("感谢你的反馈,我们会尽快处理", "positive"),
        ("没问题,这个申请我可以帮你", "positive"),
        ("非常满意这次的服务", "positive"),
        ("好的,我理解公司的规定", "positive"),
        ("谢谢你的解答,很清楚", "positive"),
        ("支持公司的决定", "positive"),
        ("流程很规范,我很认可", "positive"),
        ("专业高效的回复,感谢", "positive"),
        ("配合完成相关手续", "positive"),
        ("你好,我想咨询一下{topic}", "neutral"),
        ("请问申请{topic}需要什么材料", "neutral"),
        ("我想了解一下关于{topic}的政策", "neutral"),
        ("请问办理{topic}的流程是什么", "neutral"),
        ("{topic}的申请条件是什么", "neutral"),
        ("好的,我知道了", "neutral"),
        ("请问还有什么需要补充的吗", "neutral"),
        ("请提供具体的证明材料", "neutral"),
        ("请问处理时间需要多久", "neutral"),
        ("我想确认一下{topic}的进度", "neutral"),
    ]
    # (template, label) pairs for complaint / rejection utterances.
    NEGATIVE_TEMPLATES = [
        ("我对这个处理结果很不满意", "negative"),
        ("这个制度太不合理了,我很生气", "negative"),
        ("为什么要强制执行这个规定", "negative"),
        ("你们的做法让我很失望", "negative"),
        ("我要投诉这个处理方式", "negative"),
        ("拒绝接受这个安排", "negative"),
        ("这个规定存在很大问题", "negative"),
        ("我对这个结果很不理解", "negative"),
        ("服务质量太差了,很不满", "negative"),
        ("这个流程太复杂了,很烦", "negative"),
    ]
    # HR topics substituted into the {topic} placeholder.
    HR_TOPICS = [
        "培训申请", "请假申请", "报销申请", "社保查询",
        "公积金查询", "工资条", "劳动合同", "离职手续",
        "入职办理", "证明开具", "福利申请", "考勤异常"
    ]

    def __init__(self, output_dir: str = "./data/processed/sentiment"):
        """
        Args:
            output_dir: Directory where train.json / val.json are written;
                created (with parents) if it does not exist.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def generate_synthetic_data(self, num_samples: int = 500) -> List[Dict]:
        """Generate synthetic labelled samples from templates and keywords.

        Args:
            num_samples: Number of samples to return.

        Returns:
            Exactly ``num_samples`` samples (empty list if it is <= 0),
            shuffled, drawn from template, keyword-combination, and
            mixed-scenario generators.
        """
        print(f"生成 {num_samples} 条合成数据...")
        data: List[Dict] = []
        if num_samples <= 0:
            return data
        # FIX: the original drew "positive" samples from the mixed
        # POSITIVE_TEMPLATES list (so a third were actually neutral) and
        # only sometimes emitted a neutral sample.  Splitting by label
        # fixes both the mislabelled draws and the skewed class balance.
        positive_templates = [t for t, lbl in self.POSITIVE_TEMPLATES if lbl == "positive"]
        neutral_templates = [t for t, lbl in self.POSITIVE_TEMPLATES if lbl == "neutral"]
        # De-duplicated lexicons so random.sample cannot return the same
        # word twice within one generated text.
        positive_words = list(dict.fromkeys(self.POSITIVE_KEYWORDS))
        negative_words = list(dict.fromkeys(self.NEGATIVE_KEYWORDS))
        # At least one round, so small num_samples (1-2) no longer yields [].
        rounds = max(1, num_samples // 3)

        # Template-based samples: one positive, one neutral, one negative
        # per round.
        for _ in range(rounds):
            topic = random.choice(self.HR_TOPICS)
            text = random.choice(positive_templates).format(topic=topic)
            data.append({"text": text, "label": "positive"})
            topic = random.choice(self.HR_TOPICS)
            text = random.choice(neutral_templates).format(topic=topic)
            data.append({"text": text, "label": "neutral"})
            template, label = random.choice(self.NEGATIVE_TEMPLATES)
            data.append({"text": template, "label": label})

        # Keyword-combination samples.
        for _ in range(rounds):
            words = random.sample(positive_words, random.randint(1, 3))
            data.append({"text": f"{','.join(words)}!", "label": "positive"})
            words = random.sample(negative_words, random.randint(1, 3))
            data.append({"text": f"{','.join(words)}!", "label": "negative"})

        # Mixed-scenario samples (keyword embedded in a topical sentence).
        for _ in range(rounds):
            pos_word = random.choice(positive_words)
            topic = random.choice(self.HR_TOPICS)
            text = f"{topic}方面{pos_word},请问具体流程是什么"
            data.append({"text": text, "label": "positive"})
            neg_word = random.choice(negative_words)
            topic = random.choice(self.HR_TOPICS)
            text = f"关于{topic},我感到很{neg_word},要求重新处理"
            data.append({"text": text, "label": "negative"})

        random.shuffle(data)
        return data[:num_samples]

    def load_existing_dialogues(self, dialogue_path: str) -> List[Dict]:
        """Load dialogue turns from a JSON file and label them heuristically.

        Expected layout: a list of dialogues, each carrying a ``turns``
        list of ``{"utterance": ..., "speaker": ...}`` dicts.

        Args:
            dialogue_path: Path to the dialogue JSON file.

        Returns:
            Labelled samples; empty list if the file cannot be read or parsed.
        """
        print(f"从 {dialogue_path} 加载对话数据...")
        data: List[Dict] = []
        try:
            with open(dialogue_path, 'r', encoding='utf-8') as f:
                dialogues = json.load(f)
            for dialogue in dialogues:
                for turn in dialogue.get('turns', []):
                    text = turn.get('utterance', '')
                    speaker = turn.get('speaker', '')
                    if speaker == 'Employee':
                        # Employee turns carry sentiment worth inferring.
                        label = self._infer_sentiment(text)
                    else:
                        # HR replies are treated as neutral by convention.
                        label = 'neutral'
                    data.append({
                        'text': text,
                        'label': label,
                        'speaker': speaker,
                        'source': 'dialogue'
                    })
        except Exception as e:
            # Best-effort loader: a missing or malformed file is reported
            # and yields an empty result instead of aborting preparation.
            print(f"加载对话数据失败: {e}")
        return data

    def _infer_sentiment(self, text: str) -> str:
        """Label *text* by counting positive vs. negative keyword hits.

        Substring matching — e.g. "不" also fires inside longer words;
        a tie (including zero hits on both sides) yields 'neutral'.
        """
        lowered = text.lower()  # hoisted: lower-case once, not per keyword
        positive_count = sum(1 for word in self.POSITIVE_KEYWORDS if word in lowered)
        negative_count = sum(1 for word in self.NEGATIVE_KEYWORDS if word in lowered)
        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def create_manual_examples(self) -> List[Dict]:
        """Return the hand-labelled, high-quality seed examples."""
        return [
            # positive
            {"text": "好的,谢谢你的帮助,非常满意!", "label": "positive"},
            {"text": "没问题,我理解公司的规定", "label": "positive"},
            {"text": "太好了,感谢你的解答", "label": "positive"},
            {"text": "流程很规范,我很认可", "label": "positive"},
            {"text": "专业高效的回复,感谢支持", "label": "positive"},
            {"text": "配合完成相关手续,谢谢", "label": "positive"},
            {"text": "我对这次服务很满意", "label": "positive"},
            {"text": "好的,清楚明白了", "label": "positive"},
            # neutral
            {"text": "您好,请问有什么可以帮您?", "label": "neutral"},
            {"text": "我需要了解培训的具体时间", "label": "neutral"},
            {"text": "请问申请年假需要什么材料", "label": "neutral"},
            {"text": "我想咨询一下社保缴纳问题", "label": "neutral"},
            {"text": "好的,我知道了", "label": "neutral"},
            {"text": "请问还有什么需要补充的吗", "label": "neutral"},
            {"text": "请提供具体的证明材料", "label": "neutral"},
            {"text": "请问处理时间需要多久", "label": "neutral"},
            {"text": "我想确认一下申请进度", "label": "neutral"},
            {"text": "根据公司规定办理即可", "label": "neutral"},
            # negative
            {"text": "我对这个处理结果很不满意", "label": "negative"},
            {"text": "这个制度太不合理了,我很生气", "label": "negative"},
            {"text": "为什么要强制执行这个规定", "label": "negative"},
            {"text": "你们的做法让我很失望", "label": "negative"},
            {"text": "我要投诉这个处理方式", "label": "negative"},
            {"text": "拒绝接受这个安排", "label": "negative"},
            {"text": "这个规定存在很大问题", "label": "negative"},
            {"text": "我对这个结果很不满,要求重新处理", "label": "negative"},
            {"text": "服务质量太差了", "label": "negative"},
            {"text": "这个流程太复杂了,很烦", "label": "negative"},
        ]

    def prepare_datasets(
        self,
        num_synthetic: int = 300,
        dialogue_path: Optional[str] = None,
        train_ratio: float = 0.8
    ):
        """Build, split and save the train/validation datasets.

        Writes ``train.json`` and ``val.json`` under ``self.output_dir``.

        Args:
            num_synthetic: Number of synthetic samples to generate.
            dialogue_path: Optional path to existing dialogue JSON data.
            train_ratio: Fraction of samples assigned to the training set.

        Returns:
            Tuple ``(train_data, val_data)``.
        """
        print("开始准备情绪分析数据集...")
        all_data: List[Dict] = []

        # 1. Hand-labelled seed examples (highest quality).
        manual_data = self.create_manual_examples()
        all_data.extend(manual_data)
        print(f"添加 {len(manual_data)} 条手动标注数据")

        # 2. Template/keyword synthetic data.
        synthetic_data = self.generate_synthetic_data(num_synthetic)
        all_data.extend(synthetic_data)
        print(f"生成 {len(synthetic_data)} 条合成数据")

        # 3. Heuristically labelled dialogue turns, if a file was supplied.
        if dialogue_path and Path(dialogue_path).exists():
            dialogue_data = self.load_existing_dialogues(dialogue_path)
            all_data.extend(dialogue_data)
            print(f"从对话加载 {len(dialogue_data)} 条数据")

        random.shuffle(all_data)

        # Label distribution, printed as a sanity check.
        label_counts: Dict[str, int] = {}
        for item in all_data:
            label = item['label']
            label_counts[label] = label_counts.get(label, 0) + 1
        print("\n数据集统计:")
        print(f"  总样本数: {len(all_data)}")
        for label, count in sorted(label_counts.items()):
            print(f"  {label}: {count} ({count/len(all_data)*100:.1f}%)")

        # Simple ratio split; the data was shuffled above.
        split_idx = int(len(all_data) * train_ratio)
        train_data = all_data[:split_idx]
        val_data = all_data[split_idx:]

        train_path = self.output_dir / 'train.json'
        val_path = self.output_dir / 'val.json'
        with open(train_path, 'w', encoding='utf-8') as f:
            json.dump(train_data, f, ensure_ascii=False, indent=2)
        with open(val_path, 'w', encoding='utf-8') as f:
            json.dump(val_data, f, ensure_ascii=False, indent=2)
        print("\n数据已保存:")
        print(f"  训练集: {train_path} ({len(train_data)} 条)")
        print(f"  验证集: {val_path} ({len(val_data)} 条)")
        return train_data, val_data
def main():
    """CLI entry point: parse command-line options and build the datasets."""
    import argparse

    parser = argparse.ArgumentParser(description="准备情绪分析训练数据")
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./data/processed/sentiment",
        help="输出目录",
    )
    parser.add_argument("--num_synthetic", type=int, default=300, help="合成数据数量")
    parser.add_argument("--dialogue_path", type=str, default=None, help="对话数据路径")
    parser.add_argument("--train_ratio", type=float, default=0.8, help="训练集比例")
    args = parser.parse_args()

    # Build the preparer and run the full pipeline with the parsed options.
    SentimentDataPreparer(output_dir=args.output_dir).prepare_datasets(
        num_synthetic=args.num_synthetic,
        dialogue_path=args.dialogue_path,
        train_ratio=args.train_ratio,
    )


if __name__ == "__main__":
    main()
|