File size: 14,351 Bytes
a226682 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 |
import hashlib
import re
from collections import Counter, defaultdict
from typing import List, Dict, Tuple

from tqdm import tqdm

from config import Config
from core.openai_client import OpenAIClient
from utils.cache_manager import CacheManager
class CharacterExtractor:
    """Character extractor: identifies the main characters of a large text.

    Pipeline: regex candidate extraction -> frequency/distribution ranking
    -> similar-name merging -> optional LLM refinement -> final cleanup.
    """

    def __init__(self):
        # Project-provided OpenAI client and persistent result cache.
        self.client = OpenAIClient.get_client()
        self.cache = CacheManager()

    def extract_names_pattern(self, text: str, language: str = "en") -> List[str]:
        """Extract candidate person names using regular expressions.

        Args:
            text: Input text.
            language: Language type ('en' or 'zh').

        Returns:
            List of candidate names; may contain duplicates and false
            positives — downstream filtering handles both.
        """
        if language == "en":
            # English names: one or more consecutive capitalized words.
            pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
        else:
            # Chinese names: a common surname followed by 1-3 CJK characters.
            # NOTE(review): the greedy {1,3} can swallow trailing non-name
            # characters (e.g. 张三说 -> "张三说"); kept as-is since later
            # frequency filtering suppresses such noise.
            common_surnames = '哈赫王李张刘陈杨黄赵吴周徐孙马朱胡郭何林高梁郑罗宋谢唐韩曹许邓萧冯曾程蔡彭潘袁于董余苏叶吕魏蒋田杜丁沈姜范江傅钟卢汪戴崔任陆廖姚方金邱夏谭韦贾邹石熊孟秦阎薛侯雷白龙段郝孔邵史毛常万顾赖武康贺严尹钱施牛洪龚'
            pattern = f'[{common_surnames}][\\u4e00-\\u9fff]{{1,3}}'
        return re.findall(pattern, text)

    def extract_characters_from_chunks(self, chunks: List[Dict],
                                       language: str = "en") -> Dict[str, Dict]:
        """Extract character names and their occurrence data from text chunks.

        Args:
            chunks: Chunk dicts; each must provide 'text', 'chunk_id' and 'start'.
            language: Language type ('en' or 'zh').

        Returns:
            Mapping from name to {'count', 'chunks', 'positions', 'contexts'}.
        """
        # BUG FIX: the cache key previously used hash(str(...)), which varies
        # between interpreter runs (string hash randomization) and ignored
        # `language`, so the persisted cache was unstable and could return
        # English results for a Chinese request.
        digest = hashlib.md5(str(chunks[:3]).encode('utf-8')).hexdigest()
        cache_key = f"characters_{language}_{digest}"
        cached = self.cache.get(cache_key)
        if cached is not None:  # an empty result is still a valid cache hit
            print("从缓存加载角色信息")
            return cached

        character_mentions = defaultdict(lambda: {
            'count': 0,
            'chunks': set(),
            'positions': [],
            'contexts': []
        })

        # Blacklist of frequent capitalized tokens that are not characters.
        blacklist = {
            # Chapter markers
            'Chapter', 'CHAPTER', 'Part', 'PART', 'Section', 'SECTION',
            'Book', 'BOOK', 'Volume', 'VOLUME',
            # Articles and conjunctions
            'The', 'THE', 'And', 'AND', 'But', 'BUT', 'Or', 'OR',
            # Question words
            'When', 'WHERE', 'What', 'WHAT', 'Who', 'WHO', 'Why', 'WHY', 'How', 'HOW',
            # Titles appearing on their own
            'Mr', 'Mrs', 'Ms', 'Miss', 'Dr', 'Professor', 'Sir', 'Madam',
            # Weekdays
            'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
            # Months
            'January', 'February', 'March', 'April', 'May', 'June', 'July',
            'August', 'September', 'October', 'November', 'December',
            # Common places
            'Street', 'Road', 'Avenue', 'Place', 'Square', 'Hall', 'House',
            'School', 'Academy', 'University', 'Castle', 'Forest', 'Mountain',
            # Other common non-name words
            'Note', 'End', 'Beginning', 'Epilogue', 'Prologue', 'Appendix',
            'First', 'Second', 'Third', 'Last', 'Next', 'Previous',
        }

        print("\n提取角色名称...")
        for chunk in tqdm(chunks):
            for name in self.extract_names_pattern(chunk['text'], language):
                # Language-specific length bounds.
                if language == "en":
                    if len(name) < 3 or len(name) > 30:
                        continue
                else:  # zh
                    if len(name) < 2 or len(name) > 4:
                        continue
                # BUG FIX: a blanket `len(name) <= 2` filter used to run after
                # these bounds and silently dropped every two-character Chinese
                # name (e.g. 李明); for English it was dead code (len < 3
                # already rejected those). Removed.
                if name in blacklist:
                    continue
                if name.isdigit():
                    continue
                # Short all-caps tokens are likely acronyms or headings.
                if name.isupper() and len(name) < 5:
                    continue
                entry = character_mentions[name]
                entry['count'] += 1
                entry['chunks'].add(chunk['chunk_id'])
                entry['positions'].append(chunk['start'])

        # Convert sets to lists so the result is serializable.
        for info in character_mentions.values():
            info['chunks'] = list(info['chunks'])

        result = dict(character_mentions)
        self.cache.set(cache_key, result)
        return result

    def rank_characters(self, character_mentions: Dict) -> List[Tuple[str, Dict]]:
        """Rank characters by frequency and spread across chunks.

        Args:
            character_mentions: Mapping produced by
                extract_characters_from_chunks.

        Returns:
            Top Config.TOP_N_CHARACTERS (name, info) pairs, best first.
        """
        # Composite score: raw mention count plus double weight for the number
        # of distinct chunks the name appears in (spread beats repetition).
        scored = [
            (name, info, info['count'] + len(info['chunks']) * 2)
            for name, info in character_mentions.items()
        ]
        scored.sort(key=lambda item: item[2], reverse=True)
        return [(name, info) for name, info, _ in scored[:Config.TOP_N_CHARACTERS]]

    def merge_similar_names(self, characters: List[Tuple[str, Dict]]) -> List[Tuple[str, Dict]]:
        """Merge names that contain one another (e.g. 'Harry' / 'Harry Potter').

        Stats are accumulated under the longer of each matching pair of names.

        Args:
            characters: Ranked (name, info) pairs.

        Returns:
            Merged (name, info) pairs. The caller's info dicts are never
            mutated.

        Note:
            Matching is plain substring containment, so unrelated names can
            collide (e.g. 'Ann' inside 'Hannah'); acceptable on a ranked,
            frequency-filtered candidate list.
        """
        merged: Dict[str, Dict] = {}
        for name, info in characters:
            absorbed = False
            for existing in list(merged.keys()):
                if name in existing or existing in name:
                    # Keep the stats under the longer spelling.
                    longer = name if len(name) > len(existing) else existing
                    if longer not in merged:
                        merged[longer] = merged.pop(existing)
                    target = merged[longer]
                    target['count'] += info['count']
                    target['chunks'] = list(set(target['chunks']) | set(info['chunks']))
                    absorbed = True
                    break
            if not absorbed:
                # BUG FIX: store an independent copy so later merges never
                # mutate the caller's (possibly cached) dictionaries.
                merged[name] = {**info, 'chunks': list(info['chunks'])}
        return list(merged.items())

    def refine_with_llm(self, text_sample: str, candidate_names: List[str]) -> List[str]:
        """Use an LLM to confirm which candidates are real characters.

        Args:
            text_sample: Sample text (the opening of the novel).
            candidate_names: Candidate character names.

        Returns:
            Confirmed character names; on API failure, falls back to a
            filtered prefix of the candidates (at most 10).
        """
        prompt = f"""
以下是一部小说的开头片段。请识别出主要角色的名字。
候选名字列表:{', '.join(candidate_names[:30])}
文本片段:
{text_sample[:2000]}
请返回确认的主要角色名字,每行一个。只返回真正的角色名,排除:
1. 地名、物品名
2. 章节标记(如 "Chapter")
3. 称谓词(如 "Mr", "Professor")
4. 普通名词
只返回人名,每个名字一行。
"""
        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "你是一个文学分析专家,擅长识别小说角色。请只返回真正的人物角色名字。"},
                    {"role": "user", "content": prompt}
                ]
            )
            result = response.choices[0].message.content.strip()
            confirmed = [line.strip() for line in result.split('\n') if line.strip()]
            # Post-filter obviously wrong answers from the model.
            final_names = []
            for name in confirmed:
                if len(name) < 3:
                    continue
                if any(ch.isdigit() for ch in name):
                    continue
                if name.isupper() and len(name) < 5:
                    continue
                final_names.append(name)
            return final_names
        except Exception as e:
            print(f"LLM精炼失败: {e}")
            # Fallback: keep the first candidates, minus obvious non-names.
            fallback = [
                name for name in candidate_names[:20]
                if len(name) >= 3 and not name.isdigit()
                and name not in ('Chapter', 'Book', 'Part')
            ]
            return fallback[:10]

    def extract_main_characters(self, chunks: List[Dict],
                                text_sample: str = None,
                                language: str = "en") -> List[Dict]:
        """Run the full main-character extraction pipeline.

        Args:
            chunks: Chunk dicts ('text', 'chunk_id', 'start').
            text_sample: Optional sample text used for LLM refinement.
            language: Language type ('en' or 'zh').

        Returns:
            List of {'name': str, 'info': dict} for the main characters,
            ordered by importance.
        """
        print("\n" + "=" * 60)
        print("开始角色提取流程")
        print("=" * 60)

        # 1. Extract every candidate name from the chunks.
        character_mentions = self.extract_characters_from_chunks(chunks, language)
        print(f"\n初步提取到 {len(character_mentions)} 个名字")

        # 2. Drop low-frequency candidates.
        filtered = {name: info for name, info in character_mentions.items()
                    if info['count'] >= Config.MIN_CHARACTER_MENTIONS}
        print(f"过滤后剩余 {len(filtered)} 个角色(出现次数≥{Config.MIN_CHARACTER_MENTIONS})")
        if len(filtered) == 0:
            print("⚠️ 没有找到符合条件的角色,降低阈值...")
            # Retry with a halved threshold, never below 3 mentions.
            filtered = {name: info for name, info in character_mentions.items()
                        if info['count'] >= max(3, Config.MIN_CHARACTER_MENTIONS // 2)}
            print(f"降低阈值后找到 {len(filtered)} 个角色")

        # 3. Rank by composite score.
        ranked = self.rank_characters(filtered)
        print(f"排序后前 {len(ranked)} 个角色")

        # 4. Merge names that contain one another.
        merged = self.merge_similar_names(ranked)
        print(f"合并相似名字后:{len(merged)} 个角色")

        # 5. Optional LLM refinement (only worthwhile for long lists).
        if text_sample and len(merged) > 15:
            print("\n使用 LLM 精炼角色列表...")
            candidate_names = [name for name, _ in merged[:30]]
            try:
                confirmed = self.refine_with_llm(text_sample, candidate_names)
                if confirmed:
                    # Keep only names the LLM confirmed; substring matching in
                    # either direction tolerates partial spellings.
                    final_characters = [
                        {'name': name, 'info': info}
                        for name, info in merged
                        if any(c in name or name in c for c in confirmed)
                    ]
                    if final_characters:
                        print(f"LLM确认后:{len(final_characters)} 个主要角色")
                    else:
                        print("LLM确认后没有角色,使用原始列表")
                        final_characters = [{'name': name, 'info': info} for name, info in merged]
                else:
                    print("LLM未返回结果,使用原始列表")
                    final_characters = [{'name': name, 'info': info} for name, info in merged]
            except Exception as e:
                print(f"LLM精炼出错: {e},使用原始列表")
                final_characters = [{'name': name, 'info': info} for name, info in merged]
        else:
            final_characters = [{'name': name, 'info': info} for name, info in merged]

        # 6. Final sweep: remove obvious non-names that slipped through.
        cleaned_characters = [
            char for char in final_characters
            if char['name'] not in ['Chapter', 'Book', 'Part', 'Section', 'The', 'And', 'But', 'When', 'Where']
            and char['name'].lower() not in ['chapter', 'book', 'part', 'section']
        ]
        if len(cleaned_characters) < len(final_characters):
            print(f"最终清理后:{len(cleaned_characters)} 个角色")
            final_characters = cleaned_characters

        # Print the final roster (top 15).
        print("\n" + "=" * 60)
        print("主要角色列表(按重要性排序)")
        print("=" * 60)
        for i, char in enumerate(final_characters[:15], 1):
            print(f"{i}. {char['name']:<20} - 出现{char['info']['count']}次, "
                  f"分布在{len(char['info']['chunks'])}个章节")

        if not final_characters:
            print("\n⚠️ 警告:未找到任何角色!")
            print("建议:")
            print(" 1. 检查文本格式是否正确")
            print(" 2. 确认文本语言设置")
            print(" 3. 尝试降低 MIN_CHARACTER_MENTIONS 参数")

        return final_characters