|
|
import hashlib
import re
from collections import Counter, defaultdict
from typing import List, Dict, Tuple

from tqdm import tqdm

from config import Config
from core.openai_client import OpenAIClient
from utils.cache_manager import CacheManager
|
|
|
|
|
class CharacterExtractor:
    """Character extractor: identifies the main characters of a large text.

    Pipeline:
        1. Regex-based candidate name extraction per chunk.
        2. Frequency / chunk-spread filtering and ranking.
        3. Merging of name variants (e.g. "Harry" into "Harry Potter").
        4. Optional LLM-based refinement of the candidate list.
    """

    def __init__(self):
        # Shared OpenAI client obtained from the project-level singleton.
        self.client = OpenAIClient.get_client()
        # Cache so repeated runs over the same chunks skip re-extraction.
        self.cache = CacheManager()

    def extract_names_pattern(self, text: str, language: str = "en") -> List[str]:
        """Extract candidate person names using regular expressions.

        Args:
            text: Input text to scan.
            language: Language code, 'en' or 'zh'.

        Returns:
            Raw list of candidate names (may contain duplicates and noise;
            callers are expected to filter).
        """
        if language == "en":
            # Sequences of capitalized words, e.g. "Harry" / "Harry Potter".
            pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
        else:
            # A common Chinese surname followed by 1-3 CJK characters.
            # NOTE(review): the greedy {1,3} can over-capture trailing CJK
            # text when a name runs straight into the following characters;
            # downstream length filtering mitigates but does not remove this.
            common_surnames = '哈赫王李张刘陈杨黄赵吴周徐孙马朱胡郭何林高梁郑罗宋谢唐韩曹许邓萧冯曾程蔡彭潘袁于董余苏叶吕魏蒋田杜丁沈姜范江傅钟卢汪戴崔任陆廖姚方金邱夏谭韦贾邹石熊孟秦阎薛侯雷白龙段郝孔邵史毛常万顾赖武康贺严尹钱施牛洪龚'
            pattern = f'[{common_surnames}][\\u4e00-\\u9fff]{{1,3}}'

        return re.findall(pattern, text)

    def extract_characters_from_chunks(self, chunks: List[Dict],
                                       language: str = "en") -> Dict[str, Dict]:
        """Extract characters and their occurrence info from text chunks.

        Args:
            chunks: Chunk dicts; each must provide 'text', 'chunk_id', 'start'.
            language: Language code, 'en' or 'zh'.

        Returns:
            Mapping name -> {'count', 'chunks', 'positions', 'contexts'},
            where 'chunks' is a list of chunk ids (cache-friendly).
        """
        # Stable, content-based cache key. BUG FIX: the built-in hash() on
        # strings is salted per process (PYTHONHASHSEED), so the previous
        # f"characters_{hash(...)}" key changed every run and the cache
        # could never hit across runs. An md5 digest is deterministic.
        digest = hashlib.md5(str(chunks[:3]).encode('utf-8')).hexdigest()
        cache_key = f"characters_{digest}"
        cached = self.cache.get(cache_key)
        if cached:
            print("从缓存加载角色信息")
            return cached

        character_mentions = defaultdict(lambda: {
            'count': 0,
            'chunks': set(),
            'positions': [],
            'contexts': []  # reserved; not populated by this pass
        })

        # Words that match the name regex but are never character names.
        blacklist = {
            # structural markers
            'Chapter', 'CHAPTER', 'Part', 'PART', 'Section', 'SECTION',
            'Book', 'BOOK', 'Volume', 'VOLUME',
            # function words
            'The', 'THE', 'And', 'AND', 'But', 'BUT', 'Or', 'OR',
            'When', 'WHERE', 'What', 'WHAT', 'Who', 'WHO', 'Why', 'WHY', 'How', 'HOW',
            # honorifics
            'Mr', 'Mrs', 'Ms', 'Miss', 'Dr', 'Professor', 'Sir', 'Madam',
            # weekdays / months
            'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
            'January', 'February', 'March', 'April', 'May', 'June', 'July',
            'August', 'September', 'October', 'November', 'December',
            # places / institutions
            'Street', 'Road', 'Avenue', 'Place', 'Square', 'Hall', 'House',
            'School', 'Academy', 'University', 'Castle', 'Forest', 'Mountain',
            # book apparatus / ordinals
            'Note', 'End', 'Beginning', 'Epilogue', 'Prologue', 'Appendix',
            'First', 'Second', 'Third', 'Last', 'Next', 'Previous',
        }

        print("\n提取角色名称...")
        for chunk in tqdm(chunks):
            names = self.extract_names_pattern(chunk['text'], language)

            for name in names:
                # Plausible name length per language. BUG FIX: a later
                # unconditional `if len(name) <= 2: continue` used to discard
                # every two-character Chinese name even though this gate
                # deliberately admits them; length is now checked only here.
                if language == "en":
                    if len(name) < 3 or len(name) > 30:
                        continue
                else:
                    if len(name) < 2 or len(name) > 4:
                        continue

                if name in blacklist:
                    continue

                if name.isdigit():
                    continue

                # Short all-caps tokens are acronyms/headings, not names.
                if name.isupper() and len(name) < 5:
                    continue

                character_mentions[name]['count'] += 1
                character_mentions[name]['chunks'].add(chunk['chunk_id'])
                character_mentions[name]['positions'].append(chunk['start'])

        # Sets are not cache/JSON friendly; convert before storing.
        for char in character_mentions:
            character_mentions[char]['chunks'] = list(character_mentions[char]['chunks'])

        self.cache.set(cache_key, dict(character_mentions))

        return dict(character_mentions)

    def rank_characters(self, character_mentions: Dict) -> List[Tuple[str, Dict]]:
        """Rank characters by mention count and chunk spread.

        Args:
            character_mentions: Mapping produced by
                extract_characters_from_chunks ('chunks' already a list).

        Returns:
            The top Config.TOP_N_CHARACTERS (name, info) pairs, best first.
        """
        scored_characters = []
        for name, info in character_mentions.items():
            # Chunk spread is weighted double: a name recurring across many
            # chunks is more likely a real character than local noise.
            score = info['count'] + len(info['chunks']) * 2
            scored_characters.append((name, info, score))

        scored_characters.sort(key=lambda item: item[2], reverse=True)

        return [(name, info) for name, info, _ in scored_characters[:Config.TOP_N_CHARACTERS]]

    @staticmethod
    def _is_name_variant(shorter: str, longer: str) -> bool:
        """Return True if `shorter` is a whole-word sub-name of `longer`.

        "Harry" is a variant of "Harry Potter", but "Ann" is NOT a variant
        of "Anna" (the previous raw substring test merged such pairs).
        """
        long_tokens = longer.split()
        if len(long_tokens) <= 1:
            # Single-token pair: substring containment is unreliable
            # (e.g. "Ann" in "Anna"), so require an exact match.
            return shorter == longer
        short_tokens = shorter.split()
        width = len(short_tokens)
        # Contiguous token-subsequence match ("Harry" ⊂ "Harry Potter").
        return any(long_tokens[i:i + width] == short_tokens
                   for i in range(len(long_tokens) - width + 1))

    def merge_similar_names(self, characters: List[Tuple[str, Dict]]) -> List[Tuple[str, Dict]]:
        """Merge name variants (e.g. "Harry" and "Harry Potter").

        Args:
            characters: Ranked (name, info) pairs.

        Returns:
            (name, info) pairs with variants merged under the longest (most
            specific) name: counts summed, chunk lists unioned. BUG FIX:
            matching is now whole-word based (raw substring containment used
            to merge unrelated names like "Ann"/"Anna"), and the caller's
            info dicts are no longer mutated in place.
        """
        merged = {}

        for name, info in characters:
            target = None
            for existing_name in list(merged.keys()):
                shorter, longer = sorted((name, existing_name), key=len)
                if self._is_name_variant(shorter, longer):
                    # Canonicalize the merged entry under the longer name.
                    if longer not in merged:
                        merged[longer] = merged.pop(existing_name)
                    target = longer
                    break

            if target is None:
                # Shallow copy (plus fresh chunk list) so input dicts from
                # character_mentions are never mutated by later merges.
                merged[name] = {**info, 'chunks': list(info['chunks'])}
            else:
                merged[target]['count'] += info['count']
                merged[target]['chunks'] = list(set(merged[target]['chunks']) | set(info['chunks']))

        return list(merged.items())

    def refine_with_llm(self, text_sample: str, candidate_names: List[str]) -> List[str]:
        """Ask the LLM to confirm which candidates are real main characters.

        Args:
            text_sample: Novel sample (only the first 2000 chars are sent).
            candidate_names: Candidates (only the first 30 are sent).

        Returns:
            Confirmed character names. On any API failure, falls back to a
            heuristically filtered subset of the candidates (best-effort,
            never raises).
        """
        prompt = f"""
以下是一部小说的开头片段。请识别出主要角色的名字。

候选名字列表:{', '.join(candidate_names[:30])}

文本片段:
{text_sample[:2000]}

请返回确认的主要角色名字,每行一个。只返回真正的角色名,排除:
1. 地名、物品名
2. 章节标记(如 "Chapter")
3. 称谓词(如 "Mr", "Professor")
4. 普通名词

只返回人名,每个名字一行。
"""

        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "你是一个文学分析专家,擅长识别小说角色。请只返回真正的人物角色名字。"},
                    {"role": "user", "content": prompt}
                ]
            )

            result = response.choices[0].message.content.strip()
            confirmed_names = [name.strip() for name in result.split('\n') if name.strip()]

            # Post-filter: the model occasionally echoes noise lines.
            final_names = []
            for name in confirmed_names:
                if len(name) < 3:
                    continue
                if any(c.isdigit() for c in name):
                    continue
                if name.isupper() and len(name) < 5:
                    continue
                final_names.append(name)

            return final_names

        except Exception as e:
            # Deliberate best-effort fallback: keep the top candidates after
            # a light heuristic filter instead of failing the pipeline.
            print(f"LLM精炼失败: {e}")
            filtered = []
            for name in candidate_names[:20]:
                if len(name) >= 3 and not name.isdigit() and name not in ['Chapter', 'Book', 'Part']:
                    filtered.append(name)
            return filtered[:10]

    def extract_main_characters(self, chunks: List[Dict],
                                text_sample: str = None,
                                language: str = "en") -> List[Dict]:
        """Run the full main-character extraction pipeline.

        Args:
            chunks: Chunk dicts with 'text', 'chunk_id' and 'start'.
            text_sample: Optional text sample used for LLM refinement.
            language: Language code, 'en' or 'zh'.

        Returns:
            List of {'name': str, 'info': dict} records, most important first.
        """
        print("\n" + "="*60)
        print("开始角色提取流程")
        print("="*60)

        character_mentions = self.extract_characters_from_chunks(chunks, language)
        print(f"\n初步提取到 {len(character_mentions)} 个名字")

        # Keep only names frequent enough to plausibly be characters.
        filtered = {name: info for name, info in character_mentions.items()
                    if info['count'] >= Config.MIN_CHARACTER_MENTIONS}
        print(f"过滤后剩余 {len(filtered)} 个角色(出现次数≥{Config.MIN_CHARACTER_MENTIONS})")

        if len(filtered) == 0:
            print("⚠️ 没有找到符合条件的角色,降低阈值...")
            # Retry with a halved (but never below 3) mention threshold.
            filtered = {name: info for name, info in character_mentions.items()
                        if info['count'] >= max(3, Config.MIN_CHARACTER_MENTIONS // 2)}
            print(f"降低阈值后找到 {len(filtered)} 个角色")

        ranked = self.rank_characters(filtered)
        print(f"排序后前 {len(ranked)} 个角色")

        merged = self.merge_similar_names(ranked)
        print(f"合并相似名字后:{len(merged)} 个角色")

        # LLM refinement only pays off while the list is still noisy.
        if text_sample and len(merged) > 15:
            print("\n使用 LLM 精炼角色列表...")
            candidate_names = [name for name, _ in merged[:30]]
            try:
                confirmed = self.refine_with_llm(text_sample, candidate_names)

                if confirmed:
                    final_characters = []
                    for name, info in merged:
                        # Keep a merged name if any confirmed name matches
                        # it in either containment direction.
                        if any(confirmed_name in name or name in confirmed_name
                               for confirmed_name in confirmed):
                            final_characters.append({'name': name, 'info': info})

                    if final_characters:
                        print(f"LLM确认后:{len(final_characters)} 个主要角色")
                    else:
                        print("LLM确认后没有角色,使用原始列表")
                        final_characters = [{'name': name, 'info': info} for name, info in merged]
                else:
                    print("LLM未返回结果,使用原始列表")
                    final_characters = [{'name': name, 'info': info} for name, info in merged]
            except Exception as e:
                print(f"LLM精炼出错: {e},使用原始列表")
                final_characters = [{'name': name, 'info': info} for name, info in merged]
        else:
            final_characters = [{'name': name, 'info': info} for name, info in merged]

        # Final safety sweep for structural words that slipped through.
        cleaned_characters = []
        for char in final_characters:
            name = char['name']
            if name in ['Chapter', 'Book', 'Part', 'Section', 'The', 'And', 'But', 'When', 'Where']:
                continue
            if name.lower() in ['chapter', 'book', 'part', 'section']:
                continue
            cleaned_characters.append(char)

        if len(cleaned_characters) < len(final_characters):
            print(f"最终清理后:{len(cleaned_characters)} 个角色")
            final_characters = cleaned_characters

        print("\n" + "="*60)
        print("主要角色列表(按重要性排序)")
        print("="*60)
        for i, char in enumerate(final_characters[:15], 1):
            print(f"{i}. {char['name']:<20} - 出现{char['info']['count']}次, "
                  f"分布在{len(char['info']['chunks'])}个章节")

        if not final_characters:
            print("\n⚠️ 警告:未找到任何角色!")
            print("建议:")
            print(" 1. 检查文本格式是否正确")
            print(" 2. 确认文本语言设置")
            print(" 3. 尝试降低 MIN_CHARACTER_MENTIONS 参数")

        return final_characters