File size: 14,351 Bytes
a226682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
import re
from typing import List, Dict, Tuple
from collections import Counter, defaultdict
from tqdm import tqdm
from config import Config
from utils.cache_manager import CacheManager
from core.openai_client import OpenAIClient

class CharacterExtractor:
    """角色提取器 - 从大规模文本中识别主要角色"""
    
    def __init__(self) -> None:
        """Create the extractor with its LLM client and result cache."""
        # Client obtained from the project's OpenAIClient wrapper; refine_with_llm
        # sends its chat-completion request through it.
        self.client = OpenAIClient.get_client()
        # Cache that extract_characters_from_chunks reads from and writes to.
        self.cache = CacheManager()
    
    def extract_names_pattern(self, text: str, language: str = "en") -> List[str]:
        """使用正则模式提取人名
        
        Args:
            text: 输入文本
            language: 语言类型 ('en' 或 'zh')
            
        Returns:
            提取到的人名列表
        """
        if language == "en":
            # 英文人名:大写字母开头的连续词
            pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
        else:
            # 中文人名:常见姓氏+1-2个字
            common_surnames = '哈赫王李张刘陈杨黄赵吴周徐孙马朱胡郭何林高梁郑罗宋谢唐韩曹许邓萧冯曾程蔡彭潘袁于董余苏叶吕魏蒋田杜丁沈姜范江傅钟卢汪戴崔任陆廖姚方金邱夏谭韦贾邹石熊孟秦阎薛侯雷白龙段郝孔邵史毛常万顾赖武康贺严尹钱施牛洪龚'
            pattern = f'[{common_surnames}][\\u4e00-\\u9fff]{{1,3}}'
        
        names = re.findall(pattern, text)
        return names
    
    def extract_characters_from_chunks(self, chunks: List[Dict], 
                                      language: str = "en") -> Dict[str, Dict]:
        """Collect mention statistics for every candidate name in the chunks.

        Each chunk is scanned with ``extract_names_pattern``; candidates that
        pass the length and blacklist filters are tallied. The result is
        written to ``self.cache`` and returned from there when the same
        chunks and language are requested again.

        Args:
            chunks: Chunk dicts; each must provide ``'text'``, ``'chunk_id'``
                and ``'start'``.
            language: ``"en"`` for English; any other value is handled as
                Chinese.

        Returns:
            Mapping of name -> info dict with the keys ``'count'`` (number of
            mentions), ``'chunks'`` (distinct chunk ids in first-seen order),
            ``'positions'`` (the ``start`` of the containing chunk, once per
            mention) and ``'contexts'`` (never filled here; always empty).
        """
        import hashlib  # local import so the block does not depend on file-level edits

        # Content-derived cache key. The built-in hash() of a str is salted per
        # interpreter process, and hashing only the first three chunks ignored
        # both the language and the rest of the text, so the key could neither
        # be reproduced across runs nor distinguish different inputs.
        digest = hashlib.sha256()
        digest.update(f"{language}|{len(chunks)}".encode("utf-8"))
        for chunk in chunks:
            digest.update(f"|{chunk['chunk_id']}|{chunk['start']}|".encode("utf-8"))
            digest.update(chunk['text'].encode("utf-8", errors="replace"))
        cache_key = f"characters_{digest.hexdigest()[:16]}"

        # NOTE(review): assumes cache.get returns a falsy value on a miss - confirm
        # against CacheManager.
        cached = self.cache.get(cache_key)
        if cached:
            print("从缓存加载角色信息")
            return cached

        character_mentions = defaultdict(lambda: {
            'count': 0,
            'chunks': {},  # dict used as an insertion-ordered set of chunk ids
            'positions': [],
            'contexts': []
        })

        # Capitalised words that the name pattern matches but that are not characters.
        blacklist = {
            # Chapter / section markers
            'Chapter', 'CHAPTER', 'Part', 'PART', 'Section', 'SECTION', 
            'Book', 'BOOK', 'Volume', 'VOLUME',
            # Articles and conjunctions
            'The', 'THE', 'And', 'AND', 'But', 'BUT', 'Or', 'OR',
            # Question words ('Where' was missing from the original list)
            'When', 'WHEN', 'Where', 'WHERE', 'What', 'WHAT', 'Who', 'WHO',
            'Why', 'WHY', 'How', 'HOW',
            # Titles appearing on their own
            'Mr', 'Mrs', 'Ms', 'Miss', 'Dr', 'Professor', 'Sir', 'Madam',
            # Weekdays
            'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
            # Months
            'January', 'February', 'March', 'April', 'May', 'June', 'July', 
            'August', 'September', 'October', 'November', 'December',
            # Common place words
            'Street', 'Road', 'Avenue', 'Place', 'Square', 'Hall', 'House',
            'School', 'Academy', 'University', 'Castle', 'Forest', 'Mountain',
            # Other frequent non-names
            'Note', 'End', 'Beginning', 'Epilogue', 'Prologue', 'Appendix',
            'First', 'Second', 'Third', 'Last', 'Next', 'Previous',
        }

        # Accepted name length per language. The original applied an extra
        # language-independent "len(name) <= 2" filter afterwards, which threw
        # away the two-character Chinese names this range explicitly allows.
        min_len, max_len = (3, 30) if language == "en" else (2, 4)

        print("\n提取角色名称...")
        for chunk in tqdm(chunks):
            names = self.extract_names_pattern(chunk['text'], language)

            for name in names:
                if not (min_len <= len(name) <= max_len):
                    continue
                if name in blacklist:
                    continue
                # Defensive guards kept from the original: purely numeric
                # tokens and short all-caps tokens (likely abbreviations).
                if name.isdigit():
                    continue
                if name.isupper() and len(name) < 5:
                    continue

                entry = character_mentions[name]
                entry['count'] += 1
                entry['chunks'][chunk['chunk_id']] = None
                entry['positions'].append(chunk['start'])

        # Turn the ordered-set dicts into plain lists so the result can be serialised.
        for entry in character_mentions.values():
            entry['chunks'] = list(entry['chunks'])

        result = dict(character_mentions)
        self.cache.set(cache_key, result)

        return result
    
    def rank_characters(self, character_mentions: Dict) -> List[Tuple[str, Dict]]:
        """Order characters by prominence and keep the leading entries.

        The score of a character is its mention count plus twice the number
        of chunks it appears in. Ties keep the mapping's iteration order.

        Args:
            character_mentions: Mapping of name -> info dict providing
                ``'count'`` and ``'chunks'``.

        Returns:
            The ``Config.TOP_N_CHARACTERS`` highest-scoring ``(name, info)``
            pairs, best first.
        """

        def prominence(entry):
            details = entry[1]
            return details['count'] + len(details['chunks']) * 2

        ordered = sorted(character_mentions.items(), key=prominence, reverse=True)
        return ordered[:Config.TOP_N_CHARACTERS]
    
    def merge_similar_names(self, characters: List[Tuple[str, Dict]]) -> List[Tuple[str, Dict]]:
        """合并相似的名字(如 Harry 和 Harry Potter)
        
        Args:
            characters: 角色列表
            
        Returns:
            合并后的角色列表
        """
        
        merged = {}
        
        for name, info in characters:
            # 查找是否有包含关系
            found_parent = False
            for existing_name in list(merged.keys()):
                # 如果当前名字是已存在名字的一部分,或反之
                if name in existing_name or existing_name in name:
                    # 合并到更长的名字下
                    longer_name = name if len(name) > len(existing_name) else existing_name
                    shorter_name = existing_name if longer_name == name else name
                    
                    if longer_name not in merged:
                        merged[longer_name] = merged.pop(existing_name)
                    
                    # 合并信息
                    merged[longer_name]['count'] += info['count']
                    merged[longer_name]['chunks'].extend(info['chunks'])
                    merged[longer_name]['chunks'] = list(set(merged[longer_name]['chunks']))
                    
                    found_parent = True
                    break
            
            if not found_parent:
                merged[name] = info
        
        return list(merged.items())
    
    def refine_with_llm(self, text_sample: str, candidate_names: List[str]) -> List[str]:
        """使用LLM进一步确认和提取主要角色
        
        Args:
            text_sample: 文本样本
            candidate_names: 候选角色名列表
            
        Returns:
            确认的角色名列表
        """
        
        prompt = f"""
以下是一部小说的开头片段。请识别出主要角色的名字。

候选名字列表:{', '.join(candidate_names[:30])}

文本片段:
{text_sample[:2000]}

请返回确认的主要角色名字,每行一个。只返回真正的角色名,排除:
1. 地名、物品名
2. 章节标记(如 "Chapter")
3. 称谓词(如 "Mr", "Professor")
4. 普通名词

只返回人名,每个名字一行。
"""
        
        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "你是一个文学分析专家,擅长识别小说角色。请只返回真正的人物角色名字。"},
                    {"role": "user", "content": prompt}
                ]
            )
            
            result = response.choices[0].message.content.strip()
            confirmed_names = [name.strip() for name in result.split('\n') if name.strip()]
            
            # 再次过滤明显不是人名的词
            final_names = []
            for name in confirmed_names:
                # 跳过太短的
                if len(name) < 3:
                    continue
                # 跳过包含数字的
                if any(c.isdigit() for c in name):
                    continue
                # 跳过全大写的短词
                if name.isupper() and len(name) < 5:
                    continue
                final_names.append(name)
            
            return final_names
        
        except Exception as e:
            print(f"LLM精炼失败: {e}")
            # 返回前10个,但过滤一些明显的错误
            filtered = []
            for name in candidate_names[:20]:
                if len(name) >= 3 and not name.isdigit() and name not in ['Chapter', 'Book', 'Part']:
                    filtered.append(name)
            return filtered[:10]
    
    def extract_main_characters(self, chunks: List[Dict], 
                               text_sample: str = None,
                               language: str = "en") -> List[Dict]:
        """Run the full character-extraction pipeline and report the result.

        Steps: extract mentions, drop low-frequency names (retrying once with
        a lower threshold if nothing is left), rank, merge similar names,
        optionally refine with the LLM, remove obvious non-names, then print
        a summary.

        Args:
            chunks: Chunk dicts passed on to ``extract_characters_from_chunks``.
            text_sample: Excerpt for LLM refinement; refinement only runs when
                this is non-empty and more than 15 characters remain after
                merging.
            language: Language code passed on to the extraction step.

        Returns:
            List of ``{'name': ..., 'info': ...}`` dicts in ranked order.
        """
        banner = "="*60

        def as_records(pairs):
            # (name, info) pairs -> the record dicts returned to the caller.
            return [{'name': n, 'info': i} for n, i in pairs]

        print("\n" + banner)
        print("开始角色提取流程")
        print(banner)

        # Step 1: gather every candidate name with its statistics.
        mentions = self.extract_characters_from_chunks(chunks, language)
        print(f"\n初步提取到 {len(mentions)} 个名字")

        # Step 2: keep names mentioned often enough.
        min_mentions = Config.MIN_CHARACTER_MENTIONS
        frequent = {n: i for n, i in mentions.items() if i['count'] >= min_mentions}
        print(f"过滤后剩余 {len(frequent)} 个角色(出现次数≥{Config.MIN_CHARACTER_MENTIONS})")

        if not frequent:
            print("⚠️  没有找到符合条件的角色,降低阈值...")
            # Retry once with half the threshold, but never below 3.
            relaxed = max(3, Config.MIN_CHARACTER_MENTIONS // 2)
            frequent = {n: i for n, i in mentions.items() if i['count'] >= relaxed}
            print(f"降低阈值后找到 {len(frequent)} 个角色")

        # Step 3: rank.
        ranked = self.rank_characters(frequent)
        print(f"排序后前 {len(ranked)} 个角色")

        # Step 4: merge name variants.
        merged = self.merge_similar_names(ranked)
        print(f"合并相似名字后:{len(merged)} 个角色")

        # Step 5: optional LLM refinement; every failure path falls back to
        # the merged list.
        if not (text_sample and len(merged) > 15):
            final_characters = as_records(merged)
        else:
            print("\n使用 LLM 精炼角色列表...")
            candidates = [pair[0] for pair in merged[:30]]
            try:
                confirmed = self.refine_with_llm(text_sample, candidates)

                if not confirmed:
                    print("LLM未返回结果,使用原始列表")
                    final_characters = as_records(merged)
                else:
                    # Keep entries whose name contains, or is contained in, a
                    # confirmed name.
                    final_characters = [
                        {'name': n, 'info': i}
                        for n, i in merged
                        if any(c in n or n in c for c in confirmed)
                    ]
                    if final_characters:
                        print(f"LLM确认后:{len(final_characters)} 个主要角色")
                    else:
                        print("LLM确认后没有角色,使用原始列表")
                        final_characters = as_records(merged)
            except Exception as e:
                print(f"LLM精炼出错: {e},使用原始列表")
                final_characters = as_records(merged)

        # Step 6: drop entries that are plainly not person names.
        exact_noise = ['Chapter', 'Book', 'Part', 'Section', 'The', 'And', 'But', 'When', 'Where']
        lowercase_noise = ['chapter', 'book', 'part', 'section']
        kept = [
            record for record in final_characters
            if record['name'] not in exact_noise
            and record['name'].lower() not in lowercase_noise
        ]

        if len(kept) < len(final_characters):
            print(f"最终清理后:{len(kept)} 个角色")
            final_characters = kept

        # Summary of the top 15 entries.
        print("\n" + banner)
        print("主要角色列表(按重要性排序)")
        print(banner)
        for rank, record in enumerate(final_characters[:15], 1):
            print(f"{rank}. {record['name']:<20} - 出现{record['info']['count']}次, "
                  f"分布在{len(record['info']['chunks'])}个章节")

        if not final_characters:
            print("\n⚠️  警告:未找到任何角色!")
            print("建议:")
            print("  1. 检查文本格式是否正确")
            print("  2. 确认文本语言设置")
            print("  3. 尝试降低 MIN_CHARACTER_MENTIONS 参数")

        return final_characters