File size: 6,595 Bytes
a226682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
from typing import Dict, List
from tqdm import tqdm
from config import Config
from utils.cache_manager import CacheManager
from core.openai_client import OpenAIClient

class CharacterAnalyzer:
    """Character personality analyzer, tuned for large volumes of text.

    Samples the text chunks in which a character appears, asks the chat model
    named by ``Config.MODEL_NAME`` for a structured personality profile, and
    stores the result through ``CacheManager``.
    """

    # Maximum number of characters of chunk text embedded in a single prompt.
    _MAX_PROMPT_CHARS = 8000

    # Profile fields that hold a list of strings.
    _LIST_FIELDS = ('core_traits', 'key_quotes')

    # Profile fields that hold a single string.
    _TEXT_FIELDS = (
        'speaking_style',
        'behavior_patterns',
        'values',
        'emotional_style',
        'relationship_style',
        'background',
        'personality_summary',
    )

    def __init__(self):
        """Set up the client from ``OpenAIClient.get_client()`` and a cache manager."""
        self.client = OpenAIClient.get_client()
        self.cache = CacheManager()

    def select_representative_chunks(self, chunks: List[Dict],
                                     character_chunks: List[int],
                                     max_chunks: int = None) -> List[Dict]:
        """Pick an evenly spread sample of the chunks that mention a character.

        Args:
            chunks: All text chunks, addressed by position.
            character_chunks: Positions in ``chunks`` where the character
                appears.
            max_chunks: Upper bound on the sample size. A falsy value
                (``None`` or ``0``) falls back to
                ``Config.MAX_ANALYSIS_CHUNKS``.

        Returns:
            The selected chunk dicts, in ``character_chunks`` order. Positions
            that fall outside ``chunks`` are skipped.
        """
        max_chunks = max_chunks or Config.MAX_ANALYSIS_CHUNKS
        total = len(character_chunks)

        if total <= max_chunks:
            selected_ids = character_chunks
        else:
            # Scale every sample slot across the full list. A fixed floor
            # step (total // max_chunks) would only cover the head of the
            # list whenever total is not a multiple of max_chunks.
            selected_ids = [
                character_chunks[slot * total // max_chunks]
                for slot in range(max_chunks)
            ]

        # Reject negative positions too; they would silently wrap around.
        return [chunks[i] for i in selected_ids if 0 <= i < len(chunks)]

    def analyze_character_batch(self, character_name: str,
                                text_chunks: List[Dict]) -> Dict:
        """Analyze a character's personality from a batch of text chunks.

        Args:
            character_name: Name of the character to analyze.
            text_chunks: Chunk dicts; each must provide ``'chunk_id'`` and
                ``'text'``.

        Returns:
            The profile dict. It comes from the cache when a truthy entry
            exists for this character and chunk-id list, otherwise from the
            model reply (with the reply stored under ``'raw_analysis'``).
            If the request or the parsing raises, the default profile is
            returned and nothing is cached.
        """
        import hashlib
        import json
        import re

        # Use a content digest rather than the built-in hash(): str hashes
        # are salted per process, so hash()-based keys change between runs.
        chunk_ids = str([c['chunk_id'] for c in text_chunks])
        digest = hashlib.sha256(chunk_ids.encode('utf-8')).hexdigest()[:16]
        cache_key = f"analysis_{character_name}_{digest}"

        cached = self.cache.get(cache_key)
        if cached:
            print(f"从缓存加载 {character_name} 的分析结果")
            return cached

        combined_text = "\n\n---\n\n".join(c['text'] for c in text_chunks)
        # Cap the excerpt here, outside the template, so no code comment
        # leaks into the text sent to the model.
        excerpt = combined_text[:self._MAX_PROMPT_CHARS]

        analysis_prompt = f"""
        请深度分析小说中"{character_name}"这个角色的性格特征。
        
        基于以下文本片段进行分析:
        
        {excerpt}
        
        请从以下维度分析,并以JSON格式返回:
        
        {{
            "name": "{character_name}",
            "core_traits": ["特质1", "特质2", "特质3"],
            "speaking_style": "说话风格描述",
            "behavior_patterns": "行为模式描述",
            "values": "核心价值观",
            "emotional_style": "情感表达方式",
            "relationship_style": "人际关系风格",
            "background": "背景信息",
            "key_quotes": ["典型语句1", "典型语句2"],
            "personality_summary": "性格总结(100字以内)"
        }}
        
        注意:
        1. 只基于文本内容分析,不要添加原著之外的信息
        2. 提取该角色的典型对话和行为
        3. 关注语言风格、用词习惯、口头禅等
        """

        try:
            response = self.client.chat.completions.create(
                model=Config.MODEL_NAME,
                messages=[
                    {"role": "system", "content": "你是一个专业的文学角色分析专家。请基于文本内容进行深入分析。"},
                    {"role": "user", "content": analysis_prompt}
                ]
            )

            analysis_text = response.choices[0].message.content.strip()

            # Prefer the outermost {...} span parsed as JSON. If there is
            # none, or it is not valid JSON, fall back to the regex parser
            # instead of discarding the whole reply.
            profile = None
            json_match = re.search(r'\{.*\}', analysis_text, re.DOTALL)
            if json_match:
                try:
                    profile = json.loads(json_match.group())
                except json.JSONDecodeError:
                    profile = None
            if not isinstance(profile, dict):
                profile = self._parse_text_analysis(analysis_text, character_name)

            profile['raw_analysis'] = analysis_text

            self.cache.set(cache_key, profile)

            return profile

        except Exception as e:
            # Best-effort boundary: any failure yields the default profile.
            print(f"分析失败: {e}")
            return self._default_profile(character_name)

    def _parse_text_analysis(self, text: str, character_name: str) -> Dict:
        """Extract profile fields from a reply that is not valid JSON.

        Each field is searched for as ``key: [..]`` (list fields) or
        ``key: "..."`` (text fields), with optional quotes around the key
        and case-insensitive matching. Fields that are not found keep their
        empty default.

        Args:
            text: Raw model reply.
            character_name: Stored under ``'name'``.

        Returns:
            A profile dict containing every profile key.
        """
        import re

        profile = {
            'name': character_name,
            'core_traits': [],
            'speaking_style': '',
            'behavior_patterns': '',
            'values': '',
            'emotional_style': '',
            'relationship_style': '',
            'background': '',
            'key_quotes': [],
            'personality_summary': ''
        }

        flags = re.DOTALL | re.IGNORECASE

        for key in self._LIST_FIELDS:
            match = re.search(key + r'["\']?\s*:\s*\[(.*?)\]', text, flags)
            if match:
                # Each quoted item inside the brackets becomes one entry.
                profile[key] = re.findall(r'["\']([^"\']+)["\']', match.group(1))

        for key in self._TEXT_FIELDS:
            match = re.search(key + r'["\']?\s*:\s*["\']([^"\']+)["\']', text, flags)
            if match:
                profile[key] = match.group(1)

        return profile

    def _default_profile(self, character_name: str) -> Dict:
        """Return the placeholder profile used when analysis fails.

        Args:
            character_name: Stored under ``'name'`` and used in the summary.
        """
        return {
            'name': character_name,
            'core_traits': ['复杂', '多面'],
            'speaking_style': '根据情境变化',
            'behavior_patterns': '待观察',
            'values': '待分析',
            'emotional_style': '情感丰富',
            'relationship_style': '因人而异',
            'background': '小说角色',
            'key_quotes': [],
            'personality_summary': f'{character_name}是一个复杂的角色',
            'raw_analysis': '使用默认配置'
        }

    def enhance_profile_with_examples(self, profile: Dict, chunks: List[Dict],
                                      character_chunks: List[int]) -> Dict:
        """Attach sample dialogue lines from the text to a profile.

        Looks at the first five positions in ``character_chunks``, takes up
        to three quoted spans of 10 to 100 characters from each chunk, and
        stores at most five of them under ``'example_dialogues'``. The key is
        left unset when nothing is found.

        Args:
            profile: Profile dict; it is modified in place.
            chunks: All text chunks, addressed by position.
            character_chunks: Positions in ``chunks`` where the character
                appears.

        Returns:
            The same ``profile`` object.
        """
        import re

        # ASCII quotes as before, plus the quotation marks used in Chinese
        # prose, which the ASCII-only pattern never matched.
        quote_pattern = re.compile(
            r'["\']([^"\']{10,100})["\']'
            r'|[“「『]([^“”「」『』]{10,100})[”」』]'
        )

        dialogues = []
        for chunk_id in character_chunks[:5]:
            if not 0 <= chunk_id < len(chunks):
                continue
            chunk_text = chunks[chunk_id]['text']
            quotes = [
                m.group(1) or m.group(2)
                for m in quote_pattern.finditer(chunk_text)
            ]
            dialogues.extend(quotes[:3])

        if dialogues:
            profile['example_dialogues'] = dialogues[:5]

        return profile