# File size: 6,595 Bytes
#
# a226682

from typing import Dict, List
from tqdm import tqdm
from config import Config
from utils.cache_manager import CacheManager
from core.openai_client import OpenAIClient

class CharacterAnalyzer:
    """Character personality analyzer, tuned for large amounts of text.

    The analyzer picks a subset of the text chunks that mention a character,
    asks the chat model for a JSON personality profile, and stores the result
    through ``CacheManager`` so the same request is not sent twice.
    """

    # Maximum number of characters of combined chunk text embedded in the prompt.
    MAX_PROMPT_CHARS = 8000

    def __init__(self):
        self.client = OpenAIClient.get_client()
        self.cache = CacheManager()

    def select_representative_chunks(self, chunks: List[Dict],
                                    character_chunks: List[int],
                                    max_chunks: int = None) -> List[Dict]:
        """Select an evenly spread sample of the chunks that mention a character.

        Args:
            chunks: All text chunks; ``character_chunks`` holds indices into it.
            character_chunks: Indices of the chunks in which the character appears.
            max_chunks: Upper bound on the number of chunks returned. A falsy
                value (``None`` or ``0``) falls back to
                ``Config.MAX_ANALYSIS_CHUNKS``.

        Returns:
            The selected chunk dicts, in the order of ``character_chunks``.
            Indices outside ``chunks`` are skipped.
        """
        max_chunks = max_chunks or Config.MAX_ANALYSIS_CHUNKS
        if max_chunks <= 0:
            # Nothing may be selected; also avoids dividing by zero below.
            return []

        total = len(character_chunks)
        if total <= max_chunks:
            selected_ids = character_chunks
        else:
            # Scale each position across the whole list. A floored step
            # (total // max_chunks) would bunch the picks at the start whenever
            # total is not a multiple of max_chunks.
            selected_ids = [
                character_chunks[i * total // max_chunks]
                for i in range(max_chunks)
            ]

        # Reject negative indices too: they would silently wrap to the end.
        return [chunks[i] for i in selected_ids if 0 <= i < len(chunks)]

    @staticmethod
    def _build_cache_key(character_name: str, text_chunks: List[Dict]) -> str:
        """Build a cache key that is stable across interpreter runs."""
        import hashlib

        chunk_ids = [c['chunk_id'] for c in text_chunks]
        # The built-in hash() of a str is salted per process, so a key derived
        # from it could never be found again by a later run.
        digest = hashlib.sha256(str(chunk_ids).encode('utf-8')).hexdigest()
        return f"analysis_{character_name}_{digest}"

    def _extract_profile(self, analysis_text: str, character_name: str) -> Dict:
        """Turn the model reply into a profile dict, tolerating malformed JSON."""
        import json
        import re

        json_match = re.search(r'\{.*\}', analysis_text, re.DOTALL)
        if json_match:
            try:
                parsed = json.loads(json_match.group())
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed

        # No usable JSON object: salvage what the regex-based parser can find.
        return self._parse_text_analysis(analysis_text, character_name)

    def analyze_character_batch(self, character_name: str,
                                text_chunks: List[Dict]) -> Dict:
        """Analyze a character's personality from a batch of text chunks.

        Args:
            character_name: Name of the character to analyze.
            text_chunks: Chunk dicts; each must provide ``'chunk_id'`` and
                ``'text'``.

        Returns:
            The profile dict. It comes from the cache when a truthy entry
            exists for the same character and chunk ids; otherwise it is built
            from the model reply (with the reply stored under
            ``'raw_analysis'``) and cached. If the request or its handling
            raises, the error is printed and the default profile is returned
            without being cached.
        """
        # Check the cache first.
        cache_key = self._build_cache_key(character_name, text_chunks)
        cached = self.cache.get(cache_key)
        if cached:
            print(f"从缓存加载 {character_name} 的分析结果")
            return cached

        # Merge the chunks and cap the amount of text placed in the prompt.
        combined_text = "\n\n---\n\n".join(c['text'] for c in text_chunks)
        excerpt = combined_text[:self.MAX_PROMPT_CHARS]

        analysis_prompt = f"""
        请深度分析小说中"{character_name}"这个角色的性格特征。
        
        基于以下文本片段进行分析：
        
        {excerpt}
        
        请从以下维度分析，并以JSON格式返回：
        
        {{
            "name": "{character_name}",
            "core_traits": ["特质1", "特质2", "特质3"],
            "speaking_style": "说话风格描述",
            "behavior_patterns": "行为模式描述",
            "values": "核心价值观",
            "emotional_style": "情感表达方式",
            "relationship_style": "人际关系风格",
            "background": "背景信息",
            "key_quotes": ["典型语句1", "典型语句2"],
            "personality_summary": "性格总结（100字以内）"
        }}
        
        注意：
        1. 只基于文本内容分析，不要添加原著之外的信息
        2. 提取该角色的典型对话和行为
        3. 关注语言风格、用词习惯、口头禅等
        """

        try:
            response = self.client.chat.completions.create(
                model=Config.MODEL_NAME,
                messages=[
                    {"role": "system", "content": "你是一个专业的文学角色分析专家。请基于文本内容进行深入分析。"},
                    {"role": "user", "content": analysis_prompt}
                ]
            )

            analysis_text = response.choices[0].message.content.strip()

            profile = self._extract_profile(analysis_text, character_name)
            profile['raw_analysis'] = analysis_text

            # Cache the result.
            self.cache.set(cache_key, profile)

            return profile

        except Exception as e:
            # Best-effort boundary: any failure degrades to the default profile.
            print(f"分析失败: {e}")
            return self._default_profile(character_name)

    def _parse_text_analysis(self, text: str, character_name: str) -> Dict:
        """Parse an analysis reply that is not valid JSON.

        Only ``core_traits``, ``speaking_style`` and ``key_quotes`` are
        extracted; every other field keeps its empty default.
        """
        import re

        profile = {
            'name': character_name,
            'core_traits': [],
            'speaking_style': '',
            'behavior_patterns': '',
            'values': '',
            'emotional_style': '',
            'relationship_style': '',
            'background': '',
            'key_quotes': [],
            'personality_summary': ''
        }

        # One pattern per field that can be recovered.
        patterns = {
            'core_traits': r'core_traits["\']?\s*:\s*\[(.*?)\]',
            'speaking_style': r'speaking_style["\']?\s*:\s*["\']([^"\']+)["\']',
            'key_quotes': r'key_quotes["\']?\s*:\s*\[(.*?)\]',
        }
        list_fields = ('core_traits', 'key_quotes')

        for key, pattern in patterns.items():
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if not match:
                continue
            content = match.group(1)
            if key in list_fields:
                # List fields: collect every quoted item inside the brackets.
                profile[key] = re.findall(r'["\']([^"\']+)["\']', content)
            else:
                profile[key] = content

        return profile

    def _default_profile(self, character_name: str) -> Dict:
        """Return the placeholder profile used when the analysis fails."""
        return {
            'name': character_name,
            'core_traits': ['复杂', '多面'],
            'speaking_style': '根据情境变化',
            'behavior_patterns': '待观察',
            'values': '待分析',
            'emotional_style': '情感丰富',
            'relationship_style': '因人而异',
            'background': '小说角色',
            'key_quotes': [],
            'personality_summary': f'{character_name}是一个复杂的角色',
            'raw_analysis': '使用默认配置'
        }

    def enhance_profile_with_examples(self, profile: Dict, chunks: List[Dict],
                                     character_chunks: List[int]) -> Dict:
        """Add example dialogues taken from the character's chunks to a profile.

        Args:
            profile: Profile dict; it is modified in place.
            chunks: All text chunks; each must provide ``'text'``.
            character_chunks: Indices into ``chunks`` for this character. Only
                the first five are inspected.

        Returns:
            The same ``profile`` object, with ``'example_dialogues'`` (at most
            five entries) set when at least one quote was found.
        """
        import re

        # NOTE(review): only ASCII quote characters are matched, so text that
        # uses other quotation marks (e.g. “ ” or 「 」) yields no examples —
        # confirm whether that is intended.
        quote_pattern = re.compile(r'["\']([^"\']{10,100})["\']')

        dialogues = []
        for chunk_id in character_chunks[:5]:  # only look at the first few chunks
            # Reject negative indices too: they would silently wrap to the end.
            if 0 <= chunk_id < len(chunks):
                quotes = quote_pattern.findall(chunks[chunk_id]['text'])
                dialogues.extend(quotes[:3])

        if dialogues:
            profile['example_dialogues'] = dialogues[:5]

        return profile