Upload folder using huggingface_hub
Browse files- .env.example +29 -0
- __pycache__/config.cpython-310.pyc +0 -0
- cache/analysis_Thomas_-1235069295746715801.pkl +3 -0
- cache/analysis_Thomas_2921407663095156522.pkl +3 -0
- cache/analysis_哈利波特_1748832246067442544.pkl +3 -0
- cache/characters_-3712775294649564773.pkl +3 -0
- cache/characters_250361385978348193.pkl +3 -0
- cache/characters_4782512100612132426.pkl +3 -0
- cache/characters_6926022490162963799.pkl +3 -0
- cache/characters_7412048294772991388.pkl +3 -0
- cache/characters_8212446298422427823.pkl +3 -0
- config.py +35 -0
- core/__pycache__/character_agent.cpython-310.pyc +0 -0
- core/__pycache__/character_analyzer.cpython-310.pyc +0 -0
- core/__pycache__/character_extractor.cpython-310.pyc +0 -0
- core/__pycache__/memory_manager.cpython-310.pyc +0 -0
- core/__pycache__/openai_client.cpython-310.pyc +0 -0
- core/__pycache__/text_processor.cpython-310.pyc +0 -0
- core/character_agent.py +211 -0
- core/character_analyzer.py +175 -0
- core/character_extractor.py +356 -0
- core/memory_manager.py +168 -0
- core/openai_client.py +30 -0
- core/text_processor.py +429 -0
- main.py +367 -0
- sample_novels/harry_potter_sample.txt +0 -0
- test_agent.py +431 -0
- tools/__init__.py +1 -0
- tools/batch_analyze.py +163 -0
- tools/clear_cache.py +66 -0
- utils/__init__.py +4 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/cache_manager.cpython-310.pyc +0 -0
- utils/__pycache__/text_utils.cpython-310.pyc +0 -0
- utils/cache_manager.py +147 -0
- utils/text_utils.py +139 -0
.env.example
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenAI API 配置
|
| 2 |
+
OPENAI_API_KEY=xxx
|
| 3 |
+
OPENAI_BASE_URL=https://matrixllm.alipay.com/v1
|
| 4 |
+
|
| 5 |
+
# 模型配置
|
| 6 |
+
MODEL_NAME=gpt-4
|
| 7 |
+
EMBEDDING_MODEL=text-embedding-ada-002
|
| 8 |
+
|
| 9 |
+
# 文本处理配置
|
| 10 |
+
MAX_CHUNK_SIZE=2000
|
| 11 |
+
CHUNK_OVERLAP=200
|
| 12 |
+
|
| 13 |
+
# 角色提取配置
|
| 14 |
+
MIN_CHARACTER_MENTIONS=10
|
| 15 |
+
TOP_N_CHARACTERS=20
|
| 16 |
+
|
| 17 |
+
# 角色分析配置
|
| 18 |
+
MAX_ANALYSIS_CHUNKS=10
|
| 19 |
+
ANALYSIS_TEMPERATURE=0.3
|
| 20 |
+
|
| 21 |
+
# 对话配置
|
| 22 |
+
MAX_HISTORY=10
|
| 23 |
+
CONVERSATION_TEMPERATURE=0.8
|
| 24 |
+
MAX_MEMORY_RETRIEVAL=5
|
| 25 |
+
|
| 26 |
+
# 缓存配置
|
| 27 |
+
ENABLE_CACHE=True
|
| 28 |
+
CACHE_DIR=cache
|
| 29 |
+
VECTOR_DB_PATH=cache/vectordb
|
__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (961 Bytes). View file
|
|
|
cache/analysis_Thomas_-1235069295746715801.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a54828c9e1e0079587330b82f65bed3602afaef2fedfcdf519fc5ab86b4d997
|
| 3 |
+
size 9919
|
cache/analysis_Thomas_2921407663095156522.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02ded18a8adeda53d5c030429ce091c23fed2f07ba868a2a52476fff825fb686
|
| 3 |
+
size 9837
|
cache/analysis_哈利波特_1748832246067442544.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0e1a27fd10a4dae2e0a707de5d376eb4fdcc7f6d5cd866399457af83a9396ce
|
| 3 |
+
size 13044
|
cache/characters_-3712775294649564773.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d00c966ed5d03784c446d231aabe64ace3b45ad035904f8b2269aed97ca79572
|
| 3 |
+
size 79316
|
cache/characters_250361385978348193.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82043c4997ac79a9a8e934b828a07006d63d542a80b17977ff93b24545c646a9
|
| 3 |
+
size 3256
|
cache/characters_4782512100612132426.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82043c4997ac79a9a8e934b828a07006d63d542a80b17977ff93b24545c646a9
|
| 3 |
+
size 3256
|
cache/characters_6926022490162963799.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8629e56fd1389e9e345ada3802117d9dfe757da87e18b0e423e7a5de68dd79a3
|
| 3 |
+
size 117980
|
cache/characters_7412048294772991388.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82043c4997ac79a9a8e934b828a07006d63d542a80b17977ff93b24545c646a9
|
| 3 |
+
size 3256
|
cache/characters_8212446298422427823.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bddb7ca207a343482bd772d79aa1f28f82b37f0c323bda1f59913ab7292983ce
|
| 3 |
+
size 3594
|
config.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
load_dotenv()
|
| 5 |
+
|
| 6 |
+
class Config:
    """Central application configuration.

    Every setting documented in ``.env.example`` is honored here: each
    attribute reads its environment variable (loaded via ``load_dotenv()``
    at module import) and falls back to the default shown below.
    Previously only the API settings were env-driven and everything else
    was hard-coded, so editing ``.env`` had no effect on those values.
    """

    # --- API configuration ---
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "xxx")
    OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://matrixllm.alipay.com/v1")
    MODEL_NAME = os.getenv("MODEL_NAME", "gpt-5.1")
    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-ada-002")

    # --- Text processing ---
    MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "2000"))  # max characters per text chunk
    CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))     # overlap between adjacent chunks

    # --- Character extraction ---
    MIN_CHARACTER_MENTIONS = int(os.getenv("MIN_CHARACTER_MENTIONS", "10"))  # min appearances
    TOP_N_CHARACTERS = int(os.getenv("TOP_N_CHARACTERS", "20"))              # keep top-N characters

    # --- Character analysis ---
    MAX_ANALYSIS_CHUNKS = int(os.getenv("MAX_ANALYSIS_CHUNKS", "10"))  # chunks analyzed per character
    ANALYSIS_TEMPERATURE = float(os.getenv("ANALYSIS_TEMPERATURE", "0.3"))

    # --- Conversation ---
    MAX_HISTORY = int(os.getenv("MAX_HISTORY", "10"))
    CONVERSATION_TEMPERATURE = float(os.getenv("CONVERSATION_TEMPERATURE", "0.8"))
    MAX_MEMORY_RETRIEVAL = int(os.getenv("MAX_MEMORY_RETRIEVAL", "5"))  # max memory snippets retrieved

    # --- Caching ---
    CACHE_DIR = os.getenv("CACHE_DIR", "cache")
    # Env values are strings: bool("False") would be True, so parse explicitly.
    ENABLE_CACHE = os.getenv("ENABLE_CACHE", "True").strip().lower() in ("1", "true", "yes")

    # --- Vector database ---
    VECTOR_DB_PATH = os.getenv("VECTOR_DB_PATH", "cache/vectordb")
|
core/__pycache__/character_agent.cpython-310.pyc
ADDED
|
Binary file (6.8 kB). View file
|
|
|
core/__pycache__/character_analyzer.cpython-310.pyc
ADDED
|
Binary file (5.88 kB). View file
|
|
|
core/__pycache__/character_extractor.cpython-310.pyc
ADDED
|
Binary file (11.2 kB). View file
|
|
|
core/__pycache__/memory_manager.cpython-310.pyc
ADDED
|
Binary file (4.54 kB). View file
|
|
|
core/__pycache__/openai_client.cpython-310.pyc
ADDED
|
Binary file (1.25 kB). View file
|
|
|
core/__pycache__/text_processor.cpython-310.pyc
ADDED
|
Binary file (9.39 kB). View file
|
|
|
core/character_agent.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict, Optional
|
| 2 |
+
from config import Config
|
| 3 |
+
from core.memory_manager import MemoryManager
|
| 4 |
+
from core.openai_client import OpenAIClient
|
| 5 |
+
|
| 6 |
+
class CharacterAgent:
    """Personality-driven conversational agent with long-text memory support.

    Wraps an OpenAI chat client with a system prompt built from a character
    profile, optionally augmented per-turn with snippets retrieved from a
    :class:`MemoryManager` over the source novel's chunks.
    """

    def __init__(self, character_profile: Dict,
                 chunks: List[Dict] = None,
                 character_chunks: List[int] = None):
        """Create an agent for one character.

        Args:
            character_profile: Analysis dict (must contain at least ``name``;
                other keys such as ``core_traits`` / ``speaking_style`` are
                read with defaults).
            chunks: All novel text chunks; enables memory when given.
            character_chunks: Indices of chunks mentioning this character.
        """
        self.character_profile = character_profile
        self.client = OpenAIClient.get_client()

        self.conversation_history: List[Dict] = []
        self.system_prompt = self._build_system_prompt()

        # Memory is optional: only initialized when both the chunk corpus and
        # the character's chunk indices are supplied.
        self.memory_manager = None
        if chunks and character_chunks:
            self.memory_manager = MemoryManager(character_profile['name'])
            self.memory_manager.add_text_chunks(chunks, character_chunks)
            print(f"已为 {character_profile['name']} 初始化记忆系统")

    def _build_system_prompt(self) -> str:
        """Build the role-play system prompt from the character analysis."""
        profile = self.character_profile

        # Join the trait list into one Chinese-style enumerated string.
        traits_desc = "、".join(profile.get('core_traits', []))

        prompt = f"""
你现在要完全沉浸式地扮演小说中的角色"{profile['name']}"。这不是角色扮演游戏,你就是{profile['name']}本人。

## 角色身份
你是 {profile['name']},{profile.get('background', '一个小说角色')}。

## 性格特征
{profile.get('personality_summary', '')}

核心性格特质:{traits_desc}

## 说话风格
{profile.get('speaking_style', '自然真实的对话风格')}

## 行为模式
{profile.get('behavior_patterns', '根据性格行事')}

## 价值观与信念
{profile.get('values', '')}

## 情感表达
{profile.get('emotional_style', '')}

## 人际关系风格
{profile.get('relationship_style', '')}

## 典型语句示例
{chr(10).join('- "' + q + '"' for q in profile.get('key_quotes', [])[:5])}

## 对话原则
1. **完全成为角色**:用第一人称,以{profile['name']}的身份、记忆、情感回应
2. **保持一致性**:所有回应必须符合上述性格特征和说话风格
3. **自然真实**:像真人一样对话,有情绪起伏,可以开玩笑、生气、困惑
4. **不跳戏**:绝不说"作为AI"、"我是语言模型"等破坏沉浸感的话
5. **合理应对**:如果问到角色不知道的事,以角色的方式回应(如"我不太清楚"、"这个我没听说过")
6. **保持神秘**:不要过度解释角色设定,自然地展现性格
7. **情境感知**:根据对话氛围调整语气和态度

记住:你不是在"扮演"或"模仿"{profile['name']},你就是{profile['name']}。

现在,以{profile['name']}的身份开始对话。
"""
        return prompt

    def _retrieve_relevant_memory(self, user_message: str) -> str:
        """Retrieve source-text context relevant to the user's message.

        Returns an empty string when memory is disabled or nothing matches.
        """
        if not self.memory_manager:
            return ""

        relevant_contexts = self.memory_manager.search_relevant_context(
            user_message,
            n_results=3
        )

        if relevant_contexts:
            # Only the top 2 snippets, truncated, to keep the prompt small.
            memory_text = "\n\n".join(relevant_contexts[:2])
            return f"\n[相关记忆片段:\n{memory_text[:1000]}]\n"
        return ""

    def chat(self, user_message: str, use_memory: bool = True) -> str:
        """Send one user turn and return the in-character reply.

        Args:
            user_message: The user's message for this turn.
            use_memory: When True and memory is available, retrieved novel
                snippets are appended to the system prompt for this turn.

        Returns:
            The assistant's reply, or an in-character fallback line on error.
        """
        messages = [
            {"role": "system", "content": self.system_prompt}
        ]

        # Augment the system prompt with retrieved memory for this turn only.
        if use_memory and self.memory_manager:
            memory_context = self._retrieve_relevant_memory(user_message)
            if memory_context:
                memory_prompt = f"""
{self.system_prompt}

## 相关记忆
以下是与当前对话相关的原著片段,帮助你回忆:
{memory_context}

请基于这些记忆和你的角色性格来回应。
"""
                messages[0] = {"role": "system", "content": memory_prompt}

        # Only the most recent messages are replayed to bound prompt size.
        recent_history = self.conversation_history[-Config.MAX_HISTORY:]
        messages.extend(recent_history)

        messages.append({
            "role": "user",
            "content": user_message
        })

        try:
            # BUGFIX: CONVERSATION_TEMPERATURE was configured but never
            # passed, so the model always ran at its default temperature.
            response = self.client.chat.completions.create(
                model=Config.MODEL_NAME,
                messages=messages,
                temperature=Config.CONVERSATION_TEMPERATURE
            )

            assistant_message = response.choices[0].message.content.strip()

            # Persist both sides of the exchange for future turns.
            self.conversation_history.append({
                "role": "user",
                "content": user_message
            })
            self.conversation_history.append({
                "role": "assistant",
                "content": assistant_message
            })

            return assistant_message

        except Exception as e:
            # Best-effort: log and stay in character rather than crash the UI.
            error_msg = f"对话出错: {e}"
            print(error_msg)
            return f"*{self.character_profile['name']}沉默了片刻,似乎在思考着什么...*"

    def reset_conversation(self):
        """Clear the conversation history."""
        self.conversation_history = []
        print(f"\n[对话已重置,{self.character_profile['name']}忘记了之前的谈话内容]\n")

    def get_character_info(self) -> str:
        """Return a human-readable summary card of the character profile."""
        profile = self.character_profile

        info = f"""
{'='*70}
角色档案:{profile['name']}
{'='*70}

【性格特质】
{' • '.join(profile.get('core_traits', []))}

【性格总结】
{profile.get('personality_summary', 'N/A')}

【说话风格】
{profile.get('speaking_style', 'N/A')}

【核心价值观】
{profile.get('values', 'N/A')}

【典型语句】
"""
        for i, quote in enumerate(profile.get('key_quotes', [])[:3], 1):
            info += f'{i}. "{quote}"\n'

        info += f"\n{'='*70}\n"

        return info

    def save_conversation(self, filepath: str):
        """Serialize the conversation history to a UTF-8 JSON file."""
        import json

        data = {
            'character': self.character_profile['name'],
            'history': self.conversation_history
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"\n对话已保存到: {filepath}")

    def load_conversation(self, filepath: str):
        """Load a conversation history previously written by save_conversation."""
        import json

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            self.conversation_history = data['history']
            print(f"\n已加载 {len(self.conversation_history)} 条对话记录")
        except Exception as e:
            # Best-effort load: report and keep the current history untouched.
            print(f"加载对话失败: {e}")
|
core/character_analyzer.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
+
from config import Config
|
| 4 |
+
from utils.cache_manager import CacheManager
|
| 5 |
+
from core.openai_client import OpenAIClient
|
| 6 |
+
|
| 7 |
+
class CharacterAnalyzer:
    """Character personality analyzer, optimized for large-scale text."""

    def __init__(self):
        self.client = OpenAIClient.get_client()
        self.cache = CacheManager()

    def select_representative_chunks(self, chunks: List[Dict],
                                     character_chunks: List[int],
                                     max_chunks: int = None) -> List[Dict]:
        """Select up to ``max_chunks`` evenly spaced chunks for a character.

        Args:
            chunks: All text chunks of the novel.
            character_chunks: Indices (into ``chunks``) where the character appears.
            max_chunks: Cap on selected chunks; defaults to Config.MAX_ANALYSIS_CHUNKS.

        Returns:
            The selected chunk dicts, in original order.
        """
        max_chunks = max_chunks or Config.MAX_ANALYSIS_CHUNKS

        if len(character_chunks) <= max_chunks:
            selected_ids = character_chunks
        else:
            # Sample evenly across the character's appearances.
            step = len(character_chunks) // max_chunks
            selected_ids = [character_chunks[i * step] for i in range(max_chunks)]

        # Guard against stale indices that exceed the chunk list.
        selected_chunks = [chunks[i] for i in selected_ids if i < len(chunks)]
        return selected_chunks

    def analyze_character_batch(self, character_name: str,
                                text_chunks: List[Dict]) -> Dict:
        """Analyze a character's personality from a batch of text chunks.

        Results are cached on disk; on any failure a default profile is
        returned instead of raising.

        Args:
            character_name: Name of the character to analyze.
            text_chunks: Chunk dicts (each with ``chunk_id`` and ``text``).

        Returns:
            Profile dict with traits, style, quotes, summary, etc.
        """
        import json
        import re
        import hashlib

        # BUGFIX: builtin hash() of a str is randomized per process
        # (PYTHONHASHSEED), so the old key never matched across runs and the
        # cache was effectively dead. Use a stable digest instead.
        chunk_sig = hashlib.md5(
            str([c['chunk_id'] for c in text_chunks]).encode('utf-8')
        ).hexdigest()
        cache_key = f"analysis_{character_name}_{chunk_sig}"
        cached = self.cache.get(cache_key)
        if cached:
            print(f"从缓存加载 {character_name} 的分析结果")
            return cached

        # Merge chunks with a visible separator for the model.
        combined_text = "\n\n---\n\n".join([c['text'] for c in text_chunks])

        # NOTE: the input is truncated to 8000 chars to bound prompt size.
        # BUGFIX: a "# 限制输入长度" pseudo-comment used to live INSIDE this
        # f-string and was sent verbatim to the model; it is now removed.
        analysis_prompt = f"""
请深度分析小说中"{character_name}"这个角色的性格特征。

基于以下文本片段进行分析:

{combined_text[:8000]}

请从以下维度分析,并以JSON格式返回:

{{
    "name": "{character_name}",
    "core_traits": ["特质1", "特质2", "特质3"],
    "speaking_style": "说话风格描述",
    "behavior_patterns": "行为模式描述",
    "values": "核心价值观",
    "emotional_style": "情感表达方式",
    "relationship_style": "人际关系风格",
    "background": "背景信息",
    "key_quotes": ["典型语句1", "典型语句2"],
    "personality_summary": "性格总结(100字以内)"
}}

注意:
1. 只基于文本内容分析,不要添加原著之外的信息
2. 提取该角色的典型对话和行为
3. 关注语言风格、用词习惯、口头禅等
"""

        try:
            # BUGFIX: ANALYSIS_TEMPERATURE was configured but never passed.
            response = self.client.chat.completions.create(
                model=Config.MODEL_NAME,
                messages=[
                    {"role": "system", "content": "你是一个专业的文学角色分析专家。请基于文本内容进行深入分析。"},
                    {"role": "user", "content": analysis_prompt}
                ],
                temperature=Config.ANALYSIS_TEMPERATURE
            )

            analysis_text = response.choices[0].message.content.strip()

            # Prefer strict JSON; fall back to regex scraping of the raw text.
            json_match = re.search(r'\{.*\}', analysis_text, re.DOTALL)
            if json_match:
                profile = json.loads(json_match.group())
            else:
                profile = self._parse_text_analysis(analysis_text, character_name)

            profile['raw_analysis'] = analysis_text

            self.cache.set(cache_key, profile)

            return profile

        except Exception as e:
            # Degrade gracefully so the pipeline can continue.
            print(f"分析失败: {e}")
            return self._default_profile(character_name)

    def _parse_text_analysis(self, text: str, character_name: str) -> Dict:
        """Best-effort extraction of profile fields from non-JSON model output."""
        import re

        profile = {
            'name': character_name,
            'core_traits': [],
            'speaking_style': '',
            'behavior_patterns': '',
            'values': '',
            'emotional_style': '',
            'relationship_style': '',
            'background': '',
            'key_quotes': [],
            'personality_summary': ''
        }

        # Loose patterns that tolerate missing quotes around keys.
        patterns = {
            'core_traits': r'core_traits["\']?\s*:\s*\[(.*?)\]',
            'speaking_style': r'speaking_style["\']?\s*:\s*["\']([^"\']+)["\']',
            'key_quotes': r'key_quotes["\']?\s*:\s*\[(.*?)\]',
        }

        for key, pattern in patterns.items():
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                content = match.group(1)
                if key in ['core_traits', 'key_quotes']:
                    # List-valued fields: pull each quoted item.
                    items = re.findall(r'["\']([^"\']+)["\']', content)
                    profile[key] = items
                else:
                    profile[key] = content

        return profile

    def _default_profile(self, character_name: str) -> Dict:
        """Fallback profile used when analysis fails entirely."""
        return {
            'name': character_name,
            'core_traits': ['复杂', '多面'],
            'speaking_style': '根据情境变化',
            'behavior_patterns': '待观察',
            'values': '待分析',
            'emotional_style': '情感丰富',
            'relationship_style': '因人而异',
            'background': '小说角色',
            'key_quotes': [],
            'personality_summary': f'{character_name}是一个复杂的角色',
            'raw_analysis': '使用默认配置'
        }

    def enhance_profile_with_examples(self, profile: Dict, chunks: List[Dict],
                                      character_chunks: List[int]) -> Dict:
        """Attach sample dialogue lines (quoted spans) to the profile.

        Only the character's first few chunks are scanned; extraction is a
        naive quoted-span regex, so samples may include other speakers.
        """
        import re

        dialogues = []
        for chunk_id in character_chunks[:5]:  # only the first few chunks
            if chunk_id < len(chunks):
                chunk_text = chunks[chunk_id]['text']
                # Naive: any 10-100 char span between matching quotes.
                quotes = re.findall(r'["\']([^"\']{10,100})["\']', chunk_text)
                dialogues.extend(quotes[:3])

        if dialogues:
            profile['example_dialogues'] = dialogues[:5]

        return profile
|
core/character_extractor.py
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List, Dict, Tuple
|
| 3 |
+
from collections import Counter, defaultdict
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
from config import Config
|
| 6 |
+
from utils.cache_manager import CacheManager
|
| 7 |
+
from core.openai_client import OpenAIClient
|
| 8 |
+
|
| 9 |
+
class CharacterExtractor:
    """Character extractor - identifies main characters in large-scale text."""

    def __init__(self):
        self.client = OpenAIClient.get_client()
        self.cache = CacheManager()

    def extract_names_pattern(self, text: str, language: str = "en") -> List[str]:
        """Extract candidate person names with regular expressions.

        Args:
            text: Input text.
            language: Language type ('en' or 'zh').

        Returns:
            List of matched name strings (may contain duplicates and noise).
        """
        if language == "en":
            # English names: runs of capitalized words.
            pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
        else:
            # Chinese names: a common surname followed by 1-3 characters.
            common_surnames = '哈赫王李张刘陈杨黄赵吴周徐孙马朱胡郭何林高梁郑罗宋谢唐韩曹许邓萧冯曾程蔡彭潘袁于董余苏叶吕魏蒋田杜丁沈姜范江傅钟卢汪戴崔任陆廖姚方金邱夏谭韦贾邹石熊孟秦阎薛侯雷白龙段郝孔邵史毛常万顾赖武康贺严尹钱施牛洪龚'
            pattern = f'[{common_surnames}][\\u4e00-\\u9fff]{{1,3}}'

        names = re.findall(pattern, text)
        return names

    def extract_characters_from_chunks(self, chunks: List[Dict],
                                       language: str = "en") -> Dict[str, Dict]:
        """Scan chunks and tally candidate character mentions.

        Args:
            chunks: Text chunk dicts (``text``, ``chunk_id``, ``start``).
            language: Language type ('en' or 'zh').

        Returns:
            Mapping name -> {'count', 'chunks', 'positions', 'contexts'}.
        """
        import hashlib

        # BUGFIX: builtin hash() of a str is randomized per process
        # (PYTHONHASHSEED), so the old cache key never matched across runs.
        cache_key = "characters_" + hashlib.md5(str(chunks[:3]).encode('utf-8')).hexdigest()
        cached = self.cache.get(cache_key)
        if cached:
            print("从缓存加载角色信息")
            return cached

        character_mentions = defaultdict(lambda: {
            'count': 0,
            'chunks': set(),
            'positions': [],
            'contexts': []
        })

        # Blacklist of capitalized words that are never character names.
        blacklist = {
            # chapter markers
            'Chapter', 'CHAPTER', 'Part', 'PART', 'Section', 'SECTION',
            'Book', 'BOOK', 'Volume', 'VOLUME',
            # articles and conjunctions
            'The', 'THE', 'And', 'AND', 'But', 'BUT', 'Or', 'OR',
            # question words
            'When', 'WHERE', 'What', 'WHAT', 'Who', 'WHO', 'Why', 'WHY', 'How', 'HOW',
            # bare honorifics
            'Mr', 'Mrs', 'Ms', 'Miss', 'Dr', 'Professor', 'Sir', 'Madam',
            # weekdays
            'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
            # months
            'January', 'February', 'March', 'April', 'May', 'June', 'July',
            'August', 'September', 'October', 'November', 'December',
            # common places
            'Street', 'Road', 'Avenue', 'Place', 'Square', 'Hall', 'House',
            'School', 'Academy', 'University', 'Castle', 'Forest', 'Mountain',
            # other common words
            'Note', 'End', 'Beginning', 'Epilogue', 'Prologue', 'Appendix',
            'First', 'Second', 'Third', 'Last', 'Next', 'Previous',
        }

        print("\n提取角色名称...")
        for chunk in tqdm(chunks):
            names = self.extract_names_pattern(chunk['text'], language)

            for name in names:
                # Per-language length limits.
                if language == "en":
                    # BUGFIX: a later blanket `len(name) <= 2` filter used to
                    # drop ALL valid 2-character Chinese names (surname + one
                    # char, the zh pattern's minimum); the short-name filter
                    # now applies to English only, where it was its intent.
                    if len(name) < 3 or len(name) > 30:
                        continue
                else:  # zh
                    if len(name) < 2 or len(name) > 4:
                        continue

                if name in blacklist:
                    continue

                if name.isdigit():
                    continue

                # All-caps short tokens are likely acronyms or headings.
                if name.isupper() and len(name) < 5:
                    continue

                character_mentions[name]['count'] += 1
                character_mentions[name]['chunks'].add(chunk['chunk_id'])
                character_mentions[name]['positions'].append(chunk['start'])

        # Convert sets to lists so the result is serializable.
        for char in character_mentions:
            character_mentions[char]['chunks'] = list(character_mentions[char]['chunks'])

        self.cache.set(cache_key, dict(character_mentions))

        return dict(character_mentions)

    def rank_characters(self, character_mentions: Dict) -> List[Tuple[str, Dict]]:
        """Rank characters by frequency and spread across chunks.

        Args:
            character_mentions: Mention info per character name.

        Returns:
            Top Config.TOP_N_CHARACTERS as (name, info) pairs, best first.
        """
        scored_characters = []

        for name, info in character_mentions.items():
            # Composite score: raw count plus chunk spread weighted 2x, so a
            # character appearing throughout the book beats a local burst.
            score = info['count'] + len(info['chunks']) * 2
            scored_characters.append((name, info, score))

        scored_characters.sort(key=lambda x: x[2], reverse=True)

        return [(name, info) for name, info, score in scored_characters[:Config.TOP_N_CHARACTERS]]

    def merge_similar_names(self, characters: List[Tuple[str, Dict]]) -> List[Tuple[str, Dict]]:
        """Merge aliases by substring containment (e.g. 'Harry' into 'Harry Potter').

        Args:
            characters: (name, info) pairs, typically pre-ranked.

        Returns:
            Merged (name, info) pairs keyed by the longest variant.
        """
        merged = {}

        for name, info in characters:
            found_parent = False
            for existing_name in list(merged.keys()):
                # Substring in either direction counts as an alias.
                if name in existing_name or existing_name in name:
                    # Keep the longer (more specific) variant as the key.
                    longer_name = name if len(name) > len(existing_name) else existing_name
                    shorter_name = existing_name if longer_name == name else name

                    if longer_name not in merged:
                        merged[longer_name] = merged.pop(existing_name)

                    # Fold counts and deduplicate chunk ids.
                    merged[longer_name]['count'] += info['count']
                    merged[longer_name]['chunks'].extend(info['chunks'])
                    merged[longer_name]['chunks'] = list(set(merged[longer_name]['chunks']))

                    found_parent = True
                    break

            if not found_parent:
                merged[name] = info

        return list(merged.items())

    def refine_with_llm(self, text_sample: str, candidate_names: List[str]) -> List[str]:
        """Use the LLM to confirm which candidates are real character names.

        Args:
            text_sample: Sample text (novel opening).
            candidate_names: Candidate name list.

        Returns:
            Confirmed names; on API failure, a heuristically filtered subset
            of the candidates (best effort, never raises).
        """
        prompt = f"""
以下是一部小说的开头片段。请识别出主要角色的名字。

候选名字列表:{', '.join(candidate_names[:30])}

文本片段:
{text_sample[:2000]}

请返回确认的主要角色名字,每行一个。只返回真正的角色名,排除:
1. 地名、物品名
2. 章节标记(如 "Chapter")
3. 称谓词(如 "Mr", "Professor")
4. 普通名词

只返回人名,每个名字一行。
"""

        try:
            # CONSISTENCY FIX: every other call site uses Config.MODEL_NAME;
            # this one hard-coded "gpt-3.5-turbo", bypassing configuration.
            response = self.client.chat.completions.create(
                model=Config.MODEL_NAME,
                messages=[
                    {"role": "system", "content": "你是一个文学分析专家,擅长识别小说角色。请只返回真正的人物角色名字。"},
                    {"role": "user", "content": prompt}
                ]
            )

            result = response.choices[0].message.content.strip()
            confirmed_names = [name.strip() for name in result.split('\n') if name.strip()]

            # Post-filter obvious non-names from the model output.
            final_names = []
            for name in confirmed_names:
                if len(name) < 3:
                    continue
                if any(c.isdigit() for c in name):
                    continue
                if name.isupper() and len(name) < 5:
                    continue
                final_names.append(name)

            return final_names

        except Exception as e:
            print(f"LLM精炼失败: {e}")
            # Fall back to the top candidates minus obvious noise.
            filtered = []
            for name in candidate_names[:20]:
                if len(name) >= 3 and not name.isdigit() and name not in ['Chapter', 'Book', 'Part']:
                    filtered.append(name)
            return filtered[:10]
|
| 254 |
+
|
| 255 |
+
def extract_main_characters(self, chunks: List[Dict],
                            text_sample: str = None,
                            language: str = "en") -> List[Dict]:
    """Full pipeline for extracting the main characters of a novel.

    Pipeline: raw name extraction -> frequency filtering (with one
    threshold-lowering retry) -> ranking -> merging of similar names ->
    optional LLM refinement -> final hard-coded stop-word cleanup.

    Args:
        chunks: Text chunks produced by the text processor.
        text_sample: Optional text excerpt used for LLM refinement.
        language: Language code of the text ("en" / "zh", default "en").

    Returns:
        List of dicts of the form {'name': str, 'info': {'count': int,
        'chunks': [...], ...}}, ordered by importance.
    """

    print("\n" + "="*60)
    print("开始角色提取流程")
    print("="*60)

    # 1. Extract every candidate name with its mention info from the chunks.
    character_mentions = self.extract_characters_from_chunks(chunks, language)
    print(f"\n初步提取到 {len(character_mentions)} 个名字")

    # 2. Drop low-frequency candidates.
    filtered = {name: info for name, info in character_mentions.items()
                if info['count'] >= Config.MIN_CHARACTER_MENTIONS}
    print(f"过滤后剩余 {len(filtered)} 个角色(出现次数≥{Config.MIN_CHARACTER_MENTIONS})")

    if len(filtered) == 0:
        print("⚠️ 没有找到符合条件的角色,降低阈值...")
        # Retry once with half the threshold (never below 3 mentions).
        filtered = {name: info for name, info in character_mentions.items()
                    if info['count'] >= max(3, Config.MIN_CHARACTER_MENTIONS // 2)}
        print(f"降低阈值后找到 {len(filtered)} 个角色")

    # 3. Rank by importance.
    ranked = self.rank_characters(filtered)
    print(f"排序后前 {len(ranked)} 个角色")

    # 4. Merge name variants (e.g. "Harry" / "Harry Potter").
    merged = self.merge_similar_names(ranked)
    print(f"合并相似名字后:{len(merged)} 个角色")

    # 5. Optional LLM refinement — only worthwhile when the list is still long.
    if text_sample and len(merged) > 15:
        print("\n使用 LLM 精炼角色列表...")
        candidate_names = [name for name, _ in merged[:30]]
        try:
            confirmed = self.refine_with_llm(text_sample, candidate_names)

            if confirmed:
                # Keep only names confirmed by the LLM (substring match in
                # either direction, to tolerate partial names).
                final_characters = []
                for name, info in merged:
                    if any(confirmed_name in name or name in confirmed_name
                           for confirmed_name in confirmed):
                        final_characters.append({'name': name, 'info': info})

                if final_characters:
                    print(f"LLM确认后:{len(final_characters)} 个主要角色")
                else:
                    # LLM filtered everything out — fall back to the merged list.
                    print("LLM确认后没有角色,使用原始列表")
                    final_characters = [{'name': name, 'info': info} for name, info in merged]
            else:
                print("LLM未返回结果,使用原始列表")
                final_characters = [{'name': name, 'info': info} for name, info in merged]
        except Exception as e:
            print(f"LLM精炼出错: {e},使用原始列表")
            final_characters = [{'name': name, 'info': info} for name, info in merged]
    else:
        final_characters = [{'name': name, 'info': info} for name, info in merged]

    # 6. Final cleanup: drop tokens that are clearly not person names.
    cleaned_characters = []
    for char in final_characters:
        name = char['name']
        if name in ['Chapter', 'Book', 'Part', 'Section', 'The', 'And', 'But', 'When', 'Where']:
            continue
        if name.lower() in ['chapter', 'book', 'part', 'section']:
            continue
        cleaned_characters.append(char)

    if len(cleaned_characters) < len(final_characters):
        print(f"最终清理后:{len(cleaned_characters)} 个角色")
        final_characters = cleaned_characters

    # Report the top characters.
    print("\n" + "="*60)
    print("主要角色列表(按重要性排序)")
    print("="*60)
    for i, char in enumerate(final_characters[:15], 1):
        print(f"{i}. {char['name']:<20} - 出现{char['info']['count']}次, "
              f"分布在{len(char['info']['chunks'])}个章节")

    if not final_characters:
        print("\n⚠️ 警告:未找到任何角色!")
        print("建议:")
        print("  1. 检查文本格式是否正确")
        print("  2. 确认文本语言设置")
        print("  3. 尝试降低 MIN_CHARACTER_MENTIONS 参数")

    return final_characters
|
core/memory_manager.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import chromadb
|
| 3 |
+
from chromadb.config import Settings
|
| 4 |
+
from typing import List, Dict, Optional
|
| 5 |
+
from config import Config
|
| 6 |
+
from core.openai_client import OpenAIClient
|
| 7 |
+
|
| 8 |
+
class MemoryManager:
    """Vector memory manager — stores and retrieves text chunks for one character.

    Each character gets its own ChromaDB collection; retrieval is by
    semantic similarity over the stored chunks.
    """

    def __init__(self, character_name: str):
        """Create (or reattach to) the per-character ChromaDB collection.

        Args:
            character_name: Name of the character this memory belongs to.
        """
        self.character_name = character_name
        self.client = OpenAIClient.get_client()

        # Make sure the on-disk vector store directory exists.
        os.makedirs(Config.VECTOR_DB_PATH, exist_ok=True)

        try:
            self.chroma_client = chromadb.Client(Settings(
                persist_directory=Config.VECTOR_DB_PATH,
                anonymized_telemetry=False
            ))
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed. Newer chromadb versions require
            # PersistentClient for on-disk storage.
            self.chroma_client = chromadb.PersistentClient(
                path=Config.VECTOR_DB_PATH
            )

        # One collection per character; ChromaDB limits names to 63 chars.
        collection_name = f"char_{character_name.replace(' ', '_').lower()}"
        collection_name = collection_name[:63]

        try:
            self.collection = self.chroma_client.get_or_create_collection(
                name=collection_name,
                metadata={"character": character_name}
            )
        except Exception as e:
            print(f"创建集合时出错: {e}")
            # Fallback: short ASCII-safe name. Use a *stable* md5 digest
            # instead of builtin hash(): str hashing is randomized per
            # process (PYTHONHASHSEED), so hash() would name the collection
            # differently on every run and the persisted data could never
            # be found again.
            import hashlib
            digest = int(hashlib.md5(character_name.encode('utf-8')).hexdigest(), 16)
            collection_name = f"char_{digest % 10000}"
            self.collection = self.chroma_client.get_or_create_collection(
                name=collection_name,
                metadata={"character": character_name}
            )

    def add_text_chunks(self, chunks: List[Dict], character_chunks: List[int]):
        """Store the chunks in which this character appears.

        Args:
            chunks: All text chunks of the novel.
            character_chunks: Indices (into `chunks`) where the character appears.
        """
        documents = []
        metadatas = []
        ids = []

        for chunk_id in character_chunks:
            # Guard against stale/out-of-range indices.
            if chunk_id < len(chunks):
                chunk = chunks[chunk_id]
                documents.append(chunk['text'])
                metadatas.append({
                    'chunk_id': chunk_id,
                    'position': chunk['start']
                })
                ids.append(f"chunk_{chunk_id}")

        if documents:
            try:
                # Add in batches to avoid oversized single requests.
                batch_size = 100
                for i in range(0, len(documents), batch_size):
                    batch_docs = documents[i:i+batch_size]
                    batch_metas = metadatas[i:i+batch_size]
                    batch_ids = ids[i:i+batch_size]

                    self.collection.add(
                        documents=batch_docs,
                        metadatas=batch_metas,
                        ids=batch_ids
                    )

                print(f"已为 {self.character_name} 添加 {len(documents)} 个文本块到向量库")
            except Exception as e:
                # Memory is best-effort: log and keep running without it.
                print(f"添加文本块到向量库失败: {e}")
                print("将继续运行,但不使用记忆功能")

    def search_relevant_context(self, query: str, n_results: int = None) -> List[str]:
        """Retrieve the chunks most relevant to `query`.

        Args:
            query: Query text.
            n_results: Number of results (defaults to Config.MAX_MEMORY_RETRIEVAL).

        Returns:
            List of relevant text snippets (empty on failure or empty store).
        """
        n_results = n_results or Config.MAX_MEMORY_RETRIEVAL

        try:
            collection_count = self.collection.count()
            if collection_count == 0:
                return []

            # ChromaDB errors if you ask for more results than stored items.
            actual_n_results = min(n_results, collection_count)

            results = self.collection.query(
                query_texts=[query],
                n_results=actual_n_results
            )

            # query() returns one document list per query text; we sent one query.
            if results and results['documents']:
                return results['documents'][0]
            return []

        except Exception as e:
            print(f"检索失败: {e}")
            return []

    def get_embedding(self, text: str) -> List[float]:
        """Compute the embedding vector for `text` via the OpenAI API.

        Args:
            text: Input text.

        Returns:
            The embedding vector, or an empty list on failure.
        """
        try:
            response = self.client.embeddings.create(
                model=Config.EMBEDDING_MODEL,
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"获取嵌入失败: {e}")
            return []

    def get_statistics(self) -> Dict:
        """Return basic statistics about this character's memory store.

        Returns:
            Dict with keys: character, chunk_count, collection_name.
        """
        try:
            count = self.collection.count()
            return {
                'character': self.character_name,
                'chunk_count': count,
                'collection_name': self.collection.name
            }
        except Exception:
            # Narrowed from a bare `except:`; report an empty store on failure.
            return {
                'character': self.character_name,
                'chunk_count': 0,
                'collection_name': 'unknown'
            }

    def clear(self):
        """Delete this character's collection from the vector store."""
        try:
            self.chroma_client.delete_collection(self.collection.name)
            print(f"已清空 {self.character_name} 的记忆库")
        except Exception as e:
            print(f"清空记忆库失败: {e}")
|
core/openai_client.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from openai import OpenAI
|
| 2 |
+
from config import Config
|
| 3 |
+
|
| 4 |
+
class OpenAIClient:
    """Process-wide singleton wrapper around the OpenAI SDK client."""

    _instance = None
    _client = None

    def __new__(cls):
        # Reuse the single shared instance once it exists.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # Lazily build the underlying SDK client exactly once; later
        # constructions of the singleton are no-ops.
        if self._client is None:
            self._client = OpenAI(
                api_key=Config.OPENAI_API_KEY,
                base_url=Config.OPENAI_BASE_URL
            )

    @property
    def client(self):
        """The shared `OpenAI` SDK client instance."""
        return self._client

    @classmethod
    def get_client(cls) -> OpenAI:
        """Return the shared OpenAI client, creating it on first use."""
        return cls().client
|
core/text_processor.py
ADDED
|
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List, Dict, Tuple # 确保有 Dict
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
from utils.text_utils import TextUtils
|
| 5 |
+
from config import Config
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TextProcessor:
    """Large-scale text processor: chunking, dialogue extraction, chapter
    splitting, statistics, cleaning, metadata extraction and sampling."""

    def __init__(self):
        self.text_utils = TextUtils()

    def chunk_text(self, text: str, chunk_size: int = None,
                   overlap: int = None) -> List[Dict]:
        """Split a long text into overlapping chunks on paragraph boundaries.

        Args:
            text: Input text.
            chunk_size: Max characters per chunk (defaults to Config.MAX_CHUNK_SIZE).
            overlap: Overlap between consecutive chunks (defaults to Config.CHUNK_OVERLAP).

        Returns:
            List of dicts with keys: text, start, end, chunk_id.
        """
        chunk_size = chunk_size or Config.MAX_CHUNK_SIZE
        overlap = overlap or Config.CHUNK_OVERLAP

        # Split on blank lines so chunk boundaries respect paragraphs.
        paragraphs = text.split('\n\n')

        chunks = []
        current_chunk = ""
        current_start = 0
        total_processed = 0

        print(f"开始分块处理 (块大小: {chunk_size}, 重叠: {overlap})...")

        for para in tqdm(paragraphs, desc="分块进度"):
            para = para.strip()
            if not para:
                continue

            # Would adding this paragraph overflow the chunk? (+2 for "\n\n")
            if len(current_chunk) + len(para) + 2 > chunk_size:
                if current_chunk:
                    # Flush the current chunk.
                    chunks.append({
                        'text': current_chunk.strip(),
                        'start': current_start,
                        'end': current_start + len(current_chunk),
                        'chunk_id': len(chunks)
                    })

                    # Carry the tail of the finished chunk into the next one.
                    if len(current_chunk) > overlap:
                        overlap_text = current_chunk[-overlap:]
                        # Prefer to start the overlap on a sentence boundary.
                        sentences = self.text_utils.split_into_sentences(overlap_text)
                        if sentences:
                            overlap_text = sentences[-1] if len(sentences) == 1 else ' '.join(sentences[-2:])
                    else:
                        overlap_text = current_chunk

                    # Advance the character cursor past the non-overlapping part.
                    total_processed += len(current_chunk) - len(overlap_text)
                    current_start = total_processed

                    # Start the new chunk with the overlap prefix.
                    current_chunk = overlap_text + "\n\n" + para
                else:
                    # Nothing buffered; start fresh with this paragraph.
                    current_chunk = para
                    current_start = total_processed
            else:
                # Append to the current chunk.
                if current_chunk:
                    current_chunk += "\n\n" + para
                else:
                    current_chunk = para

        # Flush the final partial chunk.
        if current_chunk:
            chunks.append({
                'text': current_chunk.strip(),
                'start': current_start,
                'end': current_start + len(current_chunk),
                'chunk_id': len(chunks)
            })

        print(f"✓ 文本分块完成: 总共 {len(chunks)} 块")
        return chunks

    def chunk_text_by_tokens(self, text: str, max_tokens: int = 1500,
                             overlap_tokens: int = 150) -> List[Dict]:
        """Chunk by token count (more precise than characters, but slower).

        Args:
            text: Input text.
            max_tokens: Max tokens per chunk.
            overlap_tokens: Tokens of overlap between consecutive chunks.

        Returns:
            List of dicts with keys: text, start, end, chunk_id, token_count.
        """
        sentences = self.text_utils.split_into_sentences(text)

        chunks = []
        current_chunk = []
        current_tokens = 0
        current_start = 0

        print(f"按 token 分块处理 (最大: {max_tokens} tokens)...")

        for sentence in tqdm(sentences, desc="处理句子"):
            sentence_tokens = self.text_utils.count_tokens(sentence)

            if current_tokens + sentence_tokens > max_tokens and current_chunk:
                # Flush the current chunk.
                chunk_text = ' '.join(current_chunk)
                chunks.append({
                    'text': chunk_text,
                    'start': current_start,
                    'end': current_start + len(chunk_text),
                    'chunk_id': len(chunks),
                    'token_count': current_tokens
                })

                # Build the overlap from trailing sentences that fit the budget.
                overlap_chunk = []
                overlap_tokens_count = 0
                for s in reversed(current_chunk):
                    s_tokens = self.text_utils.count_tokens(s)
                    if overlap_tokens_count + s_tokens <= overlap_tokens:
                        overlap_chunk.insert(0, s)
                        overlap_tokens_count += s_tokens
                    else:
                        break

                current_chunk = overlap_chunk + [sentence]
                current_tokens = overlap_tokens_count + sentence_tokens
                current_start += len(chunk_text) - len(' '.join(overlap_chunk))
            else:
                current_chunk.append(sentence)
                current_tokens += sentence_tokens

        # Flush the final partial chunk.
        if current_chunk:
            chunk_text = ' '.join(current_chunk)
            chunks.append({
                'text': chunk_text,
                'start': current_start,
                'end': current_start + len(chunk_text),
                'chunk_id': len(chunks),
                'token_count': current_tokens
            })

        print(f"✓ Token 分块完成: 总共 {len(chunks)} 块")
        return chunks

    def extract_dialogues(self, text: str) -> List[Dict]:
        """Extract dialogue snippets with their speaker attributions.

        Args:
            text: Input text.

        Returns:
            List of dicts with keys: content, attribution, position, type,
            sorted by position.
        """
        language = self.text_utils.detect_language(text)

        dialogues = []

        if language == "zh":
            # Chinese dialogue patterns. The quote classes accept both curly
            # quotes (U+201C/U+201D — the original intent; the source file had
            # been mojibake'd into straight quotes) and straight " as fallback.
            patterns = [
                (r'[\u201c"]([^\u201d"]+)[\u201d"][,,]?\s*([^说道讲告诉问答叫喊]*(?:说|道|讲|告诉|问|答|叫|喊))', 'chinese_quote'),
                (r'「([^」]+)」[,,]?\s*([^说道讲]*(?:说|道|讲))', 'chinese_bracket'),
                (r'[\u201c"]([^\u201d"]+)[\u201d"]', 'simple_quote'),
            ]
        else:
            # English dialogue patterns.
            patterns = [
                (r'"([^"]+)",?\s+([A-Z][a-z]+\s+(?:said|asked|replied|shouted|whispered|muttered|exclaimed))', 'english_quote_said'),
                (r'"([^"]+)"', 'simple_quote'),
                (r"'([^']+)',?\s+([A-Z][a-z]+\s+said)", 'english_single_quote'),
            ]

        for pattern, pattern_type in patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                dialogue = {
                    'content': match.group(1).strip(),
                    'attribution': match.group(2).strip() if len(match.groups()) > 1 else '',
                    'position': match.start(),
                    'type': pattern_type
                }
                # Skip very short (likely spurious) quotes.
                if len(dialogue['content']) > 5:
                    dialogues.append(dialogue)

        # Order by occurrence in the text.
        dialogues.sort(key=lambda x: x['position'])

        return dialogues

    def split_by_chapters(self, text: str) -> List[Dict]:
        """Split the text into chapters using common heading patterns.

        Args:
            text: Input text.

        Returns:
            List of dicts with keys: chapter_num, title, content, start, end.
            If no chapter markers are found, the whole text is one chapter.
        """
        chapter_patterns = [
            r'Chapter\s+(\d+)[:\s]*([^\n]*)',                      # English: Chapter 1: Title
            r'第([一二三四五六七八九十百千零\d]+)章[:\s]*([^\n]*)',  # Chinese: 第一章:标题
            r'CHAPTER\s+([IVXLCDM]+)[:\s]*([^\n]*)',               # Roman numerals
        ]

        chapters = []
        last_pos = 0

        for pattern in chapter_patterns:
            matches = list(re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE))

            if matches:
                for i, match in enumerate(matches):
                    start = match.start()
                    # A chapter runs until the next heading (or end of text).
                    end = matches[i + 1].start() if i + 1 < len(matches) else len(text)

                    chapters.append({
                        'chapter_num': match.group(1),
                        'title': match.group(2).strip() if len(match.groups()) > 1 else '',
                        'content': text[start:end].strip(),
                        'start': start,
                        'end': end
                    })
                # First matching pattern wins; don't mix heading styles.
                break

        if not chapters:
            chapters.append({
                'chapter_num': '1',
                'title': 'Full Text',
                'content': text,
                'start': 0,
                'end': len(text)
            })

        return chapters

    def get_statistics(self, text: str) -> Dict:
        """Compute summary statistics for the text.

        Args:
            text: Input text.

        Returns:
            Dict with length/token/paragraph/sentence/word counts, detected
            language, dialogue and chapter counts, and average lengths.
        """
        total_length = len(text)
        total_tokens = self.text_utils.count_tokens(text)

        # Paragraphs are blank-line separated.
        paragraphs = [p for p in text.split('\n\n') if p.strip()]
        paragraph_count = len(paragraphs)

        sentences = self.text_utils.split_into_sentences(text)
        sentence_count = len(sentences)

        words = re.findall(r'\b\w+\b', text)
        word_count = len(words)

        language = self.text_utils.detect_language(text)

        # Dialogue extraction is expensive; sample only the first 10k chars.
        dialogues = self.extract_dialogues(text[:10000])
        dialogue_count = len(dialogues)

        chapters = self.split_by_chapters(text)
        chapter_count = len(chapters)

        return {
            'total_length': total_length,
            'total_tokens': total_tokens,
            'paragraphs': paragraph_count,
            'sentences': sentence_count,
            'words': word_count,
            'language': language,
            'dialogues': dialogue_count,
            'chapters': chapter_count,
            'avg_paragraph_length': total_length // paragraph_count if paragraph_count > 0 else 0,
            'avg_sentence_length': total_length // sentence_count if sentence_count > 0 else 0,
        }

    def clean_text(self, text: str,
                   remove_extra_whitespace: bool = True,
                   normalize_quotes: bool = True) -> str:
        """Normalize whitespace and quotation marks.

        Args:
            text: Input text.
            remove_extra_whitespace: Collapse blank lines, tabs and space runs.
            normalize_quotes: Normalize curly/CJK quotes to straight ASCII quotes.

        Returns:
            The cleaned text.
        """
        cleaned = text

        if remove_extra_whitespace:
            # Trim each line.
            cleaned = '\n'.join(line.strip() for line in cleaned.split('\n'))
            # Collapse runs of blank lines to a single blank line.
            cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
            # Tabs -> spaces.
            cleaned = cleaned.replace('\t', ' ')
            # Collapse runs of spaces.
            cleaned = re.sub(r' {2,}', ' ', cleaned)

        if normalize_quotes:
            # BUGFIX: the original source had curly quote characters that were
            # mojibake'd into straight quotes, turning these replacements into
            # no-ops (and one into invalid syntax). Unicode escapes are used
            # so the intent survives any future encoding mishap.
            # CJK corner brackets -> double quote.
            cleaned = cleaned.replace('\u300e', '"').replace('\u300f', '"')   # 『 』
            cleaned = cleaned.replace('\u300c', '"').replace('\u300d', '"')   # 「 」
            # Curly double quotes -> straight ".
            cleaned = cleaned.replace('\u201c', '"').replace('\u201d', '"')
            # Curly single quotes -> straight '.
            cleaned = cleaned.replace('\u2018', "'").replace('\u2019', "'")

        return cleaned

    def extract_metadata(self, text: str) -> Dict:
        """Extract title/author/year heuristically from the text head.

        Args:
            text: Input text.

        Returns:
            Dict with keys title, author, year (each possibly None).
        """
        metadata = {
            'title': None,
            'author': None,
            'year': None,
        }

        # Only the first 20 lines are likely to contain front matter.
        lines = text.split('\n')[:20]

        for line in lines:
            line = line.strip()

            # Title heuristic: short-ish line in ALL CAPS or Title Case.
            if not metadata['title'] and len(line) > 5 and len(line) < 100:
                if line.isupper() or line.istitle():
                    metadata['title'] = line

            author_patterns = [
                r'by\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
                r'作者[::]\s*(.+)',
                r'Author[:\s]+(.+)',
            ]

            for pattern in author_patterns:
                match = re.search(pattern, line, re.IGNORECASE)
                if match:
                    metadata['author'] = match.group(1).strip()
                    break

            # Four-digit year starting with 19 or 20.
            year_match = re.search(r'\b(19|20)\d{2}\b', line)
            if year_match:
                metadata['year'] = year_match.group(0)

        return metadata

    def sample_text(self, text: str, sample_size: int = 1000,
                    strategy: str = 'random') -> str:
        """Take a sample of the text.

        Args:
            text: Input text.
            sample_size: Sample size in characters.
            strategy: 'start', 'random' (non-deterministic), or 'distributed'.

        Returns:
            The sampled text (the whole text if it is short enough).
        """
        if len(text) <= sample_size:
            return text

        if strategy == 'start':
            return text[:sample_size]

        elif strategy == 'random':
            # Single random window; not seeded, so results vary run to run.
            import random
            start = random.randint(0, len(text) - sample_size)
            return text[start:start + sample_size]

        elif strategy == 'distributed':
            # Take equal-size windows from the start of each third of the text.
            num_samples = 3
            sample_per_part = sample_size // num_samples
            samples = []

            for i in range(num_samples):
                start = (len(text) // num_samples) * i
                end = min(start + sample_per_part, len(text))
                samples.append(text[start:end])

            return '\n...\n'.join(samples)

        else:
            # Unknown strategy: fall back to the head of the text.
            return text[:sample_size]
|
main.py
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
from core.text_processor import TextProcessor
|
| 5 |
+
from core.character_extractor import CharacterExtractor
|
| 6 |
+
from core.character_analyzer import CharacterAnalyzer
|
| 7 |
+
from core.character_agent import CharacterAgent
|
| 8 |
+
from utils.text_utils import TextUtils
|
| 9 |
+
from utils.cache_manager import CacheManager
|
| 10 |
+
from config import Config
|
| 11 |
+
|
| 12 |
+
def print_banner():
    """Print the welcome banner shown at program start."""
    print("""
    ╔══════════════════════════════════════════════════════════════════╗
    ║                                                                  ║
    ║           🎭 小说角色 Agent 系统 (大规模文本版)                  ║
    ║                                                                  ║
    ║           基于 AI 的角色性格分析与对话系统                       ║
    ║                                                                  ║
    ╚══════════════════════════════════════════════════════════════════╝
    """)
|
| 24 |
+
|
| 25 |
+
def load_novel(file_path: str) -> str:
    """Load a novel from disk, trying several encodings in turn.

    Args:
        file_path: Path to the text file.

    Returns:
        The file contents, or an empty string on any failure.
    """
    try:
        # BUGFIX: latin-1 must come LAST. It maps every byte to a character
        # and therefore never raises UnicodeDecodeError, so anything listed
        # after it (the original had utf-16 there) was unreachable, and real
        # UTF-16 files were silently mis-decoded as latin-1.
        encodings = ['utf-8', 'gbk', 'gb2312', 'utf-16', 'latin-1']

        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    text = f.read()
                print(f"✓ 成功使用 {encoding} 编码加载文件")
                return text
            except UnicodeDecodeError:
                continue

        print("✗ 所有编码尝试失败")
        return ""

    except FileNotFoundError:
        print(f"✗ 文件不存在: {file_path}")
        return ""
    except Exception as e:
        print(f"✗ 加载小说失败: {e}")
        return ""
|
| 49 |
+
|
| 50 |
+
def display_statistics(stats: dict):
    """Print a formatted summary of the text statistics.

    Expects keys: total_length, total_tokens, paragraphs, sentences, language.
    """
    divider = "=" * 70
    print("\n" + divider)
    print("📊 文本统计信息")
    print(divider)
    print(f"总字符数: {stats['total_length']:,}")
    print(f"Token 数量: {stats['total_tokens']:,}")
    print(f"段落数: {stats['paragraphs']:,}")
    print(f"句子数: {stats['sentences']:,}")

    language_labels = {'zh': '中文', 'en': '英文', 'mixed': '中英混合', 'unknown': '未知'}
    print(f"检测语言: {language_labels.get(stats['language'], stats['language'])}")

    # Reading-time estimate uses a dummy string of the same length, since
    # only the length matters to the estimator.
    reading_time = TextUtils().estimate_reading_time(" " * stats['total_length'])
    print(f"预计阅读: 约 {reading_time} 分钟")
    print(divider)
|
| 68 |
+
|
| 69 |
+
def select_character_interactive(characters: list) -> dict:
    """Interactively prompt the user to pick one of the detected characters.

    Args:
        characters: list of dicts of the form
            {'name': str, 'info': {'count': int, 'chunks': list}}.

    Returns:
        The selected character dict.  Loops until a valid choice is made;
        Ctrl-C exits the whole program.
    """
    print("\n" + "="*70)
    print("📋 检测到的主要角色")
    print("="*70)
    print(f"{'序号':<6}{'角色名':<25}{'出现次数':<12}{'分布章节':<12}")
    print("-"*70)

    # Show at most the top 15 characters; numbering is 1-based for the user.
    for i, char in enumerate(characters[:15], 1):
        name = char['name']
        count = char['info']['count']
        chunks = len(char['info']['chunks'])
        print(f"{i:<6}{name:<25}{count:<12}{chunks:<12}")

    print("="*70)

    while True:
        try:
            choice = input(f"\n请选择角色编号 (1-{min(15, len(characters))}): ").strip()

            if choice.isdigit():
                idx = int(choice) - 1
                if 0 <= idx < len(characters):
                    return characters[idx]

            print("❌ 无效选择,请重试")
        except KeyboardInterrupt:
            # Ctrl-C aborts the whole program rather than retrying the prompt.
            print("\n\n👋 程序已退出")
            sys.exit(0)
        except:  # noqa: E722 — any other input error just triggers a retry
            print("❌ 输入错误,请重试")
|
| 100 |
+
|
| 101 |
+
def interactive_chat(agent: CharacterAgent):
    """Run the interactive REPL-style chat loop with a character agent.

    Recognized commands: quit/exit (leave, optionally saving the history),
    reset (clear history), save (write history to JSON), info (show the
    character profile), help (list commands).  Anything else is forwarded
    to the agent as a chat message.
    """

    print("\n" + "="*70)
    print(agent.get_character_info())
    print("\n💬 对话开始!")
    print("-"*70)
    print("💡 提示:")
    print(" • 输入 'quit' 或 'exit' - 退出对话")
    print(" • 输入 'reset' - 重置对话历史")
    print(" • 输入 'save' - 保存对话")
    print(" • 输入 'info' - 查看角色信息")
    print(" • 输入 'help' - 显示帮助")
    print("="*70 + "\n")

    char_name = agent.character_profile['name']

    while True:
        try:
            # Read user input; blank lines are ignored.
            user_input = input("🧑 你: ").strip()

            if not user_input:
                continue

            # Command dispatch happens before a normal chat turn.
            if user_input.lower() in ['quit', 'exit', '退出', 'q']:
                print(f"\n👋 {char_name}: 再见,很高兴和你聊天!")

                # Offer to save the conversation before leaving.
                if len(agent.conversation_history) > 0:
                    save = input("\n是否保��对话记录?(y/n): ").strip().lower()
                    if save in ['y', 'yes', '是']:
                        filename = f"conversation_{char_name}_{len(agent.conversation_history)}.json"
                        agent.save_conversation(filename)
                break

            if user_input.lower() in ['reset', '重置']:
                agent.reset_conversation()
                continue

            if user_input.lower() in ['save', '保存']:
                filename = f"conversation_{char_name}_{len(agent.conversation_history)}.json"
                agent.save_conversation(filename)
                continue

            if user_input.lower() in ['info', '信息']:
                print(agent.get_character_info())
                continue

            if user_input.lower() in ['help', '帮助']:
                print("\n可用命令:")
                print(" quit/exit - 退出对话")
                print(" reset - 重置对话历史")
                print(" save - 保存对话")
                print(" info - 查看角色信息")
                print(" help - 显示此帮助\n")
                continue

            # Normal chat turn: show a transient "thinking" indicator, then
            # overwrite it with the response (\r returns to line start).
            print(f"\n{'⏳ ' + char_name + ' 正在思考...':<70}", end='\r')
            response = agent.chat(user_input)
            print(" " * 70, end='\r')  # erase the "thinking" indicator
            print(f"🎭 {char_name}: {response}\n")

        except KeyboardInterrupt:
            print(f"\n\n👋 {char_name}: 再见!")
            break
        except Exception as e:
            # Keep the session alive on transient errors (e.g. API failures).
            print(f"\n❌ 错误: {e}\n")
|
| 171 |
+
|
| 172 |
+
def check_environment():
    """Verify the runtime environment before starting the pipeline.

    Checks the OpenAI API key, the cache directory, and the required
    third-party packages.

    Returns:
        True when everything looks usable, False otherwise (after printing
        the list of problems found).
    """
    issues = []

    # An API key must be configured.
    if not Config.OPENAI_API_KEY or Config.OPENAI_API_KEY == "":
        issues.append("未设置 OPENAI_API_KEY")

    # The cache directory must exist or be creatable.
    if not os.path.exists(Config.CACHE_DIR):
        try:
            os.makedirs(Config.CACHE_DIR)
        except:
            issues.append(f"无法创建缓存目录: {Config.CACHE_DIR}")

    # Required packages must be importable.
    try:
        import openai
        import chromadb
        import tiktoken
    except ImportError as e:
        issues.append(f"缺少必要的包: {e}")

    if not issues:
        return True

    print("\n⚠️ 环境检查发现问题:")
    for issue in issues:
        print(f" • {issue}")
    print("\n请检查配置文件 .env 和依赖安装\n")
    return False
|
| 203 |
+
|
| 204 |
+
def main():
    """Entry point: run the full pipeline from novel loading to chat.

    Steps: environment check -> load novel -> chunk text -> extract
    characters -> pick one interactively -> analyze its personality ->
    build a CharacterAgent (optionally with memory) -> interactive chat.
    """

    # Welcome banner.
    print_banner()

    # Abort early if the environment (API key, cache dir, packages) is broken.
    if not check_environment():
        return

    # Show existing cache usage, if any.
    cache = CacheManager()
    cache_info = cache.get_cache_info()
    if cache_info['count'] > 0:
        print(f"📦 缓存: {cache_info['count']} 个文件, {cache_info['size_mb']} MB")

    # Step 1: load the novel.
    print("\n" + "="*70)
    print("📖 步骤 1/5: 加载小说")
    print("="*70)

    default_path = "sample_novels/harry_potter_sample.txt"
    novel_path = input(f"\n请输入小说文件路径 (默认: {default_path})\n> ").strip()

    if not novel_path:
        novel_path = default_path

    if not os.path.exists(novel_path):
        print(f"❌ 文件不存在: {novel_path}")

        # Fall back to looking for the file inside sample_novels/.
        alt_path = os.path.join("sample_novels", os.path.basename(novel_path))
        if os.path.exists(alt_path):
            print(f"✓ 找到文件: {alt_path}")
            novel_path = alt_path
        else:
            print("程序退出")
            return

    print(f"\n正在加载: {novel_path}")
    novel_text = load_novel(novel_path)

    if not novel_text:
        print("❌ 无法加载小说,程序退出")
        return

    # Compute and display text statistics.
    processor = TextProcessor()
    stats = processor.get_statistics(novel_text)
    display_statistics(stats)

    # Warn (but allow continuing) when the text is too short for analysis.
    if stats['total_length'] < 1000:
        print("⚠️ 警告: 文本过短 (< 1000字符),可能影响分析效果")
        proceed = input("是否继续?(y/n): ").strip().lower()
        if proceed not in ['y', 'yes', '是']:
            return

    # Step 2: split the text into chunks.
    print("\n" + "="*70)
    print("📄 步骤 2/5: 文本分块处理")
    print("="*70)

    chunks = processor.chunk_text(novel_text)
    print(f"✓ 文本已分为 {len(chunks)} 个块")
    print(f" 平均每块: {stats['total_length'] // len(chunks)} 字符")

    # Step 3: extract the main characters.
    print("\n" + "="*70)
    print("👥 步骤 3/5: 提取主要角色")
    print("="*70)

    extractor = CharacterExtractor()
    characters = extractor.extract_main_characters(
        chunks,
        text_sample=novel_text[:3000],  # short sample guides name detection
        language=stats['language']
    )

    if not characters:
        print("❌ 未能提取到角色,程序退出")
        return

    # Step 4: let the user pick a character to talk to.
    print("\n" + "="*70)
    print("🎯 步骤 4/5: 选择要对话的角色")
    print("="*70)

    selected = select_character_interactive(characters)
    character_name = selected['name']
    character_info = selected['info']

    print(f"\n✓ 已选择: {character_name}")
    print(f" 出现次数: {character_info['count']}")
    print(f" 分布章节: {len(character_info['chunks'])}")

    # Step 5: analyze the selected character's personality.
    print(f"\n" + "="*70)
    print(f"🧠 步骤 5/5: 分析角色性格")
    print("="*70)
    print(f"正在深度分析 {character_name} 的性格特征...")
    print("这可能需要几分钟,请耐心等待...\n")

    analyzer = CharacterAnalyzer()

    # Only a representative subset of chunks is analyzed, to bound cost.
    representative_chunks = analyzer.select_representative_chunks(
        chunks,
        character_info['chunks']
    )

    print(f"✓ 选取了 {len(representative_chunks)} 个代表性片段进行分析")

    # Run the batch personality analysis.
    character_profile = analyzer.analyze_character_batch(
        character_name,
        representative_chunks
    )

    # Enrich the profile with concrete quote/behavior examples.
    character_profile = analyzer.enhance_profile_with_examples(
        character_profile,
        chunks,
        character_info['chunks']
    )

    print(f"✓ 角色分析完成!")

    # Build the conversational agent (with or without the memory system).
    print("\n" + "="*70)
    print("🤖 创建对话代理")
    print("="*70)

    use_memory = input("\n是否启用记忆系统?(y/n, 默认: y): ").strip().lower()

    if use_memory in ['', 'y', 'yes', '是']:
        print("正在初始化记忆系统...")
        agent = CharacterAgent(
            character_profile,
            chunks=chunks,
            character_chunks=character_info['chunks']
        )
        print("✓ Agent 创建成功,记忆系统已初始化")
    else:
        agent = CharacterAgent(character_profile)
        print("✓ Agent 创建成功(未启用记忆系统)")

    # Hand over to the interactive chat loop.
    interactive_chat(agent)

    # Farewell.
    print("\n" + "="*70)
    print("感谢使用小说角色 Agent 系统!")
    print("="*70)
|
| 358 |
+
|
| 359 |
+
if __name__ == "__main__":
    # Top-level guard: a clean message on Ctrl-C, full traceback otherwise.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n👋 程序已被用户中断")
    except Exception as exc:
        print(f"\n❌ 程序错误: {exc}")
        import traceback
        traceback.print_exc()
|
sample_novels/harry_potter_sample.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
test_agent.py
ADDED
|
@@ -0,0 +1,431 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
from core.text_processor import TextProcessor
|
| 5 |
+
from core.character_extractor import CharacterExtractor
|
| 6 |
+
from core.character_analyzer import CharacterAnalyzer
|
| 7 |
+
from core.character_agent import CharacterAgent
|
| 8 |
+
from utils.text_utils import TextUtils
|
| 9 |
+
from utils.cache_manager import CacheManager
|
| 10 |
+
|
| 11 |
+
class TestLargeScaleAgent(unittest.TestCase):
    """End-to-end tests for the large-scale novel character-agent pipeline."""

    @classmethod
    def setUpClass(cls):
        """Build a shared sample text and pre-chunk it once for all tests.

        The snippet is repeated 50x to simulate a long novel so that the
        chunking and extraction paths are exercised realistically.
        """
        cls.sample_text = """
        Chapter 1: The Boy Who Lived

        Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.

        Harry Potter rolled over inside his blankets without waking up. One small hand closed on the letter beside him and he slept on, not knowing he was special, not knowing he was famous.

        "Harry! Harry Potter!" said Hermione Granger excitedly. She was a clever girl with bushy brown hair.

        Ron Weasley grinned at Harry. "Blimey, Harry, everyone knows who you are!" Ron was Harry's best friend, a tall boy with red hair.

        Professor Dumbledore smiled wisely. "Harry, you have your mother's eyes," he said gently. Dumbledore was the headmaster of Hogwarts, known for his wisdom and kindness.

        Harry looked confused. "I don't understand, Professor," he said quietly. Harry was a brave boy, though he often felt uncertain.

        Hermione rolled her eyes. "Honestly, Harry, you need to read more!" she said in her know-it-all voice.

        Ron laughed. "Don't worry, mate. We'll figure it out together," he said loyally.

        """ * 50  # repeat 50 times to simulate a long text

        cls.processor = TextProcessor()
        cls.chunks = cls.processor.chunk_text(cls.sample_text)

    def test_01_text_utils(self):
        """TextUtils: token counting, language detection, sentence split, keywords."""
        print("\n" + "="*70)
        print("测试 1: 文本工具函数")
        print("="*70)

        utils = TextUtils()

        # Token counting must yield a positive count for non-empty text.
        tokens = utils.count_tokens(self.sample_text[:1000])
        print(f"Token 数量: {tokens}")
        self.assertGreater(tokens, 0)

        # Language detection: the sample is pure English.
        lang = utils.detect_language(self.sample_text)
        print(f"检测语言: {lang}")
        self.assertEqual(lang, "en")

        # Sentence splitting produces at least one sentence.
        sentences = utils.split_into_sentences(self.sample_text[:500])
        print(f"句子数量: {len(sentences)}")
        self.assertGreater(len(sentences), 0)

        # Keyword extraction produces at least one keyword.
        keywords = utils.extract_keywords(self.sample_text[:1000], top_n=5)
        print(f"关键词: {keywords}")
        self.assertGreater(len(keywords), 0)

    def test_02_cache_manager(self):
        """CacheManager: set/get round-trip, existence, JSON helpers, info."""
        print("\n" + "="*70)
        print("测试 2: 缓存管理器")
        print("="*70)

        cache = CacheManager()

        # Pickle-based set/get round-trip.
        test_key = "test_key"
        test_data = {"name": "Harry", "age": 11}

        cache.set(test_key, test_data)
        cached_data = cache.get(test_key)

        print(f"缓存数据: {cached_data}")
        self.assertEqual(cached_data, test_data)

        # Existence check after a set.
        self.assertTrue(cache.exists(test_key))

        # JSON save/load round-trip.
        cache.save_json("test_json", test_data)
        json_data = cache.load_json("test_json")
        print(f"JSON 数据: {json_data}")
        self.assertEqual(json_data, test_data)

        # Cache metadata reports at least one entry.
        info = cache.get_cache_info()
        print(f"缓存信息: {info}")
        self.assertGreater(info['count'], 0)

        # Clean up the test entry.
        cache.delete(test_key)

    def test_03_text_processing(self):
        """TextProcessor: statistics and chunking on the long sample."""
        print("\n" + "="*70)
        print("测试 3: 文本处理和分块")
        print("="*70)

        stats = self.processor.get_statistics(self.sample_text)

        print(f"总字符数: {stats['total_length']:,}")
        print(f"Token数: {stats['total_tokens']:,}")
        print(f"段落数: {stats['paragraphs']}")
        print(f"句子数: {stats['sentences']}")
        print(f"语言: {stats['language']}")
        print(f"分块数: {len(self.chunks)}")

        self.assertGreater(len(self.chunks), 0)
        self.assertGreater(stats['total_length'], 1000)
        self.assertEqual(stats['language'], 'en')

    def test_04_character_extraction(self):
        """CharacterExtractor: the expected protagonists must be found."""
        print("\n" + "="*70)
        print("测试 4: 角色提取")
        print("="*70)

        extractor = CharacterExtractor()
        characters = extractor.extract_main_characters(
            self.chunks,
            text_sample=self.sample_text[:1000],
            language="en"
        )

        print(f"\n提取到 {len(characters)} 个主要角色:")
        for i, char in enumerate(characters[:5], 1):
            print(f" {i}. {char['name']}: 出现 {char['info']['count']} 次")

        self.assertGreater(len(characters), 0)

        # The known protagonists must appear among the extracted names
        # (substring match tolerates full names like "Harry Potter").
        char_names = [c['name'] for c in characters]
        expected_chars = ['Harry', 'Hermione', 'Ron']

        for expected in expected_chars:
            found = any(expected in name for name in char_names)
            if found:
                print(f"✓ 找到角色: {expected}")
            self.assertTrue(found, f"应该找到角色: {expected}")

    def test_05_character_analysis(self):
        """CharacterAnalyzer: profile generation for the top character."""
        print("\n" + "="*70)
        print("测试 5: 角色性格分析")
        print("="*70)

        extractor = CharacterExtractor()
        characters = extractor.extract_main_characters(
            self.chunks,
            language="en"
        )

        if not characters:
            self.skipTest("没有提取到角色")

        # Analyze the top-ranked character only (cost control).
        character = characters[0]
        print(f"\n分析角色: {character['name']}")

        analyzer = CharacterAnalyzer()
        representative_chunks = analyzer.select_representative_chunks(
            self.chunks,
            character['info']['chunks'][:10]
        )

        print(f"选择了 {len(representative_chunks)} 个代表性片段")

        profile = analyzer.analyze_character_batch(
            character['name'],
            representative_chunks
        )

        print(f"\n角色档案:")
        print(f" 名字: {profile['name']}")
        print(f" 核心特质: {profile.get('core_traits', [])}")
        print(f" 说话风格: {profile.get('speaking_style', 'N/A')}")
        print(f" 性格总结: {profile.get('personality_summary', 'N/A')[:100]}...")

        self.assertEqual(profile['name'], character['name'])
        self.assertIn('core_traits', profile)
        self.assertIn('speaking_style', profile)

    def test_06_agent_creation(self):
        """CharacterAgent: construction from a hand-written profile."""
        print("\n" + "="*70)
        print("测试 6: 创建对话Agent")
        print("="*70)

        test_profile = {
            'name': 'Harry Potter',
            'core_traits': ['勇敢', '善良', '忠诚', '谦逊'],
            'speaking_style': '谦逊、真诚,有时会感到困惑',
            'behavior_patterns': '面对困难时表现出勇气',
            'values': '友谊、正义、牺牲精神',
            'emotional_style': '内敛但情感丰富',
            'relationship_style': '忠诚于朋友',
            'background': '年轻的巫师,霍格沃茨学生',
            'personality_summary': 'Harry是一个勇敢善良的年轻巫师',
            'key_quotes': ['I don\'t understand', 'What do you mean?']
        }

        agent = CharacterAgent(test_profile)

        print("\n" + agent.get_character_info())

        self.assertIsNotNone(agent)
        self.assertEqual(agent.character_profile['name'], 'Harry Potter')
        self.assertIsNotNone(agent.system_prompt)

    def test_07_conversation(self):
        """CharacterAgent.chat: responses are non-empty strings, history grows."""
        print("\n" + "="*70)
        print("测试 7: 对话测试")
        print("="*70)

        test_profile = {
            'name': 'Hermione Granger',
            'core_traits': ['聪明', '好学', '正直', '有时完美主义'],
            'speaking_style': '自信、条理清晰,有时显得严肃',
            'behavior_patterns': '总是提前完成作业,遇到问题查阅书籍',
            'values': '知识就是力量,规则应该被遵守',
            'emotional_style': '理性为主,但也有感性的一面',
            'relationship_style': '对朋友忠诚,但不容忍懒惰',
            'background': '麻瓜出身的女巫,格兰芬多学生',
            'personality_summary': 'Hermione是个极其聪明、勤奋的学生',
            'key_quotes': ['Honestly!', 'You need to read more!', 'It\'s in Hogwarts: A History']
        }

        agent = CharacterAgent(test_profile)

        test_messages = [
            "Hi Hermione, how are you?",
            "Can you help me with my homework?",
            "Do you ever just relax?"
        ]

        # Memory retrieval disabled so only the base chat path is tested.
        for msg in test_messages:
            print(f"\n🧑 你: {msg}")
            response = agent.chat(msg, use_memory=False)
            print(f"🎭 Hermione: {response}")

            self.assertIsInstance(response, str)
            self.assertGreater(len(response), 0)

        print(f"\n对话历史: {len(agent.conversation_history)} 条")
        self.assertGreater(len(agent.conversation_history), 0)

    def test_08_conversation_management(self):
        """CharacterAgent: reset clears history; save writes a JSON file."""
        print("\n" + "="*70)
        print("测试 8: 对话管理")
        print("="*70)

        test_profile = {
            'name': 'Ron Weasley',
            'core_traits': ['忠诚', '幽默', '勇敢'],
            'speaking_style': '随意、幽默',
            'personality_summary': 'Ron是Harry的忠实朋友',
            'key_quotes': ['Blimey!', 'Brilliant!']
        }

        agent = CharacterAgent(test_profile)

        # Build up some history.
        agent.chat("Hello Ron!")
        agent.chat("How's it going?")

        initial_count = len(agent.conversation_history)
        print(f"对话数量: {initial_count}")
        self.assertGreater(initial_count, 0)

        # reset_conversation must empty the history.
        agent.reset_conversation()
        print(f"重置后对话数量: {len(agent.conversation_history)}")
        self.assertEqual(len(agent.conversation_history), 0)

        # save_conversation must create the target file.
        agent.chat("Test message")
        test_file = "test_conversation.json"
        agent.save_conversation(test_file)

        self.assertTrue(os.path.exists(test_file))

        # Clean up the file written by the test.
        if os.path.exists(test_file):
            os.remove(test_file)
|
| 298 |
+
|
| 299 |
+
def run_quick_demo():
    """Scripted demo: a fixed conversation with a hand-written Hermione profile.

    Steps through a list of canned questions, pausing for Enter between
    turns; memory retrieval is disabled for speed.
    """
    print("\n" + "="*70)
    print("🎭 快速演示:与 Hermione Granger 对话")
    print("="*70)

    profile = {
        'name': 'Hermione Granger',
        'core_traits': ['聪明绝顶', '勤奋好学', '正义感强', '有时完美主义'],
        'speaking_style': '自信、条理清晰,喜欢引用书本知识,有时显得严肃',
        'behavior_patterns': '总是提前完成作业,遇到问题首先想到查阅书籍',
        'values': '知识就是力量,规则应该被遵守',
        'emotional_style': '理性为主,但面对朋友会展现温暖的一面',
        'relationship_style': '对朋友忠诚,但不容忍懒惰和不负责任',
        'background': '麻瓜出身的年轻女巫,格兰芬多学院学生',
        'key_quotes': [
            'Honestly, don\'t you two read?',
            'It\'s LeviOsa, not LeviosA!',
            'When in doubt, go to the library.',
            'Fear of a name only increases fear of the thing itself.'
        ],
        'personality_summary': 'Hermione是一个极其聪明、勤奋的学生,对知识充满渴望。她有强烈的正义感,虽然有时显得有点自以为是,但内心深处对朋友极其忠诚。'
    }

    agent = CharacterAgent(profile)
    print(agent.get_character_info())

    demo_conversations = [
        "Hi Hermione! How's your day going?",
        "I'm really struggling with my Potions homework...",
        "You always seem so confident. Don't you ever feel stressed?",
        "What's your secret to being so good at everything?"
    ]

    print("\n💬 开始对话演示\n")
    print("提示: 这是自动演示模式,每个问题后会自动继续")
    print("-"*70 + "\n")

    for i, msg in enumerate(demo_conversations, 1):
        print(f"[{i}/{len(demo_conversations)}]")
        print(f"🧑 你: {msg}")
        # flush=True so the indicator appears before the blocking API call.
        print("⏳ Hermione 正在思考...", end='', flush=True)

        response = agent.chat(msg, use_memory=False)

        # \r overwrites the "thinking" indicator with the response.
        print(f"\r🎭 Hermione: {response}\n")

        # Pause between turns; any interrupt ends the demo early.
        if i < len(demo_conversations):
            try:
                input("[按 Enter 继续下一个问题...]")
            except:
                break

    print("\n" + "="*70)
    print("演示完成!")
    print("="*70)
|
| 355 |
+
|
| 356 |
+
def run_interactive_demo():
    """Interactive demo: free-form chat with a hand-written Harry profile.

    Reads user input in a loop; 'quit'/'exit'/'q' ends the session and
    'reset' clears the conversation history.  Memory retrieval is disabled.
    """
    print("\n" + "="*70)
    print("🎭 交互式演示:与 Harry Potter 对话")
    print("="*70)

    profile = {
        'name': 'Harry Potter',
        'core_traits': ['勇敢', '善良', '忠诚', '谦逊', '有正义感'],
        'speaking_style': '真诚、谦逊,说话时常带有不确定感',
        'behavior_patterns': '面对危险时挺身而出,但对赞美感到不自在',
        'values': '友谊高于一切,愿意为朋友牺牲',
        'emotional_style': '内敛但情感深沉,容易被朋友的遭遇触动',
        'relationship_style': '对朋友极度忠诚,珍视每一段友谊',
        'background': '失去父母的年轻巫师,被预言为击败伏地魔的人',
        'key_quotes': [
            'I don\'t know what you\'re talking about.',
            'We can figure this out together.',
            'I had a lot of help.',
            'It\'s not about being brave, it\'s about doing what\'s right.'
        ],
        'personality_summary': 'Harry是一个勇敢善良但又谦逊的年轻巫师。虽然承担着巨大的使命,但他从不认为自己特殊。他最看重的是友谊,愿意为朋友付出一切。'
    }

    agent = CharacterAgent(profile)
    print(agent.get_character_info())

    print("\n💬 开始交���式对话")
    print("-"*70)
    print("提示:")
    print(" • 输入你的问题开始对话")
    print(" • 输入 'quit' 退出")
    print(" • 输入 'reset' 重置对话")
    print("="*70 + "\n")

    while True:
        try:
            user_input = input("🧑 你: ").strip()

            # Blank input: re-prompt.
            if not user_input:
                continue

            if user_input.lower() in ['quit', 'exit', 'q']:
                print(f"\n🎭 Harry: Goodbye! It was nice talking to you.\n")
                break

            if user_input.lower() == 'reset':
                agent.reset_conversation()
                continue

            # Show a transient "thinking" indicator, then overwrite it.
            print("⏳ Harry 正在思考...", end='', flush=True)
            response = agent.chat(user_input, use_memory=False)
            print(f"\r🎭 Harry: {response}\n")

        except KeyboardInterrupt:
            print(f"\n\n🎭 Harry: Goodbye!\n")
            break
        except Exception as e:
            # Keep the demo alive on transient (e.g. API) errors.
            print(f"\n❌ 错误: {e}\n")
|
| 415 |
+
|
| 416 |
+
if __name__ == "__main__":
    import sys

    # Dispatch on the optional CLI argument; no argument runs the test suite.
    mode = sys.argv[1] if len(sys.argv) > 1 else None

    if mode is None:
        unittest.main(verbosity=2)
    elif mode == "demo":
        run_quick_demo()
    elif mode == "interactive":
        run_interactive_demo()
    else:
        print("用法:")
        print(" python test_agent.py - 运行所有测试")
        print(" python test_agent.py demo - 快速演示")
        print(" python test_agent.py interactive - 交互式演示")
|
tools/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# 工具模块
|
tools/batch_analyze.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""批量分析多个角色的工具脚本"""
|
| 2 |
+
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 6 |
+
|
| 7 |
+
from core import TextProcessor, CharacterExtractor, CharacterAnalyzer
|
| 8 |
+
from utils import CacheManager
|
| 9 |
+
import json
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
|
| 12 |
+
def batch_analyze(novel_path: str, output_dir: str = "character_profiles",
                  max_characters: int = 10):
    """Analyze all main characters of a novel in one batch run.

    Loads the novel, chunks it, extracts candidate characters, analyzes up
    to ``max_characters`` of them, and writes one JSON profile per
    character plus a combined JSON file and a text report.

    Args:
        novel_path: Path to the UTF-8 novel text file.
        output_dir: Directory receiving the JSON profiles and report.
        max_characters: Upper bound on how many characters are analyzed.
    """

    print("="*70)
    print("📚 批量角色分析工具")
    print("="*70)

    # 1. Load the novel text.
    print(f"\n📖 加载小说: {novel_path}")
    try:
        with open(novel_path, 'r', encoding='utf-8') as f:
            novel = f.read()
    except (OSError, UnicodeDecodeError):
        # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
        # are not silently swallowed; covers missing files and bad encodings.
        print(f"❌ 无法加载文件: {novel_path}")
        return

    print(f"✓ 已加载 {len(novel):,} 个字符")

    # 2. Chunk the text and gather basic statistics (language detection).
    print("\n📄 处理文本...")
    processor = TextProcessor()
    chunks = processor.chunk_text(novel)
    stats = processor.get_statistics(novel)

    print(f"✓ 文本已分为 {len(chunks)} 个块")
    print(f"✓ 检测语言: {stats['language']}")

    # 3. Extract candidate main characters (a short text sample helps the
    #    extractor disambiguate names).
    print("\n👥 提取角色...")
    extractor = CharacterExtractor()
    characters = extractor.extract_main_characters(
        chunks,
        text_sample=novel[:3000],
        language=stats['language']
    )

    if not characters:
        print("❌ 未找到角色")
        return

    print(f"✓ 找到 {len(characters)} 个主要角色")

    os.makedirs(output_dir, exist_ok=True)

    # 4. Analyze each character, capped at max_characters.
    print(f"\n🧠 开始分析角色 (最多 {max_characters} 个)...")
    analyzer = CharacterAnalyzer()

    all_profiles = []
    analyze_count = min(max_characters, len(characters))

    # No index needed here — the previous ``enumerate`` counter was unused.
    for char in tqdm(characters[:analyze_count], desc="分析进度"):
        char_name = char['name']

        try:
            # Pick the chunks that best represent this character.
            representative_chunks = analyzer.select_representative_chunks(
                chunks,
                char['info']['chunks']
            )

            profile = analyzer.analyze_character_batch(
                char_name,
                representative_chunks
            )

            # Enrich the profile with concrete quotes/examples.
            profile = analyzer.enhance_profile_with_examples(
                profile,
                chunks,
                char['info']['chunks']
            )

            all_profiles.append(profile)

            # Persist the individual profile; sanitize the filename so
            # spaces and path separators cannot break the path.
            char_filename = f"{profile['name'].replace(' ', '_').replace('/', '_')}.json"
            char_file = os.path.join(output_dir, char_filename)

            with open(char_file, 'w', encoding='utf-8') as f:
                json.dump(profile, f, ensure_ascii=False, indent=2)

        except Exception as e:
            # Best-effort batch: one failing character must not abort the run.
            print(f"\n❌ 分析 {char_name} 失败: {e}")
            continue

    # 5. Write the combined profile file.
    all_file = os.path.join(output_dir, "all_characters.json")
    with open(all_file, 'w', encoding='utf-8') as f:
        json.dump(all_profiles, f, ensure_ascii=False, indent=2)

    # 6. Write a human-readable summary report.
    report_file = os.path.join(output_dir, "analysis_report.txt")
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("="*70 + "\n")
        f.write("角色分析报告\n")
        f.write("="*70 + "\n\n")
        f.write(f"小说文件: {novel_path}\n")
        f.write(f"文本长度: {len(novel):,} 字符\n")
        f.write(f"分析角色数: {len(all_profiles)}\n\n")
        f.write("-"*70 + "\n\n")

        for i, profile in enumerate(all_profiles, 1):
            f.write(f"{i}. {profile['name']}\n")
            f.write(f" 核心特质: {', '.join(profile.get('core_traits', []))}\n")
            f.write(f" 性格总结: {profile.get('personality_summary', 'N/A')}\n")
            f.write("\n")

    # Final summary for the operator.
    print("\n" + "="*70)
    print("✅ 分析完成!")
    print("="*70)
    print(f"📁 输出目录: {output_dir}")
    print(f"📊 分析角色数: {len(all_profiles)}")
    print(f"📄 汇总文件: {all_file}")
    print(f"📋 报告文件: {report_file}")
    print("="*70)
|
| 138 |
+
|
| 139 |
+
def main():
    """Command-line entry point: parse arguments, then run the batch job."""
    import argparse

    usage_epilog = """
示例:
  python batch_analyze.py novel.txt
  python batch_analyze.py novel.txt -o my_characters -n 15
"""
    cli = argparse.ArgumentParser(
        description="批量分析小说角色",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_epilog,
    )

    cli.add_argument("novel_path", help="小说文件路径")
    cli.add_argument(
        "-o", "--output",
        default="character_profiles",
        help="输出目录 (默认: character_profiles)",
    )
    cli.add_argument(
        "-n", "--num",
        type=int,
        default=10,
        help="最多分析的角色数 (默认: 10)",
    )

    options = cli.parse_args()
    batch_analyze(options.novel_path, options.output, options.num)

if __name__ == "__main__":
    main()
|
tools/clear_cache.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""清理缓存工具"""
|
| 2 |
+
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 6 |
+
|
| 7 |
+
from utils import CacheManager
|
| 8 |
+
from config import Config
|
| 9 |
+
|
| 10 |
+
def clear_cache(pattern=None, confirm=True):
    """Delete cached ``.pkl`` files, optionally filtered by a name pattern.

    Args:
        pattern: Substring matched against cache file names; ``None``
            clears every cache file.
        confirm: When True, ask the user interactively before deleting.
    """

    cache = CacheManager()
    info = cache.get_cache_info()

    print("="*70)
    print("🗑️ 缓存清理工具")
    print("="*70)
    # f-prefixes removed from placeholder-free strings (ruff F541);
    # output is byte-identical.
    print("\n当前缓存状态:")
    print(f" 文件数量: {info['count']}")
    print(f" 占用空间: {info['size_mb']} MB")
    print(f" 缓存目录: {Config.CACHE_DIR}")

    if info['count'] == 0:
        print("\n✓ 缓存为空,无需清理")
        return

    if pattern:
        print(f"\n将清理匹配 '{pattern}' 的缓存")
    else:
        print("\n⚠️ 将清理所有缓存文件")

    if confirm:
        # Accept common affirmative answers in English and Chinese.
        response = input("\n确认清理?(y/n): ").strip().lower()
        if response not in ['y', 'yes', '是']:
            print("已取消")
            return

    print("\n清理中...")
    cache.clear(pattern)

    # Show the post-cleanup state so the user sees the effect.
    new_info = cache.get_cache_info()
    print("\n✓ 清理完成")
    print(f" 剩余文件: {new_info['count']}")
    print(f" 释放空间: {info['size_mb'] - new_info['size_mb']:.2f} MB")
|
| 52 |
+
|
| 53 |
+
def main():
    """CLI wrapper around ``clear_cache``."""
    import argparse

    arg_parser = argparse.ArgumentParser(description="清理缓存")
    arg_parser.add_argument("-p", "--pattern", help="匹配模式")
    arg_parser.add_argument(
        "-y", "--yes",
        action="store_true",
        help="不询问直接清理",
    )

    opts = arg_parser.parse_args()

    # ``--yes`` suppresses the interactive confirmation prompt.
    clear_cache(opts.pattern, not opts.yes)

if __name__ == "__main__":
    main()
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .text_utils import TextUtils
|
| 2 |
+
from .cache_manager import CacheManager
|
| 3 |
+
|
| 4 |
+
__all__ = ['TextUtils', 'CacheManager']
|
utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (264 Bytes). View file
|
|
|
utils/__pycache__/cache_manager.cpython-310.pyc
ADDED
|
Binary file (5.03 kB). View file
|
|
|
utils/__pycache__/text_utils.cpython-310.pyc
ADDED
|
Binary file (4.55 kB). View file
|
|
|
utils/cache_manager.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import hashlib
|
| 4 |
+
import pickle
|
| 5 |
+
from typing import Any, Optional, Dict # 添加 Dict
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from config import Config
|
| 8 |
+
|
| 9 |
+
class CacheManager:
    """Pickle-based file cache rooted at ``Config.CACHE_DIR``.

    Each entry is stored as ``<key>.pkl``; ``save_json``/``load_json``
    additionally support human-readable ``<key>.json`` mirrors.  Pickle
    reads/writes honour the ``Config.ENABLE_CACHE`` switch.
    """

    def __init__(self, cache_dir: Optional[str] = None):
        """Create the manager and ensure the cache directory exists.

        Args:
            cache_dir: Directory for cache files; falls back to
                ``Config.CACHE_DIR`` when None.  (Annotation fixed from the
                implicit-Optional ``str = None``, invalid under PEP 484.)
        """
        self.cache_dir = cache_dir or Config.CACHE_DIR
        Path(self.cache_dir).mkdir(parents=True, exist_ok=True)

    def _get_cache_key(self, *args, **kwargs) -> str:
        """Build a deterministic MD5 key from arbitrary call arguments."""
        # Sorting kwargs makes the key independent of keyword order.
        content = str(args) + str(sorted(kwargs.items()))
        return hashlib.md5(content.encode()).hexdigest()

    def _get_cache_path(self, key: str) -> str:
        """Return the path of the pickle file backing *key*."""
        return os.path.join(self.cache_dir, f"{key}.pkl")

    def get(self, key: str) -> Optional[Any]:
        """Return the cached value for *key*; None on miss, error or
        when caching is disabled."""
        if not Config.ENABLE_CACHE:
            return None

        cache_file = self._get_cache_path(key)
        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'rb') as f:
                    # NOTE: pickle is only safe because this directory is
                    # written by this application itself; never point it at
                    # untrusted files.
                    return pickle.load(f)
            except Exception as e:
                print(f"读取缓存失败 ({key}): {e}")
                return None
        return None

    def set(self, key: str, value: Any) -> bool:
        """Store *value* under *key*; return True on success."""
        if not Config.ENABLE_CACHE:
            return False

        cache_file = self._get_cache_path(key)
        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(value, f)
            return True
        except Exception as e:
            print(f"缓存保存失败 ({key}): {e}")
            return False

    def exists(self, key: str) -> bool:
        """True when caching is enabled and *key* has a stored entry."""
        if not Config.ENABLE_CACHE:
            return False
        return os.path.exists(self._get_cache_path(key))

    def delete(self, key: str) -> bool:
        """Remove the entry for *key*; True only if a file was deleted."""
        cache_file = self._get_cache_path(key)
        if os.path.exists(cache_file):
            try:
                os.remove(cache_file)
                return True
            except Exception as e:
                print(f"删除缓存失败 ({key}): {e}")
                return False
        return False

    def clear(self, pattern: Optional[str] = None):
        """Delete all ``.pkl`` entries, or only those whose file name
        contains *pattern*, and report how many were removed."""
        if not os.path.exists(self.cache_dir):
            return

        count = 0
        for file in os.listdir(self.cache_dir):
            if file.endswith('.pkl'):
                if pattern is None or pattern in file:
                    try:
                        os.remove(os.path.join(self.cache_dir, file))
                        count += 1
                    except Exception as e:
                        print(f"删除缓存文件失败 ({file}): {e}")

        print(f"已清除 {count} 个缓存文件")

    def get_cache_size(self) -> int:
        """Total size of all ``.pkl`` entries, in bytes."""
        if not os.path.exists(self.cache_dir):
            return 0

        total_size = 0
        for file in os.listdir(self.cache_dir):
            if file.endswith('.pkl'):
                file_path = os.path.join(self.cache_dir, file)
                total_size += os.path.getsize(file_path)

        return total_size

    def get_cache_info(self) -> dict:
        """Return ``{'count', 'size', 'size_mb'}`` for the ``.pkl`` entries."""
        if not os.path.exists(self.cache_dir):
            return {
                'count': 0,
                'size': 0,
                'size_mb': 0
            }

        count = 0
        total_size = 0

        for file in os.listdir(self.cache_dir):
            if file.endswith('.pkl'):
                count += 1
                file_path = os.path.join(self.cache_dir, file)
                total_size += os.path.getsize(file_path)

        return {
            'count': count,
            'size': total_size,
            'size_mb': round(total_size / (1024 * 1024), 2)
        }

    def save_json(self, key: str, data: dict) -> bool:
        """Write *data* as pretty-printed UTF-8 JSON under ``<key>.json``
        (human-readable mirror; not subject to ENABLE_CACHE)."""
        cache_file = os.path.join(self.cache_dir, f"{key}.json")
        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            return True
        except Exception as e:
            print(f"保存JSON缓存失败 ({key}): {e}")
            return False

    def load_json(self, key: str) -> Optional[dict]:
        """Read the ``<key>.json`` mirror; None when missing or unreadable."""
        cache_file = os.path.join(self.cache_dir, f"{key}.json")
        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                print(f"读取JSON缓存失败 ({key}): {e}")
                return None
        return None
|
utils/text_utils.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import tiktoken
|
| 3 |
+
from typing import List, Tuple, Dict # 添加 Dict
|
| 4 |
+
|
| 5 |
+
class TextUtils:
    """Static text helpers: token counting, cleaning, sentence splitting,
    language detection, dialogue extraction, naive keyword extraction and
    reading-time estimation for mixed Chinese/English text."""

    @staticmethod
    def count_tokens(text: str, model: str = "gpt-4") -> int:
        """Count tokens with tiktoken; fall back to a heuristic on failure."""
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
            # are not swallowed. Conservative fallback: ~4 English chars per
            # token, ~1.5 Chinese chars per token.
            chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
            english_chars = len(text) - chinese_chars
            return int(chinese_chars / 1.5 + english_chars / 4)

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse whitespace and drop unusual characters, keeping basic
        punctuation, CJK bracket quotes and Chinese characters."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s,.!?;:\'\"()\-—《》「」『』【】\u4e00-\u9fff]', '', text)
        return text.strip()

    @staticmethod
    def split_into_sentences(text: str) -> List[str]:
        """Split on Chinese and English terminal punctuation; empty
        fragments are discarded."""
        sentences = re.split(r'[.!?。!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def detect_language(text: str) -> str:
        """Classify text as 'zh', 'en', 'mixed' or 'unknown' by the share of
        Chinese characters among Chinese + Latin characters."""
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))

        total_chars = chinese_chars + english_chars

        if total_chars == 0:
            # No Chinese or Latin letters at all (digits/punctuation only).
            return "unknown"

        chinese_ratio = chinese_chars / total_chars

        if chinese_ratio > 0.3:
            return "zh"
        elif chinese_ratio < 0.1:
            return "en"
        else:
            return "mixed"

    @staticmethod
    def extract_dialogues(text: str, language: str = "en") -> List[Dict]:
        """Extract quoted dialogue spans with optional speaker attribution.

        Returns dicts with ``content`` (quoted text), ``attribution``
        (trailing speech-verb phrase when matched) and ``position``
        (character offset of the match).
        """
        dialogues = []

        if language == "zh":
            # Chinese quoting styles, with an optional trailing speech verb.
            patterns = [
                r'"([^"]+)"[,,]?\s*([^说道讲告诉问答叫喊]*(?:说|道|讲|告诉|问|答|叫|喊))',
                r'"([^"]+)"',
                r'「([^」]+)」',
                r'『([^』]+)』'
            ]
        else:
            # English quoting styles, with an optional trailing speech verb.
            patterns = [
                r'"([^"]+)",?\s*([^said]*(said|asked|replied|shouted|whispered|muttered))',
                r'"([^"]+)"',
                r"'([^']+)',?\s*([^said]*(said|asked|replied))",
                r"'([^']+)'"
            ]

        # NOTE(review): the broad and the attributed patterns can both match
        # the same quote, so duplicates are possible — kept as-is.
        for pattern in patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                dialogue = {
                    'content': match.group(1),
                    'attribution': match.group(2) if len(match.groups()) > 1 else '',
                    'position': match.start()
                }
                dialogues.append(dialogue)

        return dialogues

    @staticmethod
    def truncate_text(text: str, max_length: int,
                      ellipsis: str = "...") -> str:
        """Truncate *text* to *max_length* characters, ellipsis included."""
        if len(text) <= max_length:
            return text

        return text[:max_length - len(ellipsis)] + ellipsis

    @staticmethod
    def extract_keywords(text: str, top_n: int = 10) -> List[str]:
        """Return the *top_n* most frequent non-stop-words (naive TF)."""
        words = re.findall(r'\b\w+\b', text.lower())

        # Minimal English + Chinese stop-word list.
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有'
        }

        # NOTE(review): len(w) > 2 also drops one/two-character Chinese words.
        filtered_words = [w for w in words if w not in stop_words and len(w) > 2]

        from collections import Counter
        word_freq = Counter(filtered_words)

        return [word for word, freq in word_freq.most_common(top_n)]

    @staticmethod
    def estimate_reading_time(text: str, wpm: int = 200) -> int:
        """Estimate reading time in whole minutes (minimum 1)."""
        words = len(re.findall(r'\b\w+\b', text))
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))

        # Chinese at ~500 chars/min, English at *wpm* words/min.
        reading_time = chinese_chars / 500 + words / wpm

        return max(1, int(reading_time))
|