File size: 5,131 Bytes
a226682 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import re
import tiktoken
from typing import List, Tuple, Dict  # Dict is used by extract_dialogues
class TextUtils:
    """Utility helpers for processing mixed Chinese/English text.

    Provides token counting, cleaning, sentence splitting, language
    detection, dialogue extraction, simple keyword extraction and
    reading-time estimation. All methods are static; the class is a
    namespace only.
    """

    @staticmethod
    def count_tokens(text: str, model: str = "gpt-4") -> int:
        """Return the number of tokens in *text* for *model*.

        Uses tiktoken when available; otherwise falls back to a rough
        heuristic: ~4 English characters per token, ~1.5 Chinese
        characters per token (deliberately conservative).
        """
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # Fallback: tiktoken unavailable or the model name is unknown.
            chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
            english_chars = len(text) - chinese_chars
            return int(chinese_chars / 1.5 + english_chars / 4)

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse whitespace and strip unexpected special characters."""
        # Collapse any run of whitespace into a single space.
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters, keeping word characters, basic
        # punctuation, common CJK brackets/quotes and CJK ideographs.
        text = re.sub(r'[^\w\s,.!?;:\'\"()\-—《》「」『』【】\u4e00-\u9fff]', '', text)
        return text.strip()

    @staticmethod
    def split_into_sentences(text: str) -> List[str]:
        """Split *text* into sentences.

        Splits on English terminators (. ! ?) as well as their
        full-width Chinese counterparts (。!?); empty fragments are
        dropped and the rest are stripped.
        """
        sentences = re.split(r'[.!?。!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def detect_language(text: str) -> str:
        """Classify *text* as "zh", "en", "mixed" or "unknown".

        The decision is based on the ratio of CJK ideographs to Latin
        letters; digits, punctuation and other characters are ignored.
        """
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))
        total_chars = chinese_chars + english_chars
        if total_chars == 0:
            # No letters of either script to judge by.
            return "unknown"
        chinese_ratio = chinese_chars / total_chars
        if chinese_ratio > 0.3:
            return "zh"
        elif chinese_ratio < 0.1:
            return "en"
        else:
            return "mixed"

    @staticmethod
    def extract_dialogues(text: str, language: str = "en") -> List[Dict]:
        """Extract quoted dialogue snippets with optional attribution.

        Returns a list of dicts with keys 'content' (the quoted text),
        'attribution' (speaker phrase, '' when absent) and 'position'
        (match start offset). Patterns are tried from most to least
        specific; a quote captured by an attributed pattern is not
        re-reported by the plain-quote fallback.
        """
        dialogues = []
        seen_starts = set()  # avoid duplicate hits from overlapping patterns
        if language == "zh":
            # Chinese dialogue: quoted content, optionally followed by a
            # speaker phrase ending in a speech verb (说/道/讲/...).
            patterns = [
                r'"([^"]+)"[,,]?\s*([^说道讲告诉问答叫喊]*(?:说|道|讲|告诉|问|答|叫|喊))',
                r'"([^"]+)"',
                r'「([^」]+)」',
                r'『([^』]+)』'
            ]
        else:
            # English dialogue: quoted content, optionally followed by
            # e.g. ' he said' / ', John asked'. NOTE: the old pattern
            # used [^said]* — a character CLASS excluding s/a/i/d, which
            # broke on names containing those letters.
            patterns = [
                r'"([^"]+)",?\s*((?:\w+\s+)?(?:said|asked|replied|shouted|whispered|muttered))',
                r'"([^"]+)"',
                r"'([^']+)',?\s*((?:\w+\s+)?(?:said|asked|replied))",
                r"'([^']+)'"
            ]
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                if match.start() in seen_starts:
                    continue  # already captured by a more specific pattern
                seen_starts.add(match.start())
                dialogues.append({
                    'content': match.group(1),
                    'attribution': match.group(2) if len(match.groups()) > 1 else '',
                    'position': match.start()
                })
        return dialogues

    @staticmethod
    def truncate_text(text: str, max_length: int,
                     ellipsis: str = "...") -> str:
        """Truncate *text* to at most *max_length* characters.

        The ellipsis is appended only when truncation happens and there
        is room for it; for very small limits a plain hard cut is used.
        """
        if len(text) <= max_length:
            return text
        if max_length <= len(ellipsis):
            # No room for the ellipsis itself — hard cut instead of
            # producing a negative slice.
            return text[:max_length]
        return text[:max_length - len(ellipsis)] + ellipsis

    @staticmethod
    def extract_keywords(text: str, top_n: int = 10) -> List[str]:
        """Return the *top_n* most frequent non-stop-words (naive TF)."""
        from collections import Counter

        # Tokenize on word boundaries; note \w also matches CJK chars,
        # so runs of Chinese text come out as single "words".
        words = re.findall(r'\b\w+\b', text.lower())
        # Small built-in English + Chinese stop-word list.
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有'
        }
        # Drop stop words and very short tokens (len > 2 also filters
        # out single CJK characters).
        filtered_words = [w for w in words if w not in stop_words and len(w) > 2]
        word_freq = Counter(filtered_words)
        return [word for word, freq in word_freq.most_common(top_n)]

    @staticmethod
    def estimate_reading_time(text: str, wpm: int = 200) -> int:
        """Estimate reading time in whole minutes (minimum 1).

        English is counted as words at *wpm* words/minute; Chinese is
        counted as characters at a fixed 500 chars/minute.
        """
        words = len(re.findall(r'\b\w+\b', text))
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        reading_time = chinese_chars / 500 + words / wpm
        return max(1, int(reading_time))