File size: 5,131 Bytes
a226682 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import re
import tiktoken
from typing import List, Tuple, Dict  # Dict is used by extract_dialogues
class TextUtils:
    """Utility helpers for processing mixed Chinese/English text.

    Provides token counting, cleaning, sentence splitting, language
    detection, dialogue extraction, simple keyword extraction and
    reading-time estimation. All methods are static; the class is a
    namespace only.
    """

    @staticmethod
    def count_tokens(text: str, model: str = "gpt-4") -> int:
        """Return the number of tokens in *text* for *model*.

        Uses tiktoken when available; otherwise falls back to a rough
        heuristic: ~4 English characters per token, ~1.5 Chinese
        characters per token (deliberately conservative).
        """
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # Fallback: tiktoken unavailable or the model name is unknown.
            chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
            english_chars = len(text) - chinese_chars
            return int(chinese_chars / 1.5 + english_chars / 4)

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse whitespace and strip unexpected special characters."""
        # Collapse any run of whitespace into a single space.
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters, keeping word characters, basic
        # punctuation, common CJK brackets/quotes and CJK ideographs.
        text = re.sub(r'[^\w\s,.!?;:\'\"()\-—《》「」『』【】\u4e00-\u9fff]', '', text)
        return text.strip()

    @staticmethod
    def split_into_sentences(text: str) -> List[str]:
        """Split *text* into sentences.

        Splits on English terminators (. ! ?) as well as their
        full-width Chinese counterparts (。!?); empty fragments are
        dropped and the rest are stripped.
        """
        sentences = re.split(r'[.!?。!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def detect_language(text: str) -> str:
        """Classify *text* as "zh", "en", "mixed" or "unknown".

        The decision is based on the ratio of CJK ideographs to Latin
        letters; digits, punctuation and other characters are ignored.
        """
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))
        total_chars = chinese_chars + english_chars
        if total_chars == 0:
            # No letters of either script to judge by.
            return "unknown"
        chinese_ratio = chinese_chars / total_chars
        if chinese_ratio > 0.3:
            return "zh"
        elif chinese_ratio < 0.1:
            return "en"
        else:
            return "mixed"

    @staticmethod
    def extract_dialogues(text: str, language: str = "en") -> List[Dict]:
        """Extract quoted dialogue snippets with optional attribution.

        Returns a list of dicts with keys 'content' (the quoted text),
        'attribution' (speaker phrase, '' when absent) and 'position'
        (match start offset). Patterns are tried from most to least
        specific; a quote captured by an attributed pattern is not
        re-reported by the plain-quote fallback.
        """
        dialogues = []
        seen_starts = set()  # avoid duplicate hits from overlapping patterns
        if language == "zh":
            # Chinese dialogue: quoted content, optionally followed by a
            # speaker phrase ending in a speech verb (说/道/讲/...).
            patterns = [
                r'"([^"]+)"[,,]?\s*([^说道讲告诉问答叫喊]*(?:说|道|讲|告诉|问|答|叫|喊))',
                r'"([^"]+)"',
                r'「([^」]+)」',
                r'『([^』]+)』'
            ]
        else:
            # English dialogue: quoted content, optionally followed by
            # e.g. ' he said' / ', John asked'. NOTE: the old pattern
            # used [^said]* — a character CLASS excluding s/a/i/d, which
            # broke on names containing those letters.
            patterns = [
                r'"([^"]+)",?\s*((?:\w+\s+)?(?:said|asked|replied|shouted|whispered|muttered))',
                r'"([^"]+)"',
                r"'([^']+)',?\s*((?:\w+\s+)?(?:said|asked|replied))",
                r"'([^']+)'"
            ]
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                if match.start() in seen_starts:
                    continue  # already captured by a more specific pattern
                seen_starts.add(match.start())
                dialogues.append({
                    'content': match.group(1),
                    'attribution': match.group(2) if len(match.groups()) > 1 else '',
                    'position': match.start()
                })
        return dialogues

    @staticmethod
    def truncate_text(text: str, max_length: int,
                     ellipsis: str = "...") -> str:
        """Truncate *text* to at most *max_length* characters.

        The ellipsis is appended only when truncation happens and there
        is room for it; for very small limits a plain hard cut is used.
        """
        if len(text) <= max_length:
            return text
        if max_length <= len(ellipsis):
            # No room for the ellipsis itself — hard cut instead of
            # producing a negative slice.
            return text[:max_length]
        return text[:max_length - len(ellipsis)] + ellipsis

    @staticmethod
    def extract_keywords(text: str, top_n: int = 10) -> List[str]:
        """Return the *top_n* most frequent non-stop-words (naive TF)."""
        from collections import Counter

        # Tokenize on word boundaries; note \w also matches CJK chars,
        # so runs of Chinese text come out as single "words".
        words = re.findall(r'\b\w+\b', text.lower())
        # Small built-in English + Chinese stop-word list.
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有'
        }
        # Drop stop words and very short tokens (len > 2 also filters
        # out single CJK characters).
        filtered_words = [w for w in words if w not in stop_words and len(w) > 2]
        word_freq = Counter(filtered_words)
        return [word for word, freq in word_freq.most_common(top_n)]

    @staticmethod
    def estimate_reading_time(text: str, wpm: int = 200) -> int:
        """Estimate reading time in whole minutes (minimum 1).

        English is counted as words at *wpm* words/minute; Chinese is
        counted as characters at a fixed 500 chars/minute.
        """
        words = len(re.findall(r'\b\w+\b', text))
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        reading_time = chinese_chars / 500 + words / wpm
        return max(1, int(reading_time))