|
|
import re |
|
|
import tiktoken |
|
|
from typing import List, Tuple, Dict |
|
|
|
|
|
class TextUtils:
    """Utility helpers for plain-text processing.

    Provides stateless static methods for token counting, cleaning,
    sentence splitting, language detection, dialogue extraction,
    truncation, keyword extraction and reading-time estimation.
    Designed for mixed Chinese/English text.
    """

    @staticmethod
    def count_tokens(text: str, model: str = "gpt-4") -> int:
        """Count the number of tokens in ``text``.

        Uses tiktoken's encoding for ``model`` when available; otherwise
        falls back to a character heuristic (~1.5 chars/token for Chinese,
        ~4 chars/token for everything else).

        Args:
            text: Text to measure.
            model: Model name passed to ``tiktoken.encoding_for_model``.

        Returns:
            Estimated token count (exact when tiktoken succeeds).
        """
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # Fallback when tiktoken is missing or the model name is
            # unknown. Narrowed from a bare `except:` so SystemExit /
            # KeyboardInterrupt are no longer silently swallowed.
            chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
            english_chars = len(text) - chinese_chars
            return int(chinese_chars / 1.5 + english_chars / 4)

    @staticmethod
    def clean_text(text: str) -> str:
        """Normalize whitespace and strip disallowed characters.

        Collapses runs of whitespace to a single space, removes any
        character that is not a word character, common punctuation, CJK
        punctuation brackets, or a CJK ideograph, then trims the ends.

        Args:
            text: Raw text.

        Returns:
            Cleaned text.
        """
        # Collapse all whitespace runs (including newlines) to one space.
        text = re.sub(r'\s+', ' ', text)
        # Keep word chars, common ASCII punctuation, CJK quote/bracket
        # pairs, and CJK ideographs; drop everything else.
        text = re.sub(r'[^\w\s,.!?;:\'\"()\-—《》「」『』【】\u4e00-\u9fff]', '', text)
        return text.strip()

    @staticmethod
    def split_into_sentences(text: str) -> List[str]:
        """Split ``text`` into sentences on ASCII/CJK terminators.

        Splits on runs of ``. ! ? 。 ! ?``; terminators are consumed
        and not retained in the output.

        Args:
            text: Text to split.

        Returns:
            Non-empty, stripped sentence fragments in document order.
        """
        sentences = re.split(r'[.!?。!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def detect_language(text: str) -> str:
        """Detect the dominant language of ``text``.

        Counts CJK ideographs vs ASCII letters and classifies by the
        Chinese ratio: > 0.3 → "zh", < 0.1 → "en", otherwise "mixed".

        Args:
            text: Text to classify.

        Returns:
            One of ``"zh"``, ``"en"``, ``"mixed"``, or ``"unknown"``
            when the text contains no letters of either kind.
        """
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))
        total_chars = chinese_chars + english_chars
        if total_chars == 0:
            return "unknown"
        chinese_ratio = chinese_chars / total_chars
        if chinese_ratio > 0.3:
            return "zh"
        elif chinese_ratio < 0.1:
            return "en"
        else:
            return "mixed"

    @staticmethod
    def extract_dialogues(text: str, language: str = "en") -> List[Dict]:
        """Extract quoted dialogue spans from ``text``.

        Applies a list of language-specific regex patterns; because the
        patterns overlap (attribution form plus bare-quote form), the
        same quote may appear more than once in the result.

        Args:
            text: Text to scan.
            language: ``"zh"`` for Chinese patterns, anything else for
                English patterns.

        Returns:
            List of dicts with keys ``content`` (quoted text),
            ``attribution`` (speaker phrase, often empty) and
            ``position`` (match start offset).
        """
        dialogues = []
        if language == "zh":
            # Full-width quotes plus CJK corner brackets; attribution
            # pattern looks for a trailing speech verb (说/道/...).
            patterns = [
                r'"([^"]+)"[,,]?\s*([^说道讲告诉问答叫喊]*(?:说|道|讲|告诉|问|答|叫|喊))',
                r'"([^"]+)"',
                r'「([^」]+)」',
                r'『([^』]+)』'
            ]
        else:
            # NOTE(review): `[^said]*` is a character CLASS excluding the
            # letters s/a/i/d — not "anything but the word 'said'" — so
            # the attribution patterns rarely match speaker names that
            # contain those letters. Preserved as-is for behavioral
            # compatibility; confirm intent before tightening.
            patterns = [
                r'"([^"]+)",?\s*([^said]*(said|asked|replied|shouted|whispered|muttered))',
                r'"([^"]+)"',
                r"'([^']+)',?\s*([^said]*(said|asked|replied))",
                r"'([^']+)'"
            ]
        for pattern in patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                dialogue = {
                    'content': match.group(1),
                    # Bare-quote patterns have a single group: no speaker.
                    'attribution': match.group(2) if len(match.groups()) > 1 else '',
                    'position': match.start()
                }
                dialogues.append(dialogue)
        return dialogues

    @staticmethod
    def truncate_text(text: str, max_length: int,
                      ellipsis: str = "...") -> str:
        """Truncate ``text`` to at most ``max_length`` characters.

        Appends ``ellipsis`` when truncation occurs; the result
        (including the ellipsis) never exceeds ``max_length``.

        Args:
            text: Text to truncate.
            max_length: Maximum length of the returned string.
            ellipsis: Suffix marking truncation.

        Returns:
            The original text if it fits, otherwise a truncated copy.
        """
        if len(text) <= max_length:
            return text
        # Guard the degenerate case: previously max_length < len(ellipsis)
        # produced a NEGATIVE slice bound and a result LONGER than
        # max_length (e.g. truncate_text("hello", 2) -> "hell...").
        if max_length <= len(ellipsis):
            return ellipsis[:max_length]
        return text[:max_length - len(ellipsis)] + ellipsis

    @staticmethod
    def extract_keywords(text: str, top_n: int = 10) -> List[str]:
        """Extract the most frequent keywords (naive frequency approach).

        Lower-cases the text, tokenizes on word boundaries, drops stop
        words and tokens of length <= 2, then ranks by frequency.

        Args:
            text: Text to analyze.
            top_n: Maximum number of keywords to return.

        Returns:
            Up to ``top_n`` keywords, most frequent first.
        """
        words = re.findall(r'\b\w+\b', text.lower())
        # Mixed English/Chinese stop-word list. NOTE(review): the
        # len(w) > 2 filter below already removes every 1–2 character
        # token, which makes the short Chinese entries redundant —
        # confirm whether short Chinese keywords should be kept.
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有'
        }
        filtered_words = [w for w in words if w not in stop_words and len(w) > 2]
        from collections import Counter
        word_freq = Counter(filtered_words)
        return [word for word, freq in word_freq.most_common(top_n)]

    @staticmethod
    def estimate_reading_time(text: str, wpm: int = 200) -> int:
        """Estimate reading time in whole minutes (minimum 1).

        Assumes ~500 Chinese characters per minute and ``wpm`` words
        per minute for word-boundary tokens.

        Args:
            text: Text to estimate.
            wpm: Words-per-minute reading speed for non-CJK text.

        Returns:
            Estimated minutes, at least 1 for any input.
        """
        words = len(re.findall(r'\b\w+\b', text))
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        reading_time = chinese_chars / 500 + words / wpm
        return max(1, int(reading_time))