Spaces:
Sleeping
Sleeping
File size: 16,768 Bytes
0bbe763 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 |
"""
內容規劃模組
Content Planning Module
此模組提供智能內容分析、主題提取、對話結構規劃等功能。
幫助生成更連貫、有組織的長篇對話內容。
"""
import logging
import re
from collections import Counter
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
@dataclass
class ContentSegment:
    """A single planned piece of content.

    Attributes:
        title: Short heading describing the segment.
        content: Raw text of the segment.
        keywords: Keywords that characterise this segment.
        estimated_length: Estimated length, measured in dialogue rounds.
        priority: Importance rating from 1 to 10 (10 = most important).
    """
    title: str
    content: str
    keywords: List[str]
    estimated_length: int
    priority: int  # 1-10, 10 = most important
@dataclass
class ContentOutline:
    """Overall outline of the content to be generated.

    Attributes:
        main_topic: Dominant topic detected in the source text.
        segments: Ordered content segments that make up the outline.
        total_estimated_length: Sum of segment lengths (dialogue rounds).
        suggested_parts: Recommended number of generation parts.
    """
    main_topic: str
    segments: List[ContentSegment]
    total_estimated_length: int
    suggested_parts: int
class ContentAnalyzer:
    """Content analyzer.

    Extracts keywords, identifies topics, inspects document structure and
    estimates how many dialogue rounds a piece of text can support.
    """

    def __init__(self):
        # Common Chinese stopwords excluded from keyword extraction.
        self.stopwords = {
            '的', '了', '和', '是', '在', '有', '這', '個', '一', '我', '你', '他',
            '她', '它', '們', '我們', '你們', '他們', '也', '都', '很', '更', '最',
            '可以', '能夠', '應該', '需要', '必須', '會', '將', '要', '來', '去',
            '說', '講', '談', '看', '聽', '想', '覺得', '認為', '以為', '知道'
        }

    def analyze_content(self, text: str) -> Dict[str, Any]:
        """Analyze the text and extract key information.

        Args:
            text: Input text.

        Returns:
            Dict with character/paragraph counts, top-20 keywords, main
            topics, structural statistics, an estimated dialogue round
            count, and a complexity score in [0, 100].
        """
        logger.info("開始分析文本內容")
        # Basic statistics; note word_count is the raw character count.
        word_count = len(text)
        paragraph_count = len([p for p in text.split('\n\n') if p.strip()])
        keywords = self._extract_keywords(text)
        main_topics = self._identify_main_topics(text, keywords)
        structure = self._analyze_structure(text)
        estimated_rounds = self._estimate_dialogue_rounds(word_count)
        result = {
            'word_count': word_count,
            'paragraph_count': paragraph_count,
            'keywords': keywords[:20],  # keep only the top 20 keywords
            'main_topics': main_topics,
            'structure': structure,
            'estimated_rounds': estimated_rounds,
            'complexity_score': self._calculate_complexity(text)
        }
        logger.info(f"內容分析完成:{word_count}字,{paragraph_count}段落,{len(keywords)}個關鍵詞")
        return result

    def _extract_keywords(self, text: str, top_k: int = 50) -> List[str]:
        """Frequency-based keyword extraction.

        NOTE: the regex matches contiguous CJK runs, so a "word" here is
        really a punctuation/whitespace-delimited phrase segment.
        """
        words = re.findall(r'[\u4e00-\u9fff]+', text)
        words = [word for word in words if len(word) >= 2 and word not in self.stopwords]
        word_freq = Counter(words)
        return [word for word, freq in word_freq.most_common(top_k)]

    def _identify_main_topics(self, text: str, keywords: List[str]) -> List[str]:
        """Identify up to five main topics by matching keywords to patterns."""
        topic_patterns = {
            '科技': ['技術', '科學', '研究', '創新', '發明', '實驗', '理論', '方法'],
            '商業': ['公司', '市場', '產品', '服務', '客戶', '銷售', '營收', '策略'],
            '教育': ['學習', '教學', '知識', '學生', '老師', '課程', '教育', '培訓'],
            '健康': ['健康', '醫療', '疾病', '治療', '藥物', '醫生', '病人', '醫院'],
            '環境': ['環境', '氣候', '污染', '保護', '生態', '自然', '能源', '綠色'],
            '社會': ['社會', '文化', '政治', '經濟', '人民', '國家', '法律', '制度'],
            '歷史': ['歷史', '古代', '傳統', '文化', '事件', '人物', '時代', '發展']
        }
        topics = []
        for topic, patterns in topic_patterns.items():
            # Score = number of keywords containing any pattern substring.
            score = sum(1 for keyword in keywords if any(pattern in keyword for pattern in patterns))
            if score > 0:
                topics.append((topic, score))
        # Highest-scoring topics first; ties keep dict insertion order.
        topics.sort(key=lambda x: x[1], reverse=True)
        return [topic for topic, score in topics[:5]]

    def _analyze_structure(self, text: str) -> Dict[str, int]:
        """Count headers, list items, table markers and sections."""
        lines = text.split('\n')
        headers = len([line for line in lines if self._is_header(line)])
        lists = len([line for line in lines if self._is_list_item(line)])
        # Crude table detection: count pipe characters, not actual tables.
        tables = text.count('|')
        return {
            'headers': headers,
            'lists': lists,
            'tables': tables,
            'sections': max(1, headers)  # at least one section
        }

    def _is_header(self, line: str) -> bool:
        """Return True if the line looks like a header."""
        line = line.strip()
        # bool() so the annotated return type holds (re.match gives
        # Match | None, and the original could even return None).
        return bool(line.startswith('#') or
                    (len(line) < 50 and line.endswith(':')) or
                    re.match(r'^\d+\.', line) or
                    re.match(r'^[一二三四五六七八九十]+、', line))

    def _is_list_item(self, line: str) -> bool:
        """Return True if the line looks like a list item."""
        line = line.strip()
        return bool(line.startswith('•') or
                    line.startswith('-') or
                    line.startswith('*') or
                    re.match(r'^\d+\)', line))

    def _estimate_dialogue_rounds(self, word_count: int) -> int:
        """Estimate dialogue rounds, clamped to [10, 200].

        Assumes roughly 300-500 characters per round (average 400).
        """
        avg_words_per_round = 400
        return max(10, min(200, word_count // avg_words_per_round))

    def _calculate_complexity(self, text: str) -> float:
        """Compute a text complexity score in [0, 100].

        Averages vocabulary richness, sentence-length variance and
        technical-term density; defaults to 50.0 when nothing measurable.
        """
        factors = []
        # Vocabulary richness: distinct CJK runs / total CJK runs.
        words = re.findall(r'[\u4e00-\u9fff]+', text)
        unique_words = len(set(words))
        total_words = len(words)
        vocab_richness = unique_words / total_words if total_words else 0
        factors.append(vocab_richness * 100)
        # Sentence length variance (sentences split on 。!?).
        sentences = re.split(r'[。!?]', text)
        sentence_lengths = [len(s.strip()) for s in sentences if s.strip()]
        if sentence_lengths:
            avg_length = sum(sentence_lengths) / len(sentence_lengths)
            length_variance = sum((l - avg_length) ** 2 for l in sentence_lengths) / len(sentence_lengths)
            factors.append(min(100, length_variance / 10))
        # Technical-term density per 1000 words.
        technical_terms = self._count_technical_terms(text)
        term_density = (technical_terms / max(1, total_words)) * 1000
        factors.append(min(100, term_density))
        return sum(factors) / len(factors) if factors else 50.0

    def _count_technical_terms(self, text: str) -> int:
        """Count technical-looking tokens (acronyms, percentages, units,
        -tion/-ism words); overlapping patterns may double-count."""
        technical_patterns = [
            r'[A-Z]{2,}',           # acronyms
            r'\d+\.?\d*%',          # percentages
            r'\d+\.?\d*[A-Za-z]+',  # numbers with units
            r'[a-zA-Z]+tion',       # words ending in -tion
            r'[a-zA-Z]+ism',        # words ending in -ism
        ]
        count = 0
        for pattern in technical_patterns:
            count += len(re.findall(pattern, text))
        return count
class ContentPlanner:
    """Content planner.

    Turns raw text into a ContentOutline of segments suitable for
    multi-part dialogue generation.
    """

    def __init__(self):
        # Delegates text analysis to ContentAnalyzer.
        self.analyzer = ContentAnalyzer()

    def create_content_outline(self, text: str, target_rounds: Optional[int] = None) -> ContentOutline:
        """Create a content outline for the given text.

        Args:
            text: Input text.
            target_rounds: Target number of dialogue rounds; when None,
                the analyzer's own estimate is used.

        Returns:
            ContentOutline: The planned outline.
        """
        logger.info("開始創建內容大綱")
        analysis = self.analyzer.analyze_content(text)
        if target_rounds is None:
            target_rounds = analysis['estimated_rounds']
        # Fall back to a generic topic when none was detected.
        main_topic = analysis['main_topics'][0] if analysis['main_topics'] else "一般主題"
        segments = self._create_content_segments(text, analysis, target_rounds)
        total_length = sum(seg.estimated_length for seg in segments)
        suggested_parts = self._calculate_suggested_parts(target_rounds)
        outline = ContentOutline(
            main_topic=main_topic,
            segments=segments,
            total_estimated_length=total_length,
            suggested_parts=suggested_parts
        )
        logger.info(f"內容大綱創建完成:{len(segments)}個片段,建議{suggested_parts}個部分")
        return outline

    def _create_content_segments(self, text: str, analysis: Dict, target_rounds: int) -> List[ContentSegment]:
        """Build one ContentSegment per paragraph (blank-line separated)."""
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        if not paragraphs:
            # Degenerate input: treat the whole text as one paragraph.
            paragraphs = [text]
        segments = []
        keywords = analysis['keywords']
        avg_rounds_per_segment = max(1, target_rounds // max(1, len(paragraphs)))
        for i, paragraph in enumerate(paragraphs):
            # Global keywords that actually occur in this paragraph (top 5).
            para_keywords = [kw for kw in keywords if kw in paragraph][:5]
            title = self._generate_segment_title(paragraph, para_keywords, i + 1)
            # Length in dialogue rounds, bounded by paragraph size
            # (~200 chars per round) and by the per-segment budget.
            estimated_length = max(1, min(len(paragraph) // 200, avg_rounds_per_segment * 2))
            priority = self._calculate_segment_priority(paragraph, keywords)
            segments.append(ContentSegment(
                title=title,
                content=paragraph,
                keywords=para_keywords,
                estimated_length=estimated_length,
                priority=priority
            ))
        return segments

    def _generate_segment_title(self, content: str, keywords: List[str], index: int) -> str:
        """Derive a segment title from an existing header or the keywords."""
        # Reuse an existing header found in the first 3 lines, if any.
        lines = content.split('\n')
        for line in lines[:3]:
            if self.analyzer._is_header(line):
                return line.strip().replace('#', '').replace(':', '').strip()
        # Otherwise synthesize a title from the top keyword.
        if keywords:
            main_keyword = keywords[0]
            return f"關於{main_keyword}的討論"
        return f"第{index}部分討論"

    def _calculate_segment_priority(self, content: str, global_keywords: List[str]) -> int:
        """Priority 1-10 from keyword density plus a length score."""
        keyword_count = sum(1 for kw in global_keywords[:10] if kw in content)
        length_score = min(5, len(content) // 500)
        return min(10, max(1, keyword_count + length_score))

    def _calculate_suggested_parts(self, target_rounds: int) -> int:
        """Suggest the number of generation parts (ideal ~50-80 rounds each)."""
        if target_rounds <= 50:
            return 1
        if target_rounds <= 100:
            return 2
        if target_rounds <= 150:
            return 3
        if target_rounds <= 200:
            return 4
        return max(4, min(8, target_rounds // 50))
class SmartContentSplitter:
    """Smart content splitter.

    Partitions a ContentOutline's segments into parts, each carrying the
    generation instructions (focus, round budget, position flags).
    """

    def __init__(self):
        # Kept for callers that want outline planning from the splitter.
        self.planner = ContentPlanner()

    def split_for_generation(self, outline: ContentOutline, num_parts: int) -> List[Dict]:
        """Split the outline into per-part generation instructions.

        Args:
            outline: The content outline to split.
            num_parts: Desired number of parts.

        Returns:
            List[Dict]: One instruction dict per non-empty part.
        """
        logger.info(f"將內容分割為{num_parts}個部分")
        segments = outline.segments
        total_segments = len(segments)
        # No segments at all: emit a single generic-discussion part.
        if total_segments == 0:
            return [{"segments": [], "focus": "一般討論", "rounds": 67}]
        per_part = max(1, total_segments // num_parts)
        parts = []
        for idx in range(num_parts):
            lo = idx * per_part
            # The final part absorbs any remainder segments.
            hi = total_segments if idx == num_parts - 1 else min((idx + 1) * per_part, total_segments)
            chunk = segments[lo:hi]
            if not chunk:
                # Nothing assigned to this part; drop it.
                continue
            collected = [kw for seg in chunk for kw in seg.keywords]
            round_total = sum(seg.estimated_length for seg in chunk)
            # Pick the most frequent keyword as this part's focus.
            if collected:
                top_kw = Counter(collected).most_common(1)[0][0]
                focus = f"重點討論{top_kw}相關內容"
            else:
                focus = f"第{idx+1}部分的深入討論"
            parts.append({
                "segments": chunk,
                "focus": focus,
                # Keep each part's round budget within a sane 30-80 range.
                "rounds": max(30, min(80, round_total)),
                "part_index": idx,
                "is_first": idx == 0,
                "is_last": idx == num_parts - 1,
                "content_summary": self._create_content_summary(chunk),
            })
        logger.info(f"內容分割完成,共{len(parts)}個有效部分")
        return parts

    def _create_content_summary(self, segments: List[ContentSegment]) -> str:
        """Summarize the segments via their leading keywords."""
        if not segments:
            return "一般討論內容"
        leading = []
        for seg in segments:
            # Two keywords from each segment is enough for a summary.
            leading.extend(seg.keywords[:2])
        # Deduplicate while preserving order, then keep at most five.
        picked = list(dict.fromkeys(leading))[:5]
        if picked:
            return f"主要討論:{', '.join(picked)}"
        return f"討論{segments[0].title}等相關主題"
def create_adaptive_prompts(outline: ContentOutline, part_info: Dict, base_template: Dict) -> Dict[str, str]:
    """Create adaptive prompts based on the content outline.

    Appends part-specific instructions (opening / middle / closing) to the
    base "dialog" prompt, chosen by the part's position flags.

    Args:
        outline: Content outline.  NOTE(review): not referenced in this
            body — presumably kept for interface compatibility; confirm.
        part_info: Part descriptor; this body reads "is_first", "is_last",
            "focus" and "rounds" (as produced by
            SmartContentSplitter.split_for_generation).
        base_template: Base prompt templates; must contain a "dialog" key.

    Returns:
        Dict: Shallow copy of base_template with "dialog" extended.
    """
    # Shallow copy is sufficient: only "dialog" is rebound below.
    adapted_prompts = base_template.copy()
    # NOTE(review): any leading whitespace originally inside the f-strings
    # was lost in formatting; the prompt text itself is unchanged.
    if part_info["is_first"]:
        # First part: full opening with the standard greeting.
        adapted_prompts["dialog"] += f"""
特別注意:這是對話的開始部分,請確保:
1. 以標準開場白開始:「歡迎收聽 David888 Podcast,我是 David...」
2. 主要討論內容:{part_info['focus']}
3. 生成約{part_info['rounds']}輪對話
4. 為後續討論建立良好基礎
"""
    elif part_info["is_last"]:
        # Last part: wrap up the discussion and sign off.
        adapted_prompts["dialog"] += f"""
特別注意:這是對話的最後部分,請確保:
1. 繼續前面的對話,不要重複開場白
2. 主要討論內容:{part_info['focus']}
3. 生成約{part_info['rounds']}輪對話
4. 在最後幾輪自然總結整個討論
5. 以適當的告別語結束
"""
    else:
        # Middle part: bridge between preceding and following parts.
        adapted_prompts["dialog"] += f"""
特別注意:這是對話的中間部分,請確保:
1. 自然承接前面的討論
2. 主要討論內容:{part_info['focus']}
3. 生成約{part_info['rounds']}輪對話
4. 為後續討論預留空間,不要過早總結
"""
    return adapted_prompts