import logging
import re
from datetime import datetime
from typing import List, Dict
import html

# Display labels for each known sentiment value.
_SENTIMENT_BADGES = {
    'positive': '正面 😊',
    'negative': '負面 😔',
    'neutral': '中性 😐',
}

# Display labels for each known news category; anything else shows '財經'.
_CATEGORY_NAMES = {'us_stock': '美股', 'tw_stock': '台股'}

# Maximum number of content characters shown on a card before truncation.
_MAX_CONTENT_LENGTH = 300

# Patterns used by clean_text(), compiled once at import time.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')
_DISALLOWED_CHARS_RE = re.compile(
    r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()（），。！？]'
)

# Pattern used by validate_url().  It ends with \Z rather than $ so that a
# trailing newline is not silently accepted, and it allows TLDs up to 63
# letters (the DNS label limit) instead of capping them at 6.
_URL_RE = re.compile(
    r'^https?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)\Z',
    re.IGNORECASE,
)


def setup_logging():
    """Configure the root logger.

    Logs at INFO level to both the console stream and the UTF-8 encoded
    file ``news_app.log``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler('news_app.log', encoding='utf-8'),
        ],
    )


def _escape_field(value) -> str:
    """Return *value* as HTML-escaped text; ``None`` becomes an empty string."""
    if value is None:
        return ""
    return html.escape(str(value))


def _format_published_date(published_date) -> str:
    """Return the publication date as display-safe text.

    ISO-8601 strings are rendered as ``YYYY-MM-DD HH:MM``.  Strings that
    cannot be parsed, and non-string values, are shown as-is but
    HTML-escaped so they cannot inject markup.
    """
    if not isinstance(published_date, str):
        return html.escape(str(published_date))
    try:
        # Older Python versions' fromisoformat() rejects a trailing 'Z',
        # so it is rewritten as an explicit UTC offset first.
        parsed = datetime.fromisoformat(published_date.replace('Z', '+00:00'))
    except ValueError:
        return html.escape(published_date)
    return parsed.strftime('%Y-%m-%d %H:%M')


def _render_news_card(news: Dict) -> str:
    """Render a single news item as one card of display text."""
    sentiment = news.get('sentiment', 'neutral')
    # NOTE(review): sentiment_class and url are computed but never
    # interpolated into the card template below -- presumably the markup that
    # used them is missing; confirm before removing them.
    sentiment_class = f"news-{sentiment}"
    url = news.get('url', '')

    sentiment_badge = _SENTIMENT_BADGES.get(
        sentiment, _SENTIMENT_BADGES['neutral']
    )
    formatted_date = _format_published_date(news.get('published_date', ''))
    category_name = _CATEGORY_NAMES.get(news.get('category', ''), '財經')

    title = _escape_field(news.get('title', ''))
    source = _escape_field(news.get('source', ''))

    # Truncate the raw text BEFORE escaping so the cut can never land in the
    # middle of an HTML entity such as "&amp;".
    raw_content = news.get('content', '')
    raw_content = "" if raw_content is None else str(raw_content)
    if len(raw_content) > _MAX_CONTENT_LENGTH:
        raw_content = raw_content[:_MAX_CONTENT_LENGTH] + "..."
    content = html.escape(raw_content)

    return (
        f"\n\n{title}"
        f"\n\n{sentiment_badge}"
        f"\n\n{category_name} {source} 📅 {formatted_date}"
        f"\n\n{content}"
        "\n\n閱讀全文 →"
        "\n\n"
    )


def format_news_for_display(news_data: List[Dict]) -> str:
    """Format a list of news items for display.

    Args:
        news_data: News dictionaries.  The keys read are ``title``,
            ``content``, ``source``, ``sentiment``, ``category``,
            ``published_date`` and ``url``; all are optional.

    Returns:
        The rendered cards wrapped in a leading and trailing blank-line
        block, or a placeholder message when *news_data* is empty.
    """
    if not news_data:
        return "📰 暫無新聞資料"

    pieces = ["\n\n"]
    pieces.extend(_render_news_card(news) for news in news_data)
    pieces.append("\n\n")
    return "".join(pieces)


def clean_text(text: str) -> str:
    """Clean up text.

    Strips HTML tags, collapses runs of whitespace into a single space, and
    removes every character that is not a CJK ideograph, a word character,
    whitespace, or one of the allowed punctuation marks.

    Returns:
        The cleaned, stripped text; an empty string for falsy input.
    """
    if not text:
        return ""

    text = _HTML_TAG_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text)
    text = _DISALLOWED_CHARS_RE.sub('', text)
    return text.strip()


def calculate_similarity(text1: str, text2: str) -> float:
    """Compute the similarity of two texts.

    The score is the size of the intersection divided by the size of the
    union of the two texts' lower-cased, whitespace-separated word sets.

    Returns:
        A value between 0.0 and 1.0; 0.0 when either text has no words.
    """
    if not text1 or not text2:
        return 0.0

    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    if not words1 or not words2:
        return 0.0

    # Both sets are non-empty here, so the union is never empty.
    return len(words1 & words2) / len(words1 | words2)


def validate_url(url: str) -> bool:
    """Validate the format of an http(s) URL.

    Accepts a domain name, ``localhost`` or a dotted-quad address, with an
    optional port and path.

    Returns:
        True if *url* matches the expected format; False otherwise,
        including when *url* is not a string.
    """
    if not isinstance(url, str):
        return False
    return _URL_RE.match(url) is not None


def format_duration(seconds: float) -> str:
    """Format a duration given in seconds.

    Returns:
        The duration with one decimal place, expressed in seconds (below one
        minute), minutes (below one hour) or hours.
    """
    if seconds < 60:
        return f"{seconds:.1f}秒"
    if seconds < 3600:
        return f"{seconds / 60:.1f}分鐘"
    return f"{seconds / 3600:.1f}小時"


def truncate_text(text: str, max_length: int = 100) -> str:
    """Truncate text.

    Text longer than *max_length* is cut at that length, trimmed back to the
    last space inside the cut (if any), and suffixed with ``"..."``.

    Returns:
        The original text if it fits, the truncated text otherwise, or an
        empty string for falsy input.
    """
    if not text:
        return ""
    if len(text) <= max_length:
        return text
    return text[:max_length].rsplit(' ', 1)[0] + "..."