import logging
import re
from datetime import datetime
from typing import List, Dict
import html
def setup_logging():
    """Configure application logging through ``logging.basicConfig``.

    Messages at INFO level and above are sent to two handlers: a stream
    handler and a UTF-8 encoded file handler writing to ``news_app.log``.
    Each record is formatted as ``time - logger name - level - message``.
    """
    console_handler = logging.StreamHandler()
    file_handler = logging.FileHandler('news_app.log', encoding='utf-8')
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(
        level=logging.INFO,
        format=log_format,
        handlers=[console_handler, file_handler],
    )
# Display names for the known news categories; any other category is shown
# under the generic finance label.
_CATEGORY_NAMES = {'us_stock': '美股', 'tw_stock': '台股'}
_DEFAULT_CATEGORY_NAME = '財經'

# Sentiment labels keyed by the item's 'sentiment' value.
# NOTE(review): the card below does not render the sentiment badge, the title
# or the URL of an item; the card template looks like it lost its markup.
# Confirm the intended layout before wiring these back in.
_SENTIMENT_BADGES = {
    'positive': '正面 😊',
    'negative': '負面 😔',
    'neutral': '中性 😐',
}


def _escaped_field(news: Dict, key: str) -> str:
    """Return ``news[key]`` as HTML-escaped text; missing or None becomes ''."""
    value = news.get(key)
    return html.escape('' if value is None else str(value))


def _format_published_date(value) -> str:
    """Return an HTML-safe display string for a publish date.

    ISO-8601 strings (a trailing ``Z`` is accepted) are rendered as
    ``YYYY-MM-DD HH:MM``. Strings that cannot be parsed and non-string values
    are shown as-is, but escaped. ``None`` yields an empty string.
    """
    if value is None:
        return ''
    if not isinstance(value, str):
        return html.escape(str(value))
    try:
        # 'Z' is rewritten so interpreters whose fromisoformat() does not
        # understand the Zulu suffix can still parse the timestamp.
        parsed = datetime.fromisoformat(value.replace('Z', '+00:00'))
    except ValueError:
        return html.escape(value)
    return parsed.strftime('%Y-%m-%d %H:%M')


def _render_news_card(news: Dict, max_content_length: int) -> str:
    """Render one news item as a text card: category, source, date, content."""
    category = news.get('category')
    if isinstance(category, str):
        category_name = _CATEGORY_NAMES.get(category, _DEFAULT_CATEGORY_NAME)
    else:
        # Non-string (possibly unhashable) categories fall back to the default.
        category_name = _DEFAULT_CATEGORY_NAME

    source = _escaped_field(news, 'source')
    formatted_date = _format_published_date(news.get('published_date', ''))

    # Truncate the raw text first and escape afterwards, so an HTML entity
    # such as '&amp;' can never be cut in half by the length limit.
    raw_content = news.get('content')
    raw_content = '' if raw_content is None else str(raw_content)
    if len(raw_content) > max_content_length:
        content = html.escape(raw_content[:max_content_length]) + "..."
    else:
        content = html.escape(raw_content)

    return f"\n{category_name}\n{source}\n📅 {formatted_date}\n{content}\n"


def format_news_for_display(news_data: List[Dict], *, max_content_length: int = 300) -> str:
    """Format a list of news items into a single display string.

    Args:
        news_data: News items as dictionaries. The keys read are
            ``category``, ``source``, ``published_date`` and ``content``.
        max_content_length: Maximum number of content characters shown per
            item before the text is cut and ``...`` is appended. Negative
            values are treated as 0.

    Returns:
        A placeholder message when ``news_data`` is empty; otherwise the
        cards of all items concatenated, wrapped in a leading and a trailing
        newline. Source, content and unparsed dates are HTML-escaped.
    """
    if not news_data:
        return "📰 暫無新聞資料"

    limit = max(max_content_length, 0)
    cards = [_render_news_card(news, limit) for news in news_data]
    return "\n" + "".join(cards) + "\n"
def clean_text(text: str) -> str:
    """Strip markup and unsupported characters from ``text``.

    Steps, in order:
      1. remove anything that looks like an HTML tag (``<...>``);
      2. remove every character that is not a word character, whitespace,
         a CJK ideograph in the listed ranges, or one of the allowed
         punctuation marks;
      3. collapse each run of whitespace into a single space and trim both
         ends.

    Whitespace is collapsed last so that deleting a symbol that stood
    between two spaces (``"a - b"``) cannot leave a double space behind.

    Args:
        text: Text to clean. Empty or ``None`` input yields ``""``.

    Returns:
        The cleaned, single-spaced text.
    """
    if not text:
        return ""
    # Remove HTML tags.
    text = re.sub(r'<[^>]+>', '', text)
    # Remove special characters.
    # NOTE(review): full-width parentheses, comma, exclamation and question
    # marks are not in this class and are therefore stripped, while the
    # ideographic full stop is kept; confirm that is intended.
    text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?]', '', text)
    # Collapse redundant whitespace (after the removals above).
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def calculate_similarity(text1: str, text2: str) -> float:
    """Return the word-overlap similarity of two texts.

    Both texts are lower-cased and split on whitespace into sets of words.
    The result is the size of the intersection divided by the size of the
    union (Jaccard index), in the range 0.0 to 1.0. If either text is empty
    or contains no words, the result is 0.0.
    """
    if not (text1 and text2):
        return 0.0

    # Simple vocabulary-based comparison on whitespace-separated tokens.
    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())
    if not (tokens_a and tokens_b):
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    # ``combined`` cannot be empty here because both token sets are non-empty.
    return len(shared) / len(combined)
# Compiled once at import time instead of on every call.
# The pattern ends in ``\Z`` rather than ``$`` because ``$`` also matches just
# before a trailing newline, which would let "http://host\n" pass validation.
_URL_PATTERN = re.compile(
    r'^https?://'  # http:// or https://
    # Domain name: dot-separated labels followed by an alphabetic TLD of
    # 2-63 letters (63 is the maximum length of a DNS label).
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or dotted-quad IP (octets are not range-checked)
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)\Z',  # optional path / query without whitespace
    re.IGNORECASE,
)


def validate_url(url: str) -> bool:
    """Check whether ``url`` is a well-formed http(s) URL.

    Accepted hosts are a domain name, ``localhost`` or a dotted-quad IP
    address, optionally followed by a port and a path or query that contains
    no whitespace.

    Args:
        url: Candidate URL. Non-string values (including ``None``) are
            treated as invalid instead of raising.

    Returns:
        ``True`` when the whole string matches the URL pattern, else ``False``.
    """
    if not isinstance(url, str):
        return False
    return _URL_PATTERN.match(url) is not None
def format_duration(seconds: float) -> str:
    """Render a duration given in seconds as a short human-readable string.

    Values below 60 are shown in seconds, values below 3600 in minutes and
    everything else in hours, each with one decimal place followed by the
    unit label.
    """
    # Under one minute: report the raw seconds.
    if seconds < 60:
        return f"{seconds:.1f}秒"
    # Under one hour: convert to minutes.
    if seconds < 3600:
        return f"{seconds / 60:.1f}分鐘"
    # Everything else is expressed in hours.
    return f"{seconds / 3600:.1f}小時"
def truncate_text(text: str, max_length: int = 100) -> str:
    """Shorten ``text`` to at most ``max_length`` characters plus an ellipsis.

    Text that already fits is returned unchanged and empty or ``None`` input
    yields ``""``. Otherwise the first ``max_length`` characters are taken,
    everything from the last space in that prefix onwards is dropped (when
    there is a space), and ``"..."`` is appended.
    """
    if not text:
        return ""
    if len(text) > max_length:
        head = text[:max_length]
        # Cut back to the last space so the result does not end mid-word.
        last_space = head.rfind(' ')
        if last_space != -1:
            head = head[:last_space]
        return head + "..."
    return text