Spaces:
Sleeping
Sleeping
File size: 5,478 Bytes
ec6ea02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import logging
import re
from datetime import datetime
from typing import List, Dict
import html
def setup_logging():
    """Set up application logging.

    Calls ``logging.basicConfig`` with level INFO, a
    ``time - logger name - level - message`` record format, and two
    handlers: a stream handler and a UTF-8 file handler writing to
    ``news_app.log`` in the current working directory.
    """
    record_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    console_handler = logging.StreamHandler()
    file_handler = logging.FileHandler('news_app.log', encoding='utf-8')
    logging.basicConfig(
        level=logging.INFO,
        format=record_format,
        handlers=[console_handler, file_handler],
    )
# Badge markup per known sentiment; unknown sentiments fall back to 'neutral'.
_SENTIMENT_BADGES = {
    'positive': '<span class="sentiment-badge positive-badge">正面 😊</span>',
    'negative': '<span class="sentiment-badge negative-badge">負面 😔</span>',
    'neutral': '<span class="sentiment-badge neutral-badge">中性 😐</span>',
}
# Display label per category key; anything else is shown as the generic label.
_CATEGORY_NAMES = {'us_stock': '美股', 'tw_stock': '台股'}
_DEFAULT_CATEGORY_NAME = '財經'
# Maximum number of (unescaped) content characters shown on a card.
_MAX_CONTENT_CHARS = 300


def _as_text(value) -> str:
    """Return *value* as a string, mapping ``None`` to the empty string."""
    return '' if value is None else str(value)


def _format_published_date(published_date) -> str:
    """Return *published_date* as ``YYYY-MM-DD HH:MM`` when it can be parsed.

    ISO-8601 strings (a trailing ``Z`` is accepted) and ``datetime`` objects
    are reformatted; unparseable strings are returned unchanged, ``None``
    becomes an empty string, and any other value is passed through ``str``.
    """
    if published_date is None:
        return ''
    if isinstance(published_date, datetime):
        return published_date.strftime('%Y-%m-%d %H:%M')
    if not isinstance(published_date, str):
        return str(published_date)
    try:
        # Older fromisoformat() implementations reject a literal 'Z' suffix.
        parsed = datetime.fromisoformat(published_date.replace('Z', '+00:00'))
    except ValueError:
        return published_date
    return parsed.strftime('%Y-%m-%d %H:%M')


def format_news_for_display(news_data: List[Dict]) -> str:
    """Render a list of news items as an HTML fragment of news cards.

    Args:
        news_data: News items as dicts. The keys read are ``title``,
            ``content``, ``url``, ``source``, ``category``, ``sentiment``
            and ``published_date``; all of them are optional.

    Returns:
        A placeholder message when ``news_data`` is empty, otherwise one
        wrapping ``<div>`` containing a card per item.

    Every value taken from an item is HTML-escaped before it is interpolated,
    including the URL and the sentiment used in the CSS class name.
    """
    if not news_data:
        return "📰 暫無新聞資料"

    cards = []
    for news in news_data:
        # Sentiment drives both the CSS class and the badge.
        sentiment = _as_text(news.get('sentiment') or 'neutral')
        sentiment_class = html.escape(f"news-{sentiment}")
        sentiment_badge = _SENTIMENT_BADGES.get(
            sentiment, _SENTIMENT_BADGES['neutral']
        )

        formatted_date = html.escape(
            _format_published_date(news.get('published_date', ''))
        )

        # Truncate BEFORE escaping so a cut can never land inside an entity
        # such as "&amp;" and emit broken markup.
        raw_content = _as_text(news.get('content'))
        if len(raw_content) > _MAX_CONTENT_CHARS:
            raw_content = raw_content[:_MAX_CONTENT_CHARS] + "..."
        content = html.escape(raw_content)

        title = html.escape(_as_text(news.get('title')))
        source = html.escape(_as_text(news.get('source')))
        # NOTE(review): escaping stops attribute break-out, but the URL scheme
        # is not checked here (e.g. "javascript:"); validate URLs upstream.
        url = html.escape(_as_text(news.get('url')), quote=True)
        category_name = _CATEGORY_NAMES.get(
            _as_text(news.get('category')), _DEFAULT_CATEGORY_NAME
        )

        cards.append(f"""
        <div class="news-card {sentiment_class}" style="margin-bottom: 20px; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 10px;">
                <h3 style="margin: 0; font-size: 18px; color: #333; flex: 1; margin-right: 10px;">
                    <a href="{url}" target="_blank" style="color: #333; text-decoration: none;">{title}</a>
                </h3>
                {sentiment_badge}
            </div>
            <div style="margin-bottom: 10px; color: #666; font-size: 14px;">
                <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 12px; margin-right: 8px;">{category_name}</span>
                <span>{source}</span>
                <span style="margin-left: 8px;">📅 {formatted_date}</span>
            </div>
            <p style="margin: 10px 0; color: #555; line-height: 1.6;">{content}</p>
            <div style="margin-top: 10px; text-align: right;">
                <a href="{url}" target="_blank" style="color: #007bff; text-decoration: none; font-size: 14px;">閱讀全文 →</a>
            </div>
        </div>
        """)

    header = """
    <div style="max-width: 100%; font-family: Arial, sans-serif;">
    """
    return "".join([header, *cards, "</div>"])
def clean_text(text: str) -> str:
    """Strip markup and unwanted symbols from *text* and normalise spacing.

    Steps, in order: remove ``<...>`` tags, drop every character that is not
    a word character, whitespace, a CJK ideograph in the listed ranges, or
    one of the punctuation marks in the character class, then collapse each
    whitespace run to a single space and trim both ends.

    Args:
        text: Raw text; a falsy value (``None`` or ``""``) yields ``""``.

    Returns:
        The cleaned text.
    """
    if not text:
        return ""
    # Remove HTML tags.
    text = re.sub(r'<[^>]+>', '', text)
    # Remove special characters.
    text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?]', '', text)
    # Collapse whitespace LAST: deleting a symbol that sat between two spaces
    # ("a @ b") would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def calculate_similarity(text1: str, text2: str) -> float:
    """Return the word-overlap similarity of two texts.

    Each text is lower-cased and split on whitespace into a set of words;
    the result is the number of shared words divided by the number of
    distinct words across both texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A value between 0.0 and 1.0; 0.0 when either text is empty or
        contains no words.
    """
    if not (text1 and text2):
        return 0.0

    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())
    if not (tokens_a and tokens_b):
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)
# Compiled once at import time instead of on every call.
_URL_PATTERN = re.compile(
    r'^https?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'  # dot-separated labels
    r'(?:[A-Z]{2,63}|XN--[A-Z0-9-]{1,59})\.?|'  # TLD (alphabetic or punycode)
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or dotted-quad address
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)\Z',  # optional path/query; \Z rejects a trailing newline
    re.IGNORECASE,
)


def validate_url(url: str) -> bool:
    """Return whether *url* looks like a well-formed http(s) URL.

    Accepted hosts are a dotted domain name, ``localhost``, or a dotted-quad
    address, optionally followed by a port and a path or query string.
    Top-level domains of up to 63 letters and punycode (``xn--``) TLDs are
    accepted.

    Args:
        url: Candidate URL. Non-string values (including ``None``) are
            treated as invalid rather than raising.

    Returns:
        ``True`` when the whole string matches the URL pattern.
    """
    if not isinstance(url, str):
        return False
    return _URL_PATTERN.match(url) is not None
def format_duration(seconds: float) -> str:
    """Format a duration in seconds as a short human-readable string.

    Values below 60 are shown in seconds, values below 3600 in minutes,
    and everything else in hours, each with one decimal place.

    Args:
        seconds: Duration in seconds.

    Returns:
        The formatted duration with its unit suffix.
    """
    if seconds < 60:
        return f"{seconds:.1f}秒"
    if seconds < 3600:
        return f"{seconds / 60:.1f}分鐘"
    return f"{seconds / 3600:.1f}小時"
def truncate_text(text: str, max_length: int = 100) -> str:
    """Shorten *text* to at most *max_length* characters plus an ellipsis.

    Text that already fits is returned unchanged. Longer text is cut at
    *max_length*, then trimmed back to the last space inside the cut (when
    there is one) so no partial word is left, and ``"..."`` is appended.

    Args:
        text: Text to shorten; a falsy value yields ``""``.
        max_length: Maximum number of characters kept before the ellipsis.

    Returns:
        The original text, or the shortened text followed by ``"..."``.
    """
    if not text:
        return ""
    if len(text) > max_length:
        clipped = text[:max_length]
        head, space, _ = clipped.rpartition(' ')
        # Without any space in the clipped part, keep the clipped part whole.
        kept = head if space else clipped
        return kept + "..."
    return text