Spaces:

khjhs60199
/

pyCrawing

Sleeping

File size: 5,478 Bytes

ec6ea02

import logging
import re
from datetime import datetime
from typing import List, Dict
import html

def setup_logging():
    """Configure root logging to write to both the console and a log file.

    Records at INFO level and above are formatted as
    ``time - logger name - level - message`` and sent to a stream handler
    and to ``news_app.log`` (UTF-8 encoded) in the current working directory.
    """
    console_handler = logging.StreamHandler()
    file_handler = logging.FileHandler('news_app.log', encoding='utf-8')

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[console_handler, file_handler],
    )

# Badge markup per known sentiment; unknown sentiments fall back to 'neutral'.
_SENTIMENT_BADGES = {
    'positive': '<span class="sentiment-badge positive-badge">正面 😊</span>',
    'negative': '<span class="sentiment-badge negative-badge">負面 😔</span>',
    'neutral': '<span class="sentiment-badge neutral-badge">中性 😐</span>',
}

# Display label per category key; anything else is shown as '財經'.
_CATEGORY_NAMES = {'us_stock': '美股', 'tw_stock': '台股'}

# Maximum number of (unescaped) content characters shown on a card.
_MAX_CONTENT_CHARS = 300


def _escape_field(value) -> str:
    """Return *value* as HTML-escaped text; ``None`` becomes an empty string."""
    return "" if value is None else html.escape(str(value))


def _format_published_date(published_date) -> str:
    """Return the publication date as plain (unescaped) display text.

    ISO-8601 strings (a trailing ``Z`` is accepted) are rendered as
    ``YYYY-MM-DD HH:MM``. Strings that cannot be parsed are returned
    unchanged, non-string values are converted with ``str()``, and ``None``
    becomes an empty string.
    """
    if published_date is None:
        return ""
    if not isinstance(published_date, str):
        return str(published_date)
    try:
        parsed = datetime.fromisoformat(published_date.replace('Z', '+00:00'))
    except ValueError:
        # Not ISO formatted: show whatever the source provided.
        return published_date
    return parsed.strftime('%Y-%m-%d %H:%M')


def format_news_for_display(news_data: List[Dict]) -> str:
    """Render news items as an HTML fragment of styled cards.

    Args:
        news_data: Dicts that may contain the keys ``title``, ``content``,
            ``url``, ``source``, ``category``, ``sentiment`` and
            ``published_date``. Missing or ``None`` values render as empty
            text (or the neutral/default label).

    Returns:
        An HTML string with one card per item, or a plain placeholder
        message when ``news_data`` is empty.

    Every value taken from ``news_data`` is HTML-escaped before it is
    interpolated, including the URL (used inside ``href`` attributes), the
    sentiment (used inside a ``class`` attribute) and the date text.
    """
    if not news_data:
        return "📰 暫無新聞資料"

    parts = ["""
    <div style="max-width: 100%; font-family: Arial, sans-serif;">
    """]

    neutral_badge = _SENTIMENT_BADGES['neutral']

    for news in news_data:
        # Sentiment drives both the CSS class and the badge.
        sentiment = news.get('sentiment', 'neutral')
        sentiment_class = f"news-{_escape_field(sentiment)}"
        if isinstance(sentiment, str):
            sentiment_badge = _SENTIMENT_BADGES.get(sentiment, neutral_badge)
        else:
            sentiment_badge = neutral_badge

        formatted_date = html.escape(
            _format_published_date(news.get('published_date', ''))
        )

        title = _escape_field(news.get('title', ''))
        source = _escape_field(news.get('source', ''))
        # NOTE(review): escaping prevents breaking out of the href attribute
        # but does not reject schemes such as ``javascript:``; validate URLs
        # upstream if they can come from untrusted sources.
        url = _escape_field(news.get('url', ''))

        category = news.get('category', '')
        if isinstance(category, str):
            category_name = _CATEGORY_NAMES.get(category, '財經')
        else:
            category_name = '財經'

        # Truncate BEFORE escaping so an entity such as "&amp;" is never
        # cut in half and the limit counts visible characters.
        raw_content = news.get('content', '')
        raw_content = "" if raw_content is None else str(raw_content)
        if len(raw_content) > _MAX_CONTENT_CHARS:
            raw_content = raw_content[:_MAX_CONTENT_CHARS] + "..."
        content = html.escape(raw_content)

        parts.append(f"""
        <div class="news-card {sentiment_class}" style="margin-bottom: 20px; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 10px;">
                <h3 style="margin: 0; font-size: 18px; color: #333; flex: 1; margin-right: 10px;">
                    <a href="{url}" target="_blank" style="color: #333; text-decoration: none;">{title}</a>
                </h3>
                {sentiment_badge}
            </div>
            
            <div style="margin-bottom: 10px; color: #666; font-size: 14px;">
                <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 12px; margin-right: 8px;">{category_name}</span>
                <span>{source}</span>
                <span style="margin-left: 8px;">📅 {formatted_date}</span>
            </div>
            
            <p style="margin: 10px 0; color: #555; line-height: 1.6;">{content}</p>
            
            <div style="margin-top: 10px; text-align: right;">
                <a href="{url}" target="_blank" style="color: #007bff; text-decoration: none; font-size: 14px;">閱讀全文 →</a>
            </div>
        </div>
        """)

    parts.append("</div>")

    return "".join(parts)

def clean_text(text: str) -> str:
    """Strip markup and unwanted symbols from *text* and normalize spacing.

    Steps, in order:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Remove every character that is not a CJK ideograph, a word
         character, whitespace, or one of ``.,!?()`` / ``（），。！？``.
      3. Collapse each run of whitespace into a single space and trim the
         ends.

    Whitespace is collapsed last so that deleting a symbol surrounded by
    spaces (e.g. ``"a @ b"``) does not leave a double space behind.

    Args:
        text: Raw text; a falsy value (``None`` or ``""``) yields ``""``.

    Returns:
        The cleaned text.
    """
    if not text:
        return ""

    # Drop HTML tags.
    text = re.sub(r'<[^>]+>', '', text)

    # Drop characters outside the allowed set.
    text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()（），。！？]', '', text)

    # Collapse whitespace only after all removals, so no gaps are left over.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def calculate_similarity(text1: str, text2: str) -> float:
    """Return the Jaccard similarity of the two texts' word sets.

    Each text is lower-cased and split on whitespace; the score is the
    number of shared distinct words divided by the number of distinct words
    overall. The result is ``0.0`` when either text is empty or contains no
    words.
    """
    if not (text1 and text2):
        return 0.0

    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())
    if not (tokens_a and tokens_b):
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)

# Compiled once at import time instead of on every call.
_URL_PATTERN = re.compile(
    r'^https?://'  # http:// or https://
    # Domain name: dot-separated labels followed by an alphabetic TLD.
    # DNS labels may be up to 63 characters, so TLDs longer than six
    # letters (e.g. ".technology") are accepted.
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or IPv4 address
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def validate_url(url: str) -> bool:
    """Check whether *url* looks like a well-formed HTTP(S) URL.

    The host may be a domain name, ``localhost`` or a dotted IPv4 address,
    optionally followed by a port and a path/query that contains no
    whitespace. Matching is case-insensitive.

    Args:
        url: Candidate URL. Non-string values (including ``None``) are
            treated as invalid rather than raising.

    Returns:
        ``True`` if the URL matches the pattern, ``False`` otherwise.
    """
    if not isinstance(url, str):
        return False

    return _URL_PATTERN.match(url) is not None

def format_duration(seconds: float) -> str:
    """Render a duration given in seconds with one decimal place.

    Values below 60 are shown in seconds, values below 3600 in minutes, and
    everything else in hours.
    """
    if seconds < 60:
        amount, unit = seconds, "秒"
    elif seconds < 3600:
        amount, unit = seconds / 60, "分鐘"
    else:
        amount, unit = seconds / 3600, "小時"

    return f"{amount:.1f}{unit}"

def truncate_text(text: str, max_length: int = 100) -> str:
    """Shorten *text* to roughly ``max_length`` characters, ending with "...".

    Text that already fits is returned unchanged, and a falsy value yields
    ``""``. Otherwise the first ``max_length`` characters are taken, anything
    after the last space in that prefix is dropped (the whole prefix is kept
    when it contains no space), and ``"..."`` is appended.
    """
    if not text:
        return ""

    if len(text) > max_length:
        prefix = text[:max_length]
        head, separator, _ = prefix.rpartition(' ')
        # rpartition yields an empty separator when no space was found.
        kept = head if separator else prefix
        return kept + "..."

    return text