Spaces:
Sleeping
Sleeping
kickStart
Browse files- app.py +184 -0
- crawler.py +317 -0
- database.py +296 -0
- requirements.txt +30 -0
- scheduler.py +167 -0
- sentiment_analyzer.py +192 -0
- utils.py +157 -0
app.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import sqlite3
|
| 4 |
+
import logging
|
| 5 |
+
import asyncio
|
| 6 |
+
import threading
|
| 7 |
+
import time
|
| 8 |
+
from datetime import datetime, timedelta
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
from crawler import CnYesNewsCrawler
|
| 13 |
+
from sentiment_analyzer import SentimentAnalyzer
|
| 14 |
+
from database import NewsDatabase
|
| 15 |
+
from scheduler import NewsScheduler
|
| 16 |
+
from utils import setup_logging, format_news_for_display
|
| 17 |
+
|
| 18 |
+
# Configure logging once at module import so all components share the setup.
setup_logging()
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
class NewsApp:
    """Application facade wiring database, crawler, sentiment analyzer and
    background scheduler together for the Gradio UI.

    Every public method returns a user-facing string (data, empty-state or
    error message) so the UI layer never has to handle exceptions itself.
    """

    def __init__(self):
        # Wire up the collaborators; the scheduler owns the periodic crawl loop.
        self.db = NewsDatabase()
        self.crawler = CnYesNewsCrawler()
        self.sentiment_analyzer = SentimentAnalyzer()
        self.scheduler = NewsScheduler(self.db, self.crawler, self.sentiment_analyzer)

        # Start the background scheduler immediately.
        self.scheduler.start()

        logger.info("新聞應用程式初始化完成")

    def get_latest_news(self, category: str = "all", limit: int = 50) -> str:
        """Fetch recent news from the database and render it for display.

        Args:
            category: "all", or a specific category key understood by the DB.
            limit: maximum number of articles to render.
        """
        try:
            records = self.db.get_recent_news(category=category, limit=limit)
            return (
                "📰 暫無新聞資料,請稍後再試"
                if not records
                else format_news_for_display(records)
            )
        except Exception as exc:
            logger.error(f"獲取新聞時發生錯誤: {exc}")
            return f"❌ 獲取新聞時發生錯誤: {str(exc)}"

    def manual_crawl(self) -> str:
        """Run one crawl cycle immediately and return a status message."""
        try:
            logger.info("手動觸發爬蟲開始")
            outcome = self.scheduler.run_crawl_task()
            return f"✅ 手動爬蟲完成: {outcome}"
        except Exception as exc:
            logger.error(f"手動爬蟲錯誤: {exc}")
            return f"❌ 手動爬蟲失敗: {str(exc)}"

    def get_statistics(self) -> str:
        """Render the aggregate news statistics as a Markdown snippet."""
        try:
            summary = self.db.get_statistics()
            return f"""
📊 **新聞統計**
- 總新聞數量: {summary.get('total_news', 0)}
- 美股新聞: {summary.get('us_stock_count', 0)}
- 台股新聞: {summary.get('tw_stock_count', 0)}
- 正面新聞: {summary.get('positive_count', 0)}
- 負面新聞: {summary.get('negative_count', 0)}
- 中性新聞: {summary.get('neutral_count', 0)}
- 最後更新: {summary.get('last_update', 'N/A')}
"""
        except Exception as exc:
            logger.error(f"獲取統計資訊錯誤: {exc}")
            return f"❌ 獲取統計資訊失敗: {str(exc)}"
|
| 74 |
+
|
| 75 |
+
# Module-level application instance; constructing it also starts the
# background scheduler (see NewsApp.__init__).
app = NewsApp()
|
| 77 |
+
|
| 78 |
+
# 創建 Gradio 介面
|
| 79 |
+
def create_interface():
    """Build and return the Gradio Blocks UI.

    Three tabs: latest news (category filter + refresh / manual-crawl
    controls), aggregate statistics, and a static about page.  The caller
    is responsible for launching the returned Blocks object.
    """
    with gr.Blocks(
        title="📈 股市新聞情緒分析器",
        theme=gr.themes.Soft(),
        css="""
        .news-positive { background: linear-gradient(90deg, #d4edda 0%, #c3e6cb 100%); border-left: 4px solid #28a745; }
        .news-negative { background: linear-gradient(90deg, #f8d7da 0%, #f5c6cb 100%); border-left: 4px solid #dc3545; }
        .news-neutral { background: linear-gradient(90deg, #e2e3e5 0%, #d6d8db 100%); border-left: 4px solid #6c757d; }
        .news-card { margin: 10px 0; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        .sentiment-badge { padding: 4px 8px; border-radius: 12px; font-size: 12px; font-weight: bold; }
        .positive-badge { background: #28a745; color: white; }
        .negative-badge { background: #dc3545; color: white; }
        .neutral-badge { background: #6c757d; color: white; }
        """
    ) as interface:

        gr.Markdown("""
# 📈 股市新聞情緒分析器

🤖 自動爬取鉅亨網美股和台股新聞,並進行中文情緒分析

⏰ **自動更新**: 每30分鐘自動爬取最新新聞
🎯 **智能分析**: 使用 RoBERTa 模型進行情緒分析
🔄 **去重處理**: 自動過濾重複新聞
📅 **資料保留**: 保存兩週內的新聞資料
""")

        with gr.Tab("📰 最新新聞"):
            with gr.Row():
                with gr.Column(scale=3):
                    category_radio = gr.Radio(
                        choices=["all", "us_stock", "tw_stock"],
                        value="all",
                        label="新聞分類",
                        info="選擇要顯示的新聞類型"
                    )
                with gr.Column(scale=1):
                    refresh_btn = gr.Button("🔄 重新整理", variant="primary")
                    manual_crawl_btn = gr.Button("🚀 手動爬取", variant="secondary")

            news_display = gr.HTML(label="新聞內容")
            # BUG FIX: the original passed gr.Textbox(label="爬取結果") directly
            # inside `outputs=` of the click binding.  A component constructed
            # there is never placed in the layout, so the manual-crawl result
            # was invisible.  Create it in the layout and reference it instead.
            crawl_result = gr.Textbox(label="爬取結果")

            def auto_refresh():
                # Initial page load shows every category.
                return app.get_latest_news("all")

            def refresh_news(category):
                # Re-render the list whenever the filter changes.
                return app.get_latest_news(category)

            # Bind UI events.
            refresh_btn.click(refresh_news, inputs=[category_radio], outputs=[news_display])
            manual_crawl_btn.click(app.manual_crawl, outputs=[crawl_result])
            category_radio.change(refresh_news, inputs=[category_radio], outputs=[news_display])

            # Populate the list on initial page load.
            interface.load(auto_refresh, outputs=[news_display])

        with gr.Tab("📊 統計資訊"):
            stats_display = gr.Markdown()
            stats_refresh_btn = gr.Button("🔄 更新統計")

            stats_refresh_btn.click(app.get_statistics, outputs=[stats_display])
            interface.load(app.get_statistics, outputs=[stats_display])

        with gr.Tab("ℹ️ 關於"):
            gr.Markdown("""
## 🛠️ 技術特色

### 📊 情緒分析
- **模型**: `uer/roberta-base-finetuned-jd-binary-chinese`
- **分類**: 正面 (綠色) / 負面 (紅色) / 中性 (灰色)
- **準確性**: 針對中文金融新聞優化

### 🕷️ 新聞爬蟲
- **來源**: 鉅亨網 (cnyes.com)
- **分類**: 美股、台股新聞
- **頻率**: 每30分鐘自動更新
- **去重**: 基於標題相似度智能去重

### 💾 資料管理
- **儲存**: SQLite 本地資料庫
- **保留期**: 自動清理兩週前的資料
- **效能**: 索引優化,快速查詢

### 🔧 系統功能
- **反爬蟲**: 隨機延遲、User-Agent 輪換
- **錯誤處理**: 完整的異常捕獲和日誌記錄
- **監控**: 即時統計和狀態監控

---

💡 **提示**: 首次啟動可能需要幾分鐘下載模型和初始化資料庫
""")

    return interface
|
| 174 |
+
|
| 175 |
+
# 啟動應用
|
| 176 |
+
# Script entry point: build the UI and serve it on all interfaces.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        quiet=False,
    )
|
crawler.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import cloudscraper
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
import time
|
| 5 |
+
import random
|
| 6 |
+
import logging
|
| 7 |
+
import re
|
| 8 |
+
from datetime import datetime, timedelta
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
from urllib.parse import urljoin, urlparse
|
| 11 |
+
from fake_useragent import UserAgent
|
| 12 |
+
import json
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
@dataclass
class NewsItem:
    """A single crawled news article, optionally enriched with sentiment."""
    title: str                                # headline text
    content: str                              # body text (crawler truncates to 2000 chars)
    url: str                                  # canonical article URL
    source: str                               # publisher name, e.g. '鉅亨網'
    category: str                             # crawl category key: 'us_stock' or 'tw_stock'
    published_date: datetime                  # publication time, naive datetime (tzinfo stripped by crawler)
    sentiment: Optional[str] = None           # sentiment label once analyzed — presumably 'positive'/'negative'/'neutral' (matches DB stats); confirm with analyzer
    sentiment_score: Optional[float] = None   # analyzer score — assumed confidence in [0, 1]; TODO confirm
|
| 28 |
+
|
| 29 |
+
class CnYesNewsCrawler:
    """Scraper for Anue/cnyes (鉅亨網) US-stock and Taiwan-stock news.

    Uses a cloudscraper session (handles Cloudflare JS challenges), rotates
    User-Agent strings per request, and sleeps random intervals between
    requests to stay polite.

    Note: annotations that reference third-party or sibling types are written
    as strings (PEP 563 style) so the class definition itself never depends
    on those names being importable first.
    """

    def __init__(self):
        self.base_url = "https://news.cnyes.com"
        # cloudscraper behaves like a requests.Session but solves CF challenges.
        self.session = cloudscraper.create_scraper()
        self.ua = UserAgent()

        # Category key -> listing-page URL.
        self.categories = {
            'us_stock': 'https://news.cnyes.com/news/cat/us_stock',
            'tw_stock': 'https://news.cnyes.com/news/cat/tw_stock_news'
        }

        self._setup_headers()

    def _setup_headers(self):
        """Install browser-like default headers with a random User-Agent."""
        self.session.headers.update({
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        })

    def _get_page(self, url: str, retries: int = 3) -> "Optional[BeautifulSoup]":
        """GET *url* and return parsed HTML, or None after *retries* failures.

        Sleeps 2-5 s before every attempt and rotates the User-Agent.
        """
        for attempt in range(retries):
            try:
                # Random delay (anti-bot politeness).
                time.sleep(random.uniform(2, 5))

                # Rotate the User-Agent for this request.
                self.session.headers['User-Agent'] = self.ua.random

                response = self.session.get(url, timeout=30)

                if response.status_code == 200:
                    # NOTE(review): parsing uses response.content (raw bytes),
                    # so BeautifulSoup sniffs the charset itself; this
                    # assignment only affects response.text readers.
                    response.encoding = 'utf-8'
                    return BeautifulSoup(response.content, 'html.parser')
                else:
                    logger.warning(f"HTTP {response.status_code} for {url}")

            except Exception as e:
                logger.error(f"請求失敗 (嘗試 {attempt + 1}/{retries}): {e}")
                if attempt < retries - 1:
                    # Longer back-off before retrying.
                    time.sleep(random.uniform(5, 10))

        return None

    def _extract_article_urls(self, category_url: str, max_pages: int = 3) -> "List[str]":
        """Collect article URLs from up to *max_pages* listing pages.

        Returns de-duplicated URLs in discovery order (site listing order).
        """
        article_urls = []

        for page in range(1, max_pages + 1):
            try:
                url = category_url if page == 1 else f"{category_url}?page={page}"

                logger.info(f"爬取分類頁面: {url}")
                soup = self._get_page(url)

                if not soup:
                    continue

                # Article links look like /news/id/<digits>.
                links = soup.find_all('a', href=re.compile(r'/news/id/\d+'))
                page_urls = []

                for link in links:
                    href = link.get('href')
                    if href:
                        full_url = urljoin(self.base_url, href)
                        if full_url not in page_urls:
                            page_urls.append(full_url)

                article_urls.extend(page_urls)
                logger.info(f"第 {page} 頁找到 {len(page_urls)} 篇文章")

                # An empty page means we ran past the last listing page.
                if not page_urls:
                    break

            except Exception as e:
                logger.error(f"爬取第 {page} 頁時發生錯誤: {e}")
                continue

        # FIX: the original returned list(set(article_urls)), which scrambles
        # ordering, so the max_articles cut in crawl_category kept an
        # arbitrary subset.  dict.fromkeys de-dupes while preserving order.
        return list(dict.fromkeys(article_urls))

    def _extract_article_content(self, url: str, category: str) -> "Optional[NewsItem]":
        """Fetch one article page and extract title/body into a NewsItem.

        Returns None when the page cannot be fetched, has no usable title, or
        the extracted body is shorter than 50 characters.
        """
        try:
            soup = self._get_page(url)
            if not soup:
                return None

            # Try increasingly generic selectors for the headline.
            title_selectors = [
                'h1.news-title',
                'h1[class*="title"]',
                '.article-header h1',
                'h1'
            ]

            title = ""
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem:
                    title = title_elem.get_text(strip=True)
                    # Reject degenerate matches (e.g. site chrome).
                    if title and len(title) > 5:
                        break

            if not title:
                logger.warning(f"無法提取標題: {url}")
                return None

            # Try increasingly generic selectors for the article body.
            content_selectors = [
                '.news-content',
                '.article-content',
                '.content-body',
                '[class*="article-text"]'
            ]

            content = ""
            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    # Strip scripts, styles and ad containers before reading text.
                    for unwanted in content_elem.select('script, style, .ad, .advertisement'):
                        unwanted.decompose()

                    paragraphs = content_elem.find_all(['p', 'div'])
                    content_parts = []
                    for p in paragraphs:
                        text = p.get_text(strip=True)
                        # Skip trivially short fragments (captions, labels).
                        if text and len(text) > 10:
                            content_parts.append(text)

                    content = '\n'.join(content_parts)
                    if content:
                        break

            if not content or len(content) < 50:
                logger.warning(f"內容太短或無法提取: {url}")
                return None

            published_date = self._extract_publish_date(soup)

            news_item = NewsItem(
                title=title,
                content=content[:2000],  # cap stored body length
                url=url,
                source='鉅亨網',
                category=category,
                published_date=published_date
            )

            logger.info(f"成功提取文章: {title[:50]}...")
            return news_item

        except Exception as e:
            logger.error(f"提取文章內容時發生錯誤 {url}: {e}")
            return None

    def _extract_publish_date(self, soup: "BeautifulSoup") -> datetime:
        """Best-effort extraction of the publish time; falls back to now()."""
        time_selectors = [
            'time[datetime]',
            '.publish-time',
            '.news-time',
            '[class*="time"]'
        ]

        for selector in time_selectors:
            time_elem = soup.select_one(selector)
            if time_elem:
                datetime_attr = time_elem.get('datetime')
                if datetime_attr:
                    try:
                        # Normalise a trailing 'Z', then drop tzinfo so the
                        # rest of the pipeline stays naive-datetime only.
                        return datetime.fromisoformat(datetime_attr.replace('Z', '+00:00')).replace(tzinfo=None)
                    except ValueError:
                        # FIX: was a bare `except:` (also swallowed
                        # KeyboardInterrupt/SystemExit); only a malformed
                        # timestamp is expected here.
                        pass

                time_text = time_elem.get_text(strip=True)
                parsed_time = self._parse_time_text(time_text)
                if parsed_time:
                    return parsed_time

        # No usable timestamp found anywhere on the page.
        return datetime.now()

    def _parse_time_text(self, time_text: str) -> "Optional[datetime]":
        """Parse loose date/time strings like '2024-01-02 13:45' to datetime.

        Patterns are tried most-specific first; returns None on no match.
        """
        patterns = [
            r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
            r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2})',
            r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
            r'(\d{4})-(\d{2})-(\d{2})'
        ]

        for pattern in patterns:
            match = re.search(pattern, time_text)
            if match:
                try:
                    groups = match.groups()
                    if len(groups) >= 6:
                        return datetime(int(groups[0]), int(groups[1]), int(groups[2]),
                                        int(groups[3]), int(groups[4]), int(groups[5]))
                    elif len(groups) >= 5:
                        return datetime(int(groups[0]), int(groups[1]), int(groups[2]),
                                        int(groups[3]), int(groups[4]))
                    else:
                        return datetime(int(groups[0]), int(groups[1]), int(groups[2]))
                except ValueError:
                    # FIX: was a bare `except:`; datetime() raises ValueError
                    # for out-of-range fields (e.g. month 13) — try next pattern.
                    continue

        return None

    def crawl_category(self, category: str, max_articles: int = 20) -> "List[NewsItem]":
        """Crawl one category: collect listing URLs, then fetch each article.

        Returns up to *max_articles* NewsItem objects (newest first, since
        URL order is preserved).
        """
        if category not in self.categories:
            logger.error(f"無效的分類: {category}")
            return []

        logger.info(f"開始爬取 {category} 分類新聞")

        category_url = self.categories[category]
        article_urls = self._extract_article_urls(category_url)

        if not article_urls:
            logger.warning(f"未找到 {category} 分類的文章URL")
            return []

        # Respect the per-category article budget.
        if len(article_urls) > max_articles:
            article_urls = article_urls[:max_articles]

        articles = []
        for i, url in enumerate(article_urls, 1):
            try:
                logger.info(f"處理文章 {i}/{len(article_urls)}: {url}")
                article = self._extract_article_content(url, category)
                if article:
                    articles.append(article)

                # Random delay between article fetches.
                time.sleep(random.uniform(3, 8))

            except Exception as e:
                logger.error(f"處理文章時發生錯誤 {url}: {e}")
                continue

        logger.info(f"{category} 分類爬取完成,共 {len(articles)} 篇文章")
        return articles

    def crawl_all_categories(self, max_articles_per_category: int = 15) -> "Dict[str, List[NewsItem]]":
        """Crawl every configured category; a failing category yields []."""
        results = {}

        for category in self.categories.keys():
            try:
                logger.info(f"開始爬取 {category} 分類")
                articles = self.crawl_category(category, max_articles_per_category)
                results[category] = articles

                # Longer pause between categories.
                time.sleep(random.uniform(10, 20))

            except Exception as e:
                logger.error(f"爬取 {category} 分類時發生錯誤: {e}")
                results[category] = []

        total_articles = sum(len(articles) for articles in results.values())
        logger.info(f"所有分類爬取完成,總共 {total_articles} 篇文章")

        return results
|
database.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sqlite3
|
| 2 |
+
import logging
|
| 3 |
+
import json
|
| 4 |
+
from datetime import datetime, timedelta
|
| 5 |
+
from typing import List, Dict, Optional, Tuple
|
| 6 |
+
import threading
|
| 7 |
+
from contextlib import contextmanager
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
class NewsDatabase:
    """SQLite-backed store for crawled news articles and crawl statistics.

    A process-wide ``threading.Lock`` serialises all writers so the
    scheduler thread and manual-crawl requests do not collide.  Every
    operation opens a short-lived connection via ``_get_connection``.
    """

    def __init__(self, db_path: str = "news.db"):
        """Create the store and ensure the schema exists.

        Args:
            db_path: path of the SQLite database file.
        """
        self.db_path = db_path
        # Serialises write transactions issued from multiple threads.
        self.lock = threading.Lock()

        self._init_database()

    def _init_database(self):
        """Create tables and indexes if they do not already exist."""
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                # Main article table; url is UNIQUE so INSERT OR IGNORE
                # silently drops re-crawled duplicates.
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS news (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        title TEXT NOT NULL,
                        content TEXT NOT NULL,
                        url TEXT UNIQUE NOT NULL,
                        source TEXT NOT NULL,
                        category TEXT NOT NULL,
                        published_date DATETIME NOT NULL,
                        created_date DATETIME DEFAULT CURRENT_TIMESTAMP,
                        sentiment TEXT,
                        sentiment_score REAL,
                        sentiment_method TEXT
                    )
                """)

                # Indexes backing the filters/sorts used below.
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_url ON news(url)")
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_category ON news(category)")
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_published_date ON news(published_date)")
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_sentiment ON news(sentiment)")

                # Per-run crawl bookkeeping (counts + timing).
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS crawl_stats (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        crawl_date DATETIME DEFAULT CURRENT_TIMESTAMP,
                        category TEXT NOT NULL,
                        articles_count INTEGER NOT NULL,
                        success_count INTEGER NOT NULL,
                        error_count INTEGER NOT NULL,
                        execution_time REAL
                    )
                """)

                conn.commit()
                logger.info("資料庫初始化完成")

        except Exception as e:
            logger.error(f"資料庫初始化錯誤: {e}")
            raise

    @contextmanager
    def _get_connection(self):
        """Yield a fresh connection; rollback on error, always close.

        Rows are returned as sqlite3.Row so callers can use column names.
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path, timeout=30.0)
            conn.row_factory = sqlite3.Row  # name-addressable rows
            yield conn
        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"資料庫連接錯誤: {e}")
            raise
        finally:
            if conn:
                conn.close()

    def insert_news(self, news_items: List[Dict]) -> Tuple[int, int]:
        """Insert articles, skipping URL duplicates.

        Args:
            news_items: dicts with the news-table column names as keys.
        Returns:
            (inserted_count, duplicate_count).
        """
        inserted_count = 0
        duplicate_count = 0

        try:
            with self.lock:
                with self._get_connection() as conn:
                    cursor = conn.cursor()

                    for item in news_items:
                        try:
                            # INSERT OR IGNORE + UNIQUE(url): duplicates leave
                            # rowcount at 0 instead of raising.
                            cursor.execute("""
                                INSERT OR IGNORE INTO news
                                (title, content, url, source, category, published_date,
                                 sentiment, sentiment_score, sentiment_method)
                                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                            """, (
                                item.get('title'),
                                item.get('content'),
                                item.get('url'),
                                item.get('source'),
                                item.get('category'),
                                item.get('published_date'),
                                item.get('sentiment'),
                                item.get('sentiment_score'),
                                item.get('sentiment_method')
                            ))

                            if cursor.rowcount > 0:
                                inserted_count += 1
                            else:
                                duplicate_count += 1

                        except Exception as e:
                            # One bad item must not abort the whole batch.
                            logger.error(f"插入新聞時發生錯誤: {e}")
                            continue

                    conn.commit()

        except Exception as e:
            logger.error(f"批量插入新聞錯誤: {e}")
            raise

        logger.info(f"插入新聞完成 - 新增: {inserted_count}, 重複: {duplicate_count}")
        return inserted_count, duplicate_count

    def get_recent_news(self, category: str = "all", limit: int = 50, days: int = 7) -> List[Dict]:
        """Return up to *limit* articles published within the last *days* days.

        Results are newest-first dicts; 'published_date' is parsed back into
        a datetime.  Returns [] on any error.
        """
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                # WHERE clause is assembled only from literal fragments here;
                # all user-influenced values go through `?` placeholders.
                where_clause = "WHERE published_date >= ?"
                params = [datetime.now() - timedelta(days=days)]

                if category != "all":
                    where_clause += " AND category = ?"
                    params.append(category)

                query = f"""
                    SELECT * FROM news
                    {where_clause}
                    ORDER BY published_date DESC
                    LIMIT ?
                """
                params.append(limit)

                cursor.execute(query, params)
                rows = cursor.fetchall()

                news_list = []
                for row in rows:
                    news_dict = dict(row)
                    # SQLite stored the datetime as its ISO string; parse back.
                    if news_dict['published_date']:
                        news_dict['published_date'] = datetime.fromisoformat(news_dict['published_date'])
                    news_list.append(news_dict)

                return news_list

        except Exception as e:
            logger.error(f"獲取新聞錯誤: {e}")
            return []

    def get_statistics(self) -> Dict:
        """Aggregate counts by category and sentiment, plus last update time.

        Returns {} on any error.
        """
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                cursor.execute("SELECT COUNT(*) as total FROM news")
                total_news = cursor.fetchone()['total']

                cursor.execute("""
                    SELECT category, COUNT(*) as count
                    FROM news
                    GROUP BY category
                """)
                category_stats = {row['category']: row['count'] for row in cursor.fetchall()}

                # Rows without a sentiment label are excluded on purpose.
                cursor.execute("""
                    SELECT sentiment, COUNT(*) as count
                    FROM news
                    WHERE sentiment IS NOT NULL
                    GROUP BY sentiment
                """)
                sentiment_stats = {row['sentiment']: row['count'] for row in cursor.fetchall()}

                cursor.execute("SELECT MAX(created_date) as last_update FROM news")
                last_update = cursor.fetchone()['last_update']

                return {
                    'total_news': total_news,
                    'us_stock_count': category_stats.get('us_stock', 0),
                    'tw_stock_count': category_stats.get('tw_stock', 0),
                    'positive_count': sentiment_stats.get('positive', 0),
                    'negative_count': sentiment_stats.get('negative', 0),
                    'neutral_count': sentiment_stats.get('neutral', 0),
                    'last_update': last_update
                }

        except Exception as e:
            logger.error(f"獲取統計資訊錯誤: {e}")
            return {}

    def cleanup_old_news(self, days: int = 14) -> int:
        """Delete articles published more than *days* days ago.

        Returns the number of rows deleted (0 on error).
        """
        try:
            cutoff_date = datetime.now() - timedelta(days=days)

            with self.lock:
                with self._get_connection() as conn:
                    cursor = conn.cursor()

                    cursor.execute("""
                        DELETE FROM news
                        WHERE published_date < ?
                    """, (cutoff_date,))

                    deleted_count = cursor.rowcount
                    conn.commit()

                    logger.info(f"清理了 {deleted_count} 條超過 {days} 天的新聞")
                    return deleted_count

        except Exception as e:
            logger.error(f"清理舊新聞錯誤: {e}")
            return 0

    def record_crawl_stats(self, category: str, articles_count: int,
                           success_count: int, error_count: int, execution_time: float):
        """Append one crawl-run record to crawl_stats.

        FIX: now takes ``self.lock`` like every other writer; the original
        wrote without it and could interleave with scheduler-thread inserts.
        """
        try:
            with self.lock:
                with self._get_connection() as conn:
                    cursor = conn.cursor()

                    cursor.execute("""
                        INSERT INTO crawl_stats
                        (category, articles_count, success_count, error_count, execution_time)
                        VALUES (?, ?, ?, ?, ?)
                    """, (category, articles_count, success_count, error_count, execution_time))

                    conn.commit()

        except Exception as e:
            logger.error(f"記錄爬蟲統計錯誤: {e}")

    def check_duplicate_by_title(self, title: str, similarity_threshold: float = 0.8) -> bool:
        """Jaccard-similarity check of *title* against the last 24h of titles.

        Returns True when any stored title exceeds the threshold; False on
        error (fail-open so a check failure never blocks an insert).

        NOTE(review): tokens are whitespace-split, so Chinese titles (which
        rarely contain spaces) collapse to a single token and similarity is
        effectively all-or-nothing — consider jieba segmentation here.
        """
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                cursor.execute("""
                    SELECT title FROM news
                    WHERE created_date >= ?
                """, (datetime.now() - timedelta(days=1),))

                existing_titles = [row['title'] for row in cursor.fetchall()]

                title_words = set(title.lower().split())

                for existing_title in existing_titles:
                    existing_words = set(existing_title.lower().split())

                    if len(title_words) == 0 or len(existing_words) == 0:
                        continue

                    intersection = title_words.intersection(existing_words)
                    union = title_words.union(existing_words)

                    similarity = len(intersection) / len(union) if union else 0

                    if similarity > similarity_threshold:
                        return True

                return False

        except Exception as e:
            logger.error(f"檢查標題重複性錯誤: {e}")
            return False
|
requirements.txt
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gradio==4.44.0
torch>=2.0.0
transformers>=4.30.0
datasets>=2.14.0
accelerate>=0.20.0
requests>=2.31.0
beautifulsoup4>=4.12.0
cloudscraper>=1.2.71
pandas>=2.0.0
numpy>=1.24.0
scikit-learn>=1.3.0
# sqlite3 ships with the Python standard library and is not a pip package;
# listing it breaks `pip install -r requirements.txt`.
python-dateutil>=2.8.2
pytz>=2023.3
schedule>=1.2.0
fake-useragent>=1.4.0
selenium>=4.15.0
webdriver-manager>=4.0.0
lxml>=4.9.0
cssselect>=1.2.0
readability-lxml>=0.8.1
feedparser>=6.0.10
nltk>=3.8.1
jieba>=0.42.1
emoji>=2.8.0
python-dotenv>=1.0.0
aiohttp>=3.8.0
# asyncio, threading and logging are standard-library modules — installing
# them from PyPI either fails or pulls in stale/unrelated backport shims.
scheduler.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import schedule
|
| 2 |
+
import threading
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from typing import Dict, List
|
| 7 |
+
from crawler import CnYesNewsCrawler, NewsItem
|
| 8 |
+
from sentiment_analyzer import SentimentAnalyzer
|
| 9 |
+
from database import NewsDatabase
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
class NewsScheduler:
    """News crawling scheduler.

    Runs the crawl → sentiment-analyze → store pipeline every 30 minutes in
    a daemon thread, plus a nightly cleanup of stale rows.  All task errors
    are logged and swallowed so the loop keeps running.
    """

    def __init__(self, database: NewsDatabase, crawler: CnYesNewsCrawler, sentiment_analyzer: SentimentAnalyzer):
        self.db = database
        self.crawler = crawler
        self.sentiment_analyzer = sentiment_analyzer
        self.is_running = False        # guards double-start; signals the loop to exit
        self.scheduler_thread = None   # daemon thread running _run_scheduler

    def start(self):
        """Start the scheduler (idempotent: a second call while running is a no-op)."""
        if self.is_running:
            logger.warning("排程器已經在運行中")
            return

        self.is_running = True

        # BUGFIX: the `schedule` registry is module-global and jobs were
        # appended on every start(), so a stop()/start() cycle ran each task
        # twice.  Clear stale jobs before registering fresh ones.
        schedule.clear()
        schedule.every(30).minutes.do(self._run_crawl_task)          # crawl every 30 minutes
        schedule.every().day.at("02:00").do(self._cleanup_old_news)  # nightly cleanup at 02:00

        # Daemon thread so the scheduler never blocks interpreter shutdown.
        self.scheduler_thread = threading.Thread(target=self._run_scheduler, daemon=True)
        self.scheduler_thread.start()

        logger.info("新聞排程器已啟動 - 每30分鐘自動爬取")

        # Kick off an immediate first crawl in the background.
        threading.Thread(target=self._run_crawl_task, daemon=True).start()

    def stop(self):
        """Stop the scheduler loop and wait briefly for the thread to exit."""
        self.is_running = False
        if self.scheduler_thread:
            self.scheduler_thread.join(timeout=5)
        logger.info("新聞排程器已停止")

    def _run_scheduler(self):
        """Main loop: poll the `schedule` registry once per minute."""
        while self.is_running:
            try:
                schedule.run_pending()
                time.sleep(60)
            except Exception as e:
                logger.error(f"排程器運行錯誤: {e}")
                time.sleep(60)

    def _run_crawl_task(self):
        """Crawl every category, sentiment-score and store the articles.

        Returns:
            A human-readable summary string (also consumed by the UI).
        """
        try:
            start_time = time.time()
            logger.info("開始執行定時爬蟲任務")

            all_news = self.crawler.crawl_all_categories(max_articles_per_category=15)

            total_articles = 0
            total_inserted = 0

            for category, articles in all_news.items():
                if not articles:
                    continue

                analyzed_articles = self._analyze_articles_sentiment(articles)
                db_articles = self._convert_to_db_format(analyzed_articles)
                inserted, duplicates = self.db.insert_news(db_articles)

                total_articles += len(articles)
                total_inserted += inserted

                # NOTE(review): error_count also counts de-duplicated rows,
                # not only real failures — confirm that is intended.
                execution_time = time.time() - start_time
                self.db.record_crawl_stats(
                    category=category,
                    articles_count=len(articles),
                    success_count=inserted,
                    error_count=len(articles) - inserted,
                    execution_time=execution_time
                )

                logger.info(f"{category} 分類: {len(articles)} 篇文章, {inserted} 篇新增")

            execution_time = time.time() - start_time
            logger.info(f"爬蟲任務完成 - 總計: {total_articles} 篇, 新增: {total_inserted} 篇, 耗時: {execution_time:.2f}秒")

            return f"成功爬取 {total_articles} 篇文章,新增 {total_inserted} 篇"

        except Exception as e:
            logger.error(f"爬蟲任務執行錯誤: {e}")
            return f"爬蟲任務失敗: {str(e)}"

    def _analyze_articles_sentiment(self, articles: List[NewsItem]) -> List[NewsItem]:
        """Annotate each NewsItem in place with a sentiment label and score."""
        try:
            logger.info(f"開始分析 {len(articles)} 篇文章的情緒")

            for article in articles:
                sentiment_result = self.sentiment_analyzer.analyze_sentiment(
                    article.content,
                    article.title
                )
                article.sentiment = sentiment_result['sentiment']
                article.sentiment_score = sentiment_result['confidence']

            logger.info("情緒分析完成")
            return articles

        except Exception as e:
            # Best-effort: return articles even if some remain unscored.
            logger.error(f"情緒分析錯誤: {e}")
            return articles

    def _convert_to_db_format(self, articles: List[NewsItem]) -> List[Dict]:
        """Convert NewsItem objects to dicts for NewsDatabase.insert_news,
        skipping titles that look like recent duplicates."""
        db_articles = []

        for article in articles:
            if self.db.check_duplicate_by_title(article.title):
                logger.info(f"跳過重複文章: {article.title[:50]}...")
                continue

            db_articles.append({
                'title': article.title,
                'content': article.content,
                'url': article.url,
                'source': article.source,
                'category': article.category,
                'published_date': article.published_date.isoformat(),
                'sentiment': article.sentiment,
                'sentiment_score': article.sentiment_score,
                'sentiment_method': 'auto'
            })

        return db_articles

    def _cleanup_old_news(self):
        """Delete news older than 14 days."""
        try:
            deleted_count = self.db.cleanup_old_news(days=14)
            logger.info(f"清理任務完成,刪除了 {deleted_count} 條舊新聞")
        except Exception as e:
            logger.error(f"清理舊新聞錯誤: {e}")

    def run_crawl_task(self):
        """Manually trigger one crawl run (used by the UI button)."""
        return self._run_crawl_task()
sentiment_analyzer.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
| 3 |
+
import logging
|
| 4 |
+
import re
|
| 5 |
+
from typing import Dict, Tuple, Optional
|
| 6 |
+
import jieba
|
| 7 |
+
import emoji
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
class SentimentAnalyzer:
    """Chinese financial-news sentiment analyzer.

    Primary path is a fine-tuned transformer classifier; a keyword lexicon
    heuristic serves both as a fallback (model unavailable or erroring) and
    as a tiebreaker when the model's confidence is low.
    """

    def __init__(self, model_name: str = "uer/roberta-base-finetuned-jd-binary-chinese"):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.classifier = None  # stays None when loading fails → keyword fallback
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self._load_model()

        # Keyword lexicons for the heuristic scorer.
        self.positive_keywords = {
            '上漲', '漲', '漲幅', '上升', '增長', '成長', '利好', '利多', '買進', '看好',
            '樂觀', '獲利', '盈利', '突破', '新高', '強勢', '回升', '反彈', '看漲',
            '推薦', '買入', '增持', '超買', '牛市', '多頭', '正面', '積極'
        }

        # FIX: the original set literal listed '賣出' twice (harmless in a
        # set, but misleading) — deduplicated.
        self.negative_keywords = {
            '下跌', '跌', '跌幅', '下滑', '下降', '減少', '衰退', '利空', '賣出', '看壞',
            '悲觀', '虧損', '損失', '破底', '新低', '弱勢', '下探', '重挫', '看跌',
            '減持', '超賣', '熊市', '空頭', '負面', '消極', '警告'
        }

    def _load_model(self):
        """Load the pretrained classifier; on failure leave classifier=None."""
        try:
            logger.info(f"載入情緒分析模型: {self.model_name}")

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)

            # NOTE(review): return_all_scores is deprecated in newer
            # transformers releases (top_k=None is the replacement) — kept
            # because the code below expects the nested list-of-scores shape.
            self.classifier = pipeline(
                "text-classification",
                model=self.model,
                tokenizer=self.tokenizer,
                device=0 if self.device == "cuda" else -1,
                return_all_scores=True
            )

            logger.info("情緒分析模型載入成功")

        except Exception as e:
            logger.error(f"載入模型時發生錯誤: {e}")
            self.classifier = None

    def _preprocess_text(self, text: str) -> str:
        """Normalise text: strip emoji and stray symbols, collapse spaces, cap length."""
        try:
            # Turn emoji into :name: placeholders, then remove them.
            text = emoji.demojize(text, language='zh')
            text = re.sub(r':[a-zA-Z_]+:', '', text)

            # Keep CJK ideographs, word characters and common punctuation.
            text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?]', '', text)

            text = re.sub(r'\s+', ' ', text).strip()

            # BERT-style models cap the sequence length; truncate defensively.
            if len(text) > 500:
                text = text[:500]

            return text

        except Exception as e:
            logger.error(f"文本預處理錯誤: {e}")
            return text

    def _keyword_sentiment(self, text: str) -> Tuple[str, float]:
        """Heuristic polarity from lexicon hit counts.

        Returns (label, score): score above 0.7 for positive, below 0.3 for
        negative, exactly 0.5 for neutral or no hits.
        """
        positive_count = sum(1 for keyword in self.positive_keywords if keyword in text)
        negative_count = sum(1 for keyword in self.negative_keywords if keyword in text)

        total_keywords = positive_count + negative_count

        if total_keywords == 0:
            return "neutral", 0.5

        positive_ratio = positive_count / total_keywords

        if positive_ratio > 0.6:
            return "positive", 0.7 + (positive_ratio - 0.6) * 0.75
        elif positive_ratio < 0.4:
            # NOTE(review): this maps strong negatives toward 0.0, i.e. the
            # value behaves like a polarity score rather than a confidence —
            # confirm downstream consumers expect that.
            return "negative", 0.3 - (0.4 - positive_ratio) * 0.75
        else:
            return "neutral", 0.5

    def analyze_sentiment(self, text: str, title: str = "") -> Dict[str, object]:
        """Classify *text* (optionally prefixed by *title*).

        Returns:
            Dict with 'sentiment' ('positive'/'negative'/'neutral'),
            'confidence' (float) and 'method' ('model'/'hybrid'/'keyword'/
            'default'/'error').  Never raises: failures degrade to the
            keyword heuristic or a neutral default.
        """
        # FIX: annotation was Dict[str, any] — `any` is the builtin, not a type.
        try:
            full_text = f"{title} {text}" if title else text
            processed_text = self._preprocess_text(full_text)

            if not processed_text:
                return {
                    "sentiment": "neutral",
                    "confidence": 0.5,
                    "method": "default"
                }

            if self.classifier:
                try:
                    results = self.classifier(processed_text)

                    if results and len(results) > 0:
                        scores = results[0]
                        best_result = max(scores, key=lambda x: x['score'])

                        # Map model labels onto our canonical labels.
                        label_mapping = {
                            'LABEL_0': 'negative',
                            'LABEL_1': 'positive',
                            'negative': 'negative',
                            'positive': 'positive'
                        }

                        sentiment = label_mapping.get(best_result['label'], 'neutral')
                        confidence = best_result['score']

                        # Low model confidence: let the keyword heuristic
                        # override when it is more decisive.
                        if confidence < 0.7:
                            keyword_sentiment, keyword_confidence = self._keyword_sentiment(processed_text)

                            if abs(confidence - 0.5) < abs(keyword_confidence - 0.5):
                                sentiment = keyword_sentiment
                                confidence = (confidence + keyword_confidence) / 2
                                method = "hybrid"
                            else:
                                method = "model"
                        else:
                            method = "model"

                        return {
                            "sentiment": sentiment,
                            "confidence": confidence,
                            "method": method
                        }

                except Exception as e:
                    logger.error(f"模型分析錯誤: {e}")

            # Fallback: pure keyword heuristic.
            sentiment, confidence = self._keyword_sentiment(processed_text)
            return {
                "sentiment": sentiment,
                "confidence": confidence,
                "method": "keyword"
            }

        except Exception as e:
            logger.error(f"情緒分析錯誤: {e}")
            return {
                "sentiment": "neutral",
                "confidence": 0.5,
                "method": "error"
            }

    def batch_analyze(self, texts: list, titles: list = None) -> list:
        """Analyze a list of texts; titles (optional) are matched by index."""
        results = []
        titles = titles or [""] * len(texts)

        for i, text in enumerate(texts):
            title = titles[i] if i < len(titles) else ""
            results.append(self.analyze_sentiment(text, title))

            # Periodically release cached GPU memory on long batches.
            # (FIX: was a conditional expression used as a statement.)
            if i % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        return results
utils.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
import html
|
| 6 |
+
|
| 7 |
+
def setup_logging():
    """Configure root logging: INFO level, console plus UTF-8 file output."""
    # NOTE(review): FileHandler writes news_app.log into the working
    # directory — confirm it is writable in the deployment environment.
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    log_handlers = [
        logging.StreamHandler(),
        logging.FileHandler('news_app.log', encoding='utf-8'),
    ]
    logging.basicConfig(level=logging.INFO, format=log_format, handlers=log_handlers)
def format_news_for_display(news_data: List[Dict]) -> str:
    """Render a list of news dicts as an HTML feed of styled cards.

    Each card shows a sentiment badge, category tag, source, date, a body
    truncated to 300 characters, and a link to the full article.

    SECURITY FIX: every remote-controlled field is HTML-escaped, including
    the URL — previously `url` was interpolated unescaped into `href="..."`
    attributes and could break out of the attribute.
    PERF: cards are collected in a list and joined once instead of the
    original quadratic `html_content +=` loop.

    Args:
        news_data: Dicts with optional keys title/content/url/source/
            category/published_date/sentiment.

    Returns:
        HTML string, or a placeholder message when the list is empty.
    """
    if not news_data:
        return "📰 暫無新聞資料"

    parts = ["""
    <div style="max-width: 100%; font-family: Arial, sans-serif;">
    """]

    sentiment_badges = {
        'positive': '<span class="sentiment-badge positive-badge">正面 😊</span>',
        'negative': '<span class="sentiment-badge negative-badge">負面 😔</span>',
        'neutral': '<span class="sentiment-badge neutral-badge">中性 😐</span>'
    }

    for news in news_data:
        sentiment = news.get('sentiment', 'neutral')
        sentiment_class = f"news-{sentiment}"
        sentiment_badge = sentiment_badges.get(sentiment, sentiment_badges['neutral'])

        # Normalise the publication timestamp for display.
        published_date = news.get('published_date', '')
        if isinstance(published_date, str):
            try:
                dt = datetime.fromisoformat(published_date.replace('Z', '+00:00'))
                formatted_date = dt.strftime('%Y-%m-%d %H:%M')
            except ValueError:  # was a bare except — narrow to the parse error
                formatted_date = published_date
        else:
            formatted_date = str(published_date)

        # Escape everything that ends up inside HTML, including the URL.
        title = html.escape(news.get('title', ''))
        content = html.escape(news.get('content', ''))
        url = html.escape(news.get('url', ''), quote=True)
        source = html.escape(news.get('source', ''))
        category_name = {'us_stock': '美股', 'tw_stock': '台股'}.get(news.get('category', ''), '財經')

        if len(content) > 300:
            content = content[:300] + "..."

        parts.append(f"""
        <div class="news-card {sentiment_class}" style="margin-bottom: 20px; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 10px;">
                <h3 style="margin: 0; font-size: 18px; color: #333; flex: 1; margin-right: 10px;">
                    <a href="{url}" target="_blank" style="color: #333; text-decoration: none;">{title}</a>
                </h3>
                {sentiment_badge}
            </div>

            <div style="margin-bottom: 10px; color: #666; font-size: 14px;">
                <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 12px; margin-right: 8px;">{category_name}</span>
                <span>{source}</span>
                <span style="margin-left: 8px;">📅 {formatted_date}</span>
            </div>

            <p style="margin: 10px 0; color: #555; line-height: 1.6;">{content}</p>

            <div style="margin-top: 10px; text-align: right;">
                <a href="{url}" target="_blank" style="color: #007bff; text-decoration: none; font-size: 14px;">閱讀全文 →</a>
            </div>
        </div>
        """)

    parts.append("</div>")
    return "".join(parts)
def clean_text(text: str) -> str:
    """Strip HTML tags, collapse whitespace and drop stray symbols."""
    if not text:
        return ""

    without_tags = re.sub(r'<[^>]+>', '', text)
    collapsed = re.sub(r'\s+', ' ', without_tags)
    # Keep CJK ideographs, word characters, whitespace and common punctuation.
    cleaned = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?]', '', collapsed)
    return cleaned.strip()
def calculate_similarity(text1: str, text2: str) -> float:
    """Jaccard similarity between the lower-cased word sets of two texts.

    Returns 0.0 when either text is empty or yields no tokens.
    """
    if not text1 or not text2:
        return 0.0

    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    if not vocab_a or not vocab_b:
        return 0.0

    shared = vocab_a & vocab_b
    combined = vocab_a | vocab_b
    return len(shared) / len(combined) if combined else 0.0
# Compiled once at import time; the original rebuilt and recompiled the
# pattern on every call.
_URL_PATTERN = re.compile(
    r'^https?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def validate_url(url: str) -> bool:
    """Return True when *url* looks like a valid http(s) URL."""
    return _URL_PATTERN.match(url) is not None
def format_duration(seconds: float) -> str:
    """Format a duration in seconds as a human-readable Chinese string."""
    if seconds >= 3600:
        return f"{seconds / 3600:.1f}小時"
    if seconds >= 60:
        return f"{seconds / 60:.1f}分鐘"
    return f"{seconds:.1f}秒"
def truncate_text(text: str, max_length: int = 100) -> str:
    """Truncate *text* to at most *max_length* characters, cutting back to
    the last space when one exists and appending an ellipsis."""
    if not text:
        return ""

    if len(text) <= max_length:
        return text

    head = text[:max_length]
    # Drop a partially-cut trailing word if there is a space to cut at.
    return head.rsplit(' ', 1)[0] + "..."