Spaces:

khjhs60199
/

pyCrawing

Sleeping

App Files Files Community

khjhs60199 committed on Sep 17, 2025

Commit

0105605

verified ·

1 Parent(s): 7645752

Update app.py

Browse files

Files changed (1) hide show

app.py +278 -27

app.py CHANGED Viewed

@@ -8,6 +8,8 @@ import time
 from datetime import datetime, timedelta
 from typing import List, Dict, Optional
 import os
 from crawler import CnYesNewsCrawler
 from sentiment_analyzer import SentimentAnalyzer
@@ -19,18 +21,40 @@ from utils import setup_logging, format_news_for_display
 setup_logging()
 logger = logging.getLogger(__name__)
 class NewsApp:
     def __init__(self):
         self.db = NewsDatabase()
-        self.crawler = CnYesNewsCrawler()
         self.sentiment_analyzer = SentimentAnalyzer()
         self.scheduler = NewsScheduler(self.db, self.crawler, self.sentiment_analyzer)
         # 啟動背景排程器
         self.scheduler.start()
         logger.info("新聞應用程式初始化完成")
     def get_latest_news(self, category: str = "all", limit: int = 50) -> str:
         """獲取最新新聞並格式化顯示"""
         try:
@@ -46,14 +70,28 @@ class NewsApp:
     def manual_crawl(self) -> str:
         """手動觸發爬蟲"""
         try:
-            logger.info("🚀 手動觸發爬蟲開始")
-            result = self.scheduler.run_crawl_task()
-            logger.info(f"✅ 手動爬蟲完成: {result}")
-            return f"✅ 手動爬蟲完成: {result}"
         except Exception as e:
-            logger.error(f"❌ 手動爬蟲錯誤: {e}")
-            return f"❌ 手動爬蟲失敗: {str(e)}"
     def get_statistics(self) -> str:
         """獲取統計資訊"""
@@ -64,18 +102,113 @@ class NewsApp:
 - 總新聞數量: {stats.get('total_news', 0)}
 - 美股新聞: {stats.get('us_stock_count', 0)}
 - 台股新聞: {stats.get('tw_stock_count', 0)}
-- 正面新聞: {stats.get('positive_count', 0)}
-- 負面新聞: {stats.get('negative_count', 0)}
-- 中性新聞: {stats.get('neutral_count', 0)}
 - 最後更新: {stats.get('last_update', 'N/A')}
             """
         except Exception as e:
             logger.error(f"獲取統計資訊錯誤: {e}")
             return f"❌ 獲取統計資訊失敗: {str(e)}"
 # 初始化應用
 app = NewsApp()
 # 創建 Gradio 介面
 def create_interface():
     with gr.Blocks(
@@ -90,23 +223,24 @@ def create_interface():
         .positive-badge { background: #28a745; color: white; }
         .negative-badge { background: #dc3545; color: white; }
         .neutral-badge { background: #6c757d; color: white; }
         """
     ) as interface:
         gr.Markdown("""
-        # 📈 股市新聞情緒分析器
-        🤖 自動爬取鉅亨網美股和台股新聞，並進行中文情緒分析
-        ⏰ **自動更新**: 每30分鐘自動爬取最新新聞
         🎯 **智能分析**: 使用 RoBERTa 模型進行情緒分析
         🔄 **去重處理**: 自動過濾重複新聞
-        📅 **資料保留**: 保存兩週內的新聞資料
         """)
         with gr.Tab("📰 最新新聞"):
             with gr.Row():
-                with gr.Column(scale=3):
                     category_radio = gr.Radio(
                         choices=["all", "us_stock", "tw_stock"],
                         value="all",
@@ -117,10 +251,21 @@ def create_interface():
                     refresh_btn = gr.Button("🔄 重新整理", variant="primary")
                     manual_crawl_btn = gr.Button("🚀 手動爬取", variant="secondary")
             news_display = gr.HTML(label="新聞內容")
             crawl_result = gr.Textbox(label="爬取結果", visible=False)
-            # 自動重新整理
             def auto_refresh():
                 return app.get_latest_news("all")
@@ -133,6 +278,13 @@ def create_interface():
                 news = app.get_latest_news("all")
                 return result, news
             # 綁定事件
             refresh_btn.click(refresh_news, inputs=[category_radio], outputs=[news_display])
             manual_crawl_btn.click(
@@ -143,9 +295,6 @@ def create_interface():
                 outputs=[crawl_result]
             )
             category_radio.change(refresh_news, inputs=[category_radio], outputs=[news_display])
-            # 初始載入
-            interface.load(auto_refresh, outputs=[news_display])
         with gr.Tab("📊 統計資訊"):
             stats_display = gr.Markdown()
@@ -154,10 +303,97 @@ def create_interface():
             stats_refresh_btn.click(app.get_statistics, outputs=[stats_display])
             interface.load(app.get_statistics, outputs=[stats_display])
         with gr.Tab("ℹ️ 關於"):
             gr.Markdown("""
             ## 🛠️ 技術特色
             ### 📊 情緒分析
             - **模型**: `uer/roberta-base-finetuned-jd-binary-chinese`
             - **分類**: 正面 (綠色) / 負面 (紅色) / 中性 (灰色)
@@ -167,27 +403,42 @@ def create_interface():
             - **來源**: 鉅亨網 (cnyes.com)
             - **分類**: 美股、台股新聞
             - **頻率**: 每30分鐘自動更新
-            - **去重**: 基於標題相似度智能去重
             ### 💾 資料管理
-            - **儲存**: SQLite 本地資料庫
             - **保留期**: 自動清理兩週前的資料
-            - **效能**: 索引優化，快速查詢
-            ### 🔧 系統功能
-            - **反爬蟲**: 隨機延遲、User-Agent 輪換
-            - **錯誤處理**: 完整的異常捕獲和日誌記錄
-            - **監控**: 即時統計和狀態監控
             ---
-            💡 **提示**: 首次啟動可能需要幾分鐘下載模型和初始化資料庫
             """)
     return interface
 # 啟動應用
 if __name__ == "__main__":
     interface = create_interface()
     interface.launch(
         server_name="0.0.0.0",

 from datetime import datetime, timedelta
 from typing import List, Dict, Optional
 import os
+from flask import Flask, jsonify, request
+import json
 from crawler import CnYesNewsCrawler
 from sentiment_analyzer import SentimentAnalyzer
 setup_logging()
 logger = logging.getLogger(__name__)
+# Flask API 應用
+flask_app = Flask(__name__)
 class NewsApp:
     def __init__(self):
+        """Create the database, sentiment analyzer, crawler and scheduler, then start the scheduler.
+
+        Also initialises the progress text and the ``is_crawling`` flag, and
+        registers ``update_progress`` as the crawler's progress callback.
+        """
         self.db = NewsDatabase()
         self.sentiment_analyzer = SentimentAnalyzer()
+        # The crawler is handed the analyzer and the database directly.
+        self.crawler = CnYesNewsCrawler(
+            sentiment_analyzer=self.sentiment_analyzer,
+            database=self.db
+        )
         self.scheduler = NewsScheduler(self.db, self.crawler, self.sentiment_analyzer)
+        # Progress tracking
+        self.current_progress = "系統已就緒"
+        self.is_crawling = False
+        # Register the crawler's progress callback
+        self.crawler.set_progress_callback(self.update_progress)
+        # Start the background scheduler
         self.scheduler.start()
         logger.info("新聞應用程式初始化完成")
+    def update_progress(self, message: str):
+        """Store *message* as the current progress, prefixed with the time of day.
+
+        The stored text has the form ``HH:MM:SS - message``; the bare message
+        is also written to the module logger at INFO level.
+        """
+        timestamp = datetime.now().strftime('%H:%M:%S')
+        self.current_progress = f"{timestamp} - {message}"
+        logger.info(f"進度更新: {message}")
+    def get_progress(self) -> str:
+        """Return the most recently stored progress text."""
+        latest = self.current_progress
+        return latest
     def get_latest_news(self, category: str = "all", limit: int = 50) -> str:
         """獲取最新新聞並格式化顯示"""
         try:
     def manual_crawl(self) -> str:
+        """Manually trigger a crawl and return a status message.
+
+        Returns a warning string without crawling when ``is_crawling`` is
+        already set.  Otherwise returns the completion message, or the failure
+        message if the crawl raises.  The start, completion and failure of a
+        run are also recorded through ``update_progress``.
+        """
+        # NOTE(review): the flag is tested here and only set a few lines below,
+        # so two threads calling at the same moment could both pass this check
+        # - confirm whether callers can overlap.
+        if self.is_crawling:
+            return "⚠️ 爬蟲正在運行中，請稍後再試"
         try:
+            self.is_crawling = True
+            self.update_progress("🚀 手動爬蟲開始")
+            # Use the new real-time crawl method
+            results = self.crawler.crawl_all_categories(max_articles_per_category=5)
+            # Add up the sizes of every value in the returned mapping.
+            total_articles = sum(len(articles) for articles in results.values())
+            result_message = f"✅ 手動爬蟲完成，總共處理 {total_articles} 篇文章"
+            self.update_progress(result_message)
+            return result_message
         except Exception as e:
+            error_message = f"❌ 手動爬蟲失敗: {str(e)}"
+            self.update_progress(error_message)
+            return error_message
+        finally:
+            # Always clear the flag, whether the crawl succeeded or failed.
+            self.is_crawling = False
     def get_statistics(self) -> str:
         """獲取統計資訊"""
 - 總新聞數量: {stats.get('total_news', 0)}
 - 美股新聞: {stats.get('us_stock_count', 0)}
 - 台股新聞: {stats.get('tw_stock_count', 0)}
+- 正面新聞: {stats.get('positive_count', 0)} 😊
+- 負面新聞: {stats.get('negative_count', 0)} 😔
+- 中性新聞: {stats.get('neutral_count', 0)} 😐
 - 最後更新: {stats.get('last_update', 'N/A')}
             """
         except Exception as e:
             logger.error(f"獲取統計資訊錯誤: {e}")
             return f"❌ 獲取統計資訊失敗: {str(e)}"
+    def get_news_api_data(self, category: str = "all", limit: int = 50) -> Dict:
+        """Return recent news as a JSON-friendly payload.
+
+        Args:
+            category: Category filter forwarded to ``self.db.get_recent_news``.
+            limit: Maximum number of rows requested from the database.
+
+        Returns:
+            ``{'success': True, 'count': <n>, 'data': [...]}`` on success, or
+            ``{'success': False, 'error': <message>, 'data': []}`` if anything
+            raises.  This method does not raise; failures are logged and
+            reported in the payload.
+        """
+        def _to_iso(value):
+            # Timestamps may reach this point as datetime objects or as
+            # already-formatted strings, depending on how the database layer
+            # returns them (not visible here).  Only call isoformat() when it
+            # exists, so a string value does not raise AttributeError and
+            # fail the whole response.
+            return value.isoformat() if hasattr(value, 'isoformat') else value
+        try:
+            news_data = self.db.get_recent_news(category=category, limit=limit)
+            # Convert each row to a JSON-friendly dict
+            api_data = []
+            for news in news_data:
+                published = news.get('published_date')
+                api_data.append({
+                    'id': news.get('id'),
+                    'title': news.get('title'),
+                    'content': news.get('content'),
+                    'url': news.get('url'),
+                    'source': news.get('source'),
+                    'category': news.get('category'),
+                    # A missing or empty publication date is reported as None.
+                    'published_date': _to_iso(published) if published else None,
+                    'sentiment': news.get('sentiment'),
+                    'sentiment_score': news.get('sentiment_score'),
+                    # Same ISO treatment as published_date so both timestamp
+                    # fields share one format.
+                    'created_date': _to_iso(news.get('created_date')),
+                })
+            return {
+                'success': True,
+                'count': len(api_data),
+                'data': api_data,
+            }
+        except Exception as e:
+            logger.error(f"獲取API數據錯誤: {e}")
+            return {
+                'success': False,
+                'error': str(e),
+                'data': [],
+            }
 # 初始化應用
 app = NewsApp()
+# API 路由
+@flask_app.route('/api/news', methods=['GET'])
+def api_get_news():
+    """Serve ``GET /api/news``: recent news as JSON.
+
+    Query parameters:
+        category: forwarded to ``app.get_news_api_data`` (default ``'all'``).
+        limit: maximum number of items (default 50).  A value that is not an
+            integer falls back to the default; a negative value is treated
+            as 0.
+    """
+    category = request.args.get('category', 'all')
+    # type=int returns the default when conversion fails (e.g. ?limit=abc).
+    # A bare int(...) would raise ValueError and produce an unhandled,
+    # non-JSON 500 response.
+    limit = request.args.get('limit', default=50, type=int)
+    # NOTE(review): negatives are clamped because SQLite treats a negative
+    # LIMIT as "no limit" - confirm how get_recent_news applies the value.
+    limit = max(limit, 0)
+    return jsonify(app.get_news_api_data(category, limit))
+@flask_app.route('/api/stats', methods=['GET'])
+def api_get_stats():
+    """Serve ``GET /api/stats``: database statistics as JSON.
+
+    Returns ``{'success': True, 'data': <stats>}`` on success.  If reading
+    or serialising the statistics raises, the error is logged and
+    ``{'success': False, 'error': <message>}`` is returned with HTTP 500.
+    """
+    try:
+        stats = app.db.get_statistics()
+        return jsonify({'success': True, 'data': stats})
+    except Exception as e:
+        # Log the failure and use an error status so clients and monitoring
+        # do not have to parse the body to notice it.
+        logger.error(f"獲取統計資訊錯誤: {e}")
+        return jsonify({'success': False, 'error': str(e)}), 500
+@flask_app.route('/api/crawl', methods=['POST'])
+def api_manual_crawl():
+    """Serve ``POST /api/crawl``: start a crawl in a background thread.
+
+    Responses:
+        200 ``{'success': True, 'message': ...}`` - the crawl thread was started.
+        409 ``{'success': False, 'message': ...}`` - a crawl is already running.
+        500 ``{'success': False, 'error': ...}`` - starting the thread failed.
+    """
+    # Imported locally so the route also works when this module is imported
+    # rather than run as a script; the only visible import of threading sits
+    # under the ``if __name__ == "__main__"`` guard.
+    import threading
+    try:
+        if app.is_crawling:
+            return jsonify({'success': False, 'message': '爬蟲正在運行中'}), 409
+        # Run the crawl in the background so the request returns immediately.
+        # NOTE(review): this check-then-start is not atomic; manual_crawl
+        # re-checks the flag itself, so an overlapping request may be reported
+        # as started here and then skipped in the thread - confirm acceptable.
+        threading.Thread(target=app.manual_crawl, daemon=True).start()
+        return jsonify({'success': True, 'message': '爬蟲任務已啟動'})
+    except Exception as e:
+        logger.error(f"❌ 手動爬蟲失敗: {e}")
+        return jsonify({'success': False, 'error': str(e)}), 500
+@flask_app.route('/api/progress', methods=['GET'])
+def api_get_progress():
+    """Serve ``GET /api/progress``: the current progress text and crawl flag."""
+    status = {
+        'progress': app.get_progress(),
+        'is_crawling': app.is_crawling,
+    }
+    return jsonify(status)
 # 創建 Gradio 介面
 def create_interface():
     with gr.Blocks(
         .positive-badge { background: #28a745; color: white; }
         .negative-badge { background: #dc3545; color: white; }
         .neutral-badge { background: #6c757d; color: white; }
+        .progress-box { background: #f8f9fa; border: 1px solid #dee2e6; border-radius: 5px; padding: 10px; font-family: monospace; }
         """
     ) as interface:
         gr.Markdown("""
+        # 📈 股市新聞情緒分析器 - 即時版
+        🤖 自動爬取鉅亨網美股和台股新聞，並進行即時中文情緒分析
+        ⚡ **即時處理**: 每篇文章完成後立即分析並存檔
         🎯 **智能分析**: 使用 RoBERTa 模型進行情緒分析
         🔄 **去重處理**: 自動過濾重複新聞
+        📊 **API接口**: 提供RESTful API獲取分析結果
         """)
         with gr.Tab("📰 最新新聞"):
             with gr.Row():
+                with gr.Column(scale=2):
                     category_radio = gr.Radio(
                         choices=["all", "us_stock", "tw_stock"],
                         value="all",
                     refresh_btn = gr.Button("🔄 重新整理", variant="primary")
                     manual_crawl_btn = gr.Button("🚀 手動爬取", variant="secondary")
+            # 進度顯示
+            progress_display = gr.Textbox(
+                label="📊 即時進度",
+                value=app.get_progress(),
+                interactive=False,
+                elem_classes=["progress-box"]
+            )
             news_display = gr.HTML(label="新聞內容")
             crawl_result = gr.Textbox(label="爬取結果", visible=False)
+            # 自動重新整理進度
+            def update_progress_display():
+                return app.get_progress()
             def auto_refresh():
                 return app.get_latest_news("all")
                 news = app.get_latest_news("all")
                 return result, news
+            # 定期更新進度
+            interface.load(
+                lambda: [app.get_progress(), app.get_latest_news("all")],
+                outputs=[progress_display, news_display],
+                every=5  # 每5秒更新一次
+            )
             # 綁定事件
             refresh_btn.click(refresh_news, inputs=[category_radio], outputs=[news_display])
             manual_crawl_btn.click(
                 outputs=[crawl_result]
             )
             category_radio.change(refresh_news, inputs=[category_radio], outputs=[news_display])
         with gr.Tab("📊 統計資訊"):
             stats_display = gr.Markdown()
             stats_refresh_btn.click(app.get_statistics, outputs=[stats_display])
             interface.load(app.get_statistics, outputs=[stats_display])
+        with gr.Tab("🔌 API接口"):
+            gr.Markdown("""
+            ## 📖 API 使用說明
+            ### 🔗 接口列表
+            #### 1. 獲取新聞列表
+            ```
+            GET /api/news?category={all|us_stock|tw_stock}&limit={數量}
+            ```
+            **參數:**
+            - `category`: 新聞分類 (可選，默認: all)
+            - `limit`: 返回數量 (可選，默認: 50)
+            **響應示例:**
+            ```json
+            {
+                "success": true,
+                "count": 10,
+                "data": [
+                    {
+                        "id": 1,
+                        "title": "美股標題",
+                        "content": "新聞內容...",
+                        "url": "https://...",
+                        "source": "鉅亨網",
+                        "category": "us_stock",
+                        "published_date": "2024-01-01T12:00:00",
+                        "sentiment": "positive",
+                        "sentiment_score": 0.85,
+                        "created_date": "2024-01-01T12:05:00"
+                    }
+                ]
+            }
+            ```
+            #### 2. 獲取統計信息
+            ```
+            GET /api/stats
+            ```
+            #### 3. 手動觸發爬蟲
+            ```
+            POST /api/crawl
+            ```
+            #### 4. 獲取爬蟲進度
+            ```
+            GET /api/progress
+            ```
+            ### 💡 使用示例
+            **Python:**
+            ```python
+            import requests
+            # 獲取所有新聞
+            response = requests.get('http://localhost:7860/api/news')
+            news_data = response.json()
+            # 獲取美股新聞
+            response = requests.get('http://localhost:7860/api/news?category=us_stock&limit=10')
+            us_news = response.json()
+            ```
+            **JavaScript:**
+            ```javascript
+            // 獲取新聞
+            fetch('/api/news?category=tw_stock')
+                .then(response => response.json())
+                .then(data => console.log(data));
+            // 觸發爬蟲
+            fetch('/api/crawl', {method: 'POST'})
+                .then(response => response.json())
+                .then(data => console.log(data));
+            ```
+            """)
         with gr.Tab("ℹ️ 關於"):
             gr.Markdown("""
             ## 🛠️ 技術特色
+            ### ⚡ 即時處理
+            - **實時分析**: 每篇文章爬取完成立即進行情緒分析
+            - **即時存檔**: 分析完成後立即保存到SQLite資料庫
+            - **進度追蹤**: 實時顯示爬蟲和分析進度
+            - **去重檢查**: 存檔前檢查標題相似度避免重複
             ### 📊 情緒分析
             - **模型**: `uer/roberta-base-finetuned-jd-binary-chinese`
             - **分類**: 正面 (綠色) / 負面 (紅色) / 中性 (灰色)
             - **來源**: 鉅亨網 (cnyes.com)
             - **分類**: 美股、台股新聞
             - **頻率**: 每30分鐘自動更新
+            - **反爬蟲**: 隨機延遲、User-Agent輪換
             ### 💾 資料管理
+            - **儲存**: SQLite 本地資料庫 (news.db)
+            - **索引**: 針對URL、分類、時間、情緒建立索引
             - **保留期**: 自動清理兩週前的資料
+            - **效能**: 優化查詢，支持並發存取
+            ### 🔌 API接口
+            - **REST API**: 提供完整的RESTful API
+            - **JSON格式**: 標準JSON響應格式
+            - **錯誤處理**: 完善的錯誤處理和狀態碼
+            - **跨域支持**: 支持CORS跨域請求
             ---
+            💡 **提示**:
+            - 首次啟動可能需要幾分鐘下載模型
+            - 建議使用SSD硬碟提升SQLite性能
+            - API接口可用於整合其他應用系統
             """)
     return interface
 # 啟動應用
 if __name__ == "__main__":
+    import threading
+    # 在背景啟動Flask API
+    def run_flask():
+        flask_app.run(host='0.0.0.0', port=5000, debug=False)
+    flask_thread = threading.Thread(target=run_flask, daemon=True)
+    flask_thread.start()
+    # 啟動Gradio介面
     interface = create_interface()
     interface.launch(
         server_name="0.0.0.0",