Spaces:
Sleeping
Sleeping
kickStart
Browse files- app.py +184 -0
- crawler.py +317 -0
- database.py +296 -0
- requirements.txt +30 -0
- scheduler.py +167 -0
- sentiment_analyzer.py +192 -0
- utils.py +157 -0
app.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import sqlite3
|
| 4 |
+
import logging
|
| 5 |
+
import asyncio
|
| 6 |
+
import threading
|
| 7 |
+
import time
|
| 8 |
+
from datetime import datetime, timedelta
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
from crawler import CnYesNewsCrawler
|
| 13 |
+
from sentiment_analyzer import SentimentAnalyzer
|
| 14 |
+
from database import NewsDatabase
|
| 15 |
+
from scheduler import NewsScheduler
|
| 16 |
+
from utils import setup_logging, format_news_for_display
|
| 17 |
+
|
| 18 |
+
# Configure logging once at module import so all components share the setup.
setup_logging()
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
class NewsApp:
    """Application facade wiring database, crawler, sentiment analyzer and
    background scheduler together for the Gradio UI.

    Every public method returns a user-facing string (data, empty-state or
    error message) so the UI layer never has to handle exceptions itself.
    """

    def __init__(self):
        # Wire up the collaborators; the scheduler owns the periodic crawl loop.
        self.db = NewsDatabase()
        self.crawler = CnYesNewsCrawler()
        self.sentiment_analyzer = SentimentAnalyzer()
        self.scheduler = NewsScheduler(self.db, self.crawler, self.sentiment_analyzer)

        # Start the background scheduler immediately.
        self.scheduler.start()

        logger.info("新聞應用程式初始化完成")

    def get_latest_news(self, category: str = "all", limit: int = 50) -> str:
        """Fetch recent news from the database and render it for display.

        Args:
            category: "all", or a specific category key understood by the DB.
            limit: maximum number of articles to render.
        """
        try:
            records = self.db.get_recent_news(category=category, limit=limit)
            return (
                "📰 暫無新聞資料,請稍後再試"
                if not records
                else format_news_for_display(records)
            )
        except Exception as exc:
            logger.error(f"獲取新聞時發生錯誤: {exc}")
            return f"❌ 獲取新聞時發生錯誤: {str(exc)}"

    def manual_crawl(self) -> str:
        """Run one crawl cycle immediately and return a status message."""
        try:
            logger.info("手動觸發爬蟲開始")
            outcome = self.scheduler.run_crawl_task()
            return f"✅ 手動爬蟲完成: {outcome}"
        except Exception as exc:
            logger.error(f"手動爬蟲錯誤: {exc}")
            return f"❌ 手動爬蟲失敗: {str(exc)}"

    def get_statistics(self) -> str:
        """Render the aggregate news statistics as a Markdown snippet."""
        try:
            summary = self.db.get_statistics()
            return f"""
📊 **新聞統計**
- 總新聞數量: {summary.get('total_news', 0)}
- 美股新聞: {summary.get('us_stock_count', 0)}
- 台股新聞: {summary.get('tw_stock_count', 0)}
- 正面新聞: {summary.get('positive_count', 0)}
- 負面新聞: {summary.get('negative_count', 0)}
- 中性新聞: {summary.get('neutral_count', 0)}
- 最後更新: {summary.get('last_update', 'N/A')}
"""
        except Exception as exc:
            logger.error(f"獲取統計資訊錯誤: {exc}")
            return f"❌ 獲取統計資訊失敗: {str(exc)}"
|
| 74 |
+
|
| 75 |
+
# Module-level application instance; constructing it also starts the
# background scheduler (see NewsApp.__init__).
app = NewsApp()
|
| 77 |
+
|
| 78 |
+
# 創建 Gradio 介面
|
| 79 |
+
def create_interface():
    """Build and return the Gradio Blocks UI.

    Three tabs: latest news (category filter + refresh / manual-crawl
    controls), aggregate statistics, and a static about page.  The caller
    is responsible for launching the returned Blocks object.
    """
    with gr.Blocks(
        title="📈 股市新聞情緒分析器",
        theme=gr.themes.Soft(),
        css="""
        .news-positive { background: linear-gradient(90deg, #d4edda 0%, #c3e6cb 100%); border-left: 4px solid #28a745; }
        .news-negative { background: linear-gradient(90deg, #f8d7da 0%, #f5c6cb 100%); border-left: 4px solid #dc3545; }
        .news-neutral { background: linear-gradient(90deg, #e2e3e5 0%, #d6d8db 100%); border-left: 4px solid #6c757d; }
        .news-card { margin: 10px 0; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        .sentiment-badge { padding: 4px 8px; border-radius: 12px; font-size: 12px; font-weight: bold; }
        .positive-badge { background: #28a745; color: white; }
        .negative-badge { background: #dc3545; color: white; }
        .neutral-badge { background: #6c757d; color: white; }
        """
    ) as interface:

        gr.Markdown("""
# 📈 股市新聞情緒分析器

🤖 自動爬取鉅亨網美股和台股新聞,並進行中文情緒分析

⏰ **自動更新**: 每30分鐘自動爬取最新新聞
🎯 **智能分析**: 使用 RoBERTa 模型進行情緒分析
🔄 **去重處理**: 自動過濾重複新聞
📅 **資料保留**: 保存兩週內的新聞資料
""")

        with gr.Tab("📰 最新新聞"):
            with gr.Row():
                with gr.Column(scale=3):
                    category_radio = gr.Radio(
                        choices=["all", "us_stock", "tw_stock"],
                        value="all",
                        label="新聞分類",
                        info="選擇要顯示的新聞類型"
                    )
                with gr.Column(scale=1):
                    refresh_btn = gr.Button("🔄 重新整理", variant="primary")
                    manual_crawl_btn = gr.Button("🚀 手動爬取", variant="secondary")

            news_display = gr.HTML(label="新聞內容")
            # BUG FIX: the original passed gr.Textbox(label="爬取結果") directly
            # inside `outputs=` of the click binding.  A component constructed
            # there is never placed in the layout, so the manual-crawl result
            # was invisible.  Create it in the layout and reference it instead.
            crawl_result = gr.Textbox(label="爬取結果")

            def auto_refresh():
                # Initial page load shows every category.
                return app.get_latest_news("all")

            def refresh_news(category):
                # Re-render the list whenever the filter changes.
                return app.get_latest_news(category)

            # Bind UI events.
            refresh_btn.click(refresh_news, inputs=[category_radio], outputs=[news_display])
            manual_crawl_btn.click(app.manual_crawl, outputs=[crawl_result])
            category_radio.change(refresh_news, inputs=[category_radio], outputs=[news_display])

            # Populate the list on initial page load.
            interface.load(auto_refresh, outputs=[news_display])

        with gr.Tab("📊 統計資訊"):
            stats_display = gr.Markdown()
            stats_refresh_btn = gr.Button("🔄 更新統計")

            stats_refresh_btn.click(app.get_statistics, outputs=[stats_display])
            interface.load(app.get_statistics, outputs=[stats_display])

        with gr.Tab("ℹ️ 關於"):
            gr.Markdown("""
## 🛠️ 技術特色

### 📊 情緒分析
- **模型**: `uer/roberta-base-finetuned-jd-binary-chinese`
- **分類**: 正面 (綠色) / 負面 (紅色) / 中性 (灰色)
- **準確性**: 針對中文金融新聞優化

### 🕷️ 新聞爬蟲
- **來源**: 鉅亨網 (cnyes.com)
- **分類**: 美股、台股新聞
- **頻率**: 每30分鐘自動更新
- **去重**: 基於標題相似度智能去重

### 💾 資料管理
- **儲存**: SQLite 本地資料庫
- **保留期**: 自動清理兩週前的資料
- **效能**: 索引優化,快速查詢

### 🔧 系統功能
- **反爬蟲**: 隨機延遲、User-Agent 輪換
- **錯誤處理**: 完整的異常捕獲和日誌記錄
- **監控**: 即時統計和狀態監控

---

💡 **提示**: 首次啟動可能需要幾分鐘下載模型和初始化資料庫
""")

    return interface
|
| 174 |
+
|
| 175 |
+
# 啟動應用
|
| 176 |
+
# Script entry point: build the UI and serve it on all interfaces.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        quiet=False,
    )
|
crawler.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import cloudscraper
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
import time
|
| 5 |
+
import random
|
| 6 |
+
import logging
|
| 7 |
+
import re
|
| 8 |
+
from datetime import datetime, timedelta
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
from urllib.parse import urljoin, urlparse
|
| 11 |
+
from fake_useragent import UserAgent
|
| 12 |
+
import json
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
@dataclass
class NewsItem:
    """A single crawled news article, optionally enriched with sentiment."""
    title: str                                # headline text
    content: str                              # body text (crawler truncates to 2000 chars)
    url: str                                  # canonical article URL
    source: str                               # publisher name, e.g. '鉅亨網'
    category: str                             # crawl category key: 'us_stock' or 'tw_stock'
    published_date: datetime                  # publication time, naive datetime (tzinfo stripped by crawler)
    sentiment: Optional[str] = None           # sentiment label once analyzed — presumably 'positive'/'negative'/'neutral' (matches DB stats); confirm with analyzer
    sentiment_score: Optional[float] = None   # analyzer score — assumed confidence in [0, 1]; TODO confirm
|
| 28 |
+
|
| 29 |
+
class CnYesNewsCrawler:
    """Scraper for Anue/cnyes (鉅亨網) US-stock and Taiwan-stock news.

    Uses a cloudscraper session (handles Cloudflare JS challenges), rotates
    User-Agent strings per request, and sleeps random intervals between
    requests to stay polite.

    Note: annotations that reference third-party or sibling types are written
    as strings (PEP 563 style) so the class definition itself never depends
    on those names being importable first.
    """

    def __init__(self):
        self.base_url = "https://news.cnyes.com"
        # cloudscraper behaves like a requests.Session but solves CF challenges.
        self.session = cloudscraper.create_scraper()
        self.ua = UserAgent()

        # Category key -> listing-page URL.
        self.categories = {
            'us_stock': 'https://news.cnyes.com/news/cat/us_stock',
            'tw_stock': 'https://news.cnyes.com/news/cat/tw_stock_news'
        }

        self._setup_headers()

    def _setup_headers(self):
        """Install browser-like default headers with a random User-Agent."""
        self.session.headers.update({
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        })

    def _get_page(self, url: str, retries: int = 3) -> "Optional[BeautifulSoup]":
        """GET *url* and return parsed HTML, or None after *retries* failures.

        Sleeps 2-5 s before every attempt and rotates the User-Agent.
        """
        for attempt in range(retries):
            try:
                # Random delay (anti-bot politeness).
                time.sleep(random.uniform(2, 5))

                # Rotate the User-Agent for this request.
                self.session.headers['User-Agent'] = self.ua.random

                response = self.session.get(url, timeout=30)

                if response.status_code == 200:
                    # NOTE(review): parsing uses response.content (raw bytes),
                    # so BeautifulSoup sniffs the charset itself; this
                    # assignment only affects response.text readers.
                    response.encoding = 'utf-8'
                    return BeautifulSoup(response.content, 'html.parser')
                else:
                    logger.warning(f"HTTP {response.status_code} for {url}")

            except Exception as e:
                logger.error(f"請求失敗 (嘗試 {attempt + 1}/{retries}): {e}")
                if attempt < retries - 1:
                    # Longer back-off before retrying.
                    time.sleep(random.uniform(5, 10))

        return None

    def _extract_article_urls(self, category_url: str, max_pages: int = 3) -> "List[str]":
        """Collect article URLs from up to *max_pages* listing pages.

        Returns de-duplicated URLs in discovery order (site listing order).
        """
        article_urls = []

        for page in range(1, max_pages + 1):
            try:
                url = category_url if page == 1 else f"{category_url}?page={page}"

                logger.info(f"爬取分類頁面: {url}")
                soup = self._get_page(url)

                if not soup:
                    continue

                # Article links look like /news/id/<digits>.
                links = soup.find_all('a', href=re.compile(r'/news/id/\d+'))
                page_urls = []

                for link in links:
                    href = link.get('href')
                    if href:
                        full_url = urljoin(self.base_url, href)
                        if full_url not in page_urls:
                            page_urls.append(full_url)

                article_urls.extend(page_urls)
                logger.info(f"第 {page} 頁找到 {len(page_urls)} 篇文章")

                # An empty page means we ran past the last listing page.
                if not page_urls:
                    break

            except Exception as e:
                logger.error(f"爬取第 {page} 頁時發生錯誤: {e}")
                continue

        # FIX: the original returned list(set(article_urls)), which scrambles
        # ordering, so the max_articles cut in crawl_category kept an
        # arbitrary subset.  dict.fromkeys de-dupes while preserving order.
        return list(dict.fromkeys(article_urls))

    def _extract_article_content(self, url: str, category: str) -> "Optional[NewsItem]":
        """Fetch one article page and extract title/body into a NewsItem.

        Returns None when the page cannot be fetched, has no usable title, or
        the extracted body is shorter than 50 characters.
        """
        try:
            soup = self._get_page(url)
            if not soup:
                return None

            # Try increasingly generic selectors for the headline.
            title_selectors = [
                'h1.news-title',
                'h1[class*="title"]',
                '.article-header h1',
                'h1'
            ]

            title = ""
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem:
                    title = title_elem.get_text(strip=True)
                    # Reject degenerate matches (e.g. site chrome).
                    if title and len(title) > 5:
                        break

            if not title:
                logger.warning(f"無法提取標題: {url}")
                return None

            # Try increasingly generic selectors for the article body.
            content_selectors = [
                '.news-content',
                '.article-content',
                '.content-body',
                '[class*="article-text"]'
            ]

            content = ""
            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    # Strip scripts, styles and ad containers before reading text.
                    for unwanted in content_elem.select('script, style, .ad, .advertisement'):
                        unwanted.decompose()

                    paragraphs = content_elem.find_all(['p', 'div'])
                    content_parts = []
                    for p in paragraphs:
                        text = p.get_text(strip=True)
                        # Skip trivially short fragments (captions, labels).
                        if text and len(text) > 10:
                            content_parts.append(text)

                    content = '\n'.join(content_parts)
                    if content:
                        break

            if not content or len(content) < 50:
                logger.warning(f"內容太短或無法提取: {url}")
                return None

            published_date = self._extract_publish_date(soup)

            news_item = NewsItem(
                title=title,
                content=content[:2000],  # cap stored body length
                url=url,
                source='鉅亨網',
                category=category,
                published_date=published_date
            )

            logger.info(f"成功提取文章: {title[:50]}...")
            return news_item

        except Exception as e:
            logger.error(f"提取文章內容時發生錯誤 {url}: {e}")
            return None

    def _extract_publish_date(self, soup: "BeautifulSoup") -> datetime:
        """Best-effort extraction of the publish time; falls back to now()."""
        time_selectors = [
            'time[datetime]',
            '.publish-time',
            '.news-time',
            '[class*="time"]'
        ]

        for selector in time_selectors:
            time_elem = soup.select_one(selector)
            if time_elem:
                datetime_attr = time_elem.get('datetime')
                if datetime_attr:
                    try:
                        # Normalise a trailing 'Z', then drop tzinfo so the
                        # rest of the pipeline stays naive-datetime only.
                        return datetime.fromisoformat(datetime_attr.replace('Z', '+00:00')).replace(tzinfo=None)
                    except ValueError:
                        # FIX: was a bare `except:` (also swallowed
                        # KeyboardInterrupt/SystemExit); only a malformed
                        # timestamp is expected here.
                        pass

                time_text = time_elem.get_text(strip=True)
                parsed_time = self._parse_time_text(time_text)
                if parsed_time:
                    return parsed_time

        # No usable timestamp found anywhere on the page.
        return datetime.now()

    def _parse_time_text(self, time_text: str) -> "Optional[datetime]":
        """Parse loose date/time strings like '2024-01-02 13:45' to datetime.

        Patterns are tried most-specific first; returns None on no match.
        """
        patterns = [
            r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
            r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2})',
            r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
            r'(\d{4})-(\d{2})-(\d{2})'
        ]

        for pattern in patterns:
            match = re.search(pattern, time_text)
            if match:
                try:
                    groups = match.groups()
                    if len(groups) >= 6:
                        return datetime(int(groups[0]), int(groups[1]), int(groups[2]),
                                        int(groups[3]), int(groups[4]), int(groups[5]))
                    elif len(groups) >= 5:
                        return datetime(int(groups[0]), int(groups[1]), int(groups[2]),
                                        int(groups[3]), int(groups[4]))
                    else:
                        return datetime(int(groups[0]), int(groups[1]), int(groups[2]))
                except ValueError:
                    # FIX: was a bare `except:`; datetime() raises ValueError
                    # for out-of-range fields (e.g. month 13) — try next pattern.
                    continue

        return None

    def crawl_category(self, category: str, max_articles: int = 20) -> "List[NewsItem]":
        """Crawl one category: collect listing URLs, then fetch each article.

        Returns up to *max_articles* NewsItem objects (newest first, since
        URL order is preserved).
        """
        if category not in self.categories:
            logger.error(f"無效的分類: {category}")
            return []

        logger.info(f"開始爬取 {category} 分類新聞")

        category_url = self.categories[category]
        article_urls = self._extract_article_urls(category_url)

        if not article_urls:
            logger.warning(f"未找到 {category} 分類的文章URL")
            return []

        # Respect the per-category article budget.
        if len(article_urls) > max_articles:
            article_urls = article_urls[:max_articles]

        articles = []
        for i, url in enumerate(article_urls, 1):
            try:
                logger.info(f"處理文章 {i}/{len(article_urls)}: {url}")
                article = self._extract_article_content(url, category)
                if article:
                    articles.append(article)

                # Random delay between article fetches.
                time.sleep(random.uniform(3, 8))

            except Exception as e:
                logger.error(f"處理文章時發生錯誤 {url}: {e}")
                continue

        logger.info(f"{category} 分類爬取完成,共 {len(articles)} 篇文章")
        return articles

    def crawl_all_categories(self, max_articles_per_category: int = 15) -> "Dict[str, List[NewsItem]]":
        """Crawl every configured category; a failing category yields []."""
        results = {}

        for category in self.categories.keys():
            try:
                logger.info(f"開始爬取 {category} 分類")
                articles = self.crawl_category(category, max_articles_per_category)
                results[category] = articles

                # Longer pause between categories.
                time.sleep(random.uniform(10, 20))

            except Exception as e:
                logger.error(f"爬取 {category} 分類時發生錯誤: {e}")
                results[category] = []

        total_articles = sum(len(articles) for articles in results.values())
        logger.info(f"所有分類爬取完成,總共 {total_articles} 篇文章")

        return results
|
database.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sqlite3
|
| 2 |
+
import logging
|
| 3 |
+
import json
|
| 4 |
+
from datetime import datetime, timedelta
|
| 5 |
+
from typing import List, Dict, Optional, Tuple
|
| 6 |
+
import threading
|
| 7 |
+
from contextlib import contextmanager
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
class NewsDatabase:
    """SQLite-backed store for crawled news articles and crawl statistics.

    A process-wide ``threading.Lock`` serialises all writers so the
    scheduler thread and manual-crawl requests do not collide.  Every
    operation opens a short-lived connection via ``_get_connection``.
    """

    def __init__(self, db_path: str = "news.db"):
        """Create the store and ensure the schema exists.

        Args:
            db_path: path of the SQLite database file.
        """
        self.db_path = db_path
        # Serialises write transactions issued from multiple threads.
        self.lock = threading.Lock()

        self._init_database()

    def _init_database(self):
        """Create tables and indexes if they do not already exist."""
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                # Main article table; url is UNIQUE so INSERT OR IGNORE
                # silently drops re-crawled duplicates.
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS news (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        title TEXT NOT NULL,
                        content TEXT NOT NULL,
                        url TEXT UNIQUE NOT NULL,
                        source TEXT NOT NULL,
                        category TEXT NOT NULL,
                        published_date DATETIME NOT NULL,
                        created_date DATETIME DEFAULT CURRENT_TIMESTAMP,
                        sentiment TEXT,
                        sentiment_score REAL,
                        sentiment_method TEXT
                    )
                """)

                # Indexes backing the filters/sorts used below.
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_url ON news(url)")
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_category ON news(category)")
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_published_date ON news(published_date)")
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_sentiment ON news(sentiment)")

                # Per-run crawl bookkeeping (counts + timing).
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS crawl_stats (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        crawl_date DATETIME DEFAULT CURRENT_TIMESTAMP,
                        category TEXT NOT NULL,
                        articles_count INTEGER NOT NULL,
                        success_count INTEGER NOT NULL,
                        error_count INTEGER NOT NULL,
                        execution_time REAL
                    )
                """)

                conn.commit()
                logger.info("資料庫初始化完成")

        except Exception as e:
            logger.error(f"資料庫初始化錯誤: {e}")
            raise

    @contextmanager
    def _get_connection(self):
        """Yield a fresh connection; rollback on error, always close.

        Rows are returned as sqlite3.Row so callers can use column names.
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path, timeout=30.0)
            conn.row_factory = sqlite3.Row  # name-addressable rows
            yield conn
        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"資料庫連接錯誤: {e}")
            raise
        finally:
            if conn:
                conn.close()

    def insert_news(self, news_items: List[Dict]) -> Tuple[int, int]:
        """Insert articles, skipping URL duplicates.

        Args:
            news_items: dicts with the news-table column names as keys.
        Returns:
            (inserted_count, duplicate_count).
        """
        inserted_count = 0
        duplicate_count = 0

        try:
            with self.lock:
                with self._get_connection() as conn:
                    cursor = conn.cursor()

                    for item in news_items:
                        try:
                            # INSERT OR IGNORE + UNIQUE(url): duplicates leave
                            # rowcount at 0 instead of raising.
                            cursor.execute("""
                                INSERT OR IGNORE INTO news
                                (title, content, url, source, category, published_date,
                                 sentiment, sentiment_score, sentiment_method)
                                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                            """, (
                                item.get('title'),
                                item.get('content'),
                                item.get('url'),
                                item.get('source'),
                                item.get('category'),
                                item.get('published_date'),
                                item.get('sentiment'),
                                item.get('sentiment_score'),
                                item.get('sentiment_method')
                            ))

                            if cursor.rowcount > 0:
                                inserted_count += 1
                            else:
                                duplicate_count += 1

                        except Exception as e:
                            # One bad item must not abort the whole batch.
                            logger.error(f"插入新聞時發生錯誤: {e}")
                            continue

                    conn.commit()

        except Exception as e:
            logger.error(f"批量插入新聞錯誤: {e}")
            raise

        logger.info(f"插入新聞完成 - 新增: {inserted_count}, 重複: {duplicate_count}")
        return inserted_count, duplicate_count

    def get_recent_news(self, category: str = "all", limit: int = 50, days: int = 7) -> List[Dict]:
        """Return up to *limit* articles published within the last *days* days.

        Results are newest-first dicts; 'published_date' is parsed back into
        a datetime.  Returns [] on any error.
        """
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                # WHERE clause is assembled only from literal fragments here;
                # all user-influenced values go through `?` placeholders.
                where_clause = "WHERE published_date >= ?"
                params = [datetime.now() - timedelta(days=days)]

                if category != "all":
                    where_clause += " AND category = ?"
                    params.append(category)

                query = f"""
                    SELECT * FROM news
                    {where_clause}
                    ORDER BY published_date DESC
                    LIMIT ?
                """
                params.append(limit)

                cursor.execute(query, params)
                rows = cursor.fetchall()

                news_list = []
                for row in rows:
                    news_dict = dict(row)
                    # SQLite stored the datetime as its ISO string; parse back.
                    if news_dict['published_date']:
                        news_dict['published_date'] = datetime.fromisoformat(news_dict['published_date'])
                    news_list.append(news_dict)

                return news_list

        except Exception as e:
            logger.error(f"獲取新聞錯誤: {e}")
            return []

    def get_statistics(self) -> Dict:
        """Aggregate counts by category and sentiment, plus last update time.

        Returns {} on any error.
        """
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                cursor.execute("SELECT COUNT(*) as total FROM news")
                total_news = cursor.fetchone()['total']

                cursor.execute("""
                    SELECT category, COUNT(*) as count
                    FROM news
                    GROUP BY category
                """)
                category_stats = {row['category']: row['count'] for row in cursor.fetchall()}

                # Rows without a sentiment label are excluded on purpose.
                cursor.execute("""
                    SELECT sentiment, COUNT(*) as count
                    FROM news
                    WHERE sentiment IS NOT NULL
                    GROUP BY sentiment
                """)
                sentiment_stats = {row['sentiment']: row['count'] for row in cursor.fetchall()}

                cursor.execute("SELECT MAX(created_date) as last_update FROM news")
                last_update = cursor.fetchone()['last_update']

                return {
                    'total_news': total_news,
                    'us_stock_count': category_stats.get('us_stock', 0),
                    'tw_stock_count': category_stats.get('tw_stock', 0),
                    'positive_count': sentiment_stats.get('positive', 0),
                    'negative_count': sentiment_stats.get('negative', 0),
                    'neutral_count': sentiment_stats.get('neutral', 0),
                    'last_update': last_update
                }

        except Exception as e:
            logger.error(f"獲取統計資訊錯誤: {e}")
            return {}

    def cleanup_old_news(self, days: int = 14) -> int:
        """Delete articles published more than *days* days ago.

        Returns the number of rows deleted (0 on error).
        """
        try:
            cutoff_date = datetime.now() - timedelta(days=days)

            with self.lock:
                with self._get_connection() as conn:
                    cursor = conn.cursor()

                    cursor.execute("""
                        DELETE FROM news
                        WHERE published_date < ?
                    """, (cutoff_date,))

                    deleted_count = cursor.rowcount
                    conn.commit()

                    logger.info(f"清理了 {deleted_count} 條超過 {days} 天的新聞")
                    return deleted_count

        except Exception as e:
            logger.error(f"清理舊新聞錯誤: {e}")
            return 0

    def record_crawl_stats(self, category: str, articles_count: int,
                           success_count: int, error_count: int, execution_time: float):
        """Append one crawl-run record to crawl_stats.

        FIX: now takes ``self.lock`` like every other writer; the original
        wrote without it and could interleave with scheduler-thread inserts.
        """
        try:
            with self.lock:
                with self._get_connection() as conn:
                    cursor = conn.cursor()

                    cursor.execute("""
                        INSERT INTO crawl_stats
                        (category, articles_count, success_count, error_count, execution_time)
                        VALUES (?, ?, ?, ?, ?)
                    """, (category, articles_count, success_count, error_count, execution_time))

                    conn.commit()

        except Exception as e:
            logger.error(f"記錄爬蟲統計錯誤: {e}")

    def check_duplicate_by_title(self, title: str, similarity_threshold: float = 0.8) -> bool:
        """Jaccard-similarity check of *title* against the last 24h of titles.

        Returns True when any stored title exceeds the threshold; False on
        error (fail-open so a check failure never blocks an insert).

        NOTE(review): tokens are whitespace-split, so Chinese titles (which
        rarely contain spaces) collapse to a single token and similarity is
        effectively all-or-nothing — consider jieba segmentation here.
        """
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                cursor.execute("""
                    SELECT title FROM news
                    WHERE created_date >= ?
                """, (datetime.now() - timedelta(days=1),))

                existing_titles = [row['title'] for row in cursor.fetchall()]

                title_words = set(title.lower().split())

                for existing_title in existing_titles:
                    existing_words = set(existing_title.lower().split())

                    if len(title_words) == 0 or len(existing_words) == 0:
                        continue

                    intersection = title_words.intersection(existing_words)
                    union = title_words.union(existing_words)

                    similarity = len(intersection) / len(union) if union else 0

                    if similarity > similarity_threshold:
                        return True

                return False

        except Exception as e:
            logger.error(f"檢查標題重複性錯誤: {e}")
            return False
|
requirements.txt
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gradio==4.44.0
torch>=2.0.0
transformers>=4.30.0
datasets>=2.14.0
accelerate>=0.20.0
requests>=2.31.0
beautifulsoup4>=4.12.0
cloudscraper>=1.2.71
pandas>=2.0.0
numpy>=1.24.0
scikit-learn>=1.3.0
# sqlite3 ships with the Python standard library and is not a pip package;
# listing it breaks `pip install -r requirements.txt`.
python-dateutil>=2.8.2
pytz>=2023.3
schedule>=1.2.0
fake-useragent>=1.4.0
selenium>=4.15.0
webdriver-manager>=4.0.0
lxml>=4.9.0
cssselect>=1.2.0
readability-lxml>=0.8.1
feedparser>=6.0.10
nltk>=3.8.1
jieba>=0.42.1
emoji>=2.8.0
python-dotenv>=1.0.0
aiohttp>=3.8.0
# asyncio, threading and logging are standard-library modules — installing
# them from PyPI either fails or pulls in stale/unrelated backport shims.
scheduler.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import schedule
|
| 2 |
+
import threading
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from typing import Dict, List
|
| 7 |
+
from crawler import CnYesNewsCrawler, NewsItem
|
| 8 |
+
from sentiment_analyzer import SentimentAnalyzer
|
| 9 |
+
from database import NewsDatabase
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
class NewsScheduler:
    """News crawling scheduler.

    Runs the crawl → sentiment-analyze → store pipeline every 30 minutes in
    a daemon thread, plus a nightly cleanup of stale rows.  All task errors
    are logged and swallowed so the loop keeps running.
    """

    def __init__(self, database: NewsDatabase, crawler: CnYesNewsCrawler, sentiment_analyzer: SentimentAnalyzer):
        self.db = database
        self.crawler = crawler
        self.sentiment_analyzer = sentiment_analyzer
        self.is_running = False        # guards double-start; signals the loop to exit
        self.scheduler_thread = None   # daemon thread running _run_scheduler

    def start(self):
        """Start the scheduler (idempotent: a second call while running is a no-op)."""
        if self.is_running:
            logger.warning("排程器已經在運行中")
            return

        self.is_running = True

        # BUGFIX: the `schedule` registry is module-global and jobs were
        # appended on every start(), so a stop()/start() cycle ran each task
        # twice.  Clear stale jobs before registering fresh ones.
        schedule.clear()
        schedule.every(30).minutes.do(self._run_crawl_task)          # crawl every 30 minutes
        schedule.every().day.at("02:00").do(self._cleanup_old_news)  # nightly cleanup at 02:00

        # Daemon thread so the scheduler never blocks interpreter shutdown.
        self.scheduler_thread = threading.Thread(target=self._run_scheduler, daemon=True)
        self.scheduler_thread.start()

        logger.info("新聞排程器已啟動 - 每30分鐘自動爬取")

        # Kick off an immediate first crawl in the background.
        threading.Thread(target=self._run_crawl_task, daemon=True).start()

    def stop(self):
        """Stop the scheduler loop and wait briefly for the thread to exit."""
        self.is_running = False
        if self.scheduler_thread:
            self.scheduler_thread.join(timeout=5)
        logger.info("新聞排程器已停止")

    def _run_scheduler(self):
        """Main loop: poll the `schedule` registry once per minute."""
        while self.is_running:
            try:
                schedule.run_pending()
                time.sleep(60)
            except Exception as e:
                logger.error(f"排程器運行錯誤: {e}")
                time.sleep(60)

    def _run_crawl_task(self):
        """Crawl every category, sentiment-score and store the articles.

        Returns:
            A human-readable summary string (also consumed by the UI).
        """
        try:
            start_time = time.time()
            logger.info("開始執行定時爬蟲任務")

            all_news = self.crawler.crawl_all_categories(max_articles_per_category=15)

            total_articles = 0
            total_inserted = 0

            for category, articles in all_news.items():
                if not articles:
                    continue

                analyzed_articles = self._analyze_articles_sentiment(articles)
                db_articles = self._convert_to_db_format(analyzed_articles)
                inserted, duplicates = self.db.insert_news(db_articles)

                total_articles += len(articles)
                total_inserted += inserted

                # NOTE(review): error_count also counts de-duplicated rows,
                # not only real failures — confirm that is intended.
                execution_time = time.time() - start_time
                self.db.record_crawl_stats(
                    category=category,
                    articles_count=len(articles),
                    success_count=inserted,
                    error_count=len(articles) - inserted,
                    execution_time=execution_time
                )

                logger.info(f"{category} 分類: {len(articles)} 篇文章, {inserted} 篇新增")

            execution_time = time.time() - start_time
            logger.info(f"爬蟲任務完成 - 總計: {total_articles} 篇, 新增: {total_inserted} 篇, 耗時: {execution_time:.2f}秒")

            return f"成功爬取 {total_articles} 篇文章,新增 {total_inserted} 篇"

        except Exception as e:
            logger.error(f"爬蟲任務執行錯誤: {e}")
            return f"爬蟲任務失敗: {str(e)}"

    def _analyze_articles_sentiment(self, articles: List[NewsItem]) -> List[NewsItem]:
        """Annotate each NewsItem in place with a sentiment label and score."""
        try:
            logger.info(f"開始分析 {len(articles)} 篇文章的情緒")

            for article in articles:
                sentiment_result = self.sentiment_analyzer.analyze_sentiment(
                    article.content,
                    article.title
                )
                article.sentiment = sentiment_result['sentiment']
                article.sentiment_score = sentiment_result['confidence']

            logger.info("情緒分析完成")
            return articles

        except Exception as e:
            # Best-effort: return articles even if some remain unscored.
            logger.error(f"情緒分析錯誤: {e}")
            return articles

    def _convert_to_db_format(self, articles: List[NewsItem]) -> List[Dict]:
        """Convert NewsItem objects to dicts for NewsDatabase.insert_news,
        skipping titles that look like recent duplicates."""
        db_articles = []

        for article in articles:
            if self.db.check_duplicate_by_title(article.title):
                logger.info(f"跳過重複文章: {article.title[:50]}...")
                continue

            db_articles.append({
                'title': article.title,
                'content': article.content,
                'url': article.url,
                'source': article.source,
                'category': article.category,
                'published_date': article.published_date.isoformat(),
                'sentiment': article.sentiment,
                'sentiment_score': article.sentiment_score,
                'sentiment_method': 'auto'
            })

        return db_articles

    def _cleanup_old_news(self):
        """Delete news older than 14 days."""
        try:
            deleted_count = self.db.cleanup_old_news(days=14)
            logger.info(f"清理任務完成,刪除了 {deleted_count} 條舊新聞")
        except Exception as e:
            logger.error(f"清理舊新聞錯誤: {e}")

    def run_crawl_task(self):
        """Manually trigger one crawl run (used by the UI button)."""
        return self._run_crawl_task()
sentiment_analyzer.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
| 3 |
+
import logging
|
| 4 |
+
import re
|
| 5 |
+
from typing import Dict, Tuple, Optional
|
| 6 |
+
import jieba
|
| 7 |
+
import emoji
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
class SentimentAnalyzer:
    """Chinese financial-news sentiment analyzer.

    Primary path is a fine-tuned transformer classifier; a keyword lexicon
    heuristic serves both as a fallback (model unavailable or erroring) and
    as a tiebreaker when the model's confidence is low.
    """

    def __init__(self, model_name: str = "uer/roberta-base-finetuned-jd-binary-chinese"):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.classifier = None  # stays None when loading fails → keyword fallback
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self._load_model()

        # Keyword lexicons for the heuristic scorer.
        self.positive_keywords = {
            '上漲', '漲', '漲幅', '上升', '增長', '成長', '利好', '利多', '買進', '看好',
            '樂觀', '獲利', '盈利', '突破', '新高', '強勢', '回升', '反彈', '看漲',
            '推薦', '買入', '增持', '超買', '牛市', '多頭', '正面', '積極'
        }

        # FIX: the original set literal listed '賣出' twice (harmless in a
        # set, but misleading) — deduplicated.
        self.negative_keywords = {
            '下跌', '跌', '跌幅', '下滑', '下降', '減少', '衰退', '利空', '賣出', '看壞',
            '悲觀', '虧損', '損失', '破底', '新低', '弱勢', '下探', '重挫', '看跌',
            '減持', '超賣', '熊市', '空頭', '負面', '消極', '警告'
        }

    def _load_model(self):
        """Load the pretrained classifier; on failure leave classifier=None."""
        try:
            logger.info(f"載入情緒分析模型: {self.model_name}")

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)

            # NOTE(review): return_all_scores is deprecated in newer
            # transformers releases (top_k=None is the replacement) — kept
            # because the code below expects the nested list-of-scores shape.
            self.classifier = pipeline(
                "text-classification",
                model=self.model,
                tokenizer=self.tokenizer,
                device=0 if self.device == "cuda" else -1,
                return_all_scores=True
            )

            logger.info("情緒分析模型載入成功")

        except Exception as e:
            logger.error(f"載入模型時發生錯誤: {e}")
            self.classifier = None

    def _preprocess_text(self, text: str) -> str:
        """Normalise text: strip emoji and stray symbols, collapse spaces, cap length."""
        try:
            # Turn emoji into :name: placeholders, then remove them.
            text = emoji.demojize(text, language='zh')
            text = re.sub(r':[a-zA-Z_]+:', '', text)

            # Keep CJK ideographs, word characters and common punctuation.
            text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?]', '', text)

            text = re.sub(r'\s+', ' ', text).strip()

            # BERT-style models cap the sequence length; truncate defensively.
            if len(text) > 500:
                text = text[:500]

            return text

        except Exception as e:
            logger.error(f"文本預處理錯誤: {e}")
            return text

    def _keyword_sentiment(self, text: str) -> Tuple[str, float]:
        """Heuristic polarity from lexicon hit counts.

        Returns (label, score): score above 0.7 for positive, below 0.3 for
        negative, exactly 0.5 for neutral or no hits.
        """
        positive_count = sum(1 for keyword in self.positive_keywords if keyword in text)
        negative_count = sum(1 for keyword in self.negative_keywords if keyword in text)

        total_keywords = positive_count + negative_count

        if total_keywords == 0:
            return "neutral", 0.5

        positive_ratio = positive_count / total_keywords

        if positive_ratio > 0.6:
            return "positive", 0.7 + (positive_ratio - 0.6) * 0.75
        elif positive_ratio < 0.4:
            # NOTE(review): this maps strong negatives toward 0.0, i.e. the
            # value behaves like a polarity score rather than a confidence —
            # confirm downstream consumers expect that.
            return "negative", 0.3 - (0.4 - positive_ratio) * 0.75
        else:
            return "neutral", 0.5

    def analyze_sentiment(self, text: str, title: str = "") -> Dict[str, object]:
        """Classify *text* (optionally prefixed by *title*).

        Returns:
            Dict with 'sentiment' ('positive'/'negative'/'neutral'),
            'confidence' (float) and 'method' ('model'/'hybrid'/'keyword'/
            'default'/'error').  Never raises: failures degrade to the
            keyword heuristic or a neutral default.
        """
        # FIX: annotation was Dict[str, any] — `any` is the builtin, not a type.
        try:
            full_text = f"{title} {text}" if title else text
            processed_text = self._preprocess_text(full_text)

            if not processed_text:
                return {
                    "sentiment": "neutral",
                    "confidence": 0.5,
                    "method": "default"
                }

            if self.classifier:
                try:
                    results = self.classifier(processed_text)

                    if results and len(results) > 0:
                        scores = results[0]
                        best_result = max(scores, key=lambda x: x['score'])

                        # Map model labels onto our canonical labels.
                        label_mapping = {
                            'LABEL_0': 'negative',
                            'LABEL_1': 'positive',
                            'negative': 'negative',
                            'positive': 'positive'
                        }

                        sentiment = label_mapping.get(best_result['label'], 'neutral')
                        confidence = best_result['score']

                        # Low model confidence: let the keyword heuristic
                        # override when it is more decisive.
                        if confidence < 0.7:
                            keyword_sentiment, keyword_confidence = self._keyword_sentiment(processed_text)

                            if abs(confidence - 0.5) < abs(keyword_confidence - 0.5):
                                sentiment = keyword_sentiment
                                confidence = (confidence + keyword_confidence) / 2
                                method = "hybrid"
                            else:
                                method = "model"
                        else:
                            method = "model"

                        return {
                            "sentiment": sentiment,
                            "confidence": confidence,
                            "method": method
                        }

                except Exception as e:
                    logger.error(f"模型分析錯誤: {e}")

            # Fallback: pure keyword heuristic.
            sentiment, confidence = self._keyword_sentiment(processed_text)
            return {
                "sentiment": sentiment,
                "confidence": confidence,
                "method": "keyword"
            }

        except Exception as e:
            logger.error(f"情緒分析錯誤: {e}")
            return {
                "sentiment": "neutral",
                "confidence": 0.5,
                "method": "error"
            }

    def batch_analyze(self, texts: list, titles: list = None) -> list:
        """Analyze a list of texts; titles (optional) are matched by index."""
        results = []
        titles = titles or [""] * len(texts)

        for i, text in enumerate(texts):
            title = titles[i] if i < len(titles) else ""
            results.append(self.analyze_sentiment(text, title))

            # Periodically release cached GPU memory on long batches.
            # (FIX: was a conditional expression used as a statement.)
            if i % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        return results
utils.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
import html
|
| 6 |
+
|
| 7 |
+
def setup_logging():
    """Configure root logging: INFO level, console plus UTF-8 file output."""
    # NOTE(review): FileHandler writes news_app.log into the working
    # directory — confirm it is writable in the deployment environment.
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    log_handlers = [
        logging.StreamHandler(),
        logging.FileHandler('news_app.log', encoding='utf-8'),
    ]
    logging.basicConfig(level=logging.INFO, format=log_format, handlers=log_handlers)
def format_news_for_display(news_data: List[Dict]) -> str:
    """Render a list of news dicts as an HTML feed of styled cards.

    Each card shows a sentiment badge, category tag, source, date, a body
    truncated to 300 characters, and a link to the full article.

    SECURITY FIX: every remote-controlled field is HTML-escaped, including
    the URL — previously `url` was interpolated unescaped into `href="..."`
    attributes and could break out of the attribute.
    PERF: cards are collected in a list and joined once instead of the
    original quadratic `html_content +=` loop.

    Args:
        news_data: Dicts with optional keys title/content/url/source/
            category/published_date/sentiment.

    Returns:
        HTML string, or a placeholder message when the list is empty.
    """
    if not news_data:
        return "📰 暫無新聞資料"

    parts = ["""
    <div style="max-width: 100%; font-family: Arial, sans-serif;">
    """]

    sentiment_badges = {
        'positive': '<span class="sentiment-badge positive-badge">正面 😊</span>',
        'negative': '<span class="sentiment-badge negative-badge">負面 😔</span>',
        'neutral': '<span class="sentiment-badge neutral-badge">中性 😐</span>'
    }

    for news in news_data:
        sentiment = news.get('sentiment', 'neutral')
        sentiment_class = f"news-{sentiment}"
        sentiment_badge = sentiment_badges.get(sentiment, sentiment_badges['neutral'])

        # Normalise the publication timestamp for display.
        published_date = news.get('published_date', '')
        if isinstance(published_date, str):
            try:
                dt = datetime.fromisoformat(published_date.replace('Z', '+00:00'))
                formatted_date = dt.strftime('%Y-%m-%d %H:%M')
            except ValueError:  # was a bare except — narrow to the parse error
                formatted_date = published_date
        else:
            formatted_date = str(published_date)

        # Escape everything that ends up inside HTML, including the URL.
        title = html.escape(news.get('title', ''))
        content = html.escape(news.get('content', ''))
        url = html.escape(news.get('url', ''), quote=True)
        source = html.escape(news.get('source', ''))
        category_name = {'us_stock': '美股', 'tw_stock': '台股'}.get(news.get('category', ''), '財經')

        if len(content) > 300:
            content = content[:300] + "..."

        parts.append(f"""
        <div class="news-card {sentiment_class}" style="margin-bottom: 20px; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 10px;">
                <h3 style="margin: 0; font-size: 18px; color: #333; flex: 1; margin-right: 10px;">
                    <a href="{url}" target="_blank" style="color: #333; text-decoration: none;">{title}</a>
                </h3>
                {sentiment_badge}
            </div>

            <div style="margin-bottom: 10px; color: #666; font-size: 14px;">
                <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 12px; margin-right: 8px;">{category_name}</span>
                <span>{source}</span>
                <span style="margin-left: 8px;">📅 {formatted_date}</span>
            </div>

            <p style="margin: 10px 0; color: #555; line-height: 1.6;">{content}</p>

            <div style="margin-top: 10px; text-align: right;">
                <a href="{url}" target="_blank" style="color: #007bff; text-decoration: none; font-size: 14px;">閱讀全文 →</a>
            </div>
        </div>
        """)

    parts.append("</div>")
    return "".join(parts)
def clean_text(text: str) -> str:
    """Strip HTML tags, collapse whitespace and drop stray symbols."""
    if not text:
        return ""

    without_tags = re.sub(r'<[^>]+>', '', text)
    collapsed = re.sub(r'\s+', ' ', without_tags)
    # Keep CJK ideographs, word characters, whitespace and common punctuation.
    cleaned = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?]', '', collapsed)
    return cleaned.strip()
def calculate_similarity(text1: str, text2: str) -> float:
    """Jaccard similarity between the lower-cased word sets of two texts.

    Returns 0.0 when either text is empty or yields no tokens.
    """
    if not text1 or not text2:
        return 0.0

    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    if not vocab_a or not vocab_b:
        return 0.0

    shared = vocab_a & vocab_b
    combined = vocab_a | vocab_b
    return len(shared) / len(combined) if combined else 0.0
# Compiled once at import time; the original rebuilt and recompiled the
# pattern on every call.
_URL_PATTERN = re.compile(
    r'^https?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def validate_url(url: str) -> bool:
    """Return True when *url* looks like a valid http(s) URL."""
    return _URL_PATTERN.match(url) is not None
def format_duration(seconds: float) -> str:
    """Format a duration in seconds as a human-readable Chinese string."""
    if seconds >= 3600:
        return f"{seconds / 3600:.1f}小時"
    if seconds >= 60:
        return f"{seconds / 60:.1f}分鐘"
    return f"{seconds:.1f}秒"
def truncate_text(text: str, max_length: int = 100) -> str:
    """Truncate *text* to at most *max_length* characters, cutting back to
    the last space when one exists and appending an ellipsis."""
    if not text:
        return ""

    if len(text) <= max_length:
        return text

    head = text[:max_length]
    # Drop a partially-cut trailing word if there is a space to cut at.
    return head.rsplit(' ', 1)[0] + "..."