khjhs60199 committed on
Commit
ec6ea02
·
verified ·
1 Parent(s): 2d3f73b
Files changed (7) hide show
  1. app.py +184 -0
  2. crawler.py +317 -0
  3. database.py +296 -0
  4. requirements.txt +30 -0
  5. scheduler.py +167 -0
  6. sentiment_analyzer.py +192 -0
  7. utils.py +157 -0
app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import sqlite3
4
+ import logging
5
+ import asyncio
6
+ import threading
7
+ import time
8
+ from datetime import datetime, timedelta
9
+ from typing import List, Dict, Optional
10
+ import os
11
+
12
+ from crawler import CnYesNewsCrawler
13
+ from sentiment_analyzer import SentimentAnalyzer
14
+ from database import NewsDatabase
15
+ from scheduler import NewsScheduler
16
+ from utils import setup_logging, format_news_for_display
17
+
18
+ # 設置日誌
19
+ setup_logging()
20
+ logger = logging.getLogger(__name__)
21
+
22
class NewsApp:
    """Application facade: wires the crawler, sentiment analyzer, database
    and background scheduler together and exposes UI-friendly helpers."""

    def __init__(self):
        # Assemble the pipeline components.
        self.db = NewsDatabase()
        self.crawler = CnYesNewsCrawler()
        self.sentiment_analyzer = SentimentAnalyzer()
        self.scheduler = NewsScheduler(self.db, self.crawler, self.sentiment_analyzer)

        # Kick off the periodic background crawl right away.
        self.scheduler.start()

        logger.info("新聞應用程式初始化完成")

    def get_latest_news(self, category: str = "all", limit: int = 50) -> str:
        """Return up to *limit* recent articles for *category*, formatted for display."""
        try:
            records = self.db.get_recent_news(category=category, limit=limit)
            # Friendly placeholder while the database is still empty.
            return format_news_for_display(records) if records else "📰 暫無新聞資料,請稍後再試"
        except Exception as e:
            logger.error(f"獲取新聞時發生錯誤: {e}")
            return f"❌ 獲取新聞時發生錯誤: {str(e)}"

    def manual_crawl(self) -> str:
        """Trigger one crawl cycle immediately and report the outcome."""
        try:
            logger.info("手動觸發爬蟲開始")
            outcome = self.scheduler.run_crawl_task()
        except Exception as e:
            logger.error(f"手動爬蟲錯誤: {e}")
            return f"❌ 手動爬蟲失敗: {str(e)}"
        return f"✅ 手動爬蟲完成: {outcome}"

    def get_statistics(self) -> str:
        """Render database-wide counters as a Markdown summary."""
        try:
            stats = self.db.get_statistics()
        except Exception as e:
            logger.error(f"獲取統計資訊錯誤: {e}")
            return f"❌ 獲取統計資訊失敗: {str(e)}"
        # dict.get below cannot raise, so formatting happens outside the try.
        return f"""
📊 **新聞統計**
- 總新聞數量: {stats.get('total_news', 0)}
- 美股新聞: {stats.get('us_stock_count', 0)}
- 台股新聞: {stats.get('tw_stock_count', 0)}
- 正面新聞: {stats.get('positive_count', 0)}
- 負面新聞: {stats.get('negative_count', 0)}
- 中性新聞: {stats.get('neutral_count', 0)}
- 最後更新: {stats.get('last_update', 'N/A')}
"""
74
+
75
# Single shared application instance; constructing it also starts the
# background scheduler.
app = NewsApp()
77
+
78
# Build the Gradio UI.
def create_interface():
    """Assemble the Gradio Blocks UI (news feed, statistics, and about tabs).

    Returns:
        The un-launched ``gr.Blocks`` instance; the caller decides how to
        serve it (see the ``__main__`` guard).
    """
    with gr.Blocks(
        title="📈 股市新聞情緒分析器",
        theme=gr.themes.Soft(),
        css="""
        .news-positive { background: linear-gradient(90deg, #d4edda 0%, #c3e6cb 100%); border-left: 4px solid #28a745; }
        .news-negative { background: linear-gradient(90deg, #f8d7da 0%, #f5c6cb 100%); border-left: 4px solid #dc3545; }
        .news-neutral { background: linear-gradient(90deg, #e2e3e5 0%, #d6d8db 100%); border-left: 4px solid #6c757d; }
        .news-card { margin: 10px 0; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        .sentiment-badge { padding: 4px 8px; border-radius: 12px; font-size: 12px; font-weight: bold; }
        .positive-badge { background: #28a745; color: white; }
        .negative-badge { background: #dc3545; color: white; }
        .neutral-badge { background: #6c757d; color: white; }
        """
    ) as interface:

        gr.Markdown("""
# 📈 股市新聞情緒分析器

🤖 自動爬取鉅亨網美股和台股新聞,並進行中文情緒分析

⏰ **自動更新**: 每30分鐘自動爬取最新新聞
🎯 **智能分析**: 使用 RoBERTa 模型進行情緒分析
🔄 **去重處理**: 自動過濾重複新聞
📅 **資料保留**: 保存兩週內的新聞資料
""")

        with gr.Tab("📰 最新新聞"):
            with gr.Row():
                with gr.Column(scale=3):
                    category_radio = gr.Radio(
                        choices=["all", "us_stock", "tw_stock"],
                        value="all",
                        label="新聞分類",
                        info="選擇要顯示的新聞類型"
                    )
                with gr.Column(scale=1):
                    refresh_btn = gr.Button("🔄 重新整理", variant="primary")
                    manual_crawl_btn = gr.Button("🚀 手動爬取", variant="secondary")

            news_display = gr.HTML(label="新聞內容")
            # BUG FIX: this textbox used to be created inline inside
            # manual_crawl_btn.click(...), so it was never added to the layout
            # and the crawl result was invisible to the user. It must be
            # declared here, inside the Blocks layout, to be rendered.
            crawl_result = gr.Textbox(label="爬取結果", interactive=False)

            # Initial load ignores the radio and shows everything.
            def auto_refresh():
                return app.get_latest_news("all")

            def refresh_news(category):
                return app.get_latest_news(category)

            # Wire up events.
            refresh_btn.click(refresh_news, inputs=[category_radio], outputs=[news_display])
            manual_crawl_btn.click(app.manual_crawl, outputs=[crawl_result])
            category_radio.change(refresh_news, inputs=[category_radio], outputs=[news_display])

            # Populate the feed when the page opens.
            interface.load(auto_refresh, outputs=[news_display])

        with gr.Tab("📊 統計資訊"):
            stats_display = gr.Markdown()
            stats_refresh_btn = gr.Button("🔄 更新統計")

            stats_refresh_btn.click(app.get_statistics, outputs=[stats_display])
            interface.load(app.get_statistics, outputs=[stats_display])

        with gr.Tab("ℹ️ 關於"):
            gr.Markdown("""
## 🛠️ 技術特色

### 📊 情緒分析
- **模型**: `uer/roberta-base-finetuned-jd-binary-chinese`
- **分類**: 正面 (綠色) / 負面 (紅色) / 中性 (灰色)
- **準確性**: 針對中文金融新聞優化

### 🕷️ 新聞爬蟲
- **來源**: 鉅亨網 (cnyes.com)
- **分類**: 美股、台股新聞
- **頻率**: 每30分鐘自動更新
- **去重**: 基於標題相似度智能去重

### 💾 資料管理
- **儲存**: SQLite 本地資料庫
- **保留期**: 自動清理兩週前的資料
- **效能**: 索引優化,快速查詢

### 🔧 系統功能
- **反爬蟲**: 隨機延遲、User-Agent 輪換
- **錯誤處理**: 完整的異常捕獲和日誌記錄
- **監控**: 即時統計和狀態監控

---

💡 **提示**: 首次啟動可能需要幾分鐘下載模型和初始化資料庫
""")

    return interface
174
+
175
# Application entry point.
if __name__ == "__main__":
    ui = create_interface()
    ui.launch(
        server_name="0.0.0.0",  # listen on all interfaces (required on hosted Spaces)
        server_port=7860,
        share=False,
        show_error=True,
        quiet=False,
    )
crawler.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import cloudscraper
3
+ from bs4 import BeautifulSoup
4
+ import time
5
+ import random
6
+ import logging
7
+ import re
8
+ from datetime import datetime, timedelta
9
+ from typing import List, Dict, Optional
10
+ from urllib.parse import urljoin, urlparse
11
+ from fake_useragent import UserAgent
12
+ import json
13
+ from dataclasses import dataclass
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
@dataclass
class NewsItem:
    """A single crawled news article.

    The sentiment fields start out empty and are filled in later by the
    sentiment-analysis stage of the pipeline.
    """
    title: str                  # headline text
    content: str                # article body (may be truncated by the crawler)
    url: str                    # canonical article URL (used for de-duplication)
    source: str                 # publisher name
    category: str               # e.g. 'us_stock' / 'tw_stock'
    published_date: datetime    # publication timestamp (naive local time)
    sentiment: Optional[str] = None         # 'positive' / 'negative' / 'neutral'
    sentiment_score: Optional[float] = None  # analyzer confidence
28
+
29
class CnYesNewsCrawler:
    """Crawler for cnyes.com (鉅亨網) US / Taiwan stock-market news.

    Uses cloudscraper to pass Cloudflare challenges, rotates User-Agent
    strings, and sleeps random intervals between requests to stay under
    anti-bot rate limits.
    """

    def __init__(self):
        self.base_url = "https://news.cnyes.com"
        # cloudscraper solves Cloudflare JS challenges transparently.
        self.session = cloudscraper.create_scraper()
        self.ua = UserAgent()

        # Category landing pages to crawl.
        self.categories = {
            'us_stock': 'https://news.cnyes.com/news/cat/us_stock',
            'tw_stock': 'https://news.cnyes.com/news/cat/tw_stock_news'
        }

        self._setup_headers()

    def _setup_headers(self):
        """Install browser-like default headers with a random User-Agent."""
        self.session.headers.update({
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        })

    def _get_page(self, url: str, retries: int = 3) -> Optional[BeautifulSoup]:
        """Fetch *url* and return parsed HTML, or None after *retries* failures."""
        for attempt in range(retries):
            try:
                # Random delay so request timing doesn't look automated.
                time.sleep(random.uniform(2, 5))

                # Rotate the User-Agent on every request.
                self.session.headers['User-Agent'] = self.ua.random

                response = self.session.get(url, timeout=30)

                if response.status_code == 200:
                    # Parse the raw bytes; BeautifulSoup handles decoding itself
                    # (setting response.encoding had no effect on .content).
                    return BeautifulSoup(response.content, 'html.parser')
                logger.warning(f"HTTP {response.status_code} for {url}")

            except Exception as e:
                logger.error(f"請求失敗 (嘗試 {attempt + 1}/{retries}): {e}")
                if attempt < retries - 1:
                    # Back off longer before retrying.
                    time.sleep(random.uniform(5, 10))

        return None

    def _extract_article_urls(self, category_url: str, max_pages: int = 3) -> List[str]:
        """Collect article URLs from up to *max_pages* pages of a category list.

        BUG FIX: previously de-duplicated with ``list(set(urls))``, which
        destroys the newest-first page order, so callers that truncate the
        list could drop the most recent articles. De-duplication now
        preserves discovery order.
        """
        article_urls: List[str] = []
        seen = set()  # O(1) membership instead of list scans

        for page in range(1, max_pages + 1):
            try:
                url = category_url if page == 1 else f"{category_url}?page={page}"

                logger.info(f"爬取分類頁面: {url}")
                soup = self._get_page(url)
                if not soup:
                    continue

                # Article links look like /news/id/<digits>.
                links = soup.find_all('a', href=re.compile(r'/news/id/\d+'))
                page_urls = []

                for link in links:
                    href = link.get('href')
                    if href:
                        full_url = urljoin(self.base_url, href)
                        if full_url not in seen:
                            seen.add(full_url)
                            page_urls.append(full_url)

                article_urls.extend(page_urls)
                logger.info(f"第 {page} 頁找到 {len(page_urls)} 篇文章")

                # An empty page means we've run past the last page.
                if not page_urls:
                    break

            except Exception as e:
                logger.error(f"爬取第 {page} 頁時發生錯誤: {e}")
                continue

        return article_urls

    def _extract_article_content(self, url: str, category: str) -> Optional[NewsItem]:
        """Download one article page and extract title/body/date into a NewsItem.

        Returns None when the page cannot be fetched or lacks a usable
        title or body.
        """
        try:
            soup = self._get_page(url)
            if not soup:
                return None

            # Try progressively more generic selectors for the headline.
            title_selectors = [
                'h1.news-title',
                'h1[class*="title"]',
                '.article-header h1',
                'h1'
            ]

            title = ""
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem:
                    title = title_elem.get_text(strip=True)
                    if title and len(title) > 5:
                        break

            if not title:
                logger.warning(f"無法提取標題: {url}")
                return None

            # Same strategy for the article body.
            content_selectors = [
                '.news-content',
                '.article-content',
                '.content-body',
                '[class*="article-text"]'
            ]

            content = ""
            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    # Strip scripts, styles and ad containers before reading text.
                    for unwanted in content_elem.select('script, style, .ad, .advertisement'):
                        unwanted.decompose()

                    paragraphs = content_elem.find_all(['p', 'div'])
                    content_parts = []
                    for p in paragraphs:
                        text = p.get_text(strip=True)
                        if text and len(text) > 10:  # skip boilerplate fragments
                            content_parts.append(text)

                    content = '\n'.join(content_parts)
                    if content:
                        break

            if not content or len(content) < 50:
                logger.warning(f"內容太短或無法提取: {url}")
                return None

            published_date = self._extract_publish_date(soup)

            news_item = NewsItem(
                title=title,
                content=content[:2000],  # cap stored body length
                url=url,
                source='鉅亨網',
                category=category,
                published_date=published_date
            )

            logger.info(f"成功提取文章: {title[:50]}...")
            return news_item

        except Exception as e:
            logger.error(f"提取文章內容時發生錯誤 {url}: {e}")
            return None

    def _extract_publish_date(self, soup: BeautifulSoup) -> datetime:
        """Best-effort extraction of the publish timestamp; falls back to now()."""
        time_selectors = [
            'time[datetime]',
            '.publish-time',
            '.news-time',
            '[class*="time"]'
        ]

        for selector in time_selectors:
            time_elem = soup.select_one(selector)
            if not time_elem:
                continue

            datetime_attr = time_elem.get('datetime')
            if datetime_attr:
                try:
                    # Normalize trailing 'Z' to an explicit UTC offset, then
                    # drop the tzinfo to keep everything naive like now().
                    return datetime.fromisoformat(datetime_attr.replace('Z', '+00:00')).replace(tzinfo=None)
                except ValueError:  # was a bare except: — only parse errors occur here
                    pass

            parsed_time = self._parse_time_text(time_elem.get_text(strip=True))
            if parsed_time:
                return parsed_time

        return datetime.now()

    def _parse_time_text(self, time_text: str) -> Optional[datetime]:
        """Parse common date formats out of free text; None when nothing matches."""
        patterns = [
            r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
            r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2})',
            r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
            r'(\d{4})-(\d{2})-(\d{2})'
        ]

        for pattern in patterns:
            match = re.search(pattern, time_text)
            if not match:
                continue
            try:
                # datetime() accepts (y, m, d[, H[, M[, S]]]) positionally,
                # which covers the 6-, 5- and 3-group patterns above.
                return datetime(*(int(g) for g in match.groups()))
            except ValueError:  # was a bare except: — e.g. month 13; try next pattern
                continue

        return None

    def crawl_category(self, category: str, max_articles: int = 20) -> List[NewsItem]:
        """Crawl one category, returning at most *max_articles* parsed articles."""
        if category not in self.categories:
            logger.error(f"無效的分類: {category}")
            return []

        logger.info(f"開始爬取 {category} 分類新聞")

        category_url = self.categories[category]
        article_urls = self._extract_article_urls(category_url)

        if not article_urls:
            logger.warning(f"未找到 {category} 分類的文章URL")
            return []

        # URLs are newest-first, so truncation keeps the most recent articles.
        article_urls = article_urls[:max_articles]

        articles = []
        for i, url in enumerate(article_urls, 1):
            try:
                logger.info(f"處理文章 {i}/{len(article_urls)}: {url}")
                article = self._extract_article_content(url, category)
                if article:
                    articles.append(article)

                # Random delay between article fetches.
                time.sleep(random.uniform(3, 8))

            except Exception as e:
                logger.error(f"處理文章時發生錯誤 {url}: {e}")
                continue

        logger.info(f"{category} 分類爬取完成,共 {len(articles)} 篇文章")
        return articles

    def crawl_all_categories(self, max_articles_per_category: int = 15) -> Dict[str, List[NewsItem]]:
        """Crawl every configured category; a failed category yields an empty list."""
        results = {}

        for category in self.categories.keys():
            try:
                logger.info(f"開始爬取 {category} 分類")
                results[category] = self.crawl_category(category, max_articles_per_category)

                # Longer pause between categories.
                time.sleep(random.uniform(10, 20))

            except Exception as e:
                logger.error(f"爬取 {category} 分類時發生錯誤: {e}")
                results[category] = []

        total_articles = sum(len(articles) for articles in results.values())
        logger.info(f"所有分類爬取完成,總共 {total_articles} 篇文章")

        return results
database.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import logging
3
+ import json
4
+ from datetime import datetime, timedelta
5
+ from typing import List, Dict, Optional, Tuple
6
+ import threading
7
+ from contextlib import contextmanager
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class NewsDatabase:
    """SQLite-backed store for crawled news articles and crawl statistics.

    All timestamps are persisted and queried as ISO-8601 strings so plain
    SQL string comparison orders them chronologically. Writes are
    serialized with a process-local lock; SQLite handles file locking
    for anything else.
    """

    def __init__(self, db_path: str = "news.db"):
        self.db_path = db_path
        self.lock = threading.Lock()  # serializes writers within this process

        self._init_database()

    def _init_database(self):
        """Create tables and indexes if they do not exist yet (idempotent)."""
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                # Articles; url is UNIQUE so INSERT OR IGNORE de-duplicates.
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS news (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        title TEXT NOT NULL,
                        content TEXT NOT NULL,
                        url TEXT UNIQUE NOT NULL,
                        source TEXT NOT NULL,
                        category TEXT NOT NULL,
                        published_date DATETIME NOT NULL,
                        created_date DATETIME DEFAULT CURRENT_TIMESTAMP,
                        sentiment TEXT,
                        sentiment_score REAL,
                        sentiment_method TEXT
                    )
                """)

                cursor.execute("CREATE INDEX IF NOT EXISTS idx_url ON news(url)")
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_category ON news(category)")
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_published_date ON news(published_date)")
                cursor.execute("CREATE INDEX IF NOT EXISTS idx_sentiment ON news(sentiment)")

                # Per-crawl bookkeeping.
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS crawl_stats (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        crawl_date DATETIME DEFAULT CURRENT_TIMESTAMP,
                        category TEXT NOT NULL,
                        articles_count INTEGER NOT NULL,
                        success_count INTEGER NOT NULL,
                        error_count INTEGER NOT NULL,
                        execution_time REAL
                    )
                """)

                conn.commit()
                logger.info("資料庫初始化完成")

        except Exception as e:
            logger.error(f"資料庫初始化錯誤: {e}")
            raise

    @contextmanager
    def _get_connection(self):
        """Yield a connection with dict-style rows; rollback on error, always close."""
        conn = None
        try:
            conn = sqlite3.connect(self.db_path, timeout=30.0)
            conn.row_factory = sqlite3.Row  # rows behave like dicts
            yield conn
        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"資料庫連接錯誤: {e}")
            raise
        finally:
            if conn:
                conn.close()

    @staticmethod
    def _as_iso(value):
        """Normalize a datetime (or already-ISO string) to an ISO-8601 string.

        BUG FIX: datetime objects used to be bound directly, which sqlite3's
        default adapter renders as "YYYY-MM-DD HH:MM:SS" while stored rows
        use "YYYY-MM-DDTHH:MM:SS" — the 'T' vs ' ' mismatch broke same-day
        string comparisons (and the implicit adapter is deprecated on 3.12+).
        """
        return value.isoformat() if isinstance(value, datetime) else value

    def insert_news(self, news_items: List[Dict]) -> Tuple[int, int]:
        """Insert articles, skipping rows whose URL already exists.

        Returns:
            (inserted_count, duplicate_count)
        """
        inserted_count = 0
        duplicate_count = 0

        try:
            with self.lock:
                with self._get_connection() as conn:
                    cursor = conn.cursor()

                    for item in news_items:
                        try:
                            cursor.execute("""
                                INSERT OR IGNORE INTO news
                                (title, content, url, source, category, published_date,
                                 sentiment, sentiment_score, sentiment_method)
                                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                            """, (
                                item.get('title'),
                                item.get('content'),
                                item.get('url'),
                                item.get('source'),
                                item.get('category'),
                                self._as_iso(item.get('published_date')),
                                item.get('sentiment'),
                                item.get('sentiment_score'),
                                item.get('sentiment_method')
                            ))

                            # rowcount is 0 when OR IGNORE skipped a duplicate URL.
                            if cursor.rowcount > 0:
                                inserted_count += 1
                            else:
                                duplicate_count += 1

                        except Exception as e:
                            logger.error(f"插入新聞時發生錯誤: {e}")
                            continue

                    conn.commit()

        except Exception as e:
            logger.error(f"批量插入新聞錯誤: {e}")
            raise

        logger.info(f"插入新聞完成 - 新增: {inserted_count}, 重複: {duplicate_count}")
        return inserted_count, duplicate_count

    def get_recent_news(self, category: str = "all", limit: int = 50, days: int = 7) -> List[Dict]:
        """Return up to *limit* articles from the last *days* days, newest first."""
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                # Compare ISO string against ISO string (see _as_iso).
                where_clause = "WHERE published_date >= ?"
                params = [(datetime.now() - timedelta(days=days)).isoformat()]

                if category != "all":
                    where_clause += " AND category = ?"
                    params.append(category)

                query = f"""
                    SELECT * FROM news
                    {where_clause}
                    ORDER BY published_date DESC
                    LIMIT ?
                """
                params.append(limit)

                cursor.execute(query, params)
                rows = cursor.fetchall()

                news_list = []
                for row in rows:
                    news_dict = dict(row)
                    # Hand callers real datetime objects, not raw strings.
                    if news_dict['published_date']:
                        news_dict['published_date'] = datetime.fromisoformat(news_dict['published_date'])
                    news_list.append(news_dict)

                return news_list

        except Exception as e:
            logger.error(f"獲取新聞錯誤: {e}")
            return []

    def get_statistics(self) -> Dict:
        """Return aggregate counters (totals, per-category, per-sentiment)."""
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                cursor.execute("SELECT COUNT(*) as total FROM news")
                total_news = cursor.fetchone()['total']

                cursor.execute("""
                    SELECT category, COUNT(*) as count
                    FROM news
                    GROUP BY category
                """)
                category_stats = {row['category']: row['count'] for row in cursor.fetchall()}

                cursor.execute("""
                    SELECT sentiment, COUNT(*) as count
                    FROM news
                    WHERE sentiment IS NOT NULL
                    GROUP BY sentiment
                """)
                sentiment_stats = {row['sentiment']: row['count'] for row in cursor.fetchall()}

                cursor.execute("SELECT MAX(created_date) as last_update FROM news")
                last_update = cursor.fetchone()['last_update']

                return {
                    'total_news': total_news,
                    'us_stock_count': category_stats.get('us_stock', 0),
                    'tw_stock_count': category_stats.get('tw_stock', 0),
                    'positive_count': sentiment_stats.get('positive', 0),
                    'negative_count': sentiment_stats.get('negative', 0),
                    'neutral_count': sentiment_stats.get('neutral', 0),
                    'last_update': last_update
                }

        except Exception as e:
            logger.error(f"獲取統計資訊錯誤: {e}")
            return {}

    def cleanup_old_news(self, days: int = 14) -> int:
        """Delete articles published more than *days* days ago; returns row count."""
        try:
            # ISO string so the comparison matches the stored format.
            cutoff_date = (datetime.now() - timedelta(days=days)).isoformat()

            with self.lock:
                with self._get_connection() as conn:
                    cursor = conn.cursor()

                    cursor.execute("""
                        DELETE FROM news
                        WHERE published_date < ?
                    """, (cutoff_date,))

                    deleted_count = cursor.rowcount
                    conn.commit()

            logger.info(f"清理了 {deleted_count} 條超過 {days} 天的新聞")
            return deleted_count

        except Exception as e:
            logger.error(f"清理舊新聞錯誤: {e}")
            return 0

    def record_crawl_stats(self, category: str, articles_count: int,
                           success_count: int, error_count: int, execution_time: float):
        """Append one crawl-run record to crawl_stats (best-effort; never raises)."""
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                cursor.execute("""
                    INSERT INTO crawl_stats
                    (category, articles_count, success_count, error_count, execution_time)
                    VALUES (?, ?, ?, ?, ?)
                """, (category, articles_count, success_count, error_count, execution_time))

                conn.commit()

        except Exception as e:
            logger.error(f"記錄爬蟲統計錯誤: {e}")

    def check_duplicate_by_title(self, title: str, similarity_threshold: float = 0.8) -> bool:
        """Return True if a similar title was stored within the last day.

        Similarity is Jaccard overlap of whitespace-separated tokens — crude
        for Chinese text (no word segmentation), but cheap and deterministic.

        BUG FIX: created_date is CURRENT_TIMESTAMP (UTC), but the one-day
        window used to be computed from local datetime.now(); it is now
        computed in SQL so both sides use the same clock.
        """
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()

                cursor.execute("""
                    SELECT title FROM news
                    WHERE created_date >= datetime('now', '-1 day')
                """)

                existing_titles = [row['title'] for row in cursor.fetchall()]

            title_words = set(title.lower().split())
            if not title_words:
                return False

            for existing_title in existing_titles:
                existing_words = set(existing_title.lower().split())
                if not existing_words:
                    continue

                union = title_words | existing_words
                similarity = len(title_words & existing_words) / len(union)

                if similarity > similarity_threshold:
                    return True

            return False

        except Exception as e:
            logger.error(f"檢查標題重複性錯誤: {e}")
            return False
requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==4.44.0
2
+ torch>=2.0.0
3
+ transformers>=4.30.0
4
+ datasets>=2.14.0
5
+ accelerate>=0.20.0
6
+ requests>=2.31.0
7
+ beautifulsoup4>=4.12.0
8
+ cloudscraper>=1.2.71
9
+ pandas>=2.0.0
10
+ numpy>=1.24.0
11
+ scikit-learn>=1.3.0
12
+ # sqlite3 ships with the Python standard library — it is not a pip package
13
+ python-dateutil>=2.8.2
14
+ pytz>=2023.3
15
+ schedule>=1.2.0
16
+ fake-useragent>=1.4.0
17
+ selenium>=4.15.0
18
+ webdriver-manager>=4.0.0
19
+ lxml>=4.9.0
20
+ cssselect>=1.2.0
21
+ readability-lxml>=0.8.1
22
+ feedparser>=6.0.10
23
+ nltk>=3.8.1
24
+ jieba>=0.42.1
25
+ emoji>=2.8.0
26
+ python-dotenv>=1.0.0
27
+ aiohttp>=3.8.0
28
+ # asyncio, threading and logging are Python standard-library modules;
29
+ # listing them as requirements would make `pip install -r requirements.txt` fail
scheduler.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import schedule
2
+ import threading
3
+ import time
4
+ import logging
5
+ from datetime import datetime
6
+ from typing import Dict, List
7
+ from crawler import CnYesNewsCrawler, NewsItem
8
+ from sentiment_analyzer import SentimentAnalyzer
9
+ from database import NewsDatabase
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
class NewsScheduler:
    """Runs the crawl → sentiment-analysis → store pipeline on a schedule.

    Jobs are registered with the process-global `schedule` module and driven
    by a daemon thread that polls once a minute.
    """

    def __init__(self, database: NewsDatabase, crawler: CnYesNewsCrawler, sentiment_analyzer: SentimentAnalyzer):
        self.db = database
        self.crawler = crawler
        self.sentiment_analyzer = sentiment_analyzer
        self.is_running = False
        self.scheduler_thread = None

    def start(self):
        """Register the periodic jobs and launch the background loop."""
        if self.is_running:
            logger.warning("排程器已經在運行中")
            return

        self.is_running = True

        schedule.every(30).minutes.do(self._run_crawl_task)          # crawl every 30 minutes
        schedule.every().day.at("02:00").do(self._cleanup_old_news)  # nightly cleanup at 02:00

        self.scheduler_thread = threading.Thread(target=self._run_scheduler, daemon=True)
        self.scheduler_thread.start()

        logger.info("新聞排程器已啟動 - 每30分鐘自動爬取")

        # Run one crawl immediately, without blocking the caller.
        threading.Thread(target=self._run_crawl_task, daemon=True).start()

    def stop(self):
        """Stop the background loop and unregister the jobs.

        BUG FIX: registered jobs are now cleared on stop; previously a
        stop()/start() cycle registered a second copy of every job.
        """
        self.is_running = False
        schedule.clear()
        if self.scheduler_thread:
            self.scheduler_thread.join(timeout=5)
        logger.info("新聞排程器已停止")

    def _run_scheduler(self):
        """Main loop: poll the `schedule` registry once a minute until stopped."""
        while self.is_running:
            try:
                schedule.run_pending()
                time.sleep(60)
            except Exception as e:
                logger.error(f"排程器運行錯誤: {e}")
                time.sleep(60)

    def _run_crawl_task(self):
        """Execute one full crawl cycle over every category.

        Returns:
            A human-readable summary string (shown in the UI for manual runs).
        """
        try:
            start_time = time.time()
            logger.info("開始執行定時爬蟲任務")

            all_news = self.crawler.crawl_all_categories(max_articles_per_category=15)

            total_articles = 0
            total_inserted = 0

            for category, articles in all_news.items():
                if not articles:
                    continue

                category_start = time.time()

                analyzed_articles = self._analyze_articles_sentiment(articles)
                db_articles = self._convert_to_db_format(analyzed_articles)
                inserted, duplicates = self.db.insert_news(db_articles)

                total_articles += len(articles)
                total_inserted += inserted

                # BUG FIX: execution_time used to be measured from the start
                # of the whole task, inflating the figure for every category
                # after the first; it is now measured per category.
                # NOTE: error_count also counts duplicates, kept as-is for
                # compatibility with existing stats consumers.
                self.db.record_crawl_stats(
                    category=category,
                    articles_count=len(articles),
                    success_count=inserted,
                    error_count=len(articles) - inserted,
                    execution_time=time.time() - category_start
                )

                logger.info(f"{category} 分類: {len(articles)} 篇文章, {inserted} 篇新增")

            execution_time = time.time() - start_time
            logger.info(f"爬蟲任務完成 - 總計: {total_articles} 篇, 新增: {total_inserted} 篇, 耗時: {execution_time:.2f}秒")

            return f"成功爬取 {total_articles} 篇文章,新增 {total_inserted} 篇"

        except Exception as e:
            logger.error(f"爬蟲任務執行錯誤: {e}")
            return f"爬蟲任務失敗: {str(e)}"

    def _analyze_articles_sentiment(self, articles: List[NewsItem]) -> List[NewsItem]:
        """Annotate each article in place with a sentiment label and confidence."""
        try:
            logger.info(f"開始分析 {len(articles)} 篇文章的情緒")

            for article in articles:
                sentiment_result = self.sentiment_analyzer.analyze_sentiment(
                    article.content,
                    article.title
                )
                article.sentiment = sentiment_result['sentiment']
                article.sentiment_score = sentiment_result['confidence']

            logger.info("情緒分析完成")
            return articles

        except Exception as e:
            # Best-effort: return whatever was annotated before the failure.
            logger.error(f"情緒分析錯誤: {e}")
            return articles

    def _convert_to_db_format(self, articles: List[NewsItem]) -> List[Dict]:
        """Convert NewsItems to insert_news() dicts, dropping near-duplicate titles."""
        db_articles = []

        for article in articles:
            if self.db.check_duplicate_by_title(article.title):
                logger.info(f"跳過重複文章: {article.title[:50]}...")
                continue

            db_articles.append({
                'title': article.title,
                'content': article.content,
                'url': article.url,
                'source': article.source,
                'category': article.category,
                # Stored as ISO-8601 text so SQL string comparison works.
                'published_date': article.published_date.isoformat(),
                'sentiment': article.sentiment,
                'sentiment_score': article.sentiment_score,
                'sentiment_method': 'auto'
            })

        return db_articles

    def _cleanup_old_news(self):
        """Scheduled job: drop articles older than two weeks."""
        try:
            deleted_count = self.db.cleanup_old_news(days=14)
            logger.info(f"清理任務完成,刪除了 {deleted_count} 條舊新聞")
        except Exception as e:
            logger.error(f"清理舊新聞錯誤: {e}")

    def run_crawl_task(self):
        """Public entry point for the UI's manual-crawl button."""
        return self._run_crawl_task()
sentiment_analyzer.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
3
+ import logging
4
+ import re
5
+ from typing import Dict, Tuple, Optional
6
+ import jieba
7
+ import emoji
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class SentimentAnalyzer:
    """Chinese financial-news sentiment analyzer.

    Combines a pretrained transformer classifier with a financial keyword
    lexicon: the model's verdict is used when it is confident, otherwise
    the keyword score is blended in.  When the model fails to load, the
    pure keyword method is used as a fallback.
    """

    def __init__(self, model_name: str = "uer/roberta-base-finetuned-jd-binary-chinese"):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.classifier = None
        # Prefer GPU when available; the pipeline device index is derived from this.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load the pretrained model.  On failure self.classifier stays None
        # and analyze_sentiment() falls back to the keyword method.
        self._load_model()

        # Financial-domain sentiment lexicons (Traditional Chinese).
        self.positive_keywords = {
            '上漲', '漲', '漲幅', '上升', '增長', '成長', '利好', '利多', '買進', '看好',
            '樂觀', '獲利', '盈利', '突破', '新高', '強勢', '回升', '反彈', '看漲',
            '推薦', '買入', '增持', '超買', '牛市', '多頭', '正面', '積極'
        }

        self.negative_keywords = {
            '下跌', '跌', '跌幅', '下滑', '下降', '減少', '衰退', '利空', '賣出', '看壞',
            '悲觀', '虧損', '損失', '破底', '新低', '弱勢', '下探', '重挫', '看跌',
            '減持', '超賣', '熊市', '空頭', '負面', '消極', '警告'
        }

    def _load_model(self):
        """Load the pretrained classifier; leave ``self.classifier`` as None on failure."""
        try:
            logger.info(f"載入情緒分析模型: {self.model_name}")

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)

            # NOTE(review): return_all_scores is deprecated in newer
            # transformers releases (top_k=None is the replacement), but the
            # nested result shape handled in analyze_sentiment() relies on it.
            self.classifier = pipeline(
                "text-classification",
                model=self.model,
                tokenizer=self.tokenizer,
                device=0 if self.device == "cuda" else -1,
                return_all_scores=True
            )

            logger.info("情緒分析模型載入成功")

        except Exception as e:
            logger.error(f"載入模型時發生錯誤: {e}")
            self.classifier = None

    def _preprocess_text(self, text: str) -> str:
        """Normalize text for classification.

        Strips emoji and unexpected symbols, collapses whitespace, and
        truncates to 500 characters for the BERT-style input limit.
        Returns the original text unchanged if preprocessing fails.
        """
        try:
            # Convert emoji to :name: tokens, then drop those tokens entirely.
            text = emoji.demojize(text, language='zh')
            text = re.sub(r':[a-zA-Z_]+:', '', text)

            # Keep CJK ideographs, word chars, whitespace and common punctuation.
            text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?]', '', text)

            # Collapse runs of whitespace.
            text = re.sub(r'\s+', ' ', text).strip()

            # Truncate for the model's input budget.
            if len(text) > 500:
                text = text[:500]

            return text

        except Exception as e:
            logger.error(f"文本預處理錯誤: {e}")
            return text

    def _keyword_sentiment(self, text: str) -> Tuple[str, float]:
        """Lexicon-based sentiment score.

        Returns (label, confidence) where confidence is always on the
        0.5-1.0 scale (0.5 = no signal) regardless of polarity, so it can
        be averaged with the model's softmax confidence.
        """
        positive_count = sum(1 for keyword in self.positive_keywords if keyword in text)
        negative_count = sum(1 for keyword in self.negative_keywords if keyword in text)

        total_keywords = positive_count + negative_count

        if total_keywords == 0:
            return "neutral", 0.5

        positive_ratio = positive_count / total_keywords

        if positive_ratio > 0.6:
            return "positive", 0.7 + (positive_ratio - 0.6) * 0.75
        elif positive_ratio < 0.4:
            # BUG FIX: was 0.3 - (0.4 - ratio) * 0.75, which gave the most
            # clearly negative text the LOWEST confidence (down to 0.0) and
            # mixed scales with the model's 0.5-1.0 confidence in the hybrid
            # average.  Mirror the positive branch instead.
            return "negative", 0.7 + (0.4 - positive_ratio) * 0.75
        else:
            return "neutral", 0.5

    def analyze_sentiment(self, text: str, title: str = "") -> Dict[str, object]:
        """Classify the sentiment of an article.

        Args:
            text: article body.
            title: optional title, prepended to the body before analysis.

        Returns:
            dict with keys 'sentiment' ('positive'/'negative'/'neutral'),
            'confidence' (float) and 'method'
            ('model'/'hybrid'/'keyword'/'default'/'error').
        """
        try:
            full_text = f"{title} {text}" if title else text
            processed_text = self._preprocess_text(full_text)

            if not processed_text:
                return {
                    "sentiment": "neutral",
                    "confidence": 0.5,
                    "method": "default"
                }

            # Prefer the transformer model when it loaded successfully.
            if self.classifier:
                try:
                    results = self.classifier(processed_text)

                    # return_all_scores=True yields [[{label, score}, ...]].
                    if results and len(results) > 0:
                        scores = results[0]
                        best_result = max(scores, key=lambda x: x['score'])

                        label_mapping = {
                            'LABEL_0': 'negative',
                            'LABEL_1': 'positive',
                            'negative': 'negative',
                            'positive': 'positive'
                        }

                        sentiment = label_mapping.get(best_result['label'], 'neutral')
                        confidence = best_result['score']

                        # Low model confidence: blend in the keyword signal.
                        if confidence < 0.7:
                            keyword_sentiment, keyword_confidence = self._keyword_sentiment(processed_text)

                            # The signal further from neutral (0.5) wins the label.
                            if abs(confidence - 0.5) < abs(keyword_confidence - 0.5):
                                sentiment = keyword_sentiment
                                confidence = (confidence + keyword_confidence) / 2
                                method = "hybrid"
                            else:
                                method = "model"
                        else:
                            method = "model"

                        return {
                            "sentiment": sentiment,
                            "confidence": confidence,
                            "method": method
                        }

                except Exception as e:
                    logger.error(f"模型分析錯誤: {e}")

            # Fallback: pure keyword analysis.
            sentiment, confidence = self._keyword_sentiment(processed_text)
            return {
                "sentiment": sentiment,
                "confidence": confidence,
                "method": "keyword"
            }

        except Exception as e:
            logger.error(f"情緒分析錯誤: {e}")
            return {
                "sentiment": "neutral",
                "confidence": 0.5,
                "method": "error"
            }

    def batch_analyze(self, texts: list, titles: list = None) -> list:
        """Analyze a batch of texts; returns one result dict per text."""
        results = []
        titles = titles or [""] * len(texts)

        for i, text in enumerate(texts):
            title = titles[i] if i < len(titles) else ""
            results.append(self.analyze_sentiment(text, title))

            # Periodically release cached GPU memory to avoid OOM on long runs.
            if i % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        return results
utils.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+ from datetime import datetime
4
+ from typing import List, Dict
5
+ import html
6
+
7
def setup_logging():
    """Configure root logging: INFO level, console output plus a UTF-8 log file."""
    log_handlers = [
        logging.StreamHandler(),
        logging.FileHandler('news_app.log', encoding='utf-8'),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=log_handlers,
    )
17
+
18
def format_news_for_display(news_data: List[Dict]) -> str:
    """Render news records as an HTML fragment for the UI.

    Args:
        news_data: list of dicts with keys such as 'title', 'content',
            'url', 'source', 'category', 'published_date', 'sentiment'.

    Returns:
        An HTML string, or a placeholder message when the list is empty.
    """
    if not news_data:
        return "📰 暫無新聞資料"

    # Hoisted lookup tables (previously rebuilt on every iteration).
    badges = {
        'positive': '<span class="sentiment-badge positive-badge">正面 😊</span>',
        'negative': '<span class="sentiment-badge negative-badge">負面 😔</span>',
        'neutral': '<span class="sentiment-badge neutral-badge">中性 😐</span>',
    }
    category_names = {'us_stock': '美股', 'tw_stock': '台股'}

    # Build via list + join instead of repeated string concatenation.
    parts = ['<div style="max-width: 100%; font-family: Arial, sans-serif;">']

    for news in news_data:
        # FIX: .get's default only applies when the key is MISSING; a stored
        # NULL sentiment previously produced class "news-None".
        sentiment = news.get('sentiment') or 'neutral'
        sentiment_class = f"news-{sentiment}"
        sentiment_badge = badges.get(sentiment, badges['neutral'])

        # Format the publication timestamp; fall back to the raw value.
        published_date = news.get('published_date', '')
        if isinstance(published_date, str):
            try:
                dt = datetime.fromisoformat(published_date.replace('Z', '+00:00'))
                formatted_date = dt.strftime('%Y-%m-%d %H:%M')
            except ValueError:  # narrow: fromisoformat signals bad input this way
                formatted_date = published_date
        else:
            formatted_date = str(published_date)

        title = html.escape(news.get('title', ''))
        url = news.get('url', '')
        source = html.escape(news.get('source', ''))
        category_name = category_names.get(news.get('category', ''), '財經')

        # FIX: truncate the RAW text before escaping; truncating the escaped
        # text could cut an HTML entity (e.g. "&amp;") in half.
        raw_content = news.get('content', '')
        if len(raw_content) > 300:
            raw_content = raw_content[:300] + "..."
        content = html.escape(raw_content)

        parts.append(f"""
        <div class="news-card {sentiment_class}" style="margin-bottom: 20px; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 10px;">
                <h3 style="margin: 0; font-size: 18px; color: #333; flex: 1; margin-right: 10px;">
                    <a href="{url}" target="_blank" style="color: #333; text-decoration: none;">{title}</a>
                </h3>
                {sentiment_badge}
            </div>

            <div style="margin-bottom: 10px; color: #666; font-size: 14px;">
                <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 12px; margin-right: 8px;">{category_name}</span>
                <span>{source}</span>
                <span style="margin-left: 8px;">📅 {formatted_date}</span>
            </div>

            <p style="margin: 10px 0; color: #555; line-height: 1.6;">{content}</p>

            <div style="margin-top: 10px; text-align: right;">
                <a href="{url}" target="_blank" style="color: #007bff; text-decoration: none; font-size: 14px;">閱讀全文 →</a>
            </div>
        </div>
        """)

    parts.append("</div>")
    return "".join(parts)
92
+
93
def clean_text(text: str) -> str:
    """Strip HTML tags, collapse whitespace, and drop unexpected symbols."""
    if not text:
        return ""

    # Remove HTML tags first, so tag contents survive as plain text.
    without_tags = re.sub(r'<[^>]+>', '', text)

    # Collapse runs of whitespace to a single space.
    normalized = re.sub(r'\s+', ' ', without_tags)

    # Keep CJK ideographs, word characters, whitespace and common punctuation.
    filtered = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?]', '', normalized)

    return filtered.strip()
108
+
109
def calculate_similarity(text1: str, text2: str) -> float:
    """Jaccard similarity between the word sets of two texts (0.0 to 1.0)."""
    if not text1 or not text2:
        return 0.0

    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())

    if not tokens_a or not tokens_b:
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b

    return len(shared) / len(combined) if combined else 0.0
125
+
126
# Compiled once at import time; validate_url may be called per-article.
_URL_PATTERN = re.compile(
    r'^https?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def validate_url(url: str) -> bool:
    """Return True when *url* looks like a valid http(s) URL.

    FIX: a None/empty url previously raised TypeError inside re.match;
    it now simply returns False.
    """
    if not url:
        return False
    return _URL_PATTERN.match(url) is not None
137
+
138
def format_duration(seconds: float) -> str:
    """Render a duration in seconds as a human-readable Chinese string."""
    # Guard clauses from the largest unit down; thresholds match the
    # original's "< 60 / < 3600 / otherwise" branching exactly.
    if seconds >= 3600:
        return f"{seconds / 3600:.1f}小時"
    if seconds >= 60:
        return f"{seconds / 60:.1f}分鐘"
    return f"{seconds:.1f}秒"
148
+
149
def truncate_text(text: str, max_length: int = 100) -> str:
    """Shorten *text* to at most *max_length* chars, ending at a word boundary."""
    if not text:
        return ""
    if len(text) <= max_length:
        return text

    clipped = text[:max_length]
    # Drop the trailing partial word (a no-op for text without spaces,
    # e.g. Chinese prose).
    head = clipped.rsplit(' ', 1)[0]
    return head + "..."