khjhs60199 committed
Commit 0c07a4b · verified · 1 Parent(s): bcdcc05

Update scheduler.py

Files changed (1)
  1. scheduler.py +108 -61
scheduler.py CHANGED
@@ -39,7 +39,7 @@ class NewsScheduler:
         logger.info("News scheduler started - auto-crawling every 30 minutes")

         # Run one crawl immediately
-        threading.Thread(target=self._run_crawl_task, daemon=True).start()
+        threading.Thread(target=self.run_crawl_task, daemon=True).start()

     def stop(self):
         """Stop the scheduler"""
@@ -59,109 +59,156 @@ class NewsScheduler:
             time.sleep(60)

     def _run_crawl_task(self):
-        """Run the crawl task"""
+        """Internal crawl task (does not return a result)"""
+        try:
+            self.run_crawl_task()
+        except Exception as e:
+            logger.error(f"Internal crawl task error: {e}")
+
+    def run_crawl_task(self):
+        """Run the crawl task (callable externally)"""
         try:
             start_time = time.time()
-            logger.info("Starting scheduled crawl task")
+            logger.info("🚀 Starting crawl task")

             # Crawl all categories
-            all_news = self.crawler.crawl_all_categories(max_articles_per_category=15)
+            all_news = self.crawler.crawl_all_categories(max_articles_per_category=8)

             total_articles = 0
             total_inserted = 0
+            total_success = 0

             for category, articles in all_news.items():
                 if not articles:
+                    logger.warning(f"⚠️ No articles fetched for category {category}")
                     continue

+                logger.info(f"📊 Processing {len(articles)} articles in category {category}")
+
                 # Sentiment analysis
-                analyzed_articles = self._analyze_articles_sentiment(articles)
+                try:
+                    logger.info(f"🧠 Starting sentiment analysis for category {category}")
+                    analyzed_articles = self._analyze_articles_sentiment(articles)
+                    logger.info(f"✅ Sentiment analysis finished for category {category}")
+                except Exception as e:
+                    logger.error(f"❌ Sentiment analysis error for category {category}: {e}")
+                    analyzed_articles = articles  # fall back to the original articles

                 # Convert to database format
-                db_articles = self._convert_to_db_format(analyzed_articles)
+                try:
+                    db_articles = self._convert_to_db_format(analyzed_articles)
+                    logger.info(f"🔄 Converted {len(db_articles)} articles in category {category} to database format")
+                except Exception as e:
+                    logger.error(f"❌ Format conversion error for category {category}: {e}")
+                    continue

                 # Insert into the database
-                inserted, duplicates = self.db.insert_news(db_articles)
-
-                total_articles += len(articles)
-                total_inserted += inserted
-
-                # Record statistics
-                execution_time = time.time() - start_time
-                self.db.record_crawl_stats(
-                    category=category,
-                    articles_count=len(articles),
-                    success_count=inserted,
-                    error_count=len(articles) - inserted,
-                    execution_time=execution_time
-                )
-
-                logger.info(f"Category {category}: {len(articles)} articles, {inserted} newly added")
+                try:
+                    if db_articles:
+                        inserted, duplicates = self.db.insert_news(db_articles)
+                        logger.info(f"💾 Database writes finished for category {category}: {inserted} added, {duplicates} duplicates")
+
+                        total_articles += len(articles)
+                        total_inserted += inserted
+                        total_success += len(analyzed_articles)
+
+                        # Record statistics
+                        execution_time = time.time() - start_time
+                        self.db.record_crawl_stats(
+                            category=category,
+                            articles_count=len(articles),
+                            success_count=inserted,
+                            error_count=len(articles) - inserted,
+                            execution_time=execution_time
+                        )
+                    else:
+                        logger.warning(f"⚠️ No valid article data for category {category}")
+                except Exception as e:
+                    logger.error(f"❌ Database insert error for category {category}: {e}")
+                    continue

             execution_time = time.time() - start_time
-            logger.info(f"Crawl task finished - total: {total_articles} articles, newly added: {total_inserted}, elapsed: {execution_time:.2f}s")
+            result_message = f"🎉 Crawl task finished - total: {total_articles}, succeeded: {total_success}, newly added: {total_inserted}, elapsed: {execution_time:.2f}s"
+            logger.info(result_message)

-            return f"Successfully crawled {total_articles} articles, {total_inserted} newly added"
+            return result_message

         except Exception as e:
-            logger.error(f"Crawl task execution error: {e}")
-            return f"Crawl task failed: {str(e)}"
+            error_message = f"Crawl task execution error: {e}"
+            logger.error(error_message)
+            return error_message

     def _analyze_articles_sentiment(self, articles: List[NewsItem]) -> List[NewsItem]:
         """Run sentiment analysis on the articles"""
         try:
-            logger.info(f"Starting sentiment analysis of {len(articles)} articles")
+            logger.info(f"🔍 Starting sentiment analysis of {len(articles)} articles")

-            for article in articles:
-                sentiment_result = self.sentiment_analyzer.analyze_sentiment(
-                    article.content,
-                    article.title
-                )
-
-                article.sentiment = sentiment_result['sentiment']
-                article.sentiment_score = sentiment_result['confidence']
+            for i, article in enumerate(articles, 1):
+                try:
+                    logger.debug(f"Analyzing article {i}/{len(articles)}: {article.title[:30]}...")
+
+                    sentiment_result = self.sentiment_analyzer.analyze_sentiment(
+                        article.content,
+                        article.title
+                    )
+
+                    article.sentiment = sentiment_result['sentiment']
+                    article.sentiment_score = sentiment_result['confidence']
+
+                    logger.debug(f"Sentiment result: {sentiment_result['sentiment']} (confidence: {sentiment_result['confidence']:.2f})")
+
+                except Exception as e:
+                    logger.error(f"Error analyzing article {i}: {e}")
+                    # Set default values
+                    article.sentiment = 'neutral'
+                    article.sentiment_score = 0.5

-            logger.info("Sentiment analysis finished")
+            logger.info("Sentiment analysis finished")
             return articles

         except Exception as e:
-            logger.error(f"Sentiment analysis error: {e}")
+            logger.error(f"Sentiment analysis process error: {e}")
             return articles

     def _convert_to_db_format(self, articles: List[NewsItem]) -> List[Dict]:
         """Convert to database format"""
         db_articles = []

-        for article in articles:
-            # Check for duplicates
-            if self.db.check_duplicate_by_title(article.title):
-                logger.info(f"Skipping duplicate article: {article.title[:50]}...")
+        logger.info(f"🔄 Converting {len(articles)} articles to database format")
+
+        for i, article in enumerate(articles, 1):
+            try:
+                # Check for duplicates
+                if self.db.check_duplicate_by_title(article.title):
+                    logger.info(f"⏭️ Skipping duplicate article {i}: {article.title[:30]}...")
+                    continue
+
+                db_article = {
+                    'title': article.title,
+                    'content': article.content,
+                    'url': article.url,
+                    'source': article.source,
+                    'category': article.category,
+                    'published_date': article.published_date.isoformat() if article.published_date else datetime.now().isoformat(),
+                    'sentiment': article.sentiment,
+                    'sentiment_score': article.sentiment_score,
+                    'sentiment_method': 'auto'
+                }
+
+                db_articles.append(db_article)
+                logger.debug(f"✅ Converted article {i}: {article.title[:30]}...")
+
+            except Exception as e:
+                logger.error(f"❌ Error converting article {i}: {e}")
                 continue
-
-            db_article = {
-                'title': article.title,
-                'content': article.content,
-                'url': article.url,
-                'source': article.source,
-                'category': article.category,
-                'published_date': article.published_date.isoformat(),
-                'sentiment': article.sentiment,
-                'sentiment_score': article.sentiment_score,
-                'sentiment_method': 'auto'
-            }
-
-            db_articles.append(db_article)

+        logger.info(f"🔄 Conversion finished, {len(db_articles)} valid articles")
         return db_articles

     def _cleanup_old_news(self):
         """Clean up old news"""
         try:
             deleted_count = self.db.cleanup_old_news(days=14)
-            logger.info(f"Cleanup finished, deleted {deleted_count} old news items")
+            logger.info(f"🧹 Cleanup finished, deleted {deleted_count} old news items")
         except Exception as e:
-            logger.error(f"Error cleaning up old news: {e}")
-
-    def run_crawl_task(self):
-        """Run the crawl task manually (for the UI)"""
-        return self._run_crawl_task()
+            logger.error(f"Error cleaning up old news: {e}")
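
The structural change in this hunk splits crawling into two entry points: _run_crawl_task, which the scheduler thread calls and which only logs failures, and run_crawl_task, which returns a human-readable status string for external callers such as a UI. A minimal sketch of how the two might be driven, assuming NewsScheduler can be constructed with no arguments (the harness below is illustrative, not part of this commit):

    # Illustrative harness; NewsScheduler's constructor arguments are not
    # shown in this diff, so the no-argument form is an assumption.
    import threading

    scheduler = NewsScheduler()

    # Background path: fire-and-forget, exceptions are logged rather than raised.
    threading.Thread(target=scheduler._run_crawl_task, daemon=True).start()

    # UI path: synchronous call returning a status string such as
    # "🎉 Crawl task finished - total: 16, succeeded: 16, newly added: 9, ..."
    print(scheduler.run_crawl_task())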
 
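The per-article fallback added in _analyze_articles_sentiment relies on analyze_sentiment(content, title) returning a dict with 'sentiment' and 'confidence' keys. A throwaway stub honoring that contract makes it possible to exercise the scheduler without the real analyzer; the class below is hypothetical, and only the return shape is taken from the diff:

    # Hypothetical stand-in for self.sentiment_analyzer; the keyword lists are
    # toy examples, only the {'sentiment': ..., 'confidence': ...} shape matters.
    class StubSentimentAnalyzer:
        POSITIVE = ("上漲", "成長", "創新高")
        NEGATIVE = ("下跌", "虧損", "衰退")

        def analyze_sentiment(self, content: str, title: str) -> dict:
            text = f"{title} {content}"
            if any(word in text for word in self.POSITIVE):
                return {"sentiment": "positive", "confidence": 0.8}
            if any(word in text for word in self.NEGATIVE):
                return {"sentiment": "negative", "confidence": 0.8}
            # Mirrors the scheduler's own error fallback: neutral at 0.5.
            return {"sentiment": "neutral", "confidence": 0.5}

Separately, the new published_date fallback in _convert_to_db_format calls datetime.now(), which assumes "from datetime import datetime" is already present at the top of scheduler.py; that import sits outside this hunk.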