Spaces:
Sleeping
Sleeping
Update crawler.py
Browse files- crawler.py +41 -28
crawler.py
CHANGED
|
@@ -27,7 +27,7 @@ class NewsItem:
|
|
| 27 |
sentiment_score: Optional[float] = None
|
| 28 |
|
| 29 |
class CnYesNewsCrawler:
|
| 30 |
-
"""鉅亨網新聞爬蟲 -
|
| 31 |
|
| 32 |
def __init__(self, sentiment_analyzer=None, database=None):
|
| 33 |
self.base_url = "https://news.cnyes.com"
|
|
@@ -56,7 +56,7 @@ class CnYesNewsCrawler:
|
|
| 56 |
# 設置請求頭
|
| 57 |
self._setup_headers()
|
| 58 |
|
| 59 |
-
logger.info("爬蟲初始化完成")
|
| 60 |
logger.info(f"美股URL: {self.categories['us_stock']}")
|
| 61 |
logger.info(f"台股URL: {self.categories['tw_stock']}")
|
| 62 |
|
|
@@ -122,8 +122,8 @@ class CnYesNewsCrawler:
|
|
| 122 |
|
| 123 |
return None
|
| 124 |
|
| 125 |
-
def _extract_article_urls(self, category_url: str, max_pages: int =
|
| 126 |
-
"""從分類頁面提取文章URL -
|
| 127 |
article_urls = []
|
| 128 |
|
| 129 |
for page in range(1, max_pages + 1):
|
|
@@ -166,7 +166,7 @@ class CnYesNewsCrawler:
|
|
| 166 |
self._notify_progress(f"📄 第 {page} 頁找到 {len(page_urls)} 篇文章")
|
| 167 |
|
| 168 |
if not page_urls:
|
| 169 |
-
logger.warning(f"第 {page}
|
| 170 |
break
|
| 171 |
|
| 172 |
if page < max_pages:
|
|
@@ -349,47 +349,55 @@ class CnYesNewsCrawler:
|
|
| 349 |
|
| 350 |
return None
|
| 351 |
|
| 352 |
-
def crawl_category(self, category: str,
|
| 353 |
-
"""爬取指定分類的新聞 -
|
| 354 |
if category not in self.categories:
|
| 355 |
logger.error(f"無效的分類: {category}")
|
| 356 |
return []
|
| 357 |
|
| 358 |
category_name = "美股" if category == "us_stock" else "台股"
|
| 359 |
-
|
|
|
|
| 360 |
|
| 361 |
# 獲取文章URL列表
|
| 362 |
category_url = self.categories[category]
|
| 363 |
-
article_urls = self._extract_article_urls(category_url, max_pages=
|
| 364 |
|
| 365 |
if not article_urls:
|
| 366 |
self._notify_progress(f"⚠️ 未找到 {category_name} 分類的文章URL")
|
| 367 |
return []
|
| 368 |
|
| 369 |
-
# **關鍵修正:不限制文章數量**
|
| 370 |
total_articles = len(article_urls)
|
| 371 |
-
if max_articles and max_articles > 0:
|
| 372 |
-
# 只有在明確指定max_articles時才限制
|
| 373 |
-
if len(article_urls) > max_articles:
|
| 374 |
-
article_urls = article_urls[:max_articles]
|
| 375 |
-
self._notify_progress(f"⚠️ 限制處理文章數量為 {max_articles} 篇")
|
| 376 |
|
| 377 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
|
| 379 |
# 提取文章內容並即時分析存檔
|
| 380 |
articles = []
|
| 381 |
success_count = 0
|
| 382 |
error_count = 0
|
|
|
|
| 383 |
|
| 384 |
-
for i, url in enumerate(
|
| 385 |
try:
|
| 386 |
-
self._notify_progress(f"📖 處理 {category_name} 文章 {i}/{len(
|
| 387 |
article = self._extract_article_content(url, category)
|
| 388 |
|
| 389 |
if article:
|
| 390 |
# 即時情感分析
|
| 391 |
if self.sentiment_analyzer:
|
| 392 |
-
self._notify_progress(f"🧠 分析 {category_name} 文章 {i}/{len(
|
| 393 |
sentiment_result = self.sentiment_analyzer.analyze_sentiment(
|
| 394 |
article.content, article.title
|
| 395 |
)
|
|
@@ -418,16 +426,18 @@ class CnYesNewsCrawler:
|
|
| 418 |
success_count += 1
|
| 419 |
else:
|
| 420 |
self._notify_progress(f"⏭️ 跳過重複 {category_name} 文章: {article.title[:30]}...")
|
|
|
|
| 421 |
else:
|
| 422 |
self._notify_progress(f"⏭️ 跳過重複 {category_name} 文章: {article.title[:30]}...")
|
|
|
|
| 423 |
|
| 424 |
articles.append(article)
|
| 425 |
else:
|
| 426 |
error_count += 1
|
| 427 |
|
| 428 |
# 文章間延遲
|
| 429 |
-
if i < len(
|
| 430 |
-
time.sleep(random.uniform(
|
| 431 |
|
| 432 |
except Exception as e:
|
| 433 |
logger.error(f"處理文章時發生錯誤 {url}: {e}")
|
|
@@ -435,26 +445,29 @@ class CnYesNewsCrawler:
|
|
| 435 |
error_count += 1
|
| 436 |
continue
|
| 437 |
|
| 438 |
-
self._notify_progress(f"✅ {category_name}
|
| 439 |
return articles
|
| 440 |
|
| 441 |
-
def crawl_all_categories(self,
|
| 442 |
-
"""爬取所有分類的新聞 -
|
| 443 |
results = {}
|
|
|
|
|
|
|
|
|
|
| 444 |
|
| 445 |
for category in self.categories.keys():
|
| 446 |
try:
|
| 447 |
category_name = "美股" if category == "us_stock" else "台股"
|
| 448 |
self._notify_progress(f"🎯 開始爬取 {category_name} 分類")
|
| 449 |
|
| 450 |
-
#
|
| 451 |
-
articles = self.crawl_category(category,
|
| 452 |
results[category] = articles
|
| 453 |
|
| 454 |
# 分類間延遲
|
| 455 |
if len(self.categories) > 1:
|
| 456 |
self._notify_progress(f"⏸️ 分類間休息...")
|
| 457 |
-
time.sleep(random.uniform(
|
| 458 |
|
| 459 |
except Exception as e:
|
| 460 |
logger.error(f"爬取 {category} 分類時發生錯誤: {e}")
|
|
@@ -462,6 +475,6 @@ class CnYesNewsCrawler:
|
|
| 462 |
results[category] = []
|
| 463 |
|
| 464 |
total_articles = sum(len(articles) for articles in results.values())
|
| 465 |
-
self._notify_progress(f"🎉
|
| 466 |
|
| 467 |
return results
|
|
|
|
| 27 |
sentiment_score: Optional[float] = None
|
| 28 |
|
| 29 |
class CnYesNewsCrawler:
|
| 30 |
+
"""鉅亨網新聞爬蟲 - 完全無限制版"""
|
| 31 |
|
| 32 |
def __init__(self, sentiment_analyzer=None, database=None):
|
| 33 |
self.base_url = "https://news.cnyes.com"
|
|
|
|
| 56 |
# 設置請求頭
|
| 57 |
self._setup_headers()
|
| 58 |
|
| 59 |
+
logger.info("爬蟲初始化完成 - 無限制模式")
|
| 60 |
logger.info(f"美股URL: {self.categories['us_stock']}")
|
| 61 |
logger.info(f"台股URL: {self.categories['tw_stock']}")
|
| 62 |
|
|
|
|
| 122 |
|
| 123 |
return None
|
| 124 |
|
| 125 |
+
def _extract_article_urls(self, category_url: str, max_pages: int = 4) -> List[str]:
|
| 126 |
+
"""從分類頁面提取文章URL - 增加到4頁"""
|
| 127 |
article_urls = []
|
| 128 |
|
| 129 |
for page in range(1, max_pages + 1):
|
|
|
|
| 166 |
self._notify_progress(f"📄 第 {page} 頁找到 {len(page_urls)} 篇文章")
|
| 167 |
|
| 168 |
if not page_urls:
|
| 169 |
+
logger.warning(f"第 {page} 頁沒有找到文章,停止爬取後續頁面")
|
| 170 |
break
|
| 171 |
|
| 172 |
if page < max_pages:
|
|
|
|
| 349 |
|
| 350 |
return None
|
| 351 |
|
| 352 |
+
def crawl_category(self, category: str, unlimited: bool = True) -> List[NewsItem]:
|
| 353 |
+
"""爬取指定分類的新聞 - 完全無限制版"""
|
| 354 |
if category not in self.categories:
|
| 355 |
logger.error(f"無效的分類: {category}")
|
| 356 |
return []
|
| 357 |
|
| 358 |
category_name = "美股" if category == "us_stock" else "台股"
|
| 359 |
+
mode_text = "無限制" if unlimited else "限制"
|
| 360 |
+
self._notify_progress(f"🚀 開始爬取 {category_name} 分類新聞 ({mode_text}模式)")
|
| 361 |
|
| 362 |
# 獲取文章URL列表
|
| 363 |
category_url = self.categories[category]
|
| 364 |
+
article_urls = self._extract_article_urls(category_url, max_pages=4) # 增加到4頁
|
| 365 |
|
| 366 |
if not article_urls:
|
| 367 |
self._notify_progress(f"⚠️ 未找到 {category_name} 分類的文章URL")
|
| 368 |
return []
|
| 369 |
|
|
|
|
| 370 |
total_articles = len(article_urls)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
|
| 372 |
+
if unlimited:
|
| 373 |
+
# **完全無限制模式 - 處理所有文章**
|
| 374 |
+
self._notify_progress(f"🎯 無限制模式:將處理所有 {total_articles} 篇文章")
|
| 375 |
+
articles_to_process = article_urls
|
| 376 |
+
else:
|
| 377 |
+
# 限制模式 - 最多20篇
|
| 378 |
+
max_limit = 20
|
| 379 |
+
if total_articles > max_limit:
|
| 380 |
+
articles_to_process = article_urls[:max_limit]
|
| 381 |
+
self._notify_progress(f"⚠️ 限制模式:只處理前 {max_limit} 篇文章(共找到 {total_articles} 篇)")
|
| 382 |
+
else:
|
| 383 |
+
articles_to_process = article_urls
|
| 384 |
+
self._notify_progress(f"📊 限制模式:將處理所有 {total_articles} 篇文章")
|
| 385 |
|
| 386 |
# 提取文章內容並即時分析存檔
|
| 387 |
articles = []
|
| 388 |
success_count = 0
|
| 389 |
error_count = 0
|
| 390 |
+
skip_count = 0
|
| 391 |
|
| 392 |
+
for i, url in enumerate(articles_to_process, 1):
|
| 393 |
try:
|
| 394 |
+
self._notify_progress(f"📖 處理 {category_name} 文章 {i}/{len(articles_to_process)}: 正在提取內容...")
|
| 395 |
article = self._extract_article_content(url, category)
|
| 396 |
|
| 397 |
if article:
|
| 398 |
# 即時情感分析
|
| 399 |
if self.sentiment_analyzer:
|
| 400 |
+
self._notify_progress(f"🧠 分析 {category_name} 文章 {i}/{len(articles_to_process)}: {article.title[:30]}...")
|
| 401 |
sentiment_result = self.sentiment_analyzer.analyze_sentiment(
|
| 402 |
article.content, article.title
|
| 403 |
)
|
|
|
|
| 426 |
success_count += 1
|
| 427 |
else:
|
| 428 |
self._notify_progress(f"⏭️ 跳過重複 {category_name} 文章: {article.title[:30]}...")
|
| 429 |
+
skip_count += 1
|
| 430 |
else:
|
| 431 |
self._notify_progress(f"⏭️ 跳過重複 {category_name} 文章: {article.title[:30]}...")
|
| 432 |
+
skip_count += 1
|
| 433 |
|
| 434 |
articles.append(article)
|
| 435 |
else:
|
| 436 |
error_count += 1
|
| 437 |
|
| 438 |
# 文章間延遲
|
| 439 |
+
if i < len(articles_to_process):
|
| 440 |
+
time.sleep(random.uniform(2, 6)) # 進一步縮短延遲時間
|
| 441 |
|
| 442 |
except Exception as e:
|
| 443 |
logger.error(f"處理文章時發生錯誤 {url}: {e}")
|
|
|
|
| 445 |
error_count += 1
|
| 446 |
continue
|
| 447 |
|
| 448 |
+
self._notify_progress(f"✅ {category_name} 分類爬取完成 - 處理: {len(articles_to_process)}, 成功: {success_count}, 跳過: {skip_count}, 錯誤: {error_count}")
|
| 449 |
return articles
|
| 450 |
|
| 451 |
+
def crawl_all_categories(self, unlimited: bool = True) -> Dict[str, List[NewsItem]]:
|
| 452 |
+
"""爬取所有分類的新聞 - 完全無限制版"""
|
| 453 |
results = {}
|
| 454 |
+
mode_text = "無限制" if unlimited else "限制"
|
| 455 |
+
|
| 456 |
+
self._notify_progress(f"🚀 開始爬取所有分類 ({mode_text}模式)")
|
| 457 |
|
| 458 |
for category in self.categories.keys():
|
| 459 |
try:
|
| 460 |
category_name = "美股" if category == "us_stock" else "台股"
|
| 461 |
self._notify_progress(f"🎯 開始爬取 {category_name} 分類")
|
| 462 |
|
| 463 |
+
# 使用新的unlimited參數
|
| 464 |
+
articles = self.crawl_category(category, unlimited=unlimited)
|
| 465 |
results[category] = articles
|
| 466 |
|
| 467 |
# 分類間延遲
|
| 468 |
if len(self.categories) > 1:
|
| 469 |
self._notify_progress(f"⏸️ 分類間休息...")
|
| 470 |
+
time.sleep(random.uniform(15, 30)) # 縮短休息時間
|
| 471 |
|
| 472 |
except Exception as e:
|
| 473 |
logger.error(f"爬取 {category} 分類時發生錯誤: {e}")
|
|
|
|
| 475 |
results[category] = []
|
| 476 |
|
| 477 |
total_articles = sum(len(articles) for articles in results.values())
|
| 478 |
+
self._notify_progress(f"🎉 所有分類爬取完成 ({mode_text}模式),總共處理 {total_articles} 篇文章")
|
| 479 |
|
| 480 |
return results
|