Spaces:
Sleeping
Sleeping
Update crawler.py
Browse files
- crawler.py +29 -14
crawler.py
CHANGED
|
@@ -27,7 +27,7 @@ class NewsItem:
|
|
| 27 |
sentiment_score: Optional[float] = None
|
| 28 |
|
| 29 |
class CnYesNewsCrawler:
|
| 30 |
-
"""鉅亨網新聞爬蟲 -
|
| 31 |
|
| 32 |
def __init__(self, sentiment_analyzer=None, database=None):
|
| 33 |
self.base_url = "https://news.cnyes.com"
|
|
@@ -122,8 +122,8 @@ class CnYesNewsCrawler:
|
|
| 122 |
|
| 123 |
return None
|
| 124 |
|
| 125 |
-
def _extract_article_urls(self, category_url: str, max_pages: int =
|
| 126 |
-
"""從分類頁面提取文章URL"""
|
| 127 |
article_urls = []
|
| 128 |
|
| 129 |
for page in range(1, max_pages + 1):
|
|
@@ -349,8 +349,8 @@ class CnYesNewsCrawler:
|
|
| 349 |
|
| 350 |
return None
|
| 351 |
|
| 352 |
-
def crawl_category(self, category: str, max_articles: int =
|
| 353 |
-
"""爬取指定分類的新聞"""
|
| 354 |
if category not in self.categories:
|
| 355 |
logger.error(f"無效的分類: {category}")
|
| 356 |
return []
|
|
@@ -360,18 +360,27 @@ class CnYesNewsCrawler:
|
|
| 360 |
|
| 361 |
# 獲取文章URL列表
|
| 362 |
category_url = self.categories[category]
|
| 363 |
-
article_urls = self._extract_article_urls(category_url, max_pages=
|
| 364 |
|
| 365 |
if not article_urls:
|
| 366 |
self._notify_progress(f"⚠️ 未找到 {category_name} 分類的文章URL")
|
| 367 |
return []
|
| 368 |
|
| 369 |
-
#
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
# 提取文章內容並即時分析存檔
|
| 374 |
articles = []
|
|
|
|
|
|
|
|
|
|
| 375 |
for i, url in enumerate(article_urls, 1):
|
| 376 |
try:
|
| 377 |
self._notify_progress(f"📖 處理 {category_name} 文章 {i}/{len(article_urls)}: 正在提取內容...")
|
|
@@ -406,40 +415,46 @@ class CnYesNewsCrawler:
|
|
| 406 |
inserted, _ = self.database.insert_news([db_article])
|
| 407 |
if inserted > 0:
|
| 408 |
self._notify_progress(f"💾 已保存 {category_name} 文章: {article.title[:30]}... (情緒: {article.sentiment})")
|
|
|
|
| 409 |
else:
|
| 410 |
self._notify_progress(f"⏭️ 跳過重複 {category_name} 文章: {article.title[:30]}...")
|
| 411 |
else:
|
| 412 |
self._notify_progress(f"⏭️ 跳過重複 {category_name} 文章: {article.title[:30]}...")
|
| 413 |
|
| 414 |
articles.append(article)
|
|
|
|
|
|
|
| 415 |
|
| 416 |
# 文章間延遲
|
| 417 |
if i < len(article_urls):
|
| 418 |
-
time.sleep(random.uniform(
|
| 419 |
|
| 420 |
except Exception as e:
|
| 421 |
logger.error(f"處理文章時發生錯誤 {url}: {e}")
|
| 422 |
self._notify_progress(f"❌ 處理 {category_name} 文章時發生錯誤: {str(e)[:50]}...")
|
|
|
|
| 423 |
continue
|
| 424 |
|
| 425 |
-
self._notify_progress(f"✅ {category_name} 分類爬取完成,共處理 {len(articles)}
|
| 426 |
return articles
|
| 427 |
|
| 428 |
-
def crawl_all_categories(self, max_articles_per_category: int =
|
| 429 |
-
"""爬取所有分類的新聞"""
|
| 430 |
results = {}
|
| 431 |
|
| 432 |
for category in self.categories.keys():
|
| 433 |
try:
|
| 434 |
category_name = "美股" if category == "us_stock" else "台股"
|
| 435 |
self._notify_progress(f"🎯 開始爬取 {category_name} 分類")
|
|
|
|
|
|
|
| 436 |
articles = self.crawl_category(category, max_articles_per_category)
|
| 437 |
results[category] = articles
|
| 438 |
|
| 439 |
# 分類間延遲
|
| 440 |
if len(self.categories) > 1:
|
| 441 |
self._notify_progress(f"⏸️ 分類間休息...")
|
| 442 |
-
time.sleep(random.uniform(
|
| 443 |
|
| 444 |
except Exception as e:
|
| 445 |
logger.error(f"爬取 {category} 分類時發生錯誤: {e}")
|
|
|
|
| 27 |
sentiment_score: Optional[float] = None
|
| 28 |
|
| 29 |
class CnYesNewsCrawler:
|
| 30 |
+
"""鉅亨網新聞爬蟲 - 無限制版"""
|
| 31 |
|
| 32 |
def __init__(self, sentiment_analyzer=None, database=None):
|
| 33 |
self.base_url = "https://news.cnyes.com"
|
|
|
|
| 122 |
|
| 123 |
return None
|
| 124 |
|
| 125 |
+
def _extract_article_urls(self, category_url: str, max_pages: int = 3) -> List[str]:
|
| 126 |
+
"""從分類頁面提取文章URL - 增加頁數"""
|
| 127 |
article_urls = []
|
| 128 |
|
| 129 |
for page in range(1, max_pages + 1):
|
|
|
|
| 349 |
|
| 350 |
return None
|
| 351 |
|
| 352 |
+
def crawl_category(self, category: str, max_articles: int = None) -> List[NewsItem]:
|
| 353 |
+
"""爬取指定分類的新聞 - 移除文章數量限制"""
|
| 354 |
if category not in self.categories:
|
| 355 |
logger.error(f"無效的分類: {category}")
|
| 356 |
return []
|
|
|
|
| 360 |
|
| 361 |
# 獲取文章URL列表
|
| 362 |
category_url = self.categories[category]
|
| 363 |
+
article_urls = self._extract_article_urls(category_url, max_pages=3) # 增加到3頁
|
| 364 |
|
| 365 |
if not article_urls:
|
| 366 |
self._notify_progress(f"⚠️ 未找到 {category_name} 分類的文章URL")
|
| 367 |
return []
|
| 368 |
|
| 369 |
+
# **關鍵修正:不限制文章數量**
|
| 370 |
+
total_articles = len(article_urls)
|
| 371 |
+
if max_articles and max_articles > 0:
|
| 372 |
+
# 只有在明確指定max_articles時才限制
|
| 373 |
+
if len(article_urls) > max_articles:
|
| 374 |
+
article_urls = article_urls[:max_articles]
|
| 375 |
+
self._notify_progress(f"⚠️ 限制處理文章數量為 {max_articles} 篇")
|
| 376 |
+
|
| 377 |
+
self._notify_progress(f"📊 將處理 {len(article_urls)} 篇文章(共找到 {total_articles} 篇)")
|
| 378 |
|
| 379 |
# 提取文章內容並即時分析存檔
|
| 380 |
articles = []
|
| 381 |
+
success_count = 0
|
| 382 |
+
error_count = 0
|
| 383 |
+
|
| 384 |
for i, url in enumerate(article_urls, 1):
|
| 385 |
try:
|
| 386 |
self._notify_progress(f"📖 處理 {category_name} 文章 {i}/{len(article_urls)}: 正在提取內容...")
|
|
|
|
| 415 |
inserted, _ = self.database.insert_news([db_article])
|
| 416 |
if inserted > 0:
|
| 417 |
self._notify_progress(f"💾 已保存 {category_name} 文章: {article.title[:30]}... (情緒: {article.sentiment})")
|
| 418 |
+
success_count += 1
|
| 419 |
else:
|
| 420 |
self._notify_progress(f"⏭️ 跳過重複 {category_name} 文章: {article.title[:30]}...")
|
| 421 |
else:
|
| 422 |
self._notify_progress(f"⏭️ 跳過重複 {category_name} 文章: {article.title[:30]}...")
|
| 423 |
|
| 424 |
articles.append(article)
|
| 425 |
+
else:
|
| 426 |
+
error_count += 1
|
| 427 |
|
| 428 |
# 文章間延遲
|
| 429 |
if i < len(article_urls):
|
| 430 |
+
time.sleep(random.uniform(3, 8)) # 縮短延遲時間
|
| 431 |
|
| 432 |
except Exception as e:
|
| 433 |
logger.error(f"處理文章時發生錯誤 {url}: {e}")
|
| 434 |
self._notify_progress(f"❌ 處理 {category_name} 文章時發生錯誤: {str(e)[:50]}...")
|
| 435 |
+
error_count += 1
|
| 436 |
continue
|
| 437 |
|
| 438 |
+
self._notify_progress(f"✅ {category_name} 分類爬取完成,共處理 {len(articles)} 篇文章(成功: {success_count}, 錯誤: {error_count})")
|
| 439 |
return articles
|
| 440 |
|
| 441 |
+
def crawl_all_categories(self, max_articles_per_category: int = None) -> Dict[str, List[NewsItem]]:
|
| 442 |
+
"""爬取所有分類的新聞 - 移除限制"""
|
| 443 |
results = {}
|
| 444 |
|
| 445 |
for category in self.categories.keys():
|
| 446 |
try:
|
| 447 |
category_name = "美股" if category == "us_stock" else "台股"
|
| 448 |
self._notify_progress(f"🎯 開始爬取 {category_name} 分類")
|
| 449 |
+
|
| 450 |
+
# **關鍵修正:傳遞None表示不限制**
|
| 451 |
articles = self.crawl_category(category, max_articles_per_category)
|
| 452 |
results[category] = articles
|
| 453 |
|
| 454 |
# 分類間延遲
|
| 455 |
if len(self.categories) > 1:
|
| 456 |
self._notify_progress(f"⏸️ 分類間休息...")
|
| 457 |
+
time.sleep(random.uniform(20, 40)) # 縮短休息時間
|
| 458 |
|
| 459 |
except Exception as e:
|
| 460 |
logger.error(f"爬取 {category} 分類時發生錯誤: {e}")
|