khjhs60199 committed on
Commit
c2a50a4
·
verified ·
1 Parent(s): ea8c7be

Update crawler.py

Browse files
Files changed (1) hide show
  1. crawler.py +22 -16
crawler.py CHANGED
@@ -27,7 +27,7 @@ class NewsItem:
27
  sentiment_score: Optional[float] = None
28
 
29
  class CnYesNewsCrawler:
30
- """鉅亨網新聞爬蟲 - 即時分析版"""
31
 
32
  def __init__(self, sentiment_analyzer=None, database=None):
33
  self.base_url = "https://news.cnyes.com"
@@ -44,10 +44,10 @@ class CnYesNewsCrawler:
44
  self.sentiment_analyzer = sentiment_analyzer
45
  self.database = database
46
 
47
- # ๆ–ฐ่žๅˆ†้กžURL
48
  self.categories = {
49
- 'us_stock': 'https://news.cnyes.com/news/cat/us_stock',
50
- 'tw_stock': 'https://news.cnyes.com/news/cat/tw_stock_news'
51
  }
52
 
53
  # 進度回調函數
@@ -56,6 +56,10 @@ class CnYesNewsCrawler:
56
  # 設置請求頭
57
  self._setup_headers()
58
 
 
 
 
 
59
  def set_progress_callback(self, callback: Callable[[str], None]):
60
  """設置進度回調函數"""
61
  self.progress_callback = callback
@@ -346,19 +350,20 @@ class CnYesNewsCrawler:
346
  return None
347
 
348
  def crawl_category(self, category: str, max_articles: int = 10) -> List[NewsItem]:
349
- """็ˆฌๅ–ๆŒ‡ๅฎšๅˆ†้กž็š„ๆ–ฐ่ž - ๅณๆ™‚ๅˆ†ๆž็‰ˆ"""
350
  if category not in self.categories:
351
  logger.error(f"無效的分類: {category}")
352
  return []
353
 
354
- self._notify_progress(f"🚀 開始爬取 {category} 分類新聞")
 
355
 
356
  # ็ฒๅ–ๆ–‡็ซ URLๅˆ—่กจ
357
  category_url = self.categories[category]
358
  article_urls = self._extract_article_urls(category_url, max_pages=2)
359
 
360
  if not article_urls:
361
- self._notify_progress(f"⚠️ 未找到 {category} 分類的文章URL")
362
  return []
363
 
364
  # ้™ๅˆถๆ–‡็ซ ๆ•ธ้‡
@@ -369,13 +374,13 @@ class CnYesNewsCrawler:
369
  articles = []
370
  for i, url in enumerate(article_urls, 1):
371
  try:
372
- self._notify_progress(f"📖 處理文章 {i}/{len(article_urls)}: 正在提取內容...")
373
  article = self._extract_article_content(url, category)
374
 
375
  if article:
376
  # ๅณๆ™‚ๆƒ…ๆ„Ÿๅˆ†ๆž
377
  if self.sentiment_analyzer:
378
- self._notify_progress(f"🧠 分析文章 {i}/{len(article_urls)}: {article.title[:30]}...")
379
  sentiment_result = self.sentiment_analyzer.analyze_sentiment(
380
  article.content, article.title
381
  )
@@ -400,11 +405,11 @@ class CnYesNewsCrawler:
400
 
401
  inserted, _ = self.database.insert_news([db_article])
402
  if inserted > 0:
403
- self._notify_progress(f"💾 已保存文章: {article.title[:30]}... (情緒: {article.sentiment})")
404
  else:
405
- self._notify_progress(f"โญ๏ธ ่ทณ้Ž้‡่ค‡ๆ–‡็ซ : {article.title[:30]}...")
406
  else:
407
- self._notify_progress(f"โญ๏ธ ่ทณ้Ž้‡่ค‡ๆ–‡็ซ : {article.title[:30]}...")
408
 
409
  articles.append(article)
410
 
@@ -414,19 +419,20 @@ class CnYesNewsCrawler:
414
 
415
  except Exception as e:
416
  logger.error(f"่™•็†ๆ–‡็ซ ๆ™‚็™ผ็”Ÿ้Œฏ่ชค {url}: {e}")
417
- self._notify_progress(f"โŒ ่™•็†ๆ–‡็ซ ๆ™‚็™ผ็”Ÿ้Œฏ่ชค: {str(e)[:50]}...")
418
  continue
419
 
420
- self._notify_progress(f"✅ {category} 分類爬取完成，共處理 {len(articles)} 篇文章")
421
  return articles
422
 
423
  def crawl_all_categories(self, max_articles_per_category: int = 8) -> Dict[str, List[NewsItem]]:
424
- """็ˆฌๅ–ๆ‰€ๆœ‰ๅˆ†้กž็š„ๆ–ฐ่ž - ๅณๆ™‚ๅˆ†ๆž็‰ˆ"""
425
  results = {}
426
 
427
  for category in self.categories.keys():
428
  try:
429
- self._notify_progress(f"🎯 開始爬取 {category} 分類")
 
430
  articles = self.crawl_category(category, max_articles_per_category)
431
  results[category] = articles
432
 
 
27
  sentiment_score: Optional[float] = None
28
 
29
  class CnYesNewsCrawler:
30
+ """鉅亨網新聞爬蟲 - 修正URL版"""
31
 
32
  def __init__(self, sentiment_analyzer=None, database=None):
33
  self.base_url = "https://news.cnyes.com"
 
44
  self.sentiment_analyzer = sentiment_analyzer
45
  self.database = database
46
 
47
+ # 修正後的新聞分類URL
48
  self.categories = {
49
+ 'us_stock': 'https://news.cnyes.com/news/cat/us_stock', # 美股
50
+ 'tw_stock': 'https://news.cnyes.com/news/cat/tw_stock_news' # 台股
51
  }
52
 
53
  # 進度回調函數
 
56
  # 設置請求頭
57
  self._setup_headers()
58
 
59
+ logger.info("爬蟲初始化完成")
60
+ logger.info(f"美股URL: {self.categories['us_stock']}")
61
+ logger.info(f"ๅฐ่‚กURL: {self.categories['tw_stock']}")
62
+
63
  def set_progress_callback(self, callback: Callable[[str], None]):
64
  """設置進度回調函數"""
65
  self.progress_callback = callback
 
350
  return None
351
 
352
  def crawl_category(self, category: str, max_articles: int = 10) -> List[NewsItem]:
353
+ """็ˆฌๅ–ๆŒ‡ๅฎšๅˆ†้กž็š„ๆ–ฐ่ž"""
354
  if category not in self.categories:
355
  logger.error(f"無效的分類: {category}")
356
  return []
357
 
358
+ category_name = "美股" if category == "us_stock" else "台股"
359
+ self._notify_progress(f"🚀 開始爬取 {category_name} 分類新聞")
360
 
361
  # ็ฒๅ–ๆ–‡็ซ URLๅˆ—่กจ
362
  category_url = self.categories[category]
363
  article_urls = self._extract_article_urls(category_url, max_pages=2)
364
 
365
  if not article_urls:
366
+ self._notify_progress(f"⚠️ 未找到 {category_name} 分類的文章URL")
367
  return []
368
 
369
  # ้™ๅˆถๆ–‡็ซ ๆ•ธ้‡
 
374
  articles = []
375
  for i, url in enumerate(article_urls, 1):
376
  try:
377
+ self._notify_progress(f"📖 處理 {category_name} 文章 {i}/{len(article_urls)}: 正在提取內容...")
378
  article = self._extract_article_content(url, category)
379
 
380
  if article:
381
  # ๅณๆ™‚ๆƒ…ๆ„Ÿๅˆ†ๆž
382
  if self.sentiment_analyzer:
383
+ self._notify_progress(f"🧠 分析 {category_name} 文章 {i}/{len(article_urls)}: {article.title[:30]}...")
384
  sentiment_result = self.sentiment_analyzer.analyze_sentiment(
385
  article.content, article.title
386
  )
 
405
 
406
  inserted, _ = self.database.insert_news([db_article])
407
  if inserted > 0:
408
+ self._notify_progress(f"💾 已保存 {category_name} 文章: {article.title[:30]}... (情緒: {article.sentiment})")
409
  else:
410
+ self._notify_progress(f"โญ๏ธ ่ทณ้Ž้‡่ค‡ {category_name} ๆ–‡็ซ : {article.title[:30]}...")
411
  else:
412
+ self._notify_progress(f"โญ๏ธ ่ทณ้Ž้‡่ค‡ {category_name} ๆ–‡็ซ : {article.title[:30]}...")
413
 
414
  articles.append(article)
415
 
 
419
 
420
  except Exception as e:
421
  logger.error(f"่™•็†ๆ–‡็ซ ๆ™‚็™ผ็”Ÿ้Œฏ่ชค {url}: {e}")
422
+ self._notify_progress(f"โŒ ่™•็† {category_name} ๆ–‡็ซ ๆ™‚็™ผ็”Ÿ้Œฏ่ชค: {str(e)[:50]}...")
423
  continue
424
 
425
+ self._notify_progress(f"✅ {category_name} 分類爬取完成，共處理 {len(articles)} 篇文章")
426
  return articles
427
 
428
  def crawl_all_categories(self, max_articles_per_category: int = 8) -> Dict[str, List[NewsItem]]:
429
+ """็ˆฌๅ–ๆ‰€ๆœ‰ๅˆ†้กž็š„ๆ–ฐ่ž"""
430
  results = {}
431
 
432
  for category in self.categories.keys():
433
  try:
434
+ category_name = "美股" if category == "us_stock" else "台股"
435
+ self._notify_progress(f"🎯 開始爬取 {category_name} 分類")
436
  articles = self.crawl_category(category, max_articles_per_category)
437
  results[category] = articles
438