khjhs60199 committed on
Commit 7645752 · verified · 1 Parent(s): 0c2a6b1

Update crawler.py

Files changed (1)
  1. crawler.py +89 -78
crawler.py CHANGED

@@ -6,7 +6,7 @@ import random
 import logging
 import re
 from datetime import datetime, timedelta
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Callable
 from urllib.parse import urljoin, urlparse
 from fake_useragent import UserAgent
 import json
@@ -27,9 +27,9 @@ class NewsItem:
     sentiment_score: Optional[float] = None

 class CnYesNewsCrawler:
-    """鉅亨網 news crawler - improved version"""
+    """鉅亨網 news crawler - real-time analysis version"""

-    def __init__(self):
+    def __init__(self, sentiment_analyzer=None, database=None):
         self.base_url = "https://news.cnyes.com"
         self.session = cloudscraper.create_scraper(
             browser={
@@ -40,15 +40,32 @@ class CnYesNewsCrawler:
         )
         self.ua = UserAgent()

+        # Injected dependencies
+        self.sentiment_analyzer = sentiment_analyzer
+        self.database = database
+
         # News category URLs
         self.categories = {
             'us_stock': 'https://news.cnyes.com/news/cat/us_stock',
             'tw_stock': 'https://news.cnyes.com/news/cat/tw_stock_news'
         }

+        # Progress callback
+        self.progress_callback = None
+
         # Set up request headers
         self._setup_headers()

+    def set_progress_callback(self, callback: Callable[[str], None]):
+        """Set the progress callback function"""
+        self.progress_callback = callback
+
+    def _notify_progress(self, message: str):
+        """Report a progress update"""
+        if self.progress_callback:
+            self.progress_callback(message)
+        logger.info(message)
+
     def _setup_headers(self):
         """Set more realistic request headers"""
         self.session.headers.update({
@@ -70,13 +87,11 @@
         })

     def _get_page(self, url: str, retries: int = 3) -> Optional[BeautifulSoup]:
-        """Fetch page content - improved version"""
+        """Fetch page content"""
         for attempt in range(retries):
             try:
-                # Longer random delay to mimic human behaviour
-                time.sleep(random.uniform(8, 15))
+                time.sleep(random.uniform(3, 8))

-                # Rotate the User-Agent
                 user_agents = [
                     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
@@ -99,12 +114,12 @@
             except Exception as e:
                 logger.error(f"Request failed (attempt {attempt + 1}/{retries}): {e}")
                 if attempt < retries - 1:
-                    time.sleep(random.uniform(15, 30))
+                    time.sleep(random.uniform(5, 15))

         return None

     def _extract_article_urls(self, category_url: str, max_pages: int = 2) -> List[str]:
-        """Extract article URLs from a category page - improved version"""
+        """Extract article URLs from a category page"""
         article_urls = []

         for page in range(1, max_pages + 1):
@@ -112,16 +127,14 @@
                 if page == 1:
                     url = category_url
                 else:
-                    # Fix the pagination URL format
                     url = f"{category_url}?page={page}"

-                logger.info(f"Crawling category page {page}: {url}")
+                self._notify_progress(f"🔍 Crawling category page {page}: {url}")
                 soup = self._get_page(url)

                 if not soup:
                     continue

-                # Improved selectors, targeting cnyes.com's actual structure
                 link_selectors = [
                     'a[href*="/news/id/"]',
                     '.news-list a[href*="/news/id/"]',
@@ -146,36 +159,31 @@
                             page_urls.append(full_url)

                 article_urls.extend(page_urls)
-                logger.info(f"Found {len(page_urls)} articles on page {page}")
+                self._notify_progress(f"📄 Found {len(page_urls)} articles on page {page}")

                 if not page_urls:
-                    logger.warning(f"No articles found on page {page}; may have hit anti-crawler measures")
+                    logger.warning(f"No articles found on page {page}")
                     break

-                # Longer delay between pages
                 if page < max_pages:
-                    time.sleep(random.uniform(20, 40))
+                    time.sleep(random.uniform(8, 15))

             except Exception as e:
                 logger.error(f"Error while crawling page {page}: {e}")
                 continue

-        # Deduplicate and cap the count
         unique_urls = list(set(article_urls))
-        logger.info(f"Found {len(unique_urls)} unique articles in total")
+        self._notify_progress(f"🎯 Found {len(unique_urls)} unique articles in total")
         return unique_urls

     def _extract_article_content(self, url: str, category: str) -> Optional[NewsItem]:
-        """Extract full article content - improved version"""
+        """Extract full article content"""
         try:
             soup = self._get_page(url)
             if not soup:
                 return None

-            # Debug: print part of the page structure
-            logger.info(f"Page heading tags: {[tag.name for tag in soup.find_all(['h1', 'h2', 'h3'])]}")
-
-            # Improved title selectors
+            # Extract the title
             title_selectors = [
                 'h1[class*="title"]',
                 'h1.news-title',
@@ -195,22 +203,18 @@
                 if title_elem:
                     title = title_elem.get_text(strip=True)
                     if title and len(title) > 10:
-                        logger.info(f"Selector '{selector}' found title: {title[:50]}...")
                         break

             if not title:
-                logger.warning(f"Could not extract title: {url}")
-                # Fall back to the page title
                 page_title = soup.find('title')
                 if page_title:
                     title = page_title.get_text(strip=True).split(' | ')[0]
-                    logger.info(f"Took title from page title: {title[:50]}...")

             if not title or len(title) < 5:
                 logger.warning(f"Title too short or missing: {url}")
                 return None

-            # Improved content selectors
+            # Extract the content
             content_selectors = [
                 '.article-content',
                 '.news-content',
@@ -228,11 +232,9 @@
             for selector in content_selectors:
                 content_container = soup.select_one(selector)
                 if content_container:
-                    # Remove unwanted elements
                     for unwanted in content_container.select('script, style, .ad, .advertisement, .related, .share, .comment'):
                         unwanted.decompose()

-                    # Extract text paragraphs
                     paragraphs = content_container.find_all(['p', 'div'], string=True)
                     content_parts = []

@@ -243,33 +245,10 @@

                     content = '\n'.join(content_parts)
                     if len(content) > 100:
-                        logger.info(f"Selector '{selector}' found content, length: {len(content)}")
                         break

-            # If there is still no content, try grabbing all of the text
-            if not content or len(content) < 100:
-                logger.warning(f"Standard extraction failed, trying fallback: {url}")
-
-                # Remove unwanted tags
-                for unwanted in soup.select('script, style, nav, header, footer, .menu, .sidebar, .ad'):
-                    unwanted.decompose()
-
-                # Find the elements with the most text
-                all_text_elements = soup.find_all(['p', 'div'], string=True)
-                text_blocks = []
-
-                for elem in all_text_elements:
-                    text = elem.get_text(strip=True)
-                    if len(text) > 50:
-                        text_blocks.append(text)
-
-                if text_blocks:
-                    content = '\n'.join(text_blocks[:10])  # Keep the first 10 blocks
-                    logger.info(f"Fallback found content, length: {len(content)}")
-
             if not content or len(content) < 50:
-                logger.warning(f"Content too short or missing: {url}, content length: {len(content)}")
-                logger.debug(f"HTML structure preview: {str(soup)[:500]}...")
+                logger.warning(f"Content too short or missing: {url}")
                 return None

             # Extract the publish date
@@ -281,14 +260,13 @@
             # Build the news item
             news_item = NewsItem(
                 title=title,
-                content=content[:2000],  # Cap the content length
+                content=content[:2000],
                 url=url,
                 source='鉅亨網',
                 category=category,
                 published_date=published_date
             )

-            logger.info(f"Extracted article: {title[:50]}... (content length: {len(content)})")
             return news_item

         except Exception as e:
@@ -297,13 +275,9 @@

     def _clean_content(self, content: str) -> str:
         """Clean up content"""
-        # Remove excess whitespace
         content = re.sub(r'\s+', ' ', content)
-
-        # Remove special characters
         content = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?:;「」『』]', '', content)

-        # Remove duplicate sentences
         sentences = content.split('。')
         unique_sentences = []
         for sentence in sentences:
@@ -313,7 +287,7 @@
         return '。'.join(unique_sentences)

     def _extract_publish_date(self, soup: BeautifulSoup) -> datetime:
-        """Extract the publish date - improved version"""
+        """Extract the publish date"""
         time_selectors = [
             'time[datetime]',
             '.publish-time',
@@ -328,7 +302,6 @@
         for selector in time_selectors:
             time_elem = soup.select_one(selector)
             if time_elem:
-                # Check the datetime attribute
                 datetime_attr = time_elem.get('datetime') or time_elem.get('content')
                 if datetime_attr:
                     try:
@@ -336,7 +309,6 @@
                     except:
                         pass

-                # Check the text content
                 time_text = time_elem.get_text(strip=True)
                 parsed_time = self._parse_time_text(time_text)
                 if parsed_time:
@@ -345,7 +317,7 @@
         return datetime.now()

     def _parse_time_text(self, time_text: str) -> Optional[datetime]:
-        """Parse a time string - improved version"""
+        """Parse a time string"""
         patterns = [
             r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
             r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2})',
@@ -374,62 +346,101 @@
         return None

     def crawl_category(self, category: str, max_articles: int = 10) -> List[NewsItem]:
-        """Crawl news in the given category - reduced volume to avoid bans"""
+        """Crawl news in the given category - real-time analysis version"""
         if category not in self.categories:
             logger.error(f"Invalid category: {category}")
             return []

-        logger.info(f"Starting to crawl the {category} category")
+        self._notify_progress(f"🚀 Starting to crawl the {category} category")

         # Get the list of article URLs
         category_url = self.categories[category]
         article_urls = self._extract_article_urls(category_url, max_pages=2)

         if not article_urls:
-            logger.warning(f"No article URLs found for the {category} category")
+            self._notify_progress(f"⚠️ No article URLs found for the {category} category")
             return []

-        # Cap the article count to avoid bans
+        # Cap the article count
         if len(article_urls) > max_articles:
             article_urls = article_urls[:max_articles]

-        # Extract article content
+        # Extract article content, analyzing and persisting as we go
         articles = []
         for i, url in enumerate(article_urls, 1):
             try:
-                logger.info(f"Processing article {i}/{len(article_urls)}: {url}")
+                self._notify_progress(f"📖 Processing article {i}/{len(article_urls)}: extracting content...")
                 article = self._extract_article_content(url, category)
+
                 if article:
+                    # Real-time sentiment analysis
+                    if self.sentiment_analyzer:
+                        self._notify_progress(f"🧠 Analyzing article {i}/{len(article_urls)}: {article.title[:30]}...")
+                        sentiment_result = self.sentiment_analyzer.analyze_sentiment(
+                            article.content, article.title
+                        )
+                        article.sentiment = sentiment_result['sentiment']
+                        article.sentiment_score = sentiment_result['confidence']
+
+                    # Persist immediately
+                    if self.database:
+                        # Check for duplicates
+                        if not self.database.check_duplicate_by_title(article.title):
+                            db_article = {
+                                'title': article.title,
+                                'content': article.content,
+                                'url': article.url,
+                                'source': article.source,
+                                'category': article.category,
+                                'published_date': article.published_date.isoformat(),
+                                'sentiment': article.sentiment,
+                                'sentiment_score': article.sentiment_score,
+                                'sentiment_method': 'auto'
+                            }
+
+                            inserted, _ = self.database.insert_news([db_article])
+                            if inserted > 0:
+                                self._notify_progress(f"💾 Saved article: {article.title[:30]}... (sentiment: {article.sentiment})")
+                            else:
+                                self._notify_progress(f"⏭️ Skipped duplicate article: {article.title[:30]}...")
+                        else:
+                            self._notify_progress(f"⏭️ Skipped duplicate article: {article.title[:30]}...")
+
                     articles.append(article)

-                # Longer random delay to mimic human reading
-                time.sleep(random.uniform(15, 30))
+                # Delay between articles
+                if i < len(article_urls):
+                    time.sleep(random.uniform(5, 10))

             except Exception as e:
                 logger.error(f"Error while processing article {url}: {e}")
+                self._notify_progress(f"❌ Error while processing article: {str(e)[:50]}...")
                 continue

-        logger.info(f"Finished crawling the {category} category: {len(articles)} articles")
+        self._notify_progress(f"✅ Finished crawling the {category} category: processed {len(articles)} articles")
         return articles

     def crawl_all_categories(self, max_articles_per_category: int = 8) -> Dict[str, List[NewsItem]]:
-        """Crawl all categories - reduced volume"""
+        """Crawl all categories - real-time analysis version"""
         results = {}

         for category in self.categories.keys():
             try:
-                logger.info(f"Starting to crawl the {category} category")
+                self._notify_progress(f"🎯 Starting to crawl the {category} category")
                 articles = self.crawl_category(category, max_articles_per_category)
                 results[category] = articles

-                # Longer delay between categories
-                time.sleep(random.uniform(60, 120))
+                # Delay between categories
+                if len(self.categories) > 1:
+                    self._notify_progress(f"⏸️ Pausing between categories...")
+                    time.sleep(random.uniform(30, 60))

             except Exception as e:
                 logger.error(f"Error while crawling the {category} category: {e}")
+                self._notify_progress(f"❌ Error while crawling the {category} category: {str(e)}")
                 results[category] = []

         total_articles = sum(len(articles) for articles in results.values())
-        logger.info(f"All categories crawled: {total_articles} articles in total")
+        self._notify_progress(f"🎉 All categories crawled: processed {total_articles} articles in total")

         return results
 
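
Taken together, the new __init__, set_progress_callback, and the per-article branch in crawl_category imply a small duck-typed contract for the injected collaborators: analyze_sentiment(content, title) returns a dict with 'sentiment' and 'confidence' keys, check_duplicate_by_title(title) returns a bool, and insert_news(list) returns a tuple whose first element is the inserted count. A minimal wiring sketch under those assumptions (the stub class names are illustrative, not part of this repo):

# Sketch only: stubs satisfying the interface this commit calls into.
from crawler import CnYesNewsCrawler  # module name taken from this commit's filename

class StubSentimentAnalyzer:
    def analyze_sentiment(self, content: str, title: str) -> dict:
        # crawl_category reads exactly these two keys
        return {'sentiment': 'neutral', 'confidence': 0.5}

class InMemoryDatabase:
    def __init__(self):
        self.rows = []

    def check_duplicate_by_title(self, title: str) -> bool:
        return any(row['title'] == title for row in self.rows)

    def insert_news(self, articles: list) -> tuple:
        self.rows.extend(articles)
        return len(articles), 0  # shape inferred from `inserted, _ = ...`

crawler = CnYesNewsCrawler(sentiment_analyzer=StubSentimentAnalyzer(),
                           database=InMemoryDatabase())
crawler.set_progress_callback(print)  # progress lines go straight to stdout
results = crawler.crawl_all_categories(max_articles_per_category=2)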
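
Since _notify_progress always logs and only optionally invokes the callback, a UI can consume progress without touching the logging setup. One possible pattern (an assumption, not something in this commit) is to hand the crawler a queue-backed callback and drain it from the displaying thread:

# Sketch: queue-backed progress consumption; everything here is illustrative.
import queue
import threading

progress_queue = queue.Queue()

def run_crawl(crawler):
    crawler.set_progress_callback(progress_queue.put)  # a Callable[[str], None]
    try:
        crawler.crawl_all_categories(max_articles_per_category=2)
    finally:
        progress_queue.put(None)  # sentinel: crawl finished

def consume_progress():
    while True:
        message = progress_queue.get()
        if message is None:
            break
        print(f"[progress] {message}")  # a real UI would render this line

# threading.Thread(target=run_crawl, args=(crawler,), daemon=True).start()
# consume_progress()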
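
The check-then-insert flow maps naturally onto a table with a unique title column. As a sketch of what the injected database might look like (assumed schema and class name; the project's actual implementation may differ), insert_news here returns the same (inserted, skipped) shape the crawler unpacks:

# Sketch: a possible SQLite backing for the two calls the crawler makes.
import sqlite3

class SqliteNewsDatabase:
    def __init__(self, path: str = "news.db"):
        self.conn = sqlite3.connect(path)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS news ("
            " title TEXT UNIQUE, content TEXT, url TEXT, source TEXT,"
            " category TEXT, published_date TEXT,"
            " sentiment TEXT, sentiment_score REAL, sentiment_method TEXT)"
        )

    def check_duplicate_by_title(self, title: str) -> bool:
        cur = self.conn.execute("SELECT 1 FROM news WHERE title = ?", (title,))
        return cur.fetchone() is not None

    def insert_news(self, articles: list) -> tuple:
        inserted = 0
        for a in articles:
            try:
                self.conn.execute(
                    "INSERT INTO news VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    (a['title'], a['content'], a['url'], a['source'],
                     a['category'], a['published_date'], a['sentiment'],
                     a['sentiment_score'], a['sentiment_method']),
                )
                inserted += 1
            except sqlite3.IntegrityError:
                pass  # duplicate title, mirrors the crawler's skip branch
        self.conn.commit()
        return inserted, len(articles) - inserted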