khjhs60199 committed
Commit bcdcc05 · verified · 1 Parent(s): 15e2a05

fix crawl fail issue1

Files changed (1)
crawler.py +168 -50
crawler.py CHANGED
@@ -27,11 +27,17 @@ class NewsItem:
    sentiment_score: Optional[float] = None

class CnYesNewsCrawler:
-    """鉅亨網新聞爬蟲"""
+    """鉅亨網新聞爬蟲 - 改進版"""

    def __init__(self):
        self.base_url = "https://news.cnyes.com"
-        self.session = cloudscraper.create_scraper()
+        self.session = cloudscraper.create_scraper(
+            browser={
+                'browser': 'chrome',
+                'platform': 'windows',
+                'mobile': False
+            }
+        )
        self.ua = UserAgent()

        # 新聞分類URL
@@ -44,10 +50,10 @@ class CnYesNewsCrawler:
        self._setup_headers()

    def _setup_headers(self):
-        """設置隨機請求頭"""
+        """設置更真實的請求頭"""
        self.session.headers.update({
-            'User-Agent': self.ua.random,
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
@@ -56,36 +62,49 @@ class CnYesNewsCrawler:
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
-            'Cache-Control': 'max-age=0'
+            'Sec-Fetch-User': '?1',
+            'Cache-Control': 'max-age=0',
+            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"Windows"'
        })

    def _get_page(self, url: str, retries: int = 3) -> Optional[BeautifulSoup]:
-        """獲取網頁內容"""
+        """獲取網頁內容 - 改進版"""
        for attempt in range(retries):
            try:
-                # 隨機延遲
-                time.sleep(random.uniform(2, 5))
+                # 更長的隨機延遲,模擬人類行為
+                time.sleep(random.uniform(8, 15))

                # 輪換 User-Agent
-                self.session.headers['User-Agent'] = self.ua.random
+                user_agents = [
+                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
+                ]
+                self.session.headers['User-Agent'] = random.choice(user_agents)

+                logger.info(f"正在請求: {url}")
                response = self.session.get(url, timeout=30)

                if response.status_code == 200:
                    response.encoding = 'utf-8'
-                    return BeautifulSoup(response.content, 'html.parser')
+                    soup = BeautifulSoup(response.content, 'html.parser')
+                    logger.info(f"成功獲取網頁: {url}")
+                    return soup
                else:
                    logger.warning(f"HTTP {response.status_code} for {url}")

            except Exception as e:
                logger.error(f"請求失敗 (嘗試 {attempt + 1}/{retries}): {e}")
                if attempt < retries - 1:
-                    time.sleep(random.uniform(5, 10))
+                    time.sleep(random.uniform(15, 30))

        return None

-    def _extract_article_urls(self, category_url: str, max_pages: int = 3) -> List[str]:
-        """從分類頁面提取文章URL"""
+    def _extract_article_urls(self, category_url: str, max_pages: int = 2) -> List[str]:
+        """從分類頁面提取文章URL - 改進版"""
        article_urls = []

        for page in range(1, max_pages + 1):
@@ -93,21 +112,35 @@ class CnYesNewsCrawler:
                if page == 1:
                    url = category_url
                else:
+                    # 修正分頁URL格式
                    url = f"{category_url}?page={page}"

-                logger.info(f"爬取分類頁面: {url}")
+                logger.info(f"爬取分類頁面 {page}: {url}")
                soup = self._get_page(url)

                if not soup:
                    continue

-                # 尋找文章連結
-                links = soup.find_all('a', href=re.compile(r'/news/id/\d+'))
+                # 改進的選擇器,針對鉅亨網的實際結構
+                link_selectors = [
+                    'a[href*="/news/id/"]',
+                    '.news-list a[href*="/news/id/"]',
+                    '.list-item a[href*="/news/id/"]',
+                    '.news-item a[href*="/news/id/"]',
+                    'h3 a[href*="/news/id/"]',
+                    '.title a[href*="/news/id/"]'
+                ]
+
                page_urls = []
+                for selector in link_selectors:
+                    links = soup.select(selector)
+                    if links:
+                        logger.info(f"使用選擇器 '{selector}' 找到 {len(links)} 個連結")
+                        break

                for link in links:
                    href = link.get('href')
-                    if href:
+                    if href and '/news/id/' in href:
                        full_url = urljoin(self.base_url, href)
                        if full_url not in page_urls:
                            page_urls.append(full_url)
@@ -116,27 +149,44 @@ class CnYesNewsCrawler:
                logger.info(f"第 {page} 頁找到 {len(page_urls)} 篇文章")

                if not page_urls:
+                    logger.warning(f"第 {page} 頁沒有找到文章,可能遇到反爬蟲機制")
                    break

+                # 頁面間更長延遲
+                if page < max_pages:
+                    time.sleep(random.uniform(20, 40))
+
            except Exception as e:
                logger.error(f"爬取第 {page} 頁時發生錯誤: {e}")
                continue

-        return list(set(article_urls))  # 去重
+        # 去重並限制數量
+        unique_urls = list(set(article_urls))
+        logger.info(f"總共找到 {len(unique_urls)} 篇獨特文章")
+        return unique_urls

    def _extract_article_content(self, url: str, category: str) -> Optional[NewsItem]:
-        """提取文章詳細內容"""
+        """提取文章詳細內容 - 改進版"""
        try:
            soup = self._get_page(url)
            if not soup:
                return None

-            # 提取標題
+            # 調試:打印網頁結構的一部分
+            logger.info(f"網頁標題標籤: {[tag.name for tag in soup.find_all(['h1', 'h2', 'h3'])]}")
+
+            # 改進的標題選擇器
            title_selectors = [
-                'h1.news-title',
                'h1[class*="title"]',
+                'h1.news-title',
+                'h1.article-title',
                '.article-header h1',
-                'h1'
+                '.news-header h1',
+                '.content-header h1',
+                'h1',
+                'h2[class*="title"]',
+                '.title h1',
+                '.title h2'
            ]

            title = ""
@@ -144,47 +194,90 @@ class CnYesNewsCrawler:
                title_elem = soup.select_one(selector)
                if title_elem:
                    title = title_elem.get_text(strip=True)
-                    if title and len(title) > 5:
+                    if title and len(title) > 10:
+                        logger.info(f"使用選擇器 '{selector}' 找到標題: {title[:50]}...")
                        break

            if not title:
                logger.warning(f"無法提取標題: {url}")
+                # 嘗試從頁面標題獲取
+                page_title = soup.find('title')
+                if page_title:
+                    title = page_title.get_text(strip=True).split(' | ')[0]
+                    logger.info(f"從頁面標題獲取: {title[:50]}...")
+
+            if not title or len(title) < 5:
+                logger.warning(f"標題太短或無法提取: {url}")
                return None

-            # 提取內容
+            # 改進的內容選擇器
            content_selectors = [
-                '.news-content',
                '.article-content',
+                '.news-content',
                '.content-body',
-                '[class*="article-text"]'
+                '.article-body',
+                '.news-body',
+                '.post-content',
+                '[class*="article-text"]',
+                '[class*="content"]',
+                '.article p',
+                '.content p'
            ]

            content = ""
            for selector in content_selectors:
-                content_elem = soup.select_one(selector)
-                if content_elem:
+                content_container = soup.select_one(selector)
+                if content_container:
                    # 移除不需要的元素
-                    for unwanted in content_elem.select('script, style, .ad, .advertisement'):
+                    for unwanted in content_container.select('script, style, .ad, .advertisement, .related, .share, .comment'):
                        unwanted.decompose()

-                    paragraphs = content_elem.find_all(['p', 'div'])
+                    # 提取文本段落
+                    paragraphs = content_container.find_all(['p', 'div'], string=True)
                    content_parts = []
+
                    for p in paragraphs:
                        text = p.get_text(strip=True)
-                        if text and len(text) > 10:
+                        if text and len(text) > 20 and not any(skip in text.lower() for skip in ['廣告', 'ad', 'advertisement', '分享', 'share']):
                            content_parts.append(text)

                    content = '\n'.join(content_parts)
-                    if content:
+                    if len(content) > 100:
+                        logger.info(f"使用選擇器 '{selector}' 找到內容,長度: {len(content)}")
                        break

+            # 如果還是沒有內容,嘗試獲取所有文本
+            if not content or len(content) < 100:
+                logger.warning(f"常規方法無法提取內容,嘗試備用方法: {url}")
+
+                # 移除不需要的標籤
+                for unwanted in soup.select('script, style, nav, header, footer, .menu, .sidebar, .ad'):
+                    unwanted.decompose()
+
+                # 尋找包含最多文本的元素
+                all_text_elements = soup.find_all(['p', 'div'], string=True)
+                text_blocks = []
+
+                for elem in all_text_elements:
+                    text = elem.get_text(strip=True)
+                    if len(text) > 50:
+                        text_blocks.append(text)
+
+                if text_blocks:
+                    content = '\n'.join(text_blocks[:10])  # 取前10段
+                    logger.info(f"備用方法找到內容,長度: {len(content)}")
+
            if not content or len(content) < 50:
-                logger.warning(f"內容太短或無法提取: {url}")
+                logger.warning(f"內容太短或無法提取: {url}, 內容長度: {len(content)}")
+                logger.debug(f"網頁HTML結構預覽: {str(soup)[:500]}...")
                return None

            # 提取發布時間
            published_date = self._extract_publish_date(soup)

+            # 清理內容
+            content = self._clean_content(content)
+
            # 創建新聞項目
            news_item = NewsItem(
                title=title,
@@ -195,32 +288,55 @@ class CnYesNewsCrawler:
                published_date=published_date
            )

-            logger.info(f"成功提取文章: {title[:50]}...")
+            logger.info(f"成功提取文章: {title[:50]}... (內容長度: {len(content)})")
            return news_item

        except Exception as e:
            logger.error(f"提取文章內容時發生錯誤 {url}: {e}")
            return None

+    def _clean_content(self, content: str) -> str:
+        """清理內容"""
+        # 移除多餘空白
+        content = re.sub(r'\s+', ' ', content)
+
+        # 移除特殊字符
+        content = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?:;「」『』]', '', content)
+
+        # 移除重複句子
+        sentences = content.split('。')
+        unique_sentences = []
+        for sentence in sentences:
+            if sentence.strip() and sentence.strip() not in unique_sentences:
+                unique_sentences.append(sentence.strip())
+
+        return '。'.join(unique_sentences)
+
    def _extract_publish_date(self, soup: BeautifulSoup) -> datetime:
-        """提取發布時間"""
+        """提取發布時間 - 改進版"""
        time_selectors = [
            'time[datetime]',
            '.publish-time',
            '.news-time',
-            '[class*="time"]'
+            '.article-time',
+            '[class*="time"]',
+            '[class*="date"]',
+            'meta[property="article:published_time"]',
+            'meta[name="pubdate"]'
        ]

        for selector in time_selectors:
            time_elem = soup.select_one(selector)
            if time_elem:
-                datetime_attr = time_elem.get('datetime')
+                # 檢查datetime屬性
+                datetime_attr = time_elem.get('datetime') or time_elem.get('content')
                if datetime_attr:
                    try:
                        return datetime.fromisoformat(datetime_attr.replace('Z', '+00:00')).replace(tzinfo=None)
                    except:
                        pass

+                # 檢查文本內容
                time_text = time_elem.get_text(strip=True)
                parsed_time = self._parse_time_text(time_text)
                if parsed_time:
@@ -229,12 +345,14 @@ class CnYesNewsCrawler:
        return datetime.now()

    def _parse_time_text(self, time_text: str) -> Optional[datetime]:
-        """解析時間文字"""
+        """解析時間文字 - 改進版"""
        patterns = [
            r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
            r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2})',
            r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
-            r'(\d{4})-(\d{2})-(\d{2})'
+            r'(\d{4})-(\d{2})-(\d{2})',
+            r'(\d{4})年(\d{1,2})月(\d{1,2})日\s*(\d{1,2}):(\d{2})',
+            r'(\d{4})年(\d{1,2})月(\d{1,2})日'
        ]

        for pattern in patterns:
@@ -255,8 +373,8 @@ class CnYesNewsCrawler:

        return None

-    def crawl_category(self, category: str, max_articles: int = 20) -> List[NewsItem]:
-        """爬取指定分類的新聞"""
+    def crawl_category(self, category: str, max_articles: int = 10) -> List[NewsItem]:
+        """爬取指定分類的新聞 - 減少數量避免被封"""
        if category not in self.categories:
            logger.error(f"無效的分類: {category}")
            return []
@@ -265,13 +383,13 @@ class CnYesNewsCrawler:

        # 獲取文章URL列表
        category_url = self.categories[category]
-        article_urls = self._extract_article_urls(category_url)
+        article_urls = self._extract_article_urls(category_url, max_pages=2)

        if not article_urls:
            logger.warning(f"未找到 {category} 分類的文章URL")
            return []

-        # 限制文章數量
+        # 限制文章數量,避免被封
        if len(article_urls) > max_articles:
            article_urls = article_urls[:max_articles]

@@ -284,8 +402,8 @@ class CnYesNewsCrawler:
                if article:
                    articles.append(article)

-                # 隨機延遲
-                time.sleep(random.uniform(3, 8))
+                # 更長的隨機延遲,模擬人類閱讀
+                time.sleep(random.uniform(15, 30))

            except Exception as e:
                logger.error(f"處理文章時發生錯誤 {url}: {e}")
@@ -294,8 +412,8 @@ class CnYesNewsCrawler:
        logger.info(f"{category} 分類爬取完成,共 {len(articles)} 篇文章")
        return articles

-    def crawl_all_categories(self, max_articles_per_category: int = 15) -> Dict[str, List[NewsItem]]:
-        """爬取所有分類的新聞"""
+    def crawl_all_categories(self, max_articles_per_category: int = 8) -> Dict[str, List[NewsItem]]:
+        """爬取所有分類的新聞 - 減少數量"""
        results = {}

        for category in self.categories.keys():
@@ -304,8 +422,8 @@ class CnYesNewsCrawler:
                articles = self.crawl_category(category, max_articles_per_category)
                results[category] = articles

-                # 分類間延遲
-                time.sleep(random.uniform(10, 20))
+                # 分類間更長延遲
+                time.sleep(random.uniform(60, 120))

            except Exception as e:
                logger.error(f"爬取 {category} 分類時發生錯誤: {e}")
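For reference, a minimal usage sketch of the updated crawler. This is illustrative only and not part of commit bcdcc05: it assumes crawler.py exposes CnYesNewsCrawler and NewsItem as shown in the diff above, and the category key used here is a placeholder for whatever keys self.categories actually defines (that dict lies outside the changed hunks).

# Hypothetical usage, not included in this commit.
from crawler import CnYesNewsCrawler

if __name__ == "__main__":
    crawler = CnYesNewsCrawler()
    # "tw_stock" is a placeholder; pass a key that exists in crawler.categories.
    articles = crawler.crawl_category("tw_stock", max_articles=3)
    for item in articles:
        # NewsItem fields shown in the diff include title, published_date, sentiment_score.
        print(item.published_date, item.title)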