fix crawl fail issue 1
crawler.py  CHANGED  (+168 −50)
@@ -27,11 +27,17 @@ class NewsItem:
     sentiment_score: Optional[float] = None
 
 class CnYesNewsCrawler:
-    """鉅亨網新聞爬蟲"""
+    """鉅亨網新聞爬蟲 - 改進版"""
 
     def __init__(self):
         self.base_url = "https://news.cnyes.com"
-        self.session = cloudscraper.create_scraper(
+        self.session = cloudscraper.create_scraper(
+            browser={
+                'browser': 'chrome',
+                'platform': 'windows',
+                'mobile': False
+            }
+        )
         self.ua = UserAgent()
 
         # 新聞分類URL
@@ -44,10 +50,10 @@ class CnYesNewsCrawler:
         self._setup_headers()
 
     def _setup_headers(self):
-        """
+        """設置更真實的請求頭"""
         self.session.headers.update({
-            'User-Agent':
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
             'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
             'Accept-Encoding': 'gzip, deflate, br',
             'DNT': '1',
@@ -56,36 +62,49 @@ class CnYesNewsCrawler:
             'Sec-Fetch-Dest': 'document',
             'Sec-Fetch-Mode': 'navigate',
             'Sec-Fetch-Site': 'none',
-            '
+            'Sec-Fetch-User': '?1',
+            'Cache-Control': 'max-age=0',
+            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"Windows"'
         })
 
     def _get_page(self, url: str, retries: int = 3) -> Optional[BeautifulSoup]:
-        """獲取網頁內容"""
+        """獲取網頁內容 - 改進版"""
         for attempt in range(retries):
             try:
-                #
-                time.sleep(random.uniform(
+                # 更長的隨機延遲,模擬人類行為
+                time.sleep(random.uniform(8, 15))
 
                 # 輪換 User-Agent
-
+                user_agents = [
+                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
+                ]
+                self.session.headers['User-Agent'] = random.choice(user_agents)
 
+                logger.info(f"正在請求: {url}")
                 response = self.session.get(url, timeout=30)
 
                 if response.status_code == 200:
                     response.encoding = 'utf-8'
-
+                    soup = BeautifulSoup(response.content, 'html.parser')
+                    logger.info(f"成功獲取網頁: {url}")
+                    return soup
                 else:
                     logger.warning(f"HTTP {response.status_code} for {url}")
 
             except Exception as e:
                 logger.error(f"請求失敗 (嘗試 {attempt + 1}/{retries}): {e}")
                 if attempt < retries - 1:
-                    time.sleep(random.uniform(
+                    time.sleep(random.uniform(15, 30))
 
         return None
 
-    def _extract_article_urls(self, category_url: str, max_pages: int =
-        """從分類頁面提取文章URL"""
+    def _extract_article_urls(self, category_url: str, max_pages: int = 2) -> List[str]:
+        """從分類頁面提取文章URL - 改進版"""
         article_urls = []
 
         for page in range(1, max_pages + 1):
@@ -93,21 +112,35 @@ class CnYesNewsCrawler:
                 if page == 1:
                     url = category_url
                 else:
+                    # 修正分頁URL格式
                     url = f"{category_url}?page={page}"
 
-                logger.info(f"
+                logger.info(f"爬取分類頁面 {page}: {url}")
                 soup = self._get_page(url)
 
                 if not soup:
                     continue
 
-                #
-
+                # 改進的選擇器,針對鉅亨網的實際結構
+                link_selectors = [
+                    'a[href*="/news/id/"]',
+                    '.news-list a[href*="/news/id/"]',
+                    '.list-item a[href*="/news/id/"]',
+                    '.news-item a[href*="/news/id/"]',
+                    'h3 a[href*="/news/id/"]',
+                    '.title a[href*="/news/id/"]'
+                ]
+
                 page_urls = []
+                for selector in link_selectors:
+                    links = soup.select(selector)
+                    if links:
+                        logger.info(f"使用選擇器 '{selector}' 找到 {len(links)} 個連結")
+                        break
 
                 for link in links:
                     href = link.get('href')
-                    if href:
+                    if href and '/news/id/' in href:
                         full_url = urljoin(self.base_url, href)
                         if full_url not in page_urls:
                             page_urls.append(full_url)
@@ -116,27 +149,44 @@ class CnYesNewsCrawler:
                 logger.info(f"第 {page} 頁找到 {len(page_urls)} 篇文章")
 
                 if not page_urls:
+                    logger.warning(f"第 {page} 頁沒有找到文章,可能遇到反爬蟲機制")
                     break
 
+                # 頁面間更長延遲
+                if page < max_pages:
+                    time.sleep(random.uniform(20, 40))
+
             except Exception as e:
                 logger.error(f"爬取第 {page} 頁時發生錯誤: {e}")
                 continue
 
-
+        # 去重並限制數量
+        unique_urls = list(set(article_urls))
+        logger.info(f"總共找到 {len(unique_urls)} 篇獨特文章")
+        return unique_urls
 
     def _extract_article_content(self, url: str, category: str) -> Optional[NewsItem]:
-        """提取文章詳細內容"""
+        """提取文章詳細內容 - 改進版"""
         try:
             soup = self._get_page(url)
             if not soup:
                 return None
 
-            #
+            # 調試:打印網頁結構的一部分
+            logger.info(f"網頁標題標籤: {[tag.name for tag in soup.find_all(['h1', 'h2', 'h3'])]}")
+
+            # 改進的標題選擇器
             title_selectors = [
-                'h1.news-title',
                 'h1[class*="title"]',
+                'h1.news-title',
+                'h1.article-title',
                 '.article-header h1',
-                'h1'
+                '.news-header h1',
+                '.content-header h1',
+                'h1',
+                'h2[class*="title"]',
+                '.title h1',
+                '.title h2'
             ]
 
             title = ""
@@ -144,47 +194,90 @@ class CnYesNewsCrawler:
                 title_elem = soup.select_one(selector)
                 if title_elem:
                     title = title_elem.get_text(strip=True)
-                    if title and len(title) >
+                    if title and len(title) > 10:
+                        logger.info(f"使用選擇器 '{selector}' 找到標題: {title[:50]}...")
                         break
 
             if not title:
                 logger.warning(f"無法提取標題: {url}")
+                # 嘗試從頁面標題獲取
+                page_title = soup.find('title')
+                if page_title:
+                    title = page_title.get_text(strip=True).split(' | ')[0]
+                    logger.info(f"從頁面標題獲取: {title[:50]}...")
+
+            if not title or len(title) < 5:
+                logger.warning(f"標題太短或無法提取: {url}")
                 return None
 
-            #
+            # 改進的內容選擇器
             content_selectors = [
-                '.news-content',
                 '.article-content',
+                '.news-content',
                 '.content-body',
-                '
+                '.article-body',
+                '.news-body',
+                '.post-content',
+                '[class*="article-text"]',
+                '[class*="content"]',
+                '.article p',
+                '.content p'
             ]
 
             content = ""
             for selector in content_selectors:
-
-                if
+                content_container = soup.select_one(selector)
+                if content_container:
                     # 移除不需要的元素
-                    for unwanted in
+                    for unwanted in content_container.select('script, style, .ad, .advertisement, .related, .share, .comment'):
                         unwanted.decompose()
 
-
+                    # 提取文本段落
+                    paragraphs = content_container.find_all(['p', 'div'], string=True)
                     content_parts = []
+
                     for p in paragraphs:
                         text = p.get_text(strip=True)
-                        if text and len(text) >
+                        if text and len(text) > 20 and not any(skip in text.lower() for skip in ['廣告', 'ad', 'advertisement', '分享', 'share']):
                             content_parts.append(text)
 
                     content = '\n'.join(content_parts)
-                    if content:
+                    if len(content) > 100:
+                        logger.info(f"使用選擇器 '{selector}' 找到內容,長度: {len(content)}")
                         break
 
+            # 如果還是沒有內容,嘗試獲取所有文本
+            if not content or len(content) < 100:
+                logger.warning(f"常規方法無法提取內容,嘗試備用方法: {url}")
+
+                # 移除不需要的標籤
+                for unwanted in soup.select('script, style, nav, header, footer, .menu, .sidebar, .ad'):
+                    unwanted.decompose()
+
+                # 尋找包含最多文本的元素
+                all_text_elements = soup.find_all(['p', 'div'], string=True)
+                text_blocks = []
+
+                for elem in all_text_elements:
+                    text = elem.get_text(strip=True)
+                    if len(text) > 50:
+                        text_blocks.append(text)
+
+                if text_blocks:
+                    content = '\n'.join(text_blocks[:10])  # 取前10段
+                    logger.info(f"備用方法找到內容,長度: {len(content)}")
+
             if not content or len(content) < 50:
-                logger.warning(f"內容太短或無法提取: {url}")
+                logger.warning(f"內容太短或無法提取: {url}, 內容長度: {len(content)}")
+                logger.debug(f"網頁HTML結構預覽: {str(soup)[:500]}...")
                 return None
 
             # 提取發布時間
             published_date = self._extract_publish_date(soup)
 
+            # 清理內容
+            content = self._clean_content(content)
+
             # 創建新聞項目
             news_item = NewsItem(
                 title=title,
@@ -195,32 +288,55 @@ class CnYesNewsCrawler:
                 published_date=published_date
             )
 
-            logger.info(f"成功提取文章: {title[:50]}...")
+            logger.info(f"成功提取文章: {title[:50]}... (內容長度: {len(content)})")
             return news_item
 
         except Exception as e:
             logger.error(f"提取文章內容時發生錯誤 {url}: {e}")
             return None
 
+    def _clean_content(self, content: str) -> str:
+        """清理內容"""
+        # 移除多餘空白
+        content = re.sub(r'\s+', ' ', content)
+
+        # 移除特殊字符
+        content = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?:;「」『』]', '', content)
+
+        # 移除重複句子
+        sentences = content.split('。')
+        unique_sentences = []
+        for sentence in sentences:
+            if sentence.strip() and sentence.strip() not in unique_sentences:
+                unique_sentences.append(sentence.strip())
+
+        return '。'.join(unique_sentences)
+
     def _extract_publish_date(self, soup: BeautifulSoup) -> datetime:
-        """提取發布時間"""
+        """提取發布時間 - 改進版"""
         time_selectors = [
             'time[datetime]',
             '.publish-time',
             '.news-time',
-            '
+            '.article-time',
+            '[class*="time"]',
+            '[class*="date"]',
+            'meta[property="article:published_time"]',
+            'meta[name="pubdate"]'
         ]
 
         for selector in time_selectors:
             time_elem = soup.select_one(selector)
             if time_elem:
-
+                # 檢查datetime屬性
+                datetime_attr = time_elem.get('datetime') or time_elem.get('content')
                 if datetime_attr:
                     try:
                         return datetime.fromisoformat(datetime_attr.replace('Z', '+00:00')).replace(tzinfo=None)
                     except:
                         pass
 
+                # 檢查文本內容
                 time_text = time_elem.get_text(strip=True)
                 parsed_time = self._parse_time_text(time_text)
                 if parsed_time:
@@ -229,12 +345,14 @@ class CnYesNewsCrawler:
         return datetime.now()
 
     def _parse_time_text(self, time_text: str) -> Optional[datetime]:
-        """解析時間文字"""
+        """解析時間文字 - 改進版"""
         patterns = [
             r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
             r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2})',
             r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
-            r'(\d{4})-(\d{2})-(\d{2})'
+            r'(\d{4})-(\d{2})-(\d{2})',
+            r'(\d{4})年(\d{1,2})月(\d{1,2})日\s*(\d{1,2}):(\d{2})',
+            r'(\d{4})年(\d{1,2})月(\d{1,2})日'
         ]
 
         for pattern in patterns:
@@ -255,8 +373,8 @@ class CnYesNewsCrawler:
 
         return None
 
-    def crawl_category(self, category: str, max_articles: int =
-        """爬取指定分類的新聞"""
+    def crawl_category(self, category: str, max_articles: int = 10) -> List[NewsItem]:
+        """爬取指定分類的新聞 - 減少數量避免被封"""
         if category not in self.categories:
             logger.error(f"無效的分類: {category}")
             return []
@@ -265,13 +383,13 @@ class CnYesNewsCrawler:
 
         # 獲取文章URL列表
        category_url = self.categories[category]
-        article_urls = self._extract_article_urls(category_url)
+        article_urls = self._extract_article_urls(category_url, max_pages=2)
 
         if not article_urls:
             logger.warning(f"未找到 {category} 分類的文章URL")
             return []
 
-        #
+        # 限制文章數量,避免被封
         if len(article_urls) > max_articles:
             article_urls = article_urls[:max_articles]
 
@@ -284,8 +402,8 @@ class CnYesNewsCrawler:
                 if article:
                     articles.append(article)
 
-                #
-                time.sleep(random.uniform(
+                # 更長的隨機延遲,模擬人類閱讀
+                time.sleep(random.uniform(15, 30))
 
             except Exception as e:
                 logger.error(f"處理文章時發生錯誤 {url}: {e}")
@@ -294,8 +412,8 @@ class CnYesNewsCrawler:
         logger.info(f"{category} 分類爬取完成,共 {len(articles)} 篇文章")
         return articles
 
-    def crawl_all_categories(self, max_articles_per_category: int =
-        """爬取所有分類的新聞"""
+    def crawl_all_categories(self, max_articles_per_category: int = 8) -> Dict[str, List[NewsItem]]:
+        """爬取所有分類的新聞 - 減少數量"""
         results = {}
 
         for category in self.categories.keys():
@@ -304,8 +422,8 @@ class CnYesNewsCrawler:
                 articles = self.crawl_category(category, max_articles_per_category)
                 results[category] = articles
 
-                #
-                time.sleep(random.uniform(
+                # 分類間更長延遲
+                time.sleep(random.uniform(60, 120))
 
             except Exception as e:
                 logger.error(f"爬取 {category} 分類時發生錯誤: {e}")