yukee1992 commited on
Commit
823e327
·
verified ·
1 Parent(s): fa45285

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +374 -264
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # IMPROVED CONTENT EXTRACTOR FOR NEWS SITES
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -13,77 +13,93 @@ from fastapi import FastAPI, Request
13
  import uvicorn
14
  import traceback
15
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
16
 
17
  # ==============================================
18
- # NEWS-SPECIFIC CONTENT EXTRACTOR
19
  # ==============================================
20
 
21
- class NewsContentExtractor:
22
- """Content extractor specifically optimized for news websites"""
23
 
24
  def __init__(self):
25
- self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
 
 
 
 
 
26
 
27
- def extract_content(self, url: str) -> Dict[str, Any]:
28
- """Extract news content with article-focused extraction"""
29
  start_time = time.time()
30
 
31
- print(f"📰 Extracting news from: {url}")
32
 
33
  # Ensure URL has protocol
34
  if not url.startswith(('http://', 'https://')):
35
  url = 'https://' + url
36
 
37
- # Try multiple strategies
38
- strategies = [
39
- self._try_direct_extract, # Direct extraction with BeautifulSoup
40
- self._try_jina_reader, # Jina Reader
41
- self._try_simple_extract, # Simple fallback
 
42
  ]
43
 
44
  best_result = None
45
  best_score = 0
46
 
47
- for i, strategy in enumerate(strategies):
48
  try:
49
- print(f" Trying strategy {i+1}...")
50
- result = strategy(url)
51
 
52
  if result.get("success"):
53
- # Score the result based on content quality
54
- score = self._score_content(result.get("main_content", ""))
55
  result["score"] = score
56
 
 
 
57
  if score > best_score:
58
  best_score = score
59
  best_result = result
60
- print(f" ✓ Strategy {i+1} score: {score}")
61
-
 
 
 
62
  except Exception as e:
63
- print(f" Strategy {i+1} failed: {e}")
64
- time.sleep(0.5)
65
 
66
- if best_result and best_score > 10: # Minimum score threshold
67
  best_result["execution_time"] = round(time.time() - start_time, 2)
68
- best_result["method"] = "best_extraction"
69
  return best_result
70
 
71
- # All failed or low quality
72
  return {
73
  "success": False,
74
  "url": url,
75
- "error": "Could not extract quality news content",
76
- "execution_time": round(time.time() - start_time, 2),
77
- "suggestion": "Website might have anti-scraping protection"
78
  }
79
 
80
- def _try_direct_extract(self, url: str) -> Dict[str, Any]:
81
- """Direct extraction with BeautifulSoup for better HTML parsing"""
82
  try:
83
  headers = {
84
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
85
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
86
- "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
87
  "Accept-Encoding": "gzip, deflate",
88
  "DNT": "1",
89
  "Connection": "keep-alive",
@@ -92,158 +108,230 @@ class NewsContentExtractor:
92
  "Sec-Fetch-Mode": "navigate",
93
  "Sec-Fetch-Site": "none",
94
  "Sec-Fetch-User": "?1",
95
- "Cache-Control": "max-age=0",
96
  }
97
 
98
- response = requests.get(url, headers=headers, timeout=15, verify=False)
99
 
100
  if response.status_code == 200:
101
- soup = BeautifulSoup(response.content, 'html.parser')
102
-
103
- # Remove unwanted elements
104
- for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer',
105
- 'aside', 'form', 'iframe', 'button', 'svg',
106
- 'link', 'meta', 'noscript']):
107
- unwanted.decompose()
108
 
109
- # Try to find article content using multiple strategies
110
- article_text = ""
 
111
 
112
- # Strategy 1: Look for article-specific containers
113
- article_selectors = [
114
- 'article', '.article-content', '.post-content', '.entry-content',
115
- '.news-content', '.content-area', '.main-content',
116
- 'div[class*="article"]', 'div[class*="content"]',
117
- 'div[class*="post"]', 'div[class*="entry"]',
118
- 'div[itemprop="articleBody"]', 'div[class*="story"]'
119
- ]
120
 
121
- for selector in article_selectors:
122
- article = soup.select_one(selector)
123
- if article:
124
- article_text = article.get_text(separator='\n', strip=True)
125
- if len(article_text) > 300: # Minimum content length
126
- print(f" Found content with selector: {selector}")
127
- break
128
 
129
- # Strategy 2: Look for main content by paragraph density
130
- if len(article_text) < 300:
131
- all_paragraphs = soup.find_all('p')
132
- if len(all_paragraphs) > 3:
133
- article_text = '\n'.join([p.get_text(strip=True) for p in all_paragraphs])
134
-
135
- # Strategy 3: Extract text from main divs
136
- if len(article_text) < 300:
137
- main_divs = soup.find_all(['div', 'section'])
138
- for div in main_divs:
139
- text = div.get_text(separator='\n', strip=True)
140
- # Check if this looks like article content
141
- if (len(text) > 500 and
142
- text.count('\n') > 5 and
143
- not any(word in text.lower() for word in ['cookie', 'privacy', 'copyright', 'advertisement'])):
144
- article_text = text
145
- break
146
-
147
- # Clean and format the text
148
- if article_text:
149
- cleaned_text = self._clean_news_content(article_text)
150
-
151
- # Extract title
152
- title = self._extract_title(soup)
153
- if not title:
154
- title_match = soup.find('title')
155
- title = title_match.get_text(strip=True) if title_match else "新闻标题"
156
-
157
- # Extract date if available
158
- date = self._extract_date(soup)
159
 
160
  return {
161
  "success": True,
162
  "url": url,
163
  "title": title[:200],
164
- "date": date,
165
  "main_content": cleaned_text,
166
  "content_length": len(cleaned_text),
167
  "content_preview": cleaned_text[:500] + ("..." if len(cleaned_text) > 500 else ""),
168
- "source": "direct_extraction",
169
- "status": response.status_code
 
170
  }
171
 
172
  return {"success": False, "error": f"Status: {response.status_code}"}
173
 
174
  except Exception as e:
175
- return {"success": False, "error": f"Direct extract error: {str(e)}"}
176
 
177
- def _try_jina_reader(self, url: str) -> Dict[str, Any]:
178
- """Try Jina Reader"""
179
  try:
180
  jina_url = f"https://r.jina.ai/{url}"
181
 
182
- response = requests.get(
183
- jina_url,
184
- headers={"Accept": "text/plain"},
185
- timeout=20
186
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  if response.status_code == 200:
189
- content = response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
- # Clean the content
192
- cleaned = self._clean_news_content(content)
 
 
 
 
193
 
194
- # Extract title from Jina response
195
- title = "Jina提取内容"
196
- lines = content.split('\n')
197
- for line in lines[:10]:
198
- if line.startswith('Title:') or line.startswith('# '):
199
- title = line.replace('Title:', '').replace('# ', '').strip()
200
- break
 
 
 
 
 
 
 
 
201
 
202
- return {
203
- "success": True,
204
- "url": url,
205
- "title": title[:200],
206
- "main_content": cleaned,
207
- "content_length": len(cleaned),
208
- "content_preview": cleaned[:500] + ("..." if len(cleaned) > 500 else ""),
209
- "source": "jina_reader",
210
- "status": response.status_code
211
- }
 
 
 
 
 
 
 
 
 
 
 
212
 
213
- return {"success": False, "error": f"Jina status: {response.status_code}"}
214
 
215
  except Exception as e:
216
- return {"success": False, "error": f"Jina error: {str(e)}"}
217
 
218
- def _try_simple_extract(self, url: str) -> Dict[str, Any]:
219
- """Simple fallback extraction"""
220
  try:
221
  response = requests.get(url, timeout=10, verify=False)
222
 
223
  if response.status_code == 200:
 
224
  soup = BeautifulSoup(response.content, 'html.parser')
225
 
226
- # Get all text
227
- all_text = soup.get_text(separator='\n', strip=True)
 
 
228
 
229
- # Clean and extract meaningful parts
 
230
  lines = all_text.split('\n')
231
- meaningful_lines = []
232
 
 
 
233
  for line in lines:
234
  line = line.strip()
235
- if (len(line) > 20 and
236
- not any(word in line.lower() for word in ['cookie', 'privacy', 'copyright',
237
- 'advertisement', 'newsletter', 'subscribe',
238
- 'follow us', 'share this']) and
239
- not re.match(r'^[0-9\.\-\s]+$', line)): # Skip number-only lines
240
- meaningful_lines.append(line)
241
 
242
- cleaned_text = '\n'.join(meaningful_lines[:100]) # Take top 100 lines
243
 
244
  if len(cleaned_text) > 200:
245
  title = soup.find('title')
246
- title_text = title.get_text(strip=True) if title else "新闻内容"
247
 
248
  return {
249
  "success": True,
@@ -251,89 +339,96 @@ class NewsContentExtractor:
251
  "title": title_text[:150],
252
  "main_content": cleaned_text,
253
  "content_length": len(cleaned_text),
254
- "source": "simple_extract"
255
  }
256
 
257
- return {"success": False, "error": "Simple extraction failed"}
258
 
259
  except Exception as e:
260
  return {"success": False, "error": str(e)}
261
 
262
- def _extract_title(self, soup) -> str:
263
- """Extract title from BeautifulSoup object"""
264
- # Try multiple title sources
265
- title_sources = [
266
- soup.find('title'),
267
- soup.find('h1'),
268
- soup.find('meta', property='og:title'),
269
- soup.find('meta', attrs={'name': 'title'}),
270
- soup.find('h2', class_=re.compile(r'title|heading')),
 
 
 
 
 
 
 
 
271
  ]
272
 
273
- for source in title_sources:
274
- if source:
275
- if hasattr(source, 'get'):
276
- content = source.get('content', '') if source.name == 'meta' else source.get_text(strip=True)
277
- if content and len(content) > 5 and len(content) < 200:
278
- return content
 
 
 
 
279
 
280
- return ""
281
 
282
- def _extract_date(self, soup) -> str:
283
  """Extract date from BeautifulSoup object"""
284
- date_patterns = [
285
- r'\d{4}[-/]\d{2}[-/]\d{2}',
286
- r'\d{2}[-/]\d{2}[-/]\d{4}',
287
- r'\d{1,2}\s+\w+\s+\d{4}',
288
- ]
289
-
290
- # Look in common date locations
291
  date_selectors = [
 
 
 
292
  'time',
293
  '.date',
294
  '.published',
295
  '.post-date',
296
  '.article-date',
297
- 'meta[property="article:published_time"]',
298
- 'meta[name="pubdate"]',
299
- 'meta[name="date"]',
300
  ]
301
 
302
  for selector in date_selectors:
303
- elements = soup.select(selector)
304
- for element in elements:
305
  if element.name == 'meta':
306
  date_str = element.get('content', '')
 
 
307
  else:
308
- date_str = element.get_text(strip=True) or element.get('datetime', '')
309
 
310
- for pattern in date_patterns:
311
- match = re.search(pattern, date_str)
312
- if match:
313
- return match.group()
 
 
 
 
 
 
 
 
 
314
 
315
  return ""
316
 
317
- def _clean_news_content(self, text: str) -> str:
318
- """Clean and format news content"""
319
  if not text:
320
  return ""
321
 
322
- # Remove excessive whitespace
323
- text = re.sub(r'\s+', ' ', text)
324
-
325
- # Remove common unwanted patterns
326
- unwanted_patterns = [
327
- r'adsbygoogle.*?\[\]\]',
328
  r'ADVERTISEMENT',
329
  r'Sponsored Content',
330
- r'Sign up for.*?newsletter',
331
- r'Subscribe to.*?channel',
332
- r'Follow us on.*',
333
- r'Share this.*',
334
- r'Like us on.*',
335
- r'Read more.*',
336
- r'Continue reading.*',
337
  r'点击这里.*',
338
  r'更多新闻.*',
339
  r'相关新闻.*',
@@ -346,87 +441,98 @@ class NewsContentExtractor:
346
  r'简\s*繁',
347
  r'登入.*',
348
  r'下载APP.*',
349
- r'首页.*最新.*头条.*',
350
- r'[\*\-\=]{5,}', # Multiple special characters
351
  ]
352
 
353
- for pattern in unwanted_patterns:
354
- text = re.sub(pattern, '', text, flags=re.IGNORECASE)
355
 
356
- # Remove very short lines (likely navigation)
357
  lines = text.split('\n')
358
  cleaned_lines = []
 
359
  for line in lines:
360
  line = line.strip()
361
- if (len(line) > 15 and
362
  not line.startswith(('http://', 'https://', 'www.')) and
363
- not re.match(r'^[\d\s\.\-]+$', line)):
 
364
  cleaned_lines.append(line)
365
 
366
- text = '\n'.join(cleaned_lines)
367
-
368
  # Remove duplicate consecutive lines
369
- lines = text.split('\n')
370
  unique_lines = []
371
- for i, line in enumerate(lines):
372
- if i == 0 or line != lines[i-1]:
373
  unique_lines.append(line)
374
 
375
- return '\n'.join(unique_lines).strip()
 
 
 
 
 
 
 
376
 
377
- def _score_content(self, text: str) -> int:
378
- """Score content quality based on various factors"""
379
- if not text:
380
  return 0
381
 
382
  score = 0
 
383
 
384
- # Length-based scoring
385
- length = len(text)
386
- if length > 1000:
387
  score += 30
388
  elif length > 500:
389
  score += 20
390
- elif length > 200:
391
  score += 10
392
 
393
- # Paragraph count (rough estimate)
394
- paragraphs = text.count('\n\n') + 1
395
- if paragraphs > 5:
396
- score += 20
397
- elif paragraphs > 3:
398
- score += 10
399
 
400
- # News indicators
401
- news_keywords = ['报道', '新闻', '记者', '警方', '调查', '发生', '表示', '指出',
402
- '据知', '据了解', '据悉', '事件', '事故', '案件']
 
403
 
404
- for keyword in news_keywords:
405
- if keyword in text:
406
  score += 2
407
 
408
- # Penalize for unwanted content
409
- unwanted_terms = ['cookie', 'privacy', 'copyright', 'advertisement', 'newsletter']
410
- for term in unwanted_terms:
411
- if term.lower() in text.lower():
412
- score -= 5
 
 
 
413
 
414
- return max(0, score)
415
 
416
  # ==============================================
417
  # INITIALIZE
418
  # ==============================================
419
 
420
- extractor = NewsContentExtractor()
421
 
422
  # ==============================================
423
  # FASTAPI APP
424
  # ==============================================
425
 
426
  fastapi_app = FastAPI(
427
- title="News Content Extractor",
428
- description="Extracts news article content with BeautifulSoup",
429
- version="3.0"
430
  )
431
 
432
  from fastapi.middleware.cors import CORSMiddleware
@@ -443,13 +549,13 @@ fastapi_app.add_middleware(
443
  @fastapi_app.get("/")
444
  async def root():
445
  return {
446
- "service": "News Content Extractor",
447
- "version": "3.0",
448
- "description": "Extracts news article content using BeautifulSoup",
449
  "endpoints": {
450
  "GET /": "This info",
451
  "GET /health": "Health check",
452
- "POST /extract": "Extract news content"
453
  }
454
  }
455
 
@@ -458,7 +564,7 @@ async def health():
458
  return {
459
  "status": "healthy",
460
  "timestamp": time.time(),
461
- "service": "news_extractor"
462
  }
463
 
464
  @fastapi_app.post("/extract")
@@ -474,16 +580,16 @@ async def api_extract(request: Request):
474
  content={"success": False, "error": "URL is required"}
475
  )
476
 
477
- print(f"📰 API Request for news: {url}")
478
- print(f" Starting at {time.strftime('%Y-%m-%d %H:%M:%S')}")
479
 
480
  start_time = time.time()
481
- result = extractor.extract_content(url)
482
  elapsed = time.time() - start_time
483
 
484
- print(f" Extraction completed in {elapsed:.2f}s")
485
- print(f" Success: {result.get('success')}")
486
- print(f" Content length: {result.get('content_length', 0)}")
 
487
 
488
  return result
489
 
@@ -493,7 +599,7 @@ async def api_extract(request: Request):
493
  content={"success": False, "error": "Invalid JSON"}
494
  )
495
  except Exception as e:
496
- print(f" API Error: {traceback.format_exc()}")
497
  return JSONResponse(
498
  status_code=500,
499
  content={
@@ -511,48 +617,51 @@ def gradio_extract(url: str):
511
  if not url:
512
  return "❌ 请输入URL", {}
513
 
514
- result = extractor.extract_content(url)
515
 
516
  if result["success"]:
517
  content = result["main_content"]
518
  title = result.get("title", "无标题")
519
- date = result.get("date", "")
520
 
521
- output = f"""
522
- ## 提取成功!
 
 
 
 
 
 
 
 
 
523
 
524
- **标题:** {title}
525
- **日期:** {date if date else "未提取到日期"}
526
- **方法:** {result.get('method', '提取')}
527
- **时间:** {result['execution_time']}s
528
- **字符数:** {result['content_length']:,}
529
 
530
- ### 内容预览:
531
- {content[:800]}{"..." if len(content) > 800 else ""}
532
  """
533
  return output, result
534
  else:
535
  error = result.get("error", "未知错误")
536
- return f"## ❌ 错误\n\n{error}", result
537
 
538
  # Create Gradio interface
539
  gradio_interface = gr.Interface(
540
  fn=gradio_extract,
541
  inputs=gr.Textbox(
542
- label="新闻URL",
543
- placeholder="https://example.com/news",
544
  value="https://northern.sinchew.com.my/?p=7217886"
545
  ),
546
  outputs=[
547
- gr.Markdown(label="结果"),
548
- gr.JSON(label="API响应")
549
  ],
550
- title="📰 新闻内容提取器",
551
- description="使用BeautifulSoup提取新闻文章内容",
552
  examples=[
553
  ["https://northern.sinchew.com.my/?p=7217886"],
554
  ["https://www.sinchew.com.my/?p=7234965"],
555
- ["https://example.com"]
556
  ]
557
  )
558
 
@@ -568,16 +677,17 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
568
 
569
  if __name__ == "__main__":
570
  print("\n" + "="*60)
571
- print("📰 新闻内容提取器 v3.0 启动")
572
  print("="*60)
573
  print("特性:")
574
- print("• 使用BeautifulSoup进行HTML解析")
575
- print("• 专门针对新闻网站优化")
576
- print("• 智能内容评分系统")
 
577
  print("="*60)
578
  print("API端点:")
579
  print("• GET /health - 健康检查")
580
- print("• POST /extract - 提取新闻内容")
581
  print("="*60 + "\n")
582
 
583
  uvicorn.run(
 
1
  # ==============================================
2
+ # NEWS CONTENT EXTRACTOR WITH READABILITY
3
  # ==============================================
4
 
5
  import gradio as gr
 
13
  import uvicorn
14
  import traceback
15
  from bs4 import BeautifulSoup
16
+ from readability import Document
17
+ import logging
18
+
19
+ # Set up logging
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
 
24
  # ==============================================
25
+ # NEWS CONTENT EXTRACTOR WITH READABILITY
26
  # ==============================================
27
 
28
+ class NewsArticleExtractor:
29
+ """Extract news articles using readability-lxml"""
30
 
31
  def __init__(self):
32
+ self.user_agents = [
33
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
34
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
35
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
36
+ "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
37
+ ]
38
 
39
+ def extract_article(self, url: str) -> Dict[str, Any]:
40
+ """Extract article content using multiple methods"""
41
  start_time = time.time()
42
 
43
+ logger.info(f"📰 Extracting article from: {url}")
44
 
45
  # Ensure URL has protocol
46
  if not url.startswith(('http://', 'https://')):
47
  url = 'https://' + url
48
 
49
+ # Try multiple extraction methods
50
+ methods = [
51
+ self._extract_with_readability,
52
+ self._extract_with_jina,
53
+ self._extract_with_selectors,
54
+ self._extract_fallback,
55
  ]
56
 
57
  best_result = None
58
  best_score = 0
59
 
60
+ for i, method in enumerate(methods):
61
  try:
62
+ logger.info(f" Trying method {i+1}: {method.__name__}")
63
+ result = method(url)
64
 
65
  if result.get("success"):
66
+ # Score the article
67
+ score = self._score_article(result)
68
  result["score"] = score
69
 
70
+ logger.info(f" ✓ Method {i+1} score: {score}")
71
+
72
  if score > best_score:
73
  best_score = score
74
  best_result = result
75
+
76
+ # If we have a good score, return early
77
+ if score > 50:
78
+ break
79
+
80
  except Exception as e:
81
+ logger.error(f" Method {i+1} failed: {e}")
82
+ time.sleep(1)
83
 
84
+ if best_result and best_score > 20:
85
  best_result["execution_time"] = round(time.time() - start_time, 2)
86
+ best_result["method"] = "article_extraction"
87
  return best_result
88
 
 
89
  return {
90
  "success": False,
91
  "url": url,
92
+ "error": "Could not extract article content",
93
+ "execution_time": round(time.time() - start_time, 2)
 
94
  }
95
 
96
+ def _extract_with_readability(self, url: str) -> Dict[str, Any]:
97
+ """Use readability-lxml to extract article content"""
98
  try:
99
  headers = {
100
+ "User-Agent": self.user_agents[0],
101
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
102
+ "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,ms;q=0.6",
103
  "Accept-Encoding": "gzip, deflate",
104
  "DNT": "1",
105
  "Connection": "keep-alive",
 
108
  "Sec-Fetch-Mode": "navigate",
109
  "Sec-Fetch-Site": "none",
110
  "Sec-Fetch-User": "?1",
 
111
  }
112
 
113
+ response = requests.get(url, headers=headers, timeout=20, verify=False)
114
 
115
  if response.status_code == 200:
116
+ # Parse with readability
117
+ doc = Document(response.text)
 
 
 
 
 
118
 
119
+ # Extract content
120
+ article_html = doc.summary()
121
+ title = doc.title()
122
 
123
+ # Convert HTML to clean text
124
+ soup = BeautifulSoup(article_html, 'html.parser')
125
+ article_text = soup.get_text(separator='\n', strip=True)
 
 
 
 
 
126
 
127
+ # Clean the text
128
+ cleaned_text = self._clean_article_text(article_text)
 
 
 
 
 
129
 
130
+ if len(cleaned_text) > 200:
131
+ # Extract metadata
132
+ metadata = self._extract_metadata(response.text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  return {
135
  "success": True,
136
  "url": url,
137
  "title": title[:200],
 
138
  "main_content": cleaned_text,
139
  "content_length": len(cleaned_text),
140
  "content_preview": cleaned_text[:500] + ("..." if len(cleaned_text) > 500 else ""),
141
+ "source": "readability",
142
+ "status": response.status_code,
143
+ "metadata": metadata
144
  }
145
 
146
  return {"success": False, "error": f"Status: {response.status_code}"}
147
 
148
  except Exception as e:
149
+ return {"success": False, "error": f"Readability error: {str(e)}"}
150
 
151
+ def _extract_with_jina(self, url: str) -> Dict[str, Any]:
152
+ """Try Jina Reader with different parameters"""
153
  try:
154
  jina_url = f"https://r.jina.ai/{url}"
155
 
156
+ # Try with different accept headers
157
+ accept_headers = [
158
+ "text/plain",
159
+ "application/json",
160
+ "text/markdown"
161
+ ]
162
+
163
+ for accept in accept_headers:
164
+ try:
165
+ response = requests.get(
166
+ jina_url,
167
+ headers={
168
+ "Accept": accept,
169
+ "User-Agent": self.user_agents[0]
170
+ },
171
+ timeout=25
172
+ )
173
+
174
+ if response.status_code == 200:
175
+ content = response.text
176
+
177
+ # Parse based on content type
178
+ if accept == "application/json":
179
+ try:
180
+ data = json.loads(content)
181
+ content = data.get("content", content)
182
+ except:
183
+ pass
184
+
185
+ # Clean content
186
+ cleaned = self._clean_article_text(content)
187
+
188
+ # Extract title
189
+ title = "Jina提取"
190
+ lines = content.split('\n')
191
+ for line in lines[:5]:
192
+ if line.startswith('Title:') or line.startswith('# '):
193
+ title = line.replace('Title:', '').replace('# ', '').strip()
194
+ break
195
+
196
+ if len(cleaned) > 200:
197
+ return {
198
+ "success": True,
199
+ "url": url,
200
+ "title": title[:200],
201
+ "main_content": cleaned,
202
+ "content_length": len(cleaned),
203
+ "source": f"jina_{accept}",
204
+ "status": response.status_code
205
+ }
206
+
207
+ except Exception as e:
208
+ logger.warning(f"Jina attempt with {accept} failed: {e}")
209
+ continue
210
+
211
+ return {"success": False, "error": "All Jina attempts failed"}
212
+
213
+ except Exception as e:
214
+ return {"success": False, "error": f"Jina error: {str(e)}"}
215
+
216
+ def _extract_with_selectors(self, url: str) -> Dict[str, Any]:
217
+ """Extract using specific selectors for sinchew.com.my"""
218
+ try:
219
+ headers = {
220
+ "User-Agent": self.user_agents[1],
221
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
222
+ "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
223
+ }
224
+
225
+ response = requests.get(url, headers=headers, timeout=15, verify=False)
226
 
227
  if response.status_code == 200:
228
+ soup = BeautifulSoup(response.content, 'html.parser')
229
+
230
+ # Remove unwanted elements
231
+ for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer',
232
+ 'aside', 'form', 'iframe', 'button', 'svg']):
233
+ unwanted.decompose()
234
+
235
+ # Try specific selectors for sinchew.com.my
236
+ selectors_to_try = [
237
+ 'div.entry-content',
238
+ 'article',
239
+ 'div.post-content',
240
+ 'div.content-area',
241
+ 'div.article-content',
242
+ 'div.story-content',
243
+ 'div[itemprop="articleBody"]',
244
+ 'div.article-body',
245
+ 'div.main-content',
246
+ 'div.news-content',
247
+ ]
248
+
249
+ article_text = ""
250
 
251
+ for selector in selectors_to_try:
252
+ element = soup.select_one(selector)
253
+ if element:
254
+ text = element.get_text(separator='\n', strip=True)
255
+ if len(text) > len(article_text):
256
+ article_text = text
257
 
258
+ # If specific selectors didn't work, try finding the main content
259
+ if len(article_text) < 300:
260
+ # Look for paragraphs with Chinese text
261
+ all_p = soup.find_all('p')
262
+ chinese_paragraphs = []
263
+
264
+ for p in all_p:
265
+ text = p.get_text(strip=True)
266
+ if text and len(text) > 50:
267
+ # Check if it contains Chinese characters
268
+ if re.search(r'[\u4e00-\u9fff]', text):
269
+ chinese_paragraphs.append(text)
270
+
271
+ if chinese_paragraphs:
272
+ article_text = '\n\n'.join(chinese_paragraphs[:20]) # Limit to 20 paragraphs
273
 
274
+ # Clean the text
275
+ cleaned_text = self._clean_article_text(article_text)
276
+
277
+ if len(cleaned_text) > 200:
278
+ # Extract title
279
+ title = soup.find('title')
280
+ title_text = title.get_text(strip=True) if title else "新闻标题"
281
+
282
+ # Extract date
283
+ date = self._extract_date_from_soup(soup)
284
+
285
+ return {
286
+ "success": True,
287
+ "url": url,
288
+ "title": title_text[:200],
289
+ "date": date,
290
+ "main_content": cleaned_text,
291
+ "content_length": len(cleaned_text),
292
+ "source": "selectors",
293
+ "status": response.status_code
294
+ }
295
 
296
+ return {"success": False, "error": f"Status: {response.status_code}"}
297
 
298
  except Exception as e:
299
+ return {"success": False, "error": f"Selector error: {str(e)}"}
300
 
301
+ def _extract_fallback(self, url: str) -> Dict[str, Any]:
302
+ """Fallback extraction method"""
303
  try:
304
  response = requests.get(url, timeout=10, verify=False)
305
 
306
  if response.status_code == 200:
307
+ # Use BeautifulSoup to get clean text
308
  soup = BeautifulSoup(response.content, 'html.parser')
309
 
310
+ # Remove all tags except p, div, span
311
+ for tag in soup.find_all(['script', 'style', 'nav', 'header', 'footer',
312
+ 'aside', 'form', 'iframe', 'button']):
313
+ tag.decompose()
314
 
315
+ # Get text and filter
316
+ all_text = soup.get_text(separator='\n', strip=True)
317
  lines = all_text.split('\n')
 
318
 
319
+ # Filter lines
320
+ filtered_lines = []
321
  for line in lines:
322
  line = line.strip()
323
+ if (len(line) > 30 and # Minimum length
324
+ re.search(r'[\u4e00-\u9fff]', line) and # Contains Chinese
325
+ not re.search(r'cookie|privacy|copyright|advertisement|newsletter|subscribe',
326
+ line.lower()) and
327
+ not line.startswith('http')):
328
+ filtered_lines.append(line)
329
 
330
+ cleaned_text = '\n\n'.join(filtered_lines[:50])
331
 
332
  if len(cleaned_text) > 200:
333
  title = soup.find('title')
334
+ title_text = title.get_text(strip=True) if title else "内容提取"
335
 
336
  return {
337
  "success": True,
 
339
  "title": title_text[:150],
340
  "main_content": cleaned_text,
341
  "content_length": len(cleaned_text),
342
+ "source": "fallback"
343
  }
344
 
345
+ return {"success": False, "error": "Fallback extraction failed"}
346
 
347
  except Exception as e:
348
  return {"success": False, "error": str(e)}
349
 
350
+ def _extract_metadata(self, html_content: str) -> Dict[str, str]:
351
+ """Extract metadata from HTML"""
352
+ metadata = {}
353
+ soup = BeautifulSoup(html_content, 'html.parser')
354
+
355
+ # Extract date
356
+ date = self._extract_date_from_soup(soup)
357
+ if date:
358
+ metadata["date"] = date
359
+
360
+ # Extract author
361
+ author_selectors = [
362
+ 'meta[name="author"]',
363
+ 'meta[property="article:author"]',
364
+ '.author',
365
+ '.byline',
366
+ 'span[itemprop="author"]',
367
  ]
368
 
369
+ for selector in author_selectors:
370
+ element = soup.select_one(selector)
371
+ if element:
372
+ if element.name == 'meta':
373
+ author = element.get('content', '')
374
+ else:
375
+ author = element.get_text(strip=True)
376
+ if author:
377
+ metadata["author"] = author
378
+ break
379
 
380
+ return metadata
381
 
382
+ def _extract_date_from_soup(self, soup) -> str:
383
  """Extract date from BeautifulSoup object"""
 
 
 
 
 
 
 
384
  date_selectors = [
385
+ 'meta[property="article:published_time"]',
386
+ 'meta[name="pubdate"]',
387
+ 'meta[name="date"]',
388
  'time',
389
  '.date',
390
  '.published',
391
  '.post-date',
392
  '.article-date',
 
 
 
393
  ]
394
 
395
  for selector in date_selectors:
396
+ element = soup.select_one(selector)
397
+ if element:
398
  if element.name == 'meta':
399
  date_str = element.get('content', '')
400
+ elif element.name == 'time':
401
+ date_str = element.get('datetime', '') or element.get_text(strip=True)
402
  else:
403
+ date_str = element.get_text(strip=True)
404
 
405
+ if date_str:
406
+ # Try to parse date
407
+ date_patterns = [
408
+ r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}',
409
+ r'\d{4}/\d{2}/\d{2}',
410
+ r'\d{4}-\d{2}-\d{2}',
411
+ r'\d{2}/\d{2}/\d{4}',
412
+ ]
413
+
414
+ for pattern in date_patterns:
415
+ match = re.search(pattern, date_str)
416
+ if match:
417
+ return match.group()
418
 
419
  return ""
420
 
421
+ def _clean_article_text(self, text: str) -> str:
422
+ """Clean article text"""
423
  if not text:
424
  return ""
425
 
426
+ # Remove image markers and other noise
427
+ patterns_to_remove = [
428
+ r'!\[Image \d+: .*?\]',
429
+ r'Image \d+:',
 
 
430
  r'ADVERTISEMENT',
431
  r'Sponsored Content',
 
 
 
 
 
 
 
432
  r'点击这里.*',
433
  r'更多新闻.*',
434
  r'相关新闻.*',
 
441
  r'简\s*繁',
442
  r'登入.*',
443
  r'下载APP.*',
444
+ r'[\*\-\=]{5,}',
445
+ r'^\s*\d+\s*$', # Line with only numbers
446
  ]
447
 
448
+ for pattern in patterns_to_remove:
449
+ text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)
450
 
451
+ # Split into lines and clean
452
  lines = text.split('\n')
453
  cleaned_lines = []
454
+
455
  for line in lines:
456
  line = line.strip()
457
+ if (len(line) > 20 and # Minimum length
458
  not line.startswith(('http://', 'https://', 'www.')) and
459
+ not re.search(r'^[\d\s\.\-]+$', line) and # Not just numbers/dashes
460
+ not re.search(r'cookie|隐私|版权|广告', line.lower())):
461
  cleaned_lines.append(line)
462
 
 
 
463
  # Remove duplicate consecutive lines
 
464
  unique_lines = []
465
+ for i, line in enumerate(cleaned_lines):
466
+ if i == 0 or line != cleaned_lines[i-1]:
467
  unique_lines.append(line)
468
 
469
+ # Join with paragraph breaks
470
+ text = '\n\n'.join(unique_lines)
471
+
472
+ # Final cleanup
473
+ text = re.sub(r'\n{3,}', '\n\n', text)
474
+ text = re.sub(r'\s+', ' ', text)
475
+
476
+ return text.strip()
477
 
478
+ def _score_article(self, result: Dict[str, Any]) -> int:
479
+ """Score article quality"""
480
+ if not result.get("success"):
481
  return 0
482
 
483
  score = 0
484
+ content = result.get("main_content", "")
485
 
486
+ # Length score
487
+ length = len(content)
488
+ if length > 800:
489
  score += 30
490
  elif length > 500:
491
  score += 20
492
+ elif length > 300:
493
  score += 10
494
 
495
+ # Paragraph count
496
+ paragraphs = content.count('\n\n') + 1
497
+ if paragraphs > 3:
498
+ score += 15
499
+ elif paragraphs > 1:
500
+ score += 5
501
 
502
+ # News keywords in Chinese
503
+ news_keywords_chinese = ['报道', '新闻', '记者', '警方', '调查', '发生', '表示',
504
+ '指出', '据知', '据了解', '据悉', '事件', '事故', '案件',
505
+ '透露', '说明', '强调', '要求', '建议', '认为']
506
 
507
+ for keyword in news_keywords_chinese:
508
+ if keyword in content:
509
  score += 2
510
 
511
+ # Check for Chinese text
512
+ if re.search(r'[\u4e00-\u9fff]', content):
513
+ score += 20
514
+
515
+ # Source bonus
516
+ source = result.get("source", "")
517
+ if "readability" in source:
518
+ score += 10
519
 
520
+ return score
521
 
522
  # ==============================================
523
  # INITIALIZE
524
  # ==============================================
525
 
526
# Module-level singleton extractor, shared by the API routes and the Gradio UI.
extractor = NewsArticleExtractor()
527
 
528
  # ==============================================
529
  # FASTAPI APP
530
  # ==============================================
531
 
532
# FastAPI application exposing the extraction service over HTTP.
fastapi_app = FastAPI(
    title="News Article Extractor",
    description="Extracts news articles using readability-lxml",
    version="4.0"
)
537
 
538
  from fastapi.middleware.cors import CORSMiddleware
 
549
@fastapi_app.get("/")
async def root():
    """Service landing endpoint: returns API metadata and a route summary."""
    endpoint_docs = {
        "GET /": "This info",
        "GET /health": "Health check",
        "POST /extract": "Extract article content"
    }
    return {
        "service": "News Article Extractor",
        "version": "4.0",
        "description": "Extracts news articles using multiple methods including readability-lxml",
        "endpoints": endpoint_docs
    }
561
 
 
564
  return {
565
  "status": "healthy",
566
  "timestamp": time.time(),
567
+ "service": "article_extractor"
568
  }
569
 
570
  @fastapi_app.post("/extract")
 
580
  content={"success": False, "error": "URL is required"}
581
  )
582
 
583
+ logger.info(f"📰 API Request: {url}")
 
584
 
585
  start_time = time.time()
586
+ result = extractor.extract_article(url)
587
  elapsed = time.time() - start_time
588
 
589
+ logger.info(f" Extraction completed in {elapsed:.2f}s")
590
+ logger.info(f" Success: {result.get('success')}")
591
+ logger.info(f" Content length: {result.get('content_length', 0)}")
592
+ logger.info(f" Method used: {result.get('method', 'unknown')}")
593
 
594
  return result
595
 
 
599
  content={"success": False, "error": "Invalid JSON"}
600
  )
601
  except Exception as e:
602
+ logger.error(f"API Error: {traceback.format_exc()}")
603
  return JSONResponse(
604
  status_code=500,
605
  content={
 
617
  if not url:
618
  return "❌ 请输入URL", {}
619
 
620
+ result = extractor.extract_article(url)
621
 
622
  if result["success"]:
623
  content = result["main_content"]
624
  title = result.get("title", "无标题")
 
625
 
626
+ # Format output nicely
627
+ output = f"""## 📰 {title}
628
+
629
+ **URL:** {result['url']}
630
+ **提取方法:** {result.get('method', '未知')}
631
+ **提取时间:** {result['execution_time']}秒
632
+ **内容长度:** {result['content_length']}字符
633
+
634
+ ---
635
+
636
+ {content}
637
 
638
+ ---
 
 
 
 
639
 
640
+ *提取完成于 {time.strftime('%Y-%m-%d %H:%M:%S')}*
 
641
  """
642
  return output, result
643
  else:
644
  error = result.get("error", "未知错误")
645
+ return f"## ❌ 提取失败\n\n**错误:** {error}\n\n**URL:** {result.get('url', '未知')}", result
646
 
647
  # Create Gradio interface
648
# Gradio UI: one URL textbox in, rendered-Markdown article + raw JSON dict out.
# Labels and example URLs target Chinese-language news sites (Sin Chew, Zaobao).
gradio_interface = gr.Interface(
    fn=gradio_extract,
    inputs=gr.Textbox(
        label="新闻文章URL",
        placeholder="https://example.com/news/article",
        value="https://northern.sinchew.com.my/?p=7217886"
    ),
    outputs=[
        gr.Markdown(label="文章内容"),
        gr.JSON(label="原始数据")
    ],
    title="📰 新闻文章提取器 v4.0",
    description="使用readability-lxml提取新闻文章主要内容",
    examples=[
        ["https://northern.sinchew.com.my/?p=7217886"],
        ["https://www.sinchew.com.my/?p=7234965"],
        ["https://www.zaobao.com.sg/realtime/china/story20250127-1525893"]
    ]
)
667
 
 
677
 
678
  if __name__ == "__main__":
679
  print("\n" + "="*60)
680
+ print("📰 新闻文章提取器 v4.0 启动")
681
  print("="*60)
682
  print("特性:")
683
+ print("• 使用readability-lxml进行智能文章提取")
684
+ print("• 多种提取方法备用")
685
+ print("• 专门优化中文新闻网站")
686
+ print("• 自动内容评分系统")
687
  print("="*60)
688
  print("API端点:")
689
  print("• GET /health - 健康检查")
690
+ print("• POST /extract - 提取文章内容")
691
  print("="*60 + "\n")
692
 
693
  uvicorn.run(