yukee1992 committed on
Commit
15aced8
·
verified ·
1 Parent(s): 5d4e21f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +357 -329
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # ROBUST CONTENT EXTRACTOR WITH BETTER ERROR HANDLING
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -12,54 +12,37 @@ from typing import Dict, Any
12
  from fastapi import FastAPI, Request
13
  import uvicorn
14
  import traceback
 
15
 
16
  # ==============================================
17
- # IMPROVED CONTENT EXTRACTOR
18
  # ==============================================
19
 
20
- class RobustContentExtractor:
21
- """Content extractor with better timeout handling"""
22
 
23
  def __init__(self):
24
  self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
25
 
26
  def extract_content(self, url: str) -> Dict[str, Any]:
27
- """Extract content with better error handling"""
28
  start_time = time.time()
29
 
30
- print(f"🌐 Extracting: {url}")
31
 
32
  # Ensure URL has protocol
33
  if not url.startswith(('http://', 'https://')):
34
  url = 'https://' + url
35
 
36
- # Clean URL - remove any problematic characters
37
- try:
38
- from urllib.parse import quote, urlparse, urlunparse
39
- parsed = urlparse(url)
40
- # Only encode the path and query
41
- encoded_path = quote(parsed.path, safe='/')
42
- encoded_query = quote(parsed.query, safe='=&')
43
- url = urlunparse((
44
- parsed.scheme,
45
- parsed.netloc,
46
- encoded_path,
47
- parsed.params,
48
- encoded_query,
49
- parsed.fragment
50
- ))
51
- except:
52
- pass # Keep original if encoding fails
53
-
54
  # Try multiple strategies
55
  strategies = [
56
- self._try_jina_reader_fast, # Faster timeout
57
- self._try_direct_request, # Direct attempt
58
- self._try_simple_request, # Simple headers
59
- self._try_fallback_request, # Fallback with different settings
60
  ]
61
 
62
- last_error = None
 
63
 
64
  for i, strategy in enumerate(strategies):
65
  try:
@@ -67,334 +50,383 @@ class RobustContentExtractor:
67
  result = strategy(url)
68
 
69
  if result.get("success"):
70
- result["execution_time"] = round(time.time() - start_time, 2)
71
- result["method"] = f"strategy_{i+1}"
72
- print(f" Strategy {i+1} succeeded")
73
- return result
 
 
 
 
74
 
75
  except Exception as e:
76
- last_error = str(e)
77
  print(f" Strategy {i+1} failed: {e}")
78
- time.sleep(1) # Short pause between strategies
 
 
 
 
 
79
 
80
- # All failed
81
  return {
82
  "success": False,
83
  "url": url,
84
- "error": f"All extraction methods failed. Last error: {last_error}",
85
  "execution_time": round(time.time() - start_time, 2),
86
- "suggestion": "Website may block automated access. Try a different URL."
87
  }
88
 
89
- def _try_jina_reader_fast(self, url: str) -> Dict[str, Any]:
90
- """Try Jina Reader with shorter timeout"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  try:
92
- # Use encoded URL for Jina
93
  jina_url = f"https://r.jina.ai/{url}"
94
 
95
- # Try with very short timeout first
96
  response = requests.get(
97
  jina_url,
98
- headers={
99
- "Accept": "text/plain",
100
- "User-Agent": self.user_agent
101
- },
102
- timeout=12 # Reduced from 15s
103
  )
104
 
105
  if response.status_code == 200:
106
  content = response.text
107
 
108
- # Try to parse as JSON
109
- try:
110
- data = json.loads(content)
111
- if isinstance(data, dict):
112
- if "content" in data:
113
- content = data["content"]
114
- elif "data" in data:
115
- content = str(data["data"])
116
- except:
117
- pass # Keep as plain text
118
-
119
- # Extract title
120
- title = self._extract_title_from_text(content)
121
 
122
- # Clean content
123
- cleaned = self._clean_content(content)
 
 
 
 
 
124
 
125
  return {
126
  "success": True,
127
  "url": url,
128
- "title": title[:300] if title else "Extracted via Jina",
129
- "main_content": cleaned[:25000],
130
  "content_length": len(cleaned),
131
- "content_preview": cleaned[:800] + ("..." if len(cleaned) > 800 else ""),
132
  "source": "jina_reader",
133
  "status": response.status_code
134
  }
135
 
136
  return {"success": False, "error": f"Jina status: {response.status_code}"}
137
 
138
- except requests.exceptions.Timeout:
139
- print(f" Jina timeout after 12s, trying next strategy...")
140
- return {"success": False, "error": "Jina Reader timed out"}
141
  except Exception as e:
142
- print(f" Jina error: {e}")
143
  return {"success": False, "error": f"Jina error: {str(e)}"}
144
 
145
- def _try_direct_request(self, url: str) -> Dict[str, Any]:
146
- """Try direct request with various headers"""
147
- headers_list = [
148
- {
149
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
150
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
151
- "Accept-Language": "en-US,en;q=0.9",
152
- "Accept-Encoding": "gzip, deflate, br",
153
- "DNT": "1",
154
- "Connection": "keep-alive",
155
- "Upgrade-Insecure-Requests": "1",
156
- "Sec-Fetch-Dest": "document",
157
- "Sec-Fetch-Mode": "navigate",
158
- "Sec-Fetch-Site": "none",
159
- "Sec-Fetch-User": "?1",
160
- "Cache-Control": "max-age=0",
161
- },
162
- {
163
- "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
164
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
165
- },
166
- {
167
- "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
168
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
169
- },
170
- ]
171
-
172
- for i, headers in enumerate(headers_list):
173
- try:
174
- print(f" Direct attempt {i+1}...")
175
- response = requests.get(
176
- url,
177
- headers=headers,
178
- timeout=10,
179
- allow_redirects=True,
180
- verify=False # Try without SSL verification
181
- )
182
-
183
- print(f" Status: {response.status_code}")
184
-
185
- if response.status_code == 200:
186
- html_content = response.text
187
-
188
- # Extract content
189
- text_content = self._extract_from_html(html_content)
190
- cleaned = self._clean_content(text_content)
191
-
192
- # Extract title
193
- title = self._extract_title_from_html(html_content)
194
-
195
- if len(cleaned) > 100:
196
- return {
197
- "success": True,
198
- "url": url,
199
- "title": title[:300] if title else "Direct extraction",
200
- "main_content": cleaned[:20000],
201
- "content_length": len(cleaned),
202
- "source": f"direct_request_{i+1}",
203
- "status": response.status_code
204
- }
205
-
206
- except requests.exceptions.Timeout:
207
- print(f" Direct request {i+1} timed out")
208
- continue
209
- except Exception as e:
210
- print(f" Direct request {i+1} error: {e}")
211
- continue
212
-
213
- return {"success": False, "error": "All direct attempts failed"}
214
-
215
- def _try_simple_request(self, url: str) -> Dict[str, Any]:
216
- """Simple request with minimal headers"""
217
  try:
218
- print(" Simple request attempt...")
219
- response = requests.get(
220
- url,
221
- headers={"User-Agent": "Mozilla/5.0"},
222
- timeout=8,
223
- allow_redirects=True,
224
- verify=False
225
- )
226
-
227
- print(f" Simple status: {response.status_code}")
228
 
229
  if response.status_code == 200:
230
- html_content = response.text
231
- text_content = self._extract_from_html(html_content)
232
- cleaned = self._clean_content(text_content)
233
- title = self._extract_title_from_html(html_content)
234
 
235
- if len(cleaned) > 50:
236
- return {
237
- "success": True,
238
- "url": url,
239
- "title": title[:200] if title else "Simple extraction",
240
- "main_content": cleaned[:15000],
241
- "content_length": len(cleaned),
242
- "source": "simple_request"
243
- }
244
-
245
- return {"success": False, "error": f"Status: {response.status_code}"}
246
-
247
- except Exception as e:
248
- return {"success": False, "error": str(e)}
249
-
250
- def _try_fallback_request(self, url: str) -> Dict[str, Any]:
251
- """Fallback using alternative methods"""
252
- try:
253
- print(" Fallback attempt...")
254
-
255
- # Try with requests session
256
- session = requests.Session()
257
- session.headers.update({
258
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
259
- "Accept": "text/html",
260
- })
261
-
262
- response = session.get(url, timeout=15, allow_redirects=True, verify=False)
263
-
264
- if response.status_code == 200:
265
- html_content = response.text
266
 
267
- # Very simple text extraction
268
- text = self._simple_text_extraction(html_content)
 
 
 
 
 
 
269
 
270
- if len(text) > 50:
 
 
 
 
 
271
  return {
272
  "success": True,
273
  "url": url,
274
- "title": "Fallback extraction",
275
- "main_content": text[:10000],
276
- "content_length": len(text),
277
- "source": "fallback",
278
- "status": response.status_code
279
  }
280
 
281
- return {"success": False, "error": f"Fallback status: {response.status_code}"}
282
 
283
  except Exception as e:
284
- return {"success": False, "error": f"Fallback error: {str(e)}"}
285
-
286
- def _simple_text_extraction(self, html_content: str) -> str:
287
- """Very simple text extraction"""
288
- # Remove scripts and styles
289
- html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
290
- html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
291
-
292
- # Extract text between tags
293
- text = re.sub(r'<[^>]+>', ' ', html_content)
294
- text = html.unescape(text)
295
- text = re.sub(r'\s+', ' ', text)
296
-
297
- return text.strip()
298
 
299
- def _extract_from_html(self, html_content: str) -> str:
300
- """Extract text from HTML"""
301
- # Remove scripts and styles
302
- html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
303
- html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
304
- html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
305
-
306
- # Remove unwanted tags
307
- unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe', 'svg', 'button']
308
- for tag in unwanted_tags:
309
- html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
310
-
311
- # Extract text
312
- text = re.sub(r'<[^>]+>', ' ', html_content)
313
- text = html.unescape(text)
314
-
315
- # Clean up
316
- text = re.sub(r'\s+', ' ', text)
317
 
318
- return text
319
-
320
- def _extract_title_from_html(self, html_content: str) -> str:
321
- """Extract title from HTML"""
322
- # Try <title> tag
323
- title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
324
- if title_match:
325
- title = title_match.group(1)
326
- title = re.sub(r'\s+', ' ', title).strip()
327
- title = html.unescape(title)
328
- if title:
329
- return title[:200]
330
-
331
- # Try meta title
332
- meta_match = re.search(r'<meta[^>]*property=["\']og:title["\'][^>]*content=["\'](.*?)["\']', html_content, re.IGNORECASE)
333
- if meta_match:
334
- title = meta_match.group(1)
335
- title = html.unescape(title).strip()
336
- if title:
337
- return title[:200]
338
-
339
- # Try h1
340
- h1_match = re.search(r'<h1[^>]*>(.*?)</h1>', html_content, re.IGNORECASE | re.DOTALL)
341
- if h1_match:
342
- title = h1_match.group(1)
343
- title = re.sub(r'<[^>]+>', '', title)
344
- title = html.unescape(title).strip()
345
- if title:
346
- return title[:200]
347
 
348
  return ""
349
 
350
- def _extract_title_from_text(self, text: str) -> str:
351
- """Try to extract title from text"""
352
- # Look for title patterns
353
- patterns = [
354
- r'Title:\s*(.*?)(?:\n|$)',
355
- r'#\s+(.*?)(?:\n|$)',
356
- r'<title[^>]*>(.*?)</title>',
 
 
 
 
 
 
 
 
 
 
 
357
  ]
358
 
359
- for pattern in patterns:
360
- match = re.search(pattern, text[:1000], re.IGNORECASE)
361
- if match:
362
- title = match.group(1).strip()
363
- if len(title) > 10 and len(title) < 200:
364
- return title
 
 
 
 
 
 
365
 
366
  return ""
367
 
368
- def _clean_content(self, content: str) -> str:
369
- """Clean and normalize content"""
370
- if not content:
371
  return ""
372
 
373
- # Replace multiple whitespace
374
- content = re.sub(r'\s+', ' ', content)
375
-
376
- # Remove control characters
377
- content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
378
 
379
- # Remove excessive line breaks
380
- content = re.sub(r'\n{3,}', '\n\n', content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
- return content.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
 
384
  # ==============================================
385
  # INITIALIZE
386
  # ==============================================
387
 
388
- extractor = RobustContentExtractor()
389
 
390
  # ==============================================
391
  # FASTAPI APP
392
  # ==============================================
393
 
394
  fastapi_app = FastAPI(
395
- title="Robust Content Extractor",
396
- description="Extracts content with better timeout handling",
397
- version="2.1"
398
  )
399
 
400
  from fastapi.middleware.cors import CORSMiddleware
@@ -411,23 +443,22 @@ fastapi_app.add_middleware(
411
  @fastapi_app.get("/")
412
  async def root():
413
  return {
414
- "service": "Robust Content Extractor",
415
- "version": "2.1",
416
- "description": "Extracts website content with multiple fallback strategies",
417
  "endpoints": {
418
  "GET /": "This info",
419
- "GET /health": "Health check (fast)",
420
- "POST /extract": "Extract content"
421
  }
422
  }
423
 
424
  @fastapi_app.get("/health")
425
  async def health():
426
- """Fast health check endpoint for wake-up calls"""
427
  return {
428
  "status": "healthy",
429
  "timestamp": time.time(),
430
- "service": "content_extractor"
431
  }
432
 
433
  @fastapi_app.post("/extract")
@@ -443,8 +474,8 @@ async def api_extract(request: Request):
443
  content={"success": False, "error": "URL is required"}
444
  )
445
 
446
- print(f"📨 API Request: {url}")
447
- print(f" Starting extraction at {time.strftime('%Y-%m-%d %H:%M:%S')}")
448
 
449
  start_time = time.time()
450
  result = extractor.extract_content(url)
@@ -452,6 +483,7 @@ async def api_extract(request: Request):
452
 
453
  print(f" Extraction completed in {elapsed:.2f}s")
454
  print(f" Success: {result.get('success')}")
 
455
 
456
  return result
457
 
@@ -466,8 +498,7 @@ async def api_extract(request: Request):
466
  status_code=500,
467
  content={
468
  "success": False,
469
- "error": str(e),
470
- "traceback": traceback.format_exc()[:500]
471
  }
472
  )
473
 
@@ -478,53 +509,50 @@ async def api_extract(request: Request):
478
  def gradio_extract(url: str):
479
  """Gradio interface"""
480
  if not url:
481
- return "❌ Please enter a URL", {}
482
 
483
  result = extractor.extract_content(url)
484
 
485
  if result["success"]:
486
  content = result["main_content"]
487
- content_length = result["content_length"]
488
-
489
- preview = content[:500]
490
- if len(content) > 500:
491
- preview += "..."
492
 
493
  output = f"""
494
- ## ✅ Success!
495
 
496
- **URL:** {result['url']}
497
- **Title:** {result.get('title', 'N/A')}
498
- **Method:** {result.get('method', 'extracted')}
499
- **Time:** {result['execution_time']}s
500
- **Characters:** {content_length:,}
501
 
502
- ### Preview:
503
- {preview}
504
  """
505
  return output, result
506
  else:
507
- error = result.get("error", "Unknown error")
508
- return f"## ❌ Error\n\n{error}", result
509
 
510
  # Create Gradio interface
511
  gradio_interface = gr.Interface(
512
  fn=gradio_extract,
513
  inputs=gr.Textbox(
514
- label="Website URL",
515
- placeholder="https://example.com",
516
- value="https://example.com"
517
  ),
518
  outputs=[
519
- gr.Markdown(label="Result"),
520
- gr.JSON(label="API Response")
521
  ],
522
- title="🌐 Robust Content Extractor v2.1",
523
- description="Extracts content with better error handling and multiple fallbacks",
524
  examples=[
525
- ["https://example.com"],
526
- ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
527
- ["https://news.ycombinator.com"]
528
  ]
529
  )
530
 
@@ -540,16 +568,16 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
540
 
541
  if __name__ == "__main__":
542
  print("\n" + "="*60)
543
- print("🌐 Robust Content Extractor v2.1 Starting")
544
  print("="*60)
545
- print("Features:")
546
- print("• Multiple fallback strategies")
547
- print("• Better error handling")
548
- print("• URL encoding support")
549
  print("="*60)
550
- print("API Endpoints:")
551
- print("• GET /health - Fast health check")
552
- print("• POST /extract - Extract content")
553
  print("="*60 + "\n")
554
 
555
  uvicorn.run(
 
1
  # ==============================================
2
+ # IMPROVED CONTENT EXTRACTOR FOR NEWS SITES
3
  # ==============================================
4
 
5
  import gradio as gr
 
12
  from fastapi import FastAPI, Request
13
  import uvicorn
14
  import traceback
15
+ from bs4 import BeautifulSoup
16
 
17
  # ==============================================
18
+ # NEWS-SPECIFIC CONTENT EXTRACTOR
19
  # ==============================================
20
 
21
+ class NewsContentExtractor:
22
+ """Content extractor specifically optimized for news websites"""
23
 
24
  def __init__(self):
25
  self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
26
 
27
  def extract_content(self, url: str) -> Dict[str, Any]:
28
+ """Extract news content with article-focused extraction"""
29
  start_time = time.time()
30
 
31
+ print(f"📰 Extracting news from: {url}")
32
 
33
  # Ensure URL has protocol
34
  if not url.startswith(('http://', 'https://')):
35
  url = 'https://' + url
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  # Try multiple strategies
38
  strategies = [
39
+ self._try_direct_extract, # Direct extraction with BeautifulSoup
40
+ self._try_jina_reader, # Jina Reader
41
+ self._try_simple_extract, # Simple fallback
 
42
  ]
43
 
44
+ best_result = None
45
+ best_score = 0
46
 
47
  for i, strategy in enumerate(strategies):
48
  try:
 
50
  result = strategy(url)
51
 
52
  if result.get("success"):
53
+ # Score the result based on content quality
54
+ score = self._score_content(result.get("main_content", ""))
55
+ result["score"] = score
56
+
57
+ if score > best_score:
58
+ best_score = score
59
+ best_result = result
60
+ print(f" ✓ Strategy {i+1} score: {score}")
61
 
62
  except Exception as e:
 
63
  print(f" Strategy {i+1} failed: {e}")
64
+ time.sleep(0.5)
65
+
66
+ if best_result and best_score > 10: # Minimum score threshold
67
+ best_result["execution_time"] = round(time.time() - start_time, 2)
68
+ best_result["method"] = "best_extraction"
69
+ return best_result
70
 
71
+ # All failed or low quality
72
  return {
73
  "success": False,
74
  "url": url,
75
+ "error": "Could not extract quality news content",
76
  "execution_time": round(time.time() - start_time, 2),
77
+ "suggestion": "Website might have anti-scraping protection"
78
  }
79
 
80
def _try_direct_extract(self, url: str) -> Dict[str, Any]:
    """Fetch *url* directly and extract the article body with BeautifulSoup.

    The title and publication date are read from the untouched document
    *before* boilerplate elements are removed: the pruning step deletes
    ``<meta>`` and ``<header>`` nodes, which are exactly where ``og:title``,
    ``article:published_time`` and many ``<time>``/``<h1>`` elements live.

    Article text is then located with three fallbacks, in order:
    known article container selectors, all ``<p>`` elements joined, and
    finally the first large ``div``/``section`` that does not look like
    legal/advertising boilerplate.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A result dict. On success it has ``success=True`` plus ``url``,
        ``title``, ``date``, ``main_content``, ``content_length``,
        ``content_preview``, ``source`` and ``status``. On failure it has
        ``success=False`` and an ``error`` message. Exceptions are caught
        and reported through the failure dict rather than raised.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
            "Accept-Encoding": "gzip, deflate",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Cache-Control": "max-age=0",
        }

        # NOTE(review): verify=False disables TLS certificate validation.
        # Kept because the original relies on it for sites with broken
        # certificates, but it allows man-in-the-middle tampering.
        response = requests.get(url, headers=headers, timeout=15, verify=False)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Read metadata first; the pruning below removes <meta>/<header>.
            title = self._extract_title(soup)
            if not title:
                title_match = soup.find('title')
                title = title_match.get_text(strip=True) if title_match else "新闻标题"
            date = self._extract_date(soup)

            # Remove unwanted elements
            for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer',
                                           'aside', 'form', 'iframe', 'button', 'svg',
                                           'link', 'meta', 'noscript']):
                unwanted.decompose()

            article_text = ""

            # Strategy 1: look for article-specific containers
            article_selectors = [
                'article', '.article-content', '.post-content', '.entry-content',
                '.news-content', '.content-area', '.main-content',
                'div[class*="article"]', 'div[class*="content"]',
                'div[class*="post"]', 'div[class*="entry"]',
                'div[itemprop="articleBody"]', 'div[class*="story"]'
            ]

            for selector in article_selectors:
                article = soup.select_one(selector)
                if article:
                    article_text = article.get_text(separator='\n', strip=True)
                    if len(article_text) > 300:  # Minimum content length
                        print(f" Found content with selector: {selector}")
                        break

            # Strategy 2: fall back to every paragraph on the page
            if len(article_text) < 300:
                all_paragraphs = soup.find_all('p')
                if len(all_paragraphs) > 3:
                    article_text = '\n'.join(p.get_text(strip=True) for p in all_paragraphs)

            # Strategy 3: first large block that does not look like boilerplate
            if len(article_text) < 300:
                boilerplate_words = ('cookie', 'privacy', 'copyright', 'advertisement')
                for div in soup.find_all(['div', 'section']):
                    text = div.get_text(separator='\n', strip=True)
                    if len(text) <= 500 or text.count('\n') <= 5:
                        continue
                    lowered = text.lower()
                    if not any(word in lowered for word in boilerplate_words):
                        article_text = text
                        break

            if article_text:
                cleaned_text = self._clean_news_content(article_text)

                return {
                    "success": True,
                    "url": url,
                    "title": title[:200],
                    "date": date,
                    "main_content": cleaned_text,
                    "content_length": len(cleaned_text),
                    "content_preview": cleaned_text[:500] + ("..." if len(cleaned_text) > 500 else ""),
                    "source": "direct_extraction",
                    "status": response.status_code
                }

        return {"success": False, "error": f"Status: {response.status_code}"}

    except Exception as e:
        return {"success": False, "error": f"Direct extract error: {str(e)}"}
176
+
177
def _try_jina_reader(self, url: str) -> Dict[str, Any]:
    """Fetch a plain-text rendering of *url* through the Jina Reader proxy.

    The title is taken from the first of the leading ten lines that starts
    with ``Title:`` or a Markdown ``# `` heading. Only that leading marker
    is removed, so titles that themselves contain ``# `` or ``Title:``
    (for example "C# tips") are preserved intact.

    Args:
        url: Absolute URL of the page to read.

    Returns:
        A result dict with ``success=True`` and the extracted fields, or
        ``success=False`` with an ``error`` message. Exceptions are caught
        and reported through the failure dict rather than raised.
    """
    try:
        jina_url = f"https://r.jina.ai/{url}"

        response = requests.get(
            jina_url,
            headers={"Accept": "text/plain"},
            timeout=20
        )

        if response.status_code == 200:
            content = response.text

            # Clean the content
            cleaned = self._clean_news_content(content)

            # Extract title from the Jina response header lines
            title = "Jina提取内容"
            for line in content.split('\n')[:10]:
                if line.startswith('Title:'):
                    candidate = line[len('Title:'):].strip()
                elif line.startswith('# '):
                    candidate = line[len('# '):].strip()
                else:
                    continue
                if candidate:
                    # Skip empty markers and keep looking for a real title.
                    title = candidate
                    break

            return {
                "success": True,
                "url": url,
                "title": title[:200],
                "main_content": cleaned,
                "content_length": len(cleaned),
                "content_preview": cleaned[:500] + ("..." if len(cleaned) > 500 else ""),
                "source": "jina_reader",
                "status": response.status_code
            }

        return {"success": False, "error": f"Jina status: {response.status_code}"}

    except Exception as e:
        return {"success": False, "error": f"Jina error: {str(e)}"}
217
 
218
def _try_simple_extract(self, url: str) -> Dict[str, Any]:
    """Last-resort extraction: take the page's visible text, line by line.

    ``script``, ``style`` and ``noscript`` elements are removed before the
    text is read, because ``get_text()`` otherwise returns their raw
    JavaScript/CSS as if it were article content. The remaining lines are
    kept only if they are longer than 20 characters, contain none of the
    boilerplate keywords, and are not purely numeric. At most the first
    100 surviving lines are returned.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A result dict with ``success=True`` when more than 200 characters
        of text survive filtering, otherwise ``success=False`` with an
        ``error`` message. Exceptions are caught and reported through the
        failure dict rather than raised.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate validation;
        # kept to match the other strategies in this class.
        response = requests.get(url, timeout=10, verify=False)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Read the title before any pruning of the tree.
            title = soup.find('title')
            title_text = title.get_text(strip=True) if title else "新闻内容"

            # Drop non-visible content so code does not leak into the text.
            for hidden in soup.find_all(['script', 'style', 'noscript']):
                hidden.decompose()

            all_text = soup.get_text(separator='\n', strip=True)

            boilerplate_words = ('cookie', 'privacy', 'copyright',
                                 'advertisement', 'newsletter', 'subscribe',
                                 'follow us', 'share this')
            meaningful_lines = []
            for line in all_text.split('\n'):
                line = line.strip()
                if len(line) <= 20:
                    continue
                lowered = line.lower()
                if any(word in lowered for word in boilerplate_words):
                    continue
                if re.match(r'^[0-9\.\-\s]+$', line):  # Skip number-only lines
                    continue
                meaningful_lines.append(line)

            cleaned_text = '\n'.join(meaningful_lines[:100])  # Take top 100 lines

            if len(cleaned_text) > 200:
                return {
                    "success": True,
                    "url": url,
                    "title": title_text[:150],
                    "main_content": cleaned_text,
                    "content_length": len(cleaned_text),
                    "source": "simple_extract"
                }

        return {"success": False, "error": "Simple extraction failed"}

    except Exception as e:
        return {"success": False, "error": str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
+ def _extract_title(self, soup) -> str:
263
+ """Extract title from BeautifulSoup object"""
264
+ # Try multiple title sources
265
+ title_sources = [
266
+ soup.find('title'),
267
+ soup.find('h1'),
268
+ soup.find('meta', property='og:title'),
269
+ soup.find('meta', attrs={'name': 'title'}),
270
+ soup.find('h2', class_=re.compile(r'title|heading')),
271
+ ]
 
 
 
 
 
 
 
 
272
 
273
+ for source in title_sources:
274
+ if source:
275
+ if hasattr(source, 'get'):
276
+ content = source.get('content', '') if source.name == 'meta' else source.get_text(strip=True)
277
+ if content and len(content) > 5 and len(content) < 200:
278
+ return content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
  return ""
281
 
282
+ def _extract_date(self, soup) -> str:
283
+ """Extract date from BeautifulSoup object"""
284
+ date_patterns = [
285
+ r'\d{4}[-/]\d{2}[-/]\d{2}',
286
+ r'\d{2}[-/]\d{2}[-/]\d{4}',
287
+ r'\d{1,2}\s+\w+\s+\d{4}',
288
+ ]
289
+
290
+ # Look in common date locations
291
+ date_selectors = [
292
+ 'time',
293
+ '.date',
294
+ '.published',
295
+ '.post-date',
296
+ '.article-date',
297
+ 'meta[property="article:published_time"]',
298
+ 'meta[name="pubdate"]',
299
+ 'meta[name="date"]',
300
  ]
301
 
302
+ for selector in date_selectors:
303
+ elements = soup.select(selector)
304
+ for element in elements:
305
+ if element.name == 'meta':
306
+ date_str = element.get('content', '')
307
+ else:
308
+ date_str = element.get_text(strip=True) or element.get('datetime', '')
309
+
310
+ for pattern in date_patterns:
311
+ match = re.search(pattern, date_str)
312
+ if match:
313
+ return match.group()
314
 
315
  return ""
316
 
317
+ def _clean_news_content(self, text: str) -> str:
318
+ """Clean and format news content"""
319
+ if not text:
320
  return ""
321
 
322
+ # Remove excessive whitespace
323
+ text = re.sub(r'\s+', ' ', text)
 
 
 
324
 
325
+ # Remove common unwanted patterns
326
+ unwanted_patterns = [
327
+ r'adsbygoogle.*?\[\]\]',
328
+ r'ADVERTISEMENT',
329
+ r'Sponsored Content',
330
+ r'Sign up for.*?newsletter',
331
+ r'Subscribe to.*?channel',
332
+ r'Follow us on.*',
333
+ r'Share this.*',
334
+ r'Like us on.*',
335
+ r'Read more.*',
336
+ r'Continue reading.*',
337
+ r'点击这里.*',
338
+ r'更多新闻.*',
339
+ r'相关新闻.*',
340
+ r'热门搜索.*',
341
+ r'大事件.*',
342
+ r'Copyright.*All rights reserved',
343
+ r'本网站.*Cookies',
344
+ r'了解更多.*',
345
+ r'接受.*',
346
+ r'简\s*繁',
347
+ r'登入.*',
348
+ r'下载APP.*',
349
+ r'首页.*最新.*头条.*',
350
+ r'[\*\-\=]{5,}', # Multiple special characters
351
+ ]
352
 
353
+ for pattern in unwanted_patterns:
354
+ text = re.sub(pattern, '', text, flags=re.IGNORECASE)
355
+
356
+ # Remove very short lines (likely navigation)
357
+ lines = text.split('\n')
358
+ cleaned_lines = []
359
+ for line in lines:
360
+ line = line.strip()
361
+ if (len(line) > 15 and
362
+ not line.startswith(('http://', 'https://', 'www.')) and
363
+ not re.match(r'^[\d\s\.\-]+$', line)):
364
+ cleaned_lines.append(line)
365
+
366
+ text = '\n'.join(cleaned_lines)
367
+
368
+ # Remove duplicate consecutive lines
369
+ lines = text.split('\n')
370
+ unique_lines = []
371
+ for i, line in enumerate(lines):
372
+ if i == 0 or line != lines[i-1]:
373
+ unique_lines.append(line)
374
+
375
+ return '\n'.join(unique_lines).strip()
376
+
377
+ def _score_content(self, text: str) -> int:
378
+ """Score content quality based on various factors"""
379
+ if not text:
380
+ return 0
381
+
382
+ score = 0
383
+
384
+ # Length-based scoring
385
+ length = len(text)
386
+ if length > 1000:
387
+ score += 30
388
+ elif length > 500:
389
+ score += 20
390
+ elif length > 200:
391
+ score += 10
392
+
393
+ # Paragraph count (rough estimate)
394
+ paragraphs = text.count('\n\n') + 1
395
+ if paragraphs > 5:
396
+ score += 20
397
+ elif paragraphs > 3:
398
+ score += 10
399
+
400
+ # News indicators
401
+ news_keywords = ['报道', '新闻', '记者', '警方', '调查', '发生', '表示', '指出',
402
+ '据知', '据了解', '据悉', '事件', '事故', '案件']
403
+
404
+ for keyword in news_keywords:
405
+ if keyword in text:
406
+ score += 2
407
+
408
+ # Penalize for unwanted content
409
+ unwanted_terms = ['cookie', 'privacy', 'copyright', 'advertisement', 'newsletter']
410
+ for term in unwanted_terms:
411
+ if term.lower() in text.lower():
412
+ score -= 5
413
+
414
+ return max(0, score)
415
 
416
  # ==============================================
417
  # INITIALIZE
418
  # ==============================================
419
 
420
+ extractor = NewsContentExtractor()
421
 
422
  # ==============================================
423
  # FASTAPI APP
424
  # ==============================================
425
 
426
  fastapi_app = FastAPI(
427
+ title="News Content Extractor",
428
+ description="Extracts news article content with BeautifulSoup",
429
+ version="3.0"
430
  )
431
 
432
  from fastapi.middleware.cors import CORSMiddleware
 
443
  @fastapi_app.get("/")
444
  async def root():
445
  return {
446
+ "service": "News Content Extractor",
447
+ "version": "3.0",
448
+ "description": "Extracts news article content using BeautifulSoup",
449
  "endpoints": {
450
  "GET /": "This info",
451
+ "GET /health": "Health check",
452
+ "POST /extract": "Extract news content"
453
  }
454
  }
455
 
456
  @fastapi_app.get("/health")
457
  async def health():
 
458
  return {
459
  "status": "healthy",
460
  "timestamp": time.time(),
461
+ "service": "news_extractor"
462
  }
463
 
464
  @fastapi_app.post("/extract")
 
474
  content={"success": False, "error": "URL is required"}
475
  )
476
 
477
+ print(f"📰 API Request for news: {url}")
478
+ print(f" Starting at {time.strftime('%Y-%m-%d %H:%M:%S')}")
479
 
480
  start_time = time.time()
481
  result = extractor.extract_content(url)
 
483
 
484
  print(f" Extraction completed in {elapsed:.2f}s")
485
  print(f" Success: {result.get('success')}")
486
+ print(f" Content length: {result.get('content_length', 0)}")
487
 
488
  return result
489
 
 
498
  status_code=500,
499
  content={
500
  "success": False,
501
+ "error": str(e)
 
502
  }
503
  )
504
 
 
509
  def gradio_extract(url: str):
510
  """Gradio interface"""
511
  if not url:
512
+ return "❌ 请输入URL", {}
513
 
514
  result = extractor.extract_content(url)
515
 
516
  if result["success"]:
517
  content = result["main_content"]
518
+ title = result.get("title", "无标题")
519
+ date = result.get("date", "")
 
 
 
520
 
521
  output = f"""
522
+ ## ✅ 提取成功!
523
 
524
+ **标题:** {title}
525
+ **日期:** {date if date else "未提取到日期"}
526
+ **方法:** {result.get('method', '提取')}
527
+ **时间:** {result['execution_time']}s
528
+ **字符数:** {result['content_length']:,}
529
 
530
+ ### 内容预览:
531
+ {content[:800]}{"..." if len(content) > 800 else ""}
532
  """
533
  return output, result
534
  else:
535
+ error = result.get("error", "未知错误")
536
+ return f"## ❌ 错误\n\n{error}", result
537
 
538
  # Create Gradio interface
539
  gradio_interface = gr.Interface(
540
  fn=gradio_extract,
541
  inputs=gr.Textbox(
542
+ label="新闻URL",
543
+ placeholder="https://example.com/news",
544
+ value="https://northern.sinchew.com.my/?p=7217886"
545
  ),
546
  outputs=[
547
+ gr.Markdown(label="结果"),
548
+ gr.JSON(label="API响应")
549
  ],
550
+ title="📰 新闻内容提取器",
551
+ description="使用BeautifulSoup提取新闻文章内容",
552
  examples=[
553
+ ["https://northern.sinchew.com.my/?p=7217886"],
554
+ ["https://www.sinchew.com.my/?p=7234965"],
555
+ ["https://example.com"]
556
  ]
557
  )
558
 
 
568
 
569
  if __name__ == "__main__":
570
  print("\n" + "="*60)
571
+ print("📰 新闻内容提取器 v3.0 启动")
572
  print("="*60)
573
+ print("特性:")
574
+ print("• 使用BeautifulSoup进行HTML解析")
575
+ print("• 专门针对新闻网站优化")
576
+ print("• 智能内容评分系统")
577
  print("="*60)
578
+ print("API端点:")
579
+ print("• GET /health - 健康检查")
580
+ print("• POST /extract - 提取新闻内容")
581
  print("="*60 + "\n")
582
 
583
  uvicorn.run(