yukee1992 committed on
Commit
5d4e21f
·
verified ·
1 Parent(s): 2448858

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -97
app.py CHANGED
@@ -11,6 +11,7 @@ import html
11
  from typing import Dict, Any
12
  from fastapi import FastAPI, Request
13
  import uvicorn
 
14
 
15
  # ==============================================
16
  # IMPROVED CONTENT EXTRACTOR
@@ -32,11 +33,30 @@ class RobustContentExtractor:
32
  if not url.startswith(('http://', 'https://')):
33
  url = 'https://' + url
34
 
35
- # Try multiple strategies with shorter timeouts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  strategies = [
37
  self._try_jina_reader_fast, # Faster timeout
38
  self._try_direct_request, # Direct attempt
39
  self._try_simple_request, # Simple headers
 
40
  ]
41
 
42
  last_error = None
@@ -49,12 +69,13 @@ class RobustContentExtractor:
49
  if result.get("success"):
50
  result["execution_time"] = round(time.time() - start_time, 2)
51
  result["method"] = f"strategy_{i+1}"
 
52
  return result
53
 
54
  except Exception as e:
55
  last_error = str(e)
56
  print(f" Strategy {i+1} failed: {e}")
57
- time.sleep(0.5)
58
 
59
  # All failed
60
  return {
@@ -68,88 +89,98 @@ class RobustContentExtractor:
68
  def _try_jina_reader_fast(self, url: str) -> Dict[str, Any]:
69
  """Try Jina Reader with shorter timeout"""
70
  try:
71
- # Try with shorter timeout first
72
  jina_url = f"https://r.jina.ai/{url}"
73
 
74
- # Try multiple approaches
75
- attempts = [
76
- {"timeout": 15, "headers": {"Accept": "text/plain"}},
77
- {"timeout": 20, "headers": {"Accept": "application/json"}},
78
- {"timeout": 25, "headers": {"User-Agent": self.user_agent}},
79
- ]
 
 
 
80
 
81
- for attempt in attempts:
 
 
 
82
  try:
83
- response = requests.get(
84
- jina_url,
85
- headers=attempt["headers"],
86
- timeout=attempt["timeout"]
87
- )
88
-
89
- if response.status_code == 200:
90
- content = response.text
91
-
92
- # Try to parse as JSON
93
- try:
94
- data = json.loads(content)
95
- if isinstance(data, dict):
96
- if "content" in data:
97
- content = data["content"]
98
- elif "data" in data:
99
- content = str(data["data"])
100
- except:
101
- pass # Keep as plain text
102
-
103
- # Extract title
104
- title = self._extract_title_from_text(content)
105
-
106
- # Clean content
107
- cleaned = self._clean_content(content)
108
-
109
- return {
110
- "success": True,
111
- "url": url,
112
- "title": title[:300] if title else "Extracted via Jina",
113
- "main_content": cleaned[:25000],
114
- "content_length": len(cleaned),
115
- "content_preview": cleaned[:800] + ("..." if len(cleaned) > 800 else ""),
116
- "source": "jina_reader",
117
- "status": response.status_code
118
- }
119
-
120
- except requests.exceptions.Timeout:
121
- print(f" Jina timeout after {attempt['timeout']}s, trying next...")
122
- continue
123
- except Exception as e:
124
- print(f" Jina attempt failed: {e}")
125
- continue
126
 
127
- return {"success": False, "error": "Jina Reader timed out"}
128
 
 
 
 
129
  except Exception as e:
 
130
  return {"success": False, "error": f"Jina error: {str(e)}"}
131
 
132
  def _try_direct_request(self, url: str) -> Dict[str, Any]:
133
  """Try direct request with various headers"""
134
  headers_list = [
135
  {
136
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
137
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
138
  "Accept-Language": "en-US,en;q=0.9",
 
 
 
 
 
 
 
 
 
139
  },
140
  {
141
- "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15",
142
- "Accept": "text/html",
143
  },
144
  {
145
  "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
146
- "Accept": "text/html",
147
  },
148
  ]
149
 
150
- for headers in headers_list:
151
  try:
152
- response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
 
 
 
 
 
 
 
 
 
153
 
154
  if response.status_code == 200:
155
  html_content = response.text
@@ -168,26 +199,33 @@ class RobustContentExtractor:
168
  "title": title[:300] if title else "Direct extraction",
169
  "main_content": cleaned[:20000],
170
  "content_length": len(cleaned),
171
- "source": "direct_request",
172
  "status": response.status_code
173
  }
174
 
 
 
 
175
  except Exception as e:
176
- print(f" Direct request failed: {e}")
177
  continue
178
 
179
- return {"success": False}
180
 
181
  def _try_simple_request(self, url: str) -> Dict[str, Any]:
182
  """Simple request with minimal headers"""
183
  try:
 
184
  response = requests.get(
185
  url,
186
  headers={"User-Agent": "Mozilla/5.0"},
187
  timeout=8,
188
- allow_redirects=True
 
189
  )
190
 
 
 
191
  if response.status_code == 200:
192
  html_content = response.text
193
  text_content = self._extract_from_html(html_content)
@@ -204,11 +242,60 @@ class RobustContentExtractor:
204
  "source": "simple_request"
205
  }
206
 
207
- return {"success": False}
208
 
209
  except Exception as e:
210
  return {"success": False, "error": str(e)}
211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  def _extract_from_html(self, html_content: str) -> str:
213
  """Extract text from HTML"""
214
  # Remove scripts and styles
@@ -217,7 +304,7 @@ class RobustContentExtractor:
217
  html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
218
 
219
  # Remove unwanted tags
220
- unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe']
221
  for tag in unwanted_tags:
222
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
223
 
@@ -232,12 +319,32 @@ class RobustContentExtractor:
232
 
233
  def _extract_title_from_html(self, html_content: str) -> str:
234
  """Extract title from HTML"""
235
- title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
 
236
  if title_match:
237
  title = title_match.group(1)
238
  title = re.sub(r'\s+', ' ', title).strip()
239
  title = html.unescape(title)
240
- return title
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  return ""
242
 
243
  def _extract_title_from_text(self, text: str) -> str:
@@ -272,22 +379,6 @@ class RobustContentExtractor:
272
  # Remove excessive line breaks
273
  content = re.sub(r'\n{3,}', '\n\n', content)
274
 
275
- # Remove common unwanted phrases
276
- unwanted = [
277
- r'adsbygoogle',
278
- r'advertisement',
279
- r'sponsored content',
280
- r'sign up for',
281
- r'subscribe to',
282
- r'follow us on',
283
- r'share this article',
284
- r'read more',
285
- r'continue reading',
286
- ]
287
-
288
- for phrase in unwanted:
289
- content = re.sub(phrase, '', content, flags=re.IGNORECASE)
290
-
291
  return content.strip()
292
 
293
  # ==============================================
@@ -303,7 +394,7 @@ extractor = RobustContentExtractor()
303
  fastapi_app = FastAPI(
304
  title="Robust Content Extractor",
305
  description="Extracts content with better timeout handling",
306
- version="2.0"
307
  )
308
 
309
  from fastapi.middleware.cors import CORSMiddleware
@@ -321,14 +412,13 @@ fastapi_app.add_middleware(
321
  async def root():
322
  return {
323
  "service": "Robust Content Extractor",
324
- "version": "2.0",
325
- "description": "Extracts website content with better error handling",
326
  "endpoints": {
327
  "GET /": "This info",
328
  "GET /health": "Health check (fast)",
329
  "POST /extract": "Extract content"
330
- },
331
- "timeout_notes": "Jina Reader timeout reduced to 15-25 seconds for faster response"
332
  }
333
 
334
  @fastapi_app.get("/health")
@@ -354,7 +444,14 @@ async def api_extract(request: Request):
354
  )
355
 
356
  print(f"📨 API Request: {url}")
 
 
 
357
  result = extractor.extract_content(url)
 
 
 
 
358
 
359
  return result
360
 
@@ -364,9 +461,14 @@ async def api_extract(request: Request):
364
  content={"success": False, "error": "Invalid JSON"}
365
  )
366
  except Exception as e:
 
367
  return JSONResponse(
368
  status_code=500,
369
- content={"success": False, "error": str(e)}
 
 
 
 
370
  )
371
 
372
  # ==============================================
@@ -417,8 +519,8 @@ gradio_interface = gr.Interface(
417
  gr.Markdown(label="Result"),
418
  gr.JSON(label="API Response")
419
  ],
420
- title="🌐 Robust Content Extractor",
421
- description="Extracts content with better error handling and timeouts",
422
  examples=[
423
  ["https://example.com"],
424
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
@@ -438,15 +540,15 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
438
 
439
  if __name__ == "__main__":
440
  print("\n" + "="*60)
441
- print("🌐 Robust Content Extractor Starting")
442
  print("="*60)
443
  print("Features:")
444
- print("• Faster timeouts (15-25s for Jina)")
445
  print("• Multiple fallback strategies")
446
- print("• Fast health endpoint for wake-up")
 
447
  print("="*60)
448
  print("API Endpoints:")
449
- print("• GET /health - Fast health check (for wake-up)")
450
  print("• POST /extract - Extract content")
451
  print("="*60 + "\n")
452
 
 
11
  from typing import Dict, Any
12
  from fastapi import FastAPI, Request
13
  import uvicorn
14
+ import traceback
15
 
16
  # ==============================================
17
  # IMPROVED CONTENT EXTRACTOR
 
33
  if not url.startswith(('http://', 'https://')):
34
  url = 'https://' + url
35
 
36
+ # Clean URL - remove any problematic characters
37
+ try:
38
+ from urllib.parse import quote, urlparse, urlunparse
39
+ parsed = urlparse(url)
40
+ # Only encode the path and query
41
+ encoded_path = quote(parsed.path, safe='/')
42
+ encoded_query = quote(parsed.query, safe='=&')
43
+ url = urlunparse((
44
+ parsed.scheme,
45
+ parsed.netloc,
46
+ encoded_path,
47
+ parsed.params,
48
+ encoded_query,
49
+ parsed.fragment
50
+ ))
51
+ except:
52
+ pass # Keep original if encoding fails
53
+
54
+ # Try multiple strategies
55
  strategies = [
56
  self._try_jina_reader_fast, # Faster timeout
57
  self._try_direct_request, # Direct attempt
58
  self._try_simple_request, # Simple headers
59
+ self._try_fallback_request, # Fallback with different settings
60
  ]
61
 
62
  last_error = None
 
69
  if result.get("success"):
70
  result["execution_time"] = round(time.time() - start_time, 2)
71
  result["method"] = f"strategy_{i+1}"
72
+ print(f" ✓ Strategy {i+1} succeeded")
73
  return result
74
 
75
  except Exception as e:
76
  last_error = str(e)
77
  print(f" Strategy {i+1} failed: {e}")
78
+ time.sleep(1) # Short pause between strategies
79
 
80
  # All failed
81
  return {
 
89
def _try_jina_reader_fast(self, url: str) -> Dict[str, Any]:
    """Fetch *url* through the Jina Reader proxy (https://r.jina.ai) with a short timeout.

    Returns a dict with ``"success": True`` plus title/content fields on
    success, or ``"success": False`` with an ``"error"`` message on any
    failure (non-200 status, timeout, or unexpected exception).
    """
    try:
        # Jina Reader proxies the target URL and returns a readable text rendering.
        jina_url = f"https://r.jina.ai/{url}"

        response = requests.get(
            jina_url,
            headers={
                "Accept": "text/plain",
                "User-Agent": self.user_agent
            },
            timeout=12  # short timeout so the slower fallback strategies still get a chance
        )

        if response.status_code == 200:
            content = response.text

            # Jina may answer with a JSON envelope; unwrap the payload if so.
            # Was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt
            # are no longer swallowed here.
            try:
                data = json.loads(content)
                if isinstance(data, dict):
                    if "content" in data:
                        content = data["content"]
                    elif "data" in data:
                        content = str(data["data"])
            except (json.JSONDecodeError, ValueError):
                pass  # Not JSON — keep as plain text

            # Extract title
            title = self._extract_title_from_text(content)

            # Clean content
            cleaned = self._clean_content(content)

            return {
                "success": True,
                "url": url,
                "title": title[:300] if title else "Extracted via Jina",
                "main_content": cleaned[:25000],
                "content_length": len(cleaned),
                "content_preview": cleaned[:800] + ("..." if len(cleaned) > 800 else ""),
                "source": "jina_reader",
                "status": response.status_code
            }

        return {"success": False, "error": f"Jina status: {response.status_code}"}

    except requests.exceptions.Timeout:
        # Plain string (original was an f-string with no placeholders).
        print(" Jina timeout after 12s, trying next strategy...")
        return {"success": False, "error": "Jina Reader timed out"}
    except Exception as e:
        print(f" Jina error: {e}")
        return {"success": False, "error": f"Jina error: {str(e)}"}
144
 
145
  def _try_direct_request(self, url: str) -> Dict[str, Any]:
146
  """Try direct request with various headers"""
147
  headers_list = [
148
  {
149
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
150
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
151
  "Accept-Language": "en-US,en;q=0.9",
152
+ "Accept-Encoding": "gzip, deflate, br",
153
+ "DNT": "1",
154
+ "Connection": "keep-alive",
155
+ "Upgrade-Insecure-Requests": "1",
156
+ "Sec-Fetch-Dest": "document",
157
+ "Sec-Fetch-Mode": "navigate",
158
+ "Sec-Fetch-Site": "none",
159
+ "Sec-Fetch-User": "?1",
160
+ "Cache-Control": "max-age=0",
161
  },
162
  {
163
+ "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
164
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
165
  },
166
  {
167
  "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
168
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
169
  },
170
  ]
171
 
172
+ for i, headers in enumerate(headers_list):
173
  try:
174
+ print(f" Direct attempt {i+1}...")
175
+ response = requests.get(
176
+ url,
177
+ headers=headers,
178
+ timeout=10,
179
+ allow_redirects=True,
180
+ verify=False # Try without SSL verification
181
+ )
182
+
183
+ print(f" Status: {response.status_code}")
184
 
185
  if response.status_code == 200:
186
  html_content = response.text
 
199
  "title": title[:300] if title else "Direct extraction",
200
  "main_content": cleaned[:20000],
201
  "content_length": len(cleaned),
202
+ "source": f"direct_request_{i+1}",
203
  "status": response.status_code
204
  }
205
 
206
+ except requests.exceptions.Timeout:
207
+ print(f" Direct request {i+1} timed out")
208
+ continue
209
  except Exception as e:
210
+ print(f" Direct request {i+1} error: {e}")
211
  continue
212
 
213
+ return {"success": False, "error": "All direct attempts failed"}
214
 
215
  def _try_simple_request(self, url: str) -> Dict[str, Any]:
216
  """Simple request with minimal headers"""
217
  try:
218
+ print(" Simple request attempt...")
219
  response = requests.get(
220
  url,
221
  headers={"User-Agent": "Mozilla/5.0"},
222
  timeout=8,
223
+ allow_redirects=True,
224
+ verify=False
225
  )
226
 
227
+ print(f" Simple status: {response.status_code}")
228
+
229
  if response.status_code == 200:
230
  html_content = response.text
231
  text_content = self._extract_from_html(html_content)
 
242
  "source": "simple_request"
243
  }
244
 
245
+ return {"success": False, "error": f"Status: {response.status_code}"}
246
 
247
  except Exception as e:
248
  return {"success": False, "error": str(e)}
249
 
250
def _try_fallback_request(self, url: str) -> Dict[str, Any]:
    """Last-resort fetch: a plain ``requests.Session`` with browser-like headers.

    Returns ``{"success": True, ...}`` with loosely extracted page text, or
    ``{"success": False, "error": ...}`` when the page cannot be fetched or
    yields too little text.
    """
    try:
        print(" Fallback attempt...")

        # Context manager so the session's connection pool is released even
        # when the request raises (the original session was never closed).
        with requests.Session() as session:
            session.headers.update({
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                "Accept": "text/html",
            })

            # SECURITY NOTE: verify=False disables TLS certificate checking.
            # This is a deliberate last-ditch fallback, but the response must
            # be treated as untrusted content.
            response = session.get(url, timeout=15, allow_redirects=True, verify=False)

        if response.status_code == 200:
            html_content = response.text

            # Very simple text extraction
            text = self._simple_text_extraction(html_content)

            # Require a minimum body length so an empty/error page does not
            # count as a successful extraction.
            if len(text) > 50:
                return {
                    "success": True,
                    "url": url,
                    "title": "Fallback extraction",
                    "main_content": text[:10000],
                    "content_length": len(text),
                    "source": "fallback",
                    "status": response.status_code
                }

        return {"success": False, "error": f"Fallback status: {response.status_code}"}

    except Exception as e:
        return {"success": False, "error": f"Fallback error: {str(e)}"}
285
+
286
+ def _simple_text_extraction(self, html_content: str) -> str:
287
+ """Very simple text extraction"""
288
+ # Remove scripts and styles
289
+ html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
290
+ html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
291
+
292
+ # Extract text between tags
293
+ text = re.sub(r'<[^>]+>', ' ', html_content)
294
+ text = html.unescape(text)
295
+ text = re.sub(r'\s+', ' ', text)
296
+
297
+ return text.strip()
298
+
299
  def _extract_from_html(self, html_content: str) -> str:
300
  """Extract text from HTML"""
301
  # Remove scripts and styles
 
304
  html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
305
 
306
  # Remove unwanted tags
307
+ unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe', 'svg', 'button']
308
  for tag in unwanted_tags:
309
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
310
 
 
319
 
320
  def _extract_title_from_html(self, html_content: str) -> str:
321
  """Extract title from HTML"""
322
+ # Try <title> tag
323
+ title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
324
  if title_match:
325
  title = title_match.group(1)
326
  title = re.sub(r'\s+', ' ', title).strip()
327
  title = html.unescape(title)
328
+ if title:
329
+ return title[:200]
330
+
331
+ # Try meta title
332
+ meta_match = re.search(r'<meta[^>]*property=["\']og:title["\'][^>]*content=["\'](.*?)["\']', html_content, re.IGNORECASE)
333
+ if meta_match:
334
+ title = meta_match.group(1)
335
+ title = html.unescape(title).strip()
336
+ if title:
337
+ return title[:200]
338
+
339
+ # Try h1
340
+ h1_match = re.search(r'<h1[^>]*>(.*?)</h1>', html_content, re.IGNORECASE | re.DOTALL)
341
+ if h1_match:
342
+ title = h1_match.group(1)
343
+ title = re.sub(r'<[^>]+>', '', title)
344
+ title = html.unescape(title).strip()
345
+ if title:
346
+ return title[:200]
347
+
348
  return ""
349
 
350
  def _extract_title_from_text(self, text: str) -> str:
 
379
  # Remove excessive line breaks
380
  content = re.sub(r'\n{3,}', '\n\n', content)
381
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
  return content.strip()
383
 
384
  # ==============================================
 
394
  fastapi_app = FastAPI(
395
  title="Robust Content Extractor",
396
  description="Extracts content with better timeout handling",
397
+ version="2.1"
398
  )
399
 
400
  from fastapi.middleware.cors import CORSMiddleware
 
412
  async def root():
413
  return {
414
  "service": "Robust Content Extractor",
415
+ "version": "2.1",
416
+ "description": "Extracts website content with multiple fallback strategies",
417
  "endpoints": {
418
  "GET /": "This info",
419
  "GET /health": "Health check (fast)",
420
  "POST /extract": "Extract content"
421
+ }
 
422
  }
423
 
424
  @fastapi_app.get("/health")
 
444
  )
445
 
446
  print(f"📨 API Request: {url}")
447
+ print(f" Starting extraction at {time.strftime('%Y-%m-%d %H:%M:%S')}")
448
+
449
+ start_time = time.time()
450
  result = extractor.extract_content(url)
451
+ elapsed = time.time() - start_time
452
+
453
+ print(f" Extraction completed in {elapsed:.2f}s")
454
+ print(f" Success: {result.get('success')}")
455
 
456
  return result
457
 
 
461
  content={"success": False, "error": "Invalid JSON"}
462
  )
463
  except Exception as e:
464
+ print(f" API Error: {traceback.format_exc()}")
465
  return JSONResponse(
466
  status_code=500,
467
+ content={
468
+ "success": False,
469
+ "error": str(e),
470
+ "traceback": traceback.format_exc()[:500]
471
+ }
472
  )
473
 
474
  # ==============================================
 
519
  gr.Markdown(label="Result"),
520
  gr.JSON(label="API Response")
521
  ],
522
+ title="🌐 Robust Content Extractor v2.1",
523
+ description="Extracts content with better error handling and multiple fallbacks",
524
  examples=[
525
  ["https://example.com"],
526
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
 
540
 
541
  if __name__ == "__main__":
542
  print("\n" + "="*60)
543
+ print("🌐 Robust Content Extractor v2.1 Starting")
544
  print("="*60)
545
  print("Features:")
 
546
  print("• Multiple fallback strategies")
547
+ print("• Better error handling")
548
+ print("• URL encoding support")
549
  print("="*60)
550
  print("API Endpoints:")
551
+ print("• GET /health - Fast health check")
552
  print("• POST /extract - Extract content")
553
  print("="*60 + "\n")
554