yukee1992 committed on
Commit
310b130
·
verified ·
1 Parent(s): ba2f5fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +236 -284
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # FREE SCREENSHOT SCRAPER FOR N8N
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -8,50 +8,19 @@ import json
8
  import time
9
  import re
10
  import html
11
- import base64
12
- from io import BytesIO
13
- from PIL import Image
14
- import pytesseract # Free OCR
15
  from typing import Dict, Any
16
  from fastapi import FastAPI, Request
17
  import uvicorn
18
 
19
  # ==============================================
20
- # FREE SCREENSHOT SCRAPER
21
  # ==============================================
22
 
23
- class FreeScreenshotScraper:
24
- """Free scraper using screenshot APIs + fallback"""
25
 
26
  def __init__(self):
27
- # Free screenshot APIs (no API key needed)
28
- self.screenshot_apis = [
29
- {
30
- "url": lambda u: f"https://s0.wp.com/mshots/v1/{u}?w=1024",
31
- "name": "wordpress_mshots"
32
- },
33
- {
34
- "url": lambda u: f"https://render-tron.appspot.com/screenshot/{u}?width=1024&height=768",
35
- "name": "render_tron"
36
- },
37
- {
38
- "url": lambda u: f"https://image.thum.io/get/width/1024/crop/768/noanimate/{u}",
39
- "name": "thumio"
40
- },
41
- ]
42
-
43
- # Free HTML content APIs
44
- self.html_apis = [
45
- {
46
- "url": lambda u: f"https://r.jina.ai/{u}",
47
- "name": "jina_reader",
48
- "headers": {"Accept": "application/json"}
49
- },
50
- {
51
- "url": lambda u: f"https://extractorapi.com/api/v1/extractor?apikey=demo&url={u}",
52
- "name": "extractor_api"
53
- },
54
- ]
55
 
56
  def extract_content(self, url: str) -> Dict[str, Any]:
57
  """Extract content using free APIs"""
@@ -63,267 +32,196 @@ class FreeScreenshotScraper:
63
  if not url.startswith(('http://', 'https://')):
64
  url = 'https://' + url
65
 
66
- # Strategy 1: Try Jina Reader API (best for content extraction)
67
- print(" Trying Jina Reader API...")
68
- jina_result = self._try_jina_reader(url)
69
- if jina_result["success"]:
70
- jina_result["execution_time"] = round(time.time() - start_time, 2)
71
- jina_result["method"] = "jina_reader_api"
72
- return jina_result
73
-
74
- # Strategy 2: Try other HTML APIs
75
- print(" Trying other HTML APIs...")
76
- for api in self.html_apis[1:]:
77
- result = self._try_api(api, url)
78
- if result["success"]:
79
- result["execution_time"] = round(time.time() - start_time, 2)
80
- result["method"] = api["name"]
81
- return result
82
-
83
- # Strategy 3: Try direct request with smart headers
84
- print(" Trying direct request...")
85
- direct_result = self._try_direct_request(url)
86
- if direct_result["success"]:
87
- direct_result["execution_time"] = round(time.time() - start_time, 2)
88
- direct_result["method"] = "direct_with_fallback"
89
- return direct_result
90
 
91
- # Strategy 4: Try screenshot APIs as last resort
92
- print(" Trying screenshot APIs...")
93
- for api in self.screenshot_apis:
94
- result = self._try_screenshot_api(api, url)
95
- if result["success"]:
96
- result["execution_time"] = round(time.time() - start_time, 2)
97
- result["method"] = f"screenshot_{api['name']}"
98
- return result
 
 
 
 
 
99
 
100
  # All failed
101
  return {
102
  "success": False,
103
  "url": url,
104
- "error": "All free methods failed",
105
  "execution_time": round(time.time() - start_time, 2),
106
- "suggestions": [
107
- "Try a different URL",
108
- "Website may block automated access",
109
- "Try using Jina Reader directly: https://r.jina.ai/your-url"
110
- ]
111
  }
112
 
113
  def _try_jina_reader(self, url: str) -> Dict[str, Any]:
114
- """Try Jina Reader API (free, no API key needed)"""
115
  try:
 
116
  api_url = f"https://r.jina.ai/{url}"
117
- headers = {
118
- "User-Agent": "Mozilla/5.0",
119
- "Accept": "application/json",
120
- }
121
 
122
- response = requests.get(api_url, headers=headers, timeout=30)
 
 
 
 
 
123
 
124
- if response.status_code == 200:
125
- # Jina returns clean text directly
126
- content = response.text
127
-
128
- # Try to parse as JSON first
129
  try:
130
- data = json.loads(content)
131
- if "data" in data:
132
- content = data["data"]["content"] if "content" in data["data"] else str(data["data"])
133
- except:
134
- pass # Keep as text
135
-
136
- # Extract title if possible
137
- title = ""
138
- title_match = re.search(r'<title[^>]*>(.*?)</title>', content, re.IGNORECASE)
139
- if title_match:
140
- title = title_match.group(1)
141
-
142
- # Clean content
143
- cleaned = self._clean_content(content)
144
-
145
- return {
146
- "success": True,
147
- "url": url,
148
- "title": title[:200] if title else "Extracted via Jina Reader",
149
- "main_content": cleaned[:30000],
150
- "content_length": len(cleaned),
151
- "source": "jina_reader",
152
- "note": "Content extracted via free Jina Reader API"
153
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
- return {"success": False, "error": f"Jina API status: {response.status_code}"}
156
 
157
  except Exception as e:
158
  return {"success": False, "error": f"Jina API error: {str(e)}"}
159
 
160
- def _try_api(self, api: dict, url: str) -> Dict[str, Any]:
161
- """Try other free APIs"""
162
- try:
163
- api_url = api["url"](url)
164
- headers = api.get("headers", {"User-Agent": "Mozilla/5.0"})
165
-
166
- response = requests.get(api_url, headers=headers, timeout=15)
167
-
168
- if response.status_code == 200:
169
- content = response.text
170
-
171
- # Try to parse JSON
172
- try:
173
- data = json.loads(content)
174
- # Extract content from common API formats
175
- if "text" in data:
176
- content = data["text"]
177
- elif "content" in data:
178
- content = data["content"]
179
- elif "article" in data:
180
- content = data["article"]
181
- except:
182
- pass
183
-
184
- cleaned = self._clean_content(content)
185
-
186
- return {
187
- "success": True,
188
- "url": url,
189
- "main_content": cleaned[:20000],
190
- "content_length": len(cleaned)
191
- }
192
-
193
- return {"success": False}
194
-
195
- except:
196
- return {"success": False}
197
-
198
  def _try_direct_request(self, url: str) -> Dict[str, Any]:
199
- """Try direct request with various strategies"""
200
- strategies = [
201
- self._direct_request_with_headers,
202
- self._direct_request_as_googlebot,
203
- self._direct_request_with_referer,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  ]
205
 
206
- for strategy in strategies:
207
  try:
208
- result = strategy(url)
209
- if result["success"]:
210
- return result
211
- except:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  continue
213
 
214
  return {"success": False}
215
 
216
- def _direct_request_with_headers(self, url: str) -> Dict[str, Any]:
217
- """Direct request with browser-like headers"""
218
- headers = {
219
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
220
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
221
- "Accept-Language": "en-US,en;q=0.5",
222
- "Accept-Encoding": "gzip, deflate",
223
- "Connection": "keep-alive",
224
- "Upgrade-Insecure-Requests": "1",
225
- "Cache-Control": "max-age=0",
226
- }
227
-
228
- response = requests.get(url, headers=headers, timeout=10)
229
-
230
- if response.status_code == 200:
231
- content = self._extract_from_html(response.text)
232
- cleaned = self._clean_content(content)
233
-
234
- return {
235
- "success": True,
236
- "content": cleaned
237
- }
238
-
239
- return {"success": False}
240
-
241
- def _direct_request_as_googlebot(self, url: str) -> Dict[str, Any]:
242
  """Pretend to be Googlebot"""
243
- headers = {
244
- "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
245
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
246
- }
247
-
248
- response = requests.get(url, headers=headers, timeout=10)
249
-
250
- if response.status_code == 200:
251
- content = self._extract_from_html(response.text)
252
- cleaned = self._clean_content(content)
253
-
254
- return {
255
- "success": True,
256
- "content": cleaned
257
- }
258
-
259
- return {"success": False}
260
-
261
- def _direct_request_with_referer(self, url: str) -> Dict[str, Any]:
262
- """Request with referer"""
263
- headers = {
264
- "User-Agent": "Mozilla/5.0",
265
- "Referer": "https://www.google.com/",
266
- "Accept": "text/html",
267
- }
268
-
269
- response = requests.get(url, headers=headers, timeout=10)
270
-
271
- if response.status_code == 200:
272
- content = self._extract_from_html(response.text)
273
- cleaned = self._clean_content(content)
274
-
275
- return {
276
- "success": True,
277
- "content": cleaned
278
- }
279
-
280
- return {"success": False}
281
-
282
- def _try_screenshot_api(self, api: dict, url: str) -> Dict[str, Any]:
283
- """Try screenshot API"""
284
  try:
285
- api_url = api["url"](url)
286
- headers = {"User-Agent": "Mozilla/5.0"}
 
 
 
287
 
288
- response = requests.get(api_url, headers=headers, timeout=15)
289
 
290
- if response.status_code == 200 and len(response.content) > 1000:
291
- # Check if it's actually an image
292
- try:
293
- img = Image.open(BytesIO(response.content))
294
- img.verify()
295
-
296
- # Try OCR if available
297
- try:
298
- text = pytesseract.image_to_string(img)
299
- cleaned = self._clean_content(text)
300
-
301
- return {
302
- "success": True,
303
- "url": url,
304
- "main_content": cleaned[:15000],
305
- "content_length": len(cleaned),
306
- "note": "Content extracted from screenshot via OCR"
307
- }
308
- except:
309
- return {"success": False, "error": "OCR not available"}
310
-
311
- except:
312
- return {"success": False}
313
 
314
  return {"success": False}
315
 
316
- except:
317
- return {"success": False}
318
 
319
  def _extract_from_html(self, html_content: str) -> str:
320
  """Extract text from HTML"""
321
  # Remove scripts and styles
322
- html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
323
- html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
 
324
 
325
- # Remove unwanted tags
326
- unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu']
327
  for tag in unwanted_tags:
328
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
329
 
@@ -331,10 +229,41 @@ class FreeScreenshotScraper:
331
  text = re.sub(r'<[^>]+>', ' ', html_content)
332
  text = html.unescape(text)
333
 
 
 
 
334
  return text
335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  def _clean_content(self, content: str) -> str:
337
- """Clean content"""
338
  if not content:
339
  return ""
340
 
@@ -347,13 +276,36 @@ class FreeScreenshotScraper:
347
  # Remove excessive line breaks
348
  content = re.sub(r'\n{3,}', '\n\n', content)
349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  return content.strip()
351
 
352
  # ==============================================
353
  # INITIALIZE
354
  # ==============================================
355
 
356
- scraper = FreeScreenshotScraper()
357
 
358
  # ==============================================
359
  # FASTAPI APP
@@ -361,7 +313,7 @@ scraper = FreeScreenshotScraper()
361
 
362
  fastapi_app = FastAPI(
363
  title="Free Content Extractor",
364
- description="Uses free APIs to extract content from websites",
365
  version="1.0"
366
  )
367
 
@@ -381,18 +333,18 @@ async def root():
381
  return {
382
  "service": "Free Content Extractor",
383
  "version": "1.0",
384
- "description": "Uses free APIs (Jina Reader, etc.) to extract website content",
385
  "endpoints": {
386
  "GET /": "This info",
387
  "GET /health": "Health check",
388
- "POST /extract": "Extract content (for n8n)"
389
  },
390
- "free_apis_used": [
391
- "Jina Reader (https://r.jina.ai/)",
392
- "WordPress mShots",
393
- "Render-Tron",
394
- "ExtractorAPI (demo)"
395
- ]
396
  }
397
 
398
  @fastapi_app.get("/health")
@@ -412,8 +364,8 @@ async def api_extract(request: Request):
412
  content={"success": False, "error": "URL is required"}
413
  )
414
 
415
- print(f"πŸ“¨ Request: {url}")
416
- result = scraper.extract_content(url)
417
 
418
  return result
419
 
@@ -433,16 +385,17 @@ async def api_extract(request: Request):
433
  # ==============================================
434
 
435
  def gradio_extract(url: str):
436
- """Gradio interface"""
437
  if not url:
438
  return "❌ Please enter a URL", {}
439
 
440
- result = scraper.extract_content(url)
441
 
442
  if result["success"]:
443
  content = result["main_content"]
444
  content_length = result["content_length"]
445
 
 
446
  preview = content[:800]
447
  if len(content) > 800:
448
  preview += "..."
@@ -451,23 +404,22 @@ def gradio_extract(url: str):
451
  ## βœ… Success!
452
 
453
  **URL:** {result['url']}
454
- **Method:** {result.get('method', 'free_api')}
 
455
  **Time:** {result['execution_time']}s
456
  **Content Length:** {content_length:,} characters
457
 
458
  ### Preview:
459
  {preview}
460
 
461
- *Using free APIs - may not work on all websites*
462
  """
463
  return output, result
464
  else:
465
  error = result.get("error", "Unknown error")
466
- suggestions = result.get("suggestions", [])
467
 
468
- suggestion_text = ""
469
- if suggestions:
470
- suggestion_text = "\n\n**Suggestions:**\n" + "\n".join([f"β€’ {s}" for s in suggestions])
471
 
472
  return f"## ❌ Error\n\n{error}{suggestion_text}", result
473
 
@@ -484,12 +436,13 @@ gradio_interface = gr.Interface(
484
  gr.JSON(label="API Response")
485
  ],
486
  title="🌐 Free Content Extractor for n8n",
487
- description="Uses free APIs to extract content. Works with most websites.",
488
  examples=[
489
  ["https://www.sinchew.com.my/"],
490
  ["https://example.com"],
491
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
492
- ["https://news.ycombinator.com"]
 
493
  ]
494
  )
495
 
@@ -507,12 +460,11 @@ if __name__ == "__main__":
507
  print("\n" + "="*60)
508
  print("🌐 Free Content Extractor Starting")
509
  print("="*60)
510
- print("Using free APIs:")
511
- print("β€’ Jina Reader (r.jina.ai)")
512
- print("β€’ WordPress mShots")
513
- print("β€’ Render-Tron")
514
  print("="*60)
515
  print("API Endpoint: POST /extract")
 
516
  print("="*60 + "\n")
517
 
518
  uvicorn.run(
 
1
  # ==============================================
2
+ # SIMPLE FREE CONTENT EXTRACTOR FOR N8N
3
  # ==============================================
4
 
5
  import gradio as gr
 
8
  import time
9
  import re
10
  import html
 
 
 
 
11
  from typing import Dict, Any
12
  from fastapi import FastAPI, Request
13
  import uvicorn
14
 
15
  # ==============================================
16
+ # SIMPLE CONTENT EXTRACTOR
17
  # ==============================================
18
 
19
+ class SimpleContentExtractor:
20
+ """Simple extractor using Jina Reader API + fallbacks"""
21
 
22
  def __init__(self):
23
+ self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def extract_content(self, url: str) -> Dict[str, Any]:
26
  """Extract content using free APIs"""
 
32
  if not url.startswith(('http://', 'https://')):
33
  url = 'https://' + url
34
 
35
+ # Try multiple strategies
36
+ strategies = [
37
+ self._try_jina_reader,
38
+ self._try_direct_request,
39
+ self._try_googlebot,
40
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ for i, strategy in enumerate(strategies):
43
+ try:
44
+ print(f" Trying strategy {i+1}...")
45
+ result = strategy(url)
46
+
47
+ if result.get("success"):
48
+ result["execution_time"] = round(time.time() - start_time, 2)
49
+ result["method"] = f"strategy_{i+1}"
50
+ return result
51
+
52
+ except Exception as e:
53
+ print(f" Strategy {i+1} failed: {e}")
54
+ time.sleep(0.3) # Small delay
55
 
56
  # All failed
57
  return {
58
  "success": False,
59
  "url": url,
60
+ "error": "Failed to extract content",
61
  "execution_time": round(time.time() - start_time, 2),
62
+ "suggestion": "Try using Jina Reader directly: https://r.jina.ai/your-url"
 
 
 
 
63
  }
64
 
65
  def _try_jina_reader(self, url: str) -> Dict[str, Any]:
66
+ """Try Jina Reader API (free, no API key, handles JavaScript)"""
67
  try:
68
+ # Jina Reader endpoint
69
  api_url = f"https://r.jina.ai/{url}"
 
 
 
 
70
 
71
+ # Try with different formats
72
+ formats = [
73
+ {"headers": {"Accept": "text/plain"}},
74
+ {"headers": {"Accept": "application/json"}},
75
+ {"url": f"https://r.jina.ai/{url}?format=json"},
76
+ ]
77
 
78
+ for fmt in formats:
 
 
 
 
79
  try:
80
+ headers = fmt.get("headers", {"Accept": "text/plain", "User-Agent": self.user_agent})
81
+ api_url_to_use = fmt.get("url", api_url)
82
+
83
+ response = requests.get(api_url_to_use, headers=headers, timeout=30)
84
+
85
+ if response.status_code == 200:
86
+ content = response.text
87
+
88
+ # Try to parse as JSON
89
+ try:
90
+ data = json.loads(content)
91
+ if isinstance(data, dict):
92
+ if "content" in data:
93
+ content = data["content"]
94
+ elif "data" in data:
95
+ content = str(data["data"])
96
+ elif "text" in data:
97
+ content = data["text"]
98
+ except:
99
+ pass # Keep as plain text
100
+
101
+ # Extract title
102
+ title = self._extract_title_from_text(content)
103
+
104
+ # Clean content
105
+ cleaned = self._clean_content(content)
106
+
107
+ return {
108
+ "success": True,
109
+ "url": url,
110
+ "title": title[:300] if title else "Extracted via Jina Reader",
111
+ "main_content": cleaned[:35000],
112
+ "content_length": len(cleaned),
113
+ "content_preview": cleaned[:1000] + ("..." if len(cleaned) > 1000 else ""),
114
+ "source": "jina_reader",
115
+ "note": "Content extracted via free Jina Reader API (handles JavaScript)",
116
+ "status": response.status_code
117
+ }
118
+
119
+ except Exception as e:
120
+ print(f" Jina format failed: {e}")
121
+ continue
122
 
123
+ return {"success": False, "error": f"Jina returned status: {response.status_code}"}
124
 
125
  except Exception as e:
126
  return {"success": False, "error": f"Jina API error: {str(e)}"}
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  def _try_direct_request(self, url: str) -> Dict[str, Any]:
129
+ """Try direct HTTP request with smart headers"""
130
+ headers_list = [
131
+ # Normal browser
132
+ {
133
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
134
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
135
+ "Accept-Language": "en-US,en;q=0.9",
136
+ "Accept-Encoding": "gzip, deflate",
137
+ "Connection": "keep-alive",
138
+ },
139
+ # Mobile browser
140
+ {
141
+ "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15",
142
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
143
+ "Accept-Language": "en-US,en;q=0.9",
144
+ },
145
+ # Simple headers
146
+ {
147
+ "User-Agent": "Mozilla/5.0",
148
+ "Accept": "text/html",
149
+ },
150
  ]
151
 
152
+ for headers in headers_list:
153
  try:
154
+ response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
155
+
156
+ if response.status_code == 200:
157
+ html_content = response.text
158
+
159
+ # Extract content
160
+ text_content = self._extract_from_html(html_content)
161
+ cleaned = self._clean_content(text_content)
162
+
163
+ # Extract title
164
+ title = self._extract_title_from_html(html_content)
165
+
166
+ if len(cleaned) > 100: # If we got meaningful content
167
+ return {
168
+ "success": True,
169
+ "url": url,
170
+ "title": title[:300] if title else "Extracted via direct request",
171
+ "main_content": cleaned[:30000],
172
+ "content_length": len(cleaned),
173
+ "source": "direct_request",
174
+ "status": response.status_code
175
+ }
176
+
177
+ except Exception as e:
178
+ print(f" Direct request failed: {e}")
179
  continue
180
 
181
  return {"success": False}
182
 
183
+ def _try_googlebot(self, url: str) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  """Pretend to be Googlebot"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  try:
186
+ headers = {
187
+ "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
188
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
189
+ "From": "googlebot(at)googlebot.com",
190
+ }
191
 
192
+ response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
193
 
194
+ if response.status_code == 200:
195
+ html_content = response.text
196
+ text_content = self._extract_from_html(html_content)
197
+ cleaned = self._clean_content(text_content)
198
+ title = self._extract_title_from_html(html_content)
199
+
200
+ if len(cleaned) > 100:
201
+ return {
202
+ "success": True,
203
+ "url": url,
204
+ "title": title[:300] if title else "Extracted as Googlebot",
205
+ "main_content": cleaned[:30000],
206
+ "content_length": len(cleaned),
207
+ "source": "googlebot",
208
+ "status": response.status_code
209
+ }
 
 
 
 
 
 
 
210
 
211
  return {"success": False}
212
 
213
+ except Exception as e:
214
+ return {"success": False, "error": str(e)}
215
 
216
  def _extract_from_html(self, html_content: str) -> str:
217
  """Extract text from HTML"""
218
  # Remove scripts and styles
219
+ html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
220
+ html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
221
+ html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
222
 
223
+ # Remove unwanted sections
224
+ unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe']
225
  for tag in unwanted_tags:
226
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
227
 
 
229
  text = re.sub(r'<[^>]+>', ' ', html_content)
230
  text = html.unescape(text)
231
 
232
+ # Remove excessive whitespace
233
+ text = re.sub(r'\s+', ' ', text)
234
+
235
  return text
236
 
237
+ def _extract_title_from_html(self, html_content: str) -> str:
238
+ """Extract title from HTML"""
239
+ title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
240
+ if title_match:
241
+ title = title_match.group(1)
242
+ title = re.sub(r'\s+', ' ', title).strip()
243
+ title = html.unescape(title)
244
+ return title
245
+ return ""
246
+
247
+ def _extract_title_from_text(self, text: str) -> str:
248
+ """Try to extract title from plain text"""
249
+ # Look for title patterns
250
+ patterns = [
251
+ r'Title:\s*(.*?)(?:\n|$)',
252
+ r'#\s+(.*?)(?:\n|$)',
253
+ r'^(.*?)(?:\n|$)',
254
+ ]
255
+
256
+ for pattern in patterns:
257
+ match = re.search(pattern, text[:500], re.IGNORECASE)
258
+ if match:
259
+ title = match.group(1).strip()
260
+ if len(title) > 10 and len(title) < 200:
261
+ return title
262
+
263
+ return ""
264
+
265
  def _clean_content(self, content: str) -> str:
266
+ """Clean and normalize content"""
267
  if not content:
268
  return ""
269
 
 
276
  # Remove excessive line breaks
277
  content = re.sub(r'\n{3,}', '\n\n', content)
278
 
279
+ # Remove common unwanted phrases
280
+ unwanted = [
281
+ r'adsbygoogle',
282
+ r'advertisement',
283
+ r'sponsored content',
284
+ r'sign up for',
285
+ r'subscribe to',
286
+ r'follow us on',
287
+ r'like us on facebook',
288
+ r'share this article',
289
+ r'read more',
290
+ r'continue reading',
291
+ r'click here',
292
+ r'learn more',
293
+ ]
294
+
295
+ for phrase in unwanted:
296
+ content = re.sub(phrase, '', content, flags=re.IGNORECASE)
297
+
298
+ # Remove email addresses and URLs
299
+ content = re.sub(r'\S+@\S+\.\S+', '', content)
300
+ content = re.sub(r'https?://\S+', '', content)
301
+
302
  return content.strip()
303
 
304
  # ==============================================
305
  # INITIALIZE
306
  # ==============================================
307
 
308
+ extractor = SimpleContentExtractor()
309
 
310
  # ==============================================
311
  # FASTAPI APP
 
313
 
314
  fastapi_app = FastAPI(
315
  title="Free Content Extractor",
316
+ description="Extracts content using free Jina Reader API and fallbacks",
317
  version="1.0"
318
  )
319
 
 
333
  return {
334
  "service": "Free Content Extractor",
335
  "version": "1.0",
336
+ "description": "Extracts website content using free Jina Reader API (handles JavaScript)",
337
  "endpoints": {
338
  "GET /": "This info",
339
  "GET /health": "Health check",
340
+ "POST /extract": "Extract content"
341
  },
342
+ "usage_n8n": {
343
+ "method": "POST",
344
+ "url": "https://your-space.hf.space/extract",
345
+ "body": {"url": "https://example.com"}
346
+ },
347
+ "alternative": "Use Jina Reader directly: GET https://r.jina.ai/your-url"
348
  }
349
 
350
  @fastapi_app.get("/health")
 
364
  content={"success": False, "error": "URL is required"}
365
  )
366
 
367
+ print(f"πŸ“¨ API Request: {url}")
368
+ result = extractor.extract_content(url)
369
 
370
  return result
371
 
 
385
  # ==============================================
386
 
387
  def gradio_extract(url: str):
388
+ """Gradio interface function"""
389
  if not url:
390
  return "❌ Please enter a URL", {}
391
 
392
+ result = extractor.extract_content(url)
393
 
394
  if result["success"]:
395
  content = result["main_content"]
396
  content_length = result["content_length"]
397
 
398
+ # Create preview
399
  preview = content[:800]
400
  if len(content) > 800:
401
  preview += "..."
 
404
  ## βœ… Success!
405
 
406
  **URL:** {result['url']}
407
+ **Title:** {result.get('title', 'N/A')}
408
+ **Method:** {result.get('method', 'jina_reader')}
409
  **Time:** {result['execution_time']}s
410
  **Content Length:** {content_length:,} characters
411
 
412
  ### Preview:
413
  {preview}
414
 
415
+ *Powered by free Jina Reader API*
416
  """
417
  return output, result
418
  else:
419
  error = result.get("error", "Unknown error")
420
+ suggestion = result.get("suggestion", "")
421
 
422
+ suggestion_text = f"\n\n{suggestion}" if suggestion else ""
 
 
423
 
424
  return f"## ❌ Error\n\n{error}{suggestion_text}", result
425
 
 
436
  gr.JSON(label="API Response")
437
  ],
438
  title="🌐 Free Content Extractor for n8n",
439
+ description="Uses free Jina Reader API to extract content (handles JavaScript websites)",
440
  examples=[
441
  ["https://www.sinchew.com.my/"],
442
  ["https://example.com"],
443
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
444
+ ["https://news.ycombinator.com"],
445
+ ["https://zhihu.com"]
446
  ]
447
  )
448
 
 
460
  print("\n" + "="*60)
461
  print("🌐 Free Content Extractor Starting")
462
  print("="*60)
463
+ print("Primary method: Jina Reader API")
464
+ print("Secondary: Direct requests + Googlebot")
 
 
465
  print("="*60)
466
  print("API Endpoint: POST /extract")
467
+ print("Direct Jina: GET https://r.jina.ai/your-url")
468
  print("="*60 + "\n")
469
 
470
  uvicorn.run(