yukee1992 committed on
Commit
52b8ad8
·
verified ·
1 Parent(s): 4b458b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +263 -524
app.py CHANGED
@@ -1,621 +1,345 @@
1
  # ==============================================
2
- # SCREENSHOT SCRAPER FOR N8N - IMPROVED VERSION
3
  # ==============================================
4
 
5
  import gradio as gr
6
  import requests
7
- import base64
8
  import json
9
  import time
10
- from io import BytesIO
11
- from PIL import Image, ImageEnhance, ImageDraw, ImageFont
12
- import textwrap
13
  import re
14
- from typing import Dict, Any
15
- from fastapi import FastAPI
16
- import uvicorn
 
 
 
 
 
 
17
 
18
- # Try imports with fallbacks
19
  try:
20
  from bs4 import BeautifulSoup
21
  BEAUTIFULSOUP_AVAILABLE = True
22
  except ImportError:
23
  BEAUTIFULSOUP_AVAILABLE = False
24
- print("BeautifulSoup not available")
25
 
26
  try:
 
27
  from transformers import pipeline
28
  TRANSFORMERS_AVAILABLE = True
29
  except ImportError:
30
  TRANSFORMERS_AVAILABLE = False
31
- print("Transformers not available")
32
 
33
  # ==============================================
34
- # IMPROVED SCREENSHOT CAPTURER
35
  # ==============================================
36
 
37
- class ImprovedScreenshotCapturer:
38
- """Better screenshot capture using HTML content extraction"""
39
 
40
  def __init__(self):
41
- self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
42
-
43
- def capture(self, url: str) -> Dict[str, Any]:
44
- """Capture or create screenshot from webpage"""
45
- print(f"\nπŸ“Έ Attempting to capture: {url}")
46
-
47
- # Ensure URL has protocol
48
- if not url.startswith('http'):
49
- url = 'https://' + url
50
-
51
- # Method 1: Try actual screenshot APIs
52
- screenshot_result = self._try_screenshot_apis(url)
53
- if screenshot_result.get("success"):
54
- return screenshot_result
55
 
56
- # Method 2: Create screenshot from HTML content (most reliable)
57
- print("πŸ”„ Using HTML content method...")
58
- return self._create_from_html(url)
59
-
60
- def _try_screenshot_apis(self, url: str) -> Dict[str, Any]:
61
- """Try various screenshot APIs"""
62
- apis = [
63
- # These are more reliable APIs
64
- {
65
- "url": f"https://render-tron.appspot.com/screenshot/{url}?width=1200&height=800",
66
- "name": "rendertron"
67
- },
68
- {
69
- "url": f"https://s.wordpress.com/mshots/v1/{url}?w=1200&h=800",
70
- "name": "wordpress"
71
- },
72
- {
73
- "url": f"https://image.thum.io/get/width/1200/crop/900/{url}",
74
- "name": "thumio"
75
- }
76
- ]
77
 
78
- headers = {'User-Agent': self.user_agent}
79
 
80
- for api in apis:
81
- try:
82
- print(f" Trying {api['name']}...")
83
- response = requests.get(api["url"], headers=headers, timeout=15)
84
-
85
- if response.status_code == 200 and len(response.content) > 5000:
86
- # Verify it's an image
87
- try:
88
- img = Image.open(BytesIO(response.content))
89
- img.verify()
90
-
91
- return {
92
- "success": True,
93
- "image_bytes": response.content,
94
- "size": len(response.content),
95
- "method": api["name"],
96
- "is_real_screenshot": True
97
- }
98
- except:
99
- continue
100
- except:
101
- continue
102
 
103
- return {"success": False}
104
-
105
- def _create_from_html(self, url: str) -> Dict[str, Any]:
106
- """Create screenshot from HTML content"""
107
  try:
108
- # Fetch webpage content
109
- headers = {'User-Agent': self.user_agent}
110
- response = requests.get(url, headers=headers, timeout=15)
111
 
112
- if response.status_code != 200:
113
- return self._create_error_image(f"HTTP {response.status_code}", url)
 
 
 
 
 
 
 
 
 
 
 
114
 
115
- html_content = response.text
 
 
116
 
117
- # Parse HTML if BeautifulSoup is available
118
- if BEAUTIFULSOUP_AVAILABLE:
119
- title, main_text = self._parse_html_with_bs4(html_content)
120
- else:
121
- title, main_text = self._parse_html_simple(html_content)
122
 
123
- # Create image with the content
124
- image_bytes = self._create_content_image(url, title, main_text)
 
125
 
 
126
  return {
127
- "success": True,
128
- "image_bytes": image_bytes,
129
- "size": len(image_bytes),
130
- "method": "html_content",
131
- "is_real_screenshot": False,
132
- "content_length": len(main_text)
133
  }
134
 
135
  except Exception as e:
136
- print(f"Error creating from HTML: {str(e)}")
137
- return self._create_error_image(str(e), url)
138
-
139
- def _parse_html_with_bs4(self, html: str):
140
- """Parse HTML using BeautifulSoup"""
141
- soup = BeautifulSoup(html, 'html.parser')
142
-
143
- # Get title
144
- title = soup.title.string if soup.title else "No title"
145
-
146
- # Remove scripts, styles, nav, footer
147
- for tag in soup(["script", "style", "nav", "footer", "header", "iframe"]):
148
- tag.decompose()
149
-
150
- # Try to get main content
151
- main_content = ""
152
-
153
- # Look for main content areas
154
- selectors = ['article', 'main', '.content', '.post-content', '.article', '#content']
155
-
156
- for selector in selectors:
157
- elements = soup.select(selector)
158
- if elements:
159
- main_content = ' '.join([elem.get_text() for elem in elements[:3]])
160
- break
161
-
162
- # Fallback to body
163
- if not main_content and soup.body:
164
- main_content = soup.body.get_text()
165
-
166
- # Clean text
167
- text = self._clean_text(main_content)
168
-
169
- return title, text[:10000] # Limit text length
170
-
171
- def _parse_html_simple(self, html: str):
172
- """Simple HTML parsing without BeautifulSoup"""
173
- # Extract title
174
- title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE)
175
- title = title_match.group(1) if title_match else "No title"
176
-
177
- # Extract text between body tags
178
- body_match = re.search(r'<body[^>]*>(.*?)</body>', html, re.IGNORECASE | re.DOTALL)
179
- if body_match:
180
- body_text = body_match.group(1)
181
- # Remove HTML tags
182
- clean_text = re.sub(r'<[^>]+>', ' ', body_text)
183
- else:
184
- clean_text = html[:5000]
185
-
186
- # Clean text
187
- text = self._clean_text(clean_text)
188
-
189
- return title, text[:10000]
190
-
191
- def _clean_text(self, text: str) -> str:
192
- """Clean and normalize text"""
193
- # Replace multiple whitespace with single space
194
- text = re.sub(r'\s+', ' ', text)
195
- # Remove control characters
196
- text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
197
- return text.strip()
198
-
199
- def _create_content_image(self, url: str, title: str, content: str) -> bytes:
200
- """Create an image with webpage content"""
201
- # Create image
202
- img_width, img_height = 1200, 1000
203
- img = Image.new('RGB', (img_width, img_height), color='white')
204
- draw = ImageDraw.Draw(img)
205
-
206
- # Try to load fonts
207
- try:
208
- font_large = ImageFont.truetype("arial.ttf", 24)
209
- font_medium = ImageFont.truetype("arial.ttf", 20)
210
- font_small = ImageFont.truetype("arial.ttf", 16)
211
- except:
212
- font_large = ImageFont.load_default()
213
- font_medium = ImageFont.load_default()
214
- font_small = ImageFont.load_default()
215
-
216
- # Draw header
217
- draw.text((50, 30), "πŸ“„ WEBPAGE CONTENT EXTRACT", fill='darkblue', font=font_large)
218
- draw.text((50, 70), f"URL: {url[:80]}", fill='blue', font=font_medium)
219
- draw.text((50, 100), f"Title: {title[:100]}", fill='black', font=font_medium)
220
-
221
- draw.line([(50, 130), (1150, 130)], fill='gray', width=2)
222
-
223
- # Draw content
224
- y_offset = 150
225
- lines = textwrap.wrap(content, width=100)
226
-
227
- for i, line in enumerate(lines):
228
- if y_offset < 950:
229
- draw.text((50, y_offset), line, fill='black', font=font_small)
230
- y_offset += 20
231
- else:
232
- draw.text((50, y_offset), f"... (showing {i} of {len(lines)} lines)",
233
- fill='darkgray', font=font_small)
234
- break
235
-
236
- # Footer
237
- draw.line([(50, 970), (1150, 970)], fill='lightgray', width=1)
238
- draw.text((50, 980), f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}",
239
- fill='gray', font=font_small)
240
-
241
- # Convert to bytes
242
- img_byte_arr = BytesIO()
243
- img.save(img_byte_arr, format='PNG', optimize=True, quality=85)
244
- return img_byte_arr.getvalue()
245
-
246
- def _create_error_image(self, error: str, url: str) -> Dict[str, Any]:
247
- """Create error image"""
248
- img = Image.new('RGB', (800, 400), color='white')
249
- draw = ImageDraw.Draw(img)
250
-
251
- try:
252
- font = ImageFont.truetype("arial.ttf", 20)
253
- except:
254
- font = ImageFont.load_default()
255
-
256
- draw.text((50, 50), "❌ SCREENSHOT ERROR", fill='red', font=font)
257
- draw.text((50, 100), f"URL: {url[:100]}", fill='black', font=font)
258
- draw.text((50, 150), f"Error: {error[:200]}", fill='darkred', font=font)
259
- draw.text((50, 200), "Content was extracted directly from HTML.", fill='black', font=font)
260
- draw.text((50, 250), "This is actually BETTER for text extraction!", fill='green', font=font)
261
-
262
- img_byte_arr = BytesIO()
263
- img.save(img_byte_arr, format='PNG')
264
- img_bytes = img_byte_arr.getvalue()
265
-
266
- return {
267
- "success": True, # Still successful for our purposes
268
- "image_bytes": img_bytes,
269
- "size": len(img_bytes),
270
- "method": "error_fallback",
271
- "is_real_screenshot": False,
272
- "note": f"Error: {error}"
273
- }
274
-
275
- # ==============================================
276
- # IMPROVED OCR PROCESSOR
277
- # ==============================================
278
-
279
- class ImprovedOCRProcessor:
280
- """Better OCR with preprocessing"""
281
-
282
- def __init__(self):
283
- self.processor = None
284
-
285
- def load_model(self):
286
- """Load OCR model"""
287
- if not TRANSFORMERS_AVAILABLE:
288
- return None
289
-
290
- try:
291
- # Use a smaller, faster model
292
- self.processor = pipeline(
293
- "image-to-text",
294
- model="microsoft/trocr-base-printed",
295
- device=-1
296
- )
297
- print("βœ… OCR model loaded")
298
- return self.processor
299
- except Exception as e:
300
- print(f"❌ OCR model load failed: {e}")
301
- return None
302
 
303
- def extract_text(self, image_bytes: bytes) -> Dict[str, Any]:
304
- """Extract text from image"""
305
- if not self.processor:
306
- if not self.load_model():
307
- return {"success": False, "error": "OCR not available"}
308
-
309
  try:
310
- # Open and preprocess image
311
- image = Image.open(BytesIO(image_bytes))
312
-
313
- # Convert to RGB if needed
314
- if image.mode != 'RGB':
315
- image = image.convert('RGB')
 
316
 
317
- # Resize if too large (better for OCR)
318
- max_size = 1600
319
- if max(image.size) > max_size:
320
- ratio = max_size / max(image.size)
321
- new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
322
- image = image.resize(new_size, Image.Resampling.LANCZOS)
323
 
324
- # Enhance image for better OCR
325
- enhancer = ImageEnhance.Contrast(image)
326
- image = enhancer.enhance(1.5)
327
 
328
- enhancer = ImageEnhance.Sharpness(image)
329
- image = enhancer.enhance(1.2)
330
 
331
- # Perform OCR
332
- print("πŸ” Running OCR...")
333
- start_time = time.time()
334
- result = self.processor(image)
335
- ocr_time = time.time() - start_time
336
 
337
- # Extract text from result
338
- text = ""
339
- if isinstance(result, list) and result:
340
- if isinstance(result[0], dict):
341
- text = result[0].get('generated_text', '')
342
- else:
343
- text = str(result[0])
344
  else:
345
- text = str(result)
346
-
347
- # Clean text
348
- text = self._clean_ocr_text(text)
349
 
350
- print(f"πŸ“Š OCR completed in {ocr_time:.2f}s, extracted {len(text)} chars")
 
351
 
352
  return {
353
  "success": True,
354
- "text": text,
355
- "length": len(text),
356
- "ocr_time": ocr_time,
357
- "model": "trocr-base-printed"
358
  }
359
 
 
 
 
360
  except Exception as e:
361
- print(f"❌ OCR error: {e}")
362
  return {"success": False, "error": str(e)}
363
 
364
- def _clean_ocr_text(self, text: str) -> str:
365
- """Clean OCR output"""
366
- # Remove extra whitespace
367
- text = re.sub(r'\s+', ' ', text)
368
- # Fix common OCR errors
369
- text = text.replace('|', 'I').replace('[]', 'll').replace('()', 'o')
370
- return text.strip()
371
-
372
- # ==============================================
373
- # MAIN SCRAPER
374
- # ==============================================
375
-
376
- class WebScraper:
377
- """Main scraper that combines screenshot and direct text extraction"""
378
-
379
- def __init__(self):
380
- self.screenshot_capturer = ImprovedScreenshotCapturer()
381
- self.ocr_processor = ImprovedOCRProcessor()
382
- print("πŸš€ Web Scraper initialized")
383
-
384
- def scrape(self, url: str) -> Dict[str, Any]:
385
- """Main scraping function - uses BOTH methods"""
386
- start_time = time.time()
387
-
388
- print(f"\n{'='*60}")
389
- print(f"🌐 Scraping: {url}")
390
- print(f"{'='*60}")
391
 
392
- # Method 1: Try direct HTML extraction first (fastest, most reliable for text)
393
- print("\nπŸ“ Method 1: Direct HTML text extraction...")
394
- html_start = time.time()
395
- direct_text = self._extract_direct_html(url)
396
- html_time = time.time() - html_start
397
-
398
- if direct_text and len(direct_text) > 100:
399
- print(f"βœ… Direct extraction: {len(direct_text)} characters")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
 
401
- # Also get screenshot for reference
402
- print("\nπŸ“Έ Method 2: Getting screenshot for reference...")
403
- screenshot_result = self.screenshot_capturer.capture(url)
 
404
 
405
- total_time = time.time() - start_time
 
406
 
407
- return {
408
- "success": True,
409
- "url": url,
410
- "execution_time": round(total_time, 2),
411
- "method_used": "direct_html_extraction",
412
- "extracted_text": direct_text[:15000], # Limit for response
413
- "text_length": len(direct_text),
414
- "screenshot_info": {
415
- "method": screenshot_result.get("method", "none"),
416
- "size_bytes": screenshot_result.get("size", 0),
417
- "is_real_screenshot": screenshot_result.get("is_real_screenshot", False)
418
- },
419
- "notes": "Text extracted directly from HTML (most accurate for text content)"
420
- }
421
-
422
- # Method 2: If direct extraction fails, use OCR
423
- print("\nπŸ“ Direct extraction failed, using OCR method...")
424
-
425
- # Get screenshot
426
- screenshot_start = time.time()
427
- screenshot_result = self.screenshot_capturer.capture(url)
428
- screenshot_time = time.time() - screenshot_start
429
-
430
- if not screenshot_result.get("success", False):
431
- return {
432
- "success": False,
433
- "url": url,
434
- "error": "Failed to capture content",
435
- "execution_time": round(time.time() - start_time, 2)
436
- }
437
-
438
- # Extract text with OCR
439
- print("\nπŸ” Running OCR on captured content...")
440
- ocr_start = time.time()
441
- ocr_result = self.ocr_processor.extract_text(screenshot_result["image_bytes"])
442
- ocr_time = time.time() - ocr_start
443
-
444
- total_time = time.time() - start_time
445
-
446
- if ocr_result["success"]:
447
- return {
448
- "success": True,
449
- "url": url,
450
- "execution_time": round(total_time, 2),
451
- "step_times": {
452
- "screenshot": round(screenshot_time, 2),
453
- "ocr": round(ocr_time, 2)
454
- },
455
- "method_used": "screenshot_ocr",
456
- "extracted_text": ocr_result["text"][:15000],
457
- "text_length": ocr_result["length"],
458
- "ocr_info": {
459
- "model": ocr_result.get("model", "unknown"),
460
- "processing_time": round(ocr_time, 2)
461
- },
462
- "screenshot_info": {
463
- "method": screenshot_result.get("method", "none"),
464
- "size_bytes": screenshot_result.get("size", 0),
465
- "is_real_screenshot": screenshot_result.get("is_real_screenshot", False)
466
- }
467
- }
468
- else:
469
- return {
470
- "success": False,
471
- "url": url,
472
- "error": f"OCR failed: {ocr_result.get('error', 'Unknown error')}",
473
- "execution_time": round(total_time, 2)
474
- }
475
-
476
- def _extract_direct_html(self, url: str) -> str:
477
- """Extract text directly from HTML (fastest method)"""
478
- try:
479
- headers = {'User-Agent': 'Mozilla/5.0'}
480
- response = requests.get(url, headers=headers, timeout=10)
481
 
482
- if response.status_code != 200:
483
- return ""
 
 
 
484
 
485
- html = response.text
 
 
 
 
486
 
487
- if BEAUTIFULSOUP_AVAILABLE:
488
- return self._extract_with_bs4(html)
489
- else:
490
- return self._extract_simple(html)
491
-
492
  except Exception as e:
493
- print(f"Direct extraction error: {e}")
494
- return ""
495
 
496
- def _extract_with_bs4(self, html: str) -> str:
497
- """Extract text using BeautifulSoup"""
498
- soup = BeautifulSoup(html, 'html.parser')
499
-
500
- # Remove unwanted elements
501
- for tag in soup(["script", "style", "nav", "footer", "header", "iframe", "aside"]):
502
- tag.decompose()
503
-
504
- # Get text from main content areas
505
- text_parts = []
506
-
507
- # Try various content selectors
508
- content_selectors = [
509
- 'article', 'main', '.content', '.post-content', '.article-content',
510
- '#content', '.entry-content', '.story-content', '.text'
511
- ]
512
-
513
- for selector in content_selectors:
514
- elements = soup.select(selector)
515
- if elements:
516
- for elem in elements[:2]: # Take first 2 matching elements
517
- text_parts.append(elem.get_text())
518
-
519
- # Fallback to body
520
- if not text_parts and soup.body:
521
- text_parts.append(soup.body.get_text())
522
-
523
- # Combine and clean
524
- combined = ' '.join(text_parts)
525
- return self._clean_text(combined)
526
-
527
- def _extract_simple(self, html: str) -> str:
528
  """Simple text extraction without BeautifulSoup"""
529
  # Remove scripts and styles
530
  html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
531
  html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
532
 
533
- # Remove HTML tags
 
 
 
534
  text = re.sub(r'<[^>]+>', ' ', html)
535
 
536
- # Remove extra whitespace
537
- text = re.sub(r'\s+', ' ', text)
 
538
 
539
- return text.strip()
540
 
541
  def _clean_text(self, text: str) -> str:
542
  """Clean extracted text"""
543
- # Remove extra whitespace and normalize
544
  text = re.sub(r'\s+', ' ', text)
 
545
  # Remove control characters
546
  text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
 
 
 
 
547
  return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
 
549
  # ==============================================
550
- # INITIALIZE AND SETUP API
551
  # ==============================================
552
 
553
- scraper = WebScraper()
 
554
 
555
  # Create FastAPI app
556
- fastapi_app = FastAPI(
557
- title="Web Scraper API",
558
- description="Extract text from webpages using direct HTML parsing or OCR",
559
- version="2.0"
560
- )
561
-
562
- # CORS
563
- from fastapi.middleware.cors import CORSMiddleware
564
- fastapi_app.add_middleware(
565
- CORSMiddleware,
566
- allow_origins=["*"],
567
- allow_credentials=True,
568
- allow_methods=["*"],
569
- allow_headers=["*"],
570
  )
571
 
572
- @fastapi_app.get("/")
573
  async def root():
574
  return {
575
  "service": "Web Scraper API",
576
- "version": "2.0",
577
- "description": "Extracts text from webpages. Uses direct HTML parsing (preferred) or screenshot+OCR.",
578
  "endpoints": {
579
- "GET /": "This info",
580
  "GET /health": "Health check",
581
- "POST /api/scrape": "Main scraping endpoint"
582
  },
583
  "usage": {
584
- "curl": 'curl -X POST "YOUR_URL/api/scrape" -H "Content-Type: application/json" -d \'{"url":"https://example.com"}\'',
585
- "n8n": 'HTTP Request node: POST to /api/scrape with JSON body: {"url": "{{$json.url}}"}'
586
  }
587
  }
588
 
589
- @fastapi_app.get("/health")
590
  async def health():
591
  return {
592
  "status": "healthy",
593
  "timestamp": time.time(),
594
  "features": {
595
- "direct_html": True,
596
- "ocr": TRANSFORMERS_AVAILABLE,
597
- "html_parsing": BEAUTIFULSOUP_AVAILABLE
598
  }
599
  }
600
 
601
- @fastapi_app.post("/api/scrape")
602
- async def api_scrape(data: dict):
603
  """Main API endpoint for n8n"""
604
  try:
605
- url = data.get("url", "")
 
 
606
  if not url:
607
- return {"success": False, "error": "URL is required"}
 
 
 
 
608
 
609
- print(f"\nπŸ“¨ API Request: {url}")
610
  result = scraper.scrape(url)
 
611
  return result
612
 
 
 
 
 
 
 
613
  except Exception as e:
614
  print(f"❌ API Error: {e}")
615
  return {
616
  "success": False,
617
- "error": str(e),
618
- "url": data.get("url", "unknown")
619
  }
620
 
621
  # ==============================================
@@ -623,64 +347,79 @@ async def api_scrape(data: dict):
623
  # ==============================================
624
 
625
  def gradio_scrape(url: str):
626
- """Gradio interface"""
627
  if not url:
628
- return "❌ Enter a URL", {}
629
 
 
630
  result = scraper.scrape(url)
631
 
632
- if result["success"]:
633
- output = f"## βœ… Success!\n\n"
634
- output += f"**URL:** {result['url']}\n"
635
- output += f"**Method:** {result.get('method_used', 'unknown')}\n"
636
- output += f"**Time:** {result['execution_time']}s\n"
637
- output += f"**Text Length:** {result['text_length']:,} characters\n\n"
638
 
639
- if result.get('extracted_text'):
640
- preview = result['extracted_text'][:500]
641
- if len(result['extracted_text']) > 500:
642
- preview += "..."
643
- output += f"**Preview:**\n{preview}"
644
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
  return output, result
646
  else:
647
- return f"## ❌ Error\n\n{result.get('error', 'Unknown')}", result
 
648
 
649
- gradio_app = gr.Interface(
 
650
  fn=gradio_scrape,
651
  inputs=gr.Textbox(
652
  label="Website URL",
653
- placeholder="https://en.wikipedia.org/wiki/Artificial_intelligence",
654
- value="https://en.wikipedia.org/wiki/Artificial_intelligence"
655
  ),
656
  outputs=[
657
  gr.Markdown(label="Result"),
658
  gr.JSON(label="API Response")
659
  ],
660
  title="🌐 Web Scraper for n8n",
661
- description="Extract text from webpages. Perfect for n8n workflows!",
662
  examples=[
663
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
664
- ["https://news.ycombinator.com"],
665
  ["https://example.com"],
666
  ["https://httpbin.org/html"]
667
- ]
 
668
  )
669
 
670
- # Mount Gradio
671
- app = gr.mount_gradio_app(fastapi_app, gradio_app, path="/")
672
 
673
  # ==============================================
674
- # START APPLICATION
675
  # ==============================================
676
 
677
  if __name__ == "__main__":
 
 
678
  print("\n" + "="*60)
679
  print("πŸš€ Web Scraper API Starting")
680
  print("="*60)
681
  print(f"πŸ“ Direct HTML: Enabled")
682
  print(f"πŸ” OCR Available: {TRANSFORMERS_AVAILABLE}")
683
- print(f"πŸ§ͺ HTML Parsing: {BEAUTIFULSOUP_AVAILABLE}")
684
  print("="*60 + "\n")
685
 
686
  uvicorn.run(
 
1
  # ==============================================
2
+ # WEB SCRAPER FOR N8N - HUGGING FACE SPACES VERSION
3
  # ==============================================
4
 
5
  import gradio as gr
6
  import requests
 
7
  import json
8
  import time
 
 
 
9
  import re
10
+ import textwrap
11
+ from typing import Dict, Any, Optional
12
+ from fastapi import FastAPI, Request
13
+ from io import BytesIO
14
+ from PIL import Image, ImageDraw, ImageFont
15
+
16
+ # ==============================================
17
+ # IMPORTS WITH FALLBACKS
18
+ # ==============================================
19
 
20
+ # Try to import optional dependencies
21
  try:
22
  from bs4 import BeautifulSoup
23
  BEAUTIFULSOUP_AVAILABLE = True
24
  except ImportError:
25
  BEAUTIFULSOUP_AVAILABLE = False
26
+ print("⚠️ BeautifulSoup not available - using simple HTML parsing")
27
 
28
  try:
29
+ import torch
30
  from transformers import pipeline
31
  TRANSFORMERS_AVAILABLE = True
32
  except ImportError:
33
  TRANSFORMERS_AVAILABLE = False
34
+ print("⚠️ Transformers not available - OCR disabled")
35
 
36
  # ==============================================
37
+ # SIMPLE WEB SCRAPER (NO COMPLEX DEPENDENCIES)
38
  # ==============================================
39
 
40
+ class SimpleWebScraper:
41
+ """Lightweight web scraper optimized for Hugging Face Spaces"""
42
 
43
  def __init__(self):
44
+ self.user_agent = (
45
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
46
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
47
+ "Chrome/120.0.0.0 Safari/537.36"
48
+ )
49
+ self.ocr_processor = None
 
 
 
 
 
 
 
 
50
 
51
+ def scrape(self, url: str) -> Dict[str, Any]:
52
+ """Main scraping function"""
53
+ start_time = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ print(f"🌐 Scraping: {url}")
56
 
57
+ # Ensure URL has protocol
58
+ if not url.startswith(('http://', 'https://')):
59
+ url = 'https://' + url
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
 
 
 
 
61
  try:
62
+ # Method 1: Direct HTML extraction (fastest and most reliable)
63
+ html_result = self._extract_direct_html(url)
 
64
 
65
+ if html_result.get("success") and html_result.get("text_length", 0) > 50:
66
+ total_time = time.time() - start_time
67
+
68
+ return {
69
+ "success": True,
70
+ "url": url,
71
+ "execution_time": round(total_time, 2),
72
+ "method_used": "direct_html",
73
+ "extracted_text": html_result["text"],
74
+ "text_length": html_result["text_length"],
75
+ "metadata": html_result.get("metadata", {}),
76
+ "notes": "Text extracted directly from HTML (most accurate)"
77
+ }
78
 
79
+ # Method 2: If direct extraction fails, try alternative
80
+ print("Direct extraction limited, trying enhanced method...")
81
+ enhanced_result = self._enhanced_extraction(url)
82
 
83
+ total_time = time.time() - start_time
 
 
 
 
84
 
85
+ if enhanced_result.get("success"):
86
+ enhanced_result["execution_time"] = round(total_time, 2)
87
+ return enhanced_result
88
 
89
+ # Final fallback
90
  return {
91
+ "success": False,
92
+ "url": url,
93
+ "error": "Failed to extract meaningful content",
94
+ "execution_time": round(total_time, 2)
 
 
95
  }
96
 
97
  except Exception as e:
98
+ return {
99
+ "success": False,
100
+ "url": url,
101
+ "error": str(e),
102
+ "execution_time": round(time.time() - start_time, 2)
103
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
+ def _extract_direct_html(self, url: str) -> Dict[str, Any]:
106
+ """Extract text directly from HTML"""
 
 
 
 
107
  try:
108
+ headers = {
109
+ 'User-Agent': self.user_agent,
110
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
111
+ 'Accept-Language': 'en-US,en;q=0.5',
112
+ 'Accept-Encoding': 'gzip, deflate',
113
+ 'Connection': 'keep-alive',
114
+ }
115
 
116
+ response = requests.get(url, headers=headers, timeout=15)
117
+ response.raise_for_status()
 
 
 
 
118
 
119
+ # Get encoding
120
+ if response.encoding is None:
121
+ response.encoding = 'utf-8'
122
 
123
+ html_content = response.text
 
124
 
125
+ # Extract metadata
126
+ metadata = self._extract_metadata(html_content)
 
 
 
127
 
128
+ # Extract text
129
+ if BEAUTIFULSOUP_AVAILABLE:
130
+ text = self._extract_text_with_bs4(html_content)
 
 
 
 
131
  else:
132
+ text = self._extract_text_simple(html_content)
 
 
 
133
 
134
+ # Clean and truncate text
135
+ cleaned_text = self._clean_text(text)
136
 
137
  return {
138
  "success": True,
139
+ "text": cleaned_text[:15000], # Limit for API response
140
+ "text_length": len(cleaned_text),
141
+ "metadata": metadata,
142
+ "http_status": response.status_code
143
  }
144
 
145
+ except requests.exceptions.RequestException as e:
146
+ print(f"Request error: {e}")
147
+ return {"success": False, "error": f"Request failed: {str(e)}"}
148
  except Exception as e:
149
+ print(f"Extraction error: {e}")
150
  return {"success": False, "error": str(e)}
151
 
152
+ def _extract_metadata(self, html: str) -> Dict[str, Any]:
153
+ """Extract basic metadata from HTML"""
154
+ metadata = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
+ # Extract title
157
+ title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
158
+ if title_match:
159
+ metadata['title'] = re.sub(r'\s+', ' ', title_match.group(1)).strip()[:200]
160
+
161
+ # Extract meta description
162
+ desc_match = re.search(r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
163
+ html, re.IGNORECASE)
164
+ if desc_match:
165
+ metadata['description'] = desc_match.group(1)[:300]
166
+
167
+ # Extract meta keywords
168
+ keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
169
+ html, re.IGNORECASE)
170
+ if keywords_match:
171
+ metadata['keywords'] = keywords_match.group(1)[:300]
172
+
173
+ return metadata
174
+
175
+ def _extract_text_with_bs4(self, html: str) -> str:
176
+ """Extract text using BeautifulSoup if available"""
177
+ try:
178
+ soup = BeautifulSoup(html, 'html.parser')
179
 
180
+ # Remove unwanted elements
181
+ for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside',
182
+ 'iframe', 'noscript', 'svg', 'form']):
183
+ tag.decompose()
184
 
185
+ # Try to find main content
186
+ main_text = ""
187
 
188
+ # Common content selectors
189
+ content_selectors = [
190
+ 'main', 'article', '.content', '.post-content', '.article-content',
191
+ '.entry-content', '.story-content', '.text-content', '#content',
192
+ '.main-content', '.blog-content', '.page-content'
193
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
+ for selector in content_selectors:
196
+ elements = soup.select(selector)
197
+ if elements:
198
+ for elem in elements[:3]: # Take first 3 matching
199
+ main_text += elem.get_text() + "\n\n"
200
 
201
+ # If still no content, use body
202
+ if not main_text.strip() and soup.body:
203
+ main_text = soup.body.get_text()
204
+
205
+ return main_text
206
 
 
 
 
 
 
207
  except Exception as e:
208
+ print(f"BeautifulSoup error: {e}")
209
+ return self._extract_text_simple(html)
210
 
211
+ def _extract_text_simple(self, html: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  """Simple text extraction without BeautifulSoup"""
213
  # Remove scripts and styles
214
  html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
215
  html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
216
 
217
+ # Remove HTML comments
218
+ html = re.sub(r'<!--.*?-->', ' ', html, flags=re.DOTALL)
219
+
220
+ # Remove HTML tags but keep text
221
  text = re.sub(r'<[^>]+>', ' ', html)
222
 
223
+ # Decode HTML entities
224
+ import html as html_module
225
+ text = html_module.unescape(text)
226
 
227
+ return text
228
 
229
  def _clean_text(self, text: str) -> str:
230
  """Clean extracted text"""
231
+ # Replace multiple whitespace
232
  text = re.sub(r'\s+', ' ', text)
233
+
234
  # Remove control characters
235
  text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
236
+
237
+ # Remove excessive line breaks
238
+ text = re.sub(r'\n{3,}', '\n\n', text)
239
+
240
  return text.strip()
241
+
242
+ def _enhanced_extraction(self, url: str) -> Dict[str, Any]:
243
+ """Enhanced extraction with fallback methods"""
244
+ try:
245
+ # Try with different headers
246
+ headers = {
247
+ 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
248
+ 'Accept': 'text/html',
249
+ }
250
+
251
+ response = requests.get(url, headers=headers, timeout=15)
252
+
253
+ if response.status_code == 200:
254
+ text = self._extract_text_simple(response.text)
255
+ cleaned = self._clean_text(text)
256
+
257
+ if len(cleaned) > 100:
258
+ return {
259
+ "success": True,
260
+ "text": cleaned[:15000],
261
+ "text_length": len(cleaned),
262
+ "method_used": "enhanced_direct",
263
+ "notes": "Extracted with Googlebot user-agent"
264
+ }
265
+
266
+ return {"success": False, "error": "Enhanced extraction failed"}
267
+
268
+ except Exception as e:
269
+ return {"success": False, "error": str(e)}
270
 
271
  # ==============================================
272
+ # FASTAPI APPLICATION
273
  # ==============================================
274
 
275
# Initialize scraper
# Single module-level instance shared by both the API routes and the
# Gradio UI below.
scraper = SimpleWebScraper()

# Create FastAPI app
app = FastAPI(
    title="Web Scraper API for n8n",
    description="Lightweight web scraper optimized for Hugging Face Spaces",
    version="1.0"
)
284
 
285
+ @app.get("/")
286
  async def root():
287
  return {
288
  "service": "Web Scraper API",
289
+ "version": "1.0",
290
+ "description": "Extract text content from webpages",
291
  "endpoints": {
292
+ "GET /": "This information",
293
  "GET /health": "Health check",
294
+ "POST /scrape": "Main scraping endpoint"
295
  },
296
  "usage": {
297
+ "curl": 'curl -X POST "https://your-space.hf.space/scrape" -H "Content-Type: application/json" -d \'{"url":"https://example.com"}\'',
298
+ "n8n": "Use HTTP Request node with POST method to /scrape endpoint"
299
  }
300
  }
301
 
302
+ @app.get("/health")
303
  async def health():
304
  return {
305
  "status": "healthy",
306
  "timestamp": time.time(),
307
  "features": {
308
+ "html_parsing": BEAUTIFULSOUP_AVAILABLE,
309
+ "ocr": TRANSFORMERS_AVAILABLE
 
310
  }
311
  }
312
 
313
# `Request` is referenced in the handler signature below; the visible import
# section only pulls in FastAPI, so import it here (idempotent if the top of
# the file already imports it).
from fastapi import Request

@app.post("/scrape")
async def api_scrape(request: Request):
    """Main API endpoint for n8n.

    Expects a JSON body like {"url": "https://example.com"} and returns the
    scraper's result dict. Errors are reported in-band with success=False
    rather than as HTTP error codes, which keeps n8n workflows simple.
    """
    try:
        data = await request.json()
        # Tolerate missing, None, or non-string "url" values from callers.
        url = str(data.get("url") or "").strip()

        if not url:
            return {
                "success": False,
                "error": "URL parameter is required",
                "example": {"url": "https://example.com"}
            }

        print(f"📨 API Request received for URL: {url}")
        result = scraper.scrape(url)

        return result

    except json.JSONDecodeError:
        # Body was not valid JSON at all.
        return {
            "success": False,
            "error": "Invalid JSON payload",
            "example": {"url": "https://example.com"}
        }
    except Exception as e:
        # Catch-all boundary: log and report without crashing the server.
        print(f"❌ API Error: {e}")
        return {
            "success": False,
            "error": f"Internal server error: {str(e)}"
        }
344
 
345
  # ==============================================
 
347
  # ==============================================
348
 
349
def gradio_scrape(url: str):
    """Gradio interface function: scrape *url* and format a Markdown summary.

    Returns a (markdown_text, result_dict) pair for the two Gradio outputs.
    """
    if not url:
        return "❌ Please enter a URL", {}

    print(f"🎨 Gradio interface scraping: {url}")
    result = scraper.scrape(url)

    if result.get("success"):
        text = result.get("extracted_text", "")
        text_length = result.get("text_length", 0)

        # Create preview (first 500 chars, with ellipsis when truncated)
        preview = text[:500]
        if len(text) > 500:
            preview += "..."

        # Use .get() throughout — a scraper result missing "url" previously
        # raised KeyError here while every other field already used .get().
        output = f"""
## ✅ Success!

**URL:** {result.get('url', url)}
**Method:** {result.get('method_used', 'direct_html')}
**Time:** {result.get('execution_time', 0)}s
**Characters:** {text_length:,}

### Preview:
{preview}

### Full Response:
Check the JSON output for complete data.
"""
        return output, result
    else:
        error_msg = result.get("error", "Unknown error")
        return f"## ❌ Error\n\n{error_msg}", result
384
 
385
# Create Gradio interface
# NOTE(review): `allow_flagging` is deprecated in Gradio 4.x in favour of
# `flagging_mode`; keep as-is unless the Space's Gradio version is confirmed.
gradio_interface = gr.Interface(
    fn=gradio_scrape,
    inputs=gr.Textbox(
        label="Website URL",
        placeholder="Enter a URL (e.g., https://example.com)",
        lines=1
    ),
    outputs=[
        gr.Markdown(label="Result"),
        gr.JSON(label="API Response")
    ],
    title="🌐 Web Scraper for n8n",
    description="Extract text content from webpages. Use with n8n HTTP Request node.",
    examples=[
        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
        ["https://example.com"],
        ["https://httpbin.org/html"]
    ],
    allow_flagging="never"
)

# Mount Gradio to FastAPI at the root path; the API routes registered
# above (/health, /scrape) keep working alongside the UI.
app = gr.mount_gradio_app(app, gradio_interface, path="/")
409
 
410
  # ==============================================
411
+ # APPLICATION ENTRY POINT
412
  # ==============================================
413
 
414
  if __name__ == "__main__":
415
+ import uvicorn
416
+
417
  print("\n" + "="*60)
418
  print("πŸš€ Web Scraper API Starting")
419
  print("="*60)
420
  print(f"πŸ“ Direct HTML: Enabled")
421
  print(f"πŸ” OCR Available: {TRANSFORMERS_AVAILABLE}")
422
+ print(f"πŸ§ͺ BeautifulSoup: {BEAUTIFULSOUP_AVAILABLE}")
423
  print("="*60 + "\n")
424
 
425
  uvicorn.run(