yukee1992 commited on
Commit
383cb78
Β·
verified Β·
1 Parent(s): 2cfb68a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +472 -559
app.py CHANGED
@@ -1,6 +1,5 @@
1
  # ==============================================
2
- # SCREENSHOT SCRAPER FOR N8N
3
- # Complete version with all improvements - GRADIO COMPATIBLE
4
  # ==============================================
5
 
6
  import gradio as gr
@@ -11,561 +10,556 @@ import time
11
  from io import BytesIO
12
  from PIL import Image, ImageEnhance, ImageDraw, ImageFont
13
  import textwrap
14
- from typing import Optional, Dict, Any
 
15
  from fastapi import FastAPI
16
  import uvicorn
17
 
18
- # Import BeautifulSoup for HTML parsing
19
  try:
20
  from bs4 import BeautifulSoup
21
  BEAUTIFULSOUP_AVAILABLE = True
22
  except ImportError:
23
  BEAUTIFULSOUP_AVAILABLE = False
24
- print("BeautifulSoup not available, HTML fallback limited")
25
 
26
- # Import transformers for OCR
27
  try:
28
  from transformers import pipeline
29
  TRANSFORMERS_AVAILABLE = True
30
  except ImportError:
31
  TRANSFORMERS_AVAILABLE = False
32
- print("Transformers not available, OCR disabled")
33
 
34
  # ==============================================
35
- # CONFIGURATION
36
  # ==============================================
37
 
38
- class Config:
39
- """Configuration settings"""
40
- OCR_MODELS = [
41
- "microsoft/trocr-base-printed", # Best for printed text
42
- "microsoft/trocr-small-printed", # Smaller, faster
43
- "facebook/nougat-base", # Good for documents
44
- ]
45
- DEFAULT_MODEL = "microsoft/trocr-base-printed"
46
-
47
- SCREENSHOT_TIMEOUT = 20
48
- MAX_IMAGE_SIZE = 1600 # pixels
49
- TEXT_LIMIT = 10000 # characters
50
-
51
- USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
52
-
53
- # ==============================================
54
- # OCR PROCESSOR
55
- # ==============================================
56
-
57
- class OCRProcessor:
58
- """Handles text extraction from images"""
59
 
60
  def __init__(self):
61
- self.processor = None
62
- self.loaded_model = None
63
 
64
- def load_model(self, model_name: str = None):
65
- """Load OCR model with fallbacks"""
66
- if not TRANSFORMERS_AVAILABLE:
67
- print("⚠️ Transformers library not available")
68
- return None
69
 
70
- if model_name is None:
71
- model_name = Config.DEFAULT_MODEL
 
72
 
73
- try:
74
- print(f"πŸ”„ Loading OCR model: {model_name}")
75
- self.processor = pipeline(
76
- "image-to-text",
77
- model=model_name,
78
- device=-1 # Use CPU
79
- )
80
- self.loaded_model = model_name
81
- print(f"βœ… OCR model loaded: {model_name}")
82
- return self.processor
83
- except Exception as e:
84
- print(f"❌ Failed to load {model_name}: {str(e)[:100]}")
85
-
86
- # Try fallback models
87
- for fallback_model in Config.OCR_MODELS:
88
- if fallback_model != model_name:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  try:
90
- print(f"πŸ”„ Trying fallback model: {fallback_model}")
91
- self.processor = pipeline(
92
- "image-to-text",
93
- model=fallback_model,
94
- device=-1
95
- )
96
- self.loaded_model = fallback_model
97
- print(f"βœ… Fallback OCR model loaded: {fallback_model}")
98
- return self.processor
 
99
  except:
100
  continue
101
-
102
- print("❌ Could not load any OCR model")
103
- return None
104
-
105
- def extract_text(self, image_bytes: bytes) -> Dict[str, Any]:
106
- """Extract text from image with preprocessing"""
107
- if not self.processor:
108
- if not self.load_model():
109
- return {"success": False, "error": "OCR not available"}
110
 
 
 
 
 
111
  try:
112
- # Convert bytes to image
113
- image = Image.open(BytesIO(image_bytes))
114
- print(f"πŸ“· Original image: size={image.size}, mode={image.mode}")
115
 
116
- # Preprocess image
117
- image = self._preprocess_image(image)
118
 
119
- # Extract text
120
- print("πŸ” Starting OCR processing...")
121
- start_time = time.time()
122
- result = self.processor(image)
123
- ocr_time = time.time() - start_time
124
- print(f"⏱️ OCR completed in {ocr_time:.2f}s")
125
-
126
- # Extract text from result
127
- text = self._extract_text_from_result(result)
128
 
129
- # Clean text
130
- text = self._clean_text(text)
131
-
132
- print(f"πŸ“Š Extracted {len(text)} characters")
 
133
 
134
- if len(text) < 10:
135
- print("⚠️ Warning: Very short text extracted")
136
- if len(text) > 0:
137
- print(f"πŸ“ Text: '{text}'")
138
 
139
  return {
140
  "success": True,
141
- "text": text,
142
- "length": len(text),
143
- "ocr_time": ocr_time,
144
- "model_used": self.loaded_model
 
145
  }
146
 
147
  except Exception as e:
148
- print(f"❌ OCR error: {str(e)}")
149
- import traceback
150
- traceback.print_exc()
151
- return {"success": False, "error": str(e)}
152
-
153
- def _preprocess_image(self, image: Image.Image) -> Image.Image:
154
- """Preprocess image for better OCR results"""
155
- try:
156
- # Convert to RGB if needed
157
- if image.mode != 'RGB':
158
- image = image.convert('RGB')
159
-
160
- # Resize if too large (improves OCR speed/accuracy)
161
- max_dimension = Config.MAX_IMAGE_SIZE
162
- if max(image.size) > max_dimension:
163
- ratio = max_dimension / max(image.size)
164
- new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
165
- image = image.resize(new_size, Image.Resampling.LANCZOS)
166
- print(f"πŸ”„ Resized to: {new_size}")
167
-
168
- # Enhance image
169
- enhancer = ImageEnhance.Contrast(image)
170
- image = enhancer.enhance(1.3) # Increase contrast
171
-
172
- enhancer = ImageEnhance.Sharpness(image)
173
- image = enhancer.enhance(1.2) # Increase sharpness
174
-
175
- return image
176
-
177
- except Exception as e:
178
- print(f"⚠️ Preprocessing error: {e}")
179
- return image
180
 
181
- def _extract_text_from_result(self, result) -> str:
182
- """Extract text from OCR result object"""
183
- if isinstance(result, list):
184
- if len(result) > 0:
185
- if isinstance(result[0], dict):
186
- return result[0].get('generated_text', '')
187
- else:
188
- return str(result[0])
189
- elif isinstance(result, dict):
190
- return result.get('generated_text', '')
191
 
192
- return str(result)
193
-
194
- def _clean_text(self, text: str) -> str:
195
- """Clean extracted text"""
196
- # Remove excessive whitespace
197
- lines = [line.strip() for line in text.splitlines()]
198
- text = ' '.join(line for line in lines if line)
199
 
200
- # Remove multiple spaces
201
- import re
202
- text = re.sub(r'\s+', ' ', text)
203
 
204
- return text.strip()
205
-
206
- # ==============================================
207
- # SCREENSHOT CAPTURER
208
- # ==============================================
209
-
210
- class ScreenshotCapturer:
211
- """Captures screenshots using multiple methods"""
212
-
213
- def __init__(self):
214
- self.headers = {
215
- 'User-Agent': Config.USER_AGENT,
216
- 'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
217
- 'Accept-Language': 'en-US,en;q=0.9',
218
- }
219
-
220
- def capture(self, url: str) -> Dict[str, Any]:
221
- """Capture screenshot using multiple methods"""
222
- print(f"\n{'='*60}")
223
- print(f"πŸ“Έ Capturing screenshot for: {url}")
224
- print(f"{'='*60}")
225
 
226
- # Ensure URL has protocol
227
- if not url.startswith('http'):
228
- url = 'https://' + url
229
 
230
- # Try multiple methods
231
- methods = [
232
- self._capture_via_api,
233
- self._capture_via_html,
234
- self._create_fallback_image
235
- ]
236
 
237
- for method in methods:
238
- result = method(url)
239
- if result.get("success", False):
240
- print(f"βœ… Screenshot captured via {result.get('method', 'unknown')}")
241
- print(f"πŸ“¦ Size: {result.get('size', 0)} bytes")
242
- return result
243
 
244
- return {
245
- "success": False,
246
- "error": "All screenshot methods failed"
247
- }
248
 
249
- def _capture_via_api(self, url: str) -> Dict[str, Any]:
250
- """Capture screenshot using external APIs"""
251
- # List of free screenshot APIs
252
- apis = [
253
- # WordPress screenshot service (good for most sites)
254
- {
255
- "url": f"https://s0.wp.com/mshots/v1/{url}?w=1280&h=720",
256
- "name": "wordpress_mshots"
257
- },
258
- # PagePeeker
259
- {
260
- "url": f"https://api.pagepeeker.com/v2/thumbs.php?size=m&url={url}",
261
- "name": "pagepeeker"
262
- },
263
- # Screenshot Machine (free tier with watermark)
264
- {
265
- "url": f"https://api.screenshotmachine.com/?key=demo&url={url}&dimension=1024x768",
266
- "name": "screenshotmachine"
267
- },
268
- # WebShot (alternative)
269
- {
270
- "url": f"https://r.jina.ai/http://{url}?format=screenshot&width=1200",
271
- "name": "jina_screenshot"
272
- }
273
- ]
274
 
275
- for api in apis:
276
- try:
277
- print(f"πŸ”„ Trying API: {api['name']}")
278
- response = requests.get(
279
- api["url"],
280
- headers=self.headers,
281
- timeout=Config.SCREENSHOT_TIMEOUT
282
- )
283
-
284
- if response.status_code == 200:
285
- content = response.content
286
-
287
- # Validate it's a reasonable image
288
- if len(content) > 10000: # At least 10KB
289
- # Verify it's a valid image
290
- try:
291
- img = Image.open(BytesIO(content))
292
- img.verify()
293
-
294
- return {
295
- "success": True,
296
- "image_bytes": content,
297
- "base64": base64.b64encode(content).decode('utf-8'),
298
- "size": len(content),
299
- "method": f"api_{api['name']}",
300
- "image_format": img.format
301
- }
302
- except:
303
- print(f"⚠️ Invalid image from {api['name']}")
304
- continue
305
- else:
306
- print(f"⚠️ {api['name']} returned {response.status_code}")
307
-
308
- except Exception as e:
309
- print(f"⚠️ {api['name']} failed: {str(e)[:50]}")
310
- continue
311
 
312
- return {"success": False, "error": "All APIs failed"}
313
 
314
- def _capture_via_html(self, url: str) -> Dict[str, Any]:
315
- """Create screenshot by rendering HTML content"""
316
- if not BEAUTIFULSOUP_AVAILABLE:
317
- return {"success": False, "error": "BeautifulSoup not available"}
318
-
319
- try:
320
- print("πŸ”„ Trying HTML-based capture...")
321
-
322
- # Fetch HTML content
323
- response = requests.get(url, headers=self.headers, timeout=10)
324
-
325
- if response.status_code == 200:
326
- # Parse HTML
327
- soup = BeautifulSoup(response.text, 'html.parser')
328
-
329
- # Get title
330
- title = soup.title.string if soup.title else "No title"
331
-
332
- # Remove unwanted elements
333
- for element in soup(["script", "style", "nav", "footer", "header", "iframe"]):
334
- element.decompose()
335
-
336
- # Get main content
337
- main_content = ""
338
-
339
- # Try to find main content
340
- for tag in ['article', 'main', 'div.content', 'div.post-content']:
341
- element = soup.select_one(tag)
342
- if element:
343
- main_content = element.get_text()
344
- break
345
-
346
- # Fallback to body text
347
- if not main_content:
348
- main_content = soup.body.get_text() if soup.body else soup.get_text()
349
-
350
- # Clean text
351
- lines = (line.strip() for line in main_content.splitlines())
352
- chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
353
- cleaned_text = ' '.join(chunk for chunk in chunks if chunk)
354
-
355
- # Limit text length
356
- text_to_render = cleaned_text[:5000]
357
-
358
- # Create image from text
359
- image_bytes = self._create_text_image(url, title, text_to_render)
360
-
361
- return {
362
- "success": True,
363
- "image_bytes": image_bytes,
364
- "base64": base64.b64encode(image_bytes).decode('utf-8'),
365
- "size": len(image_bytes),
366
- "method": "html_render",
367
- "note": "Created from HTML content"
368
- }
369
- else:
370
- return {"success": False, "error": f"HTTP {response.status_code}"}
371
-
372
- except Exception as e:
373
- print(f"⚠️ HTML capture failed: {str(e)}")
374
- return {"success": False, "error": str(e)}
375
 
376
- def _create_text_image(self, url: str, title: str, text: str) -> bytes:
377
- """Create an image with text content"""
378
  # Create image
379
- img = Image.new('RGB', (1200, 800), color='white')
 
380
  draw = ImageDraw.Draw(img)
381
 
382
- # Try to load font
383
  try:
384
- font = ImageFont.truetype("arial.ttf", 24)
385
- font_small = ImageFont.truetype("arial.ttf", 18)
 
386
  except:
387
- font = ImageFont.load_default()
 
388
  font_small = ImageFont.load_default()
389
 
390
- # Draw URL and title
391
- draw.text((50, 30), f"πŸ“„ URL: {url}", fill='blue', font=font)
392
- draw.text((50, 70), f"🏷️ Title: {title[:80]}", fill='black', font=font)
 
 
 
393
 
394
- # Draw separator
395
- draw.line([(50, 110), (1150, 110)], fill='gray', width=2)
 
396
 
397
- # Draw text content
398
- y_offset = 130
399
- for line in textwrap.wrap(text, width=80):
400
- if y_offset < 750:
401
  draw.text((50, y_offset), line, fill='black', font=font_small)
402
- y_offset += 25
403
  else:
404
- draw.text((50, y_offset), "... (text truncated)", fill='gray', font=font_small)
 
405
  break
406
 
407
- # Add watermark
408
- draw.text((1000, 770), "Generated by Screenshot Scraper", fill='lightgray', font=font_small)
 
 
409
 
410
  # Convert to bytes
411
  img_byte_arr = BytesIO()
412
- img.save(img_byte_arr, format='PNG', optimize=True)
413
  return img_byte_arr.getvalue()
414
 
415
- def _create_fallback_image(self, url: str) -> Dict[str, Any]:
416
- """Create a simple fallback image"""
417
- print("πŸ”„ Creating fallback image...")
418
-
419
  img = Image.new('RGB', (800, 400), color='white')
420
  draw = ImageDraw.Draw(img)
421
 
422
- # Try to load font
423
  try:
424
  font = ImageFont.truetype("arial.ttf", 20)
425
  except:
426
  font = ImageFont.load_default()
427
 
428
- # Draw message
429
- draw.text((50, 50), "⚠️ Could not capture screenshot", fill='red', font=font)
430
  draw.text((50, 100), f"URL: {url[:100]}", fill='black', font=font)
431
- draw.text((50, 150), "Possible reasons:", fill='black', font=font)
432
- draw.text((80, 200), "β€’ Website blocks screenshots", fill='black', font=font)
433
- draw.text((80, 250), "β€’ Screenshot services are down", fill='black', font=font)
434
- draw.text((80, 300), "β€’ Try a different URL", fill='black', font=font)
435
 
436
- # Convert to bytes
437
  img_byte_arr = BytesIO()
438
  img.save(img_byte_arr, format='PNG')
439
  img_bytes = img_byte_arr.getvalue()
440
 
441
  return {
442
- "success": True,
443
  "image_bytes": img_bytes,
444
- "base64": base64.b64encode(img_bytes).decode('utf-8'),
445
  "size": len(img_bytes),
446
- "method": "fallback",
447
- "note": "Fallback image created"
 
448
  }
449
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  # ==============================================
451
  # MAIN SCRAPER
452
  # ==============================================
453
 
454
- class ScreenshotScraper:
455
- """Main scraper class"""
456
 
457
  def __init__(self):
458
- self.screenshot_capturer = ScreenshotCapturer()
459
- self.ocr_processor = OCRProcessor()
460
- print("πŸš€ Screenshot Scraper initialized")
461
 
462
  def scrape(self, url: str) -> Dict[str, Any]:
463
- """Main scraping function"""
464
  start_time = time.time()
465
 
466
  print(f"\n{'='*60}")
467
- print(f"🎯 Starting scrape: {url}")
468
  print(f"{'='*60}")
469
 
470
- # Step 1: Capture screenshot
471
- print("πŸ“Έ Step 1: Capturing screenshot...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  screenshot_start = time.time()
473
  screenshot_result = self.screenshot_capturer.capture(url)
474
  screenshot_time = time.time() - screenshot_start
475
 
476
  if not screenshot_result.get("success", False):
477
- total_time = time.time() - start_time
478
- print(f"❌ Screenshot capture failed after {total_time:.2f}s")
479
  return {
480
  "success": False,
481
  "url": url,
482
- "error": screenshot_result.get("error", "Screenshot failed"),
483
- "execution_time": round(total_time, 2),
484
- "step": "screenshot"
485
  }
486
 
487
- print(f"βœ… Screenshot captured in {screenshot_time:.2f}s")
488
-
489
- # Step 2: Extract text with OCR
490
- print("\nπŸ” Step 2: Extracting text with OCR...")
491
  ocr_start = time.time()
492
  ocr_result = self.ocr_processor.extract_text(screenshot_result["image_bytes"])
493
  ocr_time = time.time() - ocr_start
494
 
495
- # Prepare response
496
- response = {
497
- "success": True,
498
- "url": url,
499
- "execution_time": round(time.time() - start_time, 2),
500
- "step_times": {
501
- "screenshot": round(screenshot_time, 2),
502
- "ocr": round(ocr_time, 2)
503
- },
504
- "screenshot_info": {
505
- "size_bytes": screenshot_result.get("size", 0),
506
- "method": screenshot_result.get("method", "unknown"),
507
- "available": True
508
- },
509
- "ocr_info": {
510
- "success": ocr_result.get("success", False),
511
- "model_used": ocr_result.get("model_used", "none"),
512
- "processing_time": round(ocr_time, 2)
513
- }
514
- }
515
 
516
- # Add OCR results
517
  if ocr_result["success"]:
518
- text = ocr_result["text"][:Config.TEXT_LIMIT] # Limit text length
519
- response["extracted_text"] = text
520
- response["text_length"] = len(text)
521
- print(f"βœ… OCR completed in {ocr_time:.2f}s")
522
- print(f"πŸ“Š Extracted {len(text)} characters")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  else:
524
- response["extracted_text"] = ""
525
- response["text_length"] = 0
526
- response["ocr_error"] = ocr_result.get("error", "Unknown OCR error")
527
- print(f"⚠️ OCR failed: {ocr_result.get('error', 'Unknown error')}")
528
-
529
- # Add screenshot preview if requested
530
- if screenshot_result.get("base64"):
531
- # Only include first 500 chars of base64 to reduce response size
532
- b64_preview = screenshot_result["base64"][:500]
533
- response["screenshot_preview"] = f"{b64_preview}..."
534
- response["has_screenshot_data"] = True
535
-
536
- # Log summary
537
- print(f"\n{'='*60}")
538
- print(f"πŸ“Š SCRAPING SUMMARY")
539
- print(f"{'='*60}")
540
- print(f"URL: {url}")
541
- print(f"Total time: {response['execution_time']}s")
542
- print(f"Screenshot: {response['screenshot_info']['method']} "
543
- f"({response['screenshot_info']['size_bytes']} bytes)")
544
- print(f"OCR: {response['ocr_info']['success']} "
545
- f"(model: {response['ocr_info']['model_used']})")
546
- print(f"Text length: {response['text_length']} characters")
547
- print(f"{'='*60}\n")
548
-
549
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
550
 
551
  # ==============================================
552
- # INITIALIZE GLOBAL INSTANCES
553
  # ==============================================
554
 
555
- scraper = ScreenshotScraper()
556
-
557
- # ==============================================
558
- # FASTAPI APPLICATION
559
- # ==============================================
560
 
561
  # Create FastAPI app
562
  fastapi_app = FastAPI(
563
- title="Screenshot Scraper API",
564
- description="AI-powered web scraper that takes screenshots and extracts text using OCR",
565
- version="2.0.0"
566
  )
567
 
568
- # CORS middleware
569
  from fastapi.middleware.cors import CORSMiddleware
570
  fastapi_app.add_middleware(
571
  CORSMiddleware,
@@ -577,103 +571,51 @@ fastapi_app.add_middleware(
577
 
578
  @fastapi_app.get("/")
579
  async def root():
580
- """Root endpoint with API information"""
581
  return {
582
- "service": "Screenshot Scraper API",
583
- "version": "2.0.0",
584
- "description": "AI-powered web scraper for n8n integration",
585
  "endpoints": {
586
- "GET /": "This information",
587
  "GET /health": "Health check",
588
- "POST /api/scrape": "Main scraping endpoint",
589
- "GET /api/info": "System information"
590
  },
591
  "usage": {
592
- "n8n": "Use HTTP Request node to POST to /api/scrape with JSON: {\"url\": \"https://example.com\"}",
593
- "curl": 'curl -X POST "https://[username]-screenshot-scraper.hf.space/api/scrape" -H "Content-Type: application/json" -d \'{"url":"https://example.com"}\''
594
  }
595
  }
596
 
597
  @fastapi_app.get("/health")
598
  async def health():
599
- """Health check endpoint"""
600
  return {
601
  "status": "healthy",
602
  "timestamp": time.time(),
603
- "services": {
 
604
  "ocr": TRANSFORMERS_AVAILABLE,
605
  "html_parsing": BEAUTIFULSOUP_AVAILABLE
606
  }
607
  }
608
 
609
- @fastapi_app.get("/api/info")
610
- async def api_info():
611
- """Get system information"""
612
- return {
613
- "ocr_available": TRANSFORMERS_AVAILABLE,
614
- "ocr_model": scraper.ocr_processor.loaded_model if scraper.ocr_processor.processor else "none",
615
- "html_parsing": BEAUTIFULSOUP_AVAILABLE,
616
- "config": {
617
- "default_ocr_model": Config.DEFAULT_MODEL,
618
- "max_image_size": Config.MAX_IMAGE_SIZE,
619
- "text_limit": Config.TEXT_LIMIT
620
- }
621
- }
622
-
623
  @fastapi_app.post("/api/scrape")
624
- async def api_scrape(data: dict = None, url: str = None):
625
- """
626
- Main scraping endpoint for n8n
627
-
628
- Request body (JSON):
629
- {
630
- "url": "https://example.com",
631
- "options": {
632
- "timeout": 30,
633
- "full_page": true
634
- }
635
- }
636
- """
637
  try:
638
- # Get URL from request
639
- target_url = None
640
-
641
- # Try different ways to get the URL
642
- if url:
643
- target_url = url
644
- elif data:
645
- if isinstance(data, dict):
646
- target_url = data.get("url")
647
- elif isinstance(data, str):
648
- # Try to parse as JSON
649
- try:
650
- data_dict = json.loads(data)
651
- target_url = data_dict.get("url")
652
- except:
653
- pass
654
-
655
- if not target_url:
656
- return {
657
- "success": False,
658
- "error": "URL parameter is required",
659
- "usage": "Send JSON: {\"url\": \"https://example.com\"}"
660
- }
661
 
662
- # Start scraping
663
- result = scraper.scrape(target_url)
664
  return result
665
 
666
  except Exception as e:
667
- import traceback
668
- error_details = traceback.format_exc()
669
- print(f"❌ API Error: {str(e)}")
670
- print(error_details)
671
-
672
  return {
673
  "success": False,
674
  "error": str(e),
675
- "url": target_url if 'target_url' in locals() else "unknown",
676
- "timestamp": time.time()
677
  }
678
 
679
  # ==============================================
@@ -681,93 +623,64 @@ async def api_scrape(data: dict = None, url: str = None):
681
  # ==============================================
682
 
683
  def gradio_scrape(url: str):
684
- """Gradio interface function"""
685
  if not url:
686
- return "❌ Please enter a URL", {"error": "No URL provided"}
687
 
688
- print(f"\n🌐 Gradio request for: {url}")
689
 
690
- try:
691
- # Call the scraper
692
- result = scraper.scrape(url)
693
-
694
- # Format output for Gradio
695
- if result["success"]:
696
- output = f"## βœ… Success!\n\n"
697
- output += f"**URL:** {result['url']}\n"
698
- output += f"**Total Time:** {result['execution_time']}s\n"
699
- output += f"**Screenshot Method:** {result['screenshot_info']['method']}\n"
700
- output += f"**Screenshot Size:** {result['screenshot_info']['size_bytes']} bytes\n"
701
- output += f"**OCR Model:** {result['ocr_info']['model_used']}\n"
702
- output += f"**Text Length:** {result['text_length']} characters\n\n"
703
-
704
- if result.get('extracted_text'):
705
- # Show first 1000 characters
706
- text_preview = result['extracted_text'][:1000]
707
- if len(result['extracted_text']) > 1000:
708
- text_preview += "..."
709
- output += f"**Extracted Text Preview:**\n\n{text_preview}"
710
-
711
- return output, result
712
- else:
713
- error_msg = f"## ❌ Error\n\n**URL:** {result.get('url', 'unknown')}\n\n"
714
- error_msg += f"**Error:** {result.get('error', 'Unknown error')}\n\n"
715
- error_msg += f"**Step:** {result.get('step', 'unknown')}"
716
- return error_msg, result
717
-
718
- except Exception as e:
719
- error_msg = f"## ❌ Unexpected Error\n\n{str(e)}"
720
- return error_msg, {"error": str(e), "url": url}
721
 
722
- # Create Gradio interface
723
  gradio_app = gr.Interface(
724
  fn=gradio_scrape,
725
  inputs=gr.Textbox(
726
  label="Website URL",
727
- placeholder="https://example.com",
728
- value="https://example.com",
729
- lines=1
730
  ),
731
  outputs=[
732
  gr.Markdown(label="Result"),
733
  gr.JSON(label="API Response")
734
  ],
735
- title="πŸ“Έ Screenshot Scraper for n8n",
736
- description=(
737
- "Take screenshots of websites and extract text using AI OCR.\n\n"
738
- "**API Usage for n8n:**\n"
739
- "```bash\n"
740
- "POST /api/scrape\n"
741
- "Content-Type: application/json\n"
742
- '{"url": "https://example.com"}\n'
743
- "```"
744
- ),
745
  examples=[
746
- ["https://example.com"],
747
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
748
  ["https://news.ycombinator.com"],
 
749
  ["https://httpbin.org/html"]
750
  ]
751
  )
752
 
753
- # ==============================================
754
- # MOUNT GRADIO TO FASTAPI
755
- # ==============================================
756
-
757
- # Mount Gradio app to FastAPI
758
  app = gr.mount_gradio_app(fastapi_app, gradio_app, path="/")
759
 
760
  # ==============================================
761
- # APPLICATION STARTUP
762
  # ==============================================
763
 
764
  if __name__ == "__main__":
765
  print("\n" + "="*60)
766
- print("πŸš€ Starting Screenshot Scraper API")
767
  print("="*60)
768
- print(f"πŸ“¦ OCR Available: {TRANSFORMERS_AVAILABLE}")
769
- print(f"πŸ“ HTML Parsing: {BEAUTIFULSOUP_AVAILABLE}")
770
- print(f"πŸ”§ Default OCR Model: {Config.DEFAULT_MODEL}")
771
  print("="*60 + "\n")
772
 
773
  uvicorn.run(
 
1
  # ==============================================
2
+ # SCREENSHOT SCRAPER FOR N8N - IMPROVED VERSION
 
3
  # ==============================================
4
 
5
  import gradio as gr
 
10
  from io import BytesIO
11
  from PIL import Image, ImageEnhance, ImageDraw, ImageFont
12
  import textwrap
13
+ import re
14
+ from typing import Dict, Any
15
  from fastapi import FastAPI
16
  import uvicorn
17
 
18
+ # Try imports with fallbacks
19
  try:
20
  from bs4 import BeautifulSoup
21
  BEAUTIFULSOUP_AVAILABLE = True
22
  except ImportError:
23
  BEAUTIFULSOUP_AVAILABLE = False
24
+ print("BeautifulSoup not available")
25
 
 
26
  try:
27
  from transformers import pipeline
28
  TRANSFORMERS_AVAILABLE = True
29
  except ImportError:
30
  TRANSFORMERS_AVAILABLE = False
31
+ print("Transformers not available")
32
 
33
  # ==============================================
34
+ # IMPROVED SCREENSHOT CAPTURER
35
  # ==============================================
36
 
37
+ class ImprovedScreenshotCapturer:
38
+ """Better screenshot capture using HTML content extraction"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  def __init__(self):
41
+ self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
 
42
 
43
+ def capture(self, url: str) -> Dict[str, Any]:
44
+ """Capture or create screenshot from webpage"""
45
+ print(f"\nπŸ“Έ Attempting to capture: {url}")
 
 
46
 
47
+ # Ensure URL has protocol
48
+ if not url.startswith('http'):
49
+ url = 'https://' + url
50
 
51
+ # Method 1: Try actual screenshot APIs
52
+ screenshot_result = self._try_screenshot_apis(url)
53
+ if screenshot_result.get("success"):
54
+ return screenshot_result
55
+
56
+ # Method 2: Create screenshot from HTML content (most reliable)
57
+ print("πŸ”„ Using HTML content method...")
58
+ return self._create_from_html(url)
59
+
60
+ def _try_screenshot_apis(self, url: str) -> Dict[str, Any]:
61
+ """Try various screenshot APIs"""
62
+ apis = [
63
+ # These are more reliable APIs
64
+ {
65
+ "url": f"https://render-tron.appspot.com/screenshot/{url}?width=1200&height=800",
66
+ "name": "rendertron"
67
+ },
68
+ {
69
+ "url": f"https://s.wordpress.com/mshots/v1/{url}?w=1200&h=800",
70
+ "name": "wordpress"
71
+ },
72
+ {
73
+ "url": f"https://image.thum.io/get/width/1200/crop/900/{url}",
74
+ "name": "thumio"
75
+ }
76
+ ]
77
+
78
+ headers = {'User-Agent': self.user_agent}
79
+
80
+ for api in apis:
81
+ try:
82
+ print(f" Trying {api['name']}...")
83
+ response = requests.get(api["url"], headers=headers, timeout=15)
84
+
85
+ if response.status_code == 200 and len(response.content) > 5000:
86
+ # Verify it's an image
87
  try:
88
+ img = Image.open(BytesIO(response.content))
89
+ img.verify()
90
+
91
+ return {
92
+ "success": True,
93
+ "image_bytes": response.content,
94
+ "size": len(response.content),
95
+ "method": api["name"],
96
+ "is_real_screenshot": True
97
+ }
98
  except:
99
  continue
100
+ except:
101
+ continue
 
 
 
 
 
 
 
102
 
103
+ return {"success": False}
104
+
105
+ def _create_from_html(self, url: str) -> Dict[str, Any]:
106
+ """Create screenshot from HTML content"""
107
  try:
108
+ # Fetch webpage content
109
+ headers = {'User-Agent': self.user_agent}
110
+ response = requests.get(url, headers=headers, timeout=15)
111
 
112
+ if response.status_code != 200:
113
+ return self._create_error_image(f"HTTP {response.status_code}", url)
114
 
115
+ html_content = response.text
 
 
 
 
 
 
 
 
116
 
117
+ # Parse HTML if BeautifulSoup is available
118
+ if BEAUTIFULSOUP_AVAILABLE:
119
+ title, main_text = self._parse_html_with_bs4(html_content)
120
+ else:
121
+ title, main_text = self._parse_html_simple(html_content)
122
 
123
+ # Create image with the content
124
+ image_bytes = self._create_content_image(url, title, main_text)
 
 
125
 
126
  return {
127
  "success": True,
128
+ "image_bytes": image_bytes,
129
+ "size": len(image_bytes),
130
+ "method": "html_content",
131
+ "is_real_screenshot": False,
132
+ "content_length": len(main_text)
133
  }
134
 
135
  except Exception as e:
136
+ print(f"Error creating from HTML: {str(e)}")
137
+ return self._create_error_image(str(e), url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
+ def _parse_html_with_bs4(self, html: str):
140
+ """Parse HTML using BeautifulSoup"""
141
+ soup = BeautifulSoup(html, 'html.parser')
 
 
 
 
 
 
 
142
 
143
+ # Get title
144
+ title = soup.title.string if soup.title else "No title"
 
 
 
 
 
145
 
146
+ # Remove scripts, styles, nav, footer
147
+ for tag in soup(["script", "style", "nav", "footer", "header", "iframe"]):
148
+ tag.decompose()
149
 
150
+ # Try to get main content
151
+ main_content = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
+ # Look for main content areas
154
+ selectors = ['article', 'main', '.content', '.post-content', '.article', '#content']
 
155
 
156
+ for selector in selectors:
157
+ elements = soup.select(selector)
158
+ if elements:
159
+ main_content = ' '.join([elem.get_text() for elem in elements[:3]])
160
+ break
 
161
 
162
+ # Fallback to body
163
+ if not main_content and soup.body:
164
+ main_content = soup.body.get_text()
 
 
 
165
 
166
+ # Clean text
167
+ text = self._clean_text(main_content)
168
+
169
+ return title, text[:10000] # Limit text length
170
 
171
+ def _parse_html_simple(self, html: str):
172
+ """Simple HTML parsing without BeautifulSoup"""
173
+ # Extract title
174
+ title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE)
175
+ title = title_match.group(1) if title_match else "No title"
176
+
177
+ # Extract text between body tags
178
+ body_match = re.search(r'<body[^>]*>(.*?)</body>', html, re.IGNORECASE | re.DOTALL)
179
+ if body_match:
180
+ body_text = body_match.group(1)
181
+ # Remove HTML tags
182
+ clean_text = re.sub(r'<[^>]+>', ' ', body_text)
183
+ else:
184
+ clean_text = html[:5000]
 
 
 
 
 
 
 
 
 
 
 
185
 
186
+ # Clean text
187
+ text = self._clean_text(clean_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
+ return title, text[:10000]
190
 
191
+ def _clean_text(self, text: str) -> str:
192
+ """Clean and normalize text"""
193
+ # Replace multiple whitespace with single space
194
+ text = re.sub(r'\s+', ' ', text)
195
+ # Remove control characters
196
+ text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
197
+ return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
def _create_content_image(self, url: str, title: str, content: str) -> bytes:
    """Render extracted webpage content into a PNG image.

    Draws a header (URL + title), the wrapped content text, and a
    timestamp footer onto a fixed 1200x1000 white canvas.

    Args:
        url: Page URL shown in the header (truncated to 80 chars).
        title: Page title shown in the header (truncated to 100 chars).
        content: Extracted text; wrapped and elided to fit the canvas.

    Returns:
        PNG-encoded image bytes.
    """
    # Fixed canvas; content that does not fit is elided below.
    img_width, img_height = 1200, 1000
    img = Image.new('RGB', (img_width, img_height), color='white')
    draw = ImageDraw.Draw(img)

    # Try TrueType fonts; fall back to PIL's built-in bitmap font
    # (arial.ttf is typically absent on Linux servers).
    try:
        font_large = ImageFont.truetype("arial.ttf", 24)
        font_medium = ImageFont.truetype("arial.ttf", 20)
        font_small = ImageFont.truetype("arial.ttf", 16)
    except Exception:  # was a bare except: — don't swallow KeyboardInterrupt/SystemExit
        font_large = ImageFont.load_default()
        font_medium = ImageFont.load_default()
        font_small = ImageFont.load_default()

    # Header block.
    draw.text((50, 30), "📄 WEBPAGE CONTENT EXTRACT", fill='darkblue', font=font_large)
    draw.text((50, 70), f"URL: {url[:80]}", fill='blue', font=font_medium)
    draw.text((50, 100), f"Title: {title[:100]}", fill='black', font=font_medium)

    draw.line([(50, 130), (1150, 130)], fill='gray', width=2)

    # Body text: ~100 chars per line, 20px line spacing; stop before the
    # footer area and note how many lines were elided.
    y_offset = 150
    lines = textwrap.wrap(content, width=100)

    for i, line in enumerate(lines):
        if y_offset < 950:
            draw.text((50, y_offset), line, fill='black', font=font_small)
            y_offset += 20
        else:
            draw.text((50, y_offset), f"... (showing {i} of {len(lines)} lines)",
                      fill='darkgray', font=font_small)
            break

    # Footer with generation timestamp.
    draw.line([(50, 970), (1150, 970)], fill='lightgray', width=1)
    draw.text((50, 980), f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}",
              fill='gray', font=font_small)

    # Encode to PNG. The previous 'quality=85' kwarg was dropped: it is a
    # JPEG option and is silently ignored by the PNG encoder.
    img_byte_arr = BytesIO()
    img.save(img_byte_arr, format='PNG', optimize=True)
    return img_byte_arr.getvalue()
245
 
246
def _create_error_image(self, error: str, url: str) -> Dict[str, Any]:
    """Build a placeholder PNG explaining a screenshot failure.

    The returned dict mimics a successful capture result so downstream
    code can treat it uniformly — "success" stays True by design because
    text was still obtained directly from HTML.

    Args:
        error: Error message to render (truncated to 200 chars).
        url: URL that failed to capture (truncated to 100 chars).

    Returns:
        Capture-result dict with keys: success, image_bytes, size,
        method, is_real_screenshot, note.
    """
    img = Image.new('RGB', (800, 400), color='white')
    draw = ImageDraw.Draw(img)

    # TrueType font if available, else PIL's built-in bitmap font.
    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except Exception:  # was a bare except: — don't swallow KeyboardInterrupt/SystemExit
        font = ImageFont.load_default()

    draw.text((50, 50), "❌ SCREENSHOT ERROR", fill='red', font=font)
    draw.text((50, 100), f"URL: {url[:100]}", fill='black', font=font)
    draw.text((50, 150), f"Error: {error[:200]}", fill='darkred', font=font)
    draw.text((50, 200), "Content was extracted directly from HTML.", fill='black', font=font)
    draw.text((50, 250), "This is actually BETTER for text extraction!", fill='green', font=font)

    img_byte_arr = BytesIO()
    img.save(img_byte_arr, format='PNG')
    img_bytes = img_byte_arr.getvalue()

    return {
        "success": True,  # Still successful for our purposes
        "image_bytes": img_bytes,
        "size": len(img_bytes),
        "method": "error_fallback",
        "is_real_screenshot": False,
        "note": f"Error: {error}"
    }
274
 
275
+ # ==============================================
276
+ # IMPROVED OCR PROCESSOR
277
+ # ==============================================
278
+
279
class ImprovedOCRProcessor:
    """Lazy-loading OCR wrapper around a HuggingFace image-to-text pipeline.

    The model is only loaded on first use. When transformers is not
    installed (module-level TRANSFORMERS_AVAILABLE is False), OCR is
    disabled and extract_text() reports an error dict instead of raising.
    """

    def __init__(self):
        # The HF pipeline; created lazily by load_model().
        self.processor = None

    def load_model(self):
        """Load the TrOCR pipeline; return it, or None when unavailable."""
        if not TRANSFORMERS_AVAILABLE:
            return None

        try:
            # CPU-only inference (device=-1), printed-text TrOCR variant.
            self.processor = pipeline(
                "image-to-text",
                model="microsoft/trocr-base-printed",
                device=-1
            )
            print("✅ OCR model loaded")
            return self.processor
        except Exception as e:
            print(f"❌ OCR model load failed: {e}")
            return None

    def extract_text(self, image_bytes: bytes) -> Dict[str, Any]:
        """Run OCR on encoded image bytes.

        Args:
            image_bytes: Image file content (any format PIL can open).

        Returns:
            On success: {"success": True, "text", "length", "ocr_time", "model"}.
            On failure: {"success": False, "error"}.
        """
        # Lazy-load the model on first call.
        if not self.processor:
            if not self.load_model():
                return {"success": False, "error": "OCR not available"}

        try:
            image = Image.open(BytesIO(image_bytes))

            # The pipeline expects RGB input.
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Downscale very large captures; oversized input slows OCR.
            max_size = 1600
            if max(image.size) > max_size:
                ratio = max_size / max(image.size)
                new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
                image = image.resize(new_size, Image.Resampling.LANCZOS)

            # Boost contrast and sharpness to help character recognition.
            enhancer = ImageEnhance.Contrast(image)
            image = enhancer.enhance(1.5)

            enhancer = ImageEnhance.Sharpness(image)
            image = enhancer.enhance(1.2)

            print("🔍 Running OCR...")
            start_time = time.time()
            result = self.processor(image)
            ocr_time = time.time() - start_time

            # Pipeline output shape varies across versions; normalize to str.
            text = ""
            if isinstance(result, list) and result:
                if isinstance(result[0], dict):
                    text = result[0].get('generated_text', '')
                else:
                    text = str(result[0])
            else:
                text = str(result)

            text = self._clean_ocr_text(text)

            print(f"📊 OCR completed in {ocr_time:.2f}s, extracted {len(text)} chars")

            return {
                "success": True,
                "text": text,
                "length": len(text),
                "ocr_time": ocr_time,
                "model": "trocr-base-printed"
            }

        except Exception as e:
            print(f"❌ OCR error: {e}")
            return {"success": False, "error": str(e)}

    def _clean_ocr_text(self, text: str) -> str:
        """Normalize whitespace and patch common OCR mis-recognitions."""
        text = re.sub(r'\s+', ' ', text)
        # Heuristic substitutions ('|'→'I', '[]'→'ll', '()'→'o'); coarse —
        # NOTE(review): these can also alter legitimate punctuation.
        text = text.replace('|', 'I').replace('[]', 'll').replace('()', 'o')
        return text.strip()
371
+
372
  # ==============================================
373
  # MAIN SCRAPER
374
  # ==============================================
375
 
376
class WebScraper:
    """Main scraper that combines screenshot capture and direct text extraction.

    Strategy: try a plain HTTP GET + HTML text extraction first (fast and
    most accurate for text); fall back to screenshot + OCR only when the
    direct route yields too little text.
    """

    def __init__(self):
        # ImprovedScreenshotCapturer is defined elsewhere in this file.
        self.screenshot_capturer = ImprovedScreenshotCapturer()
        self.ocr_processor = ImprovedOCRProcessor()
        print("🚀 Web Scraper initialized")

    def scrape(self, url: str) -> Dict[str, Any]:
        """Scrape *url* and return a JSON-serializable result dict.

        Tries direct HTML extraction first; when it yields more than 100
        characters that text is used (a screenshot is still taken for
        reference metadata). Otherwise falls back to screenshot + OCR.
        """
        start_time = time.time()

        print(f"\n{'='*60}")
        print(f"🌐 Scraping: {url}")
        print(f"{'='*60}")

        # Method 1: direct HTML extraction (fastest, most reliable for text).
        print("\n📝 Method 1: Direct HTML text extraction...")
        html_start = time.time()
        direct_text = self._extract_direct_html(url)
        html_time = time.time() - html_start  # NOTE(review): measured but unused

        # Heuristic: >100 chars means the page served real content.
        if direct_text and len(direct_text) > 100:
            print(f"✅ Direct extraction: {len(direct_text)} characters")

            # Screenshot taken only for reference metadata in the response.
            print("\n📸 Method 2: Getting screenshot for reference...")
            screenshot_result = self.screenshot_capturer.capture(url)

            total_time = time.time() - start_time

            return {
                "success": True,
                "url": url,
                "execution_time": round(total_time, 2),
                "method_used": "direct_html_extraction",
                "extracted_text": direct_text[:15000],  # Limit for response
                "text_length": len(direct_text),
                "screenshot_info": {
                    "method": screenshot_result.get("method", "none"),
                    "size_bytes": screenshot_result.get("size", 0),
                    "is_real_screenshot": screenshot_result.get("is_real_screenshot", False)
                },
                "notes": "Text extracted directly from HTML (most accurate for text content)"
            }

        # Method 2: screenshot + OCR fallback.
        print("\n📝 Direct extraction failed, using OCR method...")

        screenshot_start = time.time()
        screenshot_result = self.screenshot_capturer.capture(url)
        screenshot_time = time.time() - screenshot_start

        if not screenshot_result.get("success", False):
            return {
                "success": False,
                "url": url,
                "error": "Failed to capture content",
                "execution_time": round(time.time() - start_time, 2)
            }

        print("\n🔍 Running OCR on captured content...")
        ocr_start = time.time()
        ocr_result = self.ocr_processor.extract_text(screenshot_result["image_bytes"])
        ocr_time = time.time() - ocr_start

        total_time = time.time() - start_time

        if ocr_result["success"]:
            return {
                "success": True,
                "url": url,
                "execution_time": round(total_time, 2),
                "step_times": {
                    "screenshot": round(screenshot_time, 2),
                    "ocr": round(ocr_time, 2)
                },
                "method_used": "screenshot_ocr",
                "extracted_text": ocr_result["text"][:15000],
                "text_length": ocr_result["length"],
                "ocr_info": {
                    "model": ocr_result.get("model", "unknown"),
                    "processing_time": round(ocr_time, 2)
                },
                "screenshot_info": {
                    "method": screenshot_result.get("method", "none"),
                    "size_bytes": screenshot_result.get("size", 0),
                    "is_real_screenshot": screenshot_result.get("is_real_screenshot", False)
                }
            }
        else:
            return {
                "success": False,
                "url": url,
                "error": f"OCR failed: {ocr_result.get('error', 'Unknown error')}",
                "execution_time": round(total_time, 2)
            }

    def _extract_direct_html(self, url: str) -> str:
        """Fetch *url* and return its visible text, or "" on any failure."""
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers, timeout=10)

            if response.status_code != 200:
                return ""

            html = response.text

            # Prefer BeautifulSoup when installed; regex fallback otherwise.
            if BEAUTIFULSOUP_AVAILABLE:
                return self._extract_with_bs4(html)
            else:
                return self._extract_simple(html)

        except Exception as e:
            # Any network/parse error degrades to "" so scrape() can fall
            # back to the screenshot+OCR path.
            print(f"Direct extraction error: {e}")
            return ""

    def _extract_with_bs4(self, html: str) -> str:
        """Extract readable text from *html* using BeautifulSoup."""
        soup = BeautifulSoup(html, 'html.parser')

        # Remove non-content elements (scripts, page chrome, embeds).
        for tag in soup(["script", "style", "nav", "footer", "header", "iframe", "aside"]):
            tag.decompose()

        text_parts = []

        # Common "main content" selectors. NOTE(review): every matching
        # selector contributes (there is no break), so nested matches can
        # duplicate text in the output.
        content_selectors = [
            'article', 'main', '.content', '.post-content', '.article-content',
            '#content', '.entry-content', '.story-content', '.text'
        ]

        for selector in content_selectors:
            elements = soup.select(selector)
            if elements:
                for elem in elements[:2]:  # Take first 2 matching elements
                    text_parts.append(elem.get_text())

        # Fallback to the whole <body> when no selector matched.
        if not text_parts and soup.body:
            text_parts.append(soup.body.get_text())

        combined = ' '.join(text_parts)
        return self._clean_text(combined)

    def _extract_simple(self, html: str) -> str:
        """Regex-only text extraction used when BeautifulSoup is missing."""
        # Strip script/style blocks first so their code is not kept as text.
        html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
        html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)

        # Replace remaining tags with spaces, then collapse whitespace.
        text = re.sub(r'<[^>]+>', ' ', html)

        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs and drop control characters."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
        return text.strip()
548
 
549
  # ==============================================
550
+ # INITIALIZE AND SETUP API
551
  # ==============================================
552
 
553
+ scraper = WebScraper()
 
 
 
 
554
 
555
  # Create FastAPI app
556
  fastapi_app = FastAPI(
557
+ title="Web Scraper API",
558
+ description="Extract text from webpages using direct HTML parsing or OCR",
559
+ version="2.0"
560
  )
561
 
562
+ # CORS
563
  from fastapi.middleware.cors import CORSMiddleware
564
  fastapi_app.add_middleware(
565
  CORSMiddleware,
 
571
 
572
@fastapi_app.get("/")
async def root():
    """Service metadata: name, version, endpoint list, and usage examples."""
    return {
        "service": "Web Scraper API",
        "version": "2.0",
        "description": "Extracts text from webpages. Uses direct HTML parsing (preferred) or screenshot+OCR.",
        "endpoints": {
            "GET /": "This info",
            "GET /health": "Health check",
            "POST /api/scrape": "Main scraping endpoint"
        },
        "usage": {
            "curl": 'curl -X POST "YOUR_URL/api/scrape" -H "Content-Type: application/json" -d \'{"url":"https://example.com"}\'',
            "n8n": 'HTTP Request node: POST to /api/scrape with JSON body: {"url": "{{$json.url}}"}'
        }
    }
588
 
589
@fastapi_app.get("/health")
async def health():
    """Liveness check plus availability flags for optional features."""
    return {
        "status": "healthy",
        "timestamp": time.time(),
        # Flags reflect which optional libraries imported successfully.
        "features": {
            "direct_html": True,
            "ocr": TRANSFORMERS_AVAILABLE,
            "html_parsing": BEAUTIFULSOUP_AVAILABLE
        }
    }
600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
@fastapi_app.post("/api/scrape")
async def api_scrape(data: dict):
    """Main API endpoint for n8n.

    Expects a JSON body {"url": "<page url>"} and returns the scraper's
    result dict. Always responds HTTP 200; the "success" field in the
    payload indicates the actual outcome.
    """
    try:
        url = data.get("url", "")
        if not url:
            return {"success": False, "error": "URL is required"}

        print(f"\n📨 API Request: {url}")
        result = scraper.scrape(url)
        return result

    except Exception as e:
        # Never let an exception escape: report it as a JSON error payload.
        print(f"❌ API Error: {e}")
        return {
            "success": False,
            "error": str(e),
            "url": data.get("url", "unknown")
        }
620
 
621
  # ==============================================
 
623
  # ==============================================
624
 
625
def gradio_scrape(url: str):
    """Gradio handler: scrape *url* and return (markdown summary, raw result)."""
    if not url:
        return "❌ Enter a URL", {}

    result = scraper.scrape(url)

    # Failure path first (guard clause).
    if not result["success"]:
        return f"## ❌ Error\n\n{result.get('error', 'Unknown')}", result

    # Build the markdown summary from its pieces.
    sections = [
        f"## ✅ Success!\n\n",
        f"**URL:** {result['url']}\n",
        f"**Method:** {result.get('method_used', 'unknown')}\n",
        f"**Time:** {result['execution_time']}s\n",
        f"**Text Length:** {result['text_length']:,} characters\n\n",
    ]

    extracted = result.get('extracted_text')
    if extracted:
        preview = extracted[:500]
        if len(extracted) > 500:
            preview += "..."
        sections.append(f"**Preview:**\n{preview}")

    return "".join(sections), result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
 
 
649
# Gradio UI wired to the same scraper used by the REST API.
gradio_app = gr.Interface(
    fn=gradio_scrape,
    inputs=gr.Textbox(
        label="Website URL",
        placeholder="https://en.wikipedia.org/wiki/Artificial_intelligence",
        value="https://en.wikipedia.org/wiki/Artificial_intelligence"
    ),
    outputs=[
        gr.Markdown(label="Result"),
        gr.JSON(label="API Response")
    ],
    title="🌐 Web Scraper for n8n",
    description="Extract text from webpages. Perfect for n8n workflows!",
    examples=[
        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
        ["https://news.ycombinator.com"],
        ["https://example.com"],
        ["https://httpbin.org/html"]
    ]
)

# Mount the Gradio UI on top of the FastAPI app at "/"; the REST routes
# (/health, /api/scrape) remain served by fastapi_app.
app = gr.mount_gradio_app(fastapi_app, gradio_app, path="/")
672
 
673
  # ==============================================
674
+ # START APPLICATION
675
  # ==============================================
676
 
677
  if __name__ == "__main__":
678
  print("\n" + "="*60)
679
+ print("πŸš€ Web Scraper API Starting")
680
  print("="*60)
681
+ print(f"πŸ“ Direct HTML: Enabled")
682
+ print(f"πŸ” OCR Available: {TRANSFORMERS_AVAILABLE}")
683
+ print(f"πŸ§ͺ HTML Parsing: {BEAUTIFULSOUP_AVAILABLE}")
684
  print("="*60 + "\n")
685
 
686
  uvicorn.run(