Update app.py
app.py (CHANGED)
@@ -1,621 +1,345 @@
 # ==============================================
-#
 # ==============================================

 import gradio as gr
 import requests
-import base64
 import json
 import time
-from io import BytesIO
-from PIL import Image, ImageEnhance, ImageDraw, ImageFont
-import textwrap
 import re
-
-from
-import

-# Try
 try:
     from bs4 import BeautifulSoup
     BEAUTIFULSOUP_AVAILABLE = True
 except ImportError:
     BEAUTIFULSOUP_AVAILABLE = False
-    print("BeautifulSoup not available")

 try:
     from transformers import pipeline
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
     TRANSFORMERS_AVAILABLE = False
-    print("Transformers not available")

 # ==============================================
-#
 # ==============================================

-class ImprovedScreenshotCapturer:
-    """

     def __init__(self):
-        self.user_agent =
-
-
-
-
-
-        # Ensure URL has protocol
-        if not url.startswith('http'):
-            url = 'https://' + url
-
-        # Method 1: Try actual screenshot APIs
-        screenshot_result = self._try_screenshot_apis(url)
-        if screenshot_result.get("success"):
-            return screenshot_result

-
-
-
-
-    def _try_screenshot_apis(self, url: str) -> Dict[str, Any]:
-        """Try various screenshot APIs"""
-        apis = [
-            # These are more reliable APIs
-            {
-                "url": f"https://render-tron.appspot.com/screenshot/{url}?width=1200&height=800",
-                "name": "rendertron"
-            },
-            {
-                "url": f"https://s.wordpress.com/mshots/v1/{url}?w=1200&h=800",
-                "name": "wordpress"
-            },
-            {
-                "url": f"https://image.thum.io/get/width/1200/crop/900/{url}",
-                "name": "thumio"
-            }
-        ]

-

-
-
-
-                response = requests.get(api["url"], headers=headers, timeout=15)
-
-                if response.status_code == 200 and len(response.content) > 5000:
-                    # Verify it's an image
-                    try:
-                        img = Image.open(BytesIO(response.content))
-                        img.verify()
-
-                        return {
-                            "success": True,
-                            "image_bytes": response.content,
-                            "size": len(response.content),
-                            "method": api["name"],
-                            "is_real_screenshot": True
-                        }
-                    except:
-                        continue
-            except:
-                continue

-        return {"success": False}
-
-    def _create_from_html(self, url: str) -> Dict[str, Any]:
-        """Create screenshot from HTML content"""
         try:
-            #
-
-            response = requests.get(url, headers=headers, timeout=15)

-            if
-

-

-
-            if BEAUTIFULSOUP_AVAILABLE:
-                title, main_text = self._parse_html_with_bs4(html_content)
-            else:
-                title, main_text = self._parse_html_simple(html_content)

-
-

             return {
-                "success":
-                "
-                "
-                "
-                "is_real_screenshot": False,
-                "content_length": len(main_text)
             }

         except Exception as e:
-
-
-
-
-
-
-
-        # Get title
-        title = soup.title.string if soup.title else "No title"
-
-        # Remove scripts, styles, nav, footer
-        for tag in soup(["script", "style", "nav", "footer", "header", "iframe"]):
-            tag.decompose()
-
-        # Try to get main content
-        main_content = ""
-
-        # Look for main content areas
-        selectors = ['article', 'main', '.content', '.post-content', '.article', '#content']
-
-        for selector in selectors:
-            elements = soup.select(selector)
-            if elements:
-                main_content = ' '.join([elem.get_text() for elem in elements[:3]])
-                break
-
-        # Fallback to body
-        if not main_content and soup.body:
-            main_content = soup.body.get_text()
-
-        # Clean text
-        text = self._clean_text(main_content)
-
-        return title, text[:10000]  # Limit text length
-
-    def _parse_html_simple(self, html: str):
-        """Simple HTML parsing without BeautifulSoup"""
-        # Extract title
-        title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE)
-        title = title_match.group(1) if title_match else "No title"
-
-        # Extract text between body tags
-        body_match = re.search(r'<body[^>]*>(.*?)</body>', html, re.IGNORECASE | re.DOTALL)
-        if body_match:
-            body_text = body_match.group(1)
-            # Remove HTML tags
-            clean_text = re.sub(r'<[^>]+>', ' ', body_text)
-        else:
-            clean_text = html[:5000]
-
-        # Clean text
-        text = self._clean_text(clean_text)
-
-        return title, text[:10000]
-
-    def _clean_text(self, text: str) -> str:
-        """Clean and normalize text"""
-        # Replace multiple whitespace with single space
-        text = re.sub(r'\s+', ' ', text)
-        # Remove control characters
-        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
-        return text.strip()
-
-    def _create_content_image(self, url: str, title: str, content: str) -> bytes:
-        """Create an image with webpage content"""
-        # Create image
-        img_width, img_height = 1200, 1000
-        img = Image.new('RGB', (img_width, img_height), color='white')
-        draw = ImageDraw.Draw(img)
-
-        # Try to load fonts
-        try:
-            font_large = ImageFont.truetype("arial.ttf", 24)
-            font_medium = ImageFont.truetype("arial.ttf", 20)
-            font_small = ImageFont.truetype("arial.ttf", 16)
-        except:
-            font_large = ImageFont.load_default()
-            font_medium = ImageFont.load_default()
-            font_small = ImageFont.load_default()
-
-        # Draw header
-        draw.text((50, 30), "WEBPAGE CONTENT EXTRACT", fill='darkblue', font=font_large)
-        draw.text((50, 70), f"URL: {url[:80]}", fill='blue', font=font_medium)
-        draw.text((50, 100), f"Title: {title[:100]}", fill='black', font=font_medium)
-
-        draw.line([(50, 130), (1150, 130)], fill='gray', width=2)
-
-        # Draw content
-        y_offset = 150
-        lines = textwrap.wrap(content, width=100)
-
-        for i, line in enumerate(lines):
-            if y_offset < 950:
-                draw.text((50, y_offset), line, fill='black', font=font_small)
-                y_offset += 20
-            else:
-                draw.text((50, y_offset), f"... (showing {i} of {len(lines)} lines)",
-                          fill='darkgray', font=font_small)
-                break
-
-        # Footer
-        draw.line([(50, 970), (1150, 970)], fill='lightgray', width=1)
-        draw.text((50, 980), f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}",
-                  fill='gray', font=font_small)
-
-        # Convert to bytes
-        img_byte_arr = BytesIO()
-        img.save(img_byte_arr, format='PNG', optimize=True, quality=85)
-        return img_byte_arr.getvalue()
-
-    def _create_error_image(self, error: str, url: str) -> Dict[str, Any]:
-        """Create error image"""
-        img = Image.new('RGB', (800, 400), color='white')
-        draw = ImageDraw.Draw(img)
-
-        try:
-            font = ImageFont.truetype("arial.ttf", 20)
-        except:
-            font = ImageFont.load_default()
-
-        draw.text((50, 50), "SCREENSHOT ERROR", fill='red', font=font)
-        draw.text((50, 100), f"URL: {url[:100]}", fill='black', font=font)
-        draw.text((50, 150), f"Error: {error[:200]}", fill='darkred', font=font)
-        draw.text((50, 200), "Content was extracted directly from HTML.", fill='black', font=font)
-        draw.text((50, 250), "This is actually BETTER for text extraction!", fill='green', font=font)
-
-        img_byte_arr = BytesIO()
-        img.save(img_byte_arr, format='PNG')
-        img_bytes = img_byte_arr.getvalue()
-
-        return {
-            "success": True,  # Still successful for our purposes
-            "image_bytes": img_bytes,
-            "size": len(img_bytes),
-            "method": "error_fallback",
-            "is_real_screenshot": False,
-            "note": f"Error: {error}"
-        }
-
-# ==============================================
-# IMPROVED OCR PROCESSOR
-# ==============================================
-
-class ImprovedOCRProcessor:
-    """Better OCR with preprocessing"""
-
-    def __init__(self):
-        self.processor = None
-
-    def load_model(self):
-        """Load OCR model"""
-        if not TRANSFORMERS_AVAILABLE:
-            return None
-
-        try:
-            # Use a smaller, faster model
-            self.processor = pipeline(
-                "image-to-text",
-                model="microsoft/trocr-base-printed",
-                device=-1
-            )
-            print("✅ OCR model loaded")
-            return self.processor
-        except Exception as e:
-            print(f"❌ OCR model load failed: {e}")
-            return None

-    def extract_text(
-        """Extract text from
-        if not self.processor:
-            if not self.load_model():
-                return {"success": False, "error": "OCR not available"}
-
         try:
-
-
-
-
-
-

-
-
-            if max(image.size) > max_size:
-                ratio = max_size / max(image.size)
-                new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
-                image = image.resize(new_size, Image.Resampling.LANCZOS)

-            #
-
-

-
-            image = enhancer.enhance(1.2)

-            #
-
-            start_time = time.time()
-            result = self.processor(image)
-            ocr_time = time.time() - start_time

-            # Extract text
-
-
-                if isinstance(result[0], dict):
-                    text = result[0].get('generated_text', '')
-                else:
-                    text = str(result[0])
             else:
-                text =
-
-            # Clean text
-            text = self._clean_ocr_text(text)

-

             return {
                 "success": True,
-                "text":
-                "
-                "
-                "
             }

         except Exception as e:
-            print(f"
             return {"success": False, "error": str(e)}

-    def _clean_ocr_text(self, text: str) -> str:
-        """
-
-        text = re.sub(r'\s+', ' ', text)
-        # Fix common OCR errors
-        text = text.replace('|', 'I').replace('[]', 'll').replace('()', 'o')
-        return text.strip()
-
-# ==============================================
-# MAIN SCRAPER
-# ==============================================
-
-class WebScraper:
-    """Main scraper that combines screenshot and direct text extraction"""
-
-    def __init__(self):
-        self.screenshot_capturer = ImprovedScreenshotCapturer()
-        self.ocr_processor = ImprovedOCRProcessor()
-        print("Web Scraper initialized")
-
-    def scrape(self, url: str) -> Dict[str, Any]:
-        """Main scraping function - uses BOTH methods"""
-        start_time = time.time()
-
-        print(f"\n{'='*60}")
-        print(f"Scraping: {url}")
-        print(f"{'='*60}")

-        #
-
-
-
-
-
-
-

-        #
-
-

-

-
-
-
-
-
-
-                "text_length": len(direct_text),
-                "screenshot_info": {
-                    "method": screenshot_result.get("method", "none"),
-                    "size_bytes": screenshot_result.get("size", 0),
-                    "is_real_screenshot": screenshot_result.get("is_real_screenshot", False)
-                },
-                "notes": "Text extracted directly from HTML (most accurate for text content)"
-            }
-
-        # Method 2: If direct extraction fails, use OCR
-        print("\nDirect extraction failed, using OCR method...")
-
-        # Get screenshot
-        screenshot_start = time.time()
-        screenshot_result = self.screenshot_capturer.capture(url)
-        screenshot_time = time.time() - screenshot_start
-
-        if not screenshot_result.get("success", False):
-            return {
-                "success": False,
-                "url": url,
-                "error": "Failed to capture content",
-                "execution_time": round(time.time() - start_time, 2)
-            }
-
-        # Extract text with OCR
-        print("\nRunning OCR on captured content...")
-        ocr_start = time.time()
-        ocr_result = self.ocr_processor.extract_text(screenshot_result["image_bytes"])
-        ocr_time = time.time() - ocr_start
-
-        total_time = time.time() - start_time
-
-        if ocr_result["success"]:
-            return {
-                "success": True,
-                "url": url,
-                "execution_time": round(total_time, 2),
-                "step_times": {
-                    "screenshot": round(screenshot_time, 2),
-                    "ocr": round(ocr_time, 2)
-                },
-                "method_used": "screenshot_ocr",
-                "extracted_text": ocr_result["text"][:15000],
-                "text_length": ocr_result["length"],
-                "ocr_info": {
-                    "model": ocr_result.get("model", "unknown"),
-                    "processing_time": round(ocr_time, 2)
-                },
-                "screenshot_info": {
-                    "method": screenshot_result.get("method", "none"),
-                    "size_bytes": screenshot_result.get("size", 0),
-                    "is_real_screenshot": screenshot_result.get("is_real_screenshot", False)
-                }
-            }
-        else:
-            return {
-                "success": False,
-                "url": url,
-                "error": f"OCR failed: {ocr_result.get('error', 'Unknown error')}",
-                "execution_time": round(total_time, 2)
-            }
-
-    def _extract_direct_html(self, url: str) -> str:
-        """Extract text directly from HTML (fastest method)"""
-        try:
-            headers = {'User-Agent': 'Mozilla/5.0'}
-            response = requests.get(url, headers=headers, timeout=10)

-
-

-

-            if BEAUTIFULSOUP_AVAILABLE:
-                return self._extract_with_bs4(html)
-            else:
-                return self._extract_simple(html)
-
         except Exception as e:
-            print(f"
-            return

-    def _extract_with_bs4(self, html: str) -> str:
-        """Extract text using BeautifulSoup"""
-        soup = BeautifulSoup(html, 'html.parser')
-
-        # Remove unwanted elements
-        for tag in soup(["script", "style", "nav", "footer", "header", "iframe", "aside"]):
-            tag.decompose()
-
-        # Get text from main content areas
-        text_parts = []
-
-        # Try various content selectors
-        content_selectors = [
-            'article', 'main', '.content', '.post-content', '.article-content',
-            '#content', '.entry-content', '.story-content', '.text'
-        ]
-
-        for selector in content_selectors:
-            elements = soup.select(selector)
-            if elements:
-                for elem in elements[:2]:  # Take first 2 matching elements
-                    text_parts.append(elem.get_text())
-
-        # Fallback to body
-        if not text_parts and soup.body:
-            text_parts.append(soup.body.get_text())
-
-        # Combine and clean
-        combined = ' '.join(text_parts)
-        return self._clean_text(combined)
-
-    def _extract_simple(self, html: str) -> str:
         """Simple text extraction without BeautifulSoup"""
         # Remove scripts and styles
         html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
         html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)

-        # Remove HTML
         text = re.sub(r'<[^>]+>', ' ', html)

-        #
-

-        return text

     def _clean_text(self, text: str) -> str:
         """Clean extracted text"""
-        #
         text = re.sub(r'\s+', ' ', text)
         # Remove control characters
         text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
         return text.strip()

 # ==============================================
-#
 # ==============================================

-scraper = WebScraper()

 # Create FastAPI app
-fastapi_app = FastAPI(
-    title="Web Scraper API",
-    description="
-    version="
-)
-
-# CORS
-from fastapi.middleware.cors import CORSMiddleware
-fastapi_app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
 )

-@fastapi_app.get("/")
 async def root():
     return {
         "service": "Web Scraper API",
-        "version": "
-        "description": "
         "endpoints": {
-            "GET /": "This
             "GET /health": "Health check",
-            "POST /
         },
         "usage": {
-            "curl": 'curl -X POST "
-            "n8n":
         }
     }

-@fastapi_app.get("/health")
 async def health():
     return {
         "status": "healthy",
         "timestamp": time.time(),
         "features": {
-            "
-            "ocr": TRANSFORMERS_AVAILABLE,
-            "html_parsing": BEAUTIFULSOUP_AVAILABLE
         }
     }

-@
-async def api_scrape(data: dict):
     """Main API endpoint for n8n"""
     try:
-
         if not url:
-            return {

-        print(f"
         result = scraper.scrape(url)
         return result

     except Exception as e:
         print(f"❌ API Error: {e}")
         return {
             "success": False,
-            "error": str(e)
-            "url": data.get("url", "unknown")
         }

 # ==============================================
@@ -623,64 +347,79 @@ async def api_scrape(data: dict):
 # ==============================================

 def gradio_scrape(url: str):
-    """Gradio interface"""
     if not url:
-        return "❌

     result = scraper.scrape(url)

-    if result
-
-
-        output += f"**Method:** {result.get('method_used', 'unknown')}\n"
-        output += f"**Time:** {result['execution_time']}s\n"
-        output += f"**Text Length:** {result['text_length']:,} characters\n\n"

-
-
-
-
-        output += f"**Preview:**\n{preview}"

         return output, result
     else:
-

-
     fn=gradio_scrape,
     inputs=gr.Textbox(
         label="Website URL",
-        placeholder="https://
-
     ),
     outputs=[
         gr.Markdown(label="Result"),
         gr.JSON(label="API Response")
     ],
     title="Web Scraper for n8n",
-    description="Extract text from webpages.
     examples=[
         ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
-        ["https://news.ycombinator.com"],
         ["https://example.com"],
         ["https://httpbin.org/html"]
-    ]
 )

-# Mount Gradio
-app = gr.mount_gradio_app(

 # ==============================================
-#
 # ==============================================

 if __name__ == "__main__":
     print("\n" + "="*60)
     print("Web Scraper API Starting")
     print("="*60)
     print(f"Direct HTML: Enabled")
     print(f"OCR Available: {TRANSFORMERS_AVAILABLE}")
-    print(f"🧪
     print("="*60 + "\n")

     uvicorn.run(
 # ==============================================
+# WEB SCRAPER FOR N8N - HUGGING FACE SPACES VERSION
 # ==============================================

 import gradio as gr
 import requests
 import json
 import time
 import re
+import textwrap
+from typing import Dict, Any, Optional
+from fastapi import FastAPI, Request
+from io import BytesIO
+from PIL import Image, ImageDraw, ImageFont
+
+# ==============================================
+# IMPORTS WITH FALLBACKS
+# ==============================================

+# Try to import optional dependencies
 try:
     from bs4 import BeautifulSoup
     BEAUTIFULSOUP_AVAILABLE = True
 except ImportError:
     BEAUTIFULSOUP_AVAILABLE = False
+    print("⚠️ BeautifulSoup not available - using simple HTML parsing")

 try:
+    import torch
     from transformers import pipeline
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
     TRANSFORMERS_AVAILABLE = False
+    print("⚠️ Transformers not available - OCR disabled")

 # ==============================================
+# SIMPLE WEB SCRAPER (NO COMPLEX DEPENDENCIES)
 # ==============================================

+class SimpleWebScraper:
+    """Lightweight web scraper optimized for Hugging Face Spaces"""

     def __init__(self):
+        self.user_agent = (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/120.0.0.0 Safari/537.36"
+        )
+        self.ocr_processor = None

+    def scrape(self, url: str) -> Dict[str, Any]:
+        """Main scraping function"""
+        start_time = time.time()

+        print(f"Scraping: {url}")

+        # Ensure URL has protocol
+        if not url.startswith(('http://', 'https://')):
+            url = 'https://' + url

         try:
+            # Method 1: Direct HTML extraction (fastest and most reliable)
+            html_result = self._extract_direct_html(url)

+            if html_result.get("success") and html_result.get("text_length", 0) > 50:
+                total_time = time.time() - start_time
+
+                return {
+                    "success": True,
+                    "url": url,
+                    "execution_time": round(total_time, 2),
+                    "method_used": "direct_html",
+                    "extracted_text": html_result["text"],
+                    "text_length": html_result["text_length"],
+                    "metadata": html_result.get("metadata", {}),
+                    "notes": "Text extracted directly from HTML (most accurate)"
+                }

+            # Method 2: If direct extraction fails, try alternative
+            print("Direct extraction limited, trying enhanced method...")
+            enhanced_result = self._enhanced_extraction(url)

+            total_time = time.time() - start_time

+            if enhanced_result.get("success"):
+                enhanced_result["execution_time"] = round(total_time, 2)
+                return enhanced_result

+            # Final fallback
             return {
+                "success": False,
+                "url": url,
+                "error": "Failed to extract meaningful content",
+                "execution_time": round(total_time, 2)
             }

         except Exception as e:
+            return {
+                "success": False,
+                "url": url,
+                "error": str(e),
+                "execution_time": round(time.time() - start_time, 2)
+            }
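Every path through scrape() above resolves to the same JSON-friendly envelope, which is what an n8n workflow ultimately receives. A representative success payload, with illustrative field values only:

    {
        "success": True,
        "url": "https://example.com",
        "execution_time": 0.42,
        "method_used": "direct_html",
        "extracted_text": "Example Domain ...",
        "text_length": 1256,
        "metadata": {"title": "Example Domain"},
        "notes": "Text extracted directly from HTML (most accurate)"
    }

On failure the envelope instead carries "success": False and an "error" string, still with "url" and "execution_time".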

+    def _extract_direct_html(self, url: str) -> Dict[str, Any]:
+        """Extract text directly from HTML"""
         try:
+            headers = {
+                'User-Agent': self.user_agent,
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+                'Accept-Language': 'en-US,en;q=0.5',
+                'Accept-Encoding': 'gzip, deflate',
+                'Connection': 'keep-alive',
+            }

+            response = requests.get(url, headers=headers, timeout=15)
+            response.raise_for_status()

+            # Get encoding
+            if response.encoding is None:
+                response.encoding = 'utf-8'

+            html_content = response.text

+            # Extract metadata
+            metadata = self._extract_metadata(html_content)

+            # Extract text
+            if BEAUTIFULSOUP_AVAILABLE:
+                text = self._extract_text_with_bs4(html_content)
             else:
+                text = self._extract_text_simple(html_content)

+            # Clean and truncate text
+            cleaned_text = self._clean_text(text)

             return {
                 "success": True,
+                "text": cleaned_text[:15000],  # Limit for API response
+                "text_length": len(cleaned_text),
+                "metadata": metadata,
+                "http_status": response.status_code
             }

+        except requests.exceptions.RequestException as e:
+            print(f"Request error: {e}")
+            return {"success": False, "error": f"Request failed: {str(e)}"}
         except Exception as e:
+            print(f"Extraction error: {e}")
             return {"success": False, "error": str(e)}

+    def _extract_metadata(self, html: str) -> Dict[str, Any]:
+        """Extract basic metadata from HTML"""
+        metadata = {}

+        # Extract title
+        title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
+        if title_match:
+            metadata['title'] = re.sub(r'\s+', ' ', title_match.group(1)).strip()[:200]
+
+        # Extract meta description
+        desc_match = re.search(r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
+                               html, re.IGNORECASE)
+        if desc_match:
+            metadata['description'] = desc_match.group(1)[:300]
+
+        # Extract meta keywords
+        keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
+                                   html, re.IGNORECASE)
+        if keywords_match:
+            metadata['keywords'] = keywords_match.group(1)[:300]
+
+        return metadata
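Since _extract_metadata relies only on regular expressions, it can be sanity-checked in isolation. A small sketch with a made-up HTML snippet (the sample input is hypothetical, not part of the app):

    sample = ('<html><head><title>My  Page</title>'
              '<meta name="description" content="A short demo page."></head></html>')
    # Collapses whitespace in the title and caps field lengths, per the regexes above.
    print(SimpleWebScraper()._extract_metadata(sample))
    # -> {'title': 'My Page', 'description': 'A short demo page.'}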
+
+    def _extract_text_with_bs4(self, html: str) -> str:
+        """Extract text using BeautifulSoup if available"""
+        try:
+            soup = BeautifulSoup(html, 'html.parser')

+            # Remove unwanted elements
+            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside',
+                             'iframe', 'noscript', 'svg', 'form']):
+                tag.decompose()

+            # Try to find main content
+            main_text = ""

+            # Common content selectors
+            content_selectors = [
+                'main', 'article', '.content', '.post-content', '.article-content',
+                '.entry-content', '.story-content', '.text-content', '#content',
+                '.main-content', '.blog-content', '.page-content'
+            ]

+            for selector in content_selectors:
+                elements = soup.select(selector)
+                if elements:
+                    for elem in elements[:3]:  # Take first 3 matching
+                        main_text += elem.get_text() + "\n\n"

+            # If still no content, use body
+            if not main_text.strip() and soup.body:
+                main_text = soup.body.get_text()
+
+            return main_text

         except Exception as e:
+            print(f"BeautifulSoup error: {e}")
+            return self._extract_text_simple(html)

+    def _extract_text_simple(self, html: str) -> str:
         """Simple text extraction without BeautifulSoup"""
         # Remove scripts and styles
         html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
         html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)

+        # Remove HTML comments
+        html = re.sub(r'<!--.*?-->', ' ', html, flags=re.DOTALL)
+
+        # Remove HTML tags but keep text
         text = re.sub(r'<[^>]+>', ' ', html)

+        # Decode HTML entities
+        import html as html_module
+        text = html_module.unescape(text)

+        return text

     def _clean_text(self, text: str) -> str:
         """Clean extracted text"""
+        # Replace multiple whitespace
         text = re.sub(r'\s+', ' ', text)
+
         # Remove control characters
         text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
+
+        # Remove excessive line breaks
+        text = re.sub(r'\n{3,}', '\n\n', text)
+
         return text.strip()
+
+    def _enhanced_extraction(self, url: str) -> Dict[str, Any]:
+        """Enhanced extraction with fallback methods"""
+        try:
+            # Try with different headers
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
+                'Accept': 'text/html',
+            }

+            response = requests.get(url, headers=headers, timeout=15)

+            if response.status_code == 200:
+                text = self._extract_text_simple(response.text)
+                cleaned = self._clean_text(text)

+                if len(cleaned) > 100:
+                    return {
+                        "success": True,
+                        "text": cleaned[:15000],
+                        "text_length": len(cleaned),
+                        "method_used": "enhanced_direct",
+                        "notes": "Extracted with Googlebot user-agent"
+                    }

+            return {"success": False, "error": "Enhanced extraction failed"}

+        except Exception as e:
+            return {"success": False, "error": str(e)}
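With the class complete, it can be exercised locally before wiring up the API. A minimal sketch, assuming the class above is in scope and using an example URL:

    scraper = SimpleWebScraper()
    result = scraper.scrape("https://example.com")
    if result["success"]:
        # Shows which extraction path won and how much text came back.
        print(result["method_used"], result["text_length"])
    else:
        print("Scrape failed:", result["error"])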

 # ==============================================
+# FASTAPI APPLICATION
 # ==============================================

+# Initialize scraper
+scraper = SimpleWebScraper()

 # Create FastAPI app
+app = FastAPI(
+    title="Web Scraper API for n8n",
+    description="Lightweight web scraper optimized for Hugging Face Spaces",
+    version="1.0"
 )

+@app.get("/")
 async def root():
     return {
         "service": "Web Scraper API",
+        "version": "1.0",
+        "description": "Extract text content from webpages",
         "endpoints": {
+            "GET /": "This information",
             "GET /health": "Health check",
+            "POST /scrape": "Main scraping endpoint"
         },
         "usage": {
+            "curl": 'curl -X POST "https://your-space.hf.space/scrape" -H "Content-Type: application/json" -d \'{"url":"https://example.com"}\'',
+            "n8n": "Use HTTP Request node with POST method to /scrape endpoint"
         }
     }

+@app.get("/health")
 async def health():
     return {
         "status": "healthy",
         "timestamp": time.time(),
         "features": {
+            "html_parsing": BEAUTIFULSOUP_AVAILABLE,
+            "ocr": TRANSFORMERS_AVAILABLE
         }
     }

+@app.post("/scrape")
+async def api_scrape(request: Request):
     """Main API endpoint for n8n"""
     try:
+        data = await request.json()
+        url = data.get("url", "").strip()

         if not url:
+            return {
+                "success": False,
+                "error": "URL parameter is required",
+                "example": {"url": "https://example.com"}
+            }

+        print(f"API Request received for URL: {url}")
         result = scraper.scrape(url)

         return result

+    except json.JSONDecodeError:
+        return {
+            "success": False,
+            "error": "Invalid JSON payload",
+            "example": {"url": "https://example.com"}
+        }
     except Exception as e:
         print(f"❌ API Error: {e}")
         return {
             "success": False,
+            "error": f"Internal server error: {str(e)}"
         }

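From the caller's side (an n8n HTTP Request node or any HTTP client), /scrape expects a JSON body with a url field. A sketch using requests, with the Space hostname as a placeholder:

    import requests

    # "your-space.hf.space" is a placeholder for the actual Space URL.
    resp = requests.post(
        "https://your-space.hf.space/scrape",
        json={"url": "https://example.com"},
        timeout=60,
    )
    body = resp.json()
    print(body.get("success"), body.get("text_length"))

In n8n the same call is an HTTP Request node: method POST, URL pointing at /scrape, and a JSON body of {"url": "..."}.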
 # ==============================================

 # ==============================================

 def gradio_scrape(url: str):
+    """Gradio interface function"""
     if not url:
+        return "❌ Please enter a URL", {}

+    print(f"Gradio interface scraping: {url}")
     result = scraper.scrape(url)

+    if result.get("success"):
+        text = result.get("extracted_text", "")
+        text_length = result.get("text_length", 0)

+        # Create preview
+        preview = text[:500]
+        if len(text) > 500:
+            preview += "..."

+        output = f"""
+## ✅ Success!
+
+**URL:** {result['url']}
+**Method:** {result.get('method_used', 'direct_html')}
+**Time:** {result.get('execution_time', 0)}s
+**Characters:** {text_length:,}
+
+### Preview:
+{preview}
+
+### Full Response:
+Check the JSON output for complete data.
+"""
         return output, result
     else:
+        error_msg = result.get("error", "Unknown error")
+        return f"## ❌ Error\n\n{error_msg}", result

+# Create Gradio interface
+gradio_interface = gr.Interface(
     fn=gradio_scrape,
     inputs=gr.Textbox(
         label="Website URL",
+        placeholder="Enter a URL (e.g., https://example.com)",
+        lines=1
     ),
     outputs=[
         gr.Markdown(label="Result"),
         gr.JSON(label="API Response")
     ],
     title="Web Scraper for n8n",
+    description="Extract text content from webpages. Use with n8n HTTP Request node.",
     examples=[
         ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
         ["https://example.com"],
         ["https://httpbin.org/html"]
+    ],
+    allow_flagging="never"
 )

+# Mount Gradio to FastAPI
+app = gr.mount_gradio_app(app, gradio_interface, path="/")

 # ==============================================
+# APPLICATION ENTRY POINT
 # ==============================================

 if __name__ == "__main__":
+    import uvicorn
+
     print("\n" + "="*60)
     print("Web Scraper API Starting")
     print("="*60)
     print(f"Direct HTML: Enabled")
     print(f"OCR Available: {TRANSFORMERS_AVAILABLE}")
+    print(f"🧪 BeautifulSoup: {BEAUTIFULSOUP_AVAILABLE}")
     print("="*60 + "\n")

     uvicorn.run(
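The listing ends mid-call at uvicorn.run(. A typical completion for a Space is shown below as an assumption; the truncated original arguments may differ:

    uvicorn.run(
        app,             # FastAPI app with Gradio mounted at "/"
        host="0.0.0.0",  # listen on all interfaces inside the container
        port=7860,       # conventional Hugging Face Spaces port (assumed)
    )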