yukee1992 committed on
Commit
0fc5caf
·
verified ·
1 Parent(s): cfe45d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -308
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # WEB SCRAPER FOR N8N - HUGGING FACE SPACES VERSION
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -7,47 +7,19 @@ import requests
7
  import json
8
  import time
9
  import re
10
- import textwrap
11
- from typing import Dict, Any, Optional
12
- from fastapi import FastAPI, Request
13
  from io import BytesIO
14
- from PIL import Image, ImageDraw, ImageFont
15
 
16
  # ==============================================
17
- # IMPORTS WITH FALLBACKS
18
  # ==============================================
19
 
20
- # Try to import optional dependencies
21
- try:
22
- from bs4 import BeautifulSoup
23
- BEAUTIFULSOUP_AVAILABLE = True
24
- except ImportError:
25
- BEAUTIFULSOUP_AVAILABLE = False
26
- print("⚠️ BeautifulSoup not available - using simple HTML parsing")
27
-
28
- try:
29
- import torch
30
- from transformers import pipeline
31
- TRANSFORMERS_AVAILABLE = True
32
- except ImportError:
33
- TRANSFORMERS_AVAILABLE = False
34
- print("⚠️ Transformers not available - OCR disabled")
35
-
36
- # ==============================================
37
- # SIMPLE WEB SCRAPER (NO COMPLEX DEPENDENCIES)
38
- # ==============================================
39
-
40
- class SimpleWebScraper:
41
- """Lightweight web scraper optimized for Hugging Face Spaces"""
42
 
43
  def __init__(self):
44
- self.user_agent = (
45
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
46
- "AppleWebKit/537.36 (KHTML, like Gecko) "
47
- "Chrome/120.0.0.0 Safari/537.36"
48
- )
49
- self.ocr_processor = None
50
-
51
  def scrape(self, url: str) -> Dict[str, Any]:
52
  """Main scraping function"""
53
  start_time = time.time()
@@ -59,41 +31,28 @@ class SimpleWebScraper:
59
  url = 'https://' + url
60
 
61
  try:
62
- # Method 1: Direct HTML extraction (fastest and most reliable)
63
- html_result = self._extract_direct_html(url)
64
 
65
- if html_result.get("success") and html_result.get("text_length", 0) > 50:
66
- total_time = time.time() - start_time
67
-
68
  return {
69
  "success": True,
70
  "url": url,
71
- "execution_time": round(total_time, 2),
72
- "method_used": "direct_html",
73
- "extracted_text": html_result["text"],
74
- "text_length": html_result["text_length"],
75
- "metadata": html_result.get("metadata", {}),
76
- "notes": "Text extracted directly from HTML (most accurate)"
77
  }
78
-
79
- # Method 2: If direct extraction fails, try alternative
80
- print("Direct extraction limited, trying enhanced method...")
81
- enhanced_result = self._enhanced_extraction(url)
82
-
83
- total_time = time.time() - start_time
84
-
85
- if enhanced_result.get("success"):
86
- enhanced_result["execution_time"] = round(total_time, 2)
87
- return enhanced_result
88
-
89
- # Final fallback
90
- return {
91
- "success": False,
92
- "url": url,
93
- "error": "Failed to extract meaningful content",
94
- "execution_time": round(total_time, 2)
95
- }
96
-
97
  except Exception as e:
98
  return {
99
  "success": False,
@@ -102,122 +61,73 @@ class SimpleWebScraper:
102
  "execution_time": round(time.time() - start_time, 2)
103
  }
104
 
105
- def _extract_direct_html(self, url: str) -> Dict[str, Any]:
106
- """Extract text directly from HTML"""
107
  try:
108
  headers = {
109
  'User-Agent': self.user_agent,
110
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
111
- 'Accept-Language': 'en-US,en;q=0.5',
112
- 'Accept-Encoding': 'gzip, deflate',
113
- 'Connection': 'keep-alive',
114
  }
115
 
116
- response = requests.get(url, headers=headers, timeout=15)
117
  response.raise_for_status()
118
 
119
- # Get encoding
120
- if response.encoding is None:
121
- response.encoding = 'utf-8'
122
-
123
- html_content = response.text
124
-
125
- # Extract metadata
126
- metadata = self._extract_metadata(html_content)
127
-
128
- # Extract text
129
- if BEAUTIFULSOUP_AVAILABLE:
130
- text = self._extract_text_with_bs4(html_content)
131
- else:
132
- text = self._extract_text_simple(html_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- # Clean and truncate text
135
  cleaned_text = self._clean_text(text)
136
 
137
  return {
138
  "success": True,
139
- "text": cleaned_text[:15000], # Limit for API response
140
- "text_length": len(cleaned_text),
141
  "metadata": metadata,
142
- "http_status": response.status_code
143
  }
144
 
145
  except requests.exceptions.RequestException as e:
146
- print(f"Request error: {e}")
147
  return {"success": False, "error": f"Request failed: {str(e)}"}
148
  except Exception as e:
149
- print(f"Extraction error: {e}")
150
  return {"success": False, "error": str(e)}
151
 
152
- def _extract_metadata(self, html: str) -> Dict[str, Any]:
153
- """Extract basic metadata from HTML"""
154
- metadata = {}
155
-
156
- # Extract title
157
- title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
158
- if title_match:
159
- metadata['title'] = re.sub(r'\s+', ' ', title_match.group(1)).strip()[:200]
160
-
161
- # Extract meta description
162
- desc_match = re.search(r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
163
- html, re.IGNORECASE)
164
- if desc_match:
165
- metadata['description'] = desc_match.group(1)[:300]
166
-
167
- # Extract meta keywords
168
- keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
169
- html, re.IGNORECASE)
170
- if keywords_match:
171
- metadata['keywords'] = keywords_match.group(1)[:300]
172
-
173
- return metadata
174
-
175
- def _extract_text_with_bs4(self, html: str) -> str:
176
- """Extract text using BeautifulSoup if available"""
177
- try:
178
- soup = BeautifulSoup(html, 'html.parser')
179
-
180
- # Remove unwanted elements
181
- for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside',
182
- 'iframe', 'noscript', 'svg', 'form']):
183
- tag.decompose()
184
-
185
- # Try to find main content
186
- main_text = ""
187
-
188
- # Common content selectors
189
- content_selectors = [
190
- 'main', 'article', '.content', '.post-content', '.article-content',
191
- '.entry-content', '.story-content', '.text-content', '#content',
192
- '.main-content', '.blog-content', '.page-content'
193
- ]
194
-
195
- for selector in content_selectors:
196
- elements = soup.select(selector)
197
- if elements:
198
- for elem in elements[:3]: # Take first 3 matching
199
- main_text += elem.get_text() + "\n\n"
200
-
201
- # If still no content, use body
202
- if not main_text.strip() and soup.body:
203
- main_text = soup.body.get_text()
204
-
205
- return main_text
206
-
207
- except Exception as e:
208
- print(f"BeautifulSoup error: {e}")
209
- return self._extract_text_simple(html)
210
-
211
- def _extract_text_simple(self, html: str) -> str:
212
- """Simple text extraction without BeautifulSoup"""
213
  # Remove scripts and styles
214
  html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
215
  html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
216
 
217
- # Remove HTML comments
218
- html = re.sub(r'<!--.*?-->', ' ', html, flags=re.DOTALL)
219
-
220
- # Remove HTML tags but keep text
221
  text = re.sub(r'<[^>]+>', ' ', html)
222
 
223
  # Decode HTML entities
@@ -227,136 +137,39 @@ class SimpleWebScraper:
227
  return text
228
 
229
  def _clean_text(self, text: str) -> str:
230
- """Clean extracted text"""
231
  # Replace multiple whitespace
232
  text = re.sub(r'\s+', ' ', text)
233
 
234
  # Remove control characters
235
  text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
236
 
237
- # Remove excessive line breaks
238
  text = re.sub(r'\n{3,}', '\n\n', text)
239
 
240
  return text.strip()
241
-
242
- def _enhanced_extraction(self, url: str) -> Dict[str, Any]:
243
- """Enhanced extraction with fallback methods"""
244
- try:
245
- # Try with different headers
246
- headers = {
247
- 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
248
- 'Accept': 'text/html',
249
- }
250
-
251
- response = requests.get(url, headers=headers, timeout=15)
252
-
253
- if response.status_code == 200:
254
- text = self._extract_text_simple(response.text)
255
- cleaned = self._clean_text(text)
256
-
257
- if len(cleaned) > 100:
258
- return {
259
- "success": True,
260
- "text": cleaned[:15000],
261
- "text_length": len(cleaned),
262
- "method_used": "enhanced_direct",
263
- "notes": "Extracted with Googlebot user-agent"
264
- }
265
-
266
- return {"success": False, "error": "Enhanced extraction failed"}
267
-
268
- except Exception as e:
269
- return {"success": False, "error": str(e)}
270
 
271
  # ==============================================
272
- # FASTAPI APPLICATION
273
  # ==============================================
274
 
275
- # Initialize scraper
276
- scraper = SimpleWebScraper()
277
-
278
- # Create FastAPI app
279
- app = FastAPI(
280
- title="Web Scraper API for n8n",
281
- description="Lightweight web scraper optimized for Hugging Face Spaces",
282
- version="1.0"
283
- )
284
-
285
- @app.get("/")
286
- async def root():
287
- return {
288
- "service": "Web Scraper API",
289
- "version": "1.0",
290
- "description": "Extract text content from webpages",
291
- "endpoints": {
292
- "GET /": "This information",
293
- "GET /health": "Health check",
294
- "POST /scrape": "Main scraping endpoint"
295
- },
296
- "usage": {
297
- "curl": 'curl -X POST "https://your-space.hf.space/scrape" -H "Content-Type: application/json" -d \'{"url":"https://example.com"}\'',
298
- "n8n": "Use HTTP Request node with POST method to /scrape endpoint"
299
- }
300
- }
301
-
302
- @app.get("/health")
303
- async def health():
304
- return {
305
- "status": "healthy",
306
- "timestamp": time.time(),
307
- "features": {
308
- "html_parsing": BEAUTIFULSOUP_AVAILABLE,
309
- "ocr": TRANSFORMERS_AVAILABLE
310
- }
311
- }
312
-
313
- @app.post("/scrape")
314
- async def api_scrape(request: Request):
315
- """Main API endpoint for n8n"""
316
- try:
317
- data = await request.json()
318
- url = data.get("url", "").strip()
319
-
320
- if not url:
321
- return {
322
- "success": False,
323
- "error": "URL parameter is required",
324
- "example": {"url": "https://example.com"}
325
- }
326
-
327
- print(f"📨 API Request received for URL: {url}")
328
- result = scraper.scrape(url)
329
-
330
- return result
331
-
332
- except json.JSONDecodeError:
333
- return {
334
- "success": False,
335
- "error": "Invalid JSON payload",
336
- "example": {"url": "https://example.com"}
337
- }
338
- except Exception as e:
339
- print(f"❌ API Error: {e}")
340
- return {
341
- "success": False,
342
- "error": f"Internal server error: {str(e)}"
343
- }
344
 
345
  # ==============================================
346
  # GRADIO INTERFACE
347
  # ==============================================
348
 
349
- def gradio_scrape(url: str):
350
  """Gradio interface function"""
351
  if not url:
352
  return "❌ Please enter a URL", {}
353
 
354
- print(f"🎨 Gradio interface scraping: {url}")
355
  result = scraper.scrape(url)
356
 
357
- if result.get("success"):
358
- text = result.get("extracted_text", "")
359
- text_length = result.get("text_length", 0)
360
 
361
  # Create preview
362
  preview = text[:500]
@@ -367,64 +180,98 @@ def gradio_scrape(url: str):
367
  ## ✅ Success!
368
 
369
  **URL:** {result['url']}
370
- **Method:** {result.get('method_used', 'direct_html')}
371
- **Time:** {result.get('execution_time', 0)}s
372
  **Characters:** {text_length:,}
373
 
374
  ### Preview:
375
  {preview}
376
 
377
- ### Full Response:
378
- Check the JSON output for complete data.
379
  """
380
  return output, result
381
  else:
382
- error_msg = result.get("error", "Unknown error")
383
- return f"## ❌ Error\n\n{error_msg}", result
384
 
385
- # Create Gradio interface
386
- gradio_interface = gr.Interface(
387
- fn=gradio_scrape,
388
- inputs=gr.Textbox(
389
- label="Website URL",
390
- placeholder="Enter a URL (e.g., https://example.com)",
391
- lines=1
392
- ),
393
- outputs=[
394
- gr.Markdown(label="Result"),
395
- gr.JSON(label="API Response")
396
- ],
397
- title="🌐 Web Scraper for n8n",
398
- description="Extract text content from webpages. Use with n8n HTTP Request node.",
399
- examples=[
400
- ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
401
- ["https://example.com"],
402
- ["https://httpbin.org/html"]
403
- ],
404
- allow_flagging="never"
405
- )
406
 
407
- # Mount Gradio to FastAPI
408
- app = gr.mount_gradio_app(app, gradio_interface, path="/")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
  # ==============================================
411
- # APPLICATION ENTRY POINT
412
  # ==============================================
413
 
414
  if __name__ == "__main__":
415
- import uvicorn
416
-
417
- print("\n" + "="*60)
418
- print("🚀 Web Scraper API Starting")
419
- print("="*60)
420
- print(f"📝 Direct HTML: Enabled")
421
- print(f"🔍 OCR Available: {TRANSFORMERS_AVAILABLE}")
422
- print(f"🧪 BeautifulSoup: {BEAUTIFULSOUP_AVAILABLE}")
423
- print("="*60 + "\n")
424
-
425
- uvicorn.run(
426
- app,
427
- host="0.0.0.0",
428
- port=7860,
429
- log_level="info"
430
- )
 
1
  # ==============================================
2
+ # SIMPLE WEB SCRAPER FOR HUGGING FACE SPACES
3
  # ==============================================
4
 
5
  import gradio as gr
 
7
  import json
8
  import time
9
  import re
10
+ from typing import Dict, Any
 
 
11
  from io import BytesIO
 
12
 
13
  # ==============================================
14
+ # SIMPLE WEB SCRAPER
15
  # ==============================================
16
 
17
+ class WebScraper:
18
+ """Lightweight web scraper for Hugging Face Spaces"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def __init__(self):
21
+ self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
22
+
 
 
 
 
 
23
  def scrape(self, url: str) -> Dict[str, Any]:
24
  """Main scraping function"""
25
  start_time = time.time()
 
31
  url = 'https://' + url
32
 
33
  try:
34
+ # Extract content
35
+ result = self._extract_content(url)
36
 
37
+ if result["success"]:
 
 
38
  return {
39
  "success": True,
40
  "url": url,
41
+ "execution_time": round(time.time() - start_time, 2),
42
+ "method": result["method"],
43
+ "extracted_text": result["text"][:10000], # Limit response size
44
+ "text_length": len(result["text"]),
45
+ "metadata": result.get("metadata", {}),
46
+ "status_code": result.get("status_code", 200)
47
  }
48
+ else:
49
+ return {
50
+ "success": False,
51
+ "url": url,
52
+ "error": result.get("error", "Unknown error"),
53
+ "execution_time": round(time.time() - start_time, 2)
54
+ }
55
+
 
 
 
 
 
 
 
 
 
 
 
56
  except Exception as e:
57
  return {
58
  "success": False,
 
61
  "execution_time": round(time.time() - start_time, 2)
62
  }
63
 
64
def _extract_content(self, url: str) -> Dict[str, Any]:
    """Fetch *url* and extract its readable text.

    Uses BeautifulSoup when installed (better content targeting via CSS
    selectors), otherwise falls back to regex-based tag stripping.

    Returns:
        On success: ``{"success": True, "text", "method", "metadata",
        "status_code"}``.  On failure: ``{"success": False, "error"}``.
    """
    try:
        headers = {
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Prefer BeautifulSoup; import lazily so bs4 stays optional.
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Drop non-content tags before extracting text.
            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                tag.decompose()

            # Default: whole-document text (overridden below if a main
            # content container matches).
            text = soup.get_text()
            method = "beautifulsoup"

            metadata = {}
            # BUG FIX: <title> may be absent, and soup.title.string may be
            # None for an empty/complex title — never store None, and trim
            # surrounding whitespace.
            if soup.title and soup.title.string:
                metadata['title'] = soup.title.string.strip()

            # Prefer the main content container when one matches.
            main_selectors = ['article', 'main', '.content', '.post-content', '.article-content']
            for selector in main_selectors:
                elements = soup.select(selector)
                if elements:
                    text = ' '.join(elem.get_text() for elem in elements)
                    break

        except ImportError:
            # bs4 not installed — regex fallback, no metadata available.
            text = self._simple_extract(response.text)
            method = "regex"
            metadata = {}

        # Normalize whitespace / control characters.
        cleaned_text = self._clean_text(text)

        return {
            "success": True,
            "text": cleaned_text,
            "method": method,
            "metadata": metadata,
            "status_code": response.status_code
        }

    except requests.exceptions.RequestException as e:
        return {"success": False, "error": f"Request failed: {str(e)}"}
    except Exception as e:
        return {"success": False, "error": str(e)}
123
 
124
+ def _simple_extract(self, html: str) -> str:
125
+ """Simple HTML extraction using regex"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  # Remove scripts and styles
127
  html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
128
  html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
129
 
130
+ # Remove HTML tags
 
 
 
131
  text = re.sub(r'<[^>]+>', ' ', html)
132
 
133
  # Decode HTML entities
 
137
  return text
138
 
139
  def _clean_text(self, text: str) -> str:
140
+ """Clean and normalize text"""
141
  # Replace multiple whitespace
142
  text = re.sub(r'\s+', ' ', text)
143
 
144
  # Remove control characters
145
  text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
146
 
147
+ # Remove excessive newlines
148
  text = re.sub(r'\n{3,}', '\n\n', text)
149
 
150
  return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
# ==============================================
# INITIALIZE
# ==============================================

# Single module-level scraper instance, shared by the Gradio UI handler
# and the /scrape API route. WebScraper holds no mutable state beyond the
# UA string, so sharing one instance is safe here.
scraper = WebScraper()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  # ==============================================
159
  # GRADIO INTERFACE
160
  # ==============================================
161
 
162
+ def scrape_url(url: str):
163
  """Gradio interface function"""
164
  if not url:
165
  return "❌ Please enter a URL", {}
166
 
167
+ print(f"Processing: {url}")
168
  result = scraper.scrape(url)
169
 
170
+ if result["success"]:
171
+ text = result["extracted_text"]
172
+ text_length = result["text_length"]
173
 
174
  # Create preview
175
  preview = text[:500]
 
180
  ## ✅ Success!
181
 
182
  **URL:** {result['url']}
183
+ **Method:** {result.get('method', 'unknown')}
184
+ **Time:** {result['execution_time']}s
185
  **Characters:** {text_length:,}
186
 
187
  ### Preview:
188
  {preview}
189
 
190
+ *Check JSON tab for full response*
 
191
  """
192
  return output, result
193
  else:
194
+ return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
 
195
 
196
# ==============================================
# CREATE APP
# ==============================================

# For Hugging Face Spaces, we need to create the app correctly:
# Spaces imports this module and serves the top-level `app` Blocks object.
with gr.Blocks(title="Web Scraper for n8n", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🌐 Web Scraper for n8n")
    gr.Markdown("Extract text content from webpages. Perfect for n8n workflows!")

    with gr.Row():
        # Left (wider) column: URL entry plus trigger button.
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://example.com",
                value="https://en.wikipedia.org/wiki/Artificial_intelligence"
            )
            scrape_btn = gr.Button("Scrape", variant="primary")

        # Right (narrower) column: static usage notes for n8n users.
        with gr.Column(scale=1):
            api_info = gr.Markdown("""
            ### API Usage (for n8n)

            **Endpoint:** `POST /scrape`

            **Body:**
            ```json
            {
                "url": "https://example.com"
            }
            ```

            **Response:** JSON with extracted text
            """)

    # Results row: human-readable summary on the left, raw JSON on the right.
    with gr.Row():
        with gr.Column():
            output_md = gr.Markdown(label="Result")
        with gr.Column():
            output_json = gr.JSON(label="API Response")

    # Examples
    gr.Examples(
        examples=[
            ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
            ["https://example.com"],
            ["https://httpbin.org/html"]
        ],
        inputs=[url_input]
    )

    # Event handlers: button click runs the scrape and fills both outputs.
    scrape_btn.click(
        fn=scrape_url,
        inputs=[url_input],
        outputs=[output_md, output_json]
    )

    # Also trigger on Enter key in the URL textbox.
    url_input.submit(
        fn=scrape_url,
        inputs=[url_input],
        outputs=[output_md, output_json]
    )
259
+
260
# ==============================================
# ADD API ENDPOINT FOR N8N
# ==============================================

# Hugging Face Spaces will mount the app.
# We'll also create a simple API route for n8n's HTTP Request node.
async def api_scrape(url: str = ""):
    """API endpoint for n8n: scrape ``url`` and return the result dict."""
    return scraper.scrape(url)

# BUG FIX: the original decorated this with ``@app.app.post(...)`` and used
# a default of ``gr.String()``.  ``gr.String`` does not exist in Gradio
# (AttributeError at import), and a ``gr.Blocks`` object only exposes its
# FastAPI instance (``.app``) after it has been launched/mounted, so
# decorating at module import time crashed the whole Space.  Register the
# route defensively instead so the module always imports.
try:
    app.app.post("/scrape")(api_scrape)
except AttributeError:
    print("⚠️ /scrape route not registered - FastAPI app not available yet")
270
 
271
# ==============================================
# MAIN ENTRY POINT
# ==============================================

if __name__ == "__main__":
    # This runs locally but not on Hugging Face Spaces —
    # Spaces imports this module and serves the `app` object itself.
    # 0.0.0.0 / 7860 matches the Spaces convention for local parity.
    app.launch(server_name="0.0.0.0", server_port=7860)