Update app.py
app.py
CHANGED
@@ -1,5 +1,5 @@
 # ==============================================
-#
 # ==============================================

 import gradio as gr
@@ -7,146 +7,101 @@ import requests
 import json
 import time
 import re
 from typing import Dict, Any
-

 # ==============================================
 # SIMPLE WEB SCRAPER
 # ==============================================

 class WebScraper:
-    """Lightweight web scraper

     def __init__(self):
-        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36

     def scrape(self, url: str) -> Dict[str, Any]:
         """Main scraping function"""
         start_time = time.time()

-        print(f"🔍 Scraping: {url}")
-
         # Ensure URL has protocol
         if not url.startswith(('http://', 'https://')):
             url = 'https://' + url

         try:
-            #
-            result = self._extract_content(url)
-
-            if result["success"]:
-                return {
-                    "success": True,
-                    "url": url,
-                    "execution_time": round(time.time() - start_time, 2),
-                    "method": result["method"],
-                    "extracted_text": result["text"][:10000],  # Limit response size
-                    "text_length": len(result["text"]),
-                    "metadata": result.get("metadata", {}),
-                    "status_code": result.get("status_code", 200)
-                }
-            else:
-                return {
-                    "success": False,
-                    "url": url,
-                    "error": result.get("error", "Unknown error"),
-                    "execution_time": round(time.time() - start_time, 2)
-                }
-
-        except Exception as e:
-            return {
-                "success": False,
-                "url": url,
-                "error": str(e),
-                "execution_time": round(time.time() - start_time, 2)
-            }
-
-    def _extract_content(self, url: str) -> Dict[str, Any]:
-        """Extract content from URL"""
-        try:
             headers = {
                 'User-Agent': self.user_agent,
-                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
-                'Accept-Language': 'en-US,en;q=0.9',
             }

-            response = requests.get(url, headers=headers, timeout=
             response.raise_for_status()

-            #
-
-            from bs4 import BeautifulSoup
-            soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Remove unwanted tags
-            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
-                tag.decompose()
-
-            # Get text
-            text = soup.get_text()
-            method = "beautifulsoup"
-
-            # Extract metadata
-            metadata = {}
-            if soup.title:
-                metadata['title'] = soup.title.string
-
-            # Try to find main content
-            main_selectors = ['article', 'main', '.content', '.post-content', '.article-content']
-            for selector in main_selectors:
-                elements = soup.select(selector)
-                if elements:
-                    text = ' '.join([elem.get_text() for elem in elements])
-                    break
-
-        except ImportError:
-            # Fallback to simple regex extraction
-            text = self._simple_extract(response.text)
-            method = "regex"
-            metadata = {}

         # Clean text
         cleaned_text = self._clean_text(text)

         return {
             "success": True,
-            "
-            "
-            "
-            "
         }

-        except requests.exceptions.RequestException as e:
-            return {"success": False, "error": f"Request failed: {str(e)}"}
         except Exception as e:
-            return {

-    def
-        """
         # Remove scripts and styles
-
-

         # Remove HTML tags
-        text = re.sub(r'<[^>]+>', ' ',

         # Decode HTML entities
-
-        text = html_module.unescape(text)

         return text

     def _clean_text(self, text: str) -> str:
-        """Clean
         # Replace multiple whitespace
         text = re.sub(r'\s+', ' ', text)

         # Remove control characters
         text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

-        # Remove excessive newlines
-        text = re.sub(r'\n{3,}', '\n\n', text)
-
         return text.strip()

 # ==============================================
@@ -155,16 +110,23 @@ class WebScraper:

 scraper = WebScraper()

 # ==============================================
 # GRADIO INTERFACE
 # ==============================================

-def scrape_url(url: str):
     """Gradio interface function"""
     if not url:
         return "❌ Please enter a URL", {}

-    print(f"Processing: {url}")
     result = scraper.scrape(url)

     if result["success"]:
@@ -180,7 +142,7 @@ def scrape_url(url: str):
     ## ✅ Success!

     **URL:** {result['url']}
-    **
     **Time:** {result['execution_time']}s
     **Characters:** {text_length:,}

@@ -194,11 +156,11 @@ def scrape_url(url: str):
     return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result

 # ==============================================
-# CREATE APP
 # ==============================================

-#
-with gr.Blocks(title="Web Scraper for n8n", theme=gr.themes.Soft()) as app:
     gr.Markdown("# 🌐 Web Scraper for n8n")
     gr.Markdown("Extract text content from webpages. Perfect for n8n workflows!")

@@ -212,10 +174,10 @@ with gr.Blocks(title="Web Scraper for n8n", theme=gr.themes.Soft()) as app:
         scrape_btn = gr.Button("Scrape", variant="primary")

         with gr.Column(scale=1):
-
             ### API Usage (for n8n)

-            **

             **Body:**
             ```json
@@ -244,34 +206,49 @@ with gr.Blocks(title="Web Scraper for n8n", theme=gr.themes.Soft()) as app:
     )

     # Event handlers
     scrape_btn.click(
-        fn=scrape_url,
         inputs=[url_input],
         outputs=[output_md, output_json]
     )

-    # Also trigger on Enter key
     url_input.submit(
-        fn=scrape_url,
         inputs=[url_input],
         outputs=[output_md, output_json]
     )

 # ==============================================
-#
 # ==============================================

-# Hugging Face Spaces
-#
-@app.app.post("/scrape")
-async def api_scrape(url: str = gr.String()):
-    """API endpoint for n8n"""
-    return scraper.scrape(url)
-
-# ==============================================
-# MAIN ENTRY POINT
-# ==============================================

 if __name__ == "__main__":
-
-
The resulting app.py:

 # ==============================================
+# WEB SCRAPER FOR N8N - GRADIO 6 COMPATIBLE
 # ==============================================

 import gradio as gr
 import requests
 import json
 import time
 import re
+import html
 from typing import Dict, Any
+import traceback

 # ==============================================
 # SIMPLE WEB SCRAPER
 # ==============================================

 class WebScraper:
+    """Lightweight web scraper"""

     def __init__(self):
+        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

     def scrape(self, url: str) -> Dict[str, Any]:
         """Main scraping function"""
         start_time = time.time()

         # Ensure URL has protocol
         if not url.startswith(('http://', 'https://')):
             url = 'https://' + url

         try:
+            # Fetch the page
             headers = {
                 'User-Agent': self.user_agent,
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
             }

+            response = requests.get(url, headers=headers, timeout=15)
             response.raise_for_status()

+            # Extract text
+            text = self._extract_text(response.text)

             # Clean text
             cleaned_text = self._clean_text(text)

+            # Extract title
+            title = self._extract_title(response.text)
+
             return {
                 "success": True,
+                "url": url,
+                "title": title,
+                "extracted_text": cleaned_text[:15000],
+                "text_length": len(cleaned_text),
+                "status_code": response.status_code,
+                "execution_time": round(time.time() - start_time, 2),
+                "method": "regex"
             }

         except Exception as e:
+            return {
+                "success": False,
+                "url": url,
+                "error": str(e),
+                "execution_time": round(time.time() - start_time, 2)
+            }

+    def _extract_text(self, html_content: str) -> str:
+        """Extract text from HTML using regex"""
         # Remove scripts and styles
+        html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
+        html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
+
+        # Remove comments
+        html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)

         # Remove HTML tags
+        text = re.sub(r'<[^>]+>', ' ', html_content)

         # Decode HTML entities
+        text = html.unescape(text)

         return text

+    def _extract_title(self, html_content: str) -> str:
+        """Extract page title"""
+        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
+        if title_match:
+            title = title_match.group(1)
+            # Clean title
+            title = re.sub(r'\s+', ' ', title).strip()
+            return title[:200]
+        return "No title found"
+
     def _clean_text(self, text: str) -> str:
+        """Clean extracted text"""
         # Replace multiple whitespace
         text = re.sub(r'\s+', ' ', text)

         # Remove control characters
         text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

         return text.strip()

 # ==============================================
 ...

 scraper = WebScraper()
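Not part of the commit: a quick offline sanity check of the regex extraction above, using the module-level `scraper` instance. The sample HTML is invented for illustration.

```python
# Invented sample input; exercises _extract_title, _extract_text, _clean_text.
sample = (
    "<html><head><title>  Demo   Page </title>"
    "<style>body { color: red }</style></head>"
    "<body><script>var x = 1;</script>"
    "<p>Hello &amp; welcome!</p><!-- hidden --></body></html>"
)

print(scraper._extract_title(sample))
# -> "Demo Page"
print(scraper._clean_text(scraper._extract_text(sample)))
# -> "Demo Page Hello & welcome!"
```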
+# ==============================================
+# API FUNCTION FOR N8N
+# ==============================================
+
+def api_scrape_function(url: str) -> Dict[str, Any]:
+    """Function for API calls"""
+    return scraper.scrape(url)
+
 # ==============================================
 # GRADIO INTERFACE
 # ==============================================

+def gradio_scrape(url: str):
     """Gradio interface function"""
     if not url:
         return "❌ Please enter a URL", {}

     result = scraper.scrape(url)

     if result["success"]:
 ...
     ## ✅ Success!

     **URL:** {result['url']}
+    **Title:** {result.get('title', 'N/A')}
     **Time:** {result['execution_time']}s
     **Characters:** {text_length:,}

 ...
     return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result

 # ==============================================
+# CREATE THE APP
 # ==============================================

+# Create Gradio blocks
+with gr.Blocks() as app:
     gr.Markdown("# 🌐 Web Scraper for n8n")
     gr.Markdown("Extract text content from webpages. Perfect for n8n workflows!")

 ...
         scrape_btn = gr.Button("Scrape", variant="primary")

         with gr.Column(scale=1):
+            gr.Markdown("""
             ### API Usage (for n8n)

+            **Method:** `POST` to `/api/scrape`

             **Body:**
             ```json
 ...
     )

     # Event handlers
+    def process_url(url):
+        return gradio_scrape(url)
+
     scrape_btn.click(
+        fn=process_url,
         inputs=[url_input],
         outputs=[output_md, output_json]
     )

     url_input.submit(
+        fn=process_url,
         inputs=[url_input],
         outputs=[output_md, output_json]
     )
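A hedged suggestion, not in the commit: recent Gradio releases let an event be exposed under a stable endpoint name via the `api_name` parameter, which gives n8n a documented route even if the custom FastAPI hook below turns out to be unavailable. The name "scrape" is chosen here for illustration.

```python
# Sketch only: same handler, registered with an explicit api_name so it can
# be called through Gradio's own API (gradio_client, or the /call/scrape
# route in recent Gradio versions). "scrape" is an arbitrary chosen name.
scrape_btn.click(
    fn=process_url,
    inputs=[url_input],
    outputs=[output_md, output_json],
    api_name="scrape",
)
```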
+
+# ==============================================
+# ADD API ENDPOINT DIRECTLY IN GRADIO
+# ==============================================
+
+# Create a separate API endpoint
+@app.app.post("/api/scrape")
+async def api_scrape(request: dict):
+    """API endpoint for n8n"""
+    try:
+        url = request.get("url", "").strip()
+        if not url:
+            return {"success": False, "error": "URL is required"}
+
+        return api_scrape_function(url)
+    except Exception as e:
+        return {"success": False, "error": str(e)}

 # ==============================================
+# LAUNCH CONFIGURATION
 # ==============================================

+# For Hugging Face Spaces, just define the app
+# The space will handle launching

+# For local testing
 if __name__ == "__main__":
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
+    )
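If the custom route proves fragile, the officially supported consumption path is the Gradio client against a named event, as in the `api_name` sketch earlier. A minimal sketch under that assumption, with a placeholder Space id:

```python
# Assumes the click handler was registered with api_name="scrape".
# "user/space" is a placeholder Space id.
from gradio_client import Client

client = Client("user/space")
markdown, details = client.predict("https://example.com", api_name="/scrape")
print(details)
```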