Spaces:

yukee1992
/

Screenshot-scraper

Sleeping

App Files Files Community

yukee1992 committed on Jan 25

Commit

4451668

verified ·

1 Parent(s): 5703393

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -95

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # ==============================================
-# WEB SCRAPER FOR N8N - GRADIO 6 COMPATIBLE
 # ==============================================
 import gradio as gr
@@ -9,7 +9,8 @@ import time
 import re
 import html
 from typing import Dict, Any
-import traceback
 # ==============================================
 # SIMPLE WEB SCRAPER
@@ -25,6 +26,8 @@ class WebScraper:
         """Main scraping function"""
         start_time = time.time()
         # Ensure URL has protocol
         if not url.startswith(('http://', 'https://')):
             url = 'https://' + url
@@ -56,7 +59,7 @@ class WebScraper:
                 "text_length": len(cleaned_text),
                 "status_code": response.status_code,
                 "execution_time": round(time.time() - start_time, 2),
-                "method": "regex"
             }
         except Exception as e:
@@ -111,12 +114,80 @@ class WebScraper:
 scraper = WebScraper()
 # ==============================================
-# API FUNCTION FOR N8N
 # ==============================================
-def api_scrape_function(url: str) -> Dict[str, Any]:
-    """Function for API calls"""
-    return scraper.scrape(url)
 # ==============================================
 # GRADIO INTERFACE
@@ -149,106 +220,68 @@ def gradio_scrape(url: str):
 ### Preview:
 {preview}
-*Check JSON tab for full response*
 """
         return output, result
     else:
         return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
 # ==============================================
-# CREATE THE APP
 # ==============================================
-# Create Gradio blocks
-with gr.Blocks() as app:
-    gr.Markdown("# 🌐 Web Scraper for n8n")
-    gr.Markdown("Extract text content from webpages. Perfect for n8n workflows!")
-    with gr.Row():
-        with gr.Column(scale=2):
-            url_input = gr.Textbox(
-                label="Website URL",
-                placeholder="https://example.com",
-                value="https://en.wikipedia.org/wiki/Artificial_intelligence"
-            )
-            scrape_btn = gr.Button("Scrape", variant="primary")
-        with gr.Column(scale=1):
-            gr.Markdown("""
-            ### API Usage (for n8n)
-            **Method:** `POST` to `/api/scrape`
-            **Body:**
-            ```json
-            {
-              "url": "https://example.com"
-            }
-            ```
-            **Response:** JSON with extracted text
-            """)
-    with gr.Row():
-        with gr.Column():
-            output_md = gr.Markdown(label="Result")
-        with gr.Column():
-            output_json = gr.JSON(label="API Response")
-    # Examples
-    gr.Examples(
-        examples=[
-            ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
-            ["https://example.com"],
-            ["https://httpbin.org/html"]
-        ],
-        inputs=[url_input]
-    )
-    # Event handlers
-    def process_url(url):
-        return gradio_scrape(url)
-    scrape_btn.click(
-        fn=process_url,
-        inputs=[url_input],
-        outputs=[output_md, output_json]
-    )
-    url_input.submit(
-        fn=process_url,
-        inputs=[url_input],
-        outputs=[output_md, output_json]
-    )
-    # ==============================================
-    # ADD API ENDPOINT DIRECTLY IN GRADIO
-    # ==============================================
-    # Create a separate API endpoint
-    @app.app.post("/api/scrape")
-    async def api_scrape(request: dict):
-        """API endpoint for n8n"""
-        try:
-            url = request.get("url", "").strip()
-            if not url:
-                return {"success": False, "error": "URL is required"}
-            return api_scrape_function(url)
-        except Exception as e:
-            return {"success": False, "error": str(e)}
 # ==============================================
-# LAUNCH CONFIGURATION
 # ==============================================
-# For Hugging Face Spaces, just define the app
-# The space will handle launching
-# For local testing
 if __name__ == "__main__":
-    app.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False
     )

 # ==============================================
+# WEB SCRAPER FOR N8N - WORKING VERSION
 # ==============================================
 import gradio as gr
 import re
 import html
 from typing import Dict, Any
+from fastapi import FastAPI, Request
+import uvicorn
 # ==============================================
 # SIMPLE WEB SCRAPER
         """Main scraping function"""
         start_time = time.time()
+        print(f"🌐 Scraping: {url}")
         # Ensure URL has protocol
         if not url.startswith(('http://', 'https://')):
             url = 'https://' + url
                 "text_length": len(cleaned_text),
                 "status_code": response.status_code,
                 "execution_time": round(time.time() - start_time, 2),
+                "method": "direct_html"
             }
         except Exception as e:
 scraper = WebScraper()
 # ==============================================
+# CREATE FASTAPI APP FIRST
 # ==============================================
+# Create FastAPI app
+fastapi_app = FastAPI(
+    title="Web Scraper API",
+    description="Extract text from webpages for n8n workflows",
+    version="1.0"
+)
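+# Add CORS middleware
+# NOTE(review): the FastAPI CORS documentation says wildcard origins/methods/headers
+# should not be combined with allow_credentials=True; list explicit origins if
+# credentialed (cookie/authorization) requests are actually needed — confirm.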
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+fastapi_app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@fastapi_app.get("/")
+async def root():
+    """Return a static JSON description of the service, its endpoints, and usage hints."""
+    return {
+        "service": "Web Scraper API",
+        "version": "1.0",
+        "endpoints": {
+            "GET /": "This info",
+            "GET /health": "Health check",
+            "POST /scrape": "Scrape a webpage (for n8n)"
+        },
+        "usage": {
+            "curl": 'curl -X POST "https://your-space.hf.space/scrape" -H "Content-Type: application/json" -d \'{"url":"https://example.com"}\'',
+            "n8n": "HTTP Request node: POST to /scrape with JSON body"
+        }
+    }
+@fastapi_app.get("/health")
+async def health():
+    """Health check: report a fixed "healthy" status plus the current time.time() value."""
+    return {
+        "status": "healthy",
+        "timestamp": time.time()
+    }
+@fastapi_app.post("/scrape")
+async def api_scrape(request: Request):
+    """Main API endpoint for n8n"""
+    try:
+        # Parse JSON body
+        body = await request.json()
+        url = body.get("url", "").strip()
+        if not url:
+            return JSONResponse(
+                status_code=400,
+                content={"success": False, "error": "URL parameter is required"}
+            )
+        print(f"📨 API Request received for: {url}")
+        result = scraper.scrape(url)
+        return result
+    except json.JSONDecodeError:
+        return JSONResponse(
+            status_code=400,
+            content={"success": False, "error": "Invalid JSON payload"}
+        )
+    except Exception as e:
+        return JSONResponse(
+            status_code=500,
+            content={"success": False, "error": f"Internal error: {str(e)}"}
+        )
 # ==============================================
 # GRADIO INTERFACE
 ### Preview:
 {preview}
+*Check JSON tab for full API response*
 """
         return output, result
     else:
         return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
+# Create Gradio interface
+gradio_interface = gr.Interface(
+    fn=gradio_scrape,
+    inputs=gr.Textbox(
+        label="Website URL",
+        placeholder="https://example.com",
+        value="https://example.com"
+    ),
+    outputs=[
+        gr.Markdown(label="Result"),
+        gr.JSON(label="API Response")
+    ],
+    title="🌐 Web Scraper for n8n",
+    description="Extract text from webpages. Use POST /scrape for n8n integration.",
+    examples=[
+        ["https://example.com"],
+        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
+        ["https://httpbin.org/html"]
+    ]
+)
 # ==============================================
+# MOUNT GRADIO TO FASTAPI
 # ==============================================
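+# Mount the Gradio app on the FastAPI app at the root path.
+# NOTE(review): GET "/" is already registered on fastapi_app above, and routes
+# are matched in registration order, so that JSON route likely takes precedence
+# over the Gradio page at "/" — confirm, or mount the UI under a sub-path.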
+app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
 # ==============================================
+# ALTERNATIVE: If mounting doesn't work, try this:
 # ==============================================
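+# Instead of mounting, you can also define routes manually.
+# Uncomment the block below if mounting doesn't work.
+# NOTE(review): as written, the sketch below redirects "/" to "/" itself, which
+# would loop; point the redirect at the path where the Gradio UI is actually served.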
+# @fastapi_app.get("/")
+# async def gradio_root():
+#     # This will redirect to the Gradio interface
+#     from fastapi.responses import RedirectResponse
+#     return RedirectResponse(url="/")
+# ==============================================
+# LAUNCH THE APP
+# ==============================================
 if __name__ == "__main__":
+    print("\n" + "="*60)
+    print("🚀 Web Scraper API Starting")
+    print("="*60)
+    print("API Endpoint: POST /scrape")
+    print("Web Interface: GET /")
+    print("="*60 + "\n")
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=7860,
+        log_level="info"
     )