Spaces:

yukee1992
/

Screenshot-scraper

Paused

App Files Files Community

yukee1992 committed on Jan 24

Commit

2be7d27

verified ·

1 Parent(s): 1d051b3

Create app.py

Browse files

Files changed (1) hide show

app.py +220 -0

app.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import gradio as gr
+import requests
+import base64
+import json
+from io import BytesIO
+from PIL import Image
+# Process-wide OCR pipeline cache; stays None until load_ocr() succeeds
+ocr_processor = None
+def load_ocr():
+    """Return the shared OCR pipeline, building it on the first call.
+
+    The transformers import and the model construction are deferred until
+    they are needed so that importing this module stays cheap. A failure is
+    printed and reported as None without being cached, so the next call
+    attempts the load again.
+    """
+    global ocr_processor
+    if ocr_processor is not None:
+        return ocr_processor
+    try:
+        from transformers import pipeline
+        ocr_processor = pipeline(
+            "image-to-text",
+            model="microsoft/trocr-base-printed"
+        )
+    except Exception as load_error:
+        print(f"Failed to load OCR: {load_error}")
+        return None
+    return ocr_processor
+def get_screenshot(url):
+    """Fetch a screenshot of a web page from the s0.wp.com mshots endpoint.
+
+    Args:
+        url: Address of the page to capture.
+
+    Returns:
+        On success, a dict with "success" True, the raw "image_bytes", the
+        same data as a "base64" string and its "size" in bytes. On failure,
+        a dict with "success" False, an "error" message and "fallback" True.
+        A blank URL, a failed request and a non-200 response are all
+        reported this way rather than raised.
+    """
+    from urllib.parse import quote
+    if not isinstance(url, str) or not url.strip():
+        return {
+            "success": False,
+            "error": "A non-empty URL is required",
+            "fallback": True
+        }
+    # The target is embedded as a single path segment, so it has to be
+    # percent-encoded: pasted in raw, its own "?query" or "#fragment" would
+    # be parsed as part of the screenshot request instead of the target.
+    encoded_target = quote(url.strip(), safe="")
+    screenshot_url = f"https://s0.wp.com/mshots/v1/{encoded_target}?w=800"
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+    }
+    try:
+        response = requests.get(screenshot_url, headers=headers, timeout=30)
+    except requests.RequestException as e:
+        return {
+            "success": False,
+            "error": str(e),
+            "fallback": True
+        }
+    if response.status_code != 200:
+        return {
+            "success": False,
+            "error": f"HTTP {response.status_code}",
+            "fallback": True
+        }
+    image_bytes = response.content
+    return {
+        "success": True,
+        "image_bytes": image_bytes,
+        "base64": base64.b64encode(image_bytes).decode('utf-8'),
+        "size": len(image_bytes)
+    }
+def extract_text_from_image(image_bytes):
+    """Run OCR over an encoded image and report the recognised text.
+
+    Args:
+        image_bytes: Encoded image data in a format Pillow can open.
+
+    Returns:
+        {"success": True, "text": ..., "length": ...} with the stripped text
+        and its character count, or {"success": False, "error": ...} when
+        the input is empty, the OCR pipeline is unavailable, or decoding or
+        recognition fails.
+    """
+    # Reject empty input before load_ocr() so a missing image never triggers
+    # the expensive model load.
+    if not image_bytes:
+        return {"success": False, "error": "No image data provided"}
+    try:
+        ocr = load_ocr()
+        if ocr is None:
+            return {"success": False, "error": "OCR not available"}
+        with Image.open(BytesIO(image_bytes)) as image:
+            result = ocr(image)
+        if isinstance(result, list) and result:
+            text = result[0].get('generated_text', '')
+        else:
+            text = str(result)
+        text = text.strip()
+        return {
+            "success": True,
+            "text": text,
+            "length": len(text)
+        }
+    except Exception as e:
+        # Deliberately broad: decoder and model errors are all reported to
+        # the caller as a failed result instead of propagating.
+        return {"success": False, "error": str(e)}
+def scrape_website(url):
+    """Main scraping function - called by Gradio and API.
+
+    Takes a screenshot of the page and runs OCR over it.
+
+    Args:
+        url: Address of the page to scrape.
+
+    Returns:
+        A dict that always carries "success", "url" and "execution_time"
+        (seconds, rounded to two decimals). A failed screenshot or a blank
+        URL adds "error". A successful screenshot adds "screenshot_size",
+        "screenshot_available" and "ocr_success", followed by either
+        "extracted_text" and "text_length" or "ocr_error".
+    """
+    import time
+    started = time.monotonic()
+    def elapsed():
+        # One helper so every return path reports time the same way
+        return round(time.monotonic() - started, 2)
+    if not isinstance(url, str) or not url.strip():
+        return {
+            "success": False,
+            "url": url,
+            "error": "URL is required",
+            "execution_time": elapsed()
+        }
+    # Get screenshot
+    screenshot_result = get_screenshot(url)
+    if not screenshot_result.get("success", False):
+        return {
+            "success": False,
+            "url": url,
+            "error": screenshot_result.get("error", "Unknown error"),
+            "execution_time": elapsed()
+        }
+    # Extract text
+    ocr_result = extract_text_from_image(screenshot_result["image_bytes"])
+    ocr_ok = ocr_result.get("success", False)
+    # A failed OCR pass still counts as a successful scrape: the screenshot
+    # exists, and the OCR problem is reported under "ocr_error".
+    response = {
+        "success": True,
+        "url": url,
+        "execution_time": elapsed(),
+        "screenshot_size": screenshot_result.get("size", 0),
+        "screenshot_available": True,
+        "ocr_success": ocr_ok
+    }
+    if ocr_ok:
+        response["extracted_text"] = ocr_result["text"]
+        response["text_length"] = ocr_result["length"]
+    else:
+        response["ocr_error"] = ocr_result.get("error", "Unknown OCR error")
+    return response
+# ==================== GRADIO INTERFACE ====================
+def gradio_scrape(url):
+    """Function for Gradio interface.
+
+    Args:
+        url: Address typed into the URL textbox.
+
+    Returns:
+        A (markdown, result) pair: a Markdown summary for display and the
+        unmodified dict returned by scrape_website.
+    """
+    result = scrape_website(url)
+    if not result["success"]:
+        return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
+    parts = [
+        "## ✅ Success!\n\n",
+        f"**URL:** {result['url']}\n",
+        f"**Time:** {result['execution_time']}s\n",
+        f"**Text Length:** {result.get('text_length', 0)} characters\n\n",
+    ]
+    extracted = result.get('extracted_text')
+    if extracted:
+        # Show first 1000 characters
+        text_preview = extracted[:1000]
+        if len(extracted) > 1000:
+            text_preview += "..."
+        parts.append(f"**Extracted Text:**\n{text_preview}")
+    elif not result.get('ocr_success', True):
+        # The screenshot worked but OCR did not: say why instead of only
+        # showing a text length of 0.
+        parts.append(f"**OCR Error:** {result.get('ocr_error', 'Unknown OCR error')}")
+    return "".join(parts), result
+# Create Gradio interface: one URL textbox in, two outputs (a Markdown
+# summary and the raw result dict as JSON), matching the pair returned by
+# gradio_scrape.
+demo = gr.Interface(
+    fn=gradio_scrape,
+    inputs=gr.Textbox(
+        label="Website URL",
+        placeholder="https://example.com",
+        value="https://example.com"
+    ),
+    outputs=[
+        gr.Markdown(label="Result"),
+        gr.JSON(label="API Response")
+    ],
+    title="📸 Screenshot Scraper for n8n",
+    description="Take screenshots of websites and extract text using AI. Use the API endpoint below for n8n integration.",
+    # Sample URLs for the single textbox input, one per row
+    examples=[
+        ["https://example.com"],
+        ["https://news.ycombinator.com"],
+        ["https://en.wikipedia.org/wiki/Artificial_intelligence"]
+    ]
+)
+# ==================== FASTAPI ENDPOINT ====================
+# JSON API intended for n8n workflows (see the /api/scrape handler)
+from fastapi import FastAPI
+import uvicorn
+# FastAPI application that the route decorators below register handlers on
+app = FastAPI(title="Screenshot Scraper API")
+# Served at /api rather than /: a handler registered at "/" is matched
+# before the Gradio app that is mounted at "/" further down, which would
+# hide the web UI behind this JSON response.
+@app.get("/api")
+async def root():
+    """Describe the service and list its HTTP endpoints."""
+    return {
+        "message": "Screenshot Scraper API",
+        "endpoints": {
+            "GET /api": "This endpoint listing",
+            "GET /health": "Health check",
+            "POST /api/scrape": "Scrape website (for n8n)",
+            "GET /": "Gradio interface"
+        },
+        "usage_n8n": "Use HTTP Request node to POST to /api/scrape with JSON: {\"url\": \"https://example.com\"}"
+    }
+@app.get("/health")
+async def health():
+    """Liveness probe: always answers with a fixed "healthy" payload."""
+    payload = dict(status="healthy", service="screenshot-scraper")
+    return payload
+@app.post("/api/scrape")
+async def api_scrape(url: str = None, data: dict = None):
+    """API endpoint for n8n.
+
+    Args:
+        url: Target address given as a query parameter.
+        data: JSON request body; its "url" key is used when the query
+            parameter is absent.
+
+    Returns:
+        The dict produced by scrape_website, or
+        {"success": False, "error": ...} when no URL was supplied or the
+        scrape raised.
+    """
+    import asyncio
+    # Get URL from either parameter or JSON body; an empty value counts as
+    # missing.
+    if url:
+        target_url = url
+    elif data and data.get("url"):
+        target_url = data["url"]
+    else:
+        return {"success": False, "error": "URL parameter is required"}
+    try:
+        # scrape_website blocks on the network and on OCR. Running it in the
+        # default executor keeps the event loop free to serve other requests
+        # (health checks, the UI) in the meantime.
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, scrape_website, target_url)
+    except Exception as e:
+        return {"success": False, "error": str(e)}
+# Mount Gradio app: attach the UI to the FastAPI app at "/" and rebind `app`
+# to the object returned by mount_gradio_app
+app = gr.mount_gradio_app(app, demo, path="/")
+# For local testing: when run as a script, serve on all interfaces, port 7860
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)