yukee1992 committed on
Commit
2be7d27
·
verified ·
1 Parent(s): 1d051b3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +220 -0
app.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import base64
4
+ import json
5
+ from io import BytesIO
6
+ from PIL import Image
7
+
8
# Module-level cache for the OCR pipeline; filled in lazily by load_ocr().
ocr_processor = None

def load_ocr():
    """Return the shared OCR pipeline, creating it on first use.

    The transformers pipeline is imported and built lazily so that a
    missing or broken model cannot prevent the app from starting.
    Returns None when the pipeline cannot be created; a later call
    will simply retry.
    """
    global ocr_processor
    if ocr_processor is not None:
        return ocr_processor
    try:
        from transformers import pipeline
        ocr_processor = pipeline(
            "image-to-text",
            model="microsoft/trocr-base-printed",
        )
    except Exception as e:
        print(f"Failed to load OCR: {e}")
        ocr_processor = None
    return ocr_processor
25
+
26
def get_screenshot(url):
    """Fetch a screenshot of *url* via the free WordPress mShots service.

    Parameters
    ----------
    url : str
        Target page address, e.g. "https://example.com".

    Returns
    -------
    dict
        On success: {"success": True, "image_bytes": bytes,
        "base64": str, "size": int}. On failure: {"success": False,
        "error": str, "fallback": True} so the caller can fall back.
    """
    from urllib.parse import quote

    try:
        # Percent-encode the target URL before embedding it in the mShots
        # path: a raw "?" or "&" in the target would otherwise be read as
        # part of the mShots query string and clash with the appended w=800.
        screenshot_url = f"https://s0.wp.com/mshots/v1/{quote(url, safe='')}?w=800"

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(screenshot_url, headers=headers, timeout=30)

        if response.status_code == 200:
            return {
                "success": True,
                "image_bytes": response.content,
                "base64": base64.b64encode(response.content).decode('utf-8'),
                "size": len(response.content)
            }
        # Non-200: report the HTTP status so the caller can fall back.
        return {
            "success": False,
            "error": f"HTTP {response.status_code}",
            "fallback": True
        }

    except Exception as e:
        # Network errors, timeouts, DNS failures, etc.
        return {
            "success": False,
            "error": str(e),
            "fallback": True
        }
63
+
64
def extract_text_from_image(image_bytes):
    """Run OCR over raw image bytes and return the recognized text.

    Returns a dict: {"success": True, "text": str, "length": int} on
    success, or {"success": False, "error": str} when OCR is
    unavailable or fails.
    """
    try:
        ocr = load_ocr()
        if ocr is None:
            return {"success": False, "error": "OCR not available"}

        # Decode the raw bytes into a PIL image and run the pipeline on it.
        result = ocr(Image.open(BytesIO(image_bytes)))

        # The pipeline normally yields a list of dicts with a
        # 'generated_text' key; stringify any other shape defensively.
        if isinstance(result, list) and result:
            text = result[0].get('generated_text', '')
        else:
            text = str(result)

        cleaned = text.strip()
        return {
            "success": True,
            "text": cleaned,
            "length": len(cleaned),
        }

    except Exception as e:
        return {"success": False, "error": str(e)}
90
+
91
def scrape_website(url):
    """Screenshot *url* and OCR the result; main entry for Gradio and the API.

    Parameters
    ----------
    url : str
        Page to capture.

    Returns
    -------
    dict
        Always contains "success", "url" and "execution_time" (seconds,
        rounded to 2 decimals). On success it adds screenshot metadata
        plus either "extracted_text"/"text_length" or "ocr_error".
    """
    import time
    start_time = time.time()

    # Step 1: capture the page as an image.
    screenshot_result = get_screenshot(url)

    if not screenshot_result.get("success", False):
        return {
            "success": False,
            "url": url,
            "error": screenshot_result.get("error", "Unknown error"),
            # Rounded for consistency with the success path below.
            "execution_time": round(time.time() - start_time, 2)
        }

    # Step 2: OCR the captured image.
    ocr_result = extract_text_from_image(screenshot_result["image_bytes"])

    response = {
        "success": True,
        "url": url,
        "execution_time": round(time.time() - start_time, 2),
        "screenshot_size": screenshot_result.get("size", 0),
        "screenshot_available": True,
        "ocr_success": ocr_result.get("success", False)
    }

    # Use .get consistently — a malformed OCR result must not raise here.
    if ocr_result.get("success", False):
        response["extracted_text"] = ocr_result["text"]
        response["text_length"] = ocr_result["length"]
    else:
        response["ocr_error"] = ocr_result.get("error", "Unknown OCR error")

    return response
128
+
129
# ==================== GRADIO INTERFACE ====================
def gradio_scrape(url):
    """Adapter between scrape_website() and the Gradio UI.

    Returns a (markdown_summary, raw_result_dict) pair matching the two
    output components of the interface.
    """
    result = scrape_website(url)

    if not result["success"]:
        return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result

    parts = [
        "## ✅ Success!\n\n",
        f"**URL:** {result['url']}\n",
        f"**Time:** {result['execution_time']}s\n",
        f"**Text Length:** {result.get('text_length', 0)} characters\n\n",
    ]

    full_text = result.get('extracted_text')
    if full_text:
        # Cap the preview at 1000 characters to keep the UI readable.
        preview = full_text[:1000]
        if len(full_text) > 1000:
            preview += "..."
        parts.append(f"**Extracted Text:**\n{preview}")

    return "".join(parts), result
150
+
151
# Build the Gradio UI: one URL textbox in, a markdown summary plus the
# raw JSON response out.
_url_input = gr.Textbox(
    label="Website URL",
    placeholder="https://example.com",
    value="https://example.com",
)

demo = gr.Interface(
    fn=gradio_scrape,
    inputs=_url_input,
    outputs=[
        gr.Markdown(label="Result"),
        gr.JSON(label="API Response"),
    ],
    title="📸 Screenshot Scraper for n8n",
    description="Take screenshots of websites and extract text using AI. Use the API endpoint below for n8n integration.",
    examples=[
        ["https://example.com"],
        ["https://news.ycombinator.com"],
        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
    ],
)
171
+
172
# ==================== FASTAPI ENDPOINT ====================
# FastAPI hosts the JSON API used by n8n; the Gradio UI is mounted onto
# this same app further below.
from fastapi import FastAPI
import uvicorn

# Create the FastAPI app (rebound later by gr.mount_gradio_app).
app = FastAPI(title="Screenshot Scraper API")
179
+
180
@app.get("/")
async def root():
    """Service landing page: describes the available endpoints."""
    endpoints = {
        "GET /health": "Health check",
        "POST /api/scrape": "Scrape website (for n8n)",
        "GET /": "This Gradio interface"
    }
    return {
        "message": "Screenshot Scraper API",
        "endpoints": endpoints,
        "usage_n8n": "Use HTTP Request node to POST to /api/scrape with JSON: {\"url\": \"https://example.com\"}"
    }
191
+
192
@app.get("/health")
async def health():
    """Liveness probe for n8n / monitoring."""
    return dict(status="healthy", service="screenshot-scraper")
195
+
196
@app.post("/api/scrape")
async def api_scrape(url: str = None, data: dict = None):
    """n8n-facing endpoint.

    Accepts the target URL either as a query parameter (`url`) or inside
    a JSON body ({"url": ...}); the query parameter takes precedence.
    Returns the scrape result (or an error dict) as JSON.
    """
    try:
        if url:
            target_url = url
        else:
            # Fall back to the JSON body; membership test (not truthiness)
            # mirrors the query/body precedence contract exactly.
            if not (data and "url" in data):
                return {"success": False, "error": "URL parameter is required"}
            target_url = data["url"]

        return scrape_website(target_url)

    except Exception as e:
        return {"success": False, "error": str(e)}
214
+
215
# Mount the Gradio UI onto the FastAPI app at "/".
# NOTE(review): this rebinds `app`, so the object served below is the
# combined FastAPI + Gradio application and the Gradio UI takes over "/".
app = gr.mount_gradio_app(app, demo, path="/")

# Allow running the combined app directly for local testing.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)