yukee1992 committed on
Commit
4451668
·
verified ·
1 Parent(s): 5703393

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -95
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # WEB SCRAPER FOR N8N - GRADIO 6 COMPATIBLE
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -9,7 +9,8 @@ import time
9
  import re
10
  import html
11
  from typing import Dict, Any
12
- import traceback
 
13
 
14
  # ==============================================
15
  # SIMPLE WEB SCRAPER
@@ -25,6 +26,8 @@ class WebScraper:
25
  """Main scraping function"""
26
  start_time = time.time()
27
 
 
 
28
  # Ensure URL has protocol
29
  if not url.startswith(('http://', 'https://')):
30
  url = 'https://' + url
@@ -56,7 +59,7 @@ class WebScraper:
56
  "text_length": len(cleaned_text),
57
  "status_code": response.status_code,
58
  "execution_time": round(time.time() - start_time, 2),
59
- "method": "regex"
60
  }
61
 
62
  except Exception as e:
@@ -111,12 +114,80 @@ class WebScraper:
111
  scraper = WebScraper()
112
 
113
  # ==============================================
114
- # API FUNCTION FOR N8N
115
  # ==============================================
116
 
117
- def api_scrape_function(url: str) -> Dict[str, Any]:
118
- """Function for API calls"""
119
- return scraper.scrape(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
  # ==============================================
122
  # GRADIO INTERFACE
@@ -149,106 +220,68 @@ def gradio_scrape(url: str):
149
  ### Preview:
150
  {preview}
151
 
152
- *Check JSON tab for full response*
153
  """
154
  return output, result
155
  else:
156
  return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  # ==============================================
159
- # CREATE THE APP
160
  # ==============================================
161
 
162
- # Create Gradio blocks
163
- with gr.Blocks() as app:
164
- gr.Markdown("# 🌐 Web Scraper for n8n")
165
- gr.Markdown("Extract text content from webpages. Perfect for n8n workflows!")
166
-
167
- with gr.Row():
168
- with gr.Column(scale=2):
169
- url_input = gr.Textbox(
170
- label="Website URL",
171
- placeholder="https://example.com",
172
- value="https://en.wikipedia.org/wiki/Artificial_intelligence"
173
- )
174
- scrape_btn = gr.Button("Scrape", variant="primary")
175
-
176
- with gr.Column(scale=1):
177
- gr.Markdown("""
178
- ### API Usage (for n8n)
179
-
180
- **Method:** `POST` to `/api/scrape`
181
-
182
- **Body:**
183
- ```json
184
- {
185
- "url": "https://example.com"
186
- }
187
- ```
188
-
189
- **Response:** JSON with extracted text
190
- """)
191
-
192
- with gr.Row():
193
- with gr.Column():
194
- output_md = gr.Markdown(label="Result")
195
- with gr.Column():
196
- output_json = gr.JSON(label="API Response")
197
-
198
- # Examples
199
- gr.Examples(
200
- examples=[
201
- ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
202
- ["https://example.com"],
203
- ["https://httpbin.org/html"]
204
- ],
205
- inputs=[url_input]
206
- )
207
-
208
- # Event handlers
209
- def process_url(url):
210
- return gradio_scrape(url)
211
-
212
- scrape_btn.click(
213
- fn=process_url,
214
- inputs=[url_input],
215
- outputs=[output_md, output_json]
216
- )
217
-
218
- url_input.submit(
219
- fn=process_url,
220
- inputs=[url_input],
221
- outputs=[output_md, output_json]
222
- )
223
-
224
- # ==============================================
225
- # ADD API ENDPOINT DIRECTLY IN GRADIO
226
- # ==============================================
227
-
228
- # Create a separate API endpoint
229
- @app.app.post("/api/scrape")
230
- async def api_scrape(request: dict):
231
- """API endpoint for n8n"""
232
- try:
233
- url = request.get("url", "").strip()
234
- if not url:
235
- return {"success": False, "error": "URL is required"}
236
-
237
- return api_scrape_function(url)
238
- except Exception as e:
239
- return {"success": False, "error": str(e)}
240
 
241
  # ==============================================
242
- # LAUNCH CONFIGURATION
243
  # ==============================================
244
 
245
- # For Hugging Face Spaces, just define the app
246
- # The space will handle launching
 
 
 
 
 
 
 
 
 
 
247
 
248
- # For local testing
249
  if __name__ == "__main__":
250
- app.launch(
251
- server_name="0.0.0.0",
252
- server_port=7860,
253
- share=False
 
 
 
 
 
 
 
 
254
  )
 
1
  # ==============================================
2
+ # WEB SCRAPER FOR N8N - WORKING VERSION
3
  # ==============================================
4
 
5
  import gradio as gr
 
9
  import re
10
  import html
11
  from typing import Dict, Any
12
+ from fastapi import FastAPI, Request
13
+ import uvicorn
14
 
15
  # ==============================================
16
  # SIMPLE WEB SCRAPER
 
26
  """Main scraping function"""
27
  start_time = time.time()
28
 
29
+ print(f"🌐 Scraping: {url}")
30
+
31
  # Ensure URL has protocol
32
  if not url.startswith(('http://', 'https://')):
33
  url = 'https://' + url
 
59
  "text_length": len(cleaned_text),
60
  "status_code": response.status_code,
61
  "execution_time": round(time.time() - start_time, 2),
62
+ "method": "direct_html"
63
  }
64
 
65
  except Exception as e:
 
114
# ----------------------------------------------
# FastAPI application setup
# ----------------------------------------------

from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

# One shared scraper instance serves every request handler.
scraper = WebScraper()

# The FastAPI app hosts the JSON API; the Gradio UI is mounted onto it later.
fastapi_app = FastAPI(
    title="Web Scraper API",
    description="Extract text from webpages for n8n workflows",
    version="1.0",
)

# Wide-open CORS so n8n (or a browser client) can call the API cross-origin.
fastapi_app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
138
+
139
@fastapi_app.get("/")
async def root():
    """Service metadata: advertises the available endpoints and example usage."""
    endpoints = {
        "GET /": "This info",
        "GET /health": "Health check",
        "POST /scrape": "Scrape a webpage (for n8n)",
    }
    usage = {
        "curl": 'curl -X POST "https://your-space.hf.space/scrape" -H "Content-Type: application/json" -d \'{"url":"https://example.com"}\'',
        "n8n": "HTTP Request node: POST to /scrape with JSON body",
    }
    return {
        "service": "Web Scraper API",
        "version": "1.0",
        "endpoints": endpoints,
        "usage": usage,
    }
154
+
155
@fastapi_app.get("/health")
async def health():
    """Liveness probe: a healthy status plus the current Unix timestamp."""
    now = time.time()
    return {"status": "healthy", "timestamp": now}
160
+ }
161
+
162
@fastapi_app.post("/scrape")
async def api_scrape(request: Request):
    """Main API endpoint for n8n.

    Expects a JSON object body like ``{"url": "https://example.com"}`` and
    returns the scraper's result dict. Bad payloads get a 400, unexpected
    failures a 500 — always JSON, so an n8n workflow can branch on "success".
    """
    try:
        # request.json() raises json.JSONDecodeError (a ValueError subclass)
        # on a malformed body; handled below without naming the json module.
        body = await request.json()

        # Reject non-object payloads (bare string/list/number) up front so
        # they surface as a 400 rather than an AttributeError-driven 500.
        if not isinstance(body, dict):
            return JSONResponse(
                status_code=400,
                content={"success": False, "error": "URL parameter is required"}
            )

        # str() guards against non-string "url" values (e.g. a number).
        url = str(body.get("url", "")).strip()
        if not url:
            return JSONResponse(
                status_code=400,
                content={"success": False, "error": "URL parameter is required"}
            )

        print(f"📨 API Request received for: {url}")
        return scraper.scrape(url)

    except ValueError:
        # Covers json.JSONDecodeError without requiring a top-level
        # `import json` (not present in the visible import block).
        return JSONResponse(
            status_code=400,
            content={"success": False, "error": "Invalid JSON payload"}
        )
    except Exception as e:
        # Last-resort boundary handler: report, don't crash the server.
        return JSONResponse(
            status_code=500,
            content={"success": False, "error": f"Internal error: {str(e)}"}
        )
191
 
192
  # ==============================================
193
  # GRADIO INTERFACE
 
220
  ### Preview:
221
  {preview}
222
 
223
+ *Check JSON tab for full API response*
224
  """
225
  return output, result
226
  else:
227
  return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
228
 
229
# Gradio UI wired to the same gradio_scrape function shown in the web tab.
url_box = gr.Textbox(
    label="Website URL",
    placeholder="https://example.com",
    value="https://example.com",
)
result_md = gr.Markdown(label="Result")
response_json = gr.JSON(label="API Response")

gradio_interface = gr.Interface(
    fn=gradio_scrape,
    inputs=url_box,
    outputs=[result_md, response_json],
    title="🌐 Web Scraper for n8n",
    description="Extract text from webpages. Use POST /scrape for n8n integration.",
    examples=[
        ["https://example.com"],
        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
        ["https://httpbin.org/html"],
    ],
)
249
+
250
  # ==============================================
251
+ # MOUNT GRADIO TO FASTAPI
252
  # ==============================================
253
 
254
# Serve UI and API from one ASGI app on a single port: the Gradio
# interface at "/" alongside the JSON routes (/health, /scrape).
app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  # ==============================================
258
+ # ALTERNATIVE: If mounting doesn't work, try this:
259
  # ==============================================
260
 
261
+ # Instead of mounting, you can also define routes manually
262
+ # Uncomment below if mounting doesn't work:
263
+
264
+ # @fastapi_app.get("/")
265
+ # async def gradio_root():
266
+ # # This will redirect to the Gradio interface
267
+ # from fastapi.responses import RedirectResponse
268
+ # return RedirectResponse(url="/")
269
+
270
+ # ==============================================
271
+ # LAUNCH THE APP
272
+ # ==============================================
273
 
 
274
if __name__ == "__main__":
    # Startup banner for local runs (Spaces launches the app itself).
    banner = "=" * 60
    print("\n" + banner)
    print("🚀 Web Scraper API Starting")
    print(banner)
    print("API Endpoint: POST /scrape")
    print("Web Interface: GET /")
    print(banner + "\n")

    # Bind all interfaces; 7860 is the port Hugging Face Spaces expects.
    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")