yukee1992 committed on
Commit
0fc5caf
·
verified ·
1 Parent(s): cfe45d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -308
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # WEB SCRAPER FOR N8N - HUGGING FACE SPACES VERSION
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -7,47 +7,19 @@ import requests
7
  import json
8
  import time
9
  import re
10
- import textwrap
11
- from typing import Dict, Any, Optional
12
- from fastapi import FastAPI, Request
13
  from io import BytesIO
14
- from PIL import Image, ImageDraw, ImageFont
15
 
16
  # ==============================================
17
- # IMPORTS WITH FALLBACKS
18
  # ==============================================
19
 
20
- # Try to import optional dependencies
21
- try:
22
- from bs4 import BeautifulSoup
23
- BEAUTIFULSOUP_AVAILABLE = True
24
- except ImportError:
25
- BEAUTIFULSOUP_AVAILABLE = False
26
- print("⚠️ BeautifulSoup not available - using simple HTML parsing")
27
-
28
- try:
29
- import torch
30
- from transformers import pipeline
31
- TRANSFORMERS_AVAILABLE = True
32
- except ImportError:
33
- TRANSFORMERS_AVAILABLE = False
34
- print("⚠️ Transformers not available - OCR disabled")
35
-
36
- # ==============================================
37
- # SIMPLE WEB SCRAPER (NO COMPLEX DEPENDENCIES)
38
- # ==============================================
39
-
40
- class SimpleWebScraper:
41
- """Lightweight web scraper optimized for Hugging Face Spaces"""
42
 
43
  def __init__(self):
44
- self.user_agent = (
45
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
46
- "AppleWebKit/537.36 (KHTML, like Gecko) "
47
- "Chrome/120.0.0.0 Safari/537.36"
48
- )
49
- self.ocr_processor = None
50
-
51
  def scrape(self, url: str) -> Dict[str, Any]:
52
  """Main scraping function"""
53
  start_time = time.time()
@@ -59,41 +31,28 @@ class SimpleWebScraper:
59
  url = 'https://' + url
60
 
61
  try:
62
- # Method 1: Direct HTML extraction (fastest and most reliable)
63
- html_result = self._extract_direct_html(url)
64
 
65
- if html_result.get("success") and html_result.get("text_length", 0) > 50:
66
- total_time = time.time() - start_time
67
-
68
  return {
69
  "success": True,
70
  "url": url,
71
- "execution_time": round(total_time, 2),
72
- "method_used": "direct_html",
73
- "extracted_text": html_result["text"],
74
- "text_length": html_result["text_length"],
75
- "metadata": html_result.get("metadata", {}),
76
- "notes": "Text extracted directly from HTML (most accurate)"
77
  }
78
-
79
- # Method 2: If direct extraction fails, try alternative
80
- print("Direct extraction limited, trying enhanced method...")
81
- enhanced_result = self._enhanced_extraction(url)
82
-
83
- total_time = time.time() - start_time
84
-
85
- if enhanced_result.get("success"):
86
- enhanced_result["execution_time"] = round(total_time, 2)
87
- return enhanced_result
88
-
89
- # Final fallback
90
- return {
91
- "success": False,
92
- "url": url,
93
- "error": "Failed to extract meaningful content",
94
- "execution_time": round(total_time, 2)
95
- }
96
-
97
  except Exception as e:
98
  return {
99
  "success": False,
@@ -102,122 +61,73 @@ class SimpleWebScraper:
102
  "execution_time": round(time.time() - start_time, 2)
103
  }
104
 
105
- def _extract_direct_html(self, url: str) -> Dict[str, Any]:
106
- """Extract text directly from HTML"""
107
  try:
108
  headers = {
109
  'User-Agent': self.user_agent,
110
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
111
- 'Accept-Language': 'en-US,en;q=0.5',
112
- 'Accept-Encoding': 'gzip, deflate',
113
- 'Connection': 'keep-alive',
114
  }
115
 
116
- response = requests.get(url, headers=headers, timeout=15)
117
  response.raise_for_status()
118
 
119
- # Get encoding
120
- if response.encoding is None:
121
- response.encoding = 'utf-8'
122
-
123
- html_content = response.text
124
-
125
- # Extract metadata
126
- metadata = self._extract_metadata(html_content)
127
-
128
- # Extract text
129
- if BEAUTIFULSOUP_AVAILABLE:
130
- text = self._extract_text_with_bs4(html_content)
131
- else:
132
- text = self._extract_text_simple(html_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- # Clean and truncate text
135
  cleaned_text = self._clean_text(text)
136
 
137
  return {
138
  "success": True,
139
- "text": cleaned_text[:15000], # Limit for API response
140
- "text_length": len(cleaned_text),
141
  "metadata": metadata,
142
- "http_status": response.status_code
143
  }
144
 
145
  except requests.exceptions.RequestException as e:
146
- print(f"Request error: {e}")
147
  return {"success": False, "error": f"Request failed: {str(e)}"}
148
  except Exception as e:
149
- print(f"Extraction error: {e}")
150
  return {"success": False, "error": str(e)}
151
 
152
- def _extract_metadata(self, html: str) -> Dict[str, Any]:
153
- """Extract basic metadata from HTML"""
154
- metadata = {}
155
-
156
- # Extract title
157
- title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
158
- if title_match:
159
- metadata['title'] = re.sub(r'\s+', ' ', title_match.group(1)).strip()[:200]
160
-
161
- # Extract meta description
162
- desc_match = re.search(r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
163
- html, re.IGNORECASE)
164
- if desc_match:
165
- metadata['description'] = desc_match.group(1)[:300]
166
-
167
- # Extract meta keywords
168
- keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
169
- html, re.IGNORECASE)
170
- if keywords_match:
171
- metadata['keywords'] = keywords_match.group(1)[:300]
172
-
173
- return metadata
174
-
175
- def _extract_text_with_bs4(self, html: str) -> str:
176
- """Extract text using BeautifulSoup if available"""
177
- try:
178
- soup = BeautifulSoup(html, 'html.parser')
179
-
180
- # Remove unwanted elements
181
- for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside',
182
- 'iframe', 'noscript', 'svg', 'form']):
183
- tag.decompose()
184
-
185
- # Try to find main content
186
- main_text = ""
187
-
188
- # Common content selectors
189
- content_selectors = [
190
- 'main', 'article', '.content', '.post-content', '.article-content',
191
- '.entry-content', '.story-content', '.text-content', '#content',
192
- '.main-content', '.blog-content', '.page-content'
193
- ]
194
-
195
- for selector in content_selectors:
196
- elements = soup.select(selector)
197
- if elements:
198
- for elem in elements[:3]: # Take first 3 matching
199
- main_text += elem.get_text() + "\n\n"
200
-
201
- # If still no content, use body
202
- if not main_text.strip() and soup.body:
203
- main_text = soup.body.get_text()
204
-
205
- return main_text
206
-
207
- except Exception as e:
208
- print(f"BeautifulSoup error: {e}")
209
- return self._extract_text_simple(html)
210
-
211
- def _extract_text_simple(self, html: str) -> str:
212
- """Simple text extraction without BeautifulSoup"""
213
  # Remove scripts and styles
214
  html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
215
  html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
216
 
217
- # Remove HTML comments
218
- html = re.sub(r'<!--.*?-->', ' ', html, flags=re.DOTALL)
219
-
220
- # Remove HTML tags but keep text
221
  text = re.sub(r'<[^>]+>', ' ', html)
222
 
223
  # Decode HTML entities
@@ -227,136 +137,39 @@ class SimpleWebScraper:
227
  return text
228
 
229
  def _clean_text(self, text: str) -> str:
230
- """Clean extracted text"""
231
  # Replace multiple whitespace
232
  text = re.sub(r'\s+', ' ', text)
233
 
234
  # Remove control characters
235
  text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
236
 
237
- # Remove excessive line breaks
238
  text = re.sub(r'\n{3,}', '\n\n', text)
239
 
240
  return text.strip()
241
-
242
- def _enhanced_extraction(self, url: str) -> Dict[str, Any]:
243
- """Enhanced extraction with fallback methods"""
244
- try:
245
- # Try with different headers
246
- headers = {
247
- 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
248
- 'Accept': 'text/html',
249
- }
250
-
251
- response = requests.get(url, headers=headers, timeout=15)
252
-
253
- if response.status_code == 200:
254
- text = self._extract_text_simple(response.text)
255
- cleaned = self._clean_text(text)
256
-
257
- if len(cleaned) > 100:
258
- return {
259
- "success": True,
260
- "text": cleaned[:15000],
261
- "text_length": len(cleaned),
262
- "method_used": "enhanced_direct",
263
- "notes": "Extracted with Googlebot user-agent"
264
- }
265
-
266
- return {"success": False, "error": "Enhanced extraction failed"}
267
-
268
- except Exception as e:
269
- return {"success": False, "error": str(e)}
270
 
271
  # ==============================================
272
- # FASTAPI APPLICATION
273
  # ==============================================
274
 
275
- # Initialize scraper
276
- scraper = SimpleWebScraper()
277
-
278
- # Create FastAPI app
279
- app = FastAPI(
280
- title="Web Scraper API for n8n",
281
- description="Lightweight web scraper optimized for Hugging Face Spaces",
282
- version="1.0"
283
- )
284
-
285
- @app.get("/")
286
- async def root():
287
- return {
288
- "service": "Web Scraper API",
289
- "version": "1.0",
290
- "description": "Extract text content from webpages",
291
- "endpoints": {
292
- "GET /": "This information",
293
- "GET /health": "Health check",
294
- "POST /scrape": "Main scraping endpoint"
295
- },
296
- "usage": {
297
- "curl": 'curl -X POST "https://your-space.hf.space/scrape" -H "Content-Type: application/json" -d \'{"url":"https://example.com"}\'',
298
- "n8n": "Use HTTP Request node with POST method to /scrape endpoint"
299
- }
300
- }
301
-
302
- @app.get("/health")
303
- async def health():
304
- return {
305
- "status": "healthy",
306
- "timestamp": time.time(),
307
- "features": {
308
- "html_parsing": BEAUTIFULSOUP_AVAILABLE,
309
- "ocr": TRANSFORMERS_AVAILABLE
310
- }
311
- }
312
-
313
- @app.post("/scrape")
314
- async def api_scrape(request: Request):
315
- """Main API endpoint for n8n"""
316
- try:
317
- data = await request.json()
318
- url = data.get("url", "").strip()
319
-
320
- if not url:
321
- return {
322
- "success": False,
323
- "error": "URL parameter is required",
324
- "example": {"url": "https://example.com"}
325
- }
326
-
327
- print(f"📨 API Request received for URL: {url}")
328
- result = scraper.scrape(url)
329
-
330
- return result
331
-
332
- except json.JSONDecodeError:
333
- return {
334
- "success": False,
335
- "error": "Invalid JSON payload",
336
- "example": {"url": "https://example.com"}
337
- }
338
- except Exception as e:
339
- print(f"❌ API Error: {e}")
340
- return {
341
- "success": False,
342
- "error": f"Internal server error: {str(e)}"
343
- }
344
 
345
  # ==============================================
346
  # GRADIO INTERFACE
347
  # ==============================================
348
 
349
- def gradio_scrape(url: str):
350
  """Gradio interface function"""
351
  if not url:
352
  return "❌ Please enter a URL", {}
353
 
354
- print(f"🎨 Gradio interface scraping: {url}")
355
  result = scraper.scrape(url)
356
 
357
- if result.get("success"):
358
- text = result.get("extracted_text", "")
359
- text_length = result.get("text_length", 0)
360
 
361
  # Create preview
362
  preview = text[:500]
@@ -367,64 +180,98 @@ def gradio_scrape(url: str):
367
  ## ✅ Success!
368
 
369
  **URL:** {result['url']}
370
- **Method:** {result.get('method_used', 'direct_html')}
371
- **Time:** {result.get('execution_time', 0)}s
372
  **Characters:** {text_length:,}
373
 
374
  ### Preview:
375
  {preview}
376
 
377
- ### Full Response:
378
- Check the JSON output for complete data.
379
  """
380
  return output, result
381
  else:
382
- error_msg = result.get("error", "Unknown error")
383
- return f"## ❌ Error\n\n{error_msg}", result
384
 
385
- # Create Gradio interface
386
- gradio_interface = gr.Interface(
387
- fn=gradio_scrape,
388
- inputs=gr.Textbox(
389
- label="Website URL",
390
- placeholder="Enter a URL (e.g., https://example.com)",
391
- lines=1
392
- ),
393
- outputs=[
394
- gr.Markdown(label="Result"),
395
- gr.JSON(label="API Response")
396
- ],
397
- title="🌐 Web Scraper for n8n",
398
- description="Extract text content from webpages. Use with n8n HTTP Request node.",
399
- examples=[
400
- ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
401
- ["https://example.com"],
402
- ["https://httpbin.org/html"]
403
- ],
404
- allow_flagging="never"
405
- )
406
 
407
- # Mount Gradio to FastAPI
408
- app = gr.mount_gradio_app(app, gradio_interface, path="/")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
  # ==============================================
411
- # APPLICATION ENTRY POINT
412
  # ==============================================
413
 
414
  if __name__ == "__main__":
415
- import uvicorn
416
-
417
- print("\n" + "="*60)
418
- print("🚀 Web Scraper API Starting")
419
- print("="*60)
420
- print(f"📝 Direct HTML: Enabled")
421
- print(f"🔍 OCR Available: {TRANSFORMERS_AVAILABLE}")
422
- print(f"🧪 BeautifulSoup: {BEAUTIFULSOUP_AVAILABLE}")
423
- print("="*60 + "\n")
424
-
425
- uvicorn.run(
426
- app,
427
- host="0.0.0.0",
428
- port=7860,
429
- log_level="info"
430
- )
 
1
  # ==============================================
2
+ # SIMPLE WEB SCRAPER FOR HUGGING FACE SPACES
3
  # ==============================================
4
 
5
  import gradio as gr
 
7
  import json
8
  import time
9
  import re
10
+ from typing import Dict, Any
 
 
11
  from io import BytesIO
 
12
 
13
  # ==============================================
14
+ # SIMPLE WEB SCRAPER
15
  # ==============================================
16
 
17
+ class WebScraper:
18
+ """Lightweight web scraper for Hugging Face Spaces"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def __init__(self):
21
+ self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
22
+
 
 
 
 
 
23
  def scrape(self, url: str) -> Dict[str, Any]:
24
  """Main scraping function"""
25
  start_time = time.time()
 
31
  url = 'https://' + url
32
 
33
  try:
34
+ # Extract content
35
+ result = self._extract_content(url)
36
 
37
+ if result["success"]:
 
 
38
  return {
39
  "success": True,
40
  "url": url,
41
+ "execution_time": round(time.time() - start_time, 2),
42
+ "method": result["method"],
43
+ "extracted_text": result["text"][:10000], # Limit response size
44
+ "text_length": len(result["text"]),
45
+ "metadata": result.get("metadata", {}),
46
+ "status_code": result.get("status_code", 200)
47
  }
48
+ else:
49
+ return {
50
+ "success": False,
51
+ "url": url,
52
+ "error": result.get("error", "Unknown error"),
53
+ "execution_time": round(time.time() - start_time, 2)
54
+ }
55
+
 
 
 
 
 
 
 
 
 
 
 
56
  except Exception as e:
57
  return {
58
  "success": False,
 
61
  "execution_time": round(time.time() - start_time, 2)
62
  }
63
 
64
def _extract_content(self, url: str) -> Dict[str, Any]:
    """Fetch *url* and extract its readable text.

    Uses BeautifulSoup when installed (better content targeting via CSS
    selectors), otherwise falls back to regex-based tag stripping.

    Returns:
        On success: ``{"success": True, "text", "method", "metadata",
        "status_code"}``.  On failure: ``{"success": False, "error"}``.
    """
    try:
        headers = {
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Prefer BeautifulSoup; import lazily so bs4 stays optional.
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Drop non-content tags before extracting text.
            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                tag.decompose()

            # Default: whole-document text (overridden below if a main
            # content container matches).
            text = soup.get_text()
            method = "beautifulsoup"

            metadata = {}
            # BUG FIX: <title> may be absent, and soup.title.string may be
            # None for an empty/complex title — never store None, and trim
            # surrounding whitespace.
            if soup.title and soup.title.string:
                metadata['title'] = soup.title.string.strip()

            # Prefer the main content container when one matches.
            main_selectors = ['article', 'main', '.content', '.post-content', '.article-content']
            for selector in main_selectors:
                elements = soup.select(selector)
                if elements:
                    text = ' '.join(elem.get_text() for elem in elements)
                    break

        except ImportError:
            # bs4 not installed — regex fallback, no metadata available.
            text = self._simple_extract(response.text)
            method = "regex"
            metadata = {}

        # Normalize whitespace / control characters.
        cleaned_text = self._clean_text(text)

        return {
            "success": True,
            "text": cleaned_text,
            "method": method,
            "metadata": metadata,
            "status_code": response.status_code
        }

    except requests.exceptions.RequestException as e:
        return {"success": False, "error": f"Request failed: {str(e)}"}
    except Exception as e:
        return {"success": False, "error": str(e)}
123
 
124
+ def _simple_extract(self, html: str) -> str:
125
+ """Simple HTML extraction using regex"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  # Remove scripts and styles
127
  html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
128
  html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
129
 
130
+ # Remove HTML tags
 
 
 
131
  text = re.sub(r'<[^>]+>', ' ', html)
132
 
133
  # Decode HTML entities
 
137
  return text
138
 
139
  def _clean_text(self, text: str) -> str:
140
+ """Clean and normalize text"""
141
  # Replace multiple whitespace
142
  text = re.sub(r'\s+', ' ', text)
143
 
144
  # Remove control characters
145
  text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
146
 
147
+ # Remove excessive newlines
148
  text = re.sub(r'\n{3,}', '\n\n', text)
149
 
150
  return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
# ==============================================
# INITIALIZE
# ==============================================

# Single module-level scraper instance, shared by the Gradio UI handler
# and the /scrape API route. WebScraper holds no mutable state beyond the
# UA string, so sharing one instance is safe here.
scraper = WebScraper()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  # ==============================================
159
  # GRADIO INTERFACE
160
  # ==============================================
161
 
162
+ def scrape_url(url: str):
163
  """Gradio interface function"""
164
  if not url:
165
  return "❌ Please enter a URL", {}
166
 
167
+ print(f"Processing: {url}")
168
  result = scraper.scrape(url)
169
 
170
+ if result["success"]:
171
+ text = result["extracted_text"]
172
+ text_length = result["text_length"]
173
 
174
  # Create preview
175
  preview = text[:500]
 
180
  ## ✅ Success!
181
 
182
  **URL:** {result['url']}
183
+ **Method:** {result.get('method', 'unknown')}
184
+ **Time:** {result['execution_time']}s
185
  **Characters:** {text_length:,}
186
 
187
  ### Preview:
188
  {preview}
189
 
190
+ *Check JSON tab for full response*
 
191
  """
192
  return output, result
193
  else:
194
+ return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
 
195
 
196
# ==============================================
# CREATE APP
# ==============================================

# For Hugging Face Spaces, we need to create the app correctly:
# Spaces imports this module and serves the top-level `app` Blocks object.
with gr.Blocks(title="Web Scraper for n8n", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🌐 Web Scraper for n8n")
    gr.Markdown("Extract text content from webpages. Perfect for n8n workflows!")

    with gr.Row():
        # Left (wider) column: URL entry plus trigger button.
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://example.com",
                value="https://en.wikipedia.org/wiki/Artificial_intelligence"
            )
            scrape_btn = gr.Button("Scrape", variant="primary")

        # Right (narrower) column: static usage notes for n8n users.
        with gr.Column(scale=1):
            api_info = gr.Markdown("""
            ### API Usage (for n8n)

            **Endpoint:** `POST /scrape`

            **Body:**
            ```json
            {
                "url": "https://example.com"
            }
            ```

            **Response:** JSON with extracted text
            """)

    # Results row: human-readable summary on the left, raw JSON on the right.
    with gr.Row():
        with gr.Column():
            output_md = gr.Markdown(label="Result")
        with gr.Column():
            output_json = gr.JSON(label="API Response")

    # Examples
    gr.Examples(
        examples=[
            ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
            ["https://example.com"],
            ["https://httpbin.org/html"]
        ],
        inputs=[url_input]
    )

    # Event handlers: button click runs the scrape and fills both outputs.
    scrape_btn.click(
        fn=scrape_url,
        inputs=[url_input],
        outputs=[output_md, output_json]
    )

    # Also trigger on Enter key in the URL textbox.
    url_input.submit(
        fn=scrape_url,
        inputs=[url_input],
        outputs=[output_md, output_json]
    )
259
+
260
# ==============================================
# ADD API ENDPOINT FOR N8N
# ==============================================

# Hugging Face Spaces will mount the app.
# We'll also create a simple API route for n8n's HTTP Request node.
async def api_scrape(url: str = ""):
    """API endpoint for n8n: scrape ``url`` and return the result dict."""
    return scraper.scrape(url)

# BUG FIX: the original decorated this with ``@app.app.post(...)`` and used
# a default of ``gr.String()``.  ``gr.String`` does not exist in Gradio
# (AttributeError at import), and a ``gr.Blocks`` object only exposes its
# FastAPI instance (``.app``) after it has been launched/mounted, so
# decorating at module import time crashed the whole Space.  Register the
# route defensively instead so the module always imports.
try:
    app.app.post("/scrape")(api_scrape)
except AttributeError:
    print("⚠️ /scrape route not registered - FastAPI app not available yet")
270
 
271
# ==============================================
# MAIN ENTRY POINT
# ==============================================

if __name__ == "__main__":
    # This runs locally but not on Hugging Face Spaces —
    # Spaces imports this module and serves the `app` object itself.
    # 0.0.0.0 / 7860 matches the Spaces convention for local parity.
    app.launch(server_name="0.0.0.0", server_port=7860)