yukee1992 committed on
Commit
b2fa5de
·
verified ·
1 Parent(s): 7f30af0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -115
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # SIMPLE WEB SCRAPER FOR HUGGING FACE SPACES
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -7,146 +7,101 @@ import requests
7
  import json
8
  import time
9
  import re
 
10
  from typing import Dict, Any
11
- from io import BytesIO
12
 
13
  # ==============================================
14
  # SIMPLE WEB SCRAPER
15
  # ==============================================
16
 
17
  class WebScraper:
18
- """Lightweight web scraper for Hugging Face Spaces"""
19
 
20
  def __init__(self):
21
- self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
22
 
23
  def scrape(self, url: str) -> Dict[str, Any]:
24
  """Main scraping function"""
25
  start_time = time.time()
26
 
27
- print(f"🌐 Scraping: {url}")
28
-
29
  # Ensure URL has protocol
30
  if not url.startswith(('http://', 'https://')):
31
  url = 'https://' + url
32
 
33
  try:
34
- # Extract content
35
- result = self._extract_content(url)
36
-
37
- if result["success"]:
38
- return {
39
- "success": True,
40
- "url": url,
41
- "execution_time": round(time.time() - start_time, 2),
42
- "method": result["method"],
43
- "extracted_text": result["text"][:10000], # Limit response size
44
- "text_length": len(result["text"]),
45
- "metadata": result.get("metadata", {}),
46
- "status_code": result.get("status_code", 200)
47
- }
48
- else:
49
- return {
50
- "success": False,
51
- "url": url,
52
- "error": result.get("error", "Unknown error"),
53
- "execution_time": round(time.time() - start_time, 2)
54
- }
55
-
56
- except Exception as e:
57
- return {
58
- "success": False,
59
- "url": url,
60
- "error": str(e),
61
- "execution_time": round(time.time() - start_time, 2)
62
- }
63
-
64
- def _extract_content(self, url: str) -> Dict[str, Any]:
65
- """Extract content from URL"""
66
- try:
67
  headers = {
68
  'User-Agent': self.user_agent,
69
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
70
- 'Accept-Language': 'en-US,en;q=0.9',
71
  }
72
 
73
- response = requests.get(url, headers=headers, timeout=10)
74
  response.raise_for_status()
75
 
76
- # Try to extract with BeautifulSoup if available
77
- try:
78
- from bs4 import BeautifulSoup
79
- soup = BeautifulSoup(response.text, 'html.parser')
80
-
81
- # Remove unwanted tags
82
- for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
83
- tag.decompose()
84
-
85
- # Get text
86
- text = soup.get_text()
87
- method = "beautifulsoup"
88
-
89
- # Extract metadata
90
- metadata = {}
91
- if soup.title:
92
- metadata['title'] = soup.title.string
93
-
94
- # Try to find main content
95
- main_selectors = ['article', 'main', '.content', '.post-content', '.article-content']
96
- for selector in main_selectors:
97
- elements = soup.select(selector)
98
- if elements:
99
- text = ' '.join([elem.get_text() for elem in elements])
100
- break
101
-
102
- except ImportError:
103
- # Fallback to simple regex extraction
104
- text = self._simple_extract(response.text)
105
- method = "regex"
106
- metadata = {}
107
 
108
  # Clean text
109
  cleaned_text = self._clean_text(text)
110
 
 
 
 
111
  return {
112
  "success": True,
113
- "text": cleaned_text,
114
- "method": method,
115
- "metadata": metadata,
116
- "status_code": response.status_code
 
 
 
117
  }
118
 
119
- except requests.exceptions.RequestException as e:
120
- return {"success": False, "error": f"Request failed: {str(e)}"}
121
  except Exception as e:
122
- return {"success": False, "error": str(e)}
 
 
 
 
 
123
 
124
- def _simple_extract(self, html: str) -> str:
125
- """Simple HTML extraction using regex"""
126
  # Remove scripts and styles
127
- html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
128
- html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
 
 
 
129
 
130
  # Remove HTML tags
131
- text = re.sub(r'<[^>]+>', ' ', html)
132
 
133
  # Decode HTML entities
134
- import html as html_module
135
- text = html_module.unescape(text)
136
 
137
  return text
138
 
 
 
 
 
 
 
 
 
 
 
139
  def _clean_text(self, text: str) -> str:
140
- """Clean and normalize text"""
141
  # Replace multiple whitespace
142
  text = re.sub(r'\s+', ' ', text)
143
 
144
  # Remove control characters
145
  text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
146
 
147
- # Remove excessive newlines
148
- text = re.sub(r'\n{3,}', '\n\n', text)
149
-
150
  return text.strip()
151
 
152
  # ==============================================
@@ -155,16 +110,23 @@ class WebScraper:
155
 
156
  scraper = WebScraper()
157
 
 
 
 
 
 
 
 
 
158
  # ==============================================
159
  # GRADIO INTERFACE
160
  # ==============================================
161
 
162
- def scrape_url(url: str):
163
  """Gradio interface function"""
164
  if not url:
165
  return "❌ Please enter a URL", {}
166
 
167
- print(f"Processing: {url}")
168
  result = scraper.scrape(url)
169
 
170
  if result["success"]:
@@ -180,7 +142,7 @@ def scrape_url(url: str):
180
  ## βœ… Success!
181
 
182
  **URL:** {result['url']}
183
- **Method:** {result.get('method', 'unknown')}
184
  **Time:** {result['execution_time']}s
185
  **Characters:** {text_length:,}
186
 
@@ -194,11 +156,11 @@ def scrape_url(url: str):
194
  return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
195
 
196
  # ==============================================
197
- # CREATE APP
198
  # ==============================================
199
 
200
- # For Hugging Face Spaces, we need to create the app correctly
201
- with gr.Blocks(title="Web Scraper for n8n", theme=gr.themes.Soft()) as app:
202
  gr.Markdown("# 🌐 Web Scraper for n8n")
203
  gr.Markdown("Extract text content from webpages. Perfect for n8n workflows!")
204
 
@@ -212,10 +174,10 @@ with gr.Blocks(title="Web Scraper for n8n", theme=gr.themes.Soft()) as app:
212
  scrape_btn = gr.Button("Scrape", variant="primary")
213
 
214
  with gr.Column(scale=1):
215
- api_info = gr.Markdown("""
216
  ### API Usage (for n8n)
217
 
218
- **Endpoint:** `POST /scrape`
219
 
220
  **Body:**
221
  ```json
@@ -244,34 +206,49 @@ with gr.Blocks(title="Web Scraper for n8n", theme=gr.themes.Soft()) as app:
244
  )
245
 
246
  # Event handlers
 
 
 
247
  scrape_btn.click(
248
- fn=scrape_url,
249
  inputs=[url_input],
250
  outputs=[output_md, output_json]
251
  )
252
 
253
- # Also trigger on Enter key
254
  url_input.submit(
255
- fn=scrape_url,
256
  inputs=[url_input],
257
  outputs=[output_md, output_json]
258
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
  # ==============================================
261
- # ADD API ENDPOINT FOR N8N
262
  # ==============================================
263
 
264
- # Hugging Face Spaces will mount the app
265
- # We'll also create a simple API route
266
- @app.app.post("/scrape")
267
- async def api_scrape(url: str = gr.String()):
268
- """API endpoint for n8n"""
269
- return scraper.scrape(url)
270
-
271
- # ==============================================
272
- # MAIN ENTRY POINT
273
- # ==============================================
274
 
 
275
  if __name__ == "__main__":
276
- # This runs locally but not on Hugging Face Spaces
277
- app.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
1
  # ==============================================
2
+ # WEB SCRAPER FOR N8N - GRADIO 6 COMPATIBLE
3
  # ==============================================
4
 
5
  import gradio as gr
 
7
  import json
8
  import time
9
  import re
10
+ import html
11
  from typing import Dict, Any
12
+ import traceback
13
 
14
  # ==============================================
15
  # SIMPLE WEB SCRAPER
16
  # ==============================================
17
 
18
class WebScraper:
    """Lightweight regex-based web scraper (no HTML-parser dependency)."""

    def __init__(self):
        # Desktop-browser UA string; some sites reject the default requests UA.
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    def scrape(self, url: str) -> Dict[str, Any]:
        """Fetch *url* and return a JSON-serializable result dict.

        On success the dict carries the cleaned page text (capped at 15,000
        characters), title, HTTP status code and timing. On any failure it
        carries ``success: False`` plus the error message — this method
        never raises, so the API/UI always gets a well-formed response.
        """
        start_time = time.time()

        # Ensure URL has a protocol
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        try:
            # Fetch the page
            headers = {
                'User-Agent': self.user_agent,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            }

            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()

            # Extract and clean text, then pull the title from the raw HTML
            text = self._extract_text(response.text)
            cleaned_text = self._clean_text(text)
            title = self._extract_title(response.text)

            return {
                "success": True,
                "url": url,
                "title": title,
                "extracted_text": cleaned_text[:15000],  # cap payload size for API consumers
                "text_length": len(cleaned_text),
                "status_code": response.status_code,
                "execution_time": round(time.time() - start_time, 2),
                "method": "regex"
            }

        except Exception as e:
            # Top-level boundary: report the failure instead of propagating it.
            return {
                "success": False,
                "url": url,
                "error": str(e),
                "execution_time": round(time.time() - start_time, 2)
            }

    def _extract_text(self, html_content: str) -> str:
        """Strip markup from raw HTML and return the remaining text."""
        # Remove scripts and styles (their contents are not page text)
        html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
        html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)

        # Remove comments
        html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)

        # Remove remaining HTML tags
        text = re.sub(r'<[^>]+>', ' ', html_content)

        # Decode HTML entities (&amp; -> &, &#39; -> ', ...)
        text = html.unescape(text)

        return text

    def _extract_title(self, html_content: str) -> str:
        """Return the page <title> (whitespace collapsed, max 200 chars)."""
        # BUGFIX: re.DOTALL added so titles that span multiple lines
        # (common in pretty-printed HTML) are found; previously '.' did not
        # match the newline and such titles fell through to the default.
        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
        if title_match:
            title = title_match.group(1)
            # BUGFIX: decode entities so e.g. "A &amp; B" becomes "A & B",
            # matching how the body text is treated in _extract_text.
            title = html.unescape(title)
            # Collapse internal whitespace/newlines into single spaces
            title = re.sub(r'\s+', ' ', title).strip()
            return title[:200]
        return "No title found"

    def _clean_text(self, text: str) -> str:
        """Normalize extracted text: collapse whitespace, drop control chars."""
        # Replace runs of whitespace with a single space
        text = re.sub(r'\s+', ' ', text)

        # Remove control characters
        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

        return text.strip()
106
 
107
  # ==============================================
 
110
 
111
  scraper = WebScraper()
112
 
113
+ # ==============================================
114
+ # API FUNCTION FOR N8N
115
+ # ==============================================
116
+
117
def api_scrape_function(url: str) -> Dict[str, Any]:
    """Programmatic entry point for API callers: delegate to the shared scraper."""
    result = scraper.scrape(url)
    return result
120
+
121
  # ==============================================
122
  # GRADIO INTERFACE
123
  # ==============================================
124
 
125
+ def gradio_scrape(url: str):
126
  """Gradio interface function"""
127
  if not url:
128
  return "❌ Please enter a URL", {}
129
 
 
130
  result = scraper.scrape(url)
131
 
132
  if result["success"]:
 
142
  ## βœ… Success!
143
 
144
  **URL:** {result['url']}
145
+ **Title:** {result.get('title', 'N/A')}
146
  **Time:** {result['execution_time']}s
147
  **Characters:** {text_length:,}
148
 
 
156
  return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
157
 
158
  # ==============================================
159
+ # CREATE THE APP
160
  # ==============================================
161
 
162
+ # Create Gradio blocks
163
+ with gr.Blocks() as app:
164
  gr.Markdown("# 🌐 Web Scraper for n8n")
165
  gr.Markdown("Extract text content from webpages. Perfect for n8n workflows!")
166
 
 
174
  scrape_btn = gr.Button("Scrape", variant="primary")
175
 
176
  with gr.Column(scale=1):
177
+ gr.Markdown("""
178
  ### API Usage (for n8n)
179
 
180
+ **Method:** `POST` to `/api/scrape`
181
 
182
  **Body:**
183
  ```json
 
206
  )
207
 
208
  # Event handlers
209
+ def process_url(url):
210
+ return gradio_scrape(url)
211
+
212
  scrape_btn.click(
213
+ fn=process_url,
214
  inputs=[url_input],
215
  outputs=[output_md, output_json]
216
  )
217
 
 
218
  url_input.submit(
219
+ fn=process_url,
220
  inputs=[url_input],
221
  outputs=[output_md, output_json]
222
  )
223
+
224
# ==============================================
# ADD API ENDPOINT DIRECTLY IN GRADIO
# ==============================================

# Create a separate API endpoint
# NOTE(review): `app.app` (the underlying FastAPI instance of a gr.Blocks)
# is version-dependent and is typically only populated after launch/mount.
# Confirm this attribute exists at import time on the deployed Gradio
# version — otherwise this decorator raises AttributeError before the
# Space can start. Consider gr.mount_gradio_app on an explicit FastAPI
# app instead.
@app.app.post("/api/scrape")
async def api_scrape(request: dict):
    """API endpoint for n8n"""
    # Expects a JSON body shaped like {"url": "https://example.com"}.
    try:
        url = request.get("url", "").strip()
        if not url:
            return {"success": False, "error": "URL is required"}

        return api_scrape_function(url)
    except Exception as e:
        # Never raise: callers (n8n) expect a JSON payload either way.
        return {"success": False, "error": str(e)}
240
 
241
  # ==============================================
242
+ # LAUNCH CONFIGURATION
243
  # ==============================================
244
 
245
+ # For Hugging Face Spaces, just define the app
246
+ # The space will handle launching
 
 
 
 
 
 
 
 
247
 
248
# For local testing; on Hugging Face Spaces the platform imports this
# module and serves `app` itself, so this branch does not run there.
if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",  # listen on all interfaces so a container port mapping works
        server_port=7860,
        share=False
    )