yukee1992 commited on
Commit
fa1baec
·
verified ·
1 Parent(s): 5f8bca0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +330 -83
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # WEB SCRAPER FOR N8N - WORKING VERSION
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -8,25 +8,25 @@ import json
8
  import time
9
  import re
10
  import html
11
- from typing import Dict, Any
12
  from fastapi import FastAPI, Request
13
  import uvicorn
14
 
15
  # ==============================================
16
- # SIMPLE WEB SCRAPER
17
  # ==============================================
18
 
19
- class WebScraper:
20
- """Lightweight web scraper"""
21
 
22
  def __init__(self):
23
  self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
24
 
25
- def scrape(self, url: str) -> Dict[str, Any]:
26
- """Main scraping function"""
27
  start_time = time.time()
28
 
29
- print(f"🌐 Scraping: {url}")
30
 
31
  # Ensure URL has protocol
32
  if not url.startswith(('http://', 'https://')):
@@ -36,30 +36,41 @@ class WebScraper:
36
  # Fetch the page
37
  headers = {
38
  'User-Agent': self.user_agent,
39
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
 
40
  }
41
 
42
  response = requests.get(url, headers=headers, timeout=15)
43
  response.raise_for_status()
44
 
45
- # Extract text
46
- text = self._extract_text(response.text)
 
47
 
48
- # Clean text
49
- cleaned_text = self._clean_text(text)
50
 
51
- # Extract title
52
- title = self._extract_title(response.text)
 
 
 
 
 
 
 
 
 
53
 
54
  return {
55
  "success": True,
56
  "url": url,
57
  "title": title,
58
- "extracted_text": cleaned_text[:15000],
59
- "text_length": len(cleaned_text),
 
 
60
  "status_code": response.status_code,
61
- "execution_time": round(time.time() - start_time, 2),
62
- "method": "direct_html"
63
  }
64
 
65
  except Exception as e:
@@ -70,57 +81,299 @@ class WebScraper:
70
  "execution_time": round(time.time() - start_time, 2)
71
  }
72
 
73
- def _extract_text(self, html_content: str) -> str:
74
- """Extract text from HTML using regex"""
75
- # Remove scripts and styles
76
- html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
77
- html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- # Remove comments
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
81
 
 
 
 
 
 
82
  # Remove HTML tags
83
  text = re.sub(r'<[^>]+>', ' ', html_content)
84
 
85
  # Decode HTML entities
86
  text = html.unescape(text)
87
 
88
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  def _extract_title(self, html_content: str) -> str:
91
  """Extract page title"""
92
  title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
93
  if title_match:
94
  title = title_match.group(1)
95
- # Clean title
96
  title = re.sub(r'\s+', ' ', title).strip()
97
  return title[:200]
98
  return "No title found"
99
 
100
- def _clean_text(self, text: str) -> str:
101
- """Clean extracted text"""
102
- # Replace multiple whitespace
103
- text = re.sub(r'\s+', ' ', text)
104
 
105
- # Remove control characters
106
- text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
 
 
 
107
 
108
- return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  # ==============================================
111
  # INITIALIZE
112
  # ==============================================
113
 
114
- scraper = WebScraper()
115
 
116
  # ==============================================
117
- # CREATE FASTAPI APP FIRST
118
  # ==============================================
119
 
120
  # Create FastAPI app
121
  fastapi_app = FastAPI(
122
- title="Web Scraper API",
123
- description="Extract text from webpages for n8n workflows",
124
  version="1.0"
125
  )
126
 
@@ -139,16 +392,18 @@ fastapi_app.add_middleware(
139
  @fastapi_app.get("/")
140
  async def root():
141
  return {
142
- "service": "Web Scraper API",
143
  "version": "1.0",
 
144
  "endpoints": {
145
  "GET /": "This info",
146
  "GET /health": "Health check",
147
- "POST /scrape": "Scrape a webpage (for n8n)"
148
  },
149
- "usage": {
150
- "curl": 'curl -X POST "https://your-space.hf.space/scrape" -H "Content-Type: application/json" -d \'{"url":"https://example.com"}\'',
151
- "n8n": "HTTP Request node: POST to /scrape with JSON body"
 
152
  }
153
  }
154
 
@@ -159,9 +414,9 @@ async def health():
159
  "timestamp": time.time()
160
  }
161
 
162
- @fastapi_app.post("/scrape")
163
- async def api_scrape(request: Request):
164
- """Main API endpoint for n8n"""
165
  try:
166
  # Parse JSON body
167
  body = await request.json()
@@ -173,8 +428,8 @@ async def api_scrape(request: Request):
173
  content={"success": False, "error": "URL parameter is required"}
174
  )
175
 
176
- print(f"📨 API Request received for: {url}")
177
- result = scraper.scrape(url)
178
 
179
  return result
180
 
@@ -193,34 +448,37 @@ async def api_scrape(request: Request):
193
  # GRADIO INTERFACE
194
  # ==============================================
195
 
196
- def gradio_scrape(url: str):
197
  """Gradio interface function"""
198
  if not url:
199
  return "❌ Please enter a URL", {}
200
 
201
- result = scraper.scrape(url)
202
 
203
  if result["success"]:
204
- text = result["extracted_text"]
205
- text_length = result["text_length"]
 
 
 
 
 
206
 
207
- # Create preview
208
- preview = text[:500]
209
- if len(text) > 500:
210
- preview += "..."
211
 
212
  output = f"""
213
- ## ✅ Success!
214
 
215
  **URL:** {result['url']}
216
  **Title:** {result.get('title', 'N/A')}
217
  **Time:** {result['execution_time']}s
218
- **Characters:** {text_length:,}
219
 
220
- ### Preview:
221
  {preview}
222
 
223
- *Check JSON tab for full API response*
224
  """
225
  return output, result
226
  else:
@@ -228,54 +486,43 @@ def gradio_scrape(url: str):
228
 
229
  # Create Gradio interface
230
  gradio_interface = gr.Interface(
231
- fn=gradio_scrape,
232
  inputs=gr.Textbox(
233
  label="Website URL",
234
  placeholder="https://example.com",
235
- value="https://example.com"
236
  ),
237
  outputs=[
238
  gr.Markdown(label="Result"),
239
  gr.JSON(label="API Response")
240
  ],
241
- title="🌐 Web Scraper for n8n",
242
- description="Extract text from webpages. Use POST /scrape for n8n integration.",
243
  examples=[
244
- ["https://example.com"],
245
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
246
- ["https://httpbin.org/html"]
247
- ]
 
 
 
248
  )
249
 
250
  # ==============================================
251
  # MOUNT GRADIO TO FASTAPI
252
  # ==============================================
253
 
254
- # Mount Gradio app to FastAPI at root path
255
  app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
256
 
257
- # ==============================================
258
- # ALTERNATIVE: If mounting doesn't work, try this:
259
- # ==============================================
260
-
261
- # Instead of mounting, you can also define routes manually
262
- # Uncomment below if mounting doesn't work:
263
-
264
- # @fastapi_app.get("/")
265
- # async def gradio_root():
266
- # # This will redirect to the Gradio interface
267
- # from fastapi.responses import RedirectResponse
268
- # return RedirectResponse(url="/")
269
-
270
  # ==============================================
271
  # LAUNCH THE APP
272
  # ==============================================
273
 
274
  if __name__ == "__main__":
275
  print("\n" + "="*60)
276
- print("🚀 Web Scraper API Starting")
277
  print("="*60)
278
- print("API Endpoint: POST /scrape")
279
  print("Web Interface: GET /")
280
  print("="*60 + "\n")
281
 
 
1
  # ==============================================
2
+ # SMART CONTENT EXTRACTOR FOR N8N
3
  # ==============================================
4
 
5
  import gradio as gr
 
8
  import time
9
  import re
10
  import html
11
+ from typing import Dict, Any, List, Optional
12
  from fastapi import FastAPI, Request
13
  import uvicorn
14
 
15
  # ==============================================
16
+ # SMART CONTENT EXTRACTOR
17
  # ==============================================
18
 
19
+ class SmartContentExtractor:
20
+ """Extracts only main content, removes navigation, ads, footers, etc."""
21
 
22
    def __init__(self):
        """Set the desktop-browser User-Agent sent with every outgoing request."""
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
24
 
25
+ def extract_content(self, url: str) -> Dict[str, Any]:
26
+ """Extract only main content from webpage"""
27
  start_time = time.time()
28
 
29
+ print(f"🌐 Extracting content from: {url}")
30
 
31
  # Ensure URL has protocol
32
  if not url.startswith(('http://', 'https://')):
 
36
  # Fetch the page
37
  headers = {
38
  'User-Agent': self.user_agent,
39
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
40
+ 'Accept-Language': 'en-US,en;q=0.9',
41
  }
42
 
43
  response = requests.get(url, headers=headers, timeout=15)
44
  response.raise_for_status()
45
 
46
+ # Get encoding
47
+ if response.encoding is None:
48
+ response.encoding = 'utf-8'
49
 
50
+ html_content = response.text
 
51
 
52
+ # Extract only main content
53
+ main_content = self._extract_main_content(html_content)
54
+
55
+ # Clean content
56
+ cleaned_content = self._clean_content(main_content)
57
+
58
+ # Extract title (separately)
59
+ title = self._extract_title(html_content)
60
+
61
+ # Extract metadata
62
+ metadata = self._extract_metadata(html_content)
63
 
64
  return {
65
  "success": True,
66
  "url": url,
67
  "title": title,
68
+ "main_content": cleaned_content[:20000], # Limit to 20k chars
69
+ "content_length": len(cleaned_content),
70
+ "content_preview": cleaned_content[:500] + ("..." if len(cleaned_content) > 500 else ""),
71
+ "metadata": metadata,
72
  "status_code": response.status_code,
73
+ "execution_time": round(time.time() - start_time, 2)
 
74
  }
75
 
76
  except Exception as e:
 
81
  "execution_time": round(time.time() - start_time, 2)
82
  }
83
 
84
+ def _extract_main_content(self, html_content: str) -> str:
85
+ """Extract only the main content, removing navigation, ads, footers, etc."""
86
+
87
+ # First, try to find main content using common selectors
88
+ # These are CSS selectors that typically contain main content
89
+ main_content_selectors = [
90
+ # Article/content focused
91
+ 'article', 'main', '.post-content', '.article-content',
92
+ '.entry-content', '.story-content', '.content-area',
93
+ '.main-content', '.post-body', '.article-body',
94
+ '.story-body', '.content-body', '.text-content',
95
+
96
+ # Blog/News specific
97
+ '.blog-content', '.news-content', '.post',
98
+ '.story', '.article', '.post-entry',
99
+
100
+ # Generic content containers
101
+ '.content', '#content', '.container .content',
102
+ '.page-content', '.single-content',
103
+
104
+ # Divs with content
105
+ 'div[class*="content"]', 'div[class*="article"]',
106
+ 'div[class*="post"]', 'div[class*="entry"]',
107
+ 'div[class*="story"]', 'div[class*="body"]',
108
+ ]
109
+
110
+ # Remove unwanted sections first (more aggressive)
111
+ html_content = self._remove_unwanted_sections(html_content)
112
+
113
+ # Try to extract using regex patterns for main content
114
+ content = self._extract_with_regex(html_content)
115
+
116
+ # If we got decent content, return it
117
+ if len(content.strip()) > 200:
118
+ return content
119
+
120
+ # Fallback: remove all HTML tags and get text
121
+ return self._extract_all_text(html_content)
122
+
123
+ def _remove_unwanted_sections(self, html_content: str) -> str:
124
+ """Remove navigation, ads, footers, sidebars, etc."""
125
+
126
+ # Patterns to remove (these are typically unwanted sections)
127
+ unwanted_patterns = [
128
+ # Navigation
129
+ r'<nav[^>]*>.*?</nav>',
130
+ r'<header[^>]*>.*?</header>',
131
+ r'<menu[^>]*>.*?</menu>',
132
+
133
+ # Footers
134
+ r'<footer[^>]*>.*?</footer>',
135
+
136
+ # Sidebars
137
+ r'<aside[^>]*>.*?</aside>',
138
+ r'<div[^>]*class="[^"]*sidebar[^"]*"[^>]*>.*?</div>',
139
+ r'<div[^>]*id="[^"]*sidebar[^"]*"[^>]*>.*?</div>',
140
+
141
+ # Ads and banners
142
+ r'<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>',
143
+ r'<div[^>]*class="[^"]*banner[^"]*"[^>]*>.*?</div>',
144
+ r'<div[^>]*class="[^"]*advertisement[^"]*"[^>]*>.*?</div>',
145
+ r'<ins[^>]*>.*?</ins>',
146
+
147
+ # Social media/widgets
148
+ r'<div[^>]*class="[^"]*social[^"]*"[^>]*>.*?</div>',
149
+ r'<div[^>]*class="[^"]*widget[^"]*"[^>]*>.*?</div>',
150
+ r'<div[^>]*class="[^"]*share[^"]*"[^>]*>.*?</div>',
151
+
152
+ # Comments
153
+ r'<div[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</div>',
154
+ r'<section[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</section>',
155
+
156
+ # Related content (often at bottom)
157
+ r'<div[^>]*class="[^"]*related[^"]*"[^>]*>.*?</div>',
158
+ r'<div[^>]*class="[^"]*popular[^"]*"[^>]*>.*?</div>',
159
+
160
+ # Menus and lists
161
+ r'<ul[^>]*class="[^"]*menu[^"]*"[^>]*>.*?</ul>',
162
+ r'<ul[^>]*class="[^"]*nav[^"]*"[^>]*>.*?</ul>',
163
+
164
+ # Scripts and styles (always remove)
165
+ r'<script[^>]*>.*?</script>',
166
+ r'<style[^>]*>.*?</style>',
167
+ r'<!--.*?-->', # Comments
168
+
169
+ # Metadata in body
170
+ r'<meta[^>]*>',
171
+ r'<link[^>]*>',
172
+ ]
173
+
174
+ cleaned_html = html_content
175
+ for pattern in unwanted_patterns:
176
+ cleaned_html = re.sub(pattern, ' ', cleaned_html, flags=re.DOTALL | re.IGNORECASE)
177
+
178
+ return cleaned_html
179
+
180
+ def _extract_with_regex(self, html_content: str) -> str:
181
+ """Extract content using regex patterns"""
182
 
183
+ # Try to find content between common content tags
184
+ content_patterns = [
185
+ # Look for article tags
186
+ r'<article[^>]*>(.*?)</article>',
187
+
188
+ # Look for main tags
189
+ r'<main[^>]*>(.*?)</main>',
190
+
191
+ # Look for divs with content classes
192
+ r'<div[^>]*class="[^"]*(post-content|article-content|entry-content|story-content)[^"]*"[^>]*>(.*?)</div>',
193
+ r'<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>',
194
+ r'<div[^>]*class="[^"]*article[^"]*"[^>]*>(.*?)</div>',
195
+ r'<div[^>]*class="[^"]*post[^"]*"[^>]*>(.*?)</div>',
196
+
197
+ # Look for section with content
198
+ r'<section[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</section>',
199
+ ]
200
+
201
+ all_content = []
202
+
203
+ for pattern in content_patterns:
204
+ matches = re.findall(pattern, html_content, re.DOTALL | re.IGNORECASE)
205
+ for match in matches:
206
+ # Handle groups in regex
207
+ if isinstance(match, tuple):
208
+ for group in match:
209
+ if group and len(group.strip()) > 50:
210
+ all_content.append(group)
211
+ elif match and len(match.strip()) > 50:
212
+ all_content.append(match)
213
+
214
+ if all_content:
215
+ # Combine all found content
216
+ combined = ' '.join(all_content)
217
+
218
+ # Remove any remaining HTML tags
219
+ combined = re.sub(r'<[^>]+>', ' ', combined)
220
+
221
+ # Decode HTML entities
222
+ combined = html.unescape(combined)
223
+
224
+ return combined
225
+
226
+ return ""
227
+
228
+ def _extract_all_text(self, html_content: str) -> str:
229
+ """Extract all text as fallback, but clean it well"""
230
+ # Remove scripts, styles, comments first
231
+ html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
232
+ html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
233
  html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
234
 
235
+ # Remove common unwanted tags
236
+ unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'ins', 'meta', 'link']
237
+ for tag in unwanted_tags:
238
+ html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
239
+
240
  # Remove HTML tags
241
  text = re.sub(r'<[^>]+>', ' ', html_content)
242
 
243
  # Decode HTML entities
244
  text = html.unescape(text)
245
 
246
+ # Remove very short lines (likely navigation items)
247
+ lines = text.split('\n')
248
+ filtered_lines = []
249
+ for line in lines:
250
+ line = line.strip()
251
+ if len(line) > 30: # Only keep lines longer than 30 chars
252
+ filtered_lines.append(line)
253
+ elif any(word in line.lower() for word in ['home', 'about', 'contact', 'login', 'sign up', 'search']):
254
+ # Skip navigation lines
255
+ continue
256
+
257
+ return '\n\n'.join(filtered_lines)
258
+
259
+ def _clean_content(self, content: str) -> str:
260
+ """Clean and normalize the extracted content"""
261
+ if not content:
262
+ return ""
263
+
264
+ # Replace multiple whitespace with single space
265
+ content = re.sub(r'\s+', ' ', content)
266
+
267
+ # Remove control characters
268
+ content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
269
+
270
+ # Remove common unwanted phrases (ads, prompts, etc.)
271
+ unwanted_phrases = [
272
+ r'sign up for our newsletter',
273
+ r'subscribe to our newsletter',
274
+ r'follow us on',
275
+ r'like us on facebook',
276
+ r'follow us on twitter',
277
+ r'share this article',
278
+ r'read more',
279
+ r'continue reading',
280
+ r'advertisement',
281
+ r'sponsored content',
282
+ r'related articles',
283
+ r'you may also like',
284
+ r'popular posts',
285
+ r'recommended for you',
286
+ r'click here',
287
+ r'learn more',
288
+ ]
289
+
290
+ for phrase in unwanted_phrases:
291
+ content = re.sub(phrase, '', content, flags=re.IGNORECASE)
292
+
293
+ # Remove email addresses
294
+ content = re.sub(r'\S+@\S+\.\S+', '', content)
295
+
296
+ # Remove URLs
297
+ content = re.sub(r'https?://\S+', '', content)
298
+
299
+ # Remove excessive punctuation
300
+ content = re.sub(r'[.!?]{3,}', '.', content)
301
+
302
+ # Normalize spaces around punctuation
303
+ content = re.sub(r'\s+([.,!?;:])', r'\1', content)
304
+ content = re.sub(r'([.,!?;:])\s+', r'\1 ', content)
305
+
306
+ # Split into paragraphs and filter
307
+ paragraphs = content.split('. ')
308
+ clean_paragraphs = []
309
+
310
+ for para in paragraphs:
311
+ para = para.strip()
312
+ if len(para) < 5:
313
+ continue
314
+
315
+ # Skip very short paragraphs (likely not content)
316
+ if len(para) > 30:
317
+ clean_paragraphs.append(para)
318
+
319
+ # Join back with proper spacing
320
+ content = '. '.join(clean_paragraphs)
321
+
322
+ return content.strip()
323
 
324
  def _extract_title(self, html_content: str) -> str:
325
  """Extract page title"""
326
  title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
327
  if title_match:
328
  title = title_match.group(1)
 
329
  title = re.sub(r'\s+', ' ', title).strip()
330
  return title[:200]
331
  return "No title found"
332
 
333
+ def _extract_metadata(self, html_content: str) -> Dict[str, str]:
334
+ """Extract basic metadata"""
335
+ metadata = {}
 
336
 
337
+ # Meta description
338
+ desc_match = re.search(r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
339
+ html_content, re.IGNORECASE)
340
+ if desc_match:
341
+ metadata['description'] = desc_match.group(1)[:300]
342
 
343
+ # Meta keywords
344
+ keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
345
+ html_content, re.IGNORECASE)
346
+ if keywords_match:
347
+ metadata['keywords'] = keywords_match.group(1)[:300]
348
+
349
+ # Author
350
+ author_match = re.search(r'<meta[^>]*name=["\']author["\'][^>]*content=["\'](.*?)["\']',
351
+ html_content, re.IGNORECASE)
352
+ if author_match:
353
+ metadata['author'] = author_match.group(1)[:200]
354
+
355
+ # OG title (social media title)
356
+ og_title_match = re.search(r'<meta[^>]*property=["\']og:title["\'][^>]*content=["\'](.*?)["\']',
357
+ html_content, re.IGNORECASE)
358
+ if og_title_match:
359
+ metadata['og_title'] = og_title_match.group(1)[:200]
360
+
361
+ return metadata
362
 
363
  # ==============================================
364
  # INITIALIZE
365
  # ==============================================
366
 
367
+ extractor = SmartContentExtractor()
368
 
369
  # ==============================================
370
+ # FASTAPI APP
371
  # ==============================================
372
 
373
  # Create FastAPI app
374
  fastapi_app = FastAPI(
375
+ title="Smart Content Extractor",
376
+ description="Extracts only main content from webpages, removes navigation, ads, footers",
377
  version="1.0"
378
  )
379
 
 
392
@fastapi_app.get("/")
async def root():
    """Service-discovery endpoint: describes the API and how n8n should call it."""
    endpoints = {
        "GET /": "This info",
        "GET /health": "Health check",
        "POST /extract": "Extract main content (for n8n)",
    }
    n8n_usage = {
        "method": "POST",
        "url": "https://your-space.hf.space/extract",
        "body": {"url": "https://example.com"},
    }
    return {
        "service": "Smart Content Extractor",
        "version": "1.0",
        "description": "Extracts only main content from webpages (no navigation, ads, footers)",
        "endpoints": endpoints,
        "usage_n8n": n8n_usage,
    }
409
 
 
414
  "timestamp": time.time()
415
  }
416
 
417
+ @fastapi_app.post("/extract")
418
+ async def api_extract(request: Request):
419
+ """API endpoint for n8n - extracts only main content"""
420
  try:
421
  # Parse JSON body
422
  body = await request.json()
 
428
  content={"success": False, "error": "URL parameter is required"}
429
  )
430
 
431
+ print(f"📨 Content extraction request: {url}")
432
+ result = extractor.extract_content(url)
433
 
434
  return result
435
 
 
448
  # GRADIO INTERFACE
449
  # ==============================================
450
 
451
+ def gradio_extract(url: str):
452
  """Gradio interface function"""
453
  if not url:
454
  return "❌ Please enter a URL", {}
455
 
456
+ result = extractor.extract_content(url)
457
 
458
  if result["success"]:
459
+ content = result["main_content"]
460
+ content_length = result["content_length"]
461
+
462
+ # Create preview (first 3 paragraphs or 500 chars)
463
+ paragraphs = content.split('. ')
464
+ preview_paragraphs = paragraphs[:3]
465
+ preview = '. '.join(preview_paragraphs)
466
 
467
+ if len(preview) > 500:
468
+ preview = preview[:500] + "..."
 
 
469
 
470
  output = f"""
471
+ ## ✅ Content Extracted Successfully!
472
 
473
  **URL:** {result['url']}
474
  **Title:** {result.get('title', 'N/A')}
475
  **Time:** {result['execution_time']}s
476
+ **Content Length:** {content_length:,} characters
477
 
478
+ ### Content Preview:
479
  {preview}
480
 
481
+ *Check JSON tab for full content*
482
  """
483
  return output, result
484
  else:
 
486
 
487
  # Create Gradio interface
488
  gradio_interface = gr.Interface(
489
+ fn=gradio_extract,
490
  inputs=gr.Textbox(
491
  label="Website URL",
492
  placeholder="https://example.com",
493
+ value="https://en.wikipedia.org/wiki/Artificial_intelligence"
494
  ),
495
  outputs=[
496
  gr.Markdown(label="Result"),
497
  gr.JSON(label="API Response")
498
  ],
499
+ title="🧠 Smart Content Extractor for n8n",
500
+ description="Extracts ONLY main content - removes navigation, ads, footers, sidebars, etc.",
501
  examples=[
 
502
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
503
+ ["https://example.com"],
504
+ ["https://news.ycombinator.com"],
505
+ ["https://medium.com/topic/technology"]
506
+ ],
507
+ allow_flagging="never"
508
  )
509
 
510
  # ==============================================
511
  # MOUNT GRADIO TO FASTAPI
512
  # ==============================================
513
 
514
+ # Mount Gradio app to FastAPI
515
  app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
516
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  # ==============================================
518
  # LAUNCH THE APP
519
  # ==============================================
520
 
521
  if __name__ == "__main__":
522
  print("\n" + "="*60)
523
+ print("🧠 Smart Content Extractor Starting")
524
  print("="*60)
525
+ print("API Endpoint: POST /extract")
526
  print("Web Interface: GET /")
527
  print("="*60 + "\n")
528