Update app.py
app.py (CHANGED)
@@ -1,5 +1,5 @@
 # ==============================================
-#
+# SMART CONTENT EXTRACTOR FOR N8N
 # ==============================================

 import gradio as gr

@@ -8,25 +8,25 @@ import json
 import time
 import re
 import html
-from typing import Dict, Any
+from typing import Dict, Any, List, Optional
 from fastapi import FastAPI, Request
 import uvicorn

 # ==============================================
-#
+# SMART CONTENT EXTRACTOR
 # ==============================================

-class WebScraper:
-    """
+class SmartContentExtractor:
+    """Extracts only main content, removes navigation, ads, footers, etc."""

     def __init__(self):
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

-    def
-        """
+    def extract_content(self, url: str) -> Dict[str, Any]:
+        """Extract only main content from webpage"""
         start_time = time.time()

-        print(f"🌐
+        print(f"🌐 Extracting content from: {url}")

         # Ensure URL has protocol
         if not url.startswith(('http://', 'https://')):

@@ -36,30 +36,41 @@ class WebScraper:
             # Fetch the page
             headers = {
                 'User-Agent': self.user_agent,
-                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+                'Accept-Language': 'en-US,en;q=0.9',
             }

             response = requests.get(url, headers=headers, timeout=15)
             response.raise_for_status()

-            #
-
-            cleaned_text = self._clean_text(text)
-
-            # Extract
-
+            # Get encoding
+            if response.encoding is None:
+                response.encoding = 'utf-8'
+
+            html_content = response.text
+
+            # Extract only main content
+            main_content = self._extract_main_content(html_content)
+
+            # Clean content
+            cleaned_content = self._clean_content(main_content)
+
+            # Extract title (separately)
+            title = self._extract_title(html_content)
+
+            # Extract metadata
+            metadata = self._extract_metadata(html_content)

             return {
                 "success": True,
                 "url": url,
                 "title": title,
-                "
-                "
+                "main_content": cleaned_content[:20000],  # Limit to 20k chars
+                "content_length": len(cleaned_content),
+                "content_preview": cleaned_content[:500] + ("..." if len(cleaned_content) > 500 else ""),
+                "metadata": metadata,
                 "status_code": response.status_code,
-                "execution_time": round(time.time() - start_time, 2)
-                "method": "direct_html"
+                "execution_time": round(time.time() - start_time, 2)
             }

         except Exception as e:

@@ -70,57 +81,299 @@ class WebScraper:
                 "execution_time": round(time.time() - start_time, 2)
             }

-    def
-        """Extract
-
-
-
+    def _extract_main_content(self, html_content: str) -> str:
+        """Extract only the main content, removing navigation, ads, footers, etc."""
+
+        # First, try to find main content using common selectors
+        # These are CSS selectors that typically contain main content
+        main_content_selectors = [
+            # Article/content focused
+            'article', 'main', '.post-content', '.article-content',
+            '.entry-content', '.story-content', '.content-area',
+            '.main-content', '.post-body', '.article-body',
+            '.story-body', '.content-body', '.text-content',
+
+            # Blog/News specific
+            '.blog-content', '.news-content', '.post',
+            '.story', '.article', '.post-entry',
+
+            # Generic content containers
+            '.content', '#content', '.container .content',
+            '.page-content', '.single-content',
+
+            # Divs with content
+            'div[class*="content"]', 'div[class*="article"]',
+            'div[class*="post"]', 'div[class*="entry"]',
+            'div[class*="story"]', 'div[class*="body"]',
+        ]
+
+        # Remove unwanted sections first (more aggressive)
+        html_content = self._remove_unwanted_sections(html_content)
+
+        # Try to extract using regex patterns for main content
+        content = self._extract_with_regex(html_content)
+
+        # If we got decent content, return it
+        if len(content.strip()) > 200:
+            return content
+
+        # Fallback: remove all HTML tags and get text
+        return self._extract_all_text(html_content)
+
+    def _remove_unwanted_sections(self, html_content: str) -> str:
+        """Remove navigation, ads, footers, sidebars, etc."""
+
+        # Patterns to remove (these are typically unwanted sections)
+        unwanted_patterns = [
+            # Navigation
+            r'<nav[^>]*>.*?</nav>',
+            r'<header[^>]*>.*?</header>',
+            r'<menu[^>]*>.*?</menu>',
+
+            # Footers
+            r'<footer[^>]*>.*?</footer>',
+
+            # Sidebars
+            r'<aside[^>]*>.*?</aside>',
+            r'<div[^>]*class="[^"]*sidebar[^"]*"[^>]*>.*?</div>',
+            r'<div[^>]*id="[^"]*sidebar[^"]*"[^>]*>.*?</div>',
+
+            # Ads and banners
+            r'<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>',
+            r'<div[^>]*class="[^"]*banner[^"]*"[^>]*>.*?</div>',
+            r'<div[^>]*class="[^"]*advertisement[^"]*"[^>]*>.*?</div>',
+            r'<ins[^>]*>.*?</ins>',
+
+            # Social media/widgets
+            r'<div[^>]*class="[^"]*social[^"]*"[^>]*>.*?</div>',
+            r'<div[^>]*class="[^"]*widget[^"]*"[^>]*>.*?</div>',
+            r'<div[^>]*class="[^"]*share[^"]*"[^>]*>.*?</div>',
+
+            # Comments
+            r'<div[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</div>',
+            r'<section[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</section>',
+
+            # Related content (often at bottom)
+            r'<div[^>]*class="[^"]*related[^"]*"[^>]*>.*?</div>',
+            r'<div[^>]*class="[^"]*popular[^"]*"[^>]*>.*?</div>',
+
+            # Menus and lists
+            r'<ul[^>]*class="[^"]*menu[^"]*"[^>]*>.*?</ul>',
+            r'<ul[^>]*class="[^"]*nav[^"]*"[^>]*>.*?</ul>',
+
+            # Scripts and styles (always remove)
+            r'<script[^>]*>.*?</script>',
+            r'<style[^>]*>.*?</style>',
+            r'<!--.*?-->',  # Comments
+
+            # Metadata in body
+            r'<meta[^>]*>',
+            r'<link[^>]*>',
+        ]
+
+        cleaned_html = html_content
+        for pattern in unwanted_patterns:
+            cleaned_html = re.sub(pattern, ' ', cleaned_html, flags=re.DOTALL | re.IGNORECASE)
+
+        return cleaned_html
+
+    def _extract_with_regex(self, html_content: str) -> str:
+        """Extract content using regex patterns"""

+        # Try to find content between common content tags
+        content_patterns = [
+            # Look for article tags
+            r'<article[^>]*>(.*?)</article>',
+
+            # Look for main tags
+            r'<main[^>]*>(.*?)</main>',
+
+            # Look for divs with content classes
+            r'<div[^>]*class="[^"]*(post-content|article-content|entry-content|story-content)[^"]*"[^>]*>(.*?)</div>',
+            r'<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>',
+            r'<div[^>]*class="[^"]*article[^"]*"[^>]*>(.*?)</div>',
+            r'<div[^>]*class="[^"]*post[^"]*"[^>]*>(.*?)</div>',
+
+            # Look for section with content
+            r'<section[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</section>',
+        ]
+
+        all_content = []
+
+        for pattern in content_patterns:
+            matches = re.findall(pattern, html_content, re.DOTALL | re.IGNORECASE)
+            for match in matches:
+                # Handle groups in regex
+                if isinstance(match, tuple):
+                    for group in match:
+                        if group and len(group.strip()) > 50:
+                            all_content.append(group)
+                elif match and len(match.strip()) > 50:
+                    all_content.append(match)
+
+        if all_content:
+            # Combine all found content
+            combined = ' '.join(all_content)
+
+            # Remove any remaining HTML tags
+            combined = re.sub(r'<[^>]+>', ' ', combined)
+
+            # Decode HTML entities
+            combined = html.unescape(combined)
+
+            return combined
+
+        return ""
+
+    def _extract_all_text(self, html_content: str) -> str:
+        """Extract all text as fallback, but clean it well"""
+        # Remove scripts, styles, comments first
+        html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
+        html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
-        #
         html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)

+        # Remove common unwanted tags
+        unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'ins', 'meta', 'link']
+        for tag in unwanted_tags:
+            html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
+
         # Remove HTML tags
         text = re.sub(r'<[^>]+>', ' ', html_content)

         # Decode HTML entities
         text = html.unescape(text)

-
+        # Remove very short lines (likely navigation items)
+        lines = text.split('\n')
+        filtered_lines = []
+        for line in lines:
+            line = line.strip()
+            if len(line) > 30:  # Only keep lines longer than 30 chars
+                filtered_lines.append(line)
+            elif any(word in line.lower() for word in ['home', 'about', 'contact', 'login', 'sign up', 'search']):
+                # Skip navigation lines
+                continue
+
+        return '\n\n'.join(filtered_lines)
+
+    def _clean_content(self, content: str) -> str:
+        """Clean and normalize the extracted content"""
+        if not content:
+            return ""
+
+        # Replace multiple whitespace with single space
+        content = re.sub(r'\s+', ' ', content)
+
+        # Remove control characters
+        content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
+
+        # Remove common unwanted phrases (ads, prompts, etc.)
+        unwanted_phrases = [
+            r'sign up for our newsletter',
+            r'subscribe to our newsletter',
+            r'follow us on',
+            r'like us on facebook',
+            r'follow us on twitter',
+            r'share this article',
+            r'read more',
+            r'continue reading',
+            r'advertisement',
+            r'sponsored content',
+            r'related articles',
+            r'you may also like',
+            r'popular posts',
+            r'recommended for you',
+            r'click here',
+            r'learn more',
+        ]
+
+        for phrase in unwanted_phrases:
+            content = re.sub(phrase, '', content, flags=re.IGNORECASE)
+
+        # Remove email addresses
+        content = re.sub(r'\S+@\S+\.\S+', '', content)
+
+        # Remove URLs
+        content = re.sub(r'https?://\S+', '', content)
+
+        # Remove excessive punctuation
+        content = re.sub(r'[.!?]{3,}', '.', content)
+
+        # Normalize spaces around punctuation
+        content = re.sub(r'\s+([.,!?;:])', r'\1', content)
+        content = re.sub(r'([.,!?;:])\s+', r'\1 ', content)
+
+        # Split into paragraphs and filter
+        paragraphs = content.split('. ')
+        clean_paragraphs = []
+
+        for para in paragraphs:
+            para = para.strip()
+            if len(para) < 5:
+                continue
+
+            # Skip very short paragraphs (likely not content)
+            if len(para) > 30:
+                clean_paragraphs.append(para)
+
+        # Join back with proper spacing
+        content = '. '.join(clean_paragraphs)
+
+        return content.strip()

     def _extract_title(self, html_content: str) -> str:
         """Extract page title"""
         title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
         if title_match:
             title = title_match.group(1)
-            # Clean title
             title = re.sub(r'\s+', ' ', title).strip()
             return title[:200]
         return "No title found"

-    def
-        """
-
-        text = re.sub(r'\s+', ' ', text)
-
-        #
-
-
+    def _extract_metadata(self, html_content: str) -> Dict[str, str]:
+        """Extract basic metadata"""
+        metadata = {}

+        # Meta description
+        desc_match = re.search(r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
+                               html_content, re.IGNORECASE)
+        if desc_match:
+            metadata['description'] = desc_match.group(1)[:300]

+        # Meta keywords
+        keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
+                                   html_content, re.IGNORECASE)
+        if keywords_match:
+            metadata['keywords'] = keywords_match.group(1)[:300]
+
+        # Author
+        author_match = re.search(r'<meta[^>]*name=["\']author["\'][^>]*content=["\'](.*?)["\']',
+                                 html_content, re.IGNORECASE)
+        if author_match:
+            metadata['author'] = author_match.group(1)[:200]
+
+        # OG title (social media title)
+        og_title_match = re.search(r'<meta[^>]*property=["\']og:title["\'][^>]*content=["\'](.*?)["\']',
+                                   html_content, re.IGNORECASE)
+        if og_title_match:
+            metadata['og_title'] = og_title_match.group(1)[:200]
+
+        return metadata

 # ==============================================
 # INITIALIZE
 # ==============================================

-
+extractor = SmartContentExtractor()

 # ==============================================
-#
+# FASTAPI APP
 # ==============================================

 # Create FastAPI app
 fastapi_app = FastAPI(
-    title="
-    description="
+    title="Smart Content Extractor",
+    description="Extracts only main content from webpages, removes navigation, ads, footers",
     version="1.0"
 )

@@ -139,16 +392,18 @@ fastapi_app.add_middleware(
 @fastapi_app.get("/")
 async def root():
     return {
-        "service": "
+        "service": "Smart Content Extractor",
         "version": "1.0",
+        "description": "Extracts only main content from webpages (no navigation, ads, footers)",
         "endpoints": {
             "GET /": "This info",
             "GET /health": "Health check",
-            "POST /
+            "POST /extract": "Extract main content (for n8n)"
         },
-        "
-        "
-        "
+        "usage_n8n": {
+            "method": "POST",
+            "url": "https://your-space.hf.space/extract",
+            "body": {"url": "https://example.com"}
         }
     }

@@ -159,9 +414,9 @@ async def health():
         "timestamp": time.time()
     }

-@fastapi_app.post("/
-async def
-    """
+@fastapi_app.post("/extract")
+async def api_extract(request: Request):
+    """API endpoint for n8n - extracts only main content"""
     try:
         # Parse JSON body
         body = await request.json()

@@ -173,8 +428,8 @@ async def api_scrape(request: Request):
                 content={"success": False, "error": "URL parameter is required"}
             )

-        print(f"📨
-        result =
+        print(f"📨 Content extraction request: {url}")
+        result = extractor.extract_content(url)

         return result

@@ -193,34 +448,37 @@ async def api_scrape(request: Request):
 # GRADIO INTERFACE
 # ==============================================

-def
+def gradio_extract(url: str):
     """Gradio interface function"""
     if not url:
         return "❌ Please enter a URL", {}

-    result =
+    result = extractor.extract_content(url)

     if result["success"]:
-
-
+        content = result["main_content"]
+        content_length = result["content_length"]
+
+        # Create preview (first 3 paragraphs or 500 chars)
+        paragraphs = content.split('. ')
+        preview_paragraphs = paragraphs[:3]
+        preview = '. '.join(preview_paragraphs)

-
-
-        if len(text) > 500:
-            preview += "..."
+        if len(preview) > 500:
+            preview = preview[:500] + "..."

         output = f"""
-## ✅
+## ✅ Content Extracted Successfully!

 **URL:** {result['url']}
 **Title:** {result.get('title', 'N/A')}
 **Time:** {result['execution_time']}s
-**
+**Content Length:** {content_length:,} characters

-### Preview:
+### Content Preview:
 {preview}

-*Check JSON tab for full
+*Check JSON tab for full content*
 """
         return output, result
     else:

@@ -228,54 +486,43 @@ def gradio_scrape(url: str):

 # Create Gradio interface
 gradio_interface = gr.Interface(
-    fn=
+    fn=gradio_extract,
     inputs=gr.Textbox(
         label="Website URL",
         placeholder="https://example.com",
-        value="https://
+        value="https://en.wikipedia.org/wiki/Artificial_intelligence"
     ),
     outputs=[
         gr.Markdown(label="Result"),
         gr.JSON(label="API Response")
     ],
-    title="
-    description="
+    title="🧠 Smart Content Extractor for n8n",
+    description="Extracts ONLY main content - removes navigation, ads, footers, sidebars, etc.",
     examples=[
-        ["https://example.com"],
         ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
-        ["https://
-
+        ["https://example.com"],
+        ["https://news.ycombinator.com"],
+        ["https://medium.com/topic/technology"]
+    ],
+    allow_flagging="never"
 )

 # ==============================================
 # MOUNT GRADIO TO FASTAPI
 # ==============================================

-# Mount Gradio app to FastAPI
+# Mount Gradio app to FastAPI
 app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")

-# ==============================================
-# ALTERNATIVE: If mounting doesn't work, try this:
-# ==============================================
-
-# Instead of mounting, you can also define routes manually
-# Uncomment below if mounting doesn't work:
-
-# @fastapi_app.get("/")
-# async def gradio_root():
-#     # This will redirect to the Gradio interface
-#     from fastapi.responses import RedirectResponse
-#     return RedirectResponse(url="/")
-
 # ==============================================
 # LAUNCH THE APP
 # ==============================================

 if __name__ == "__main__":
     print("\n" + "="*60)
-    print("
+    print("🧠 Smart Content Extractor Starting")
     print("="*60)
-    print("API Endpoint: POST /
+    print("API Endpoint: POST /extract")
     print("Web Interface: GET /")
     print("="*60 + "\n")
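For quick local testing, a minimal sketch that drives the new class directly. It uses only names the diff defines (SmartContentExtractor, its module-level extractor instance, and extract_content); the assumption that the file is saved as app.py and importable, and that requests is installed, is mine, not part of the commit.

# Local smoke test (a sketch, not part of the commit).
# Assumes the file above is saved as app.py; importing it also builds the
# FastAPI/Gradio objects at module level, but does not start a server.
from app import extractor

result = extractor.extract_content("https://en.wikipedia.org/wiki/Artificial_intelligence")
if result["success"]:
    print(result["title"])
    print(result["content_length"], "chars of main content")
    print(result["content_preview"])
else:
    # The failure payload is only partially visible in the diff, so use .get
    print("extraction failed:", result.get("error"))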
| 1 |
# ==============================================
|
| 2 |
+
# SMART CONTENT EXTRACTOR FOR N8N
|
| 3 |
# ==============================================
|
| 4 |
|
| 5 |
import gradio as gr
|
|
|
|
| 8 |
import time
|
| 9 |
import re
|
| 10 |
import html
|
| 11 |
+
from typing import Dict, Any, List, Optional
|
| 12 |
from fastapi import FastAPI, Request
|
| 13 |
import uvicorn
|
| 14 |
|
| 15 |
# ==============================================
|
| 16 |
+
# SMART CONTENT EXTRACTOR
|
| 17 |
# ==============================================
|
| 18 |
|
| 19 |
+
class SmartContentExtractor:
|
| 20 |
+
"""Extracts only main content, removes navigation, ads, footers, etc."""
|
| 21 |
|
| 22 |
def __init__(self):
|
| 23 |
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
| 24 |
|
| 25 |
+
def extract_content(self, url: str) -> Dict[str, Any]:
|
| 26 |
+
"""Extract only main content from webpage"""
|
| 27 |
start_time = time.time()
|
| 28 |
|
| 29 |
+
print(f"🌐 Extracting content from: {url}")
|
| 30 |
|
| 31 |
# Ensure URL has protocol
|
| 32 |
if not url.startswith(('http://', 'https://')):
|
|
|
|
| 36 |
# Fetch the page
|
| 37 |
headers = {
|
| 38 |
'User-Agent': self.user_agent,
|
| 39 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
| 40 |
+
'Accept-Language': 'en-US,en;q=0.9',
|
| 41 |
}
|
| 42 |
|
| 43 |
response = requests.get(url, headers=headers, timeout=15)
|
| 44 |
response.raise_for_status()
|
| 45 |
|
| 46 |
+
# Get encoding
|
| 47 |
+
if response.encoding is None:
|
| 48 |
+
response.encoding = 'utf-8'
|
| 49 |
|
| 50 |
+
html_content = response.text
|
|
|
|
| 51 |
|
| 52 |
+
# Extract only main content
|
| 53 |
+
main_content = self._extract_main_content(html_content)
|
| 54 |
+
|
| 55 |
+
# Clean content
|
| 56 |
+
cleaned_content = self._clean_content(main_content)
|
| 57 |
+
|
| 58 |
+
# Extract title (separately)
|
| 59 |
+
title = self._extract_title(html_content)
|
| 60 |
+
|
| 61 |
+
# Extract metadata
|
| 62 |
+
metadata = self._extract_metadata(html_content)
|
| 63 |
|
| 64 |
return {
|
| 65 |
"success": True,
|
| 66 |
"url": url,
|
| 67 |
"title": title,
|
| 68 |
+
"main_content": cleaned_content[:20000], # Limit to 20k chars
|
| 69 |
+
"content_length": len(cleaned_content),
|
| 70 |
+
"content_preview": cleaned_content[:500] + ("..." if len(cleaned_content) > 500 else ""),
|
| 71 |
+
"metadata": metadata,
|
| 72 |
"status_code": response.status_code,
|
| 73 |
+
"execution_time": round(time.time() - start_time, 2)
|
|
|
|
| 74 |
}
|
| 75 |
|
| 76 |
except Exception as e:
|
|
|
|
| 81 |
"execution_time": round(time.time() - start_time, 2)
|
| 82 |
}
|
| 83 |
|
| 84 |
+
def _extract_main_content(self, html_content: str) -> str:
|
| 85 |
+
"""Extract only the main content, removing navigation, ads, footers, etc."""
|
| 86 |
+
|
| 87 |
+
# First, try to find main content using common selectors
|
| 88 |
+
# These are CSS selectors that typically contain main content
|
| 89 |
+
main_content_selectors = [
|
| 90 |
+
# Article/content focused
|
| 91 |
+
'article', 'main', '.post-content', '.article-content',
|
| 92 |
+
'.entry-content', '.story-content', '.content-area',
|
| 93 |
+
'.main-content', '.post-body', '.article-body',
|
| 94 |
+
'.story-body', '.content-body', '.text-content',
|
| 95 |
+
|
| 96 |
+
# Blog/News specific
|
| 97 |
+
'.blog-content', '.news-content', '.post',
|
| 98 |
+
'.story', '.article', '.post-entry',
|
| 99 |
+
|
| 100 |
+
# Generic content containers
|
| 101 |
+
'.content', '#content', '.container .content',
|
| 102 |
+
'.page-content', '.single-content',
|
| 103 |
+
|
| 104 |
+
# Divs with content
|
| 105 |
+
'div[class*="content"]', 'div[class*="article"]',
|
| 106 |
+
'div[class*="post"]', 'div[class*="entry"]',
|
| 107 |
+
'div[class*="story"]', 'div[class*="body"]',
|
| 108 |
+
]
|
| 109 |
+
|
| 110 |
+
# Remove unwanted sections first (more aggressive)
|
| 111 |
+
html_content = self._remove_unwanted_sections(html_content)
|
| 112 |
+
|
| 113 |
+
# Try to extract using regex patterns for main content
|
| 114 |
+
content = self._extract_with_regex(html_content)
|
| 115 |
+
|
| 116 |
+
# If we got decent content, return it
|
| 117 |
+
if len(content.strip()) > 200:
|
| 118 |
+
return content
|
| 119 |
+
|
| 120 |
+
# Fallback: remove all HTML tags and get text
|
| 121 |
+
return self._extract_all_text(html_content)
|
| 122 |
+
|
| 123 |
+
def _remove_unwanted_sections(self, html_content: str) -> str:
|
| 124 |
+
"""Remove navigation, ads, footers, sidebars, etc."""
|
| 125 |
+
|
| 126 |
+
# Patterns to remove (these are typically unwanted sections)
|
| 127 |
+
unwanted_patterns = [
|
| 128 |
+
# Navigation
|
| 129 |
+
r'<nav[^>]*>.*?</nav>',
|
| 130 |
+
r'<header[^>]*>.*?</header>',
|
| 131 |
+
r'<menu[^>]*>.*?</menu>',
|
| 132 |
+
|
| 133 |
+
# Footers
|
| 134 |
+
r'<footer[^>]*>.*?</footer>',
|
| 135 |
+
|
| 136 |
+
# Sidebars
|
| 137 |
+
r'<aside[^>]*>.*?</aside>',
|
| 138 |
+
r'<div[^>]*class="[^"]*sidebar[^"]*"[^>]*>.*?</div>',
|
| 139 |
+
r'<div[^>]*id="[^"]*sidebar[^"]*"[^>]*>.*?</div>',
|
| 140 |
+
|
| 141 |
+
# Ads and banners
|
| 142 |
+
r'<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>',
|
| 143 |
+
r'<div[^>]*class="[^"]*banner[^"]*"[^>]*>.*?</div>',
|
| 144 |
+
r'<div[^>]*class="[^"]*advertisement[^"]*"[^>]*>.*?</div>',
|
| 145 |
+
r'<ins[^>]*>.*?</ins>',
|
| 146 |
+
|
| 147 |
+
# Social media/widgets
|
| 148 |
+
r'<div[^>]*class="[^"]*social[^"]*"[^>]*>.*?</div>',
|
| 149 |
+
r'<div[^>]*class="[^"]*widget[^"]*"[^>]*>.*?</div>',
|
| 150 |
+
r'<div[^>]*class="[^"]*share[^"]*"[^>]*>.*?</div>',
|
| 151 |
+
|
| 152 |
+
# Comments
|
| 153 |
+
r'<div[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</div>',
|
| 154 |
+
r'<section[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</section>',
|
| 155 |
+
|
| 156 |
+
# Related content (often at bottom)
|
| 157 |
+
r'<div[^>]*class="[^"]*related[^"]*"[^>]*>.*?</div>',
|
| 158 |
+
r'<div[^>]*class="[^"]*popular[^"]*"[^>]*>.*?</div>',
|
| 159 |
+
|
| 160 |
+
# Menus and lists
|
| 161 |
+
r'<ul[^>]*class="[^"]*menu[^"]*"[^>]*>.*?</ul>',
|
| 162 |
+
r'<ul[^>]*class="[^"]*nav[^"]*"[^>]*>.*?</ul>',
|
| 163 |
+
|
| 164 |
+
# Scripts and styles (always remove)
|
| 165 |
+
r'<script[^>]*>.*?</script>',
|
| 166 |
+
r'<style[^>]*>.*?</style>',
|
| 167 |
+
r'<!--.*?-->', # Comments
|
| 168 |
+
|
| 169 |
+
# Metadata in body
|
| 170 |
+
r'<meta[^>]*>',
|
| 171 |
+
r'<link[^>]*>',
|
| 172 |
+
]
|
| 173 |
+
|
| 174 |
+
cleaned_html = html_content
|
| 175 |
+
for pattern in unwanted_patterns:
|
| 176 |
+
cleaned_html = re.sub(pattern, ' ', cleaned_html, flags=re.DOTALL | re.IGNORECASE)
|
| 177 |
+
|
| 178 |
+
return cleaned_html
|
| 179 |
+
|
| 180 |
+
def _extract_with_regex(self, html_content: str) -> str:
|
| 181 |
+
"""Extract content using regex patterns"""
|
| 182 |
|
| 183 |
+
# Try to find content between common content tags
|
| 184 |
+
content_patterns = [
|
| 185 |
+
# Look for article tags
|
| 186 |
+
r'<article[^>]*>(.*?)</article>',
|
| 187 |
+
|
| 188 |
+
# Look for main tags
|
| 189 |
+
r'<main[^>]*>(.*?)</main>',
|
| 190 |
+
|
| 191 |
+
# Look for divs with content classes
|
| 192 |
+
r'<div[^>]*class="[^"]*(post-content|article-content|entry-content|story-content)[^"]*"[^>]*>(.*?)</div>',
|
| 193 |
+
r'<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>',
|
| 194 |
+
r'<div[^>]*class="[^"]*article[^"]*"[^>]*>(.*?)</div>',
|
| 195 |
+
r'<div[^>]*class="[^"]*post[^"]*"[^>]*>(.*?)</div>',
|
| 196 |
+
|
| 197 |
+
# Look for section with content
|
| 198 |
+
r'<section[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</section>',
|
| 199 |
+
]
|
| 200 |
+
|
| 201 |
+
all_content = []
|
| 202 |
+
|
| 203 |
+
for pattern in content_patterns:
|
| 204 |
+
matches = re.findall(pattern, html_content, re.DOTALL | re.IGNORECASE)
|
| 205 |
+
for match in matches:
|
| 206 |
+
# Handle groups in regex
|
| 207 |
+
if isinstance(match, tuple):
|
| 208 |
+
for group in match:
|
| 209 |
+
if group and len(group.strip()) > 50:
|
| 210 |
+
all_content.append(group)
|
| 211 |
+
elif match and len(match.strip()) > 50:
|
| 212 |
+
all_content.append(match)
|
| 213 |
+
|
| 214 |
+
if all_content:
|
| 215 |
+
# Combine all found content
|
| 216 |
+
combined = ' '.join(all_content)
|
| 217 |
+
|
| 218 |
+
# Remove any remaining HTML tags
|
| 219 |
+
combined = re.sub(r'<[^>]+>', ' ', combined)
|
| 220 |
+
|
| 221 |
+
# Decode HTML entities
|
| 222 |
+
combined = html.unescape(combined)
|
| 223 |
+
|
| 224 |
+
return combined
|
| 225 |
+
|
| 226 |
+
return ""
|
| 227 |
+
|
| 228 |
+
def _extract_all_text(self, html_content: str) -> str:
|
| 229 |
+
"""Extract all text as fallback, but clean it well"""
|
| 230 |
+
# Remove scripts, styles, comments first
|
| 231 |
+
html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
|
| 232 |
+
html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
|
| 233 |
html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
|
| 234 |
|
| 235 |
+
# Remove common unwanted tags
|
| 236 |
+
unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'ins', 'meta', 'link']
|
| 237 |
+
for tag in unwanted_tags:
|
| 238 |
+
html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
|
| 239 |
+
|
| 240 |
# Remove HTML tags
|
| 241 |
text = re.sub(r'<[^>]+>', ' ', html_content)
|
| 242 |
|
| 243 |
# Decode HTML entities
|
| 244 |
text = html.unescape(text)
|
| 245 |
|
| 246 |
+
# Remove very short lines (likely navigation items)
|
| 247 |
+
lines = text.split('\n')
|
| 248 |
+
filtered_lines = []
|
| 249 |
+
for line in lines:
|
| 250 |
+
line = line.strip()
|
| 251 |
+
if len(line) > 30: # Only keep lines longer than 30 chars
|
| 252 |
+
filtered_lines.append(line)
|
| 253 |
+
elif any(word in line.lower() for word in ['home', 'about', 'contact', 'login', 'sign up', 'search']):
|
| 254 |
+
# Skip navigation lines
|
| 255 |
+
continue
|
| 256 |
+
|
| 257 |
+
return '\n\n'.join(filtered_lines)
|
| 258 |
+
|
| 259 |
+
def _clean_content(self, content: str) -> str:
|
| 260 |
+
"""Clean and normalize the extracted content"""
|
| 261 |
+
if not content:
|
| 262 |
+
return ""
|
| 263 |
+
|
| 264 |
+
# Replace multiple whitespace with single space
|
| 265 |
+
content = re.sub(r'\s+', ' ', content)
|
| 266 |
+
|
| 267 |
+
# Remove control characters
|
| 268 |
+
content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
|
| 269 |
+
|
| 270 |
+
# Remove common unwanted phrases (ads, prompts, etc.)
|
| 271 |
+
unwanted_phrases = [
|
| 272 |
+
r'sign up for our newsletter',
|
| 273 |
+
r'subscribe to our newsletter',
|
| 274 |
+
r'follow us on',
|
| 275 |
+
r'like us on facebook',
|
| 276 |
+
r'follow us on twitter',
|
| 277 |
+
r'share this article',
|
| 278 |
+
r'read more',
|
| 279 |
+
r'continue reading',
|
| 280 |
+
r'advertisement',
|
| 281 |
+
r'sponsored content',
|
| 282 |
+
r'related articles',
|
| 283 |
+
r'you may also like',
|
| 284 |
+
r'popular posts',
|
| 285 |
+
r'recommended for you',
|
| 286 |
+
r'click here',
|
| 287 |
+
r'learn more',
|
| 288 |
+
]
|
| 289 |
+
|
| 290 |
+
for phrase in unwanted_phrases:
|
| 291 |
+
content = re.sub(phrase, '', content, flags=re.IGNORECASE)
|
| 292 |
+
|
| 293 |
+
# Remove email addresses
|
| 294 |
+
content = re.sub(r'\S+@\S+\.\S+', '', content)
|
| 295 |
+
|
| 296 |
+
# Remove URLs
|
| 297 |
+
content = re.sub(r'https?://\S+', '', content)
|
| 298 |
+
|
| 299 |
+
# Remove excessive punctuation
|
| 300 |
+
content = re.sub(r'[.!?]{3,}', '.', content)
|
| 301 |
+
|
| 302 |
+
# Normalize spaces around punctuation
|
| 303 |
+
content = re.sub(r'\s+([.,!?;:])', r'\1', content)
|
| 304 |
+
content = re.sub(r'([.,!?;:])\s+', r'\1 ', content)
|
| 305 |
+
|
| 306 |
+
# Split into paragraphs and filter
|
| 307 |
+
paragraphs = content.split('. ')
|
| 308 |
+
clean_paragraphs = []
|
| 309 |
+
|
| 310 |
+
for para in paragraphs:
|
| 311 |
+
para = para.strip()
|
| 312 |
+
if len(para) < 5:
|
| 313 |
+
continue
|
| 314 |
+
|
| 315 |
+
# Skip very short paragraphs (likely not content)
|
| 316 |
+
if len(para) > 30:
|
| 317 |
+
clean_paragraphs.append(para)
|
| 318 |
+
|
| 319 |
+
# Join back with proper spacing
|
| 320 |
+
content = '. '.join(clean_paragraphs)
|
| 321 |
+
|
| 322 |
+
return content.strip()
|
| 323 |
|
| 324 |
def _extract_title(self, html_content: str) -> str:
|
| 325 |
"""Extract page title"""
|
| 326 |
title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
|
| 327 |
if title_match:
|
| 328 |
title = title_match.group(1)
|
|
|
|
| 329 |
title = re.sub(r'\s+', ' ', title).strip()
|
| 330 |
return title[:200]
|
| 331 |
return "No title found"
|
| 332 |
|
| 333 |
+
def _extract_metadata(self, html_content: str) -> Dict[str, str]:
|
| 334 |
+
"""Extract basic metadata"""
|
| 335 |
+
metadata = {}
|
|
|
|
| 336 |
|
| 337 |
+
# Meta description
|
| 338 |
+
desc_match = re.search(r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
|
| 339 |
+
html_content, re.IGNORECASE)
|
| 340 |
+
if desc_match:
|
| 341 |
+
metadata['description'] = desc_match.group(1)[:300]
|
| 342 |
|
| 343 |
+
# Meta keywords
|
| 344 |
+
keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
|
| 345 |
+
html_content, re.IGNORECASE)
|
| 346 |
+
if keywords_match:
|
| 347 |
+
metadata['keywords'] = keywords_match.group(1)[:300]
|
| 348 |
+
|
| 349 |
+
# Author
|
| 350 |
+
author_match = re.search(r'<meta[^>]*name=["\']author["\'][^>]*content=["\'](.*?)["\']',
|
| 351 |
+
html_content, re.IGNORECASE)
|
| 352 |
+
if author_match:
|
| 353 |
+
metadata['author'] = author_match.group(1)[:200]
|
| 354 |
+
|
| 355 |
+
# OG title (social media title)
|
| 356 |
+
og_title_match = re.search(r'<meta[^>]*property=["\']og:title["\'][^>]*content=["\'](.*?)["\']',
|
| 357 |
+
html_content, re.IGNORECASE)
|
| 358 |
+
if og_title_match:
|
| 359 |
+
metadata['og_title'] = og_title_match.group(1)[:200]
|
| 360 |
+
|
| 361 |
+
return metadata
|
| 362 |
|
| 363 |
# ==============================================
|
| 364 |
# INITIALIZE
|
| 365 |
# ==============================================
|
| 366 |
|
| 367 |
+
extractor = SmartContentExtractor()
|
| 368 |
|
| 369 |
# ==============================================
|
| 370 |
+
# FASTAPI APP
|
| 371 |
# ==============================================
|
| 372 |
|
| 373 |
# Create FastAPI app
|
| 374 |
fastapi_app = FastAPI(
|
| 375 |
+
title="Smart Content Extractor",
|
| 376 |
+
description="Extracts only main content from webpages, removes navigation, ads, footers",
|
| 377 |
version="1.0"
|
| 378 |
)
|
| 379 |
|
|
|
|
| 392 |
@fastapi_app.get("/")
|
| 393 |
async def root():
|
| 394 |
return {
|
| 395 |
+
"service": "Smart Content Extractor",
|
| 396 |
"version": "1.0",
|
| 397 |
+
"description": "Extracts only main content from webpages (no navigation, ads, footers)",
|
| 398 |
"endpoints": {
|
| 399 |
"GET /": "This info",
|
| 400 |
"GET /health": "Health check",
|
| 401 |
+
"POST /extract": "Extract main content (for n8n)"
|
| 402 |
},
|
| 403 |
+
"usage_n8n": {
|
| 404 |
+
"method": "POST",
|
| 405 |
+
"url": "https://your-space.hf.space/extract",
|
| 406 |
+
"body": {"url": "https://example.com"}
|
| 407 |
}
|
| 408 |
}
|
| 409 |
|
|
|
|
| 414 |
"timestamp": time.time()
|
| 415 |
}
|
| 416 |
|
| 417 |
+
@fastapi_app.post("/extract")
|
| 418 |
+
async def api_extract(request: Request):
|
| 419 |
+
"""API endpoint for n8n - extracts only main content"""
|
| 420 |
try:
|
| 421 |
# Parse JSON body
|
| 422 |
body = await request.json()
|
|
|
|
| 428 |
content={"success": False, "error": "URL parameter is required"}
|
| 429 |
)
|
| 430 |
|
| 431 |
+
print(f"📨 Content extraction request: {url}")
|
| 432 |
+
result = extractor.extract_content(url)
|
| 433 |
|
| 434 |
return result
|
| 435 |
|
|
|
|
| 448 |
# GRADIO INTERFACE
|
| 449 |
# ==============================================
|
| 450 |
|
| 451 |
+
def gradio_extract(url: str):
|
| 452 |
"""Gradio interface function"""
|
| 453 |
if not url:
|
| 454 |
return "❌ Please enter a URL", {}
|
| 455 |
|
| 456 |
+
result = extractor.extract_content(url)
|
| 457 |
|
| 458 |
if result["success"]:
|
| 459 |
+
content = result["main_content"]
|
| 460 |
+
content_length = result["content_length"]
|
| 461 |
+
|
| 462 |
+
# Create preview (first 3 paragraphs or 500 chars)
|
| 463 |
+
paragraphs = content.split('. ')
|
| 464 |
+
preview_paragraphs = paragraphs[:3]
|
| 465 |
+
preview = '. '.join(preview_paragraphs)
|
| 466 |
|
| 467 |
+
if len(preview) > 500:
|
| 468 |
+
preview = preview[:500] + "..."
|
|
|
|
|
|
|
| 469 |
|
| 470 |
output = f"""
|
| 471 |
+
## ✅ Content Extracted Successfully!
|
| 472 |
|
| 473 |
**URL:** {result['url']}
|
| 474 |
**Title:** {result.get('title', 'N/A')}
|
| 475 |
**Time:** {result['execution_time']}s
|
| 476 |
+
**Content Length:** {content_length:,} characters
|
| 477 |
|
| 478 |
+
### Content Preview:
|
| 479 |
{preview}
|
| 480 |
|
| 481 |
+
*Check JSON tab for full content*
|
| 482 |
"""
|
| 483 |
return output, result
|
| 484 |
else:
|
|
|
|
| 486 |
|
| 487 |
# Create Gradio interface
|
| 488 |
gradio_interface = gr.Interface(
|
| 489 |
+
fn=gradio_extract,
|
| 490 |
inputs=gr.Textbox(
|
| 491 |
label="Website URL",
|
| 492 |
placeholder="https://example.com",
|
| 493 |
+
value="https://en.wikipedia.org/wiki/Artificial_intelligence"
|
| 494 |
),
|
| 495 |
outputs=[
|
| 496 |
gr.Markdown(label="Result"),
|
| 497 |
gr.JSON(label="API Response")
|
| 498 |
],
|
| 499 |
+
title="🧠 Smart Content Extractor for n8n",
|
| 500 |
+
description="Extracts ONLY main content - removes navigation, ads, footers, sidebars, etc.",
|
| 501 |
examples=[
|
|
|
|
| 502 |
["https://en.wikipedia.org/wiki/Artificial_intelligence"],
|
| 503 |
+
["https://example.com"],
|
| 504 |
+
["https://news.ycombinator.com"],
|
| 505 |
+
["https://medium.com/topic/technology"]
|
| 506 |
+
],
|
| 507 |
+
allow_flagging="never"
|
| 508 |
)
|
| 509 |
|
| 510 |
# ==============================================
|
| 511 |
# MOUNT GRADIO TO FASTAPI
|
| 512 |
# ==============================================
|
| 513 |
|
| 514 |
+
# Mount Gradio app to FastAPI
|
| 515 |
app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
|
| 516 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
# ==============================================
|
| 518 |
# LAUNCH THE APP
|
| 519 |
# ==============================================
|
| 520 |
|
| 521 |
if __name__ == "__main__":
|
| 522 |
print("\n" + "="*60)
|
| 523 |
+
print("🧠 Smart Content Extractor Starting")
|
| 524 |
print("="*60)
|
| 525 |
+
print("API Endpoint: POST /extract")
|
| 526 |
print("Web Interface: GET /")
|
| 527 |
print("="*60 + "\n")
|
| 528 |
|
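And a sketch of the call an n8n HTTP Request node (or any HTTP client) would make against the deployed Space, mirroring the usage_n8n block returned by GET /. The hostname is the placeholder from the diff, and the printed fields are the keys extract_content returns on success.

# Calling POST /extract the way n8n would (a sketch).
# "your-space.hf.space" is the placeholder host from the diff's usage_n8n example.
import requests

resp = requests.post(
    "https://your-space.hf.space/extract",
    json={"url": "https://example.com"},
    timeout=30,
)
data = resp.json()

# Success responses carry: success, url, title, main_content, content_length,
# content_preview, metadata, status_code, execution_time
print(data["success"], data.get("title"), data.get("execution_time"))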