Spaces:

yukee1992
/

Screenshot-scraper

Sleeping

App Files Community

yukee1992 committed on Jan 25

Commit

310b130

verified ·

1 Parent(s): ba2f5fc

Update app.py

Browse files

Files changed (1) hide show

app.py +236 -284

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # ==============================================
-# FREE SCREENSHOT SCRAPER FOR N8N
 # ==============================================
 import gradio as gr
@@ -8,50 +8,19 @@ import json
 import time
 import re
 import html
-import base64
-from io import BytesIO
-from PIL import Image
-import pytesseract  # Free OCR
 from typing import Dict, Any
 from fastapi import FastAPI, Request
 import uvicorn
 # ==============================================
-# FREE SCREENSHOT SCRAPER
 # ==============================================
-class FreeScreenshotScraper:
-    """Free scraper using screenshot APIs + fallback"""
     def __init__(self):
-        # Free screenshot APIs (no API key needed)
-        self.screenshot_apis = [
-            {
-                "url": lambda u: f"https://s0.wp.com/mshots/v1/{u}?w=1024",
-                "name": "wordpress_mshots"
-            },
-            {
-                "url": lambda u: f"https://render-tron.appspot.com/screenshot/{u}?width=1024&height=768",
-                "name": "render_tron"
-            },
-            {
-                "url": lambda u: f"https://image.thum.io/get/width/1024/crop/768/noanimate/{u}",
-                "name": "thumio"
-            },
-        ]
-        # Free HTML content APIs
-        self.html_apis = [
-            {
-                "url": lambda u: f"https://r.jina.ai/{u}",
-                "name": "jina_reader",
-                "headers": {"Accept": "application/json"}
-            },
-            {
-                "url": lambda u: f"https://extractorapi.com/api/v1/extractor?apikey=demo&url={u}",
-                "name": "extractor_api"
-            },
-        ]
     def extract_content(self, url: str) -> Dict[str, Any]:
         """Extract content using free APIs"""
@@ -63,267 +32,196 @@ class FreeScreenshotScraper:
         if not url.startswith(('http://', 'https://')):
             url = 'https://' + url
-        # Strategy 1: Try Jina Reader API (best for content extraction)
-        print("  Trying Jina Reader API...")
-        jina_result = self._try_jina_reader(url)
-        if jina_result["success"]:
-            jina_result["execution_time"] = round(time.time() - start_time, 2)
-            jina_result["method"] = "jina_reader_api"
-            return jina_result
-        # Strategy 2: Try other HTML APIs
-        print("  Trying other HTML APIs...")
-        for api in self.html_apis[1:]:
-            result = self._try_api(api, url)
-            if result["success"]:
-                result["execution_time"] = round(time.time() - start_time, 2)
-                result["method"] = api["name"]
-                return result
-        # Strategy 3: Try direct request with smart headers
-        print("  Trying direct request...")
-        direct_result = self._try_direct_request(url)
-        if direct_result["success"]:
-            direct_result["execution_time"] = round(time.time() - start_time, 2)
-            direct_result["method"] = "direct_with_fallback"
-            return direct_result
-        # Strategy 4: Try screenshot APIs as last resort
-        print("  Trying screenshot APIs...")
-        for api in self.screenshot_apis:
-            result = self._try_screenshot_api(api, url)
-            if result["success"]:
-                result["execution_time"] = round(time.time() - start_time, 2)
-                result["method"] = f"screenshot_{api['name']}"
-                return result
         # All failed
         return {
             "success": False,
             "url": url,
-            "error": "All free methods failed",
             "execution_time": round(time.time() - start_time, 2),
-            "suggestions": [
-                "Try a different URL",
-                "Website may block automated access",
-                "Try using Jina Reader directly: https://r.jina.ai/your-url"
-            ]
         }
     def _try_jina_reader(self, url: str) -> Dict[str, Any]:
-        """Try Jina Reader API (free, no API key needed)"""
         try:
             api_url = f"https://r.jina.ai/{url}"
-            headers = {
-                "User-Agent": "Mozilla/5.0",
-                "Accept": "application/json",
-            }
-            response = requests.get(api_url, headers=headers, timeout=30)
-            if response.status_code == 200:
-                # Jina returns clean text directly
-                content = response.text
-                # Try to parse as JSON first
                 try:
-                    data = json.loads(content)
-                    if "data" in data:
-                        content = data["data"]["content"] if "content" in data["data"] else str(data["data"])
-                except:
-                    pass  # Keep as text
-                # Extract title if possible
-                title = ""
-                title_match = re.search(r'<title[^>]*>(.*?)</title>', content, re.IGNORECASE)
-                if title_match:
-                    title = title_match.group(1)
-                # Clean content
-                cleaned = self._clean_content(content)
-                return {
-                    "success": True,
-                    "url": url,
-                    "title": title[:200] if title else "Extracted via Jina Reader",
-                    "main_content": cleaned[:30000],
-                    "content_length": len(cleaned),
-                    "source": "jina_reader",
-                    "note": "Content extracted via free Jina Reader API"
-                }
-            return {"success": False, "error": f"Jina API status: {response.status_code}"}
         except Exception as e:
             return {"success": False, "error": f"Jina API error: {str(e)}"}
-    def _try_api(self, api: dict, url: str) -> Dict[str, Any]:
-        """Try other free APIs"""
-        try:
-            api_url = api["url"](url)
-            headers = api.get("headers", {"User-Agent": "Mozilla/5.0"})
-            response = requests.get(api_url, headers=headers, timeout=15)
-            if response.status_code == 200:
-                content = response.text
-                # Try to parse JSON
-                try:
-                    data = json.loads(content)
-                    # Extract content from common API formats
-                    if "text" in data:
-                        content = data["text"]
-                    elif "content" in data:
-                        content = data["content"]
-                    elif "article" in data:
-                        content = data["article"]
-                except:
-                    pass
-                cleaned = self._clean_content(content)
-                return {
-                    "success": True,
-                    "url": url,
-                    "main_content": cleaned[:20000],
-                    "content_length": len(cleaned)
-                }
-            return {"success": False}
-        except:
-            return {"success": False}
     def _try_direct_request(self, url: str) -> Dict[str, Any]:
-        """Try direct request with various strategies"""
-        strategies = [
-            self._direct_request_with_headers,
-            self._direct_request_as_googlebot,
-            self._direct_request_with_referer,
         ]
-        for strategy in strategies:
             try:
-                result = strategy(url)
-                if result["success"]:
-                    return result
-            except:
                 continue
         return {"success": False}
-    def _direct_request_with_headers(self, url: str) -> Dict[str, Any]:
-        """Direct request with browser-like headers"""
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-            "Accept-Language": "en-US,en;q=0.5",
-            "Accept-Encoding": "gzip, deflate",
-            "Connection": "keep-alive",
-            "Upgrade-Insecure-Requests": "1",
-            "Cache-Control": "max-age=0",
-        }
-        response = requests.get(url, headers=headers, timeout=10)
-        if response.status_code == 200:
-            content = self._extract_from_html(response.text)
-            cleaned = self._clean_content(content)
-            return {
-                "success": True,
-                "content": cleaned
-            }
-        return {"success": False}
-    def _direct_request_as_googlebot(self, url: str) -> Dict[str, Any]:
         """Pretend to be Googlebot"""
-        headers = {
-            "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-        }
-        response = requests.get(url, headers=headers, timeout=10)
-        if response.status_code == 200:
-            content = self._extract_from_html(response.text)
-            cleaned = self._clean_content(content)
-            return {
-                "success": True,
-                "content": cleaned
-            }
-        return {"success": False}
-    def _direct_request_with_referer(self, url: str) -> Dict[str, Any]:
-        """Request with referer"""
-        headers = {
-            "User-Agent": "Mozilla/5.0",
-            "Referer": "https://www.google.com/",
-            "Accept": "text/html",
-        }
-        response = requests.get(url, headers=headers, timeout=10)
-        if response.status_code == 200:
-            content = self._extract_from_html(response.text)
-            cleaned = self._clean_content(content)
-            return {
-                "success": True,
-                "content": cleaned
-            }
-        return {"success": False}
-    def _try_screenshot_api(self, api: dict, url: str) -> Dict[str, Any]:
-        """Try screenshot API"""
         try:
-            api_url = api["url"](url)
-            headers = {"User-Agent": "Mozilla/5.0"}
-            response = requests.get(api_url, headers=headers, timeout=15)
-            if response.status_code == 200 and len(response.content) > 1000:
-                # Check if it's actually an image
-                try:
-                    img = Image.open(BytesIO(response.content))
-                    img.verify()
-                    # Try OCR if available
-                    try:
-                        text = pytesseract.image_to_string(img)
-                        cleaned = self._clean_content(text)
-                        return {
-                            "success": True,
-                            "url": url,
-                            "main_content": cleaned[:15000],
-                            "content_length": len(cleaned),
-                            "note": "Content extracted from screenshot via OCR"
-                        }
-                    except:
-                        return {"success": False, "error": "OCR not available"}
-                except:
-                    return {"success": False}
             return {"success": False}
-        except:
-            return {"success": False}
     def _extract_from_html(self, html_content: str) -> str:
         """Extract text from HTML"""
         # Remove scripts and styles
-        html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
-        html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
-        # Remove unwanted tags
-        unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu']
         for tag in unwanted_tags:
             html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
@@ -331,10 +229,41 @@ class FreeScreenshotScraper:
         text = re.sub(r'<[^>]+>', ' ', html_content)
         text = html.unescape(text)
         return text
     def _clean_content(self, content: str) -> str:
-        """Clean content"""
         if not content:
             return ""
@@ -347,13 +276,36 @@ class FreeScreenshotScraper:
         # Remove excessive line breaks
         content = re.sub(r'\n{3,}', '\n\n', content)
         return content.strip()
 # ==============================================
 # INITIALIZE
 # ==============================================
-scraper = FreeScreenshotScraper()
 # ==============================================
 # FASTAPI APP
@@ -361,7 +313,7 @@ scraper = FreeScreenshotScraper()
 fastapi_app = FastAPI(
     title="Free Content Extractor",
-    description="Uses free APIs to extract content from websites",
     version="1.0"
 )
@@ -381,18 +333,18 @@ async def root():
     return {
         "service": "Free Content Extractor",
         "version": "1.0",
-        "description": "Uses free APIs (Jina Reader, etc.) to extract website content",
         "endpoints": {
             "GET /": "This info",
             "GET /health": "Health check",
-            "POST /extract": "Extract content (for n8n)"
         },
-        "free_apis_used": [
-            "Jina Reader (https://r.jina.ai/)",
-            "WordPress mShots",
-            "Render-Tron",
-            "ExtractorAPI (demo)"
-        ]
     }
 @fastapi_app.get("/health")
@@ -412,8 +364,8 @@ async def api_extract(request: Request):
                 content={"success": False, "error": "URL is required"}
             )
-        print(f"📨 Request: {url}")
-        result = scraper.extract_content(url)
         return result
@@ -433,16 +385,17 @@ async def api_extract(request: Request):
 # ==============================================
 def gradio_extract(url: str):
-    """Gradio interface"""
     if not url:
         return "❌ Please enter a URL", {}
-    result = scraper.extract_content(url)
     if result["success"]:
         content = result["main_content"]
         content_length = result["content_length"]
         preview = content[:800]
         if len(content) > 800:
             preview += "..."
@@ -451,23 +404,22 @@ def gradio_extract(url: str):
 ## ✅ Success!
 **URL:** {result['url']}
-**Method:** {result.get('method', 'free_api')}
 **Time:** {result['execution_time']}s
 **Content Length:** {content_length:,} characters
 ### Preview:
 {preview}
-*Using free APIs - may not work on all websites*
 """
         return output, result
     else:
         error = result.get("error", "Unknown error")
-        suggestions = result.get("suggestions", [])
-        suggestion_text = ""
-        if suggestions:
-            suggestion_text = "\n\n**Suggestions:**\n" + "\n".join([f"• {s}" for s in suggestions])
         return f"## ❌ Error\n\n{error}{suggestion_text}", result
@@ -484,12 +436,13 @@ gradio_interface = gr.Interface(
         gr.JSON(label="API Response")
     ],
     title="🌐 Free Content Extractor for n8n",
-    description="Uses free APIs to extract content. Works with most websites.",
     examples=[
         ["https://www.sinchew.com.my/"],
         ["https://example.com"],
         ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
-        ["https://news.ycombinator.com"]
     ]
 )
@@ -507,12 +460,11 @@ if __name__ == "__main__":
     print("\n" + "="*60)
     print("🌐 Free Content Extractor Starting")
     print("="*60)
-    print("Using free APIs:")
-    print("• Jina Reader (r.jina.ai)")
-    print("• WordPress mShots")
-    print("• Render-Tron")
     print("="*60)
     print("API Endpoint: POST /extract")
     print("="*60 + "\n")
     uvicorn.run(

 # ==============================================
+# SIMPLE FREE CONTENT EXTRACTOR FOR N8N
 # ==============================================
 import gradio as gr
 import time
 import re
 import html
 from typing import Dict, Any
 from fastapi import FastAPI, Request
 import uvicorn
 # ==============================================
+# SIMPLE CONTENT EXTRACTOR
 # ==============================================
+class SimpleContentExtractor:
+    """Simple extractor using Jina Reader API + fallbacks"""
     def __init__(self):
+        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
     def extract_content(self, url: str) -> Dict[str, Any]:
         """Extract content using free APIs"""
         if not url.startswith(('http://', 'https://')):
             url = 'https://' + url
+        # Try multiple strategies
+        strategies = [
+            self._try_jina_reader,
+            self._try_direct_request,
+            self._try_googlebot,
+        ]
+        for i, strategy in enumerate(strategies):
+            try:
+                print(f"  Trying strategy {i+1}...")
+                result = strategy(url)
+                if result.get("success"):
+                    result["execution_time"] = round(time.time() - start_time, 2)
+                    result["method"] = f"strategy_{i+1}"
+                    return result
+            except Exception as e:
+                print(f"  Strategy {i+1} failed: {e}")
+                time.sleep(0.3)  # Small delay
         # All failed
         return {
             "success": False,
             "url": url,
+            "error": "Failed to extract content",
             "execution_time": round(time.time() - start_time, 2),
+            "suggestion": "Try using Jina Reader directly: https://r.jina.ai/your-url"
         }
     def _try_jina_reader(self, url: str) -> Dict[str, Any]:
+        """Try Jina Reader API (free, no API key, handles JavaScript)"""
         try:
+            # Jina Reader endpoint
             api_url = f"https://r.jina.ai/{url}"
+            # Try with different formats
+            formats = [
+                {"headers": {"Accept": "text/plain"}},
+                {"headers": {"Accept": "application/json"}},
+                {"url": f"https://r.jina.ai/{url}?format=json"},
+            ]
+            for fmt in formats:
                 try:
+                    headers = fmt.get("headers", {"Accept": "text/plain", "User-Agent": self.user_agent})
+                    api_url_to_use = fmt.get("url", api_url)
+                    response = requests.get(api_url_to_use, headers=headers, timeout=30)
+                    if response.status_code == 200:
+                        content = response.text
+                        # Try to parse as JSON
+                        try:
+                            data = json.loads(content)
+                            if isinstance(data, dict):
+                                if "content" in data:
+                                    content = data["content"]
+                                elif "data" in data:
+                                    content = str(data["data"])
+                                elif "text" in data:
+                                    content = data["text"]
+                        except:
+                            pass  # Keep as plain text
+                        # Extract title
+                        title = self._extract_title_from_text(content)
+                        # Clean content
+                        cleaned = self._clean_content(content)
+                        return {
+                            "success": True,
+                            "url": url,
+                            "title": title[:300] if title else "Extracted via Jina Reader",
+                            "main_content": cleaned[:35000],
+                            "content_length": len(cleaned),
+                            "content_preview": cleaned[:1000] + ("..." if len(cleaned) > 1000 else ""),
+                            "source": "jina_reader",
+                            "note": "Content extracted via free Jina Reader API (handles JavaScript)",
+                            "status": response.status_code
+                        }
+                except Exception as e:
+                    print(f"    Jina format failed: {e}")
+                    continue
+            return {"success": False, "error": f"Jina returned status: {response.status_code}"}
         except Exception as e:
             return {"success": False, "error": f"Jina API error: {str(e)}"}
     def _try_direct_request(self, url: str) -> Dict[str, Any]:
+        """Try direct HTTP request with smart headers"""
+        headers_list = [
+            # Normal browser
+            {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.9",
+                "Accept-Encoding": "gzip, deflate",
+                "Connection": "keep-alive",
+            },
+            # Mobile browser
+            {
+                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.9",
+            },
+            # Simple headers
+            {
+                "User-Agent": "Mozilla/5.0",
+                "Accept": "text/html",
+            },
         ]
+        for headers in headers_list:
             try:
+                response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
+                if response.status_code == 200:
+                    html_content = response.text
+                    # Extract content
+                    text_content = self._extract_from_html(html_content)
+                    cleaned = self._clean_content(text_content)
+                    # Extract title
+                    title = self._extract_title_from_html(html_content)
+                    if len(cleaned) > 100:  # If we got meaningful content
+                        return {
+                            "success": True,
+                            "url": url,
+                            "title": title[:300] if title else "Extracted via direct request",
+                            "main_content": cleaned[:30000],
+                            "content_length": len(cleaned),
+                            "source": "direct_request",
+                            "status": response.status_code
+                        }
+            except Exception as e:
+                print(f"    Direct request failed: {e}")
                 continue
         return {"success": False}
+    def _try_googlebot(self, url: str) -> Dict[str, Any]:
         """Pretend to be Googlebot"""
         try:
+            headers = {
+                "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "From": "googlebot(at)googlebot.com",
+            }
+            response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
+            if response.status_code == 200:
+                html_content = response.text
+                text_content = self._extract_from_html(html_content)
+                cleaned = self._clean_content(text_content)
+                title = self._extract_title_from_html(html_content)
+                if len(cleaned) > 100:
+                    return {
+                        "success": True,
+                        "url": url,
+                        "title": title[:300] if title else "Extracted as Googlebot",
+                        "main_content": cleaned[:30000],
+                        "content_length": len(cleaned),
+                        "source": "googlebot",
+                        "status": response.status_code
+                    }
             return {"success": False}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
     def _extract_from_html(self, html_content: str) -> str:
         """Extract text from HTML"""
         # Remove scripts and styles
+        html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
+        html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
+        html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
+        # Remove unwanted sections
+        unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe']
         for tag in unwanted_tags:
             html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
         text = re.sub(r'<[^>]+>', ' ', html_content)
         text = html.unescape(text)
+        # Remove excessive whitespace
+        text = re.sub(r'\s+', ' ', text)
         return text
+    def _extract_title_from_html(self, html_content: str) -> str:
+        """Extract title from HTML"""
+        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
+        if title_match:
+            title = title_match.group(1)
+            title = re.sub(r'\s+', ' ', title).strip()
+            title = html.unescape(title)
+            return title
+        return ""
+    def _extract_title_from_text(self, text: str) -> str:
+        """Try to extract title from plain text"""
+        # Look for title patterns
+        patterns = [
+            r'Title:\s*(.*?)(?:\n|$)',
+            r'#\s+(.*?)(?:\n|$)',
+            r'^(.*?)(?:\n|$)',
+        ]
+        for pattern in patterns:
+            match = re.search(pattern, text[:500], re.IGNORECASE)
+            if match:
+                title = match.group(1).strip()
+                if len(title) > 10 and len(title) < 200:
+                    return title
+        return ""
     def _clean_content(self, content: str) -> str:
+        """Clean and normalize content"""
         if not content:
             return ""
         # Remove excessive line breaks
         content = re.sub(r'\n{3,}', '\n\n', content)
+        # Remove common unwanted phrases
+        unwanted = [
+            r'adsbygoogle',
+            r'advertisement',
+            r'sponsored content',
+            r'sign up for',
+            r'subscribe to',
+            r'follow us on',
+            r'like us on facebook',
+            r'share this article',
+            r'read more',
+            r'continue reading',
+            r'click here',
+            r'learn more',
+        ]
+        for phrase in unwanted:
+            content = re.sub(phrase, '', content, flags=re.IGNORECASE)
+        # Remove email addresses and URLs
+        content = re.sub(r'\S+@\S+\.\S+', '', content)
+        content = re.sub(r'https?://\S+', '', content)
         return content.strip()
 # ==============================================
 # INITIALIZE
 # ==============================================
+extractor = SimpleContentExtractor()
 # ==============================================
 # FASTAPI APP
 fastapi_app = FastAPI(
     title="Free Content Extractor",
+    description="Extracts content using free Jina Reader API and fallbacks",
     version="1.0"
 )
     return {
         "service": "Free Content Extractor",
         "version": "1.0",
+        "description": "Extracts website content using free Jina Reader API (handles JavaScript)",
         "endpoints": {
             "GET /": "This info",
             "GET /health": "Health check",
+            "POST /extract": "Extract content"
         },
+        "usage_n8n": {
+            "method": "POST",
+            "url": "https://your-space.hf.space/extract",
+            "body": {"url": "https://example.com"}
+        },
+        "alternative": "Use Jina Reader directly: GET https://r.jina.ai/your-url"
     }
 @fastapi_app.get("/health")
                 content={"success": False, "error": "URL is required"}
             )
+        print(f"📨 API Request: {url}")
+        result = extractor.extract_content(url)
         return result
 # ==============================================
 def gradio_extract(url: str):
+    """Gradio interface function"""
     if not url:
         return "❌ Please enter a URL", {}
+    result = extractor.extract_content(url)
     if result["success"]:
         content = result["main_content"]
         content_length = result["content_length"]
+        # Create preview
         preview = content[:800]
         if len(content) > 800:
             preview += "..."
 ## ✅ Success!
 **URL:** {result['url']}
+**Title:** {result.get('title', 'N/A')}
+**Method:** {result.get('method', 'jina_reader')}
 **Time:** {result['execution_time']}s
 **Content Length:** {content_length:,} characters
 ### Preview:
 {preview}
+*Powered by free Jina Reader API*
 """
         return output, result
     else:
         error = result.get("error", "Unknown error")
+        suggestion = result.get("suggestion", "")
+        suggestion_text = f"\n\n{suggestion}" if suggestion else ""
         return f"## ❌ Error\n\n{error}{suggestion_text}", result
         gr.JSON(label="API Response")
     ],
     title="🌐 Free Content Extractor for n8n",
+    description="Uses free Jina Reader API to extract content (handles JavaScript websites)",
     examples=[
         ["https://www.sinchew.com.my/"],
         ["https://example.com"],
         ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
+        ["https://news.ycombinator.com"],
+        ["https://zhihu.com"]
     ]
 )
     print("\n" + "="*60)
     print("🌐 Free Content Extractor Starting")
     print("="*60)
+    print("Primary method: Jina Reader API")
+    print("Secondary: Direct requests + Googlebot")
     print("="*60)
     print("API Endpoint: POST /extract")
+    print("Direct Jina: GET https://r.jina.ai/your-url")
     print("="*60 + "\n")
     uvicorn.run(