Update app.py
app.py
CHANGED
@@ -1,5 +1,5 @@
 # ==============================================
-#
+# ROBUST CONTENT EXTRACTOR WITH BETTER ERROR HANDLING
 # ==============================================
 
 import gradio as gr
@@ -13,17 +13,17 @@ from fastapi import FastAPI, Request
 import uvicorn
 
 # ==============================================
-#
+# IMPROVED CONTENT EXTRACTOR
 # ==============================================
 
-class SimpleContentExtractor:
-    """
+class RobustContentExtractor:
+    """Content extractor with better timeout handling"""
 
     def __init__(self):
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
 
     def extract_content(self, url: str) -> Dict[str, Any]:
-        """Extract content
+        """Extract content with better error handling"""
         start_time = time.time()
 
         print(f"🔍 Extracting: {url}")
@@ -32,13 +32,15 @@ class SimpleContentExtractor:
         if not url.startswith(('http://', 'https://')):
             url = 'https://' + url
 
-        # Try multiple strategies
+        # Try multiple strategies with shorter timeouts
         strategies = [
-            self.
-            self._try_direct_request,
-            self.
+            self._try_jina_reader_fast,  # Faster timeout
+            self._try_direct_request,    # Direct attempt
+            self._try_simple_request,    # Simple headers
         ]
 
+        last_error = None
+
         for i, strategy in enumerate(strategies):
             try:
                 print(f" Trying strategy {i+1}...")
@@ -50,37 +52,39 @@ class SimpleContentExtractor:
                 return result
 
             except Exception as e:
+                last_error = str(e)
                 print(f" Strategy {i+1} failed: {e}")
-                time.sleep(0.
+                time.sleep(0.5)
 
         # All failed
         return {
             "success": False,
             "url": url,
-            "error": "
+            "error": f"All extraction methods failed. Last error: {last_error}",
             "execution_time": round(time.time() - start_time, 2),
-            "suggestion": "
+            "suggestion": "Website may block automated access. Try a different URL."
         }
 
-    def
-        """Try Jina Reader
+    def _try_jina_reader_fast(self, url: str) -> Dict[str, Any]:
+        """Try Jina Reader with shorter timeout"""
         try:
-            #
-
-            # Try
-
-                {"headers": {"Accept": "text/plain"}},
-                {"headers": {"Accept": "application/json"}},
-                {"
+            # Try with shorter timeout first
+            jina_url = f"https://r.jina.ai/{url}"
+
+            # Try multiple approaches
+            attempts = [
+                {"timeout": 15, "headers": {"Accept": "text/plain"}},
+                {"timeout": 20, "headers": {"Accept": "application/json"}},
+                {"timeout": 25, "headers": {"User-Agent": self.user_agent}},
             ]
 
-            for
+            for attempt in attempts:
                 try:
-
-
-
-
+                    response = requests.get(
+                        jina_url,
+                        headers=attempt["headers"],
+                        timeout=attempt["timeout"]
+                    )
 
                     if response.status_code == 200:
                         content = response.text
@@ -93,8 +97,6 @@ class SimpleContentExtractor:
                             content = data["content"]
                         elif "data" in data:
                             content = str(data["data"])
-                        elif "text" in data:
-                            content = data["text"]
                     except:
                         pass  # Keep as plain text
 
@@ -107,44 +109,40 @@ class SimpleContentExtractor:
                     return {
                         "success": True,
                         "url": url,
-                        "title": title[:300] if title else "Extracted via Jina
-                        "main_content": cleaned[:
+                        "title": title[:300] if title else "Extracted via Jina",
+                        "main_content": cleaned[:25000],
                         "content_length": len(cleaned),
-                        "content_preview": cleaned[:
+                        "content_preview": cleaned[:800] + ("..." if len(cleaned) > 800 else ""),
                         "source": "jina_reader",
-                        "note": "Content extracted via free Jina Reader API (handles JavaScript)",
                         "status": response.status_code
                     }
 
+                except requests.exceptions.Timeout:
+                    print(f" Jina timeout after {attempt['timeout']}s, trying next...")
+                    continue
                 except Exception as e:
-                    print(f" Jina
+                    print(f" Jina attempt failed: {e}")
                     continue
 
-            return {"success": False, "error":
+            return {"success": False, "error": "Jina Reader timed out"}
 
         except Exception as e:
-            return {"success": False, "error": f"Jina
+            return {"success": False, "error": f"Jina error: {str(e)}"}
 
     def _try_direct_request(self, url: str) -> Dict[str, Any]:
-        """Try direct
+        """Try direct request with various headers"""
         headers_list = [
-            # Normal browser
             {
                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                 "Accept-Language": "en-US,en;q=0.9",
-                "Accept-Encoding": "gzip, deflate",
-                "Connection": "keep-alive",
             },
-            # Mobile browser
             {
                 "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15",
-                "Accept": "text/html
-                "Accept-Language": "en-US,en;q=0.9",
+                "Accept": "text/html",
             },
-            # Simple headers
             {
-                "User-Agent": "Mozilla/5.0",
+                "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
                 "Accept": "text/html",
             },
         ]
@@ -163,12 +161,12 @@ class SimpleContentExtractor:
             # Extract title
             title = self._extract_title_from_html(html_content)
 
-            if len(cleaned) > 100:
+            if len(cleaned) > 100:
                 return {
                     "success": True,
                     "url": url,
-                    "title": title[:300] if title else "
-                    "main_content": cleaned[:
+                    "title": title[:300] if title else "Direct extraction",
+                    "main_content": cleaned[:20000],
                     "content_length": len(cleaned),
                     "source": "direct_request",
                     "status": response.status_code
@@ -180,16 +178,15 @@ class SimpleContentExtractor:
 
         return {"success": False}
 
-    def
-        """
+    def _try_simple_request(self, url: str) -> Dict[str, Any]:
+        """Simple request with minimal headers"""
         try:
-
-
-            "
-
-
-
-            response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
+            response = requests.get(
+                url,
+                headers={"User-Agent": "Mozilla/5.0"},
+                timeout=8,
+                allow_redirects=True
+            )
 
             if response.status_code == 200:
                 html_content = response.text
@@ -197,15 +194,14 @@ class SimpleContentExtractor:
             cleaned = self._clean_content(text_content)
             title = self._extract_title_from_html(html_content)
 
-            if len(cleaned) >
+            if len(cleaned) > 50:
                 return {
                     "success": True,
                     "url": url,
-                    "title": title[:
-                    "main_content": cleaned[:
+                    "title": title[:200] if title else "Simple extraction",
+                    "main_content": cleaned[:15000],
                     "content_length": len(cleaned),
-                    "source": "
-                    "status": response.status_code
+                    "source": "simple_request"
                 }
 
             return {"success": False}
@@ -216,11 +212,11 @@ class SimpleContentExtractor:
     def _extract_from_html(self, html_content: str) -> str:
         """Extract text from HTML"""
         # Remove scripts and styles
-        html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL
-        html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL
+        html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
+        html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
         html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
 
-        # Remove unwanted
+        # Remove unwanted tags
         unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe']
         for tag in unwanted_tags:
             html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
@@ -229,7 +225,7 @@ class SimpleContentExtractor:
         text = re.sub(r'<[^>]+>', ' ', html_content)
         text = html.unescape(text)
 
-        #
+        # Clean up
         text = re.sub(r'\s+', ' ', text)
 
         return text
@@ -245,16 +241,16 @@ class SimpleContentExtractor:
         return ""
 
     def _extract_title_from_text(self, text: str) -> str:
-        """Try to extract title from
+        """Try to extract title from text"""
        # Look for title patterns
         patterns = [
             r'Title:\s*(.*?)(?:\n|$)',
             r'#\s+(.*?)(?:\n|$)',
-            r'
+            r'<title[^>]*>(.*?)</title>',
         ]
 
         for pattern in patterns:
-            match = re.search(pattern, text[:
+            match = re.search(pattern, text[:1000], re.IGNORECASE)
             if match:
                 title = match.group(1).strip()
                 if len(title) > 10 and len(title) < 200:
@@ -284,37 +280,30 @@ class SimpleContentExtractor:
             r'sign up for',
             r'subscribe to',
             r'follow us on',
-            r'like us on facebook',
             r'share this article',
             r'read more',
             r'continue reading',
-            r'click here',
-            r'learn more',
         ]
 
         for phrase in unwanted:
             content = re.sub(phrase, '', content, flags=re.IGNORECASE)
 
-        # Remove email addresses and URLs
-        content = re.sub(r'\S+@\S+\.\S+', '', content)
-        content = re.sub(r'https?://\S+', '', content)
-
         return content.strip()
 
 # ==============================================
 # INITIALIZE
 # ==============================================
 
-extractor = SimpleContentExtractor()
+extractor = RobustContentExtractor()
 
 # ==============================================
 # FASTAPI APP
 # ==============================================
 
 fastapi_app = FastAPI(
-    title="
-    description="Extracts content
-    version="
+    title="Robust Content Extractor",
+    description="Extracts content with better timeout handling",
+    version="2.0"
 )
 
 from fastapi.middleware.cors import CORSMiddleware
@@ -331,25 +320,25 @@ fastapi_app.add_middleware(
 @fastapi_app.get("/")
 async def root():
     return {
-        "service": "
-        "version": "
-        "description": "Extracts website content
+        "service": "Robust Content Extractor",
+        "version": "2.0",
+        "description": "Extracts website content with better error handling",
         "endpoints": {
             "GET /": "This info",
-            "GET /health": "Health check",
+            "GET /health": "Health check (fast)",
             "POST /extract": "Extract content"
         },
-        "
-            "method": "POST",
-            "url": "https://your-space.hf.space/extract",
-            "body": {"url": "https://example.com"}
-        },
-        "alternative": "Use Jina Reader directly: GET https://r.jina.ai/your-url"
+        "timeout_notes": "Jina Reader timeout reduced to 15-25 seconds for faster response"
     }
 
 @fastapi_app.get("/health")
 async def health():
-
+    """Fast health check endpoint for wake-up calls"""
+    return {
+        "status": "healthy",
+        "timestamp": time.time(),
+        "service": "content_extractor"
+    }
 
 @fastapi_app.post("/extract")
 async def api_extract(request: Request):
@@ -385,7 +374,7 @@ async def api_extract(request: Request):
 # ==============================================
 
 def gradio_extract(url: str):
-    """Gradio interface
+    """Gradio interface"""
     if not url:
         return "❌ Please enter a URL", {}
@@ -395,9 +384,8 @@ def gradio_extract(url: str):
         content = result["main_content"]
         content_length = result["content_length"]
 
-
-
-        if len(content) > 800:
+        preview = content[:500]
+        if len(content) > 500:
             preview += "..."
 
         output = f"""
@@ -405,44 +393,36 @@ def gradio_extract(url: str):
 
 **URL:** {result['url']}
 **Title:** {result.get('title', 'N/A')}
-**Method:** {result.get('method', '
+**Method:** {result.get('method', 'extracted')}
 **Time:** {result['execution_time']}s
-**
+**Characters:** {content_length:,}
 
 ### Preview:
 {preview}
-
-*Powered by free Jina Reader API*
 """
         return output, result
     else:
         error = result.get("error", "Unknown error")
-
-
-        suggestion_text = f"\n\n{suggestion}" if suggestion else ""
-
-        return f"## ❌ Error\n\n{error}{suggestion_text}", result
+        return f"## ❌ Error\n\n{error}", result
 
 # Create Gradio interface
 gradio_interface = gr.Interface(
     fn=gradio_extract,
     inputs=gr.Textbox(
         label="Website URL",
-        placeholder="https://
-        value="https://
+        placeholder="https://example.com",
+        value="https://example.com"
     ),
     outputs=[
         gr.Markdown(label="Result"),
         gr.JSON(label="API Response")
     ],
-    title="🌐
-    description="
+    title="🌐 Robust Content Extractor",
+    description="Extracts content with better error handling and timeouts",
     examples=[
-        ["https://www.sinchew.com.my/"],
         ["https://example.com"],
         ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
-        ["https://news.ycombinator.com"]
-        ["https://zhihu.com"]
+        ["https://news.ycombinator.com"]
     ]
 )
@@ -458,13 +438,16 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
 
 if __name__ == "__main__":
     print("\n" + "="*60)
-    print("🌐
+    print("🌐 Robust Content Extractor Starting")
     print("="*60)
-    print("
-    print("
+    print("Features:")
+    print("• Faster timeouts (15-25s for Jina)")
+    print("• Multiple fallback strategies")
+    print("• Fast health endpoint for wake-up")
     print("="*60)
-    print("API
-    print("
+    print("API Endpoints:")
+    print("• GET /health - Fast health check (for wake-up)")
+    print("• POST /extract - Extract content")
     print("="*60 + "\n")
 
     uvicorn.run(