yukee1992 committed on
Commit
5d4e21f
·
verified ·
1 Parent(s): 2448858

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -97
app.py CHANGED
@@ -11,6 +11,7 @@ import html
11
  from typing import Dict, Any
12
  from fastapi import FastAPI, Request
13
  import uvicorn
 
14
 
15
  # ==============================================
16
  # IMPROVED CONTENT EXTRACTOR
@@ -32,11 +33,30 @@ class RobustContentExtractor:
32
  if not url.startswith(('http://', 'https://')):
33
  url = 'https://' + url
34
 
35
- # Try multiple strategies with shorter timeouts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  strategies = [
37
  self._try_jina_reader_fast, # Faster timeout
38
  self._try_direct_request, # Direct attempt
39
  self._try_simple_request, # Simple headers
 
40
  ]
41
 
42
  last_error = None
@@ -49,12 +69,13 @@ class RobustContentExtractor:
49
  if result.get("success"):
50
  result["execution_time"] = round(time.time() - start_time, 2)
51
  result["method"] = f"strategy_{i+1}"
 
52
  return result
53
 
54
  except Exception as e:
55
  last_error = str(e)
56
  print(f" Strategy {i+1} failed: {e}")
57
- time.sleep(0.5)
58
 
59
  # All failed
60
  return {
@@ -68,88 +89,98 @@ class RobustContentExtractor:
68
  def _try_jina_reader_fast(self, url: str) -> Dict[str, Any]:
69
  """Try Jina Reader with shorter timeout"""
70
  try:
71
- # Try with shorter timeout first
72
  jina_url = f"https://r.jina.ai/{url}"
73
 
74
- # Try multiple approaches
75
- attempts = [
76
- {"timeout": 15, "headers": {"Accept": "text/plain"}},
77
- {"timeout": 20, "headers": {"Accept": "application/json"}},
78
- {"timeout": 25, "headers": {"User-Agent": self.user_agent}},
79
- ]
 
 
 
80
 
81
- for attempt in attempts:
 
 
 
82
  try:
83
- response = requests.get(
84
- jina_url,
85
- headers=attempt["headers"],
86
- timeout=attempt["timeout"]
87
- )
88
-
89
- if response.status_code == 200:
90
- content = response.text
91
-
92
- # Try to parse as JSON
93
- try:
94
- data = json.loads(content)
95
- if isinstance(data, dict):
96
- if "content" in data:
97
- content = data["content"]
98
- elif "data" in data:
99
- content = str(data["data"])
100
- except:
101
- pass # Keep as plain text
102
-
103
- # Extract title
104
- title = self._extract_title_from_text(content)
105
-
106
- # Clean content
107
- cleaned = self._clean_content(content)
108
-
109
- return {
110
- "success": True,
111
- "url": url,
112
- "title": title[:300] if title else "Extracted via Jina",
113
- "main_content": cleaned[:25000],
114
- "content_length": len(cleaned),
115
- "content_preview": cleaned[:800] + ("..." if len(cleaned) > 800 else ""),
116
- "source": "jina_reader",
117
- "status": response.status_code
118
- }
119
-
120
- except requests.exceptions.Timeout:
121
- print(f" Jina timeout after {attempt['timeout']}s, trying next...")
122
- continue
123
- except Exception as e:
124
- print(f" Jina attempt failed: {e}")
125
- continue
126
 
127
- return {"success": False, "error": "Jina Reader timed out"}
128
 
 
 
 
129
  except Exception as e:
 
130
  return {"success": False, "error": f"Jina error: {str(e)}"}
131
 
132
  def _try_direct_request(self, url: str) -> Dict[str, Any]:
133
  """Try direct request with various headers"""
134
  headers_list = [
135
  {
136
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
137
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
138
  "Accept-Language": "en-US,en;q=0.9",
 
 
 
 
 
 
 
 
 
139
  },
140
  {
141
- "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15",
142
- "Accept": "text/html",
143
  },
144
  {
145
  "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
146
- "Accept": "text/html",
147
  },
148
  ]
149
 
150
- for headers in headers_list:
151
  try:
152
- response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
 
 
 
 
 
 
 
 
 
153
 
154
  if response.status_code == 200:
155
  html_content = response.text
@@ -168,26 +199,33 @@ class RobustContentExtractor:
168
  "title": title[:300] if title else "Direct extraction",
169
  "main_content": cleaned[:20000],
170
  "content_length": len(cleaned),
171
- "source": "direct_request",
172
  "status": response.status_code
173
  }
174
 
 
 
 
175
  except Exception as e:
176
- print(f" Direct request failed: {e}")
177
  continue
178
 
179
- return {"success": False}
180
 
181
  def _try_simple_request(self, url: str) -> Dict[str, Any]:
182
  """Simple request with minimal headers"""
183
  try:
 
184
  response = requests.get(
185
  url,
186
  headers={"User-Agent": "Mozilla/5.0"},
187
  timeout=8,
188
- allow_redirects=True
 
189
  )
190
 
 
 
191
  if response.status_code == 200:
192
  html_content = response.text
193
  text_content = self._extract_from_html(html_content)
@@ -204,11 +242,60 @@ class RobustContentExtractor:
204
  "source": "simple_request"
205
  }
206
 
207
- return {"success": False}
208
 
209
  except Exception as e:
210
  return {"success": False, "error": str(e)}
211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  def _extract_from_html(self, html_content: str) -> str:
213
  """Extract text from HTML"""
214
  # Remove scripts and styles
@@ -217,7 +304,7 @@ class RobustContentExtractor:
217
  html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
218
 
219
  # Remove unwanted tags
220
- unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe']
221
  for tag in unwanted_tags:
222
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
223
 
@@ -232,12 +319,32 @@ class RobustContentExtractor:
232
 
233
  def _extract_title_from_html(self, html_content: str) -> str:
234
  """Extract title from HTML"""
235
- title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
 
236
  if title_match:
237
  title = title_match.group(1)
238
  title = re.sub(r'\s+', ' ', title).strip()
239
  title = html.unescape(title)
240
- return title
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  return ""
242
 
243
  def _extract_title_from_text(self, text: str) -> str:
@@ -272,22 +379,6 @@ class RobustContentExtractor:
272
  # Remove excessive line breaks
273
  content = re.sub(r'\n{3,}', '\n\n', content)
274
 
275
- # Remove common unwanted phrases
276
- unwanted = [
277
- r'adsbygoogle',
278
- r'advertisement',
279
- r'sponsored content',
280
- r'sign up for',
281
- r'subscribe to',
282
- r'follow us on',
283
- r'share this article',
284
- r'read more',
285
- r'continue reading',
286
- ]
287
-
288
- for phrase in unwanted:
289
- content = re.sub(phrase, '', content, flags=re.IGNORECASE)
290
-
291
  return content.strip()
292
 
293
  # ==============================================
@@ -303,7 +394,7 @@ extractor = RobustContentExtractor()
303
  fastapi_app = FastAPI(
304
  title="Robust Content Extractor",
305
  description="Extracts content with better timeout handling",
306
- version="2.0"
307
  )
308
 
309
  from fastapi.middleware.cors import CORSMiddleware
@@ -321,14 +412,13 @@ fastapi_app.add_middleware(
321
  async def root():
322
  return {
323
  "service": "Robust Content Extractor",
324
- "version": "2.0",
325
- "description": "Extracts website content with better error handling",
326
  "endpoints": {
327
  "GET /": "This info",
328
  "GET /health": "Health check (fast)",
329
  "POST /extract": "Extract content"
330
- },
331
- "timeout_notes": "Jina Reader timeout reduced to 15-25 seconds for faster response"
332
  }
333
 
334
  @fastapi_app.get("/health")
@@ -354,7 +444,14 @@ async def api_extract(request: Request):
354
  )
355
 
356
  print(f"📨 API Request: {url}")
 
 
 
357
  result = extractor.extract_content(url)
 
 
 
 
358
 
359
  return result
360
 
@@ -364,9 +461,14 @@ async def api_extract(request: Request):
364
  content={"success": False, "error": "Invalid JSON"}
365
  )
366
  except Exception as e:
 
367
  return JSONResponse(
368
  status_code=500,
369
- content={"success": False, "error": str(e)}
 
 
 
 
370
  )
371
 
372
  # ==============================================
@@ -417,8 +519,8 @@ gradio_interface = gr.Interface(
417
  gr.Markdown(label="Result"),
418
  gr.JSON(label="API Response")
419
  ],
420
- title="🌐 Robust Content Extractor",
421
- description="Extracts content with better error handling and timeouts",
422
  examples=[
423
  ["https://example.com"],
424
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
@@ -438,15 +540,15 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
438
 
439
  if __name__ == "__main__":
440
  print("\n" + "="*60)
441
- print("🌐 Robust Content Extractor Starting")
442
  print("="*60)
443
  print("Features:")
444
- print("• Faster timeouts (15-25s for Jina)")
445
  print("• Multiple fallback strategies")
446
- print("• Fast health endpoint for wake-up")
 
447
  print("="*60)
448
  print("API Endpoints:")
449
- print("• GET /health - Fast health check (for wake-up)")
450
  print("• POST /extract - Extract content")
451
  print("="*60 + "\n")
452
 
 
11
  from typing import Dict, Any
12
  from fastapi import FastAPI, Request
13
  import uvicorn
14
+ import traceback
15
 
16
  # ==============================================
17
  # IMPROVED CONTENT EXTRACTOR
 
33
  if not url.startswith(('http://', 'https://')):
34
  url = 'https://' + url
35
 
36
+ # Clean URL - remove any problematic characters
37
+ try:
38
+ from urllib.parse import quote, urlparse, urlunparse
39
+ parsed = urlparse(url)
40
+ # Only encode the path and query
41
+ encoded_path = quote(parsed.path, safe='/')
42
+ encoded_query = quote(parsed.query, safe='=&')
43
+ url = urlunparse((
44
+ parsed.scheme,
45
+ parsed.netloc,
46
+ encoded_path,
47
+ parsed.params,
48
+ encoded_query,
49
+ parsed.fragment
50
+ ))
51
+ except:
52
+ pass # Keep original if encoding fails
53
+
54
+ # Try multiple strategies
55
  strategies = [
56
  self._try_jina_reader_fast, # Faster timeout
57
  self._try_direct_request, # Direct attempt
58
  self._try_simple_request, # Simple headers
59
+ self._try_fallback_request, # Fallback with different settings
60
  ]
61
 
62
  last_error = None
 
69
  if result.get("success"):
70
  result["execution_time"] = round(time.time() - start_time, 2)
71
  result["method"] = f"strategy_{i+1}"
72
+ print(f" ✓ Strategy {i+1} succeeded")
73
  return result
74
 
75
  except Exception as e:
76
  last_error = str(e)
77
  print(f" Strategy {i+1} failed: {e}")
78
+ time.sleep(1) # Short pause between strategies
79
 
80
  # All failed
81
  return {
 
89
def _try_jina_reader_fast(self, url: str) -> Dict[str, Any]:
    """Fetch *url* through the Jina Reader proxy (https://r.jina.ai) with a short timeout.

    Returns a dict with ``"success": True`` plus title/content fields on
    success, or ``"success": False`` with an ``"error"`` message on any
    failure (non-200 status, timeout, or unexpected exception).
    """
    try:
        # Jina Reader proxies the target URL and returns a readable text rendering.
        jina_url = f"https://r.jina.ai/{url}"

        response = requests.get(
            jina_url,
            headers={
                "Accept": "text/plain",
                "User-Agent": self.user_agent
            },
            timeout=12  # short timeout so the slower fallback strategies still get a chance
        )

        if response.status_code == 200:
            content = response.text

            # Jina may answer with a JSON envelope; unwrap the payload if so.
            # Was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt
            # are no longer swallowed here.
            try:
                data = json.loads(content)
                if isinstance(data, dict):
                    if "content" in data:
                        content = data["content"]
                    elif "data" in data:
                        content = str(data["data"])
            except (json.JSONDecodeError, ValueError):
                pass  # Not JSON — keep as plain text

            # Extract title
            title = self._extract_title_from_text(content)

            # Clean content
            cleaned = self._clean_content(content)

            return {
                "success": True,
                "url": url,
                "title": title[:300] if title else "Extracted via Jina",
                "main_content": cleaned[:25000],
                "content_length": len(cleaned),
                "content_preview": cleaned[:800] + ("..." if len(cleaned) > 800 else ""),
                "source": "jina_reader",
                "status": response.status_code
            }

        return {"success": False, "error": f"Jina status: {response.status_code}"}

    except requests.exceptions.Timeout:
        # Plain string (original was an f-string with no placeholders).
        print(" Jina timeout after 12s, trying next strategy...")
        return {"success": False, "error": "Jina Reader timed out"}
    except Exception as e:
        print(f" Jina error: {e}")
        return {"success": False, "error": f"Jina error: {str(e)}"}
144
 
145
  def _try_direct_request(self, url: str) -> Dict[str, Any]:
146
  """Try direct request with various headers"""
147
  headers_list = [
148
  {
149
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
150
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
151
  "Accept-Language": "en-US,en;q=0.9",
152
+ "Accept-Encoding": "gzip, deflate, br",
153
+ "DNT": "1",
154
+ "Connection": "keep-alive",
155
+ "Upgrade-Insecure-Requests": "1",
156
+ "Sec-Fetch-Dest": "document",
157
+ "Sec-Fetch-Mode": "navigate",
158
+ "Sec-Fetch-Site": "none",
159
+ "Sec-Fetch-User": "?1",
160
+ "Cache-Control": "max-age=0",
161
  },
162
  {
163
+ "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
164
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
165
  },
166
  {
167
  "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
168
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
169
  },
170
  ]
171
 
172
+ for i, headers in enumerate(headers_list):
173
  try:
174
+ print(f" Direct attempt {i+1}...")
175
+ response = requests.get(
176
+ url,
177
+ headers=headers,
178
+ timeout=10,
179
+ allow_redirects=True,
180
+ verify=False # Try without SSL verification
181
+ )
182
+
183
+ print(f" Status: {response.status_code}")
184
 
185
  if response.status_code == 200:
186
  html_content = response.text
 
199
  "title": title[:300] if title else "Direct extraction",
200
  "main_content": cleaned[:20000],
201
  "content_length": len(cleaned),
202
+ "source": f"direct_request_{i+1}",
203
  "status": response.status_code
204
  }
205
 
206
+ except requests.exceptions.Timeout:
207
+ print(f" Direct request {i+1} timed out")
208
+ continue
209
  except Exception as e:
210
+ print(f" Direct request {i+1} error: {e}")
211
  continue
212
 
213
+ return {"success": False, "error": "All direct attempts failed"}
214
 
215
  def _try_simple_request(self, url: str) -> Dict[str, Any]:
216
  """Simple request with minimal headers"""
217
  try:
218
+ print(" Simple request attempt...")
219
  response = requests.get(
220
  url,
221
  headers={"User-Agent": "Mozilla/5.0"},
222
  timeout=8,
223
+ allow_redirects=True,
224
+ verify=False
225
  )
226
 
227
+ print(f" Simple status: {response.status_code}")
228
+
229
  if response.status_code == 200:
230
  html_content = response.text
231
  text_content = self._extract_from_html(html_content)
 
242
  "source": "simple_request"
243
  }
244
 
245
+ return {"success": False, "error": f"Status: {response.status_code}"}
246
 
247
  except Exception as e:
248
  return {"success": False, "error": str(e)}
249
 
250
def _try_fallback_request(self, url: str) -> Dict[str, Any]:
    """Last-resort fetch: a plain ``requests.Session`` with browser-like headers.

    Returns ``{"success": True, ...}`` with loosely extracted page text, or
    ``{"success": False, "error": ...}`` when the page cannot be fetched or
    yields too little text.
    """
    try:
        print(" Fallback attempt...")

        # Context manager so the session's connection pool is released even
        # when the request raises (the original session was never closed).
        with requests.Session() as session:
            session.headers.update({
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                "Accept": "text/html",
            })

            # SECURITY NOTE: verify=False disables TLS certificate checking.
            # This is a deliberate last-ditch fallback, but the response must
            # be treated as untrusted content.
            response = session.get(url, timeout=15, allow_redirects=True, verify=False)

        if response.status_code == 200:
            html_content = response.text

            # Very simple text extraction
            text = self._simple_text_extraction(html_content)

            # Require a minimum body length so an empty/error page does not
            # count as a successful extraction.
            if len(text) > 50:
                return {
                    "success": True,
                    "url": url,
                    "title": "Fallback extraction",
                    "main_content": text[:10000],
                    "content_length": len(text),
                    "source": "fallback",
                    "status": response.status_code
                }

        return {"success": False, "error": f"Fallback status: {response.status_code}"}

    except Exception as e:
        return {"success": False, "error": f"Fallback error: {str(e)}"}
285
+
286
+ def _simple_text_extraction(self, html_content: str) -> str:
287
+ """Very simple text extraction"""
288
+ # Remove scripts and styles
289
+ html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
290
+ html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
291
+
292
+ # Extract text between tags
293
+ text = re.sub(r'<[^>]+>', ' ', html_content)
294
+ text = html.unescape(text)
295
+ text = re.sub(r'\s+', ' ', text)
296
+
297
+ return text.strip()
298
+
299
  def _extract_from_html(self, html_content: str) -> str:
300
  """Extract text from HTML"""
301
  # Remove scripts and styles
 
304
  html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
305
 
306
  # Remove unwanted tags
307
+ unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe', 'svg', 'button']
308
  for tag in unwanted_tags:
309
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
310
 
 
319
 
320
  def _extract_title_from_html(self, html_content: str) -> str:
321
  """Extract title from HTML"""
322
+ # Try <title> tag
323
+ title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
324
  if title_match:
325
  title = title_match.group(1)
326
  title = re.sub(r'\s+', ' ', title).strip()
327
  title = html.unescape(title)
328
+ if title:
329
+ return title[:200]
330
+
331
+ # Try meta title
332
+ meta_match = re.search(r'<meta[^>]*property=["\']og:title["\'][^>]*content=["\'](.*?)["\']', html_content, re.IGNORECASE)
333
+ if meta_match:
334
+ title = meta_match.group(1)
335
+ title = html.unescape(title).strip()
336
+ if title:
337
+ return title[:200]
338
+
339
+ # Try h1
340
+ h1_match = re.search(r'<h1[^>]*>(.*?)</h1>', html_content, re.IGNORECASE | re.DOTALL)
341
+ if h1_match:
342
+ title = h1_match.group(1)
343
+ title = re.sub(r'<[^>]+>', '', title)
344
+ title = html.unescape(title).strip()
345
+ if title:
346
+ return title[:200]
347
+
348
  return ""
349
 
350
  def _extract_title_from_text(self, text: str) -> str:
 
379
  # Remove excessive line breaks
380
  content = re.sub(r'\n{3,}', '\n\n', content)
381
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
  return content.strip()
383
 
384
  # ==============================================
 
394
  fastapi_app = FastAPI(
395
  title="Robust Content Extractor",
396
  description="Extracts content with better timeout handling",
397
+ version="2.1"
398
  )
399
 
400
  from fastapi.middleware.cors import CORSMiddleware
 
412
  async def root():
413
  return {
414
  "service": "Robust Content Extractor",
415
+ "version": "2.1",
416
+ "description": "Extracts website content with multiple fallback strategies",
417
  "endpoints": {
418
  "GET /": "This info",
419
  "GET /health": "Health check (fast)",
420
  "POST /extract": "Extract content"
421
+ }
 
422
  }
423
 
424
  @fastapi_app.get("/health")
 
444
  )
445
 
446
  print(f"📨 API Request: {url}")
447
+ print(f" Starting extraction at {time.strftime('%Y-%m-%d %H:%M:%S')}")
448
+
449
+ start_time = time.time()
450
  result = extractor.extract_content(url)
451
+ elapsed = time.time() - start_time
452
+
453
+ print(f" Extraction completed in {elapsed:.2f}s")
454
+ print(f" Success: {result.get('success')}")
455
 
456
  return result
457
 
 
461
  content={"success": False, "error": "Invalid JSON"}
462
  )
463
  except Exception as e:
464
+ print(f" API Error: {traceback.format_exc()}")
465
  return JSONResponse(
466
  status_code=500,
467
+ content={
468
+ "success": False,
469
+ "error": str(e),
470
+ "traceback": traceback.format_exc()[:500]
471
+ }
472
  )
473
 
474
  # ==============================================
 
519
  gr.Markdown(label="Result"),
520
  gr.JSON(label="API Response")
521
  ],
522
+ title="🌐 Robust Content Extractor v2.1",
523
+ description="Extracts content with better error handling and multiple fallbacks",
524
  examples=[
525
  ["https://example.com"],
526
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
 
540
 
541
  if __name__ == "__main__":
542
  print("\n" + "="*60)
543
+ print("🌐 Robust Content Extractor v2.1 Starting")
544
  print("="*60)
545
  print("Features:")
 
546
  print("• Multiple fallback strategies")
547
+ print("• Better error handling")
548
+ print("• URL encoding support")
549
  print("="*60)
550
  print("API Endpoints:")
551
+ print("• GET /health - Fast health check")
552
  print("• POST /extract - Extract content")
553
  print("="*60 + "\n")
554