Spaces:

yukee1992
/

Screenshot-scraper

Sleeping

App Files Files Community

yukee1992 committed 27 days ago

Commit

15aced8

verified ·

1 Parent(s): 5d4e21f

Update app.py

Browse files

Files changed (1) hide show

app.py +357 -329

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # ==============================================
-# ROBUST CONTENT EXTRACTOR WITH BETTER ERROR HANDLING
 # ==============================================
 import gradio as gr
@@ -12,54 +12,37 @@ from typing import Dict, Any
 from fastapi import FastAPI, Request
 import uvicorn
 import traceback
 # ==============================================
-# IMPROVED CONTENT EXTRACTOR
 # ==============================================
-class RobustContentExtractor:
-    """Content extractor with better timeout handling"""
     def __init__(self):
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
     def extract_content(self, url: str) -> Dict[str, Any]:
-        """Extract content with better error handling"""
         start_time = time.time()
-        print(f"🌐 Extracting: {url}")
         # Ensure URL has protocol
         if not url.startswith(('http://', 'https://')):
             url = 'https://' + url
-        # Clean URL - remove any problematic characters
-        try:
-            from urllib.parse import quote, urlparse, urlunparse
-            parsed = urlparse(url)
-            # Only encode the path and query
-            encoded_path = quote(parsed.path, safe='/')
-            encoded_query = quote(parsed.query, safe='=&')
-            url = urlunparse((
-                parsed.scheme,
-                parsed.netloc,
-                encoded_path,
-                parsed.params,
-                encoded_query,
-                parsed.fragment
-            ))
-        except:
-            pass  # Keep original if encoding fails
         # Try multiple strategies
         strategies = [
-            self._try_jina_reader_fast,  # Faster timeout
-            self._try_direct_request,    # Direct attempt
-            self._try_simple_request,    # Simple headers
-            self._try_fallback_request,  # Fallback with different settings
         ]
-        last_error = None
         for i, strategy in enumerate(strategies):
             try:
@@ -67,334 +50,383 @@ class RobustContentExtractor:
                 result = strategy(url)
                 if result.get("success"):
-                    result["execution_time"] = round(time.time() - start_time, 2)
-                    result["method"] = f"strategy_{i+1}"
-                    print(f"  ✓ Strategy {i+1} succeeded")
-                    return result
             except Exception as e:
-                last_error = str(e)
                 print(f"  Strategy {i+1} failed: {e}")
-                time.sleep(1)  # Short pause between strategies
-        # All failed
         return {
             "success": False,
             "url": url,
-            "error": f"All extraction methods failed. Last error: {last_error}",
             "execution_time": round(time.time() - start_time, 2),
-            "suggestion": "Website may block automated access. Try a different URL."
         }
-    def _try_jina_reader_fast(self, url: str) -> Dict[str, Any]:
-        """Try Jina Reader with shorter timeout"""
         try:
-            # Use encoded URL for Jina
             jina_url = f"https://r.jina.ai/{url}"
-            # Try with very short timeout first
             response = requests.get(
                 jina_url,
-                headers={
-                    "Accept": "text/plain",
-                    "User-Agent": self.user_agent
-                },
-                timeout=12  # Reduced from 15s
             )
             if response.status_code == 200:
                 content = response.text
-                # Try to parse as JSON
-                try:
-                    data = json.loads(content)
-                    if isinstance(data, dict):
-                        if "content" in data:
-                            content = data["content"]
-                        elif "data" in data:
-                            content = str(data["data"])
-                except:
-                    pass  # Keep as plain text
-                # Extract title
-                title = self._extract_title_from_text(content)
-                # Clean content
-                cleaned = self._clean_content(content)
                 return {
                     "success": True,
                     "url": url,
-                    "title": title[:300] if title else "Extracted via Jina",
-                    "main_content": cleaned[:25000],
                     "content_length": len(cleaned),
-                    "content_preview": cleaned[:800] + ("..." if len(cleaned) > 800 else ""),
                     "source": "jina_reader",
                     "status": response.status_code
                 }
             return {"success": False, "error": f"Jina status: {response.status_code}"}
-        except requests.exceptions.Timeout:
-            print(f"    Jina timeout after 12s, trying next strategy...")
-            return {"success": False, "error": "Jina Reader timed out"}
         except Exception as e:
-            print(f"    Jina error: {e}")
             return {"success": False, "error": f"Jina error: {str(e)}"}
-    def _try_direct_request(self, url: str) -> Dict[str, Any]:
-        """Try direct request with various headers"""
-        headers_list = [
-            {
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-                "Accept-Language": "en-US,en;q=0.9",
-                "Accept-Encoding": "gzip, deflate, br",
-                "DNT": "1",
-                "Connection": "keep-alive",
-                "Upgrade-Insecure-Requests": "1",
-                "Sec-Fetch-Dest": "document",
-                "Sec-Fetch-Mode": "navigate",
-                "Sec-Fetch-Site": "none",
-                "Sec-Fetch-User": "?1",
-                "Cache-Control": "max-age=0",
-            },
-            {
-                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-            },
-            {
-                "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-            },
-        ]
-        for i, headers in enumerate(headers_list):
-            try:
-                print(f"    Direct attempt {i+1}...")
-                response = requests.get(
-                    url,
-                    headers=headers,
-                    timeout=10,
-                    allow_redirects=True,
-                    verify=False  # Try without SSL verification
-                )
-                print(f"    Status: {response.status_code}")
-                if response.status_code == 200:
-                    html_content = response.text
-                    # Extract content
-                    text_content = self._extract_from_html(html_content)
-                    cleaned = self._clean_content(text_content)
-                    # Extract title
-                    title = self._extract_title_from_html(html_content)
-                    if len(cleaned) > 100:
-                        return {
-                            "success": True,
-                            "url": url,
-                            "title": title[:300] if title else "Direct extraction",
-                            "main_content": cleaned[:20000],
-                            "content_length": len(cleaned),
-                            "source": f"direct_request_{i+1}",
-                            "status": response.status_code
-                        }
-            except requests.exceptions.Timeout:
-                print(f"    Direct request {i+1} timed out")
-                continue
-            except Exception as e:
-                print(f"    Direct request {i+1} error: {e}")
-                continue
-        return {"success": False, "error": "All direct attempts failed"}
-    def _try_simple_request(self, url: str) -> Dict[str, Any]:
-        """Simple request with minimal headers"""
         try:
-            print("    Simple request attempt...")
-            response = requests.get(
-                url,
-                headers={"User-Agent": "Mozilla/5.0"},
-                timeout=8,
-                allow_redirects=True,
-                verify=False
-            )
-            print(f"    Simple status: {response.status_code}")
             if response.status_code == 200:
-                html_content = response.text
-                text_content = self._extract_from_html(html_content)
-                cleaned = self._clean_content(text_content)
-                title = self._extract_title_from_html(html_content)
-                if len(cleaned) > 50:
-                    return {
-                        "success": True,
-                        "url": url,
-                        "title": title[:200] if title else "Simple extraction",
-                        "main_content": cleaned[:15000],
-                        "content_length": len(cleaned),
-                        "source": "simple_request"
-                    }
-            return {"success": False, "error": f"Status: {response.status_code}"}
-        except Exception as e:
-            return {"success": False, "error": str(e)}
-    def _try_fallback_request(self, url: str) -> Dict[str, Any]:
-        """Fallback using alternative methods"""
-        try:
-            print("    Fallback attempt...")
-            # Try with requests session
-            session = requests.Session()
-            session.headers.update({
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
-                "Accept": "text/html",
-            })
-            response = session.get(url, timeout=15, allow_redirects=True, verify=False)
-            if response.status_code == 200:
-                html_content = response.text
-                # Very simple text extraction
-                text = self._simple_text_extraction(html_content)
-                if len(text) > 50:
                     return {
                         "success": True,
                         "url": url,
-                        "title": "Fallback extraction",
-                        "main_content": text[:10000],
-                        "content_length": len(text),
-                        "source": "fallback",
-                        "status": response.status_code
                     }
-            return {"success": False, "error": f"Fallback status: {response.status_code}"}
         except Exception as e:
-            return {"success": False, "error": f"Fallback error: {str(e)}"}
-    def _simple_text_extraction(self, html_content: str) -> str:
-        """Very simple text extraction"""
-        # Remove scripts and styles
-        html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
-        html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
-        # Extract text between tags
-        text = re.sub(r'<[^>]+>', ' ', html_content)
-        text = html.unescape(text)
-        text = re.sub(r'\s+', ' ', text)
-        return text.strip()
-    def _extract_from_html(self, html_content: str) -> str:
-        """Extract text from HTML"""
-        # Remove scripts and styles
-        html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
-        html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
-        html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
-        # Remove unwanted tags
-        unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe', 'svg', 'button']
-        for tag in unwanted_tags:
-            html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
-        # Extract text
-        text = re.sub(r'<[^>]+>', ' ', html_content)
-        text = html.unescape(text)
-        # Clean up
-        text = re.sub(r'\s+', ' ', text)
-        return text
-    def _extract_title_from_html(self, html_content: str) -> str:
-        """Extract title from HTML"""
-        # Try <title> tag
-        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
-        if title_match:
-            title = title_match.group(1)
-            title = re.sub(r'\s+', ' ', title).strip()
-            title = html.unescape(title)
-            if title:
-                return title[:200]
-        # Try meta title
-        meta_match = re.search(r'<meta[^>]*property=["\']og:title["\'][^>]*content=["\'](.*?)["\']', html_content, re.IGNORECASE)
-        if meta_match:
-            title = meta_match.group(1)
-            title = html.unescape(title).strip()
-            if title:
-                return title[:200]
-        # Try h1
-        h1_match = re.search(r'<h1[^>]*>(.*?)</h1>', html_content, re.IGNORECASE | re.DOTALL)
-        if h1_match:
-            title = h1_match.group(1)
-            title = re.sub(r'<[^>]+>', '', title)
-            title = html.unescape(title).strip()
-            if title:
-                return title[:200]
         return ""
-    def _extract_title_from_text(self, text: str) -> str:
-        """Try to extract title from text"""
-        # Look for title patterns
-        patterns = [
-            r'Title:\s*(.*?)(?:\n|$)',
-            r'#\s+(.*?)(?:\n|$)',
-            r'<title[^>]*>(.*?)</title>',
         ]
-        for pattern in patterns:
-            match = re.search(pattern, text[:1000], re.IGNORECASE)
-            if match:
-                title = match.group(1).strip()
-                if len(title) > 10 and len(title) < 200:
-                    return title
         return ""
-    def _clean_content(self, content: str) -> str:
-        """Clean and normalize content"""
-        if not content:
             return ""
-        # Replace multiple whitespace
-        content = re.sub(r'\s+', ' ', content)
-        # Remove control characters
-        content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
-        # Remove excessive line breaks
-        content = re.sub(r'\n{3,}', '\n\n', content)
-        return content.strip()
 # ==============================================
 # INITIALIZE
 # ==============================================
-extractor = RobustContentExtractor()
 # ==============================================
 # FASTAPI APP
 # ==============================================
 fastapi_app = FastAPI(
-    title="Robust Content Extractor",
-    description="Extracts content with better timeout handling",
-    version="2.1"
 )
 from fastapi.middleware.cors import CORSMiddleware
@@ -411,23 +443,22 @@ fastapi_app.add_middleware(
 @fastapi_app.get("/")
 async def root():
     return {
-        "service": "Robust Content Extractor",
-        "version": "2.1",
-        "description": "Extracts website content with multiple fallback strategies",
         "endpoints": {
             "GET /": "This info",
-            "GET /health": "Health check (fast)",
-            "POST /extract": "Extract content"
         }
     }
 @fastapi_app.get("/health")
 async def health():
-    """Fast health check endpoint for wake-up calls"""
     return {
         "status": "healthy",
         "timestamp": time.time(),
-        "service": "content_extractor"
     }
 @fastapi_app.post("/extract")
@@ -443,8 +474,8 @@ async def api_extract(request: Request):
                 content={"success": False, "error": "URL is required"}
             )
-        print(f"📨 API Request: {url}")
-        print(f"   Starting extraction at {time.strftime('%Y-%m-%d %H:%M:%S')}")
         start_time = time.time()
         result = extractor.extract_content(url)
@@ -452,6 +483,7 @@ async def api_extract(request: Request):
         print(f"   Extraction completed in {elapsed:.2f}s")
         print(f"   Success: {result.get('success')}")
         return result
@@ -466,8 +498,7 @@ async def api_extract(request: Request):
             status_code=500,
             content={
                 "success": False,
-                "error": str(e),
-                "traceback": traceback.format_exc()[:500]
             }
         )
@@ -478,53 +509,50 @@ async def api_extract(request: Request):
 def gradio_extract(url: str):
     """Gradio interface"""
     if not url:
-        return "❌ Please enter a URL", {}
     result = extractor.extract_content(url)
     if result["success"]:
         content = result["main_content"]
-        content_length = result["content_length"]
-        preview = content[:500]
-        if len(content) > 500:
-            preview += "..."
         output = f"""
-## ✅ Success!
-**URL:** {result['url']}
-**Title:** {result.get('title', 'N/A')}
-**Method:** {result.get('method', 'extracted')}
-**Time:** {result['execution_time']}s
-**Characters:** {content_length:,}
-### Preview:
-{preview}
 """
         return output, result
     else:
-        error = result.get("error", "Unknown error")
-        return f"## ❌ Error\n\n{error}", result
 # Create Gradio interface
 gradio_interface = gr.Interface(
     fn=gradio_extract,
     inputs=gr.Textbox(
-        label="Website URL",
-        placeholder="https://example.com",
-        value="https://example.com"
     ),
     outputs=[
-        gr.Markdown(label="Result"),
-        gr.JSON(label="API Response")
     ],
-    title="🌐 Robust Content Extractor v2.1",
-    description="Extracts content with better error handling and multiple fallbacks",
     examples=[
-        ["https://example.com"],
-        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
-        ["https://news.ycombinator.com"]
     ]
 )
@@ -540,16 +568,16 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
 if __name__ == "__main__":
     print("\n" + "="*60)
-    print("🌐 Robust Content Extractor v2.1 Starting")
     print("="*60)
-    print("Features:")
-    print("• Multiple fallback strategies")
-    print("• Better error handling")
-    print("• URL encoding support")
     print("="*60)
-    print("API Endpoints:")
-    print("• GET  /health  - Fast health check")
-    print("• POST /extract - Extract content")
     print("="*60 + "\n")
     uvicorn.run(

 # ==============================================
+# IMPROVED CONTENT EXTRACTOR FOR NEWS SITES
 # ==============================================
 import gradio as gr
 from fastapi import FastAPI, Request
 import uvicorn
 import traceback
+from bs4 import BeautifulSoup
 # ==============================================
+# NEWS-SPECIFIC CONTENT EXTRACTOR
 # ==============================================
+class NewsContentExtractor:
+    """Content extractor specifically optimized for news websites"""
     def __init__(self):
+        """Store the browser-like User-Agent string on the instance."""
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
     def extract_content(self, url: str) -> Dict[str, Any]:
+        """Extract news content with article-focused extraction"""
         start_time = time.time()
+        print(f"📰 Extracting news from: {url}")
         # Ensure URL has protocol
         if not url.startswith(('http://', 'https://')):
             url = 'https://' + url
         # Try multiple strategies
         strategies = [
+            self._try_direct_extract,  # Direct extraction with BeautifulSoup
+            self._try_jina_reader,     # Jina Reader
+            self._try_simple_extract,  # Simple fallback
         ]
+        best_result = None
+        best_score = 0
         for i, strategy in enumerate(strategies):
             try:
                 result = strategy(url)
                 if result.get("success"):
+                    # Score the result based on content quality
+                    score = self._score_content(result.get("main_content", ""))
+                    result["score"] = score
+                    if score > best_score:
+                        best_score = score
+                        best_result = result
+                        print(f"  ✓ Strategy {i+1} score: {score}")
             except Exception as e:
                 print(f"  Strategy {i+1} failed: {e}")
+                time.sleep(0.5)
+        if best_result and best_score > 10:  # Minimum score threshold
+            best_result["execution_time"] = round(time.time() - start_time, 2)
+            best_result["method"] = "best_extraction"
+            return best_result
+        # All failed or low quality
         return {
             "success": False,
             "url": url,
+            "error": "Could not extract quality news content",
             "execution_time": round(time.time() - start_time, 2),
+            "suggestion": "Website might have anti-scraping protection"
         }
+    def _try_direct_extract(self, url: str) -> Dict[str, Any]:
+        """Fetch ``url`` directly and extract the article text with BeautifulSoup.
+
+        Script, style, navigation and similar elements are removed first. Up to
+        three heuristics then run in order; the later ones only run while fewer
+        than 300 characters have been found: known article container
+        selectors, all ``<p>`` elements joined together, and the first large
+        ``div`` or ``section`` that mentions no boilerplate keyword.
+
+        Args:
+            url: Address of the page to download.
+
+        Returns:
+            A dict with ``success`` True plus ``url``, ``title``, ``date``,
+            ``main_content``, ``content_length``, ``content_preview``,
+            ``source`` and ``status`` when some text was found; otherwise a
+            dict with ``success`` False and an ``error`` message. Exceptions
+            are caught and reported the same way.
+        """
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
+            "Accept-Encoding": "gzip, deflate",
+            "DNT": "1",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+            "Cache-Control": "max-age=0",
+        }
+        removable_tags = ['script', 'style', 'nav', 'header', 'footer',
+                          'aside', 'form', 'iframe', 'button', 'svg',
+                          'link', 'meta', 'noscript']
+        article_selectors = [
+            'article', '.article-content', '.post-content', '.entry-content',
+            '.news-content', '.content-area', '.main-content',
+            'div[class*="article"]', 'div[class*="content"]',
+            'div[class*="post"]', 'div[class*="entry"]',
+            'div[itemprop="articleBody"]', 'div[class*="story"]'
+        ]
+        boilerplate_words = ('cookie', 'privacy', 'copyright', 'advertisement')
+        try:
+            # NOTE(review): verify=False turns off TLS certificate verification;
+            # kept as in the original, but worth revisiting.
+            response = requests.get(url, headers=headers, timeout=15, verify=False)
+            if response.status_code != 200:
+                return {"success": False, "error": f"Status: {response.status_code}"}
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # Remove elements that never carry article text
+            for unwanted in soup.find_all(removable_tags):
+                unwanted.decompose()
+            article_text = ""
+            # Heuristic 1: article-specific containers; stop at the first one
+            # that yields more than 300 characters
+            for selector in article_selectors:
+                article = soup.select_one(selector)
+                if article:
+                    article_text = article.get_text(separator='\n', strip=True)
+                    if len(article_text) > 300:
+                        print(f"    Found content with selector: {selector}")
+                        break
+            # Heuristic 2: join every paragraph when the page has more than three
+            if len(article_text) < 300:
+                paragraphs = soup.find_all('p')
+                if len(paragraphs) > 3:
+                    article_text = '\n'.join(p.get_text(strip=True) for p in paragraphs)
+            # Heuristic 3: first long, multi-line div/section without boilerplate words
+            if len(article_text) < 300:
+                for block in soup.find_all(['div', 'section']):
+                    text = block.get_text(separator='\n', strip=True)
+                    if len(text) <= 500 or text.count('\n') <= 5:
+                        continue
+                    lowered = text.lower()
+                    if not any(word in lowered for word in boilerplate_words):
+                        article_text = text
+                        break
+            if not article_text:
+                # The page was fetched but nothing usable was found; say so
+                # instead of reporting a misleading "Status: 200".
+                return {"success": False, "error": "No article text found in page"}
+            cleaned_text = self._clean_news_content(article_text)
+            # Prefer the dedicated title lookup, then the raw <title> tag
+            title = self._extract_title(soup)
+            if not title:
+                title_tag = soup.find('title')
+                title = title_tag.get_text(strip=True) if title_tag else "新闻标题"
+            date = self._extract_date(soup)
+            return {
+                "success": True,
+                "url": url,
+                "title": title[:200],
+                "date": date,
+                "main_content": cleaned_text,
+                "content_length": len(cleaned_text),
+                "content_preview": cleaned_text[:500] + ("..." if len(cleaned_text) > 500 else ""),
+                "source": "direct_extraction",
+                "status": response.status_code
+            }
+        except Exception as e:
+            return {"success": False, "error": f"Direct extract error: {str(e)}"}
+    def _try_jina_reader(self, url: str) -> Dict[str, Any]:
+        """Fetch ``url`` through the Jina Reader proxy (``https://r.jina.ai/``).
+
+        The response body is cleaned with ``_clean_news_content``. The title is
+        taken from the first of the first ten lines that starts with
+        ``Title:`` or ``# `` and has text after that marker.
+
+        Args:
+            url: Address of the page to extract.
+
+        Returns:
+            A dict with ``success`` True plus ``url``, ``title``,
+            ``main_content``, ``content_length``, ``content_preview``,
+            ``source`` and ``status`` on HTTP 200; otherwise a dict with
+            ``success`` False and an ``error`` message. Exceptions are caught
+            and reported the same way.
+        """
         try:
             jina_url = f"https://r.jina.ai/{url}"
             response = requests.get(
                 jina_url,
+                headers={"Accept": "text/plain"},
+                timeout=20
             )
             if response.status_code == 200:
                 content = response.text
+                # Clean the content
+                cleaned = self._clean_news_content(content)
+                # Extract title from Jina response
+                title = "Jina提取内容"
+                for line in content.split('\n')[:10]:
+                    marker = next((m for m in ('Title:', '# ') if line.startswith(m)), None)
+                    if marker is None:
+                        continue
+                    # Cut only the leading marker; str.replace would also
+                    # delete "# " or "Title:" inside the heading itself.
+                    heading = line[len(marker):].strip()
+                    if heading:
+                        title = heading
+                        break
                 return {
                     "success": True,
                     "url": url,
+                    "title": title[:200],
+                    "main_content": cleaned,
                     "content_length": len(cleaned),
+                    "content_preview": cleaned[:500] + ("..." if len(cleaned) > 500 else ""),
                     "source": "jina_reader",
                     "status": response.status_code
                 }
             return {"success": False, "error": f"Jina status: {response.status_code}"}
         except Exception as e:
             return {"success": False, "error": f"Jina error: {str(e)}"}
+    def _try_simple_extract(self, url: str) -> Dict[str, Any]:
+        """Last-resort extraction: keep the first 100 meaningful text lines.
+
+        A line is kept when it is longer than 20 characters, contains none of
+        the boilerplate keywords and is not made up only of digits, dots,
+        dashes and whitespace. The attempt succeeds when the joined lines are
+        longer than 200 characters.
+
+        Args:
+            url: Address of the page to download.
+
+        Returns:
+            A dict with ``success`` True plus ``url``, ``title``,
+            ``main_content``, ``content_length`` and ``source``; otherwise a
+            dict with ``success`` False and an ``error`` message.
+        """
+        failure = {"success": False, "error": "Simple extraction failed"}
+        noise_words = ('cookie', 'privacy', 'copyright',
+                       'advertisement', 'newsletter', 'subscribe',
+                       'follow us', 'share this')
+        try:
+            response = requests.get(url, timeout=10, verify=False)
+            if response.status_code != 200:
+                return failure
+            soup = BeautifulSoup(response.content, 'html.parser')
+            kept = []
+            for raw in soup.get_text(separator='\n', strip=True).split('\n'):
+                candidate = raw.strip()
+                if len(candidate) <= 20:
+                    continue
+                lowered = candidate.lower()
+                if any(word in lowered for word in noise_words):
+                    continue
+                if re.match(r'^[0-9\.\-\s]+$', candidate):  # number-only line
+                    continue
+                kept.append(candidate)
+            cleaned_text = '\n'.join(kept[:100])  # cap at the first 100 lines
+            if len(cleaned_text) <= 200:
+                return failure
+            title_tag = soup.find('title')
+            title_text = title_tag.get_text(strip=True) if title_tag else "新闻内容"
+            return {
+                "success": True,
+                "url": url,
+                "title": title_text[:150],
+                "main_content": cleaned_text,
+                "content_length": len(cleaned_text),
+                "source": "simple_extract"
+            }
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    def _extract_title(self, soup) -> str:
+        """Return the first usable page title found in ``soup``.
+
+        Sources are checked in this order: ``<title>``, ``<h1>``, the
+        ``og:title`` meta tag, the ``name="title"`` meta tag, and an ``<h2>``
+        whose class matches ``title`` or ``heading``. A candidate is accepted
+        when its text is longer than 5 and shorter than 200 characters.
+
+        Returns:
+            The accepted title text, or an empty string when none qualifies.
+        """
+        lookups = (
+            (('title',), {}),
+            (('h1',), {}),
+            (('meta',), {'property': 'og:title'}),
+            (('meta',), {'attrs': {'name': 'title'}}),
+            (('h2',), {'class_': re.compile(r'title|heading')}),
+        )
+        candidates = [soup.find(*args, **kwargs) for args, kwargs in lookups]
+        for tag in candidates:
+            if not tag or not hasattr(tag, 'get'):
+                continue
+            # Meta tags carry the title in their "content" attribute
+            if tag.name == 'meta':
+                text = tag.get('content', '')
+            else:
+                text = tag.get_text(strip=True)
+            if text and 5 < len(text) < 200:
+                return text
+        return ""
+    def _extract_date(self, soup) -> str:
+        """Return the first date-like string found in common date locations.
+
+        ``<time>`` elements, several date-related classes and date meta tags
+        are searched in a fixed order. For meta tags the ``content`` attribute
+        is inspected; for other elements the visible text is tried first and
+        the ``datetime`` attribute second, so a machine-readable date is still
+        found when the visible text is something like "Jan 2".
+
+        Returns:
+            The matched date substring (``YYYY-MM-DD``, ``DD-MM-YYYY`` or
+            ``D Month YYYY`` style, with ``-`` or ``/`` separators for the
+            numeric forms), or an empty string when nothing matches.
+        """
+        date_patterns = [
+            r'\d{4}[-/]\d{2}[-/]\d{2}',
+            r'\d{2}[-/]\d{2}[-/]\d{4}',
+            r'\d{1,2}\s+\w+\s+\d{4}',
+        ]
+        # Look in common date locations
+        date_selectors = [
+            'time',
+            '.date',
+            '.published',
+            '.post-date',
+            '.article-date',
+            'meta[property="article:published_time"]',
+            'meta[name="pubdate"]',
+            'meta[name="date"]',
+        ]
+        for selector in date_selectors:
+            for element in soup.select(selector):
+                if element.name == 'meta':
+                    candidates = [element.get('content', '')]
+                else:
+                    # Text first keeps earlier results unchanged; the attribute
+                    # is the fallback when the text holds no recognisable date.
+                    candidates = [element.get_text(strip=True), element.get('datetime', '')]
+                for date_str in candidates:
+                    if not date_str:
+                        continue
+                    for pattern in date_patterns:
+                        match = re.search(pattern, date_str)
+                        if match:
+                            return match.group()
+        return ""
+    def _clean_news_content(self, text: str) -> str:
+        """Strip boilerplate from extracted news text, one line at a time.
+
+        Whitespace inside each line is collapsed, known advertising and
+        navigation phrases are removed, and lines of 15 characters or fewer,
+        lines starting with a URL, and lines holding only digits, dots, dashes
+        and spaces are dropped. Consecutive duplicate lines are kept once.
+
+        Line breaks are preserved while cleaning: most removal patterns end in
+        ``.*``, which stops only at a newline, so flattening the text first
+        would let a single match erase the rest of the article.
+
+        Args:
+            text: Raw extracted text, with lines separated by ``\\n``.
+
+        Returns:
+            The cleaned text with lines joined by ``\\n``, or an empty string
+            when ``text`` is empty or nothing survives the filters.
+        """
+        if not text:
+            return ""
+        # Common unwanted patterns, applied to each line separately
+        unwanted_patterns = [
+            r'adsbygoogle.*?\[\]\]',
+            r'ADVERTISEMENT',
+            r'Sponsored Content',
+            r'Sign up for.*?newsletter',
+            r'Subscribe to.*?channel',
+            r'Follow us on.*',
+            r'Share this.*',
+            r'Like us on.*',
+            r'Read more.*',
+            r'Continue reading.*',
+            r'点击这里.*',
+            r'更多新闻.*',
+            r'相关新闻.*',
+            r'热门搜索.*',
+            r'大事件.*',
+            r'Copyright.*All rights reserved',
+            r'本网站.*Cookies',
+            r'了解更多.*',
+            r'接受.*',
+            r'简\s*繁',
+            r'登入.*',
+            r'下载APP.*',
+            r'首页.*最新.*头条.*',
+            r'[\*\-\=]{5,}',  # Multiple special characters
+        ]
+        kept_lines = []
+        for raw_line in text.split('\n'):
+            # Collapse runs of whitespace inside the line only
+            line = re.sub(r'\s+', ' ', raw_line)
+            for pattern in unwanted_patterns:
+                line = re.sub(pattern, '', line, flags=re.IGNORECASE)
+            line = line.strip()
+            # Very short lines are likely navigation
+            if len(line) <= 15:
+                continue
+            if line.startswith(('http://', 'https://', 'www.')):
+                continue
+            if re.match(r'^[\d\s\.\-]+$', line):
+                continue
+            # Skip a line that repeats the one just kept
+            if kept_lines and kept_lines[-1] == line:
+                continue
+            kept_lines.append(line)
+        return '\n'.join(kept_lines).strip()
+    def _score_content(self, text: str) -> int:
+        """Score content quality based on various factors"""
+        if not text:
+            return 0
+        score = 0
+        # Length-based scoring
+        length = len(text)
+        if length > 1000:
+            score += 30
+        elif length > 500:
+            score += 20
+        elif length > 200:
+            score += 10
+        # Paragraph count (rough estimate)
+        paragraphs = text.count('\n\n') + 1
+        if paragraphs > 5:
+            score += 20
+        elif paragraphs > 3:
+            score += 10
+        # News indicators
+        news_keywords = ['报道', '新闻', '记者', '警方', '调查', '发生', '表示', '指出',
+                        '据知', '据了解', '据悉', '事件', '事故', '案件']
+        for keyword in news_keywords:
+            if keyword in text:
+                score += 2
+        # Penalize for unwanted content
+        unwanted_terms = ['cookie', 'privacy', 'copyright', 'advertisement', 'newsletter']
+        for term in unwanted_terms:
+            if term.lower() in text.lower():
+                score -= 5
+        return max(0, score)
 # ==============================================
 # INITIALIZE
 # ==============================================
 # Module-level extractor instance; the /extract handler and the Gradio
 # callback further down both call extractor.extract_content(url) on it.
+extractor = NewsContentExtractor()
 # ==============================================
 # FASTAPI APP
 # ==============================================
 # Application object that the @fastapi_app.get / @fastapi_app.post route
 # decorators below register their handlers on.
 fastapi_app = FastAPI(
+    title="News Content Extractor",
+    description="Extracts news article content with BeautifulSoup",
+    version="3.0"
 )
 from fastapi.middleware.cors import CORSMiddleware
 @fastapi_app.get("/")
 async def root():
     """Describe the service and list the HTTP endpoints it exposes."""
     endpoints = {
         "GET /": "This info",
         "GET /health": "Health check",
         "POST /extract": "Extract news content",
     }
     return dict(
         service="News Content Extractor",
         version="3.0",
         description="Extracts news article content using BeautifulSoup",
         endpoints=endpoints,
     )
 @fastapi_app.get("/health")
 async def health():
     """Report liveness together with the current Unix timestamp."""
     now = time.time()
     return dict(status="healthy", timestamp=now, service="news_extractor")
 @fastapi_app.post("/extract")
 # NOTE(review): the handler's def line, the request parsing and the
 # try/except lines are elided in this view; only fragments are visible.
                 content={"success": False, "error": "URL is required"}
             )
         # Log the requested URL and the wall-clock start time.
+        print(f"📰 API Request for news: {url}")
+        print(f"   Starting at {time.strftime('%Y-%m-%d %H:%M:%S')}")
         start_time = time.time()
         result = extractor.extract_content(url)
         # NOTE(review): `elapsed` is assigned on a line not shown here;
         # presumably time.time() - start_time -- confirm.
         print(f"   Extraction completed in {elapsed:.2f}s")
         print(f"   Success: {result.get('success')}")
+        print(f"   Content length: {result.get('content_length', 0)}")
         # The extractor's result dict is returned to the client unchanged.
         return result
             status_code=500,
             content={
                 "success": False,
+                "error": str(e)
             }
         )
 def gradio_extract(url: str):
     """Gradio callback: extract one URL and render the outcome as Markdown.

     Returns a ``(markdown, result)`` pair, where ``result`` is the dict
     produced by ``extractor.extract_content`` (or ``{}`` when no URL was
     given). The Markdown holds either a summary with the first 800
     characters of the content, or the error message.
     """
     if not url:
         return "❌ 请输入URL", {}
     result = extractor.extract_content(url)
     # Failure path first, so the success formatting below stays unindented.
     if not result["success"]:
         error = result.get("error", "未知错误")
         return f"## ❌ 错误\n\n{error}", result
     content = result["main_content"]
     title = result.get("title", "无标题")
     date = result.get("date", "")
     date_label = date if date else "未提取到日期"
     method = result.get('method', '提取')
     # Mark the preview as truncated only when content exceeds 800 characters.
     suffix = "..." if len(content) > 800 else ""
     output = f"""
 ## ✅ 提取成功！
 **标题:** {title}
 **日期:** {date_label}
 **方法:** {method}
 **时间:** {result['execution_time']}s
 **字符数:** {result['content_length']:,}
 ### 内容预览:
 {content[:800]}{suffix}
 """
     return output, result
 # Create Gradio interface: a single URL textbox feeds gradio_extract, whose
 # two return values fill the Markdown panel and the JSON panel respectively.
 gradio_interface = gr.Interface(
     fn=gradio_extract,
     inputs=gr.Textbox(
+        label="新闻URL",
+        placeholder="https://example.com/news",
+        value="https://northern.sinchew.com.my/?p=7217886"
     ),
     # Order matches the (markdown, result) tuple returned by gradio_extract.
     outputs=[
+        gr.Markdown(label="结果"),
+        gr.JSON(label="API响应")
     ],
+    title="📰 新闻内容提取器",
+    description="使用BeautifulSoup提取新闻文章内容",
     # Each example is a one-element list because the interface has one input.
     examples=[
+        ["https://northern.sinchew.com.my/?p=7217886"],
+        ["https://www.sinchew.com.my/?p=7234965"],
+        ["https://example.com"]
     ]
 )
 if __name__ == "__main__":
     print("\n" + "="*60)
+    print("📰 新闻内容提取器 v3.0 启动")
     print("="*60)
+    print("特性:")
+    print("• 使用BeautifulSoup进行HTML解析")
+    print("• 专门针对新闻网站优化")
+    print("• 智能内容评分系统")
     print("="*60)
+    print("API端点:")
+    print("• GET  /health  - 健康检查")
+    print("• POST /extract - 提取新闻内容")
     print("="*60 + "\n")
     uvicorn.run(