yukee1992 commited on
Commit
823e327
·
verified ·
1 Parent(s): fa45285

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +374 -264
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # IMPROVED CONTENT EXTRACTOR FOR NEWS SITES
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -13,77 +13,93 @@ from fastapi import FastAPI, Request
13
  import uvicorn
14
  import traceback
15
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
16
 
17
  # ==============================================
18
- # NEWS-SPECIFIC CONTENT EXTRACTOR
19
  # ==============================================
20
 
21
- class NewsContentExtractor:
22
- """Content extractor specifically optimized for news websites"""
23
 
24
  def __init__(self):
25
- self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
 
 
 
 
 
26
 
27
- def extract_content(self, url: str) -> Dict[str, Any]:
28
- """Extract news content with article-focused extraction"""
29
  start_time = time.time()
30
 
31
- print(f"📰 Extracting news from: {url}")
32
 
33
  # Ensure URL has protocol
34
  if not url.startswith(('http://', 'https://')):
35
  url = 'https://' + url
36
 
37
- # Try multiple strategies
38
- strategies = [
39
- self._try_direct_extract, # Direct extraction with BeautifulSoup
40
- self._try_jina_reader, # Jina Reader
41
- self._try_simple_extract, # Simple fallback
 
42
  ]
43
 
44
  best_result = None
45
  best_score = 0
46
 
47
- for i, strategy in enumerate(strategies):
48
  try:
49
- print(f" Trying strategy {i+1}...")
50
- result = strategy(url)
51
 
52
  if result.get("success"):
53
- # Score the result based on content quality
54
- score = self._score_content(result.get("main_content", ""))
55
  result["score"] = score
56
 
 
 
57
  if score > best_score:
58
  best_score = score
59
  best_result = result
60
- print(f" ✓ Strategy {i+1} score: {score}")
61
-
 
 
 
62
  except Exception as e:
63
- print(f" Strategy {i+1} failed: {e}")
64
- time.sleep(0.5)
65
 
66
- if best_result and best_score > 10: # Minimum score threshold
67
  best_result["execution_time"] = round(time.time() - start_time, 2)
68
- best_result["method"] = "best_extraction"
69
  return best_result
70
 
71
- # All failed or low quality
72
  return {
73
  "success": False,
74
  "url": url,
75
- "error": "Could not extract quality news content",
76
- "execution_time": round(time.time() - start_time, 2),
77
- "suggestion": "Website might have anti-scraping protection"
78
  }
79
 
80
- def _try_direct_extract(self, url: str) -> Dict[str, Any]:
81
- """Direct extraction with BeautifulSoup for better HTML parsing"""
82
  try:
83
  headers = {
84
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
85
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
86
- "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
87
  "Accept-Encoding": "gzip, deflate",
88
  "DNT": "1",
89
  "Connection": "keep-alive",
@@ -92,158 +108,230 @@ class NewsContentExtractor:
92
  "Sec-Fetch-Mode": "navigate",
93
  "Sec-Fetch-Site": "none",
94
  "Sec-Fetch-User": "?1",
95
- "Cache-Control": "max-age=0",
96
  }
97
 
98
- response = requests.get(url, headers=headers, timeout=15, verify=False)
99
 
100
  if response.status_code == 200:
101
- soup = BeautifulSoup(response.content, 'html.parser')
102
-
103
- # Remove unwanted elements
104
- for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer',
105
- 'aside', 'form', 'iframe', 'button', 'svg',
106
- 'link', 'meta', 'noscript']):
107
- unwanted.decompose()
108
 
109
- # Try to find article content using multiple strategies
110
- article_text = ""
 
111
 
112
- # Strategy 1: Look for article-specific containers
113
- article_selectors = [
114
- 'article', '.article-content', '.post-content', '.entry-content',
115
- '.news-content', '.content-area', '.main-content',
116
- 'div[class*="article"]', 'div[class*="content"]',
117
- 'div[class*="post"]', 'div[class*="entry"]',
118
- 'div[itemprop="articleBody"]', 'div[class*="story"]'
119
- ]
120
 
121
- for selector in article_selectors:
122
- article = soup.select_one(selector)
123
- if article:
124
- article_text = article.get_text(separator='\n', strip=True)
125
- if len(article_text) > 300: # Minimum content length
126
- print(f" Found content with selector: {selector}")
127
- break
128
 
129
- # Strategy 2: Look for main content by paragraph density
130
- if len(article_text) < 300:
131
- all_paragraphs = soup.find_all('p')
132
- if len(all_paragraphs) > 3:
133
- article_text = '\n'.join([p.get_text(strip=True) for p in all_paragraphs])
134
-
135
- # Strategy 3: Extract text from main divs
136
- if len(article_text) < 300:
137
- main_divs = soup.find_all(['div', 'section'])
138
- for div in main_divs:
139
- text = div.get_text(separator='\n', strip=True)
140
- # Check if this looks like article content
141
- if (len(text) > 500 and
142
- text.count('\n') > 5 and
143
- not any(word in text.lower() for word in ['cookie', 'privacy', 'copyright', 'advertisement'])):
144
- article_text = text
145
- break
146
-
147
- # Clean and format the text
148
- if article_text:
149
- cleaned_text = self._clean_news_content(article_text)
150
-
151
- # Extract title
152
- title = self._extract_title(soup)
153
- if not title:
154
- title_match = soup.find('title')
155
- title = title_match.get_text(strip=True) if title_match else "新闻标题"
156
-
157
- # Extract date if available
158
- date = self._extract_date(soup)
159
 
160
  return {
161
  "success": True,
162
  "url": url,
163
  "title": title[:200],
164
- "date": date,
165
  "main_content": cleaned_text,
166
  "content_length": len(cleaned_text),
167
  "content_preview": cleaned_text[:500] + ("..." if len(cleaned_text) > 500 else ""),
168
- "source": "direct_extraction",
169
- "status": response.status_code
 
170
  }
171
 
172
  return {"success": False, "error": f"Status: {response.status_code}"}
173
 
174
  except Exception as e:
175
- return {"success": False, "error": f"Direct extract error: {str(e)}"}
176
 
177
- def _try_jina_reader(self, url: str) -> Dict[str, Any]:
178
- """Try Jina Reader"""
179
  try:
180
  jina_url = f"https://r.jina.ai/{url}"
181
 
182
- response = requests.get(
183
- jina_url,
184
- headers={"Accept": "text/plain"},
185
- timeout=20
186
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  if response.status_code == 200:
189
- content = response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
- # Clean the content
192
- cleaned = self._clean_news_content(content)
 
 
 
 
193
 
194
- # Extract title from Jina response
195
- title = "Jina提取内容"
196
- lines = content.split('\n')
197
- for line in lines[:10]:
198
- if line.startswith('Title:') or line.startswith('# '):
199
- title = line.replace('Title:', '').replace('# ', '').strip()
200
- break
 
 
 
 
 
 
 
 
201
 
202
- return {
203
- "success": True,
204
- "url": url,
205
- "title": title[:200],
206
- "main_content": cleaned,
207
- "content_length": len(cleaned),
208
- "content_preview": cleaned[:500] + ("..." if len(cleaned) > 500 else ""),
209
- "source": "jina_reader",
210
- "status": response.status_code
211
- }
 
 
 
 
 
 
 
 
 
 
 
212
 
213
- return {"success": False, "error": f"Jina status: {response.status_code}"}
214
 
215
  except Exception as e:
216
- return {"success": False, "error": f"Jina error: {str(e)}"}
217
 
218
- def _try_simple_extract(self, url: str) -> Dict[str, Any]:
219
- """Simple fallback extraction"""
220
  try:
221
  response = requests.get(url, timeout=10, verify=False)
222
 
223
  if response.status_code == 200:
 
224
  soup = BeautifulSoup(response.content, 'html.parser')
225
 
226
- # Get all text
227
- all_text = soup.get_text(separator='\n', strip=True)
 
 
228
 
229
- # Clean and extract meaningful parts
 
230
  lines = all_text.split('\n')
231
- meaningful_lines = []
232
 
 
 
233
  for line in lines:
234
  line = line.strip()
235
- if (len(line) > 20 and
236
- not any(word in line.lower() for word in ['cookie', 'privacy', 'copyright',
237
- 'advertisement', 'newsletter', 'subscribe',
238
- 'follow us', 'share this']) and
239
- not re.match(r'^[0-9\.\-\s]+$', line)): # Skip number-only lines
240
- meaningful_lines.append(line)
241
 
242
- cleaned_text = '\n'.join(meaningful_lines[:100]) # Take top 100 lines
243
 
244
  if len(cleaned_text) > 200:
245
  title = soup.find('title')
246
- title_text = title.get_text(strip=True) if title else "新闻内容"
247
 
248
  return {
249
  "success": True,
@@ -251,89 +339,96 @@ class NewsContentExtractor:
251
  "title": title_text[:150],
252
  "main_content": cleaned_text,
253
  "content_length": len(cleaned_text),
254
- "source": "simple_extract"
255
  }
256
 
257
- return {"success": False, "error": "Simple extraction failed"}
258
 
259
  except Exception as e:
260
  return {"success": False, "error": str(e)}
261
 
262
- def _extract_title(self, soup) -> str:
263
- """Extract title from BeautifulSoup object"""
264
- # Try multiple title sources
265
- title_sources = [
266
- soup.find('title'),
267
- soup.find('h1'),
268
- soup.find('meta', property='og:title'),
269
- soup.find('meta', attrs={'name': 'title'}),
270
- soup.find('h2', class_=re.compile(r'title|heading')),
 
 
 
 
 
 
 
 
271
  ]
272
 
273
- for source in title_sources:
274
- if source:
275
- if hasattr(source, 'get'):
276
- content = source.get('content', '') if source.name == 'meta' else source.get_text(strip=True)
277
- if content and len(content) > 5 and len(content) < 200:
278
- return content
 
 
 
 
279
 
280
- return ""
281
 
282
- def _extract_date(self, soup) -> str:
283
  """Extract date from BeautifulSoup object"""
284
- date_patterns = [
285
- r'\d{4}[-/]\d{2}[-/]\d{2}',
286
- r'\d{2}[-/]\d{2}[-/]\d{4}',
287
- r'\d{1,2}\s+\w+\s+\d{4}',
288
- ]
289
-
290
- # Look in common date locations
291
  date_selectors = [
 
 
 
292
  'time',
293
  '.date',
294
  '.published',
295
  '.post-date',
296
  '.article-date',
297
- 'meta[property="article:published_time"]',
298
- 'meta[name="pubdate"]',
299
- 'meta[name="date"]',
300
  ]
301
 
302
  for selector in date_selectors:
303
- elements = soup.select(selector)
304
- for element in elements:
305
  if element.name == 'meta':
306
  date_str = element.get('content', '')
 
 
307
  else:
308
- date_str = element.get_text(strip=True) or element.get('datetime', '')
309
 
310
- for pattern in date_patterns:
311
- match = re.search(pattern, date_str)
312
- if match:
313
- return match.group()
 
 
 
 
 
 
 
 
 
314
 
315
  return ""
316
 
317
- def _clean_news_content(self, text: str) -> str:
318
- """Clean and format news content"""
319
  if not text:
320
  return ""
321
 
322
- # Remove excessive whitespace
323
- text = re.sub(r'\s+', ' ', text)
324
-
325
- # Remove common unwanted patterns
326
- unwanted_patterns = [
327
- r'adsbygoogle.*?\[\]\]',
328
  r'ADVERTISEMENT',
329
  r'Sponsored Content',
330
- r'Sign up for.*?newsletter',
331
- r'Subscribe to.*?channel',
332
- r'Follow us on.*',
333
- r'Share this.*',
334
- r'Like us on.*',
335
- r'Read more.*',
336
- r'Continue reading.*',
337
  r'点击这里.*',
338
  r'更多新闻.*',
339
  r'相关新闻.*',
@@ -346,87 +441,98 @@ class NewsContentExtractor:
346
  r'简\s*繁',
347
  r'登入.*',
348
  r'下载APP.*',
349
- r'首页.*最新.*头条.*',
350
- r'[\*\-\=]{5,}', # Multiple special characters
351
  ]
352
 
353
- for pattern in unwanted_patterns:
354
- text = re.sub(pattern, '', text, flags=re.IGNORECASE)
355
 
356
- # Remove very short lines (likely navigation)
357
  lines = text.split('\n')
358
  cleaned_lines = []
 
359
  for line in lines:
360
  line = line.strip()
361
- if (len(line) > 15 and
362
  not line.startswith(('http://', 'https://', 'www.')) and
363
- not re.match(r'^[\d\s\.\-]+$', line)):
 
364
  cleaned_lines.append(line)
365
 
366
- text = '\n'.join(cleaned_lines)
367
-
368
  # Remove duplicate consecutive lines
369
- lines = text.split('\n')
370
  unique_lines = []
371
- for i, line in enumerate(lines):
372
- if i == 0 or line != lines[i-1]:
373
  unique_lines.append(line)
374
 
375
- return '\n'.join(unique_lines).strip()
 
 
 
 
 
 
 
376
 
377
- def _score_content(self, text: str) -> int:
378
- """Score content quality based on various factors"""
379
- if not text:
380
  return 0
381
 
382
  score = 0
 
383
 
384
- # Length-based scoring
385
- length = len(text)
386
- if length > 1000:
387
  score += 30
388
  elif length > 500:
389
  score += 20
390
- elif length > 200:
391
  score += 10
392
 
393
- # Paragraph count (rough estimate)
394
- paragraphs = text.count('\n\n') + 1
395
- if paragraphs > 5:
396
- score += 20
397
- elif paragraphs > 3:
398
- score += 10
399
 
400
- # News indicators
401
- news_keywords = ['报道', '新闻', '记者', '警方', '调查', '发生', '表示', '指出',
402
- '据知', '据了解', '据悉', '事件', '事故', '案件']
 
403
 
404
- for keyword in news_keywords:
405
- if keyword in text:
406
  score += 2
407
 
408
- # Penalize for unwanted content
409
- unwanted_terms = ['cookie', 'privacy', 'copyright', 'advertisement', 'newsletter']
410
- for term in unwanted_terms:
411
- if term.lower() in text.lower():
412
- score -= 5
 
 
 
413
 
414
- return max(0, score)
415
 
416
  # ==============================================
417
  # INITIALIZE
418
  # ==============================================
419
 
420
- extractor = NewsContentExtractor()
421
 
422
  # ==============================================
423
  # FASTAPI APP
424
  # ==============================================
425
 
426
  fastapi_app = FastAPI(
427
- title="News Content Extractor",
428
- description="Extracts news article content with BeautifulSoup",
429
- version="3.0"
430
  )
431
 
432
  from fastapi.middleware.cors import CORSMiddleware
@@ -443,13 +549,13 @@ fastapi_app.add_middleware(
443
  @fastapi_app.get("/")
444
  async def root():
445
  return {
446
- "service": "News Content Extractor",
447
- "version": "3.0",
448
- "description": "Extracts news article content using BeautifulSoup",
449
  "endpoints": {
450
  "GET /": "This info",
451
  "GET /health": "Health check",
452
- "POST /extract": "Extract news content"
453
  }
454
  }
455
 
@@ -458,7 +564,7 @@ async def health():
458
  return {
459
  "status": "healthy",
460
  "timestamp": time.time(),
461
- "service": "news_extractor"
462
  }
463
 
464
  @fastapi_app.post("/extract")
@@ -474,16 +580,16 @@ async def api_extract(request: Request):
474
  content={"success": False, "error": "URL is required"}
475
  )
476
 
477
- print(f"📰 API Request for news: {url}")
478
- print(f" Starting at {time.strftime('%Y-%m-%d %H:%M:%S')}")
479
 
480
  start_time = time.time()
481
- result = extractor.extract_content(url)
482
  elapsed = time.time() - start_time
483
 
484
- print(f" Extraction completed in {elapsed:.2f}s")
485
- print(f" Success: {result.get('success')}")
486
- print(f" Content length: {result.get('content_length', 0)}")
 
487
 
488
  return result
489
 
@@ -493,7 +599,7 @@ async def api_extract(request: Request):
493
  content={"success": False, "error": "Invalid JSON"}
494
  )
495
  except Exception as e:
496
- print(f" API Error: {traceback.format_exc()}")
497
  return JSONResponse(
498
  status_code=500,
499
  content={
@@ -511,48 +617,51 @@ def gradio_extract(url: str):
511
  if not url:
512
  return "❌ 请输入URL", {}
513
 
514
- result = extractor.extract_content(url)
515
 
516
  if result["success"]:
517
  content = result["main_content"]
518
  title = result.get("title", "无标题")
519
- date = result.get("date", "")
520
 
521
- output = f"""
522
- ## 提取成功!
 
 
 
 
 
 
 
 
 
523
 
524
- **标题:** {title}
525
- **日期:** {date if date else "未提取到日期"}
526
- **方法:** {result.get('method', '提取')}
527
- **时间:** {result['execution_time']}s
528
- **字符数:** {result['content_length']:,}
529
 
530
- ### 内容预览:
531
- {content[:800]}{"..." if len(content) > 800 else ""}
532
  """
533
  return output, result
534
  else:
535
  error = result.get("error", "未知错误")
536
- return f"## ❌ 错误\n\n{error}", result
537
 
538
  # Create Gradio interface
539
  gradio_interface = gr.Interface(
540
  fn=gradio_extract,
541
  inputs=gr.Textbox(
542
- label="新闻URL",
543
- placeholder="https://example.com/news",
544
  value="https://northern.sinchew.com.my/?p=7217886"
545
  ),
546
  outputs=[
547
- gr.Markdown(label="结果"),
548
- gr.JSON(label="API响应")
549
  ],
550
- title="📰 新闻内容提取器",
551
- description="使用BeautifulSoup提取新闻文章内容",
552
  examples=[
553
  ["https://northern.sinchew.com.my/?p=7217886"],
554
  ["https://www.sinchew.com.my/?p=7234965"],
555
- ["https://example.com"]
556
  ]
557
  )
558
 
@@ -568,16 +677,17 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
568
 
569
  if __name__ == "__main__":
570
  print("\n" + "="*60)
571
- print("📰 新闻内容提取器 v3.0 启动")
572
  print("="*60)
573
  print("特性:")
574
- print("• 使用BeautifulSoup进行HTML解析")
575
- print("• 专门针对新闻网站优化")
576
- print("• 智能内容评分系统")
 
577
  print("="*60)
578
  print("API端点:")
579
  print("• GET /health - 健康检查")
580
- print("• POST /extract - 提取新闻内容")
581
  print("="*60 + "\n")
582
 
583
  uvicorn.run(
 
1
  # ==============================================
2
+ # NEWS CONTENT EXTRACTOR WITH READABILITY
3
  # ==============================================
4
 
5
  import gradio as gr
 
13
  import uvicorn
14
  import traceback
15
  from bs4 import BeautifulSoup
16
+ from readability import Document
17
+ import logging
18
+
19
+ # Set up logging
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
 
24
  # ==============================================
25
+ # NEWS CONTENT EXTRACTOR WITH READABILITY
26
  # ==============================================
27
 
28
+ class NewsArticleExtractor:
29
+ """Extract news articles using readability-lxml"""
30
 
31
  def __init__(self):
32
+ self.user_agents = [
33
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
34
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
35
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
36
+ "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
37
+ ]
38
 
39
+ def extract_article(self, url: str) -> Dict[str, Any]:
40
+ """Extract article content using multiple methods"""
41
  start_time = time.time()
42
 
43
+ logger.info(f"📰 Extracting article from: {url}")
44
 
45
  # Ensure URL has protocol
46
  if not url.startswith(('http://', 'https://')):
47
  url = 'https://' + url
48
 
49
+ # Try multiple extraction methods
50
+ methods = [
51
+ self._extract_with_readability,
52
+ self._extract_with_jina,
53
+ self._extract_with_selectors,
54
+ self._extract_fallback,
55
  ]
56
 
57
  best_result = None
58
  best_score = 0
59
 
60
+ for i, method in enumerate(methods):
61
  try:
62
+ logger.info(f" Trying method {i+1}: {method.__name__}")
63
+ result = method(url)
64
 
65
  if result.get("success"):
66
+ # Score the article
67
+ score = self._score_article(result)
68
  result["score"] = score
69
 
70
+ logger.info(f" ✓ Method {i+1} score: {score}")
71
+
72
  if score > best_score:
73
  best_score = score
74
  best_result = result
75
+
76
+ # If we have a good score, return early
77
+ if score > 50:
78
+ break
79
+
80
  except Exception as e:
81
+ logger.error(f" Method {i+1} failed: {e}")
82
+ time.sleep(1)
83
 
84
+ if best_result and best_score > 20:
85
  best_result["execution_time"] = round(time.time() - start_time, 2)
86
+ best_result["method"] = "article_extraction"
87
  return best_result
88
 
 
89
  return {
90
  "success": False,
91
  "url": url,
92
+ "error": "Could not extract article content",
93
+ "execution_time": round(time.time() - start_time, 2)
 
94
  }
95
 
96
+ def _extract_with_readability(self, url: str) -> Dict[str, Any]:
97
+ """Use readability-lxml to extract article content"""
98
  try:
99
  headers = {
100
+ "User-Agent": self.user_agents[0],
101
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
102
+ "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,ms;q=0.6",
103
  "Accept-Encoding": "gzip, deflate",
104
  "DNT": "1",
105
  "Connection": "keep-alive",
 
108
  "Sec-Fetch-Mode": "navigate",
109
  "Sec-Fetch-Site": "none",
110
  "Sec-Fetch-User": "?1",
 
111
  }
112
 
113
+ response = requests.get(url, headers=headers, timeout=20, verify=False)
114
 
115
  if response.status_code == 200:
116
+ # Parse with readability
117
+ doc = Document(response.text)
 
 
 
 
 
118
 
119
+ # Extract content
120
+ article_html = doc.summary()
121
+ title = doc.title()
122
 
123
+ # Convert HTML to clean text
124
+ soup = BeautifulSoup(article_html, 'html.parser')
125
+ article_text = soup.get_text(separator='\n', strip=True)
 
 
 
 
 
126
 
127
+ # Clean the text
128
+ cleaned_text = self._clean_article_text(article_text)
 
 
 
 
 
129
 
130
+ if len(cleaned_text) > 200:
131
+ # Extract metadata
132
+ metadata = self._extract_metadata(response.text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  return {
135
  "success": True,
136
  "url": url,
137
  "title": title[:200],
 
138
  "main_content": cleaned_text,
139
  "content_length": len(cleaned_text),
140
  "content_preview": cleaned_text[:500] + ("..." if len(cleaned_text) > 500 else ""),
141
+ "source": "readability",
142
+ "status": response.status_code,
143
+ "metadata": metadata
144
  }
145
 
146
  return {"success": False, "error": f"Status: {response.status_code}"}
147
 
148
  except Exception as e:
149
+ return {"success": False, "error": f"Readability error: {str(e)}"}
150
 
151
+ def _extract_with_jina(self, url: str) -> Dict[str, Any]:
152
+ """Try Jina Reader with different parameters"""
153
  try:
154
  jina_url = f"https://r.jina.ai/{url}"
155
 
156
+ # Try with different accept headers
157
+ accept_headers = [
158
+ "text/plain",
159
+ "application/json",
160
+ "text/markdown"
161
+ ]
162
+
163
+ for accept in accept_headers:
164
+ try:
165
+ response = requests.get(
166
+ jina_url,
167
+ headers={
168
+ "Accept": accept,
169
+ "User-Agent": self.user_agents[0]
170
+ },
171
+ timeout=25
172
+ )
173
+
174
+ if response.status_code == 200:
175
+ content = response.text
176
+
177
+ # Parse based on content type
178
+ if accept == "application/json":
179
+ try:
180
+ data = json.loads(content)
181
+ content = data.get("content", content)
182
+ except:
183
+ pass
184
+
185
+ # Clean content
186
+ cleaned = self._clean_article_text(content)
187
+
188
+ # Extract title
189
+ title = "Jina提取"
190
+ lines = content.split('\n')
191
+ for line in lines[:5]:
192
+ if line.startswith('Title:') or line.startswith('# '):
193
+ title = line.replace('Title:', '').replace('# ', '').strip()
194
+ break
195
+
196
+ if len(cleaned) > 200:
197
+ return {
198
+ "success": True,
199
+ "url": url,
200
+ "title": title[:200],
201
+ "main_content": cleaned,
202
+ "content_length": len(cleaned),
203
+ "source": f"jina_{accept}",
204
+ "status": response.status_code
205
+ }
206
+
207
+ except Exception as e:
208
+ logger.warning(f"Jina attempt with {accept} failed: {e}")
209
+ continue
210
+
211
+ return {"success": False, "error": "All Jina attempts failed"}
212
+
213
+ except Exception as e:
214
+ return {"success": False, "error": f"Jina error: {str(e)}"}
215
+
216
+ def _extract_with_selectors(self, url: str) -> Dict[str, Any]:
217
+ """Extract using specific selectors for sinchew.com.my"""
218
+ try:
219
+ headers = {
220
+ "User-Agent": self.user_agents[1],
221
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
222
+ "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
223
+ }
224
+
225
+ response = requests.get(url, headers=headers, timeout=15, verify=False)
226
 
227
  if response.status_code == 200:
228
+ soup = BeautifulSoup(response.content, 'html.parser')
229
+
230
+ # Remove unwanted elements
231
+ for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer',
232
+ 'aside', 'form', 'iframe', 'button', 'svg']):
233
+ unwanted.decompose()
234
+
235
+ # Try specific selectors for sinchew.com.my
236
+ selectors_to_try = [
237
+ 'div.entry-content',
238
+ 'article',
239
+ 'div.post-content',
240
+ 'div.content-area',
241
+ 'div.article-content',
242
+ 'div.story-content',
243
+ 'div[itemprop="articleBody"]',
244
+ 'div.article-body',
245
+ 'div.main-content',
246
+ 'div.news-content',
247
+ ]
248
+
249
+ article_text = ""
250
 
251
+ for selector in selectors_to_try:
252
+ element = soup.select_one(selector)
253
+ if element:
254
+ text = element.get_text(separator='\n', strip=True)
255
+ if len(text) > len(article_text):
256
+ article_text = text
257
 
258
+ # If specific selectors didn't work, try finding the main content
259
+ if len(article_text) < 300:
260
+ # Look for paragraphs with Chinese text
261
+ all_p = soup.find_all('p')
262
+ chinese_paragraphs = []
263
+
264
+ for p in all_p:
265
+ text = p.get_text(strip=True)
266
+ if text and len(text) > 50:
267
+ # Check if it contains Chinese characters
268
+ if re.search(r'[\u4e00-\u9fff]', text):
269
+ chinese_paragraphs.append(text)
270
+
271
+ if chinese_paragraphs:
272
+ article_text = '\n\n'.join(chinese_paragraphs[:20]) # Limit to 20 paragraphs
273
 
274
+ # Clean the text
275
+ cleaned_text = self._clean_article_text(article_text)
276
+
277
+ if len(cleaned_text) > 200:
278
+ # Extract title
279
+ title = soup.find('title')
280
+ title_text = title.get_text(strip=True) if title else "新闻标题"
281
+
282
+ # Extract date
283
+ date = self._extract_date_from_soup(soup)
284
+
285
+ return {
286
+ "success": True,
287
+ "url": url,
288
+ "title": title_text[:200],
289
+ "date": date,
290
+ "main_content": cleaned_text,
291
+ "content_length": len(cleaned_text),
292
+ "source": "selectors",
293
+ "status": response.status_code
294
+ }
295
 
296
+ return {"success": False, "error": f"Status: {response.status_code}"}
297
 
298
  except Exception as e:
299
+ return {"success": False, "error": f"Selector error: {str(e)}"}
300
 
301
+ def _extract_fallback(self, url: str) -> Dict[str, Any]:
302
+ """Fallback extraction method"""
303
  try:
304
  response = requests.get(url, timeout=10, verify=False)
305
 
306
  if response.status_code == 200:
307
+ # Use BeautifulSoup to get clean text
308
  soup = BeautifulSoup(response.content, 'html.parser')
309
 
310
+ # Remove all tags except p, div, span
311
+ for tag in soup.find_all(['script', 'style', 'nav', 'header', 'footer',
312
+ 'aside', 'form', 'iframe', 'button']):
313
+ tag.decompose()
314
 
315
+ # Get text and filter
316
+ all_text = soup.get_text(separator='\n', strip=True)
317
  lines = all_text.split('\n')
 
318
 
319
+ # Filter lines
320
+ filtered_lines = []
321
  for line in lines:
322
  line = line.strip()
323
+ if (len(line) > 30 and # Minimum length
324
+ re.search(r'[\u4e00-\u9fff]', line) and # Contains Chinese
325
+ not re.search(r'cookie|privacy|copyright|advertisement|newsletter|subscribe',
326
+ line.lower()) and
327
+ not line.startswith('http')):
328
+ filtered_lines.append(line)
329
 
330
+ cleaned_text = '\n\n'.join(filtered_lines[:50])
331
 
332
  if len(cleaned_text) > 200:
333
  title = soup.find('title')
334
+ title_text = title.get_text(strip=True) if title else "内容提取"
335
 
336
  return {
337
  "success": True,
 
339
  "title": title_text[:150],
340
  "main_content": cleaned_text,
341
  "content_length": len(cleaned_text),
342
+ "source": "fallback"
343
  }
344
 
345
+ return {"success": False, "error": "Fallback extraction failed"}
346
 
347
  except Exception as e:
348
  return {"success": False, "error": str(e)}
349
 
350
+ def _extract_metadata(self, html_content: str) -> Dict[str, str]:
351
+ """Extract metadata from HTML"""
352
+ metadata = {}
353
+ soup = BeautifulSoup(html_content, 'html.parser')
354
+
355
+ # Extract date
356
+ date = self._extract_date_from_soup(soup)
357
+ if date:
358
+ metadata["date"] = date
359
+
360
+ # Extract author
361
+ author_selectors = [
362
+ 'meta[name="author"]',
363
+ 'meta[property="article:author"]',
364
+ '.author',
365
+ '.byline',
366
+ 'span[itemprop="author"]',
367
  ]
368
 
369
+ for selector in author_selectors:
370
+ element = soup.select_one(selector)
371
+ if element:
372
+ if element.name == 'meta':
373
+ author = element.get('content', '')
374
+ else:
375
+ author = element.get_text(strip=True)
376
+ if author:
377
+ metadata["author"] = author
378
+ break
379
 
380
+ return metadata
381
 
382
+ def _extract_date_from_soup(self, soup) -> str:
383
  """Extract date from BeautifulSoup object"""
 
 
 
 
 
 
 
384
  date_selectors = [
385
+ 'meta[property="article:published_time"]',
386
+ 'meta[name="pubdate"]',
387
+ 'meta[name="date"]',
388
  'time',
389
  '.date',
390
  '.published',
391
  '.post-date',
392
  '.article-date',
 
 
 
393
  ]
394
 
395
  for selector in date_selectors:
396
+ element = soup.select_one(selector)
397
+ if element:
398
  if element.name == 'meta':
399
  date_str = element.get('content', '')
400
+ elif element.name == 'time':
401
+ date_str = element.get('datetime', '') or element.get_text(strip=True)
402
  else:
403
+ date_str = element.get_text(strip=True)
404
 
405
+ if date_str:
406
+ # Try to parse date
407
+ date_patterns = [
408
+ r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}',
409
+ r'\d{4}/\d{2}/\d{2}',
410
+ r'\d{4}-\d{2}-\d{2}',
411
+ r'\d{2}/\d{2}/\d{4}',
412
+ ]
413
+
414
+ for pattern in date_patterns:
415
+ match = re.search(pattern, date_str)
416
+ if match:
417
+ return match.group()
418
 
419
  return ""
420
 
421
+ def _clean_article_text(self, text: str) -> str:
422
+ """Clean article text"""
423
  if not text:
424
  return ""
425
 
426
+ # Remove image markers and other noise
427
+ patterns_to_remove = [
428
+ r'!\[Image \d+: .*?\]',
429
+ r'Image \d+:',
 
 
430
  r'ADVERTISEMENT',
431
  r'Sponsored Content',
 
 
 
 
 
 
 
432
  r'点击这里.*',
433
  r'更多新闻.*',
434
  r'相关新闻.*',
 
441
  r'简\s*繁',
442
  r'登入.*',
443
  r'下载APP.*',
444
+ r'[\*\-\=]{5,}',
445
+ r'^\s*\d+\s*$', # Line with only numbers
446
  ]
447
 
448
+ for pattern in patterns_to_remove:
449
+ text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)
450
 
451
+ # Split into lines and clean
452
  lines = text.split('\n')
453
  cleaned_lines = []
454
+
455
  for line in lines:
456
  line = line.strip()
457
+ if (len(line) > 20 and # Minimum length
458
  not line.startswith(('http://', 'https://', 'www.')) and
459
+ not re.search(r'^[\d\s\.\-]+$', line) and # Not just numbers/dashes
460
+ not re.search(r'cookie|隐私|版权|广告', line.lower())):
461
  cleaned_lines.append(line)
462
 
 
 
463
  # Remove duplicate consecutive lines
 
464
  unique_lines = []
465
+ for i, line in enumerate(cleaned_lines):
466
+ if i == 0 or line != cleaned_lines[i-1]:
467
  unique_lines.append(line)
468
 
469
+ # Join with paragraph breaks
470
+ text = '\n\n'.join(unique_lines)
471
+
472
+ # Final cleanup
473
+ text = re.sub(r'\n{3,}', '\n\n', text)
474
+ text = re.sub(r'\s+', ' ', text)
475
+
476
+ return text.strip()
477
 
478
+ def _score_article(self, result: Dict[str, Any]) -> int:
479
+ """Score article quality"""
480
+ if not result.get("success"):
481
  return 0
482
 
483
  score = 0
484
+ content = result.get("main_content", "")
485
 
486
+ # Length score
487
+ length = len(content)
488
+ if length > 800:
489
  score += 30
490
  elif length > 500:
491
  score += 20
492
+ elif length > 300:
493
  score += 10
494
 
495
+ # Paragraph count
496
+ paragraphs = content.count('\n\n') + 1
497
+ if paragraphs > 3:
498
+ score += 15
499
+ elif paragraphs > 1:
500
+ score += 5
501
 
502
+ # News keywords in Chinese
503
+ news_keywords_chinese = ['报道', '新闻', '记者', '警方', '调查', '发生', '表示',
504
+ '指出', '据知', '据了解', '据悉', '事件', '事故', '案件',
505
+ '透露', '说明', '强调', '要求', '建议', '认为']
506
 
507
+ for keyword in news_keywords_chinese:
508
+ if keyword in content:
509
  score += 2
510
 
511
+ # Check for Chinese text
512
+ if re.search(r'[\u4e00-\u9fff]', content):
513
+ score += 20
514
+
515
+ # Source bonus
516
+ source = result.get("source", "")
517
+ if "readability" in source:
518
+ score += 10
519
 
520
+ return score
521
 
522
  # ==============================================
523
  # INITIALIZE
524
  # ==============================================
525
 
526
# Module-level singleton extractor, shared by the API routes and the Gradio UI.
extractor = NewsArticleExtractor()
527
 
528
  # ==============================================
529
  # FASTAPI APP
530
  # ==============================================
531
 
532
# FastAPI application exposing the extraction service over HTTP.
fastapi_app = FastAPI(
    title="News Article Extractor",
    description="Extracts news articles using readability-lxml",
    version="4.0"
)
537
 
538
  from fastapi.middleware.cors import CORSMiddleware
 
549
@fastapi_app.get("/")
async def root():
    """Service landing endpoint: returns API metadata and a route summary."""
    endpoint_docs = {
        "GET /": "This info",
        "GET /health": "Health check",
        "POST /extract": "Extract article content"
    }
    return {
        "service": "News Article Extractor",
        "version": "4.0",
        "description": "Extracts news articles using multiple methods including readability-lxml",
        "endpoints": endpoint_docs
    }
561
 
 
564
  return {
565
  "status": "healthy",
566
  "timestamp": time.time(),
567
+ "service": "article_extractor"
568
  }
569
 
570
  @fastapi_app.post("/extract")
 
580
  content={"success": False, "error": "URL is required"}
581
  )
582
 
583
+ logger.info(f"📰 API Request: {url}")
 
584
 
585
  start_time = time.time()
586
+ result = extractor.extract_article(url)
587
  elapsed = time.time() - start_time
588
 
589
+ logger.info(f" Extraction completed in {elapsed:.2f}s")
590
+ logger.info(f" Success: {result.get('success')}")
591
+ logger.info(f" Content length: {result.get('content_length', 0)}")
592
+ logger.info(f" Method used: {result.get('method', 'unknown')}")
593
 
594
  return result
595
 
 
599
  content={"success": False, "error": "Invalid JSON"}
600
  )
601
  except Exception as e:
602
+ logger.error(f"API Error: {traceback.format_exc()}")
603
  return JSONResponse(
604
  status_code=500,
605
  content={
 
617
  if not url:
618
  return "❌ 请输入URL", {}
619
 
620
+ result = extractor.extract_article(url)
621
 
622
  if result["success"]:
623
  content = result["main_content"]
624
  title = result.get("title", "无标题")
 
625
 
626
+ # Format output nicely
627
+ output = f"""## 📰 {title}
628
+
629
+ **URL:** {result['url']}
630
+ **提取方法:** {result.get('method', '未知')}
631
+ **提取时间:** {result['execution_time']}秒
632
+ **内容长度:** {result['content_length']}字符
633
+
634
+ ---
635
+
636
+ {content}
637
 
638
+ ---
 
 
 
 
639
 
640
+ *提取完成于 {time.strftime('%Y-%m-%d %H:%M:%S')}*
 
641
  """
642
  return output, result
643
  else:
644
  error = result.get("error", "未知错误")
645
+ return f"## ❌ 提取失败\n\n**错误:** {error}\n\n**URL:** {result.get('url', '未知')}", result
646
 
647
  # Create Gradio interface
648
# Gradio UI: one URL textbox in, rendered-Markdown article + raw JSON dict out.
# Labels and example URLs target Chinese-language news sites (Sin Chew, Zaobao).
gradio_interface = gr.Interface(
    fn=gradio_extract,
    inputs=gr.Textbox(
        label="新闻文章URL",
        placeholder="https://example.com/news/article",
        value="https://northern.sinchew.com.my/?p=7217886"
    ),
    outputs=[
        gr.Markdown(label="文章内容"),
        gr.JSON(label="原始数据")
    ],
    title="📰 新闻文章提取器 v4.0",
    description="使用readability-lxml提取新闻文章主要内容",
    examples=[
        ["https://northern.sinchew.com.my/?p=7217886"],
        ["https://www.sinchew.com.my/?p=7234965"],
        ["https://www.zaobao.com.sg/realtime/china/story20250127-1525893"]
    ]
)
667
 
 
677
 
678
  if __name__ == "__main__":
679
  print("\n" + "="*60)
680
+ print("📰 新闻文章提取器 v4.0 启动")
681
  print("="*60)
682
  print("特性:")
683
+ print("• 使用readability-lxml进行智能文章提取")
684
+ print("• 多种提取方法备用")
685
+ print("• 专门优化中文新闻网站")
686
+ print("• 自动内容评分系统")
687
  print("="*60)
688
  print("API端点:")
689
  print("• GET /health - 健康检查")
690
+ print("• POST /extract - 提取文章内容")
691
  print("="*60 + "\n")
692
 
693
  uvicorn.run(