Spaces:

yukee1992
/

Screenshot-scraper

Sleeping

App Files Files Community

yukee1992 committed on Jan 25

Commit

40f056b

verified ·

1 Parent(s): f9380bf

Update app.py

Browse files

Files changed (1) hide show

app.py +372 -186

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # ==============================================
-# SMART CONTENT EXTRACTOR FOR N8N
 # ==============================================
 import gradio as gr
@@ -8,22 +8,56 @@ import json
 import time
 import re
 import html
-from typing import Dict, Any
 from fastapi import FastAPI, Request
 import uvicorn
 # ==============================================
-# SMART CONTENT EXTRACTOR
 # ==============================================
-class SmartContentExtractor:
-    """Extracts only main content, removes navigation, ads, footers, etc."""
     def __init__(self):
-        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
     def extract_content(self, url: str) -> Dict[str, Any]:
-        """Extract only main content from webpage"""
         start_time = time.time()
         print(f"🌐 Extracting content from: {url}")
@@ -33,41 +67,39 @@ class SmartContentExtractor:
             url = 'https://' + url
         try:
-            # Fetch the page
-            headers = {
-                'User-Agent': self.user_agent,
-                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-                'Accept-Language': 'en-US,en;q=0.9',
-            }
-            response = requests.get(url, headers=headers, timeout=15)
             response.raise_for_status()
-            # Get encoding
-            if response.encoding is None:
-                response.encoding = 'utf-8'
-            html_content = response.text
-            # Extract only main content
-            main_content = self._extract_main_content(html_content)
-            # Clean content
-            cleaned_content = self._clean_content(main_content)
-            # Extract title (separately)
-            title = self._extract_title(html_content)
             # Extract metadata
-            metadata = self._extract_metadata(html_content)
             return {
                 "success": True,
                 "url": url,
                 "title": title,
-                "main_content": cleaned_content[:20000],  # Limit to 20k chars
                 "content_length": len(cleaned_content),
-                "content_preview": cleaned_content[:500] + ("..." if len(cleaned_content) > 500 else ""),
                 "metadata": metadata,
                 "status_code": response.status_code,
                 "execution_time": round(time.time() - start_time, 2)
@@ -81,81 +113,75 @@ class SmartContentExtractor:
                 "execution_time": round(time.time() - start_time, 2)
             }
-    def _extract_main_content(self, html_content: str) -> str:
-        """Extract only the main content, removing navigation, ads, footers, etc."""
-        # Remove unwanted sections first
-        html_content = self._remove_unwanted_sections(html_content)
-        # Try to extract using regex patterns for main content
-        content = self._extract_with_regex(html_content)
-        # If we got decent content, return it
-        if len(content.strip()) > 200:
-            return content
-        # Fallback: remove all HTML tags and get text
-        return self._extract_all_text(html_content)
-    def _remove_unwanted_sections(self, html_content: str) -> str:
-        """Remove navigation, ads, footers, sidebars, etc."""
-        # Patterns to remove
-        unwanted_patterns = [
-            # Navigation
-            r'<nav[^>]*>.*?</nav>',
-            r'<header[^>]*>.*?</header>',
-            # Footers
-            r'<footer[^>]*>.*?</footer>',
-            # Sidebars
-            r'<aside[^>]*>.*?</aside>',
-            r'<div[^>]*class="[^"]*sidebar[^"]*"[^>]*>.*?</div>',
-            # Ads and banners
-            r'<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>',
-            r'<div[^>]*class="[^"]*banner[^"]*"[^>]*>.*?</div>',
-            r'<ins[^>]*>.*?</ins>',
-            # Social media/widgets
-            r'<div[^>]*class="[^"]*social[^"]*"[^>]*>.*?</div>',
-            r'<div[^>]*class="[^"]*widget[^"]*"[^>]*>.*?</div>',
-            # Comments
-            r'<div[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</div>',
-            # Related content
-            r'<div[^>]*class="[^"]*related[^"]*"[^>]*>.*?</div>',
-            # Scripts and styles
-            r'<script[^>]*>.*?</script>',
-            r'<style[^>]*>.*?</style>',
-            r'<!--.*?-->',
-        ]
-        cleaned_html = html_content
-        for pattern in unwanted_patterns:
-            cleaned_html = re.sub(pattern, ' ', cleaned_html, flags=re.DOTALL | re.IGNORECASE)
-        return cleaned_html
-    def _extract_with_regex(self, html_content: str) -> str:
-        """Extract content using regex patterns"""
         content_patterns = [
-            # Look for article tags
             r'<article[^>]*>(.*?)</article>',
-            # Look for main tags
             r'<main[^>]*>(.*?)</main>',
-            # Look for divs with content classes
-            r'<div[^>]*class="[^"]*(post-content|article-content|entry-content|story-content)[^"]*"[^>]*>(.*?)</div>',
-            r'<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>',
-            # Look for section with content
-            r'<section[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</section>',
         ]
         all_content = []
@@ -182,138 +208,268 @@ class SmartContentExtractor:
             return combined
-        return ""
-    def _extract_all_text(self, html_content: str) -> str:
-        """Extract all text as fallback, but clean it well"""
-        # Remove scripts, styles, comments first
         html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
         html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
         html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
-        # Remove common unwanted tags
-        unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'ins']
         for tag in unwanted_tags:
             html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
-        # Remove HTML tags
         text = re.sub(r'<[^>]+>', ' ', html_content)
         # Decode HTML entities
         text = html.unescape(text)
-        # Remove very short lines
-        lines = text.split('\n')
-        filtered_lines = []
-        for line in lines:
-            line = line.strip()
-            if len(line) > 30:  # Only keep lines longer than 30 chars
-                filtered_lines.append(line)
-            elif any(word in line.lower() for word in ['home', 'about', 'contact', 'login', 'sign up', 'search']):
-                continue
-        return '\n\n'.join(filtered_lines)
-    def _clean_content(self, content: str) -> str:
-        """Clean and normalize the extracted content"""
         if not content:
             return ""
         # Replace multiple whitespace with single space
         content = re.sub(r'\s+', ' ', content)
-        # Remove control characters
         content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
-        # Remove common unwanted phrases
         unwanted_phrases = [
             r'sign up for our newsletter',
             r'subscribe to our newsletter',
             r'follow us on',
-            r'like us on facebook',
-            r'follow us on twitter',
             r'share this article',
             r'read more',
             r'continue reading',
             r'advertisement',
-            r'sponsored content',
-            r'related articles',
-            r'you may also like',
             r'click here',
             r'learn more',
         ]
         for phrase in unwanted_phrases:
             content = re.sub(phrase, '', content, flags=re.IGNORECASE)
-        # Remove email addresses
         content = re.sub(r'\S+@\S+\.\S+', '', content)
-        # Remove URLs
         content = re.sub(r'https?://\S+', '', content)
-        # Remove excessive punctuation
-        content = re.sub(r'[.!?]{3,}', '.', content)
-        # Normalize spaces around punctuation
-        content = re.sub(r'\s+([.,!?;:])', r'\1', content)
-        content = re.sub(r'([.,!?;:])\s+', r'\1 ', content)
-        # Split into paragraphs and filter
-        sentences = content.split('. ')
         clean_sentences = []
         for sentence in sentences:
             sentence = sentence.strip()
-            if len(sentence) < 5:
                 continue
-            if len(sentence) > 30:
-                clean_sentences.append(sentence)
-        # Join back with proper spacing
-        content = '. '.join(clean_sentences)
         return content.strip()
-    def _extract_title(self, html_content: str) -> str:
-        """Extract page title"""
-        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
         if title_match:
             title = title_match.group(1)
             title = re.sub(r'\s+', ' ', title).strip()
-            return title[:200]
-        return "No title found"
     def _extract_metadata(self, html_content: str) -> Dict[str, str]:
-        """Extract basic metadata"""
         metadata = {}
-        # Meta description
-        desc_match = re.search(r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
-                              html_content, re.IGNORECASE)
-        if desc_match:
-            metadata['description'] = desc_match.group(1)[:300]
-        # Meta keywords
         keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
                                   html_content, re.IGNORECASE)
         if keywords_match:
-            metadata['keywords'] = keywords_match.group(1)[:300]
         # Author
         author_match = re.search(r'<meta[^>]*name=["\']author["\'][^>]*content=["\'](.*?)["\']',
                                 html_content, re.IGNORECASE)
         if author_match:
-            metadata['author'] = author_match.group(1)[:200]
         return metadata
 # ==============================================
 # INITIALIZE
 # ==============================================
-extractor = SmartContentExtractor()
 # ==============================================
 # FASTAPI APP
@@ -321,9 +477,9 @@ extractor = SmartContentExtractor()
 # Create FastAPI app
 fastapi_app = FastAPI(
-    title="Smart Content Extractor",
-    description="Extracts only main content from webpages",
-    version="1.0"
 )
 # Add CORS middleware
@@ -341,13 +497,17 @@ fastapi_app.add_middleware(
 @fastapi_app.get("/")
 async def root():
     return {
-        "service": "Smart Content Extractor",
-        "version": "1.0",
-        "description": "Extracts only main content from webpages",
         "endpoints": {
-            "GET /": "This info",
-            "GET /health": "Health check",
-            "POST /extract": "Extract main content (for n8n)"
         }
     }
@@ -360,7 +520,7 @@ async def health():
 @fastapi_app.post("/extract")
 async def api_extract(request: Request):
-    """API endpoint for n8n - extracts only main content"""
     try:
         body = await request.json()
         url = body.get("url", "").strip()
@@ -368,10 +528,10 @@ async def api_extract(request: Request):
         if not url:
             return JSONResponse(
                 status_code=400,
-                content={"success": False, "error": "URL parameter is required"}
             )
-        print(f"📨 Content extraction request: {url}")
         result = extractor.extract_content(url)
         return result
@@ -379,12 +539,12 @@ async def api_extract(request: Request):
     except json.JSONDecodeError:
         return JSONResponse(
             status_code=400,
-            content={"success": False, "error": "Invalid JSON payload"}
         )
     except Exception as e:
         return JSONResponse(
             status_code=500,
-            content={"success": False, "error": f"Internal error: {str(e)}"}
         )
 # ==============================================
@@ -394,24 +554,46 @@ async def api_extract(request: Request):
 def gradio_extract(url: str):
     """Gradio interface function"""
     if not url:
-        return "❌ Please enter a URL", {}
     result = extractor.extract_content(url)
     if result["success"]:
         content = result["main_content"]
         content_length = result["content_length"]
         # Create preview
-        preview = content[:500]
-        if len(content) > 500:
-            preview += "..."
-        output = f"""
 ## ✅ Content Extracted Successfully!
 **URL:** {result['url']}
-**Title:** {result.get('title', 'N/A')}
 **Time:** {result['execution_time']}s
 **Content Length:** {content_length:,} characters
@@ -420,26 +602,29 @@ def gradio_extract(url: str):
 """
         return output, result
     else:
-        return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
-# Create Gradio interface (removed allow_flagging parameter)
 gradio_interface = gr.Interface(
     fn=gradio_extract,
     inputs=gr.Textbox(
-        label="Website URL",
-        placeholder="https://example.com",
-        value="https://en.wikipedia.org/wiki/Artificial_intelligence"
     ),
     outputs=[
-        gr.Markdown(label="Result"),
-        gr.JSON(label="API Response")
     ],
-    title="🧠 Smart Content Extractor for n8n",
-    description="Extracts ONLY main content - removes navigation, ads, footers, sidebars, etc.",
     examples=[
-        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
         ["https://example.com"],
-        ["https://news.ycombinator.com"]
     ]
 )
@@ -456,10 +641,11 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
 if __name__ == "__main__":
     print("\n" + "="*60)
-    print("🧠 Smart Content Extractor Starting")
     print("="*60)
-    print("API Endpoint: POST /extract")
-    print("Web Interface: GET /")
     print("="*60 + "\n")
     uvicorn.run(

 # ==============================================
+# SMART CONTENT EXTRACTOR FOR CHINESE WEBSITES
 # ==============================================
 import gradio as gr
 import time
 import re
 import html
+import chardet
+from typing import Dict, Any, Optional
 from fastapi import FastAPI, Request
 import uvicorn
 # ==============================================
+# ENHANCED CONTENT EXTRACTOR FOR CHINESE
 # ==============================================
+class ChineseContentExtractor:
+    """Enhanced content extractor optimized for Chinese websites"""
     def __init__(self):
+        # Chinese browser user agents
+        self.user_agents = [
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/120.0",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        ]
+        # Common Chinese website patterns
+        self.chinese_site_patterns = [
+            r'\.cn$',
+            r'\.com\.cn$',
+            r'baidu\.com',
+            r'qq\.com',
+            r'sina\.com\.cn',
+            r'sohu\.com',
+            r'163\.com',
+            r'jd\.com',
+            r'taobao\.com',
+            r'alibaba\.com',
+            r'zhihu\.com',
+            r'bilibili\.com',
+            r'weibo\.com',
+            r'douyin\.com',
+            r'douban\.com',
+            r'ximalaya\.com',
+        ]
+    def is_chinese_website(self, url: str) -> bool:
+        """Check if URL is a Chinese website"""
+        for pattern in self.chinese_site_patterns:
+            if re.search(pattern, url, re.IGNORECASE):
+                return True
+        return False
     def extract_content(self, url: str) -> Dict[str, Any]:
+        """Extract content with Chinese website support"""
         start_time = time.time()
         print(f"🌐 Extracting content from: {url}")
             url = 'https://' + url
         try:
+            # Determine if Chinese website
+            is_chinese = self.is_chinese_website(url)
+            # Fetch the page with appropriate settings
+            response = self._fetch_with_encoding(url, is_chinese)
             response.raise_for_status()
+            # Get correct encoding
+            content, encoding = self._decode_content(response)
+            # Extract main content
+            main_content = self._extract_main_content(content, is_chinese)
+            # Clean content (preserve Chinese characters)
+            cleaned_content = self._clean_content(main_content, is_chinese)
             # Extract metadata
+            title = self._extract_title(content, encoding)
+            metadata = self._extract_metadata(content)
+            # Calculate content stats
+            chinese_char_count = self._count_chinese_characters(cleaned_content)
             return {
                 "success": True,
                 "url": url,
+                "is_chinese_website": is_chinese,
                 "title": title,
+                "main_content": cleaned_content[:25000],  # Increased limit for Chinese
                 "content_length": len(cleaned_content),
+                "chinese_char_count": chinese_char_count,
+                "encoding_used": encoding,
+                "content_preview": cleaned_content[:800] + ("..." if len(cleaned_content) > 800 else ""),
                 "metadata": metadata,
                 "status_code": response.status_code,
                 "execution_time": round(time.time() - start_time, 2)
                 "execution_time": round(time.time() - start_time, 2)
             }
+    def _fetch_with_encoding(self, url: str, is_chinese: bool) -> requests.Response:
+        """Fetch webpage with proper encoding handling"""
+        headers = {
+            'User-Agent': self.user_agents[0],
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8' if is_chinese else 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate',
+        }
+        # Add Chinese-specific headers
+        if is_chinese:
+            headers.update({
+                'Accept-Language': 'zh-CN,zh;q=0.9',
+                'Cache-Control': 'no-cache',
+            })
+        response = requests.get(url, headers=headers, timeout=15)
+        return response
+    def _decode_content(self, response: requests.Response) -> tuple[str, str]:
+        """Decode content with proper encoding detection"""
+        # Try to detect encoding
+        if response.encoding:
+            encoding = response.encoding.lower()
+        else:
+            # Use chardet to detect encoding
+            detected = chardet.detect(response.content)
+            encoding = detected.get('encoding', 'utf-8').lower()
+        # Handle common Chinese encodings
+        if encoding in ['gb2312', 'gbk', 'gb18030']:
+            encoding = 'gb18030'  # Most comprehensive Chinese encoding
+        elif encoding == 'big5':
+            encoding = 'big5'  # Traditional Chinese
+        else:
+            encoding = 'utf-8'  # Default to UTF-8
+        try:
+            content = response.content.decode(encoding, errors='replace')
+        except:
+            # Fallback to UTF-8 with error replacement
+            content = response.content.decode('utf-8', errors='replace')
+            encoding = 'utf-8'
+        return content, encoding
+    def _extract_main_content(self, html_content: str, is_chinese: bool) -> str:
+        """Extract main content with Chinese website optimizations"""
+        # Remove unwanted sections
+        html_content = self._remove_unwanted_sections(html_content, is_chinese)
+        # Chinese websites often have specific content patterns
         content_patterns = [
+            # Common Chinese content containers
+            r'<div[^>]*class="[^"]*(content|article|post|正文|内容)[^"]*"[^>]*>(.*?)</div>',
+            r'<div[^>]*id="[^"]*(content|article|post|正文|内容)[^"]*"[^>]*>(.*?)</div>',
+            # Article tags
             r'<article[^>]*>(.*?)</article>',
+            # Main content area
             r'<main[^>]*>(.*?)</main>',
+            # Chinese specific patterns
+            r'<div[^>]*class="[^"]*(detail|content-main|article-content)[^"]*"[^>]*>(.*?)</div>',
+            # For news websites
+            r'<div[^>]*class="[^"]*(news-content|news-body|news-article)[^"]*"[^>]*>(.*?)</div>',
         ]
         all_content = []
             return combined
+        # Fallback: extract all text and clean
+        return self._extract_all_text(html_content, is_chinese)
+    def _remove_unwanted_sections(self, html_content: str, is_chinese: bool) -> str:
+        """Remove unwanted sections with Chinese-specific patterns"""
+        # Base patterns
+        unwanted_patterns = [
+            # Navigation
+            r'<nav[^>]*>.*?</nav>',
+            r'<header[^>]*>.*?</header>',
+            # Footers
+            r'<footer[^>]*>.*?</footer>',
+            # Sidebars
+            r'<aside[^>]*>.*?</aside>',
+            # Ads
+            r'<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>',
+            r'<ins[^>]*>.*?</ins>',
+            # Scripts and styles
+            r'<script[^>]*>.*?</script>',
+            r'<style[^>]*>.*?</style>',
+            r'<!--.*?-->',
+        ]
+        # Chinese-specific unwanted patterns
+        if is_chinese:
+            chinese_patterns = [
+                # Chinese navigation/menus (导航, 菜单)
+                r'<div[^>]*class="[^"]*(导航|菜单|nav)[^"]*"[^>]*>.*?</div>',
+                r'<ul[^>]*class="[^"]*(导航|菜单)[^"]*"[^>]*>.*?</ul>',
+                # Sidebars (侧边栏)
+                r'<div[^>]*class="[^"]*(侧边栏|sidebar)[^"]*"[^>]*>.*?</div>',
+                # Comments (评论)
+                r'<div[^>]*class="[^"]*(评论|comment)[^"]*"[^>]*>.*?</div>',
+                # Related articles (相关文章)
+                r'<div[^>]*class="[^"]*(相关|related)[^"]*"[^>]*>.*?</div>',
+                # Hot posts (热门)
+                r'<div[^>]*class="[^"]*(热门|hot)[^"]*"[^>]*>.*?</div>',
+                # Recommendations (推荐)
+                r'<div[^>]*class="[^"]*(推荐|recommend)[^"]*"[^>]*>.*?</div>',
+                # Share buttons (分享)
+                r'<div[^>]*class="[^"]*(分享|share)[^"]*"[^>]*>.*?</div>',
+            ]
+            unwanted_patterns.extend(chinese_patterns)
+        cleaned_html = html_content
+        for pattern in unwanted_patterns:
+            cleaned_html = re.sub(pattern, ' ', cleaned_html, flags=re.DOTALL | re.IGNORECASE)
+        return cleaned_html
+    def _extract_all_text(self, html_content: str, is_chinese: bool) -> str:
+        """Extract all text with Chinese character preservation"""
+        # Remove scripts, styles, comments
         html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
         html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
         html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
+        # Remove unwanted tags
+        unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'ins', 'meta', 'link']
         for tag in unwanted_tags:
             html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
+        # Remove HTML tags but preserve text
         text = re.sub(r'<[^>]+>', ' ', html_content)
         # Decode HTML entities
         text = html.unescape(text)
+        # Chinese-specific cleaning
+        if is_chinese:
+            # Keep Chinese text blocks
+            lines = text.split('\n')
+            filtered_lines = []
+            for line in lines:
+                line = line.strip()
+                # Keep lines with significant Chinese content
+                chinese_chars = self._count_chinese_characters(line)
+                if chinese_chars > 5 or len(line) > 50:
+                    filtered_lines.append(line)
+            text = '\n\n'.join(filtered_lines)
+        return text
+    def _clean_content(self, content: str, is_chinese: bool) -> str:
+        """Clean content while preserving Chinese characters"""
         if not content:
             return ""
         # Replace multiple whitespace with single space
         content = re.sub(r'\s+', ' ', content)
+        # Remove control characters but preserve Chinese/Unicode
         content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
+        # Remove unwanted phrases (both English and Chinese)
         unwanted_phrases = [
+            # English
             r'sign up for our newsletter',
             r'subscribe to our newsletter',
             r'follow us on',
             r'share this article',
             r'read more',
             r'continue reading',
             r'advertisement',
             r'click here',
             r'learn more',
+            # Chinese
+            r'订阅我们的新闻',
+            r'关注我们',
+            r'分享这篇文章',
+            r'阅读更多',
+            r'继续阅读',
+            r'广告',
+            r'点击这里',
+            r'了解更多',
+            r'相关文章',
+            r'热门推荐',
+            r'猜你喜欢',
         ]
         for phrase in unwanted_phrases:
             content = re.sub(phrase, '', content, flags=re.IGNORECASE)
+        # Remove email addresses and URLs
         content = re.sub(r'\S+@\S+\.\S+', '', content)
         content = re.sub(r'https?://\S+', '', content)
+        # For Chinese content, clean differently
+        if is_chinese:
+            # Remove excessive punctuation but preserve Chinese punctuation
+            content = re.sub(r'[。！？]{3,}', '。', content)
+            content = re.sub(r'[\.,!?]{3,}', '.', content)
+            # Normalize Chinese punctuation spacing
+            content = re.sub(r'\s+([。，！？；：])', r'\1', content)
+            content = re.sub(r'([。，！？；：])\s+', r'\1', content)
+        else:
+            # Normalize English punctuation spacing
+            content = re.sub(r'\s+([.,!?;:])', r'\1', content)
+            content = re.sub(r'([.,!?;:])\s+', r'\1 ', content)
+        # Split and filter paragraphs
+        if is_chinese:
+            # Split by Chinese sentence endings
+            sentences = re.split(r'[。！？]', content)
+        else:
+            # Split by English sentence endings
+            sentences = re.split(r'[.!?]', content)
         clean_sentences = []
         for sentence in sentences:
             sentence = sentence.strip()
+            if not sentence:
                 continue
+            # Keep sentences with meaningful content
+            if is_chinese:
+                chinese_chars = self._count_chinese_characters(sentence)
+                if chinese_chars > 3 or len(sentence) > 20:
+                    clean_sentences.append(sentence)
+            else:
+                if len(sentence) > 20:
+                    clean_sentences.append(sentence)
+        # Join back with appropriate punctuation
+        if is_chinese:
+            content = '。'.join(clean_sentences) + ('。' if clean_sentences else '')
+        else:
+            content = '. '.join(clean_sentences) + ('.' if clean_sentences else '')
         return content.strip()
+    def _extract_title(self, html_content: str, encoding: str) -> str:
+        """Extract page title with encoding support"""
+        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
         if title_match:
             title = title_match.group(1)
             title = re.sub(r'\s+', ' ', title).strip()
+            # Ensure title is properly decoded
+            try:
+                title = html.unescape(title)
+            except:
+                pass
+            return title[:300]
+        return "未找到标题" if 'gb' in encoding or 'big5' in encoding else "No title found"
     def _extract_metadata(self, html_content: str) -> Dict[str, str]:
+        """Extract metadata including Chinese meta tags"""
         metadata = {}
+        # Meta description (supports both English and Chinese)
+        desc_patterns = [
+            r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
+            r'<meta[^>]*property=["\']og:description["\'][^>]*content=["\'](.*?)["\']',
+        ]
+        for pattern in desc_patterns:
+            match = re.search(pattern, html_content, re.IGNORECASE)
+            if match:
+                metadata['description'] = html.unescape(match.group(1))[:500]
+                break
+        # Keywords
         keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
                                   html_content, re.IGNORECASE)
         if keywords_match:
+            metadata['keywords'] = html.unescape(keywords_match.group(1))[:500]
         # Author
         author_match = re.search(r'<meta[^>]*name=["\']author["\'][^>]*content=["\'](.*?)["\']',
                                 html_content, re.IGNORECASE)
         if author_match:
+            metadata['author'] = html.unescape(author_match.group(1))[:200]
+        # Charset
+        charset_match = re.search(r'<meta[^>]*charset=["\']([^"\']+)["\']', html_content, re.IGNORECASE)
+        if charset_match:
+            metadata['charset'] = charset_match.group(1)
         return metadata
+    def _count_chinese_characters(self, text: str) -> int:
+        """Count Chinese characters in text"""
+        # Chinese character ranges in Unicode
+        chinese_ranges = [
+            (0x4E00, 0x9FFF),    # CJK Unified Ideographs
+            (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
+            (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
+            (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
+            (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
+            (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
+            (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
+            (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
+        ]
+        count = 0
+        for char in text:
+            char_code = ord(char)
+            for start, end in chinese_ranges:
+                if start <= char_code <= end:
+                    count += 1
+                    break
+        return count
 # ==============================================
 # INITIALIZE
 # ==============================================
+extractor = ChineseContentExtractor()
 # ==============================================
 # FASTAPI APP
 # Create FastAPI app
 fastapi_app = FastAPI(
+    title="智能内容提取器 (中文网站优化)",
+    description="专门优化中文网站的内容提取器，去除导航、广告、页脚等无关内容",
+    version="2.0"
 )
 # Add CORS middleware
 @fastapi_app.get("/")
 async def root():
     """Describe the service: name, version, endpoints and example URLs."""
     endpoint_docs = {
         "GET /": "API信息",
         "GET /health": "健康检查",
         "POST /extract": "提取主要内容 (n8n专用)",
     }
     sample_urls = {
         "中文网站": "https://zhihu.com",
         "英文网站": "https://example.com",
     }
     service_info = {
         "service": "智能内容提取器",
         "version": "2.0",
         "description": "专门优化中文网站的内容提取器",
         "endpoints": endpoint_docs,
         "examples": sample_urls,
     }
     return service_info
 @fastapi_app.post("/extract")
 async def api_extract(request: Request):
+    """API endpoint for n8n - optimized for Chinese websites"""
     try:
         body = await request.json()
         url = body.get("url", "").strip()
         if not url:
             return JSONResponse(
                 status_code=400,
+                content={"success": False, "error": "URL参数是必需的"}
             )
+        print(f"📨 内容提取请求: {url}")
         result = extractor.extract_content(url)
         return result
     except json.JSONDecodeError:
         return JSONResponse(
             status_code=400,
+            content={"success": False, "error": "无效的JSON数据"}
         )
     except Exception as e:
         return JSONResponse(
             status_code=500,
+            content={"success": False, "error": f"内部错误: {str(e)}"}
         )
 # ==============================================
 def gradio_extract(url: str):
     """Gradio interface function"""
     if not url:
+        return "❌ 请输入URL", {}
     result = extractor.extract_content(url)
     if result["success"]:
         content = result["main_content"]
         content_length = result["content_length"]
+        is_chinese = result.get("is_chinese_website", False)
         # Create preview
+        if is_chinese:
+            # For Chinese, show first 600 characters
+            preview = content[:600]
+            if len(content) > 600:
+                preview += "..."
+        else:
+            # For English, show first 500 characters
+            preview = content[:500]
+            if len(content) > 500:
+                preview += "..."
+        if is_chinese:
+            output = f"""
+## ✅ 内容提取成功！
+**网址:** {result['url']}
+**标题:** {result.get('title', '无标题')}
+**时间:** {result['execution_time']}秒
+**内容长度:** {content_length:,} 字符
+**中文字符数:** {result.get('chinese_char_count', 0):,}
+### 内容预览:
+{preview}
+"""
+        else:
+            output = f"""
 ## ✅ Content Extracted Successfully!
 **URL:** {result['url']}
+**Title:** {result.get('title', 'No title')}
 **Time:** {result['execution_time']}s
 **Content Length:** {content_length:,} characters
 """
         return output, result
     else:
+        error_msg = result.get("error", "未知错误")
+        return f"## ❌ 错误\n\n{error_msg}", result
+# Create Gradio interface
 gradio_interface = gr.Interface(
     fn=gradio_extract,
     inputs=gr.Textbox(
+        label="网站网址 / Website URL",
+        placeholder="请输入网址 (如: https://zhihu.com)",
+        value="https://zhihu.com"
     ),
     outputs=[
+        gr.Markdown(label="结果 / Result"),
+        gr.JSON(label="API响应 / API Response")
     ],
+    title="🧠 智能内容提取器 (中文优化) / Smart Content Extractor (Chinese Optimized)",
+    description="专门优化中文网站的内容提取器，去除导航、广告、页脚等无关内容 / Optimized for Chinese websites, removes navigation, ads, footers, etc.",
     examples=[
+        ["https://zhihu.com"],
+        ["https://baidu.com"],
+        ["https://news.sina.com.cn"],
         ["https://example.com"],
+        ["https://en.wikipedia.org/wiki/Artificial_intelligence"]
     ]
 )
 if __name__ == "__main__":
     print("\n" + "="*60)
+    print("🧠 智能内容提取器启动中...")
+    print("Smart Content Extractor Starting...")
     print("="*60)
+    print("API端点 / API Endpoint: POST /extract")
+    print("网页界面 / Web Interface: GET /")
     print("="*60 + "\n")
     uvicorn.run(