yukee1992 commited on
Commit
ba2f5fc
·
verified ·
1 Parent(s): 92bcfa2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +334 -467
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # SMART CONTENT EXTRACTOR FOR CHINESE WEBSITES
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -8,481 +8,363 @@ import json
8
  import time
9
  import re
10
  import html
11
- import chardet
12
- from typing import Dict, Any, Optional
 
 
 
13
  from fastapi import FastAPI, Request
14
  import uvicorn
15
 
16
  # ==============================================
17
- # ENHANCED CONTENT EXTRACTOR FOR CHINESE
18
  # ==============================================
19
 
20
- class ChineseContentExtractor:
21
- """Enhanced content extractor optimized for Chinese websites"""
22
 
23
  def __init__(self):
24
- # Chinese browser user agents
25
- self.user_agents = [
26
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
27
- "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
28
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/120.0",
29
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 
 
 
 
 
 
 
 
30
  ]
31
 
32
- # Common Chinese website patterns
33
- self.chinese_site_patterns = [
34
- r'\.cn$',
35
- r'\.com\.cn$',
36
- r'baidu\.com',
37
- r'qq\.com',
38
- r'sina\.com\.cn',
39
- r'sohu\.com',
40
- r'163\.com',
41
- r'jd\.com',
42
- r'taobao\.com',
43
- r'alibaba\.com',
44
- r'zhihu\.com',
45
- r'bilibili\.com',
46
- r'weibo\.com',
47
- r'douyin\.com',
48
- r'douban\.com',
49
- r'ximalaya\.com',
50
  ]
51
 
52
- def is_chinese_website(self, url: str) -> bool:
53
- """Check if URL is a Chinese website"""
54
- for pattern in self.chinese_site_patterns:
55
- if re.search(pattern, url, re.IGNORECASE):
56
- return True
57
- return False
58
-
59
  def extract_content(self, url: str) -> Dict[str, Any]:
60
- """Extract content with Chinese website support"""
61
  start_time = time.time()
62
 
63
- print(f"🌐 Extracting content from: {url}")
64
 
65
  # Ensure URL has protocol
66
  if not url.startswith(('http://', 'https://')):
67
  url = 'https://' + url
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  try:
70
- # Determine if Chinese website
71
- is_chinese = self.is_chinese_website(url)
 
 
 
72
 
73
- # Fetch the page with appropriate settings
74
- response = self._fetch_with_encoding(url, is_chinese)
75
- response.raise_for_status()
76
 
77
- # Get correct encoding
78
- content, encoding = self._decode_content(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # Extract main content
81
- main_content = self._extract_main_content(content, is_chinese)
82
 
83
- # Clean content (preserve Chinese characters)
84
- cleaned_content = self._clean_content(main_content, is_chinese)
 
 
 
 
 
 
85
 
86
- # Extract metadata
87
- title = self._extract_title(content, encoding)
88
- metadata = self._extract_metadata(content)
89
 
90
- # Calculate content stats
91
- chinese_char_count = self._count_chinese_characters(cleaned_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- return {
94
- "success": True,
95
- "url": url,
96
- "is_chinese_website": is_chinese,
97
- "title": title,
98
- "main_content": cleaned_content[:25000], # Increased limit for Chinese
99
- "content_length": len(cleaned_content),
100
- "chinese_char_count": chinese_char_count,
101
- "encoding_used": encoding,
102
- "content_preview": cleaned_content[:800] + ("..." if len(cleaned_content) > 800 else ""),
103
- "metadata": metadata,
104
- "status_code": response.status_code,
105
- "execution_time": round(time.time() - start_time, 2)
106
- }
107
 
108
- except Exception as e:
109
- return {
110
- "success": False,
111
- "url": url,
112
- "error": str(e),
113
- "execution_time": round(time.time() - start_time, 2)
114
- }
115
 
116
- def _fetch_with_encoding(self, url: str, is_chinese: bool) -> requests.Response:
117
- """Fetch webpage with proper encoding handling"""
118
- headers = {
119
- 'User-Agent': self.user_agents[0],
120
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
121
- 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8' if is_chinese else 'en-US,en;q=0.9',
122
- 'Accept-Encoding': 'gzip, deflate',
123
- }
124
 
125
- # Add Chinese-specific headers
126
- if is_chinese:
127
- headers.update({
128
- 'Accept-Language': 'zh-CN,zh;q=0.9',
129
- 'Cache-Control': 'no-cache',
130
- })
 
131
 
132
- response = requests.get(url, headers=headers, timeout=15)
133
- return response
134
 
135
- def _decode_content(self, response: requests.Response) -> tuple[str, str]:
136
- """Decode content with proper encoding detection"""
137
- # Try to detect encoding
138
- if response.encoding:
139
- encoding = response.encoding.lower()
140
- else:
141
- # Use chardet to detect encoding
142
- detected = chardet.detect(response.content)
143
- encoding = detected.get('encoding', 'utf-8').lower()
144
-
145
- # Handle common Chinese encodings
146
- if encoding in ['gb2312', 'gbk', 'gb18030']:
147
- encoding = 'gb18030' # Most comprehensive Chinese encoding
148
- elif encoding == 'big5':
149
- encoding = 'big5' # Traditional Chinese
150
- else:
151
- encoding = 'utf-8' # Default to UTF-8
152
 
153
- try:
154
- content = response.content.decode(encoding, errors='replace')
155
- except:
156
- # Fallback to UTF-8 with error replacement
157
- content = response.content.decode('utf-8', errors='replace')
158
- encoding = 'utf-8'
159
 
160
- return content, encoding
 
 
 
 
 
 
 
 
 
161
 
162
- def _extract_main_content(self, html_content: str, is_chinese: bool) -> str:
163
- """Extract main content with Chinese website optimizations"""
 
 
 
 
164
 
165
- # Remove unwanted sections
166
- html_content = self._remove_unwanted_sections(html_content, is_chinese)
167
 
168
- # Chinese websites often have specific content patterns
169
- content_patterns = [
170
- # Common Chinese content containers
171
- r'<div[^>]*class="[^"]*(content|article|post|正文|内容)[^"]*"[^>]*>(.*?)</div>',
172
- r'<div[^>]*id="[^"]*(content|article|post|正文|内容)[^"]*"[^>]*>(.*?)</div>',
173
 
174
- # Article tags
175
- r'<article[^>]*>(.*?)</article>',
176
-
177
- # Main content area
178
- r'<main[^>]*>(.*?)</main>',
179
-
180
- # Chinese specific patterns
181
- r'<div[^>]*class="[^"]*(detail|content-main|article-content)[^"]*"[^>]*>(.*?)</div>',
182
-
183
- # For news websites
184
- r'<div[^>]*class="[^"]*(news-content|news-body|news-article)[^"]*"[^>]*>(.*?)</div>',
185
- ]
186
 
187
- all_content = []
188
-
189
- for pattern in content_patterns:
190
- matches = re.findall(pattern, html_content, re.DOTALL | re.IGNORECASE)
191
- for match in matches:
192
- if isinstance(match, tuple):
193
- for group in match:
194
- if group and len(group.strip()) > 50:
195
- all_content.append(group)
196
- elif match and len(match.strip()) > 50:
197
- all_content.append(match)
198
-
199
- if all_content:
200
- # Combine all found content
201
- combined = ' '.join(all_content)
202
-
203
- # Remove any remaining HTML tags
204
- combined = re.sub(r'<[^>]+>', ' ', combined)
205
-
206
- # Decode HTML entities
207
- combined = html.unescape(combined)
208
 
209
- return combined
 
 
 
210
 
211
- # Fallback: extract all text and clean
212
- return self._extract_all_text(html_content, is_chinese)
213
 
214
- def _remove_unwanted_sections(self, html_content: str, is_chinese: bool) -> str:
215
- """Remove unwanted sections with Chinese-specific patterns"""
216
-
217
- # Base patterns
218
- unwanted_patterns = [
219
- # Navigation
220
- r'<nav[^>]*>.*?</nav>',
221
- r'<header[^>]*>.*?</header>',
222
 
223
- # Footers
224
- r'<footer[^>]*>.*?</footer>',
225
 
226
- # Sidebars
227
- r'<aside[^>]*>.*?</aside>',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
- # Ads
230
- r'<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>',
231
- r'<ins[^>]*>.*?</ins>',
232
 
233
- # Scripts and styles
234
- r'<script[^>]*>.*?</script>',
235
- r'<style[^>]*>.*?</style>',
236
- r'<!--.*?-->',
237
- ]
238
-
239
- # Chinese-specific unwanted patterns
240
- if is_chinese:
241
- chinese_patterns = [
242
- # Chinese navigation/menus (导航, 菜单)
243
- r'<div[^>]*class="[^"]*(导航|菜单|nav)[^"]*"[^>]*>.*?</div>',
244
- r'<ul[^>]*class="[^"]*(导航|菜单)[^"]*"[^>]*>.*?</ul>',
245
-
246
- # Sidebars (侧边栏)
247
- r'<div[^>]*class="[^"]*(侧边栏|sidebar)[^"]*"[^>]*>.*?</div>',
248
-
249
- # Comments (评论)
250
- r'<div[^>]*class="[^"]*(评论|comment)[^"]*"[^>]*>.*?</div>',
251
-
252
- # Related articles (相关文章)
253
- r'<div[^>]*class="[^"]*(相关|related)[^"]*"[^>]*>.*?</div>',
254
-
255
- # Hot posts (热门)
256
- r'<div[^>]*class="[^"]*(热门|hot)[^"]*"[^>]*>.*?</div>',
257
-
258
- # Recommendations (推荐)
259
- r'<div[^>]*class="[^"]*(推荐|recommend)[^"]*"[^>]*>.*?</div>',
260
-
261
- # Share buttons (分享)
262
- r'<div[^>]*class="[^"]*(分享|share)[^"]*"[^>]*>.*?</div>',
263
- ]
264
- unwanted_patterns.extend(chinese_patterns)
265
-
266
- cleaned_html = html_content
267
- for pattern in unwanted_patterns:
268
- cleaned_html = re.sub(pattern, ' ', cleaned_html, flags=re.DOTALL | re.IGNORECASE)
269
-
270
- return cleaned_html
271
 
272
- def _extract_all_text(self, html_content: str, is_chinese: bool) -> str:
273
- """Extract all text with Chinese character preservation"""
274
- # Remove scripts, styles, comments
275
  html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
276
  html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
277
- html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
278
 
279
  # Remove unwanted tags
280
- unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'ins', 'meta', 'link']
281
  for tag in unwanted_tags:
282
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
283
 
284
- # Remove HTML tags but preserve text
285
  text = re.sub(r'<[^>]+>', ' ', html_content)
286
-
287
- # Decode HTML entities
288
  text = html.unescape(text)
289
 
290
- # Chinese-specific cleaning
291
- if is_chinese:
292
- # Keep Chinese text blocks
293
- lines = text.split('\n')
294
- filtered_lines = []
295
- for line in lines:
296
- line = line.strip()
297
- # Keep lines with significant Chinese content
298
- chinese_chars = self._count_chinese_characters(line)
299
- if chinese_chars > 5 or len(line) > 50:
300
- filtered_lines.append(line)
301
- text = '\n\n'.join(filtered_lines)
302
-
303
  return text
304
 
305
- def _clean_content(self, content: str, is_chinese: bool) -> str:
306
- """Clean content while preserving Chinese characters"""
307
  if not content:
308
  return ""
309
 
310
- # Replace multiple whitespace with single space
311
  content = re.sub(r'\s+', ' ', content)
312
 
313
- # Remove control characters but preserve Chinese/Unicode
314
  content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
315
 
316
- # Remove unwanted phrases (both English and Chinese)
317
- unwanted_phrases = [
318
- # English
319
- r'sign up for our newsletter',
320
- r'subscribe to our newsletter',
321
- r'follow us on',
322
- r'share this article',
323
- r'read more',
324
- r'continue reading',
325
- r'advertisement',
326
- r'click here',
327
- r'learn more',
328
-
329
- # Chinese
330
- r'订阅我们的新闻',
331
- r'关注我们',
332
- r'分享这篇文章',
333
- r'阅读更多',
334
- r'继续阅读',
335
- r'广告',
336
- r'点击这里',
337
- r'了解更多',
338
- r'相关文章',
339
- r'热门推荐',
340
- r'猜你喜欢',
341
- ]
342
-
343
- for phrase in unwanted_phrases:
344
- content = re.sub(phrase, '', content, flags=re.IGNORECASE)
345
-
346
- # Remove email addresses and URLs
347
- content = re.sub(r'\S+@\S+\.\S+', '', content)
348
- content = re.sub(r'https?://\S+', '', content)
349
-
350
- # For Chinese content, clean differently
351
- if is_chinese:
352
- # Remove excessive punctuation but preserve Chinese punctuation
353
- content = re.sub(r'[。!?]{3,}', '。', content)
354
- content = re.sub(r'[\.,!?]{3,}', '.', content)
355
-
356
- # Normalize Chinese punctuation spacing
357
- content = re.sub(r'\s+([。,!?;:])', r'\1', content)
358
- content = re.sub(r'([。,!?;:])\s+', r'\1', content)
359
- else:
360
- # Normalize English punctuation spacing
361
- content = re.sub(r'\s+([.,!?;:])', r'\1', content)
362
- content = re.sub(r'([.,!?;:])\s+', r'\1 ', content)
363
-
364
- # Split and filter paragraphs
365
- if is_chinese:
366
- # Split by Chinese sentence endings
367
- sentences = re.split(r'[。!?]', content)
368
- else:
369
- # Split by English sentence endings
370
- sentences = re.split(r'[.!?]', content)
371
-
372
- clean_sentences = []
373
- for sentence in sentences:
374
- sentence = sentence.strip()
375
- if not sentence:
376
- continue
377
-
378
- # Keep sentences with meaningful content
379
- if is_chinese:
380
- chinese_chars = self._count_chinese_characters(sentence)
381
- if chinese_chars > 3 or len(sentence) > 20:
382
- clean_sentences.append(sentence)
383
- else:
384
- if len(sentence) > 20:
385
- clean_sentences.append(sentence)
386
-
387
- # Join back with appropriate punctuation
388
- if is_chinese:
389
- content = '。'.join(clean_sentences) + ('。' if clean_sentences else '')
390
- else:
391
- content = '. '.join(clean_sentences) + ('.' if clean_sentences else '')
392
 
393
  return content.strip()
394
-
395
- def _extract_title(self, html_content: str, encoding: str) -> str:
396
- """Extract page title with encoding support"""
397
- title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
398
- if title_match:
399
- title = title_match.group(1)
400
- title = re.sub(r'\s+', ' ', title).strip()
401
- # Ensure title is properly decoded
402
- try:
403
- title = html.unescape(title)
404
- except:
405
- pass
406
- return title[:300]
407
- return "未找到标题" if 'gb' in encoding or 'big5' in encoding else "No title found"
408
-
409
- def _extract_metadata(self, html_content: str) -> Dict[str, str]:
410
- """Extract metadata including Chinese meta tags"""
411
- metadata = {}
412
-
413
- # Meta description (supports both English and Chinese)
414
- desc_patterns = [
415
- r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
416
- r'<meta[^>]*property=["\']og:description["\'][^>]*content=["\'](.*?)["\']',
417
- ]
418
-
419
- for pattern in desc_patterns:
420
- match = re.search(pattern, html_content, re.IGNORECASE)
421
- if match:
422
- metadata['description'] = html.unescape(match.group(1))[:500]
423
- break
424
-
425
- # Keywords
426
- keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
427
- html_content, re.IGNORECASE)
428
- if keywords_match:
429
- metadata['keywords'] = html.unescape(keywords_match.group(1))[:500]
430
-
431
- # Author
432
- author_match = re.search(r'<meta[^>]*name=["\']author["\'][^>]*content=["\'](.*?)["\']',
433
- html_content, re.IGNORECASE)
434
- if author_match:
435
- metadata['author'] = html.unescape(author_match.group(1))[:200]
436
-
437
- # Charset
438
- charset_match = re.search(r'<meta[^>]*charset=["\']([^"\']+)["\']', html_content, re.IGNORECASE)
439
- if charset_match:
440
- metadata['charset'] = charset_match.group(1)
441
-
442
- return metadata
443
-
444
- def _count_chinese_characters(self, text: str) -> int:
445
- """Count Chinese characters in text"""
446
- # Chinese character ranges in Unicode
447
- chinese_ranges = [
448
- (0x4E00, 0x9FFF), # CJK Unified Ideographs
449
- (0x3400, 0x4DBF), # CJK Unified Ideographs Extension A
450
- (0x20000, 0x2A6DF), # CJK Unified Ideographs Extension B
451
- (0x2A700, 0x2B73F), # CJK Unified Ideographs Extension C
452
- (0x2B740, 0x2B81F), # CJK Unified Ideographs Extension D
453
- (0x2B820, 0x2CEAF), # CJK Unified Ideographs Extension E
454
- (0xF900, 0xFAFF), # CJK Compatibility Ideographs
455
- (0x2F800, 0x2FA1F), # CJK Compatibility Ideographs Supplement
456
- ]
457
-
458
- count = 0
459
- for char in text:
460
- char_code = ord(char)
461
- for start, end in chinese_ranges:
462
- if start <= char_code <= end:
463
- count += 1
464
- break
465
-
466
- return count
467
 
468
  # ==============================================
469
  # INITIALIZE
470
  # ==============================================
471
 
472
- extractor = ChineseContentExtractor()
473
 
474
  # ==============================================
475
  # FASTAPI APP
476
  # ==============================================
477
 
478
- # Create FastAPI app
479
  fastapi_app = FastAPI(
480
- title="智能内容提取器 (中文网站优化)",
481
- description="专门优化中文网站的内容提取器,去除导航、广告、页脚等无关内容",
482
- version="2.0"
483
  )
484
 
485
- # Add CORS middleware
486
  from fastapi.middleware.cors import CORSMiddleware
487
  from fastapi.responses import JSONResponse
488
 
@@ -497,30 +379,29 @@ fastapi_app.add_middleware(
497
  @fastapi_app.get("/")
498
  async def root():
499
  return {
500
- "service": "智能内容提取器",
501
- "version": "2.0",
502
- "description": "专门优化中文网站的内容提取器",
503
  "endpoints": {
504
- "GET /": "API信息",
505
- "GET /health": "健康检查",
506
- "POST /extract": "提取主要内容 (n8n专用)"
507
  },
508
- "examples": {
509
- "中文网站": "https://zhihu.com",
510
- "英文网站": "https://example.com"
511
- }
 
 
512
  }
513
 
514
  @fastapi_app.get("/health")
515
  async def health():
516
- return {
517
- "status": "healthy",
518
- "timestamp": time.time()
519
- }
520
 
521
  @fastapi_app.post("/extract")
522
  async def api_extract(request: Request):
523
- """API endpoint for n8n - optimized for Chinese websites"""
524
  try:
525
  body = await request.json()
526
  url = body.get("url", "").strip()
@@ -528,23 +409,23 @@ async def api_extract(request: Request):
528
  if not url:
529
  return JSONResponse(
530
  status_code=400,
531
- content={"success": False, "error": "URL参数是必需的"}
532
  )
533
 
534
- print(f"📨 内容提取请求: {url}")
535
- result = extractor.extract_content(url)
536
 
537
  return result
538
 
539
  except json.JSONDecodeError:
540
  return JSONResponse(
541
  status_code=400,
542
- content={"success": False, "error": "无效的JSON数据"}
543
  )
544
  except Exception as e:
545
  return JSONResponse(
546
  status_code=500,
547
- content={"success": False, "error": f"内部错误: {str(e)}"}
548
  )
549
 
550
  # ==============================================
@@ -552,79 +433,63 @@ async def api_extract(request: Request):
552
  # ==============================================
553
 
554
  def gradio_extract(url: str):
555
- """Gradio interface function"""
556
  if not url:
557
- return "❌ 请输入URL", {}
558
 
559
- result = extractor.extract_content(url)
560
 
561
  if result["success"]:
562
  content = result["main_content"]
563
  content_length = result["content_length"]
564
- is_chinese = result.get("is_chinese_website", False)
565
-
566
- # Create preview
567
- if is_chinese:
568
- # For Chinese, show first 600 characters
569
- preview = content[:600]
570
- if len(content) > 600:
571
- preview += "..."
572
- else:
573
- # For English, show first 500 characters
574
- preview = content[:500]
575
- if len(content) > 500:
576
- preview += "..."
577
-
578
- if is_chinese:
579
- output = f"""
580
- ## ✅ 内容提取成功!
581
-
582
- **网址:** {result['url']}
583
- **标题:** {result.get('title', '无标题')}
584
- **时间:** {result['execution_time']}秒
585
- **内容长度:** {content_length:,} 字符
586
- **中文字符数:** {result.get('chinese_char_count', 0):,}
587
-
588
- ### 内容预览:
589
- {preview}
590
- """
591
- else:
592
- output = f"""
593
- ## ✅ Content Extracted Successfully!
594
 
595
  **URL:** {result['url']}
596
- **Title:** {result.get('title', 'No title')}
597
  **Time:** {result['execution_time']}s
598
  **Content Length:** {content_length:,} characters
599
 
600
- ### Content Preview:
601
  {preview}
 
 
602
  """
603
  return output, result
604
  else:
605
- error_msg = result.get("error", "未知错误")
606
- return f"## ❌ 错误\n\n{error_msg}", result
 
 
 
 
 
 
607
 
608
  # Create Gradio interface
609
  gradio_interface = gr.Interface(
610
  fn=gradio_extract,
611
  inputs=gr.Textbox(
612
- label="网站网址 / Website URL",
613
- placeholder="请输入网址 (如: https://zhihu.com)",
614
- value="https://zhihu.com"
615
  ),
616
  outputs=[
617
- gr.Markdown(label="结果 / Result"),
618
- gr.JSON(label="API响应 / API Response")
619
  ],
620
- title="🧠 智能内容提取器 (中文优化) / Smart Content Extractor (Chinese Optimized)",
621
- description="专门优化中文网站的内容提取器,去除导航、广告、页脚等无关内容 / Optimized for Chinese websites, removes navigation, ads, footers, etc.",
622
  examples=[
623
- ["https://zhihu.com"],
624
- ["https://baidu.com"],
625
- ["https://news.sina.com.cn"],
626
  ["https://example.com"],
627
- ["https://en.wikipedia.org/wiki/Artificial_intelligence"]
 
628
  ]
629
  )
630
 
@@ -632,7 +497,6 @@ gradio_interface = gr.Interface(
632
  # MOUNT GRADIO TO FASTAPI
633
  # ==============================================
634
 
635
- # Mount Gradio app to FastAPI
636
  app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
637
 
638
  # ==============================================
@@ -641,11 +505,14 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
641
 
642
  if __name__ == "__main__":
643
  print("\n" + "="*60)
644
- print("🧠 智能内容提取器启动中...")
645
- print("Smart Content Extractor Starting...")
 
 
 
 
646
  print("="*60)
647
- print("API端点 / API Endpoint: POST /extract")
648
- print("网页界面 / Web Interface: GET /")
649
  print("="*60 + "\n")
650
 
651
  uvicorn.run(
 
1
  # ==============================================
2
+ # FREE SCREENSHOT SCRAPER FOR N8N
3
  # ==============================================
4
 
5
  import gradio as gr
 
8
  import time
9
  import re
10
  import html
11
+ import base64
12
+ from io import BytesIO
13
+ from PIL import Image
14
+ import pytesseract # Free OCR
15
+ from typing import Dict, Any
16
  from fastapi import FastAPI, Request
17
  import uvicorn
18
 
19
  # ==============================================
20
+ # FREE SCREENSHOT SCRAPER
21
  # ==============================================
22
 
23
+ class FreeScreenshotScraper:
24
+ """Free scraper using screenshot APIs + fallback"""
25
 
26
  def __init__(self):
27
+ # Free screenshot APIs (no API key needed)
28
+ self.screenshot_apis = [
29
+ {
30
+ "url": lambda u: f"https://s0.wp.com/mshots/v1/{u}?w=1024",
31
+ "name": "wordpress_mshots"
32
+ },
33
+ {
34
+ "url": lambda u: f"https://render-tron.appspot.com/screenshot/{u}?width=1024&height=768",
35
+ "name": "render_tron"
36
+ },
37
+ {
38
+ "url": lambda u: f"https://image.thum.io/get/width/1024/crop/768/noanimate/{u}",
39
+ "name": "thumio"
40
+ },
41
  ]
42
 
43
+ # Free HTML content APIs
44
+ self.html_apis = [
45
+ {
46
+ "url": lambda u: f"https://r.jina.ai/{u}",
47
+ "name": "jina_reader",
48
+ "headers": {"Accept": "application/json"}
49
+ },
50
+ {
51
+ "url": lambda u: f"https://extractorapi.com/api/v1/extractor?apikey=demo&url={u}",
52
+ "name": "extractor_api"
53
+ },
 
 
 
 
 
 
 
54
  ]
55
 
 
 
 
 
 
 
 
56
  def extract_content(self, url: str) -> Dict[str, Any]:
57
+ """Extract content using free APIs"""
58
  start_time = time.time()
59
 
60
+ print(f"🌐 Extracting: {url}")
61
 
62
  # Ensure URL has protocol
63
  if not url.startswith(('http://', 'https://')):
64
  url = 'https://' + url
65
 
66
+ # Strategy 1: Try Jina Reader API (best for content extraction)
67
+ print(" Trying Jina Reader API...")
68
+ jina_result = self._try_jina_reader(url)
69
+ if jina_result["success"]:
70
+ jina_result["execution_time"] = round(time.time() - start_time, 2)
71
+ jina_result["method"] = "jina_reader_api"
72
+ return jina_result
73
+
74
+ # Strategy 2: Try other HTML APIs
75
+ print(" Trying other HTML APIs...")
76
+ for api in self.html_apis[1:]:
77
+ result = self._try_api(api, url)
78
+ if result["success"]:
79
+ result["execution_time"] = round(time.time() - start_time, 2)
80
+ result["method"] = api["name"]
81
+ return result
82
+
83
+ # Strategy 3: Try direct request with smart headers
84
+ print(" Trying direct request...")
85
+ direct_result = self._try_direct_request(url)
86
+ if direct_result["success"]:
87
+ direct_result["execution_time"] = round(time.time() - start_time, 2)
88
+ direct_result["method"] = "direct_with_fallback"
89
+ return direct_result
90
+
91
+ # Strategy 4: Try screenshot APIs as last resort
92
+ print(" Trying screenshot APIs...")
93
+ for api in self.screenshot_apis:
94
+ result = self._try_screenshot_api(api, url)
95
+ if result["success"]:
96
+ result["execution_time"] = round(time.time() - start_time, 2)
97
+ result["method"] = f"screenshot_{api['name']}"
98
+ return result
99
+
100
+ # All failed
101
+ return {
102
+ "success": False,
103
+ "url": url,
104
+ "error": "All free methods failed",
105
+ "execution_time": round(time.time() - start_time, 2),
106
+ "suggestions": [
107
+ "Try a different URL",
108
+ "Website may block automated access",
109
+ "Try using Jina Reader directly: https://r.jina.ai/your-url"
110
+ ]
111
+ }
112
+
113
+ def _try_jina_reader(self, url: str) -> Dict[str, Any]:
114
+ """Try Jina Reader API (free, no API key needed)"""
115
  try:
116
+ api_url = f"https://r.jina.ai/{url}"
117
+ headers = {
118
+ "User-Agent": "Mozilla/5.0",
119
+ "Accept": "application/json",
120
+ }
121
 
122
+ response = requests.get(api_url, headers=headers, timeout=30)
 
 
123
 
124
+ if response.status_code == 200:
125
+ # Jina returns clean text directly
126
+ content = response.text
127
+
128
+ # Try to parse as JSON first
129
+ try:
130
+ data = json.loads(content)
131
+ if "data" in data:
132
+ content = data["data"]["content"] if "content" in data["data"] else str(data["data"])
133
+ except:
134
+ pass # Keep as text
135
+
136
+ # Extract title if possible
137
+ title = ""
138
+ title_match = re.search(r'<title[^>]*>(.*?)</title>', content, re.IGNORECASE)
139
+ if title_match:
140
+ title = title_match.group(1)
141
+
142
+ # Clean content
143
+ cleaned = self._clean_content(content)
144
+
145
+ return {
146
+ "success": True,
147
+ "url": url,
148
+ "title": title[:200] if title else "Extracted via Jina Reader",
149
+ "main_content": cleaned[:30000],
150
+ "content_length": len(cleaned),
151
+ "source": "jina_reader",
152
+ "note": "Content extracted via free Jina Reader API"
153
+ }
154
 
155
+ return {"success": False, "error": f"Jina API status: {response.status_code}"}
 
156
 
157
+ except Exception as e:
158
+ return {"success": False, "error": f"Jina API error: {str(e)}"}
159
+
160
+ def _try_api(self, api: dict, url: str) -> Dict[str, Any]:
161
+ """Try other free APIs"""
162
+ try:
163
+ api_url = api["url"](url)
164
+ headers = api.get("headers", {"User-Agent": "Mozilla/5.0"})
165
 
166
+ response = requests.get(api_url, headers=headers, timeout=15)
 
 
167
 
168
+ if response.status_code == 200:
169
+ content = response.text
170
+
171
+ # Try to parse JSON
172
+ try:
173
+ data = json.loads(content)
174
+ # Extract content from common API formats
175
+ if "text" in data:
176
+ content = data["text"]
177
+ elif "content" in data:
178
+ content = data["content"]
179
+ elif "article" in data:
180
+ content = data["article"]
181
+ except:
182
+ pass
183
+
184
+ cleaned = self._clean_content(content)
185
+
186
+ return {
187
+ "success": True,
188
+ "url": url,
189
+ "main_content": cleaned[:20000],
190
+ "content_length": len(cleaned)
191
+ }
192
 
193
+ return {"success": False}
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
+ except:
196
+ return {"success": False}
 
 
 
 
 
197
 
198
+ def _try_direct_request(self, url: str) -> Dict[str, Any]:
199
+ """Try direct request with various strategies"""
200
+ strategies = [
201
+ self._direct_request_with_headers,
202
+ self._direct_request_as_googlebot,
203
+ self._direct_request_with_referer,
204
+ ]
 
205
 
206
+ for strategy in strategies:
207
+ try:
208
+ result = strategy(url)
209
+ if result["success"]:
210
+ return result
211
+ except:
212
+ continue
213
 
214
+ return {"success": False}
 
215
 
216
+ def _direct_request_with_headers(self, url: str) -> Dict[str, Any]:
217
+ """Direct request with browser-like headers"""
218
+ headers = {
219
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
220
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
221
+ "Accept-Language": "en-US,en;q=0.5",
222
+ "Accept-Encoding": "gzip, deflate",
223
+ "Connection": "keep-alive",
224
+ "Upgrade-Insecure-Requests": "1",
225
+ "Cache-Control": "max-age=0",
226
+ }
 
 
 
 
 
 
227
 
228
+ response = requests.get(url, headers=headers, timeout=10)
 
 
 
 
 
229
 
230
+ if response.status_code == 200:
231
+ content = self._extract_from_html(response.text)
232
+ cleaned = self._clean_content(content)
233
+
234
+ return {
235
+ "success": True,
236
+ "content": cleaned
237
+ }
238
+
239
+ return {"success": False}
240
 
241
+ def _direct_request_as_googlebot(self, url: str) -> Dict[str, Any]:
242
+ """Pretend to be Googlebot"""
243
+ headers = {
244
+ "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
245
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
246
+ }
247
 
248
+ response = requests.get(url, headers=headers, timeout=10)
 
249
 
250
+ if response.status_code == 200:
251
+ content = self._extract_from_html(response.text)
252
+ cleaned = self._clean_content(content)
 
 
253
 
254
+ return {
255
+ "success": True,
256
+ "content": cleaned
257
+ }
 
 
 
 
 
 
 
 
258
 
259
+ return {"success": False}
260
+
261
+ def _direct_request_with_referer(self, url: str) -> Dict[str, Any]:
262
+ """Request with referer"""
263
+ headers = {
264
+ "User-Agent": "Mozilla/5.0",
265
+ "Referer": "https://www.google.com/",
266
+ "Accept": "text/html",
267
+ }
268
+
269
+ response = requests.get(url, headers=headers, timeout=10)
270
+
271
+ if response.status_code == 200:
272
+ content = self._extract_from_html(response.text)
273
+ cleaned = self._clean_content(content)
 
 
 
 
 
 
274
 
275
+ return {
276
+ "success": True,
277
+ "content": cleaned
278
+ }
279
 
280
+ return {"success": False}
 
281
 
282
+ def _try_screenshot_api(self, api: dict, url: str) -> Dict[str, Any]:
283
+ """Try screenshot API"""
284
+ try:
285
+ api_url = api["url"](url)
286
+ headers = {"User-Agent": "Mozilla/5.0"}
 
 
 
287
 
288
+ response = requests.get(api_url, headers=headers, timeout=15)
 
289
 
290
+ if response.status_code == 200 and len(response.content) > 1000:
291
+ # Check if it's actually an image
292
+ try:
293
+ img = Image.open(BytesIO(response.content))
294
+ img.verify()
295
+
296
+ # Try OCR if available
297
+ try:
298
+ text = pytesseract.image_to_string(img)
299
+ cleaned = self._clean_content(text)
300
+
301
+ return {
302
+ "success": True,
303
+ "url": url,
304
+ "main_content": cleaned[:15000],
305
+ "content_length": len(cleaned),
306
+ "note": "Content extracted from screenshot via OCR"
307
+ }
308
+ except:
309
+ return {"success": False, "error": "OCR not available"}
310
+
311
+ except:
312
+ return {"success": False}
313
 
314
+ return {"success": False}
 
 
315
 
316
+ except:
317
+ return {"success": False}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
+ def _extract_from_html(self, html_content: str) -> str:
320
+ """Extract text from HTML"""
321
+ # Remove scripts and styles
322
  html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
323
  html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
 
324
 
325
  # Remove unwanted tags
326
+ unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu']
327
  for tag in unwanted_tags:
328
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
329
 
330
+ # Extract text
331
  text = re.sub(r'<[^>]+>', ' ', html_content)
 
 
332
  text = html.unescape(text)
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  return text
335
 
336
+ def _clean_content(self, content: str) -> str:
337
+ """Clean content"""
338
  if not content:
339
  return ""
340
 
341
+ # Replace multiple whitespace
342
  content = re.sub(r'\s+', ' ', content)
343
 
344
+ # Remove control characters
345
  content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
346
 
347
+ # Remove excessive line breaks
348
+ content = re.sub(r'\n{3,}', '\n\n', content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
  return content.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
# ==============================================
# INITIALIZE
# ==============================================

# Single shared scraper instance, used by both the FastAPI /extract
# endpoint and the Gradio UI callback below.
scraper = FreeScreenshotScraper()
357
 
358
  # ==============================================
359
  # FASTAPI APP
360
  # ==============================================
361
 
 
362
# FastAPI application; the Gradio UI is mounted onto it at the end of
# the file, so this app serves both the JSON API and the web interface.
fastapi_app = FastAPI(
    title="Free Content Extractor",
    description="Uses free APIs to extract content from websites",
    version="1.0"
)
367
 
 
368
  from fastapi.middleware.cors import CORSMiddleware
369
  from fastapi.responses import JSONResponse
370
 
 
379
@fastapi_app.get("/")
async def root():
    """Service metadata: name, version, available endpoints, APIs used."""
    endpoint_map = {
        "GET /": "This info",
        "GET /health": "Health check",
        "POST /extract": "Extract content (for n8n)",
    }
    api_list = [
        "Jina Reader (https://r.jina.ai/)",
        "WordPress mShots",
        "Render-Tron",
        "ExtractorAPI (demo)",
    ]
    return {
        "service": "Free Content Extractor",
        "version": "1.0",
        "description": "Uses free APIs (Jina Reader, etc.) to extract website content",
        "endpoints": endpoint_map,
        "free_apis_used": api_list,
    }
397
 
398
@fastapi_app.get("/health")
async def health():
    """Liveness probe: always reports healthy with the current timestamp."""
    now = time.time()
    return {"status": "healthy", "timestamp": now}
 
 
 
401
 
402
@fastapi_app.post("/extract")
async def api_extract(request: Request):
    """Extraction endpoint used by n8n.

    Expects a JSON object body like ``{"url": "https://..."}`` and returns
    the scraper's result dict.  Responds 400 for missing/invalid input and
    500 only for genuinely unexpected failures.
    """
    try:
        body = await request.json()

        # A non-object JSON body (list, string, number) used to raise
        # AttributeError on .get() and surface as a misleading 500.
        if not isinstance(body, dict):
            return JSONResponse(
                status_code=400,
                content={"success": False, "error": "URL is required"}
            )

        # str(... or "") also tolerates a null/non-string "url" value,
        # which previously crashed on .strip() and returned a 500.
        url = str(body.get("url") or "").strip()

        if not url:
            return JSONResponse(
                status_code=400,
                content={"success": False, "error": "URL is required"}
            )

        print(f"📨 Request: {url}")
        result = scraper.extract_content(url)

        return result

    except json.JSONDecodeError:
        return JSONResponse(
            status_code=400,
            content={"success": False, "error": "Invalid JSON"}
        )
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"success": False, "error": str(e)}
        )
430
 
431
  # ==============================================
 
433
  # ==============================================
434
 
435
def gradio_extract(url: str):
    """Gradio callback: extract *url* and format a Markdown summary.

    Returns a ``(markdown, result_dict)`` tuple matching the two Gradio
    outputs (Markdown panel + raw JSON panel).
    """
    if not url:
        return "❌ Please enter a URL", {}

    result = scraper.extract_content(url)

    if result["success"]:
        content = result["main_content"]
        content_length = result["content_length"]

        preview = content[:800]
        if len(content) > 800:
            preview += "..."

        # Optional fields must use .get(): the screenshot/OCR success path
        # does not set 'execution_time', and direct indexing crashed the UI
        # with KeyError.
        exec_time = result.get('execution_time', '?')

        output = f"""
## Success!

**URL:** {result['url']}
**Method:** {result.get('method', 'free_api')}
**Time:** {exec_time}s
**Content Length:** {content_length:,} characters

### Preview:
{preview}

*Using free APIs - may not work on all websites*
"""
        return output, result
    else:
        error = result.get("error", "Unknown error")
        suggestions = result.get("suggestions", [])

        suggestion_text = ""
        if suggestions:
            suggestion_text = "\n\n**Suggestions:**\n" + "\n".join([f"• {s}" for s in suggestions])

        return f"## ❌ Error\n\n{error}{suggestion_text}", result
473
 
474
# Create Gradio interface
# Two outputs: a human-readable Markdown summary and the raw result dict
# (the same payload the /extract API returns).
gradio_interface = gr.Interface(
    fn=gradio_extract,
    inputs=gr.Textbox(
        label="Website URL",
        placeholder="https://www.sinchew.com.my/",
        value="https://www.sinchew.com.my/"
    ),
    outputs=[
        gr.Markdown(label="Result"),
        gr.JSON(label="API Response")
    ],
    title="🌐 Free Content Extractor for n8n",
    description="Uses free APIs to extract content. Works with most websites.",
    examples=[
        ["https://www.sinchew.com.my/"],
        ["https://example.com"],
        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
        ["https://news.ycombinator.com"]
    ]
)
495
 
 
497
# MOUNT GRADIO TO FASTAPI
# ==============================================

# Serve the Gradio UI at "/" on the same FastAPI app that exposes /extract;
# `app` is the ASGI entry point that uvicorn runs.
app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
501
 
502
  # ==============================================
 
505
 
506
  if __name__ == "__main__":
507
  print("\n" + "="*60)
508
+ print("🌐 Free Content Extractor Starting")
509
+ print("="*60)
510
+ print("Using free APIs:")
511
+ print("• Jina Reader (r.jina.ai)")
512
+ print("• WordPress mShots")
513
+ print("• Render-Tron")
514
  print("="*60)
515
+ print("API Endpoint: POST /extract")
 
516
  print("="*60 + "\n")
517
 
518
  uvicorn.run(