Spaces:

maylinejix
/

Scrape

Sleeping

App Files Files Community

maylinejix committed on Dec 19, 2025

Commit

ceb6c8b

verified ·

1 Parent(s): a01c075

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -82

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import os
 import time
 from datetime import datetime
 from werkzeug.utils import secure_filename
-import cloudscraper
 app = Flask(__name__)
 PUBLIC_DIR = 'public'
@@ -30,7 +30,6 @@ def get_driver(headless=True):
     options.add_argument('--disable-gpu')
     options.add_argument('--window-size=1920,1080')
     options.add_argument('--disable-blink-features=AutomationControlled')
-    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
     driver = uc.Chrome(options=options, use_subprocess=False)
@@ -39,10 +38,6 @@ def get_driver(headless=True):
             Object.defineProperty(navigator, 'webdriver', {
                 get: () => undefined
             });
-            Object.defineProperty(navigator, 'plugins', {
-                get: () => [1, 2, 3, 4, 5]
-            });
-            window.chrome = { runtime: {} };
         '''
     })
@@ -53,57 +48,70 @@ def index():
     return jsonify({
         'message': 'Undetected Chrome Scraper API is running',
         'endpoints': {
-            'POST /api/scrape': 'Get HTML with Selenium (use for complex sites)',
-            'POST /api/scrape-fast': 'Get HTML with Cloudscraper (bypass Cloudflare)',
             'POST /api/execute': 'Execute Python code with Selenium',
             'POST /api/upload': 'Upload file to server',
-            'GET /api/files': 'List uploaded files',
-            'GET /uploads/<filename>': 'Download uploaded file'
         }
     })
-@app.route('/api/scrape-fast', methods=['POST'])
-def scrape_fast():
     data = request.get_json()
     url = data.get('url')
     html_only = data.get('html_only', False)
     if not url:
         return jsonify({'success': False, 'error': 'URL is required'}), 400
     try:
-        scraper = cloudscraper.create_scraper(
-            browser={
-                'browser': 'chrome',
-                'platform': 'windows',
-                'desktop': True
-            }
-        )
-        response = scraper.get(url, timeout=30)
-        if response.status_code == 200:
-            html = response.text
-            if html_only:
-                return html, 200, {'Content-Type': 'text/html; charset=utf-8'}
-            return jsonify({
-                'success': True,
-                'data': {
-                    'html': html,
-                    'status_code': response.status_code,
-                    'url': response.url,
-                    'timestamp': datetime.now().isoformat()
-                }
-            })
         else:
-            return jsonify({
-                'success': False,
-                'error': f'HTTP {response.status_code}',
-                'status_code': response.status_code
-            }), response.status_code
     except Exception as e:
         return jsonify({'success': False, 'error': str(e)}), 500
@@ -198,7 +206,6 @@ def scrape():
     wait_time = data.get('wait', 5)
     screenshot = data.get('screenshot', False)
     html_only = data.get('html_only', False)
-    max_wait = data.get('max_wait', 60)
     if not url:
         return jsonify({'success': False, 'error': 'URL is required'}), 400
@@ -207,51 +214,12 @@ def scrape():
     try:
         driver = get_driver(headless=True)
         driver.get(url)
-        time.sleep(10)
-        start_time = time.time()
-        challenge_detected = False
-        while time.time() - start_time < max_wait:
-            html = driver.page_source
-            if "Just a moment" in html or "Verifying you are human" in html or "cf-chl-widget" in html:
-                challenge_detected = True
-                time.sleep(3)
-                try:
-                    driver.execute_script("window.scrollTo(0, 100);")
-                    time.sleep(1)
-                except:
-                    pass
-                continue
-            if challenge_detected:
-                time.sleep(5)
-            break
         time.sleep(wait_time)
         html = driver.page_source
         title = driver.title
         current_url = driver.current_url
-        if "Just a moment" in html or "Verifying you are human" in html:
-            if screenshot:
-                filename = f'challenge-{int(time.time())}.png'
-                filepath = os.path.join(PUBLIC_DIR, filename)
-                driver.save_screenshot(filepath)
-            return jsonify({
-                'success': False,
-                'error': 'Cloudflare challenge not solved. Try using /api/scrape-fast instead',
-                'html_preview': html[:500],
-                'screenshot': f"{request.host_url}files/{filename}" if screenshot else None
-            }), 403
         if html_only:
             return html, 200, {'Content-Type': 'text/html; charset=utf-8'}

 import time
 from datetime import datetime
 from werkzeug.utils import secure_filename
+from curl_cffi import requests as curl_requests
 app = Flask(__name__)
 PUBLIC_DIR = 'public'
     options.add_argument('--disable-gpu')
     options.add_argument('--window-size=1920,1080')
     options.add_argument('--disable-blink-features=AutomationControlled')
     driver = uc.Chrome(options=options, use_subprocess=False)
             Object.defineProperty(navigator, 'webdriver', {
                 get: () => undefined
             });
         '''
     })
     return jsonify({
         'message': 'Undetected Chrome Scraper API is running',
         'endpoints': {
+            'POST /api/scrape': 'Get HTML with Selenium',
+            'POST /api/scrape-cf': 'Bypass Cloudflare with curl_cffi',
             'POST /api/execute': 'Execute Python code with Selenium',
             'POST /api/upload': 'Upload file to server',
+            'GET /api/files': 'List uploaded files'
         }
     })
+@app.route('/api/scrape-cf', methods=['POST'])
+def scrape_cloudflare():
     data = request.get_json()
     url = data.get('url')
     html_only = data.get('html_only', False)
+    method = data.get('method', 'GET').upper()
+    headers = data.get('headers', {})
+    body_data = data.get('data')
     if not url:
         return jsonify({'success': False, 'error': 'URL is required'}), 400
     try:
+        default_headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
+        }
+        default_headers.update(headers)
+        if method == 'POST':
+            response = curl_requests.post(
+                url,
+                headers=default_headers,
+                data=body_data,
+                timeout=30,
+                impersonate="chrome120"
+            )
         else:
+            response = curl_requests.get(
+                url,
+                headers=default_headers,
+                timeout=30,
+                impersonate="chrome120"
+            )
+        html = response.text
+        if html_only:
+            return html, 200, {'Content-Type': 'text/html; charset=utf-8'}
+        return jsonify({
+            'success': True,
+            'data': {
+                'html': html,
+                'status_code': response.status_code,
+                'url': str(response.url),
+                'headers': dict(response.headers),
+                'timestamp': datetime.now().isoformat()
+            }
+        })
     except Exception as e:
         return jsonify({'success': False, 'error': str(e)}), 500
     wait_time = data.get('wait', 5)
     screenshot = data.get('screenshot', False)
     html_only = data.get('html_only', False)
     if not url:
         return jsonify({'success': False, 'error': 'URL is required'}), 400
     try:
         driver = get_driver(headless=True)
         driver.get(url)
         time.sleep(wait_time)
         html = driver.page_source
         title = driver.title
         current_url = driver.current_url
         if html_only:
             return html, 200, {'Content-Type': 'text/html; charset=utf-8'}