Spaces:

maylinejix
/

Scrape

Sleeping

App Files Files Community

maylinejix committed on Dec 19, 2025

Commit

e89dbd6

verified ·

1 Parent(s): 52200fd

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -12

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import os
 import time
 from datetime import datetime
 from werkzeug.utils import secure_filename
 app = Flask(__name__)
 PUBLIC_DIR = 'public'
@@ -20,12 +21,14 @@ MAX_FILE_SIZE = 16 * 1024 * 1024
 def allowed_file(filename):
     """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS."""
     # A name without any dot has no extension to check.
     if '.' not in filename:
         return False
     # Only the text after the last dot counts; compare case-insensitively.
     extension = filename.rsplit('.', 1)[1].lower()
     return extension in ALLOWED_EXTENSIONS
-def get_driver():
     options = uc.ChromeOptions()
-    options.add_argument('--headless=new')
     options.add_argument('--no-sandbox')
     options.add_argument('--disable-dev-shm-usage')
     options.add_argument('--disable-gpu')
     options.add_argument('--disable-blink-features=AutomationControlled')
     options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
@@ -36,6 +39,10 @@ def get_driver():
             Object.defineProperty(navigator, 'webdriver', {
                 get: () => undefined
             });
         '''
     })
@@ -46,7 +53,8 @@ def index():
     return jsonify({
         'message': 'Undetected Chrome Scraper API is running',
         'endpoints': {
-            'POST /api/scrape': 'Get HTML content from URL',
             'POST /api/execute': 'Execute Python code with Selenium',
             'POST /api/upload': 'Upload file to server',
             'GET /api/files': 'List uploaded files',
@@ -54,6 +62,51 @@ def index():
         }
     })
 @app.route('/api/upload', methods=['POST'])
 def upload_file():
     try:
@@ -142,29 +195,43 @@ def serve_upload(filename):
 def scrape():
     data = request.get_json()
     url = data.get('url')
-    wait_time = data.get('wait', 3)
     screenshot = data.get('screenshot', False)
     html_only = data.get('html_only', False)
-    max_wait = data.get('max_wait', 30)
     if not url:
         return jsonify({'success': False, 'error': 'URL is required'}), 400
     driver = None
     try:
-        driver = get_driver()
         driver.get(url)
         start_time = time.time()
         while time.time() - start_time < max_wait:
             html = driver.page_source
-            if ("Just a moment" not in html and
-                "Verifying you are human" not in html and
-                "cf-chl-widget" not in html):
-                break
-            time.sleep(2)
         time.sleep(wait_time)
@@ -173,9 +240,16 @@ def scrape():
         current_url = driver.current_url
         if "Just a moment" in html or "Verifying you are human" in html:
             return jsonify({
                 'success': False,
-                'error': 'Cloudflare challenge not solved. Try increasing max_wait.'
             }), 403
         if html_only:

 import time
 from datetime import datetime
 from werkzeug.utils import secure_filename
+import cloudscraper
 app = Flask(__name__)
 PUBLIC_DIR = 'public'
 def allowed_file(filename):
     """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS."""
     # A name without any dot has no extension to check.
     if '.' not in filename:
         return False
     # Only the text after the last dot counts; compare case-insensitively.
     extension = filename.rsplit('.', 1)[1].lower()
     return extension in ALLOWED_EXTENSIONS
+def get_driver(headless=True):
     options = uc.ChromeOptions()
+    if headless:
+        options.add_argument('--headless=new')
     options.add_argument('--no-sandbox')
     options.add_argument('--disable-dev-shm-usage')
     options.add_argument('--disable-gpu')
+    options.add_argument('--window-size=1920,1080')
     options.add_argument('--disable-blink-features=AutomationControlled')
     options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
             Object.defineProperty(navigator, 'webdriver', {
                 get: () => undefined
             });
+            Object.defineProperty(navigator, 'plugins', {
+                get: () => [1, 2, 3, 4, 5]
+            });
+            window.chrome = { runtime: {} };
         '''
     })
     return jsonify({
         'message': 'Undetected Chrome Scraper API is running',
         'endpoints': {
+            'POST /api/scrape': 'Get HTML with Selenium (use for complex sites)',
+            'POST /api/scrape-fast': 'Get HTML with Cloudscraper (bypass Cloudflare)',
             'POST /api/execute': 'Execute Python code with Selenium',
             'POST /api/upload': 'Upload file to server',
             'GET /api/files': 'List uploaded files',
         }
     })
+@app.route('/api/scrape-fast', methods=['POST'])
+def scrape_fast():
+    data = request.get_json()
+    url = data.get('url')
+    html_only = data.get('html_only', False)
+    if not url:
+        return jsonify({'success': False, 'error': 'URL is required'}), 400
+    try:
+        scraper = cloudscraper.create_scraper(
+            browser={
+                'browser': 'chrome',
+                'platform': 'windows',
+                'desktop': True
+            }
+        )
+        response = scraper.get(url, timeout=30)
+        if response.status_code == 200:
+            html = response.text
+            if html_only:
+                return html, 200, {'Content-Type': 'text/html; charset=utf-8'}
+            return jsonify({
+                'success': True,
+                'data': {
+                    'html': html,
+                    'status_code': response.status_code,
+                    'url': response.url,
+                    'timestamp': datetime.now().isoformat()
+                }
+            })
+        else:
+            return jsonify({
+                'success': False,
+                'error': f'HTTP {response.status_code}',
+                'status_code': response.status_code
+            }), response.status_code
+    except Exception as e:
+        return jsonify({'success': False, 'error': str(e)}), 500
 @app.route('/api/upload', methods=['POST'])
 def upload_file():
     try:
 def scrape():
     data = request.get_json()
     url = data.get('url')
+    wait_time = data.get('wait', 5)
     screenshot = data.get('screenshot', False)
     html_only = data.get('html_only', False)
+    max_wait = data.get('max_wait', 60)
     if not url:
         return jsonify({'success': False, 'error': 'URL is required'}), 400
     driver = None
     try:
+        driver = get_driver(headless=True)
         driver.get(url)
+        time.sleep(10)
         start_time = time.time()
+        challenge_detected = False
         while time.time() - start_time < max_wait:
             html = driver.page_source
+            if "Just a moment" in html or "Verifying you are human" in html or "cf-chl-widget" in html:
+                challenge_detected = True
+                time.sleep(3)
+                try:
+                    driver.execute_script("window.scrollTo(0, 100);")
+                    time.sleep(1)
+                except:
+                    pass
+                continue
+            if challenge_detected:
+                time.sleep(5)
+            break
         time.sleep(wait_time)
         current_url = driver.current_url
         if "Just a moment" in html or "Verifying you are human" in html:
+            if screenshot:
+                filename = f'challenge-{int(time.time())}.png'
+                filepath = os.path.join(PUBLIC_DIR, filename)
+                driver.save_screenshot(filepath)
             return jsonify({
                 'success': False,
+                'error': 'Cloudflare challenge not solved. Try using /api/scrape-fast instead',
+                'html_preview': html[:500],
+                'screenshot': f"{request.host_url}files/{filename}" if screenshot else None
             }), 403
         if html_only: