maylinejix committed on
Commit
ceb6c8b
·
verified ·
1 Parent(s): a01c075

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -82
app.py CHANGED
@@ -7,7 +7,7 @@ import os
7
  import time
8
  from datetime import datetime
9
  from werkzeug.utils import secure_filename
10
- import cloudscraper
11
 
12
  app = Flask(__name__)
13
  PUBLIC_DIR = 'public'
@@ -30,7 +30,6 @@ def get_driver(headless=True):
30
  options.add_argument('--disable-gpu')
31
  options.add_argument('--window-size=1920,1080')
32
  options.add_argument('--disable-blink-features=AutomationControlled')
33
- options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
34
 
35
  driver = uc.Chrome(options=options, use_subprocess=False)
36
 
@@ -39,10 +38,6 @@ def get_driver(headless=True):
39
  Object.defineProperty(navigator, 'webdriver', {
40
  get: () => undefined
41
  });
42
- Object.defineProperty(navigator, 'plugins', {
43
- get: () => [1, 2, 3, 4, 5]
44
- });
45
- window.chrome = { runtime: {} };
46
  '''
47
  })
48
 
@@ -53,57 +48,70 @@ def index():
53
  return jsonify({
54
  'message': 'Undetected Chrome Scraper API is running',
55
  'endpoints': {
56
- 'POST /api/scrape': 'Get HTML with Selenium (use for complex sites)',
57
- 'POST /api/scrape-fast': 'Get HTML with Cloudscraper (bypass Cloudflare)',
58
  'POST /api/execute': 'Execute Python code with Selenium',
59
  'POST /api/upload': 'Upload file to server',
60
- 'GET /api/files': 'List uploaded files',
61
- 'GET /uploads/<filename>': 'Download uploaded file'
62
  }
63
  })
64
 
65
- @app.route('/api/scrape-fast', methods=['POST'])
66
- def scrape_fast():
67
  data = request.get_json()
68
  url = data.get('url')
69
  html_only = data.get('html_only', False)
 
 
 
70
 
71
  if not url:
72
  return jsonify({'success': False, 'error': 'URL is required'}), 400
73
 
74
  try:
75
- scraper = cloudscraper.create_scraper(
76
- browser={
77
- 'browser': 'chrome',
78
- 'platform': 'windows',
79
- 'desktop': True
80
- }
81
- )
82
-
83
- response = scraper.get(url, timeout=30)
 
84
 
85
- if response.status_code == 200:
86
- html = response.text
87
-
88
- if html_only:
89
- return html, 200, {'Content-Type': 'text/html; charset=utf-8'}
90
-
91
- return jsonify({
92
- 'success': True,
93
- 'data': {
94
- 'html': html,
95
- 'status_code': response.status_code,
96
- 'url': response.url,
97
- 'timestamp': datetime.now().isoformat()
98
- }
99
- })
100
  else:
101
- return jsonify({
102
- 'success': False,
103
- 'error': f'HTTP {response.status_code}',
104
- 'status_code': response.status_code
105
- }), response.status_code
106
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  except Exception as e:
108
  return jsonify({'success': False, 'error': str(e)}), 500
109
 
@@ -198,7 +206,6 @@ def scrape():
198
  wait_time = data.get('wait', 5)
199
  screenshot = data.get('screenshot', False)
200
  html_only = data.get('html_only', False)
201
- max_wait = data.get('max_wait', 60)
202
 
203
  if not url:
204
  return jsonify({'success': False, 'error': 'URL is required'}), 400
@@ -207,51 +214,12 @@ def scrape():
207
  try:
208
  driver = get_driver(headless=True)
209
  driver.get(url)
210
-
211
- time.sleep(10)
212
-
213
- start_time = time.time()
214
- challenge_detected = False
215
-
216
- while time.time() - start_time < max_wait:
217
- html = driver.page_source
218
-
219
- if "Just a moment" in html or "Verifying you are human" in html or "cf-chl-widget" in html:
220
- challenge_detected = True
221
- time.sleep(3)
222
-
223
- try:
224
- driver.execute_script("window.scrollTo(0, 100);")
225
- time.sleep(1)
226
- except:
227
- pass
228
-
229
- continue
230
-
231
- if challenge_detected:
232
- time.sleep(5)
233
-
234
- break
235
-
236
  time.sleep(wait_time)
237
 
238
  html = driver.page_source
239
  title = driver.title
240
  current_url = driver.current_url
241
 
242
- if "Just a moment" in html or "Verifying you are human" in html:
243
- if screenshot:
244
- filename = f'challenge-{int(time.time())}.png'
245
- filepath = os.path.join(PUBLIC_DIR, filename)
246
- driver.save_screenshot(filepath)
247
-
248
- return jsonify({
249
- 'success': False,
250
- 'error': 'Cloudflare challenge not solved. Try using /api/scrape-fast instead',
251
- 'html_preview': html[:500],
252
- 'screenshot': f"{request.host_url}files/{filename}" if screenshot else None
253
- }), 403
254
-
255
  if html_only:
256
  return html, 200, {'Content-Type': 'text/html; charset=utf-8'}
257
 
 
7
  import time
8
  from datetime import datetime
9
  from werkzeug.utils import secure_filename
10
+ from curl_cffi import requests as curl_requests
11
 
12
  app = Flask(__name__)
13
  PUBLIC_DIR = 'public'
 
30
  options.add_argument('--disable-gpu')
31
  options.add_argument('--window-size=1920,1080')
32
  options.add_argument('--disable-blink-features=AutomationControlled')
 
33
 
34
  driver = uc.Chrome(options=options, use_subprocess=False)
35
 
 
38
  Object.defineProperty(navigator, 'webdriver', {
39
  get: () => undefined
40
  });
 
 
 
 
41
  '''
42
  })
43
 
 
48
  return jsonify({
49
  'message': 'Undetected Chrome Scraper API is running',
50
  'endpoints': {
51
+ 'POST /api/scrape': 'Get HTML with Selenium',
52
+ 'POST /api/scrape-cf': 'Bypass Cloudflare with curl_cffi',
53
  'POST /api/execute': 'Execute Python code with Selenium',
54
  'POST /api/upload': 'Upload file to server',
55
+ 'GET /api/files': 'List uploaded files'
 
56
  }
57
  })
58
 
59
+ @app.route('/api/scrape-cf', methods=['POST'])
60
+ def scrape_cloudflare():
61
  data = request.get_json()
62
  url = data.get('url')
63
  html_only = data.get('html_only', False)
64
+ method = data.get('method', 'GET').upper()
65
+ headers = data.get('headers', {})
66
+ body_data = data.get('data')
67
 
68
  if not url:
69
  return jsonify({'success': False, 'error': 'URL is required'}), 400
70
 
71
  try:
72
+ default_headers = {
73
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
74
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
75
+ 'Accept-Language': 'en-US,en;q=0.9',
76
+ 'Accept-Encoding': 'gzip, deflate, br',
77
+ 'DNT': '1',
78
+ 'Connection': 'keep-alive',
79
+ 'Upgrade-Insecure-Requests': '1'
80
+ }
81
+ default_headers.update(headers)
82
 
83
+ if method == 'POST':
84
+ response = curl_requests.post(
85
+ url,
86
+ headers=default_headers,
87
+ data=body_data,
88
+ timeout=30,
89
+ impersonate="chrome120"
90
+ )
 
 
 
 
 
 
 
91
  else:
92
+ response = curl_requests.get(
93
+ url,
94
+ headers=default_headers,
95
+ timeout=30,
96
+ impersonate="chrome120"
97
+ )
98
+
99
+ html = response.text
100
+
101
+ if html_only:
102
+ return html, 200, {'Content-Type': 'text/html; charset=utf-8'}
103
+
104
+ return jsonify({
105
+ 'success': True,
106
+ 'data': {
107
+ 'html': html,
108
+ 'status_code': response.status_code,
109
+ 'url': str(response.url),
110
+ 'headers': dict(response.headers),
111
+ 'timestamp': datetime.now().isoformat()
112
+ }
113
+ })
114
+
115
  except Exception as e:
116
  return jsonify({'success': False, 'error': str(e)}), 500
117
 
 
206
  wait_time = data.get('wait', 5)
207
  screenshot = data.get('screenshot', False)
208
  html_only = data.get('html_only', False)
 
209
 
210
  if not url:
211
  return jsonify({'success': False, 'error': 'URL is required'}), 400
 
214
  try:
215
  driver = get_driver(headless=True)
216
  driver.get(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  time.sleep(wait_time)
218
 
219
  html = driver.page_source
220
  title = driver.title
221
  current_url = driver.current_url
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  if html_only:
224
  return html, 200, {'Content-Type': 'text/html; charset=utf-8'}
225