maylinejix committed on
Commit
e89dbd6
·
verified ·
1 Parent(s): 52200fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -12
app.py CHANGED
@@ -7,6 +7,7 @@ import os
7
  import time
8
  from datetime import datetime
9
  from werkzeug.utils import secure_filename
 
10
 
11
  app = Flask(__name__)
12
  PUBLIC_DIR = 'public'
@@ -20,12 +21,14 @@ MAX_FILE_SIZE = 16 * 1024 * 1024
20
  def allowed_file(filename):
21
  return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
22
 
23
- def get_driver():
24
  options = uc.ChromeOptions()
25
- options.add_argument('--headless=new')
 
26
  options.add_argument('--no-sandbox')
27
  options.add_argument('--disable-dev-shm-usage')
28
  options.add_argument('--disable-gpu')
 
29
  options.add_argument('--disable-blink-features=AutomationControlled')
30
  options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
31
 
@@ -36,6 +39,10 @@ def get_driver():
36
  Object.defineProperty(navigator, 'webdriver', {
37
  get: () => undefined
38
  });
 
 
 
 
39
  '''
40
  })
41
 
@@ -46,7 +53,8 @@ def index():
46
  return jsonify({
47
  'message': 'Undetected Chrome Scraper API is running',
48
  'endpoints': {
49
- 'POST /api/scrape': 'Get HTML content from URL',
 
50
  'POST /api/execute': 'Execute Python code with Selenium',
51
  'POST /api/upload': 'Upload file to server',
52
  'GET /api/files': 'List uploaded files',
@@ -54,6 +62,51 @@ def index():
54
  }
55
  })
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  @app.route('/api/upload', methods=['POST'])
58
  def upload_file():
59
  try:
@@ -142,29 +195,43 @@ def serve_upload(filename):
142
  def scrape():
143
  data = request.get_json()
144
  url = data.get('url')
145
- wait_time = data.get('wait', 3)
146
  screenshot = data.get('screenshot', False)
147
  html_only = data.get('html_only', False)
148
- max_wait = data.get('max_wait', 30)
149
 
150
  if not url:
151
  return jsonify({'success': False, 'error': 'URL is required'}), 400
152
 
153
  driver = None
154
  try:
155
- driver = get_driver()
156
  driver.get(url)
157
 
 
 
158
  start_time = time.time()
 
 
159
  while time.time() - start_time < max_wait:
160
  html = driver.page_source
161
 
162
- if ("Just a moment" not in html and
163
- "Verifying you are human" not in html and
164
- "cf-chl-widget" not in html):
165
- break
 
 
 
 
 
 
 
 
 
 
166
 
167
- time.sleep(2)
168
 
169
  time.sleep(wait_time)
170
 
@@ -173,9 +240,16 @@ def scrape():
173
  current_url = driver.current_url
174
 
175
  if "Just a moment" in html or "Verifying you are human" in html:
 
 
 
 
 
176
  return jsonify({
177
  'success': False,
178
- 'error': 'Cloudflare challenge not solved. Try increasing max_wait.'
 
 
179
  }), 403
180
 
181
  if html_only:
 
7
  import time
8
  from datetime import datetime
9
  from werkzeug.utils import secure_filename
10
+ import cloudscraper
11
 
12
  app = Flask(__name__)
13
  PUBLIC_DIR = 'public'
 
21
  def allowed_file(filename):
22
  return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
23
 
24
+ def get_driver(headless=True):
25
  options = uc.ChromeOptions()
26
+ if headless:
27
+ options.add_argument('--headless=new')
28
  options.add_argument('--no-sandbox')
29
  options.add_argument('--disable-dev-shm-usage')
30
  options.add_argument('--disable-gpu')
31
+ options.add_argument('--window-size=1920,1080')
32
  options.add_argument('--disable-blink-features=AutomationControlled')
33
  options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
34
 
 
39
  Object.defineProperty(navigator, 'webdriver', {
40
  get: () => undefined
41
  });
42
+ Object.defineProperty(navigator, 'plugins', {
43
+ get: () => [1, 2, 3, 4, 5]
44
+ });
45
+ window.chrome = { runtime: {} };
46
  '''
47
  })
48
 
 
53
  return jsonify({
54
  'message': 'Undetected Chrome Scraper API is running',
55
  'endpoints': {
56
+ 'POST /api/scrape': 'Get HTML with Selenium (use for complex sites)',
57
+ 'POST /api/scrape-fast': 'Get HTML with Cloudscraper (bypass Cloudflare)',
58
  'POST /api/execute': 'Execute Python code with Selenium',
59
  'POST /api/upload': 'Upload file to server',
60
  'GET /api/files': 'List uploaded files',
 
62
  }
63
  })
64
 
65
+ @app.route('/api/scrape-fast', methods=['POST'])
66
+ def scrape_fast():
67
+ data = request.get_json()
68
+ url = data.get('url')
69
+ html_only = data.get('html_only', False)
70
+
71
+ if not url:
72
+ return jsonify({'success': False, 'error': 'URL is required'}), 400
73
+
74
+ try:
75
+ scraper = cloudscraper.create_scraper(
76
+ browser={
77
+ 'browser': 'chrome',
78
+ 'platform': 'windows',
79
+ 'desktop': True
80
+ }
81
+ )
82
+
83
+ response = scraper.get(url, timeout=30)
84
+
85
+ if response.status_code == 200:
86
+ html = response.text
87
+
88
+ if html_only:
89
+ return html, 200, {'Content-Type': 'text/html; charset=utf-8'}
90
+
91
+ return jsonify({
92
+ 'success': True,
93
+ 'data': {
94
+ 'html': html,
95
+ 'status_code': response.status_code,
96
+ 'url': response.url,
97
+ 'timestamp': datetime.now().isoformat()
98
+ }
99
+ })
100
+ else:
101
+ return jsonify({
102
+ 'success': False,
103
+ 'error': f'HTTP {response.status_code}',
104
+ 'status_code': response.status_code
105
+ }), response.status_code
106
+
107
+ except Exception as e:
108
+ return jsonify({'success': False, 'error': str(e)}), 500
109
+
110
  @app.route('/api/upload', methods=['POST'])
111
  def upload_file():
112
  try:
 
195
  def scrape():
196
  data = request.get_json()
197
  url = data.get('url')
198
+ wait_time = data.get('wait', 5)
199
  screenshot = data.get('screenshot', False)
200
  html_only = data.get('html_only', False)
201
+ max_wait = data.get('max_wait', 60)
202
 
203
  if not url:
204
  return jsonify({'success': False, 'error': 'URL is required'}), 400
205
 
206
  driver = None
207
  try:
208
+ driver = get_driver(headless=True)
209
  driver.get(url)
210
 
211
+ time.sleep(10)
212
+
213
  start_time = time.time()
214
+ challenge_detected = False
215
+
216
  while time.time() - start_time < max_wait:
217
  html = driver.page_source
218
 
219
+ if "Just a moment" in html or "Verifying you are human" in html or "cf-chl-widget" in html:
220
+ challenge_detected = True
221
+ time.sleep(3)
222
+
223
+ try:
224
+ driver.execute_script("window.scrollTo(0, 100);")
225
+ time.sleep(1)
226
+ except:
227
+ pass
228
+
229
+ continue
230
+
231
+ if challenge_detected:
232
+ time.sleep(5)
233
 
234
+ break
235
 
236
  time.sleep(wait_time)
237
 
 
240
  current_url = driver.current_url
241
 
242
  if "Just a moment" in html or "Verifying you are human" in html:
243
+ if screenshot:
244
+ filename = f'challenge-{int(time.time())}.png'
245
+ filepath = os.path.join(PUBLIC_DIR, filename)
246
+ driver.save_screenshot(filepath)
247
+
248
  return jsonify({
249
  'success': False,
250
+ 'error': 'Cloudflare challenge not solved. Try using /api/scrape-fast instead',
251
+ 'html_preview': html[:500],
252
+ 'screenshot': f"{request.host_url}files/{filename}" if screenshot else None
253
  }), 403
254
 
255
  if html_only: