maylinejix committed on
Commit
075e397
·
verified ·
1 Parent(s): ed74c7f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -4
app.py CHANGED
@@ -5,20 +5,63 @@ from selenium.webdriver.support.ui import WebDriverWait
5
  from selenium.webdriver.support import expected_conditions as EC
6
  import os
7
  import time
8
- import base64
9
  from datetime import datetime
 
10
 
11
  app = Flask(__name__)
12
  PUBLIC_DIR = 'public'
 
13
  os.makedirs(PUBLIC_DIR, exist_ok=True)
 
 
 
 
 
 
 
14
 
15
  def get_driver():
16
  options = uc.ChromeOptions()
17
- options.add_argument('--headless')
18
  options.add_argument('--no-sandbox')
19
  options.add_argument('--disable-dev-shm-usage')
20
  options.add_argument('--disable-gpu')
21
- return uc.Chrome(options=options, use_subprocess=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  @app.route('/')
24
  def index():
@@ -26,16 +69,105 @@ def index():
26
  'message': 'Undetected Chrome Scraper API is running',
27
  'endpoints': {
28
  'POST /api/scrape': 'Get HTML content from URL',
29
- 'POST /api/execute': 'Execute Python code with Selenium'
 
 
 
30
  }
31
  })
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  @app.route('/api/scrape', methods=['POST'])
34
  def scrape():
35
  data = request.get_json()
36
  url = data.get('url')
37
  wait_time = data.get('wait', 3)
38
  screenshot = data.get('screenshot', False)
 
 
39
 
40
  if not url:
41
  return jsonify({'success': False, 'error': 'URL is required'}), 400
@@ -44,12 +176,33 @@ def scrape():
44
  try:
45
  driver = get_driver()
46
  driver.get(url)
 
 
 
 
 
 
 
 
 
 
 
 
47
  time.sleep(wait_time)
48
 
49
  html = driver.page_source
50
  title = driver.title
51
  current_url = driver.current_url
52
 
 
 
 
 
 
 
 
 
 
53
  result = {
54
  'success': True,
55
  'data': {
 
5
  from selenium.webdriver.support import expected_conditions as EC
6
  import os
7
  import time
 
8
  from datetime import datetime
9
+ from werkzeug.utils import secure_filename
10
 
11
  app = Flask(__name__)
12
  PUBLIC_DIR = 'public'
13
+ UPLOAD_DIR = 'uploads'
14
  os.makedirs(PUBLIC_DIR, exist_ok=True)
15
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
16
+
17
# Lower-case file extensions (without the dot) that uploads may carry.
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif', 'html', 'json', 'xml', 'csv'}

# 16 MiB expressed in bytes.
MAX_FILE_SIZE = 16 * 1024 * 1024


def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    Only the text after the last dot is considered, compared
    case-insensitively. Names without a dot, names ending in a dot, and
    non-string values (for example ``None`` when a client omits the
    filename) are rejected instead of raising.
    """
    if not isinstance(filename, str) or '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
22
 
23
  def get_driver():
24
  options = uc.ChromeOptions()
25
+ options.add_argument('--headless=new')
26
  options.add_argument('--no-sandbox')
27
  options.add_argument('--disable-dev-shm-usage')
28
  options.add_argument('--disable-gpu')
29
+ options.add_argument('--disable-blink-features=AutomationControlled')
30
+ options.add_argument('--disable-features=IsolateOrigins,site-per-process')
31
+ options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
32
+ options.add_argument('--disable-web-security')
33
+ options.add_argument('--allow-running-insecure-content')
34
+ options.add_argument('--ignore-certificate-errors')
35
+
36
+ prefs = {
37
+ 'profile.default_content_setting_values.notifications': 2,
38
+ 'credentials_enable_service': False,
39
+ 'profile.password_manager_enabled': False
40
+ }
41
+ options.add_experimental_option('prefs', prefs)
42
+ options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
43
+ options.add_experimental_option('useAutomationExtension', False)
44
+
45
+ driver = uc.Chrome(options=options, use_subprocess=False)
46
+
47
+ driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
48
+ 'source': '''
49
+ Object.defineProperty(navigator, 'webdriver', {
50
+ get: () => undefined
51
+ });
52
+ Object.defineProperty(navigator, 'plugins', {
53
+ get: () => [1, 2, 3, 4, 5]
54
+ });
55
+ Object.defineProperty(navigator, 'languages', {
56
+ get: () => ['en-US', 'en']
57
+ });
58
+ window.chrome = {
59
+ runtime: {}
60
+ };
61
+ '''
62
+ })
63
+
64
+ return driver
65
 
66
  @app.route('/')
67
  def index():
 
69
  'message': 'Undetected Chrome Scraper API is running',
70
  'endpoints': {
71
  'POST /api/scrape': 'Get HTML content from URL',
72
+ 'POST /api/execute': 'Execute Python code with Selenium',
73
+ 'POST /api/upload': 'Upload file to server',
74
+ 'GET /api/files': 'List uploaded files',
75
+ 'GET /uploads/<filename>': 'Download uploaded file'
76
  }
77
  })
78
 
79
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Store one uploaded file under UPLOAD_DIR and describe it as JSON.

    Expects a multipart form with a ``file`` part.

    Responses:
        200: ``{'success': True, 'data': {...}}`` with the stored filename,
             the client-supplied original filename, size and download URL.
        400: missing ``file`` part, empty filename, or a sanitised filename
             that is empty or has a disallowed extension.
        413: the upload is larger than MAX_FILE_SIZE bytes.
        500: any unexpected error; the message is returned in ``error``.
    """
    try:
        if 'file' not in request.files:
            return jsonify({
                'success': False,
                'error': 'No file part in request'
            }), 400

        file = request.files['file']

        # Covers both '' (no file chosen) and a missing filename.
        if not file.filename:
            return jsonify({
                'success': False,
                'error': 'No file selected'
            }), 400

        # Sanitise first, then validate: secure_filename can drop characters
        # (or everything), so the extension check must run on the name that
        # will actually be written to disk.
        filename = secure_filename(file.filename)
        if not filename or not allowed_file(filename):
            allowed = ", ".join(sorted(ALLOWED_EXTENSIONS))
            return jsonify({
                'success': False,
                'error': f'File type not allowed. Allowed types: {allowed}'
            }), 400

        # Measure the upload before writing it, so oversized files never
        # reach UPLOAD_DIR.
        file.stream.seek(0, os.SEEK_END)
        upload_size = file.stream.tell()
        file.stream.seek(0)
        if upload_size > MAX_FILE_SIZE:
            return jsonify({
                'success': False,
                'error': f'File too large. Maximum size is {MAX_FILE_SIZE} bytes'
            }), 413

        timestamp = int(time.time())
        unique_filename = f"{timestamp}_{filename}"
        filepath = os.path.join(UPLOAD_DIR, unique_filename)

        file.save(filepath)

        file_size = os.path.getsize(filepath)

        return jsonify({
            'success': True,
            'data': {
                'filename': unique_filename,
                # The name as sent by the client, not the sanitised one.
                'original_filename': file.filename,
                'size': file_size,
                'size_mb': round(file_size / (1024 * 1024), 2),
                'url': f"{request.host_url}uploads/{unique_filename}",
                'timestamp': datetime.now().isoformat()
            }
        })

    except Exception as e:
        # Top-level boundary for this route: report the failure as JSON.
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
128
+
129
@app.route('/api/files', methods=['GET'])
def list_files():
    """Return a JSON listing of the regular files found directly in UPLOAD_DIR.

    Each entry carries the filename, its size in bytes and in MiB (rounded to
    two decimals), a URL under ``/uploads/`` built from the request's host
    URL, and the modification time as an ISO-8601 string. Any exception is
    reported as a 500 response with the message in ``error``.
    """
    def describe(name, path):
        # Build the JSON record for a single file.
        info = os.stat(path)
        return {
            'filename': name,
            'size': info.st_size,
            'size_mb': round(info.st_size / (1024 * 1024), 2),
            'url': f"{request.host_url}uploads/{name}",
            'modified': datetime.fromtimestamp(info.st_mtime).isoformat()
        }

    try:
        candidates = (
            (name, os.path.join(UPLOAD_DIR, name))
            for name in os.listdir(UPLOAD_DIR)
        )
        # Sub-directories and other non-file entries are left out.
        entries = [
            describe(name, path)
            for name, path in candidates
            if os.path.isfile(path)
        ]

        payload = {
            'total': len(entries),
            'files': entries
        }
        return jsonify({
            'success': True,
            'data': payload
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
158
+
159
@app.route('/uploads/<path:filename>')
def serve_upload(filename):
    """Serve a previously uploaded file from UPLOAD_DIR.

    ``send_from_directory`` resolves *filename* inside UPLOAD_DIR. Files with
    an ``html`` or ``xml`` extension are sent as attachments: they are
    user-supplied, and rendering them inline would let their scripts run in
    this application's origin. Other types are served as before.
    """
    extension = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
    force_download = extension in {'html', 'xml'}
    return send_from_directory(UPLOAD_DIR, filename, as_attachment=force_download)
162
+
163
  @app.route('/api/scrape', methods=['POST'])
164
  def scrape():
165
  data = request.get_json()
166
  url = data.get('url')
167
  wait_time = data.get('wait', 3)
168
  screenshot = data.get('screenshot', False)
169
+ html_only = data.get('html_only', False)
170
+ max_wait = data.get('max_wait', 30)
171
 
172
  if not url:
173
  return jsonify({'success': False, 'error': 'URL is required'}), 400
 
176
  try:
177
  driver = get_driver()
178
  driver.get(url)
179
+
180
+ start_time = time.time()
181
+ while time.time() - start_time < max_wait:
182
+ html = driver.page_source
183
+
184
+ if ("Just a moment" not in html and
185
+ "Verifying you are human" not in html and
186
+ "cf-chl-widget" not in html):
187
+ break
188
+
189
+ time.sleep(2)
190
+
191
  time.sleep(wait_time)
192
 
193
  html = driver.page_source
194
  title = driver.title
195
  current_url = driver.current_url
196
 
197
+ if "Just a moment" in html or "Verifying you are human" in html:
198
+ return jsonify({
199
+ 'success': False,
200
+ 'error': 'Cloudflare challenge not solved. Try increasing max_wait.'
201
+ }), 403
202
+
203
+ if html_only:
204
+ return html, 200, {'Content-Type': 'text/html; charset=utf-8'}
205
+
206
  result = {
207
  'success': True,
208
  'data': {