Scrape / app.py
maylinejix's picture
Update app.py
e0b0ec0 verified
from flask import Flask, request, jsonify, send_from_directory, make_response
import os
import time
import random
import base64
from datetime import datetime
from werkzeug.utils import secure_filename
import requests
# Flask application object; every route in this module is registered on it.
app = Flask(__name__)
# PUBLIC_DIR is served by the /files/<path> route; UPLOAD_DIR receives files
# from /api/upload and is served by the /uploads/<path> route.
PUBLIC_DIR = 'public'
UPLOAD_DIR = 'uploads'
# Create both directories at import time; exist_ok=True makes this a no-op
# when they already exist. The paths are relative, so they are created under
# the process's current working directory.
os.makedirs(PUBLIC_DIR, exist_ok=True)
os.makedirs(UPLOAD_DIR, exist_ok=True)
# Lower-case file extensions (without the dot) that allowed_file() accepts.
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif', 'html', 'json', 'xml', 'csv'}
# User-Agent strings used by /api/get: tried in list order until one request
# succeeds, or a single one is picked at random when the caller sets
# try_all to false.
USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0',
'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (iPad; CPU OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36'
]
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot and is compared
    case-insensitively. A name that contains no dot is always rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        # No dot anywhere in the name, so there is no extension to check.
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
@app.route('/')
def index():
    """Return a JSON description of the service and its API endpoints."""
    endpoint_summary = {
        'POST /api/get': 'Fetch URL (auto-try all UAs until success)',
        'POST /api/upload': 'Upload file',
        'GET /api/files': 'List files',
    }
    return jsonify(
        message='Scraper API with Smart Cloudflare Bypass',
        endpoints=endpoint_summary,
    )
# Substrings whose presence in a response body is treated as a Cloudflare
# challenge page rather than real content.
_CLOUDFLARE_MARKERS = ('Just a moment', 'Verifying you are human', 'cf-chl-widget')


def _browser_headers(user_agent):
    """Return browser-like request headers carrying *user_agent*."""
    return {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        # NOTE(review): 'br' is advertised here; requests presumably decodes
        # Brotli only when a brotli package is installed - confirm, otherwise
        # response.text may be garbled for servers that choose 'br'.
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }


def _is_cloudflare_challenge(html):
    """Return True when *html* contains any known Cloudflare challenge marker."""
    return any(marker in html for marker in _CLOUDFLARE_MARKERS)


@app.route('/api/get', methods=['POST'])
def get_url():
    """Fetch a caller-supplied URL, cycling through USER_AGENTS until one works.

    Expects a JSON object body with:
        url       -- required http(s) URL to fetch.
        html_only -- optional; when truthy the body is returned base64-encoded
                     under 'html_base64' instead of the full 'data' object.
        try_all   -- optional, default True; when falsy a single request is
                     made with a randomly chosen User-Agent via fetch_with_ua().

    Returns:
        400 when the URL is missing or is not an http(s) string.
        200 with the page on the first attempt that is neither HTTP 403 nor a
        detected Cloudflare challenge.
        403 with the list of failed attempts when every User-Agent failed.
    """
    # silent=True makes a missing, malformed or wrongly-typed JSON body yield
    # None instead of aborting the request; anything that is not a JSON
    # object is treated as an empty payload so the 'URL required' check below
    # answers with a clean 400 instead of crashing on `.get`.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}

    url = payload.get('url')
    html_only = payload.get('html_only', False)
    try_all = payload.get('try_all', True)

    if not url:
        return jsonify({'success': False, 'error': 'URL required'}), 400

    # Reject values that can never be fetched up front; otherwise each of the
    # User-Agents would be tried in turn and the caller would get a misleading
    # 403 for what is really a bad request.
    if not isinstance(url, str) or not url.strip().lower().startswith(('http://', 'https://')):
        return jsonify({'success': False, 'error': 'URL must be an http(s) URL'}), 400
    url = url.strip()

    # NOTE(review): any http(s) URL is fetched server-side, including
    # loopback/private addresses (SSRF). Restrict targets if this endpoint is
    # reachable by untrusted clients.

    if not try_all:
        return fetch_with_ua(url, random.choice(USER_AGENTS), html_only)

    failed_attempts = []
    for user_agent in USER_AGENTS:
        try:
            response = requests.get(
                url,
                headers=_browser_headers(user_agent),
                timeout=15,
                allow_redirects=True,
            )
            html = response.text
        except requests.exceptions.Timeout:
            failed_attempts.append({
                'user_agent': user_agent,
                'status': 'timeout',
                'reason': 'Request timeout'
            })
            continue
        except Exception as exc:
            # Best effort: record the failure and move on to the next
            # User-Agent rather than failing the whole request.
            failed_attempts.append({
                'user_agent': user_agent,
                'status': 'error',
                'reason': str(exc)
            })
            continue

        if response.status_code == 403:
            failed_attempts.append({
                'user_agent': user_agent,
                'status': 403,
                'reason': 'Forbidden'
            })
            continue

        if _is_cloudflare_challenge(html):
            failed_attempts.append({
                'user_agent': user_agent,
                'status': response.status_code,
                'reason': 'Cloudflare challenge detected'
            })
            continue

        attempts = len(failed_attempts) + 1
        if html_only:
            html_b64 = base64.b64encode(html.encode('utf-8')).decode('utf-8')
            return jsonify({
                'success': True,
                'html_base64': html_b64,
                'user_agent': user_agent,
                'attempts': attempts
            })
        return jsonify({
            'success': True,
            'data': {
                'html': html,
                'status_code': response.status_code,
                'url': str(response.url),
                'user_agent': user_agent,
                'attempts': attempts,
                'failed_attempts': failed_attempts,
                'timestamp': datetime.now().isoformat()
            }
        })

    return jsonify({
        'success': False,
        'error': 'All user agents failed',
        'total_attempts': len(USER_AGENTS),
        'failed_attempts': failed_attempts
    }), 403
def fetch_with_ua(url, user_agent, html_only):
    """Fetch *url* once using *user_agent* and wrap the result as JSON.

    When *html_only* is truthy the body is returned base64-encoded under
    'html_base64'; otherwise a 'data' object with the HTML, status code,
    final URL, User-Agent and a timestamp is returned. Any exception is
    reported as a JSON error with HTTP status 500.
    """
    request_headers = {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
    try:
        reply = requests.get(url, headers=request_headers, timeout=30, allow_redirects=True)
        body = reply.text

        if not html_only:
            page_info = {
                'html': body,
                'status_code': reply.status_code,
                'url': str(reply.url),
                'user_agent': user_agent,
                'timestamp': datetime.now().isoformat()
            }
            return jsonify({'success': True, 'data': page_info})

        encoded_body = base64.b64encode(body.encode('utf-8')).decode('utf-8')
        return jsonify({
            'success': True,
            'html_base64': encoded_body,
            'user_agent': user_agent
        })
    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Store an uploaded file in UPLOAD_DIR and describe it as JSON.

    Expects a multipart form field named 'file'. The client filename is
    sanitized with secure_filename() and the *sanitized* name must carry an
    allowed extension; it is then stored as '<unix-seconds>_<name>'.

    Returns:
        400 when no file was sent, no filename was given, or the file type is
        not allowed.
        200 with the stored filename, size and URL on success.
        500 with the error text when saving fails.
    """
    try:
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'No file'}), 400

        file = request.files['file']
        if not file.filename:
            return jsonify({'success': False, 'error': 'No file selected'}), 400

        # Validate the name that will actually be written to disk.
        # secure_filename() can strip leading dots and non-ASCII characters,
        # so a raw name such as '.png' would pass an extension check on the
        # raw name and then be stored as 'png' with no extension at all.
        filename = secure_filename(file.filename)
        if not filename or not allowed_file(filename):
            return jsonify({'success': False, 'error': 'File type not allowed'}), 400

        # NOTE(review): the prefix has one-second resolution, so two uploads
        # of the same name within the same second overwrite each other.
        timestamp = int(time.time())
        unique_filename = f"{timestamp}_{filename}"
        filepath = os.path.join(UPLOAD_DIR, unique_filename)
        file.save(filepath)

        file_size = os.path.getsize(filepath)
        return jsonify({
            'success': True,
            'data': {
                'filename': unique_filename,
                'original_filename': filename,
                'size': file_size,
                'size_mb': round(file_size / (1024 * 1024), 2),
                'url': f"{request.host_url}uploads/{unique_filename}",
                'timestamp': datetime.now().isoformat()
            }
        })
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/files', methods=['GET'])
def list_files():
    """List the regular files in UPLOAD_DIR as JSON.

    Each entry carries the filename, size in bytes and megabytes, a download
    URL under /uploads/ and the last-modified time. Any exception is
    reported as a JSON error with HTTP status 500.
    """
    try:
        entries = []
        for name in os.listdir(UPLOAD_DIR):
            path = os.path.join(UPLOAD_DIR, name)
            if not os.path.isfile(path):
                # Skip sub-directories and anything else that is not a file.
                continue
            info = os.stat(path)
            size_bytes = info.st_size
            modified_at = datetime.fromtimestamp(info.st_mtime)
            entries.append({
                'filename': name,
                'size': size_bytes,
                'size_mb': round(size_bytes / (1024 * 1024), 2),
                'url': f"{request.host_url}uploads/{name}",
                'modified': modified_at.isoformat()
            })

        listing = {'total': len(entries), 'files': entries}
        return jsonify({'success': True, 'data': listing})
    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500
@app.route('/uploads/<path:filename>')
def serve_upload(filename):
    """Serve *filename* from the upload directory.

    Flask's send_from_directory() resolves a relative directory against the
    application's root path, while uploads are written and listed relative to
    the process's working directory. Passing an absolute path makes this
    route read from the same directory the uploads were stored in, even when
    the two locations differ.
    """
    return send_from_directory(os.path.abspath(UPLOAD_DIR), filename)
@app.route('/files/<path:filename>')
def serve_file(filename):
    """Serve *filename* from PUBLIC_DIR via Flask's send_from_directory()."""
    public_response = send_from_directory(PUBLIC_DIR, filename)
    return public_response
if __name__ == '__main__':
    # Listen on all interfaces; the port comes from the PORT environment
    # variable and falls back to 7860 when it is not set.
    listen_port = int(os.environ.get('PORT', 7860))
    app.run(host='0.0.0.0', port=listen_port)