import os
import shutil
import tempfile
from io import BytesIO
from urllib.parse import urljoin, urlparse
from zipfile import ZipFile

import gradio as gr
import requests
from bs4 import BeautifulSoup
|
|
def download_file(url, session, timeout=30):
    """Fetch *url* with *session* and return the raw response body.

    Args:
        url: Absolute URL of the asset to download.
        session: A requests.Session (or compatible object exposing ``get``).
        timeout: Seconds to wait for the server; without one a stalled
            server would hang the whole app indefinitely.

    Returns:
        The response body as bytes, or ``None`` if the request failed
        (errors are reported on stdout rather than raised, so one bad
        asset does not abort the whole page download).
    """
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None
|
|
def save_webpage_as_zip(url):
    """Download a webpage plus its img/link/script assets into an in-memory ZIP.

    Args:
        url: Page URL to fetch.

    Returns:
        io.BytesIO positioned at offset 0, containing a ZIP archive with the
        page stored as ``index.html`` and each asset stored under its URL path.

    Raises:
        requests.exceptions.RequestException: if the main page request fails
            (individual asset failures are skipped, not raised).
    """
    session = requests.Session()
    response = session.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Unique scratch directory per call: the previous fixed 'temp_webpage'
    # name raced when two requests ran concurrently (e.g. two Gradio users).
    temp_dir = tempfile.mkdtemp(prefix='webpage_')
    try:
        with open(os.path.join(temp_dir, 'index.html'), 'wb') as f:
            f.write(response.content)

        # Collect asset URLs; a dict preserves insertion order while
        # dropping duplicates so each asset is downloaded only once.
        assets = {}
        for tag in soup.find_all(['img', 'link', 'script']):
            ref = tag.get('href') if tag.name == 'link' else tag.get('src')
            if ref:
                assets[ref] = None

        for asset in assets:
            asset_url = urljoin(url, asset)
            asset_path = urlparse(asset_url).path.lstrip('/')
            # Skip entries with no usable file name: a bare domain yields an
            # empty path, and a trailing '/' denotes a directory.
            if not asset_path or asset_path.endswith('/'):
                print(f"Skipping directory {asset_url}")
                continue

            asset_full_path = os.path.join(temp_dir, asset_path)
            os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)

            content = download_file(asset_url, session)
            # 'is not None' so a legitimately empty file (b'') is still
            # written; the old truthiness test silently dropped zero-byte
            # assets. Also guard against a path that collides with a dir.
            if content is not None and not os.path.isdir(asset_full_path):
                with open(asset_full_path, 'wb') as f:
                    f.write(content)

        zip_buffer = BytesIO()
        with ZipFile(zip_buffer, 'w') as zipf:
            for root, _, files in os.walk(temp_dir):
                for name in files:
                    file_path = os.path.join(root, name)
                    zipf.write(file_path, os.path.relpath(file_path, temp_dir))
    finally:
        # Always clean up, even when an asset download raised mid-way; the
        # old manual walk-and-remove ran only on the success path.
        shutil.rmtree(temp_dir, ignore_errors=True)

    zip_buffer.seek(0)
    return zip_buffer
|
|
def generate_zip_file(url):
    """Build the ZIP archive for *url* and persist it to a unique temp file.

    Args:
        url: Webpage URL to archive.

    Returns:
        Filesystem path (str) of the written ZIP file, suitable for gr.File.
    """
    zip_buffer = save_webpage_as_zip(url)
    # A unique NamedTemporaryFile replaces the fixed 'webpage.zip' name,
    # which clobbered itself when two requests ran at once and littered
    # the current working directory.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as f:
        f.write(zip_buffer.getvalue())
        return f.name
|
|
# Sample URLs offered in the gr.Examples widget below.
examples = [
    "https://www.bmw.com/en/index.html",
    "https://www.ferrari.com/en-EN",
    "https://streamlit.io/",
]

# Markdown heading rendered at the top of the app.
DESCRIPTION = """

## Webpage to ZIP Downloader 🔗
"""
|
|
# Build and launch the Gradio UI: a URL textbox, a download button, and a
# file component that receives the generated ZIP.
with gr.Blocks(theme="gstaff/whiteboard") as demo:
    gr.Markdown(DESCRIPTION)
    gr.Markdown("Enter a URL to download the webpage and its assets as a ZIP file.")

    url_input = gr.Textbox(label="Website URL", placeholder="Enter a URL (e.g., https://www.example.com)")

    download_button = gr.Button("Download as ZIP")
    output_file = gr.File(label="Download")

    # NOTE(review): removed the dead `set_example_url` helper — it was never
    # attached to any event, and assigning `url_input.value` after build does
    # not update a live Gradio UI anyway; gr.Examples below already fills the
    # textbox when an example is clicked.

    download_button.click(fn=generate_zip_file, inputs=url_input, outputs=output_file)

    gr.Examples(
        examples=examples,
        inputs=url_input,
        outputs=output_file,
        fn=generate_zip_file,
    )

demo.launch()
|
|