Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import asyncio | |
| from playwright.async_api import async_playwright | |
| import os | |
| import tempfile | |
| from datetime import datetime | |
| import re | |
# JavaScript run inside the page: concatenates the cssText of every rule of
# every stylesheet, leaving a CSS comment for sheets whose rules cannot be read.
_COLLECT_CSS_JS = '''() => {
let css = '';
for (let sheet of document.styleSheets) {
try {
for (let rule of sheet.cssRules) {
css += rule.cssText + '\\n';
}
} catch (e) {
// Cross-origin stylesheets may not be accessible
css += `/* Could not access stylesheet: ${sheet.href} */\\n`;
}
}
return css;
}'''

# JavaScript run inside the page: inline scripts are returned verbatim,
# external ones are represented by a one-line comment naming their src.
_COLLECT_SCRIPTS_JS = '''() => {
let scripts = [];
document.querySelectorAll('script').forEach(script => {
if (script.src) {
scripts.push(`// External script: ${script.src}`);
} else if (script.textContent) {
scripts.push(script.textContent);
}
});
return scripts.join('\\n\\n' + '='.repeat(50) + '\\n\\n');
}'''


async def convert_website_to_pdf(url, wait_time, progress=gr.Progress()):
    """Render a web page in headless Chromium, save it as a PDF and extract its content.

    Args:
        url: Address to load. Surrounding whitespace is ignored and
            ``https://`` is prepended when no http(s) scheme is present.
        wait_time: Extra seconds to sleep after navigation finishes, giving
            dynamic content time to render. Values <= 0 skip the wait.
        progress: Callable invoked as ``progress(fraction, desc=...)`` at
            each stage of the conversion.

    Returns:
        A 5-tuple ``(pdf_path, status, html, css, js)``. On any failure
        ``pdf_path`` is ``None``, ``status`` carries the error message and
        the three content strings are empty. Errors are reported through
        the tuple; the function does not raise for conversion failures.
    """
    # NOTE(review): the leading "β" / "π" glyphs in the messages below look
    # like mis-decoded emoji; they are reproduced unchanged here - confirm the
    # file's intended encoding before touching them.

    # Treat None and whitespace-only input the same as an empty textbox.
    url = (url or "").strip()
    if not url:
        return None, "β Please enter a URL", "", "", ""

    # Compare the scheme case-insensitively so "HTTP://host" is not turned
    # into "https://HTTP://host".
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url

    # NOTE(review): url is user-supplied and is fetched from this machine's
    # network; consider rejecting internal/private hosts if that matters here.

    progress(0, desc="Initializing browser...")

    try:
        async with async_playwright() as p:
            progress(0.1, desc="Launching browser...")
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )
                page = await context.new_page()

                progress(0.2, desc=f"Loading {url}...")
                try:
                    await page.goto(url, wait_until='networkidle', timeout=30000)
                except Exception as e:
                    # Navigation problems get their own message; the browser
                    # is closed by the enclosing ``finally``.
                    return None, f"β Failed to load URL: {e}", "", "", ""

                if wait_time > 0:
                    progress(0.3, desc=f"Waiting {wait_time} seconds for page to fully load...")
                    await asyncio.sleep(wait_time)

                progress(0.5, desc="Extracting HTML content...")
                html_content = await page.content()

                progress(0.6, desc="Extracting CSS styles...")
                css_content = await page.evaluate(_COLLECT_CSS_JS)

                progress(0.7, desc="Extracting JavaScript...")
                js_content = await page.evaluate(_COLLECT_SCRIPTS_JS)

                progress(0.8, desc="Converting to PDF...")
                # File name: sanitized first 50 characters of the URL plus a
                # second-resolution timestamp, placed in the system temp dir.
                temp_dir = tempfile.gettempdir()
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                safe_url = re.sub(r'[^\w\-_]', '_', url[:50])
                pdf_filename = f"website_{safe_url}_{timestamp}.pdf"
                pdf_path = os.path.join(temp_dir, pdf_filename)

                await page.pdf(
                    path=pdf_path,
                    format='A4',
                    print_background=True,
                    margin={'top': '20px', 'right': '20px', 'bottom': '20px', 'left': '20px'}
                )

                progress(0.9, desc="Finalizing...")
            finally:
                # Close the browser on every path, including failures during
                # extraction or PDF generation.
                await browser.close()

        status = f"""β **Conversion Successful!**
π **URL:** {url}
β±οΈ **Wait Time:** {wait_time} seconds
π **HTML Size:** {len(html_content):,} characters
π¨ **CSS Size:** {len(css_content):,} characters
β‘ **JavaScript Size:** {len(js_content):,} characters
π₯ **PDF Generated:** {pdf_filename}
"""
        progress(1.0, desc="Complete!")
        return pdf_path, status, html_content, css_content, js_content

    except Exception as e:
        # Boundary for the UI callback: surface any failure as a status
        # message instead of raising into the caller.
        return None, f"β Error: {e}", "", "", ""
def sync_convert(url, wait_time, progress=gr.Progress()):
    """Blocking entry point: run ``convert_website_to_pdf`` to completion.

    The coroutine is driven by ``asyncio.run`` and its result is returned
    unchanged, so the arguments and the returned tuple are exactly those of
    ``convert_website_to_pdf``.
    """
    conversion = convert_website_to_pdf(url, wait_time, progress)
    return asyncio.run(conversion)
# Stylesheet handed to gr.Blocks(css=...): styles the #status_box element,
# spaces .progress-container and hides the page footer.
# The empty first and last entries reproduce the leading and trailing newline
# of the stylesheet text.
custom_css = "\n".join(
    (
        "",
        "#status_box {",
        "border-left: 4px solid #10b981;",
        "background-color: #1f2937;",
        "padding: 15px;",
        "border-radius: 8px;",
        "color: #f9fafb;",
        "}",
        "#status_box * {",
        "color: #f9fafb !important;",
        "}",
        ".progress-container {",
        "margin: 20px 0;",
        "}",
        "footer {",
        "display: none !important;",
        "}",
        "",
    )
)
# Page assembly. Components are created in the same order as before, because
# creation order inside the Blocks context determines the page layout.
# NOTE(review): the nesting of rows/columns below is reconstructed from the
# statement structure - confirm it against the intended layout.

_INTRO_MD = """
# π Website to PDF Converter
Convert any website to PDF while extracting HTML, CSS, and JavaScript content.
Perfect for archiving, analysis, or offline viewing.
"""

_HOW_IT_WORKS_MD = """
---
### βΉοΈ How It Works
1. **Enter URL**: Provide the website address you want to convert
2. **Set Wait Time**: Configure how long to wait for dynamic content (AJAX, JavaScript)
3. **Click Convert**: The app will load the page, extract content, and generate a PDF
4. **Download**: Get your PDF and view the extracted HTML, CSS, and JavaScript
**Note:** Some websites may block automated access or have anti-bot measures.
"""

# Each row is [url, wait time in seconds], matching the two input components.
_EXAMPLE_ROWS = [
    ["https://example.com", 1],
    ["https://wikipedia.org", 2],
    ["https://github.com", 3],
]


def _code_tab(tab_title, code_label, language):
    """Add a tab containing a 15-line code viewer and return the viewer."""
    with gr.Tab(tab_title):
        return gr.Code(
            show_label=False,
            lines=15,
            language=language,
            label=code_label,
        )


with gr.Blocks(title="Website to PDF Converter", css=custom_css) as demo:
    gr.Markdown(_INTRO_MD)

    # Inputs: URL box (wider column) next to the wait-time slider.
    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                lines=1,
                placeholder="Enter URL (e.g., https://example.com or example.com)",
                label="Website URL",
            )
        with gr.Column(scale=1):
            wait_time = gr.Slider(
                label="Wait Time (seconds)",
                info="Time to wait for dynamic content to load",
                minimum=0,
                maximum=10,
                step=0.5,
                value=2,
            )

    convert_btn = gr.Button("π Convert to PDF", size="lg", variant="primary")

    # elem_id ties this component to the #status_box rules in custom_css.
    status_output = gr.Markdown(elem_id="status_box", label="Status")

    with gr.Row():
        pdf_output = gr.File(file_types=[".pdf"], label="π₯ Download PDF")

    gr.Markdown("### π Extracted Content")
    with gr.Tabs():
        html_output = _code_tab("HTML", "HTML Content", "html")
        css_output = _code_tab("CSS", "CSS Styles", "css")
        js_output = _code_tab("JavaScript", "JavaScript Code", "javascript")

    gr.Markdown("### π‘ Try These Examples")
    gr.Examples(
        label="Example Websites",
        inputs=[url_input, wait_time],
        examples=_EXAMPLE_ROWS,
    )

    # Output order must match the 5-tuple returned by sync_convert:
    # (pdf path, status text, html, css, js).
    conversion_inputs = [url_input, wait_time]
    conversion_outputs = [pdf_output, status_output, html_output, css_output, js_output]
    convert_btn.click(
        fn=sync_convert,
        inputs=conversion_inputs,
        outputs=conversion_outputs,
    )

    gr.Markdown(_HOW_IT_WORKS_MD)
if __name__ == "__main__":
    # Start the web server only when executed as a script: listen on
    # 0.0.0.0:7860 with Gradio's share option turned off.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)