"""Gradio app that renders a website to PDF with Playwright.

Besides the PDF, the app returns the page's serialized HTML, the text of its
accessible CSS rules and the contents of its inline scripts.
"""

import asyncio
import os
import re
import tempfile
from datetime import datetime

import gradio as gr
from playwright.async_api import async_playwright

# Upper bound (milliseconds) for the initial page navigation.
NAVIGATION_TIMEOUT_MS = 30000

# Runs in the page: concatenates the text of every CSS rule that is readable.
# Reading `cssRules` of a cross-origin stylesheet throws, so those sheets are
# replaced by a placeholder comment instead of aborting the extraction.
_EXTRACT_CSS_SCRIPT = '''() => {
    let css = '';
    for (let sheet of document.styleSheets) {
        try {
            for (let rule of sheet.cssRules) {
                css += rule.cssText + '\\n';
            }
        } catch (e) {
            // Cross-origin stylesheets may not be accessible
            css += `/* Could not access stylesheet: ${sheet.href} */\\n`;
        }
    }
    return css;
}'''

# Runs in the page: collects inline script bodies; external scripts are only
# listed by URL. Entries are separated by a 50-character '=' ruler.
_EXTRACT_JS_SCRIPT = '''() => {
    let scripts = [];
    document.querySelectorAll('script').forEach(script => {
        if (script.src) {
            scripts.push(`// External script: ${script.src}`);
        } else if (script.textContent) {
            scripts.push(script.textContent);
        }
    });
    return scripts.join('\\n\\n' + '='.repeat(50) + '\\n\\n');
}'''


def _failure(message):
    """Return the 5-tuple the UI expects when no PDF could be produced."""
    return None, message, "", "", ""


def _normalize_url(url):
    """Strip *url* and default it to HTTPS when no HTTP(S) scheme is present.

    Returns an empty string for ``None``, empty or whitespace-only input.
    """
    url = (url or "").strip()
    if not url:
        return ""
    # Compare case-insensitively so "HTTP://host" is not prefixed a second time.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url


def _build_pdf_path(url):
    """Return ``(pdf_path, pdf_filename)`` for a new PDF of *url*.

    The filename is derived from the first 50 characters of the URL plus a
    timestamp. The file is placed in a freshly created temporary directory so
    that two conversions of the same URL within the same second cannot
    overwrite each other.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_url = re.sub(r'[^\w\-_]', '_', url[:50])
    pdf_filename = f"website_{safe_url}_{timestamp}.pdf"
    output_dir = tempfile.mkdtemp(prefix="website_pdf_")
    return os.path.join(output_dir, pdf_filename), pdf_filename


def _format_status(url, wait_time, html_content, css_content, js_content, pdf_filename):
    """Build the Markdown summary shown after a successful conversion."""
    entries = [
        "✅ **Conversion Successful!**",
        f"📄 **URL:** {url}",
        f"⏱️ **Wait Time:** {wait_time} seconds",
        f"📊 **HTML Size:** {len(html_content):,} characters",
        f"🎨 **CSS Size:** {len(css_content):,} characters",
        f"⚡ **JavaScript Size:** {len(js_content):,} characters",
        f"📥 **PDF Generated:** {pdf_filename}",
    ]
    # Blank lines make every entry its own Markdown paragraph.
    return "\n\n".join(entries) + "\n"


async def convert_website_to_pdf(url, wait_time, progress=gr.Progress()):
    """Convert a website to PDF and extract its content.

    Args:
        url: Address of the page. ``https://`` is prepended when the value
            does not start with ``http://`` or ``https://``.
        wait_time: Extra seconds to sleep after navigation finishes, giving
            dynamic content time to render. ``None`` or values <= 0 skip the
            wait.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        A tuple ``(pdf_path, status, html, css, js)``. On failure ``pdf_path``
        is ``None``, ``status`` holds the error message and the three content
        strings are empty. Errors are reported through this tuple; the
        function does not raise for conversion failures.
    """
    url = _normalize_url(url)
    if not url:
        return _failure("❌ Please enter a URL")

    progress(0, desc="Initializing browser...")

    try:
        async with async_playwright() as p:
            progress(0.1, desc="Launching browser...")
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )
                page = await context.new_page()

                progress(0.2, desc=f"Loading {url}...")
                try:
                    await page.goto(
                        url,
                        wait_until='networkidle',
                        timeout=NAVIGATION_TIMEOUT_MS
                    )
                except Exception as e:
                    # Navigation problems get their own, more specific message.
                    return _failure(f"❌ Failed to load URL: {str(e)}")

                if wait_time and wait_time > 0:
                    progress(
                        0.3,
                        desc=f"Waiting {wait_time} seconds for page to fully load..."
                    )
                    await asyncio.sleep(wait_time)

                progress(0.5, desc="Extracting HTML content...")
                html_content = await page.content()

                progress(0.6, desc="Extracting CSS styles...")
                css_content = await page.evaluate(_EXTRACT_CSS_SCRIPT)

                progress(0.7, desc="Extracting JavaScript...")
                js_content = await page.evaluate(_EXTRACT_JS_SCRIPT)

                progress(0.8, desc="Converting to PDF...")
                pdf_path, pdf_filename = _build_pdf_path(url)
                await page.pdf(
                    path=pdf_path,
                    format='A4',
                    print_background=True,
                    margin={
                        'top': '20px',
                        'right': '20px',
                        'bottom': '20px',
                        'left': '20px'
                    }
                )

                progress(0.9, desc="Finalizing...")
            finally:
                # Close the browser on every path, including extraction or
                # PDF errors, so no Chromium process is left behind.
                await browser.close()

        status = _format_status(
            url, wait_time, html_content, css_content, js_content, pdf_filename
        )
        progress(1.0, desc="Complete!")
        return pdf_path, status, html_content, css_content, js_content

    except Exception as e:
        # Top-level boundary: report any failure to the UI instead of raising.
        return _failure(f"❌ Error: {str(e)}")


def sync_convert(url, wait_time, progress=gr.Progress()):
    """Synchronous wrapper for the async function.

    Runs :func:`convert_website_to_pdf` to completion on a new event loop and
    returns its result tuple unchanged.
    """
    return asyncio.run(convert_website_to_pdf(url, wait_time, progress))


# Custom CSS for better styling
custom_css = """
#status_box {
    border-left: 4px solid #10b981;
    background-color: #1f2937;
    padding: 15px;
    border-radius: 8px;
    color: #f9fafb;
}
#status_box * {
    color: #f9fafb !important;
}
.progress-container {
    margin: 20px 0;
}
footer {
    display: none !important;
}
"""

# Markdown blocks are kept at column 0 so their rendering does not depend on
# the indentation of the layout code below.
_INTRO_MARKDOWN = """
# 🌐 Website to PDF Converter

Convert any website to PDF while extracting HTML, CSS, and JavaScript content.
Perfect for archiving, analysis, or offline viewing.
"""

_HOW_IT_WORKS_MARKDOWN = """
---
### ℹ️ How It Works

1. **Enter URL**: Provide the website address you want to convert
2. **Set Wait Time**: Configure how long to wait for dynamic content (AJAX, JavaScript)
3. **Click Convert**: The app will load the page, extract content, and generate a PDF
4. **Download**: Get your PDF and view the extracted HTML, CSS, and JavaScript

**Note:** Some websites may block automated access or have anti-bot measures.
"""

# Create Gradio interface
with gr.Blocks(css=custom_css, title="Website to PDF Converter") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Website URL",
                placeholder="Enter URL (e.g., https://example.com or example.com)",
                lines=1
            )
        with gr.Column(scale=1):
            wait_time = gr.Slider(
                minimum=0,
                maximum=10,
                value=2,
                step=0.5,
                label="Wait Time (seconds)",
                info="Time to wait for dynamic content to load"
            )

    convert_btn = gr.Button("🚀 Convert to PDF", variant="primary", size="lg")

    status_output = gr.Markdown(label="Status", elem_id="status_box")

    with gr.Row():
        pdf_output = gr.File(label="📥 Download PDF", file_types=[".pdf"])

    gr.Markdown("### 📋 Extracted Content")

    with gr.Tabs():
        with gr.Tab("HTML"):
            html_output = gr.Code(
                label="HTML Content",
                language="html",
                lines=15,
                show_label=False
            )
        with gr.Tab("CSS"):
            css_output = gr.Code(
                label="CSS Styles",
                language="css",
                lines=15,
                show_label=False
            )
        with gr.Tab("JavaScript"):
            js_output = gr.Code(
                label="JavaScript Code",
                language="javascript",
                lines=15,
                show_label=False
            )

    # Examples
    gr.Markdown("### 💡 Try These Examples")
    gr.Examples(
        examples=[
            ["https://example.com", 1],
            ["https://wikipedia.org", 2],
            ["https://github.com", 3],
        ],
        inputs=[url_input, wait_time],
        label="Example Websites"
    )

    # Event handler
    convert_btn.click(
        fn=sync_convert,
        inputs=[url_input, wait_time],
        outputs=[pdf_output, status_output, html_output, css_output, js_output]
    )

    gr.Markdown(_HOW_IT_WORKS_MARKDOWN)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )