# Spaces: namelessai / web2pdf (status: Sleeping)
# File size: 7,915 Bytes

import gradio as gr
import asyncio
from playwright.async_api import async_playwright
import os
import tempfile
from datetime import datetime
import re

def _normalize_url(url):
    """Return *url* without surrounding whitespace and with a scheme ensured.

    ``None`` and blank input yield an empty string. The scheme check is
    case-insensitive; anything that does not already start with ``http://``
    or ``https://`` gets ``https://`` prepended.
    """
    url = (url or "").strip()
    if url and not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url
    return url


async def convert_website_to_pdf(url, wait_time, progress=gr.Progress()):
    """Render a website to PDF and extract its HTML, CSS and JavaScript.

    Args:
        url: Address of the page to convert. ``https://`` is prepended when
            no http(s) scheme is present.
        wait_time: Extra seconds to sleep after navigation so dynamic content
            can settle. ``None`` is treated as 0.
        progress: Gradio progress callback, called with a fraction and a
            ``desc`` label at each stage.

    Returns:
        A 5-tuple ``(pdf_path, status, html, css, js)``. On failure
        ``pdf_path`` is ``None``, ``status`` carries the error message and
        the three content strings are empty. Errors are reported through the
        return value rather than raised.
    """
    url = _normalize_url(url)
    if not url:
        return None, "❌ Please enter a URL", "", "", ""

    if wait_time is None:
        wait_time = 0

    # NOTE(security): `url` is user-supplied and is fetched by a browser
    # running on this host, so internal/private addresses are reachable
    # through this function. Restrict targets if the app is exposed publicly.

    progress(0, desc="Initializing browser...")

    pdf_path = None
    try:
        async with async_playwright() as p:
            # Launch browser
            progress(0.1, desc="Launching browser...")
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )
                page = await context.new_page()

                # Navigate to URL
                progress(0.2, desc=f"Loading {url}...")
                try:
                    await page.goto(url, wait_until='networkidle', timeout=30000)
                except Exception as e:
                    # The enclosing `finally` closes the browser.
                    return None, f"❌ Failed to load URL: {str(e)}", "", "", ""

                # Wait for specified time
                if wait_time > 0:
                    progress(0.3, desc=f"Waiting {wait_time} seconds for page to fully load...")
                    await asyncio.sleep(wait_time)

                # Extract HTML
                progress(0.5, desc="Extracting HTML content...")
                html_content = await page.content()

                # Extract CSS: concatenate the text of every readable rule.
                progress(0.6, desc="Extracting CSS styles...")
                css_content = await page.evaluate('''() => {
                    let css = '';
                    for (let sheet of document.styleSheets) {
                        try {
                            for (let rule of sheet.cssRules) {
                                css += rule.cssText + '\\n';
                            }
                        } catch (e) {
                            // Cross-origin stylesheets may not be accessible
                            css += `/* Could not access stylesheet: ${sheet.href} */\\n`;
                        }
                    }
                    return css;
                }''')

                # Extract JavaScript: inline scripts verbatim, external
                # scripts as a one-line reference to their src.
                progress(0.7, desc="Extracting JavaScript...")
                js_content = await page.evaluate('''() => {
                    let scripts = [];
                    document.querySelectorAll('script').forEach(script => {
                        if (script.src) {
                            scripts.push(`// External script: ${script.src}`);
                        } else if (script.textContent) {
                            scripts.push(script.textContent);
                        }
                    });
                    return scripts.join('\\n\\n' + '='.repeat(50) + '\\n\\n');
                }''')

                # Generate PDF
                progress(0.8, desc="Converting to PDF...")

                # mkstemp guarantees a unique path, so two conversions of the
                # same URL within one second cannot overwrite each other.
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                safe_url = re.sub(r'[^\w\-_]', '_', url[:50])
                fd, pdf_path = tempfile.mkstemp(
                    prefix=f"website_{safe_url}_{timestamp}_",
                    suffix=".pdf",
                )
                os.close(fd)  # Playwright writes to the path itself
                pdf_filename = os.path.basename(pdf_path)

                await page.pdf(
                    path=pdf_path,
                    format='A4',
                    print_background=True,
                    margin={'top': '20px', 'right': '20px', 'bottom': '20px', 'left': '20px'}
                )

                progress(0.9, desc="Finalizing...")
            finally:
                # Close the browser on every path, including extraction or
                # PDF failures.
                await browser.close()

            # Prepare status message
            status = f"""✅ **Conversion Successful!**
            
📄 **URL:** {url}
⏱️ **Wait Time:** {wait_time} seconds
📊 **HTML Size:** {len(html_content):,} characters
🎨 **CSS Size:** {len(css_content):,} characters
⚡ **JavaScript Size:** {len(js_content):,} characters
📥 **PDF Generated:** {pdf_filename}
"""

            progress(1.0, desc="Complete!")

            return pdf_path, status, html_content, css_content, js_content

    except Exception as e:
        # Top-level boundary: the UI expects a status string, not a traceback.
        if pdf_path is not None:
            # Best-effort removal of a partial/empty temp file.
            try:
                os.remove(pdf_path)
            except OSError:
                pass
        return None, f"❌ Error: {str(e)}", "", "", ""

def sync_convert(url, wait_time, progress=gr.Progress()):
    """Run the async converter to completion and return its result.

    Blocking entry point for callers that are not running inside an event
    loop; arguments are forwarded unchanged to ``convert_website_to_pdf``.
    """
    conversion = convert_website_to_pdf(url, wait_time, progress)
    return asyncio.run(conversion)

# Custom CSS passed to gr.Blocks(css=...):
#   - #status_box: dark card with a green left border and light text; the
#     `*` rule forces the same text colour on every descendant.
#   - .progress-container: vertical margin around elements with that class.
#   - footer: hidden.
custom_css = """
#status_box {
    border-left: 4px solid #10b981;
    background-color: #1f2937;
    padding: 15px;
    border-radius: 8px;
    color: #f9fafb;
}
#status_box * {
    color: #f9fafb !important;
}
.progress-container {
    margin: 20px 0;
}
footer {
    display: none !important;
}
"""

# Create Gradio interface
# Components are declared in the order they appear below; `demo` is the
# Blocks app launched from the __main__ guard.
with gr.Blocks(css=custom_css, title="Website to PDF Converter") as demo:
    # Page header
    gr.Markdown("""
    # 🌐 Website to PDF Converter
    
    Convert any website to PDF while extracting HTML, CSS, and JavaScript content.
    Perfect for archiving, analysis, or offline viewing.
    """)
    
    # Input row: URL textbox (scale 2) next to the wait-time slider (scale 1)
    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Website URL",
                placeholder="Enter URL (e.g., https://example.com or example.com)",
                lines=1
            )
        with gr.Column(scale=1):
            # 0-10 seconds in 0.5 s steps, defaulting to 2 s
            wait_time = gr.Slider(
                minimum=0,
                maximum=10,
                value=2,
                step=0.5,
                label="Wait Time (seconds)",
                info="Time to wait for dynamic content to load"
            )
    
    convert_btn = gr.Button("🚀 Convert to PDF", variant="primary", size="lg")
    
    # elem_id matches the #status_box rules in custom_css
    status_output = gr.Markdown(label="Status", elem_id="status_box")
    
    with gr.Row():
        pdf_output = gr.File(label="📥 Download PDF", file_types=[".pdf"])
    
    gr.Markdown("### 📋 Extracted Content")
    
    # One tab per extracted content type, each showing a code viewer
    with gr.Tabs():
        with gr.Tab("HTML"):
            html_output = gr.Code(
                label="HTML Content",
                language="html",
                lines=15,
                show_label=False
            )
        with gr.Tab("CSS"):
            css_output = gr.Code(
                label="CSS Styles",
                language="css",
                lines=15,
                show_label=False
            )
        with gr.Tab("JavaScript"):
            js_output = gr.Code(
                label="JavaScript Code",
                language="javascript",
                lines=15,
                show_label=False
            )
    
    # Examples
    # Each row is [url, wait_time], matching the order of `inputs`.
    gr.Markdown("### 💡 Try These Examples")
    gr.Examples(
        examples=[
            ["https://example.com", 1],
            ["https://wikipedia.org", 2],
            ["https://github.com", 3],
        ],
        inputs=[url_input, wait_time],
        label="Example Websites"
    )
    
    # Event handler
    # `outputs` is listed in the same order as the 5-tuple returned by
    # sync_convert: (pdf path, status, html, css, js).
    convert_btn.click(
        fn=sync_convert,
        inputs=[url_input, wait_time],
        outputs=[pdf_output, status_output, html_output, css_output, js_output]
    )
    
    # Usage notes shown at the bottom of the page
    gr.Markdown("""
    ---
    ### ℹ️ How It Works
    
    1. **Enter URL**: Provide the website address you want to convert
    2. **Set Wait Time**: Configure how long to wait for dynamic content (AJAX, JavaScript)
    3. **Click Convert**: The app will load the page, extract content, and generate a PDF
    4. **Download**: Get your PDF and view the extracted HTML, CSS, and JavaScript
    
    **Note:** Some websites may block automated access or have anti-bot measures.
    """)

if __name__ == "__main__":
    # Serve on all interfaces at port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)