# web2pdf / app.py
# namelessai's picture
# Fix some bad contrast
# 2f3316c verified
import gradio as gr
import asyncio
from playwright.async_api import async_playwright
import os
import tempfile
from datetime import datetime
import re
# In-page script: concatenate the text of every readable stylesheet rule.
# Stylesheets whose rules cannot be read are replaced by a placeholder comment.
_EXTRACT_CSS_SCRIPT = '''() => {
    let css = '';
    for (let sheet of document.styleSheets) {
        try {
            for (let rule of sheet.cssRules) {
                css += rule.cssText + '\\n';
            }
        } catch (e) {
            // Cross-origin stylesheets may not be accessible
            css += `/* Could not access stylesheet: ${sheet.href} */\\n`;
        }
    }
    return css;
}'''

# In-page script: collect inline script bodies; external scripts are listed
# by URL only. Entries are joined with a 50-character "=" separator line.
_EXTRACT_JS_SCRIPT = '''() => {
    let scripts = [];
    document.querySelectorAll('script').forEach(script => {
        if (script.src) {
            scripts.push(`// External script: ${script.src}`);
        } else if (script.textContent) {
            scripts.push(script.textContent);
        }
    });
    return scripts.join('\\n\\n' + '='.repeat(50) + '\\n\\n');
}'''


async def convert_website_to_pdf(url, wait_time, progress=gr.Progress()):
    """Render a web page in headless Chromium, save it as a PDF and extract its sources.

    Args:
        url: Address of the page. Surrounding whitespace is ignored and
            ``https://`` is prepended when no http(s) scheme is present.
        wait_time: Extra seconds to sleep after navigation has finished, to
            give dynamic content time to render. Values <= 0 skip the wait.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        A 5-tuple ``(pdf_path, status, html, css, js)``. On success
        ``pdf_path`` is the generated file inside the system temp directory
        and ``status`` is a Markdown summary. On any failure ``pdf_path`` is
        ``None``, ``status`` holds the error message and the three content
        strings are empty. Exceptions are reported through ``status`` rather
        than raised.
    """
    # Treat None and whitespace-only input the same as an empty box.
    url = (url or "").strip()
    if not url:
        return None, "❌ Please enter a URL", "", "", ""

    # Default to HTTPS when no scheme was typed. The comparison is
    # case-insensitive so "HTTP://host" is not given a second prefix.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url

    # NOTE(review): the URL is user-supplied and is fetched from this host.
    # Only the scheme prefix is checked here, so internal/private addresses
    # are not filtered out. Confirm whether that is acceptable for the
    # deployment.

    progress(0, desc="Initializing browser...")
    try:
        async with async_playwright() as p:
            # Launch browser
            progress(0.1, desc="Launching browser...")
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )
                page = await context.new_page()

                # Navigate to URL
                progress(0.2, desc=f"Loading {url}...")
                try:
                    await page.goto(url, wait_until='networkidle', timeout=30000)
                except Exception as e:
                    # Navigation problems get their own message; the
                    # ``finally`` below still closes the browser.
                    return None, f"❌ Failed to load URL: {str(e)}", "", "", ""

                # Wait for specified time
                if wait_time and wait_time > 0:
                    progress(0.3, desc=f"Waiting {wait_time} seconds for page to fully load...")
                    await asyncio.sleep(wait_time)

                # Extract HTML
                progress(0.5, desc="Extracting HTML content...")
                html_content = await page.content()

                # Extract CSS
                progress(0.6, desc="Extracting CSS styles...")
                css_content = await page.evaluate(_EXTRACT_CSS_SCRIPT)

                # Extract JavaScript
                progress(0.7, desc="Extracting JavaScript...")
                js_content = await page.evaluate(_EXTRACT_JS_SCRIPT)

                # Generate PDF
                progress(0.8, desc="Converting to PDF...")
                # File name: first 50 URL characters with anything that is
                # not a word character or "-" replaced by "_", plus a
                # second-resolution timestamp.
                temp_dir = tempfile.gettempdir()
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                safe_url = re.sub(r'[^\w\-_]', '_', url[:50])
                pdf_filename = f"website_{safe_url}_{timestamp}.pdf"
                pdf_path = os.path.join(temp_dir, pdf_filename)
                await page.pdf(
                    path=pdf_path,
                    format='A4',
                    print_background=True,
                    margin={'top': '20px', 'right': '20px', 'bottom': '20px', 'left': '20px'}
                )
                progress(0.9, desc="Finalizing...")
            finally:
                # Always release the browser, including when extraction or
                # PDF generation raised part-way through.
                await browser.close()

            # Prepare status message. Blank lines keep each entry on its own
            # line when rendered as Markdown.
            status = f"""✅ **Conversion Successful!**

📄 **URL:** {url}

⏱️ **Wait Time:** {wait_time} seconds

📊 **HTML Size:** {len(html_content):,} characters

🎨 **CSS Size:** {len(css_content):,} characters

⚡ **JavaScript Size:** {len(js_content):,} characters

📥 **PDF Generated:** {pdf_filename}
"""
            progress(1.0, desc="Complete!")
            return pdf_path, status, html_content, css_content, js_content
    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of raising.
        return None, f"❌ Error: {str(e)}", "", "", ""
def sync_convert(url, wait_time, progress=gr.Progress()):
    """Blocking entry point: drive the async converter to completion and return its result."""
    conversion = convert_website_to_pdf(url, wait_time, progress)
    outcome = asyncio.run(conversion)
    return outcome
# Custom CSS handed to gr.Blocks via its ``css`` argument.
# - #status_box: dark panel with light text and a green left border; matches
#   the ``elem_id`` given to the status Markdown component. The ``*`` rule
#   forces the light text colour onto every descendant element.
# - .progress-container: vertical spacing only; no element in the visible code
#   sets this class, presumably it targets framework markup (TODO confirm).
# - footer: hides any <footer> element, presumably the framework's default
#   page footer (TODO confirm).
custom_css = """
#status_box {
border-left: 4px solid #10b981;
background-color: #1f2937;
padding: 15px;
border-radius: 8px;
color: #f9fafb;
}
#status_box * {
color: #f9fafb !important;
}
.progress-container {
margin: 20px 0;
}
footer {
display: none !important;
}
"""
# Create Gradio interface: URL and wait-time inputs, a convert button, a
# status panel, the PDF download and three tabs with the extracted sources.
with gr.Blocks(css=custom_css, title="Website to PDF Converter") as demo:
    gr.Markdown("""
# 🌐 Website to PDF Converter
Convert any website to PDF while extracting HTML, CSS, and JavaScript content.
Perfect for archiving, analysis, or offline viewing.
""")

    # Inputs: the URL box is twice as wide as the wait-time slider.
    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Website URL",
                placeholder="Enter URL (e.g., https://example.com or example.com)",
                lines=1
            )
        with gr.Column(scale=1):
            wait_time = gr.Slider(
                minimum=0,
                maximum=10,
                value=2,
                step=0.5,
                label="Wait Time (seconds)",
                info="Time to wait for dynamic content to load"
            )

    convert_btn = gr.Button("🚀 Convert to PDF", variant="primary", size="lg")

    # elem_id ties this component to the #status_box rules in custom_css.
    status_output = gr.Markdown(label="Status", elem_id="status_box")

    with gr.Row():
        pdf_output = gr.File(label="📥 Download PDF", file_types=[".pdf"])

    gr.Markdown("### 📋 Extracted Content")
    with gr.Tabs():
        with gr.Tab("HTML"):
            html_output = gr.Code(
                label="HTML Content",
                language="html",
                lines=15,
                show_label=False
            )
        with gr.Tab("CSS"):
            css_output = gr.Code(
                label="CSS Styles",
                language="css",
                lines=15,
                show_label=False
            )
        with gr.Tab("JavaScript"):
            js_output = gr.Code(
                label="JavaScript Code",
                language="javascript",
                lines=15,
                show_label=False
            )

    # Examples: each row is [url, wait_time] and fills the two inputs.
    gr.Markdown("### 💡 Try These Examples")
    gr.Examples(
        examples=[
            ["https://example.com", 1],
            ["https://wikipedia.org", 2],
            ["https://github.com", 3],
        ],
        inputs=[url_input, wait_time],
        label="Example Websites"
    )

    # Event handler: the output order must match the 5-tuple returned by
    # sync_convert (pdf path, status, html, css, js).
    convert_btn.click(
        fn=sync_convert,
        inputs=[url_input, wait_time],
        outputs=[pdf_output, status_output, html_output, css_output, js_output]
    )

    gr.Markdown("""
---
### ℹ️ How It Works
1. **Enter URL**: Provide the website address you want to convert
2. **Set Wait Time**: Configure how long to wait for dynamic content (AJAX, JavaScript)
3. **Click Convert**: The app will load the page, extract content, and generate a PDF
4. **Download**: Get your PDF and view the extracted HTML, CSS, and JavaScript

**Note:** Some websites may block automated access or have anti-bot measures.
""")
if __name__ == "__main__":
    # Listen on every network interface at port 7860, with no public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)