Spaces:

namelessai
/

web2pdf

Sleeping

App Files Files Community

web2pdf / app.py

namelessai

Fix some bad contrast

2f3316c verified about 2 months ago

raw

history blame contribute delete

7.92 kB

	import gradio as gr
	import asyncio
	from playwright.async_api import async_playwright
	import os
	import tempfile
	from datetime import datetime
	import re

	# In-page script: concatenates the text of every CSS rule the page can read;
	# stylesheets whose rules throw on access are noted with a placeholder comment.
	_CSS_EXTRACTION_SCRIPT = '''() => {
	let css = '';
	for (let sheet of document.styleSheets) {
	try {
	for (let rule of sheet.cssRules) {
	css += rule.cssText + '\\n';
	}
	} catch (e) {
	// Cross-origin stylesheets may not be accessible
	css += `/* Could not access stylesheet: ${sheet.href} */\\n`;
	}
	}
	return css;
	}'''

	# In-page script: collects inline <script> bodies and lists external scripts
	# by their src, joined with a 50-character "=" separator.
	_JS_EXTRACTION_SCRIPT = '''() => {
	let scripts = [];
	document.querySelectorAll('script').forEach(script => {
	if (script.src) {
	scripts.push(`// External script: ${script.src}`);
	} else if (script.textContent) {
	scripts.push(script.textContent);
	}
	});
	return scripts.join('\\n\\n' + '='.repeat(50) + '\\n\\n');
	}'''


	async def convert_website_to_pdf(url, wait_time, progress=gr.Progress()):
	    """Convert a website to PDF and extract its HTML, CSS and JavaScript.

	    Args:
	        url: Address of the page to load. Surrounding whitespace is removed
	            and ``https://`` is prepended when the value does not already
	            start with ``http://`` or ``https://`` (case-insensitive).
	        wait_time: Extra seconds to sleep after navigation completes; the
	            sleep is skipped when the value is not greater than zero.
	        progress: Progress reporter, called as ``progress(fraction, desc=...)``.

	    Returns:
	        A 5-tuple ``(pdf_path, status, html_content, css_content, js_content)``.
	        On failure ``pdf_path`` is ``None``, ``status`` is an error message
	        beginning with "❌" and the three content strings are empty. Errors
	        raised during conversion are reported this way instead of propagating.
	    """
	    # Pasted URLs often carry stray whitespace, which would otherwise end up
	    # inside the address (or let a blank value slip past the empty check).
	    url = (url or "").strip()
	    if not url:
	        return None, "❌ Please enter a URL", "", "", ""

	    # Default to HTTPS when no scheme is given. The comparison is
	    # case-insensitive so "HTTPS://example.com" is not prefixed a second time.
	    if not url.lower().startswith(('http://', 'https://')):
	        url = 'https://' + url

	    progress(0, desc="Initializing browser...")

	    try:
	        async with async_playwright() as p:
	            # Launch browser
	            progress(0.1, desc="Launching browser...")
	            browser = await p.chromium.launch(headless=True)
	            try:
	                context = await browser.new_context(
	                    viewport={'width': 1920, 'height': 1080},
	                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
	                )
	                page = await context.new_page()

	                # Navigate to URL
	                progress(0.2, desc=f"Loading {url}...")
	                try:
	                    await page.goto(url, wait_until='networkidle', timeout=30000)
	                except Exception as e:
	                    # The enclosing ``finally`` closes the browser.
	                    return None, f"❌ Failed to load URL: {str(e)}", "", "", ""

	                # Wait for specified time
	                if wait_time > 0:
	                    progress(0.3, desc=f"Waiting {wait_time} seconds for page to fully load...")
	                    await asyncio.sleep(wait_time)

	                # Extract HTML
	                progress(0.5, desc="Extracting HTML content...")
	                html_content = await page.content()

	                # Extract CSS
	                progress(0.6, desc="Extracting CSS styles...")
	                css_content = await page.evaluate(_CSS_EXTRACTION_SCRIPT)

	                # Extract JavaScript
	                progress(0.7, desc="Extracting JavaScript...")
	                js_content = await page.evaluate(_JS_EXTRACTION_SCRIPT)

	                # Generate PDF
	                progress(0.8, desc="Converting to PDF...")

	                # Build a file name in the system temp directory from the
	                # first 50 characters of the URL plus a timestamp.
	                temp_dir = tempfile.gettempdir()
	                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	                safe_url = re.sub(r'[^\w\-_]', '_', url[:50])
	                pdf_filename = f"website_{safe_url}_{timestamp}.pdf"
	                pdf_path = os.path.join(temp_dir, pdf_filename)

	                await page.pdf(
	                    path=pdf_path,
	                    format='A4',
	                    print_background=True,
	                    margin={'top': '20px', 'right': '20px', 'bottom': '20px', 'left': '20px'},
	                )

	                progress(0.9, desc="Finalizing...")
	            finally:
	                # Close the browser on every exit path, including errors
	                # raised while extracting content or rendering the PDF.
	                await browser.close()

	        # Prepare status message
	        status = f"""✅ Conversion Successful!

	📄 URL: {url}
	⏱️ Wait Time: {wait_time} seconds
	📊 HTML Size: {len(html_content):,} characters
	🎨 CSS Size: {len(css_content):,} characters
	⚡ JavaScript Size: {len(js_content):,} characters
	📥 PDF Generated: {pdf_filename}
	"""

	        progress(1.0, desc="Complete!")

	        return pdf_path, status, html_content, css_content, js_content

	    except Exception as e:
	        # UI boundary: report the failure in the status field rather than raising.
	        return None, f"❌ Error: {str(e)}", "", "", ""

	def sync_convert(url, wait_time, progress=gr.Progress()):
	    """Blocking entry point that drives the async converter to completion.

	    Creates the ``convert_website_to_pdf`` coroutine with the given arguments,
	    runs it with ``asyncio.run`` and returns whatever the coroutine returns.
	    """
	    conversion = convert_website_to_pdf(url, wait_time, progress)
	    return asyncio.run(conversion)

	# Custom CSS handed to gr.Blocks(css=custom_css) when the interface is built.
	# - #status_box: dark panel (#1f2937) with a green left border, padding and
	#   rounded corners; this id matches the elem_id given to the status Markdown
	#   component. The "#status_box *" rule forces the light text color onto every
	#   descendant with !important.
	# - .progress-container: adds 20px of vertical margin.
	# - footer: hidden with !important.
	custom_css = """
	#status_box {
	border-left: 4px solid #10b981;
	background-color: #1f2937;
	padding: 15px;
	border-radius: 8px;
	color: #f9fafb;
	}
	#status_box * {
	color: #f9fafb !important;
	}
	.progress-container {
	margin: 20px 0;
	}
	footer {
	display: none !important;
	}
	"""

	# Markdown shown at the top of the page.
	intro_markdown = """
	# 🌐 Website to PDF Converter

	Convert any website to PDF while extracting HTML, CSS, and JavaScript content.
	Perfect for archiving, analysis, or offline viewing.
	"""

	# Markdown shown at the bottom of the page.
	how_it_works_markdown = """
	---
	### ℹ️ How It Works

	1. Enter URL: Provide the website address you want to convert
	2. Set Wait Time: Configure how long to wait for dynamic content (AJAX, JavaScript)
	3. Click Convert: The app will load the page, extract content, and generate a PDF
	4. Download: Get your PDF and view the extracted HTML, CSS, and JavaScript

	Note: Some websites may block automated access or have anti-bot measures.
	"""

	# (URL, wait time) rows offered as clickable examples.
	example_rows = [
	    ["https://example.com", 1],
	    ["https://wikipedia.org", 2],
	    ["https://github.com", 3],
	]

	# Options shared by the three code viewers.
	code_view_options = {"lines": 15, "show_label": False}

	# Build the page: inputs, convert button, status, download slot and code tabs.
	with gr.Blocks(css=custom_css, title="Website to PDF Converter") as demo:
	    gr.Markdown(intro_markdown)

	    # URL field (scale 2) next to the wait-time slider (scale 1).
	    with gr.Row():
	        with gr.Column(scale=2):
	            url_input = gr.Textbox(
	                label="Website URL",
	                placeholder="Enter URL (e.g., https://example.com or example.com)",
	                lines=1,
	            )
	        with gr.Column(scale=1):
	            wait_time = gr.Slider(
	                minimum=0,
	                maximum=10,
	                value=2,
	                step=0.5,
	                label="Wait Time (seconds)",
	                info="Time to wait for dynamic content to load",
	            )

	    convert_btn = gr.Button("🚀 Convert to PDF", variant="primary", size="lg")

	    # elem_id ties this component to the #status_box rules in custom_css.
	    status_output = gr.Markdown(label="Status", elem_id="status_box")

	    with gr.Row():
	        pdf_output = gr.File(label="📥 Download PDF", file_types=[".pdf"])

	    gr.Markdown("### 📋 Extracted Content")

	    # One tab per extracted asset type.
	    with gr.Tabs():
	        with gr.Tab("HTML"):
	            html_output = gr.Code(label="HTML Content", language="html", **code_view_options)
	        with gr.Tab("CSS"):
	            css_output = gr.Code(label="CSS Styles", language="css", **code_view_options)
	        with gr.Tab("JavaScript"):
	            js_output = gr.Code(label="JavaScript Code", language="javascript", **code_view_options)

	    # Examples
	    gr.Markdown("### 💡 Try These Examples")
	    gr.Examples(
	        examples=example_rows,
	        inputs=[url_input, wait_time],
	        label="Example Websites",
	    )

	    # Event handler: outputs follow the order of the tuple returned by sync_convert.
	    result_components = [pdf_output, status_output, html_output, css_output, js_output]
	    convert_btn.click(
	        fn=sync_convert,
	        inputs=[url_input, wait_time],
	        outputs=result_components,
	    )

	    gr.Markdown(how_it_works_markdown)

	if __name__ == "__main__":
	    # Serve on 0.0.0.0:7860 with share disabled when run as a script.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	    }
	    demo.launch(**launch_options)