Spaces:

PaddlePaddle
/

doc2page

Running

App Files Files Community

doc2page / app.py

jzhang533

working demo

1f4004d 2 months ago

raw

history blame

28.1 kB

	import gradio as gr
	import os
	import tempfile
	from pathlib import Path
	import requests
	import base64
	import re
	from typing import Tuple
	import markdown
	from dotenv import load_dotenv
	from openai import OpenAI

	# Load environment variables from .env file
	# (must run before the os.getenv lookups below so they can see those values)
	load_dotenv()

	# API Configuration
	# Endpoint and token used by Doc2PageConverter.extract_text_with_api.
	# Both fall back to "" when unset, which that method treats as "not configured".
	API_URL = os.getenv("API_URL", "")
	API_TOKEN = os.getenv("API_TOKEN", "")


	class Doc2PageConverter:
	    """Turn a PDF or image into Markdown and then into a standalone HTML page.

	    Text extraction is delegated to the layout-parsing API configured through
	    ``API_URL`` / ``API_TOKEN``.  HTML generation uses an ERNIE model through
	    an OpenAI-compatible client when ``QIANFAN_TOKEN`` is set, and falls back
	    to the local ``markdown`` package otherwise (or when the model call fails).
	    """

	    # Extensions accepted by process_document; ".pdf" is sent as fileType 0,
	    # every other extension as fileType 1.
	    _SUPPORTED_EXTENSIONS = ('.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff')

	    # Matches one character of the CJK Unified Ideographs range; used to decide
	    # whether a space is needed when joining text across a page break.
	    _CJK_CHAR = re.compile(r"[\u4e00-\u9fff]")

	    def __init__(self):
	        """Read the Qianfan token and create the chat client when it is present."""
	        self.qianfan_token = os.getenv('QIANFAN_TOKEN')
	        self.qianfan_model = "ernie-x1.1-preview"
	        self.client = None

	        if self.qianfan_token:
	            self.client = OpenAI(
	                base_url="https://qianfan.baidubce.com/v2",
	                api_key=self.qianfan_token
	            )

	    @staticmethod
	    def _build_payload(file_data: str, file_type: int) -> dict:
	        """Return the JSON request body for the layout-parsing API.

	        Args:
	            file_data: Base64-encoded file content (ASCII string).
	            file_type: 0 for PDF, 1 for image.
	        """
	        # Default settings are used for simplicity.
	        return {
	            "file": file_data,
	            "fileType": file_type,
	            "useFormulaRecognition": True,
	            "useChartRecognition": False,
	            "useDocOrientationClassify": False,
	            "useDocUnwarping": False,
	            "useTextlineOrientation": False,
	            "useSealRecognition": True,
	            "useRegionDetection": True,
	            "useTableRecognition": True,
	            "layoutThreshold": 0.5,
	            "layoutNms": True,
	            "layoutUnclipRatio": 1.0,
	            "textDetLimitType": "min",
	            # Was misspelled "textTetLimitSideLen", so the setting never
	            # reached the service under its real name.
	            "textDetLimitSideLen": 736,
	            "textDetThresh": 0.30,
	            "textDetBoxThresh": 0.60,
	            "textDetUnclipRatio": 1.5,
	            "textRecScoreThresh": 0.00,
	            "sealDetLimitType": "min",
	            "sealDetLimitSideLen": 736,
	            "sealDetThresh": 0.20,
	            "sealDetBoxThresh": 0.60,
	            "sealDetUnclipRatio": 0.5,
	            "sealRecScoreThresh": 0.00,
	            "useOcrResultsWithTableCells": True,
	            "useE2eWiredTableRecModel": False,
	            "useE2eWirelessTableRecModel": False,
	            "useWiredTableCellsTransToHtml": False,
	            # Was misspelled "useWirelessWableCellsTransToHtml".
	            "useWirelessTableCellsTransToHtml": False,
	            "useTableOrientationClassify": True,
	        }

	    @staticmethod
	    def _strip_code_fences(text) -> str:
	        """Remove a surrounding Markdown code fence from a model reply.

	        Whitespace is stripped first so that a reply such as
	        ``"\\n```html\\n...\\n```\\n"`` is still recognised as fenced.
	        ``None`` is treated as an empty reply.
	        """
	        cleaned = (text or "").strip()
	        if cleaned.startswith('```html'):
	            cleaned = cleaned[len('```html'):]
	        elif cleaned.startswith('```'):
	            cleaned = cleaned[3:]
	        if cleaned.endswith('```'):
	            cleaned = cleaned[:-3]
	        return cleaned.strip()

	    def extract_text_with_api(self, file_path: str) -> str:
	        """Extract text and structure using PP-StructureV3 API.

	        Args:
	            file_path: Path of the PDF or image to send.

	        Returns:
	            The Markdown of all pages joined into one string, or ``""`` when
	            the response could not be processed (the error is printed).

	        Raises:
	            ValueError: ``API_URL`` or ``API_TOKEN`` is empty.
	            RuntimeError: The HTTP request failed.
	        """
	        # Checked outside the try block: inside it, the generic handler below
	        # would swallow this error and the caller would only see "no text".
	        if not API_URL or not API_TOKEN:
	            raise ValueError(
	                "API_URL and API_TOKEN must be configured in .env file")

	        try:
	            # Determine file type: 0 = PDF, 1 = image
	            file_type = 0 if Path(file_path).suffix.lower() == ".pdf" else 1

	            # Read the file and encode it to base64
	            with open(file_path, "rb") as f:
	                file_data = base64.b64encode(f.read()).decode("ascii")

	            headers = {
	                "Authorization": f"token {API_TOKEN}",
	                "Content-Type": "application/json",
	            }

	            response = requests.post(
	                API_URL,
	                json=self._build_payload(file_data, file_type),
	                headers=headers,
	                timeout=300,  # 5 minutes timeout
	            )
	            response.raise_for_status()
	            result = response.json()

	            layout_results = result.get("result", {}).get(
	                "layoutParsingResults", [])

	            markdown_list = []
	            for res in layout_results:
	                markdown_data = res["markdown"]
	                # Copy so the parsed response itself is left untouched.
	                page = markdown_data.copy()
	                page["text"] = self.embed_images_into_markdown_text(
	                    markdown_data["text"], markdown_data["images"]
	                )
	                markdown_list.append(page)

	            # Concatenate all pages
	            return self.concatenate_markdown_pages(markdown_list)

	        except requests.exceptions.RequestException as e:
	            raise RuntimeError(f"API request failed: {str(e)}") from e
	        except Exception as e:
	            # Best effort: an unexpected response shape yields no text.
	            print(f"Error in API extraction: {e}")
	            return ""

	    def embed_images_into_markdown_text(self, markdown_text, markdown_images):
	        """Replace each image path in ``<img src="...">`` tags with its URL.

	        Args:
	            markdown_text: Markdown that may contain ``<img src="path"`` tags.
	            markdown_images: Mapping of image path -> replacement URL.

	        Returns:
	            The text with every matching ``src`` rewritten.
	        """
	        for img_path, img_url in markdown_images.items():
	            markdown_text = markdown_text.replace(
	                f'<img src="{img_path}"', f'<img src="{img_url}"'
	            )
	        return markdown_text

	    def concatenate_markdown_pages(self, markdown_list):
	        """Concatenate markdown pages into single document.

	        Each item must provide ``"text"``, ``"isStart"`` and ``"isEnd"``.
	        When a page does not start a paragraph and the previous page did not
	        end one, the texts are joined directly (if either adjoining character
	        is a CJK ideograph) or with a single space.  In every other case the
	        page is appended after a blank line, including the very first page.
	        """
	        markdown_texts = ""
	        previous_page_ended_paragraph = True

	        for res in markdown_list:
	            page_starts_paragraph = res["isStart"]
	            page_ends_paragraph = res["isEnd"]
	            page_text = res["text"]

	            if not page_starts_paragraph and not previous_page_ended_paragraph:
	                # Slices give "" rather than raising on empty text.
	                last_char = markdown_texts[-1:]
	                first_char = page_text[:1]
	                touches_cjk = (
	                    self._CJK_CHAR.match(last_char)
	                    or self._CJK_CHAR.match(first_char)
	                )
	                markdown_texts += page_text if touches_cjk else " " + page_text
	            else:
	                markdown_texts += "\n\n" + page_text

	            previous_page_ended_paragraph = page_ends_paragraph

	        return markdown_texts

	    def markdown_to_html_with_ernie(self, markdown_text: str) -> str:
	        """Convert markdown to HTML using ERNIE API.

	        Falls back to ``basic_markdown_to_html`` when no client is configured,
	        when the call raises, or when the model returns an empty reply.
	        """
	        if not self.client:
	            # Fallback to basic markdown conversion if no API client
	            return self.basic_markdown_to_html(markdown_text)

	        try:
	            prompt = f"""Please convert the following markdown text into a modern, clean HTML page. Use contemporary typography with the Inter font family and clean design principles. Make it visually appealing with proper CSS styling, responsive design, and excellent readability.

	Design requirements:
	- Use Inter font from Google Fonts
	- Clean, modern spacing and typography
	- Subtle shadows and rounded corners
	- Good color contrast and hierarchy
	- Responsive design that works on all devices
	- Include proper HTML structure with head, body, and semantic elements

	Important: Add a footer at the bottom with "Powered by PaddleOCR and ERNIE" where PaddleOCR links to https://github.com/PaddlePaddle/PaddleOCR and ERNIE links to https://huggingface.co/BAIDU. Style it with modern, subtle styling.

	Markdown content:
	{markdown_text}

	IMPORTANT: Return ONLY the raw HTML code starting with <!DOCTYPE html> and ending with </html>. Do NOT wrap it in markdown code blocks or add any explanations. I need the pure HTML content that can be directly saved as an .html file."""

	            messages = [{"role": "user", "content": prompt}]

	            response = self.client.chat.completions.create(
	                model=self.qianfan_model,
	                messages=messages,
	                max_tokens=64000,
	            )

	            # The model sometimes wraps its answer in a code fence despite
	            # the instruction above.
	            html_content = self._strip_code_fences(
	                response.choices[0].message.content)

	            # An empty page would look like a failed conversion to the
	            # caller; use the local renderer instead.
	            if not html_content:
	                raise ValueError("ERNIE returned an empty response")

	            return html_content

	        except Exception as e:
	            print(f"Error calling ERNIE API: {e}")
	            return self.basic_markdown_to_html(markdown_text)

	    def basic_markdown_to_html(self, markdown_text: str) -> str:
	        """Fallback markdown to HTML conversion.

	        Renders the text with the ``markdown`` package and wraps it in a
	        complete, styled HTML document that ends with a "Powered by" footer.
	        """
	        html = markdown.markdown(markdown_text)

	        # Wrap in a complete HTML document with styling.  Braces belonging to
	        # CSS are doubled because this is an f-string.
	        complete_html = f"""
	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>Converted Document</title>
	<style>
	/* Modern, clean typography */
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

	* {{
	margin: 0;
	padding: 0;
	box-sizing: border-box;
	}}

	body {{
	font-family: 'Inter', system-ui, -apple-system, sans-serif;
	font-weight: 400;
	line-height: 1.7;
	color: #1a1a1a;
	max-width: 850px;
	margin: 0 auto;
	padding: 32px 24px;
	background: #fafafa;
	font-size: 16px;
	}}

	.container {{
	background: #ffffff;
	padding: 48px;
	border-radius: 12px;
	box-shadow: 0 1px 3px rgba(0,0,0,0.08), 0 4px 24px rgba(0,0,0,0.04);
	border: 1px solid rgba(0,0,0,0.06);
	}}

	/* Typography hierarchy */
	h1, h2, h3, h4, h5, h6 {{
	font-weight: 600;
	color: #0f0f0f;
	margin: 32px 0 16px 0;
	letter-spacing: -0.02em;
	}}

	h1 {{
	font-size: 2.25rem;
	font-weight: 700;
	margin-top: 0;
	margin-bottom: 24px;
	border-bottom: 2px solid #e5e7eb;
	padding-bottom: 16px;
	}}

	h2 {{
	font-size: 1.75rem;
	margin-top: 48px;
	}}

	h3 {{
	font-size: 1.375rem;
	margin-top: 40px;
	}}

	h4 {{
	font-size: 1.125rem;
	}}

	p {{
	margin-bottom: 20px;
	color: #374151;
	line-height: 1.75;
	}}

	/* Code styling */
	code {{
	font-family: 'SF Mono', Consolas, 'Liberation Mono', monospace;
	background-color: #f3f4f6;
	color: #1f2937;
	padding: 3px 6px;
	border-radius: 4px;
	font-size: 0.875rem;
	font-weight: 500;
	}}

	pre {{
	background-color: #f8fafc;
	border: 1px solid #e5e7eb;
	padding: 20px;
	border-radius: 8px;
	overflow-x: auto;
	margin: 24px 0;
	font-size: 0.875rem;
	line-height: 1.6;
	}}

	pre code {{
	background: none;
	padding: 0;
	border-radius: 0;
	}}

	/* Blockquotes */
	blockquote {{
	border-left: 4px solid #6366f1;
	padding-left: 20px;
	margin: 24px 0;
	font-style: normal;
	color: #4b5563;
	background-color: #f8fafc;
	padding: 16px 20px;
	border-radius: 0 8px 8px 0;
	}}

	/* Images */
	img {{
	max-width: 100%;
	height: auto;
	border-radius: 8px;
	margin: 20px 0;
	box-shadow: 0 4px 12px rgba(0,0,0,0.1);
	}}

	/* Tables */
	table {{
	border-collapse: collapse;
	width: 100%;
	margin: 24px 0;
	background: #ffffff;
	border-radius: 8px;
	overflow: hidden;
	box-shadow: 0 1px 3px rgba(0,0,0,0.1);
	}}

	th, td {{
	padding: 16px;
	text-align: left;
	border-bottom: 1px solid #e5e7eb;
	}}

	th {{
	background-color: #f9fafb;
	font-weight: 600;
	color: #374151;
	font-size: 0.875rem;
	text-transform: uppercase;
	letter-spacing: 0.05em;
	}}

	tr:last-child td {{
	border-bottom: none;
	}}

	/* Lists */
	ul, ol {{
	margin: 16px 0 20px 24px;
	color: #374151;
	}}

	li {{
	margin-bottom: 8px;
	line-height: 1.6;
	}}

	/* Links */
	a {{
	color: #6366f1;
	text-decoration: none;
	font-weight: 500;
	}}

	a:hover {{
	color: #4f46e5;
	text-decoration: underline;
	}}
	/* Footer */
	.footer {{
	margin-top: 64px;
	padding-top: 24px;
	border-top: 1px solid #e5e7eb;
	text-align: center;
	font-size: 14px;
	color: #6b7280;
	font-weight: 400;
	}}

	.footer a {{
	color: #6366f1;
	font-weight: 500;
	text-decoration: none;
	}}

	.footer a:hover {{
	color: #4f46e5;
	text-decoration: underline;
	}}
	</style>
	</head>
	<body>
	<div class="container">
	{html}
	<div class="footer">
	Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR</a> and
	<a href="https://huggingface.co/BAIDU" target="_blank">ERNIE</a>
	</div>
	</div>
	</body>
	</html>
	"""
	        return complete_html

	    def process_document(self, file_path: str) -> Tuple[str, str]:
	        """Process uploaded document and convert to HTML.

	        Args:
	            file_path: Path of the uploaded PDF or image.

	        Returns:
	            ``(markdown, html)`` on success.  On any failure the first item
	            is a human-readable message and the second is ``""``.
	        """
	        try:
	            file_extension = Path(file_path).suffix.lower()

	            # Check supported formats
	            if file_extension not in self._SUPPORTED_EXTENSIONS:
	                return ("Error: Unsupported file format. "
	                        "Please upload PDF or image files."), ""

	            # Process with PP-StructureV3 API
	            markdown_content = self.extract_text_with_api(file_path)

	            if not markdown_content.strip():
	                return ("Warning: No text content extracted "
	                        "from the document."), ""

	            # Convert markdown to HTML using ERNIE or fallback
	            html_content = self.markdown_to_html_with_ernie(markdown_content)

	            return markdown_content, html_content

	        except Exception as e:
	            return f"Error processing document: {str(e)}", ""

	# Initialize converter
	# Single module-level instance; process_upload below calls it for every request.
	converter = Doc2PageConverter()

	def process_upload(file):
	    """Run the converter on an uploaded file.

	    Returns a ``(status, markdown, html)`` triple.  When nothing was
	    uploaded, or when conversion yields no HTML, the last two items are empty
	    strings and ``status`` carries the explanation.
	    """
	    if file is None:
	        return "Please upload a file.", "", ""

	    try:
	        md_text, page_html = converter.process_document(file.name)

	        # No HTML means the first item holds an error/warning message.
	        if not page_html:
	            return md_text, "", ""

	        return "Document processed successfully!", md_text, page_html
	    except Exception as exc:
	        return f"Error: {str(exc)}", "", ""

	def save_html_file(html_content, filename="converted_page"):
	    """Save HTML content to a temporary file for download.

	    Args:
	        html_content: The HTML text to write.
	        filename: Prefix for the temporary file name (an underscore and a
	            random part are appended, followed by ``.html``).

	    Returns:
	        The path of the written file, or ``None`` when ``html_content`` is
	        empty.  The file is not deleted automatically.
	    """
	    if not html_content:
	        return None

	    # Write explicitly as UTF-8: the generated pages declare a UTF-8 charset
	    # and may contain CJK text or emoji, which the platform default encoding
	    # cannot always represent.  The with-block closes the file even if the
	    # write fails.
	    with tempfile.NamedTemporaryFile(
	        mode='w', suffix='.html', delete=False,
	        prefix=f"{filename}_", encoding="utf-8",
	    ) as temp_file:
	        temp_file.write(html_content)

	    return temp_file.name

	# Create custom theme for a clean, modern look
	# Starts from Gradio's Default theme, then overrides individual colours via .set().
	# The primary button colours (#6366f1 / #4f46e5) are the same values used for
	# links in the generated HTML pages.
	custom_theme = gr.themes.Default(
	primary_hue="blue",
	secondary_hue="gray",
	neutral_hue="gray",
	font=("Inter", "system-ui", "sans-serif"),
	font_mono=("SF Mono", "Consolas", "monospace")
	).set(
	body_background_fill="#fafafa",
	background_fill_primary="#ffffff",
	background_fill_secondary="#f8f9fa",
	border_color_primary="#e5e7eb",
	button_primary_background_fill="#6366f1",
	button_primary_background_fill_hover="#4f46e5",
	button_primary_text_color="#ffffff",
	)

	# Create Gradio interface
	# The Blocks context is bound to `app`, which is launched at the bottom of the file.
	# The `css` string below is passed to Gradio as custom styling for the page.
	with gr.Blocks(
	title="Doc2Page - Simple Document Converter",
	theme=custom_theme,
	css="""
	.gradio-container {
	max-width: 1200px !important;
	margin: auto;
	padding: 32px 16px;
	}

	/* Enhanced button styling */
	.gr-button {
	font-weight: 500;
	border-radius: 10px;
	font-size: 14px;
	transition: all 0.2s ease;
	box-shadow: 0 2px 4px rgba(99, 102, 241, 0.1);
	}

	.gr-button:hover {
	transform: translateY(-1px);
	box-shadow: 0 4px 8px rgba(99, 102, 241, 0.2);
	}

	/* Input styling */
	.gr-textbox, .gr-file {
	border-radius: 10px;
	font-family: 'Inter', system-ui, sans-serif;
	border: 1px solid #e5e7eb;
	transition: border-color 0.2s ease;
	}

	.gr-textbox:focus, .gr-file:focus {
	border-color: #6366f1;
	box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1);
	}

	/* Typography */
	h1 {
	font-weight: 700;
	color: #1a1a1a;
	margin-bottom: 8px;
	font-size: 2.5rem;
	}

	.app-description {
	color: #6b7280;
	font-size: 18px;
	margin-bottom: 40px;
	font-weight: 400;
	}

	/* Tab styling */
	.gr-tab {
	border-radius: 8px 8px 0 0;
	font-weight: 500;
	}

	/* Card-like sections */
	.gr-column {
	background: rgba(255, 255, 255, 0.5);
	border-radius: 12px;
	padding: 16px;
	margin: 8px;
	}

	/* Status styling */
	.gr-textbox[data-testid*="status"] {
	background-color: #f8fafc;
	border: 1px solid #e2e8f0;
	}

	/* Download section styling */
	.download-section {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	border-radius: 12px;
	padding: 20px;
	color: white;
	margin-top: 20px;
	}
	"""
	) as app:

	# Header
	gr.Markdown(
	"# Doc2Page",
	elem_classes="main-title"
	)
	gr.Markdown(
	"🥃 Transform your documents into beautiful webpages!",
	elem_classes="app-description"
	)

	# Main interface
	# Upload controls and status on one side, tabbed results on the other.
	with gr.Row():
	with gr.Column(scale=1, min_width=350):
	with gr.Group():
	gr.Markdown("### 📄 Upload Document")
	# The accepted extensions mirror the ones Doc2PageConverter.process_document checks.
	file_input = gr.File(
	label="Choose your file",
	file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"],
	file_count="single",
	height=140
	)

	process_btn = gr.Button(
	"✨ Convert to Webpage",
	variant="primary",
	size="lg",
	scale=1
	)

	# Read-only; receives the status string returned by process_and_update.
	status_output = gr.Textbox(
	label="Status",
	placeholder="Ready to convert your document...",
	interactive=False,
	lines=3,
	max_lines=3
	)

	with gr.Column(scale=2):
	gr.Markdown("### 📋 Results")
	with gr.Tabs():
	with gr.TabItem("❤️ Preview", id="preview"):
	html_preview = gr.HTML(
	label="",
	value="<div style='padding: 40px; text-align: center; color: #6b7280;'>Your converted webpage will appear here</div>",
	)

	with gr.TabItem("📝 Markdown Source", id="markdown"):
	markdown_output = gr.Textbox(
	label="",
	placeholder="Extracted markdown content will appear here...",
	lines=22,
	interactive=False,
	show_copy_button=True
	)

	with gr.TabItem("🌐 HTML Source", id="html"):
	html_output = gr.Code(
	label="",
	language="html",
	lines=22,
	interactive=False
	)

	# Success & Download section
	# Created hidden; process_and_update makes it visible once HTML was generated.
	with gr.Row(visible=False) as download_section:
	with gr.Column():
	gr.Markdown("""
	<div style="background: linear-gradient(135deg, #10b981, #059669); border-radius: 12px; padding: 20px; color: white; text-align: center; margin: 20px 0;">
	<h3 style="margin: 0 0 8px 0; color: white;">✅ Conversion Successful!</h3>
	<p style="margin: 0; opacity: 0.9;">Your document has been converted to a beautiful webpage</p>
	</div>
	""")

	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("### 📥 Download Your Webpage")
	# Despite its name this is a gr.File output; it receives the path
	# returned by save_html_file.
	download_btn = gr.File(
	label="HTML File",
	visible=True
	)

	with gr.Column(scale=1):
	gr.Markdown("### 🚀 Quick Deploy Guide")
	gr.Markdown("""
	1. GitHub Pages: Upload as `index.html` to your repo
	2. Netlify: Drag & drop the file to netlify.app
	3. Vercel: Use their simple file deployment
	4. Local: Double-click to open in browser
	""", elem_classes="deploy-guide")

	# Event handlers
	# process_and_update converts the uploaded file and returns one value per
	# component listed in `outputs` of process_btn.click below, in the same order.
	def process_and_update(file):
	status, markdown_content, html_content = process_upload(file)

	# Create download file if HTML was generated
	download_file = None
	show_download = False

	if html_content:
	# Name the download after the uploaded file (without its extension).
	filename = Path(file.name).stem if file else "converted_page"
	download_file = save_html_file(html_content, filename)
	show_download = True

	# Preview content with better styling when no content
	# NOTE(review): html_content is a complete HTML document with its own <style>
	# rules; confirm those rules do not affect the surrounding page when shown in gr.HTML.
	preview_content = html_content if html_content else """
	<div style='padding: 60px 20px; text-align: center; color: #6b7280;
	background: #f9fafb; border-radius: 8px; border: 2px dashed #d1d5db;'>
	<h3 style='color: #9ca3af; margin: 0;'>No preview available</h3>
	<p style='margin: 8px 0 0 0;'>Convert a document to see the preview</p>
	</div>
	"""

	return (
	status, # status_output
	markdown_content, # markdown_output
	html_content, # html_output
	preview_content, # html_preview
	download_file, # download_btn
	gr.update(visible=show_download) # download_section
	)

	# Wire the button: the order of `outputs` must match the tuple returned above.
	process_btn.click(
	fn=process_and_update,
	inputs=[file_input],
	outputs=[
	status_output,
	markdown_output,
	html_output,
	html_preview,
	download_btn,
	download_section
	]
	)

	# Footer
	gr.Markdown(
	"""
	<div style="text-align: center; padding: 20px 0; margin-top: 40px; border-top: 1px solid #e5e7eb; color: #6b7280; font-size: 14px;">
	Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank" style="color: #6366f1; text-decoration: none;">PaddleOCR</a>
	for text extraction and <a href="https://huggingface.co/BAIDU" target="_blank" style="color: #6366f1; text-decoration: none;">ERNIE</a>
	for HTML generation
	</div>
	""",
	elem_id="footer"
	)

	# Tips section
	# Collapsed by default (open=False).
	with gr.Accordion("💡 Tips for Best Results", open=False):
	gr.Markdown("""
	File Types: PDF, PNG, JPG, JPEG, BMP, TIFF

	For Best OCR Results:
	- Use high-resolution, clear images
	- Ensure good contrast between text and background
	- Avoid skewed or rotated documents
	- PDFs generally produce the best results

	🚀 Deploy to GitHub Pages:
	1. Create a new GitHub repository or use an existing one
	2. Download the generated HTML file from above
	3. Upload it to your repository as `index.html`
	4. Go to repository Settings → Pages
	5. Select "Deploy from a branch" → Choose "main" branch
	6. Your page will be live at `https://yourusername.github.io/yourrepository`

	💡 Pro Tips:
	- Enable custom domains in GitHub Pages settings
	- Use GitHub Actions for automated deployments
	- Consider using Jekyll themes for enhanced styling
	""")


	# Start the Gradio app only when this file is executed directly, not on import.
	if __name__ == "__main__":
	app.launch()