# pdf2html / web_interface.py
# jrpark's picture
# Upload folder using huggingface_hub
# d1aa69e verified
import os
import gradio as gr
from pathlib import Path
import base64
# Import the PDF-to-HTML converter class (uses the revised version).
from convert import PDFToHTMLConverter
# File extension -> MIME type used when inlining images as data URLs.
# Unknown extensions fall back to "image/png".
_IMAGE_MIME_TYPES = {
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "gif": "image/gif",
    "svg": "image/svg+xml",
}


def _read_pdf_bytes(pdf_file):
    """Return the raw PDF bytes for a path, an upload object, or a bytes payload."""
    # A plain path must be checked first: ``Path`` objects also expose ``.name``,
    # but that attribute is only the basename, not an openable location.
    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as f:
            return f.read()
    # Upload wrappers expose the on-disk location through ``.name``.
    if hasattr(pdf_file, "name"):
        with open(pdf_file.name, "rb") as f:
            return f.read()
    if isinstance(pdf_file, (bytes, bytearray, memoryview)):
        return pdf_file
    # Fail before touching the fixed output path so a bad input cannot
    # truncate a previously stored PDF.
    raise TypeError(f"Unsupported PDF input type: {type(pdf_file).__name__}")


def _inline_images(media_html_content, img_dir_path):
    """Replace ``src="images/<name>"`` references with base64 data URLs.

    Every file matching ``*.*`` in ``img_dir_path`` is encoded; the reference is
    swapped only if it literally appears in the HTML. Failures on a single
    image are logged and skipped so one bad file does not abort the rest.
    """
    if not img_dir_path.exists():
        print(f"์ด๋ฏธ์ง€ ๋””๋ ‰ํ† ๋ฆฌ๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Œ: {img_dir_path}")
        return media_html_content

    print(f"์ด๋ฏธ์ง€ ๋””๋ ‰ํ† ๋ฆฌ ํ™•์ธ: {img_dir_path}")
    for img_file in img_dir_path.glob("*.*"):
        try:
            rel_path = f"images/{img_file.name}"
            print(f"์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์ค‘: {img_file}")
            with open(img_file, "rb") as f:
                encoded_string = base64.b64encode(f.read()).decode("utf-8")
            ext = img_file.suffix.lower()[1:]  # ".png" -> "png"
            mime_type = _IMAGE_MIME_TYPES.get(ext, "image/png")
            data_url = f"data:{mime_type};base64,{encoded_string}"
            original_pattern = f'src="{rel_path}"'
            replacement = f'src="{data_url}"'
            if original_pattern in media_html_content:
                media_html_content = media_html_content.replace(
                    original_pattern, replacement
                )
                print(f"์ด๋ฏธ์ง€ {img_file.name} Base64 ์ธ์ฝ”๋”ฉ ์™„๋ฃŒ")
            else:
                print(
                    f"๊ฒฝ๊ณ : ์ด๋ฏธ์ง€ ๊ฒฝ๋กœ '{rel_path}'๋ฅผ HTML์—์„œ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"
                )
        except Exception as e:
            # Deliberately best-effort: report and continue with the next image.
            print(f"์ด๋ฏธ์ง€ {img_file.name} ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜: {e}")
    return media_html_content


def _wrap_scrollable(content):
    """Wrap an HTML fragment in a fixed-height, vertically scrollable dark container."""
    return (
        "\n"
        '<div style="width: 100%; height: 800px; overflow-y: auto; '
        'border: 1px solid #444; background-color: #2a2a2a;">\n'
        f"{content}\n"
        "</div>\n"
    )


def convert_pdf_to_html(pdf_file):
    """Convert a PDF into two displayable HTML fragments.

    The PDF is stored at ``<cwd>/.temp/temp_input_pdf/current.pdf``, handed to
    ``PDFToHTMLConverter``, and the two HTML files it reports are read back.
    Images found under ``<cwd>/.temp/temp_output_html/images`` are inlined into
    the media HTML as base64 data URLs, and both fragments are wrapped in a
    scrollable container.

    Args:
        pdf_file: Raw PDF bytes, a filesystem path (``str`` / ``os.PathLike``),
            or an object whose ``name`` attribute is the path of the file.

    Returns:
        A ``(text_html, media_html)`` tuple of HTML strings. This function never
        raises: on any failure both elements are the same error fragment, with
        the exception message and traceback HTML-escaped.
    """
    # Local imports keep this block self-contained.
    import html
    import traceback

    try:
        temp_dir = Path.cwd() / ".temp"

        pdf_data = _read_pdf_bytes(pdf_file)

        # The PDF is always written to the same fixed location.
        pdf_input_dir = temp_dir / "temp_input_pdf"
        pdf_input_dir.mkdir(exist_ok=True, parents=True)
        pdf_path = pdf_input_dir / "current.pdf"
        with open(pdf_path, "wb") as f:
            f.write(pdf_data)
        print(f"PDF ์ €์žฅ ์™„๋ฃŒ: {pdf_path}")

        # The converter returns the paths of a text HTML and a media HTML file.
        converter = PDFToHTMLConverter(str(pdf_path))
        text_html_path, media_html_path = converter.convert()
        print(f"HTML ๋ณ€ํ™˜ ์™„๋ฃŒ: {text_html_path}, {media_html_path}")

        with open(text_html_path, "r", encoding="utf-8") as f:
            text_html_content = f.read()
        with open(media_html_path, "r", encoding="utf-8") as f:
            media_html_content = f.read()

        # Relative image paths would not resolve once the fragment is embedded
        # in the page, so the image data is placed directly in the markup.
        media_html_content = _inline_images(
            media_html_content, temp_dir / "temp_output_html" / "images"
        )

        text_html_with_style = _wrap_scrollable(text_html_content)
        media_html_with_style = _wrap_scrollable(media_html_content)
        print("HTML ๋‚ด์šฉ ์ค€๋น„ ์™„๋ฃŒ")
        return text_html_with_style, media_html_with_style
    except Exception as e:
        error_details = traceback.format_exc()
        print(f"์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}\n{error_details}")
        # Escape before embedding: exception text and tracebacks can contain
        # '<', '>' and '&' (file names, frame names) that must not become markup.
        error_html = (
            f"<h1>์˜ค๋ฅ˜ ๋ฐœ์ƒ</h1><p>{html.escape(str(e))}</p>"
            f"<pre>{html.escape(error_details)}</pre>"
        )
        return error_html, error_html
def launch_web_interface():
    """Build the Gradio Blocks UI for PDF-to-HTML conversion and launch it.

    The page has a header, an upload area (file input, convert button, status
    box), an initially hidden two-column viewer for the text HTML and the
    media HTML, and a footer. Clicking the convert button runs
    ``convert_pdf_to_html`` on the uploaded file and fills both viewers.
    """
    # CSS passed to gr.Blocks: dark backgrounds and white text for the viewers.
    css = """
/* ์ „์ฒด ๋ ˆ์ด์•„์›ƒ */
body, .gradio-container {
margin: 0 !important;
padding: 0 !important;
width: 100% !important;
max-width: none !important;
background-color: #1f1f1f;
}
/* ํ—ค๋” ์˜์—ญ */
.header-area {
background-color: #2a2a2a;
padding: 1rem;
border-bottom: 1px solid #444;
margin-bottom: 1rem;
}
/* ์—…๋กœ๋“œ ์˜์—ญ */
.upload-area {
background-color: #2a2a2a;
padding: 1rem;
border-radius: 5px;
margin-bottom: 1rem;
}
/* HTML ๋ทฐ์–ด ์ปจํ…Œ์ด๋„ˆ */
.html-columns {
display: flex;
gap: 20px;
}
.html-column {
flex: 1;
min-width: 0;
}
/* HTML ๋ทฐ์–ด */
.html-display {
min-height: 800px !important;
width: 100% !important;
background-color: #2a2a2a !important;
}
/* HTML ๋‚ด์šฉ์˜ ํ…์ŠคํŠธ ์ƒ‰์ƒ */
.html-display * {
color: #ffffff !important;
}
/* HTML ๋‚ด์˜ ํ‘œ ์Šคํƒ€์ผ */
.html-display table {
background-color: #333 !important;
border: 1px solid #555 !important;
}
.html-display td,
.html-display th {
border: 1px solid #555 !important;
color: #fff !important;
}
/* ๋ฒ„ํŠผ ์Šคํƒ€์ผ */
.convert-button {
background-color: #E67E22 !important;
border: none !important;
}
/* ํƒ€์ดํ‹€ ํ…์ŠคํŠธ */
.title-text {
color: white !important;
margin: 0 !important;
padding: 0 !important;
}
/* ์„ค๋ช… ํ…์ŠคํŠธ */
.description-text {
color: #aaa !important;
margin-top: 0.5rem !important;
}
/* ์ปฌ๋Ÿผ ์ œ๋ชฉ */
.column-title {
color: white !important;
margin-bottom: 0.5rem !important;
}
/* ํ‘ธํ„ฐ */
.footer-area {
margin-top: 2rem;
text-align: center;
color: #888;
padding: 1rem;
}
"""
    # Gradio interface definition.
    with gr.Blocks(css=css, theme=gr.themes.Default()) as demo:
        # Header section: title and one-line description.
        with gr.Column(elem_classes="header-area"):
            gr.Markdown("# PDF to HTML ๋ณ€ํ™˜๊ธฐ", elem_classes="title-text")
            gr.Markdown(
                "PDF ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜์—ฌ ํ…์ŠคํŠธ์™€ ๋ฏธ๋””์–ด๋กœ ๋ถ„๋ฆฌ๋œ HTML์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.",
                elem_classes="description-text",
            )
        # Upload section.
        with gr.Column(elem_classes="upload-area"):
            # File upload and convert button side by side.
            with gr.Row():
                # type="binary": presumably the handler receives the raw file
                # bytes rather than a path -- confirm against the Gradio version.
                pdf_input = gr.File(
                    label="PDF ํŒŒ์ผ ์—…๋กœ๋“œ", type="binary", elem_id="pdf-upload"
                )
                convert_btn = gr.Button(
                    "๋ณ€ํ™˜ํ•˜๊ธฐ", variant="primary", elem_classes="convert-button"
                )
            # Read-only status display.
            status_output = gr.Textbox(
                label="์ƒํƒœ", value="๋Œ€๊ธฐ ์ค‘...", interactive=False
            )
        # HTML viewer area (two columns); hidden until a conversion succeeds.
        with gr.Column(visible=False) as html_output_area:
            with gr.Row(elem_classes="html-columns"):
                # Left column - text HTML.
                with gr.Column(elem_classes="html-column"):
                    gr.Markdown("### ํ…์ŠคํŠธ ๋‚ด์šฉ", elem_classes="column-title")
                    text_html_viewer = gr.HTML(
                        label="ํ…์ŠคํŠธ HTML",
                        elem_id="text-html-viewer",
                        elem_classes="html-display",
                    )
                # Right column - media HTML (tables and images).
                with gr.Column(elem_classes="html-column"):
                    gr.Markdown("### ํ‘œ ๋ฐ ์ด๋ฏธ์ง€", elem_classes="column-title")
                    media_html_viewer = gr.HTML(
                        label="๋ฏธ๋””์–ด HTML",
                        elem_id="media-html-viewer",
                        elem_classes="html-display",
                    )
        # Footer.
        with gr.Column(elem_classes="footer-area"):
            gr.Markdown("ยฉ 2025 pdf2html")

        # Conversion handler wired to the convert button.
        def process_conversion(pdf_file):
            """Run the conversion and return values for the four click outputs.

            Returns a 4-tuple in the order of the ``outputs`` list below:
            visibility update for the viewer area, text HTML, media HTML and
            the status message.
            """
            # Nothing uploaded: keep the viewer hidden and ask for a file.
            if pdf_file is None:
                return (
                    gr.update(visible=False),
                    "<h1 style='color:white;'>PDF ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”</h1>",
                    "<h1 style='color:white;'>PDF ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”</h1>",
                    "PDF ํŒŒ์ผ์ด ์—…๋กœ๋“œ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.",
                )
            try:
                # Call the conversion function.
                # NOTE(review): convert_pdf_to_html catches its own exceptions and
                # returns an error fragment, so the success status below is
                # reported in that case as well.
                text_html, media_html = convert_pdf_to_html(pdf_file)
                # Debug output: size of each HTML fragment.
                print(f"ํ…์ŠคํŠธ HTML ๊ธธ์ด: {len(text_html)} ๋ฐ”์ดํŠธ")
                print(f"๋ฏธ๋””์–ด HTML ๊ธธ์ด: {len(media_html)} ๋ฐ”์ดํŠธ")
                # Show the HTML display area and update its contents.
                return gr.update(visible=True), text_html, media_html, "๋ณ€ํ™˜ ์™„๋ฃŒ!"
            except Exception as e:
                import traceback
                error_details = traceback.format_exc()
                print(f"์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜: {str(e)}\n{error_details}")
                # NOTE(review): str(e) is interpolated into HTML without escaping.
                error_html = f"<h1 style='color:red;'>์˜ค๋ฅ˜ ๋ฐœ์ƒ</h1><p style='color:red;'>{str(e)}</p>"
                return (
                    gr.update(visible=False),
                    error_html,
                    error_html,
                    f"์˜ค๋ฅ˜: {str(e)}",
                )

        # Convert button click event.
        convert_btn.click(
            fn=process_conversion,
            inputs=pdf_input,
            outputs=[
                html_output_area,
                text_html_viewer,
                media_html_viewer,
                status_output,
            ],
        )
        # JavaScript intended to fix layout issues and force viewer text colors.
        # NOTE(review): `js` is passed without `fn`, and the script is a sequence
        # of statements rather than a single function expression -- confirm the
        # installed Gradio version registers and runs it in this form.
        # NOTE(review): the script waits for window 'load' events and observes
        # attribute mutations while fixLayout itself sets inline styles; verify
        # that it actually fires and does not re-trigger itself.
        demo.load(
            js="""
function fixLayout() {
// HTML ๋ทฐ์–ด ์ปจํ…Œ์ด๋„ˆ ํ™•์ธ
const htmlColumns = document.querySelector('.html-columns');
if (htmlColumns) {
// ์ปจํ…Œ์ด๋„ˆ์˜ ๋„ˆ๋น„ ๊ท ๋“ฑํ•˜๊ฒŒ ๋งž์ถ”๊ธฐ
const columns = htmlColumns.querySelectorAll('.html-column');
columns.forEach(column => {
column.style.flex = '1';
column.style.minWidth = '0';
});
}
// ํ…์ŠคํŠธ ์ƒ‰์ƒ ๊ฐ•์ œ ์„ค์ •
const textViewer = document.getElementById('text-html-viewer');
const mediaViewer = document.getElementById('media-html-viewer');
function forceTextColor(element) {
if (!element) return;
// iframe ๋‚ด๋ถ€ ๋ฌธ์„œ์— ์ ‘๊ทผ
try {
const iframes = element.querySelectorAll('iframe');
iframes.forEach(iframe => {
if (iframe.contentDocument) {
const allElements = iframe.contentDocument.querySelectorAll('*');
allElements.forEach(el => {
if (el.tagName !== 'IMG') {
el.style.color = '#ffffff';
}
});
// ๋ฐฐ๊ฒฝ์ƒ‰ ์„ค์ •
const body = iframe.contentDocument.body;
if (body) {
body.style.backgroundColor = '#2a2a2a';
}
}
});
} catch (e) {
console.error('iframe ์ ‘๊ทผ ์ค‘ ์˜ค๋ฅ˜:', e);
}
// ์ง์ ‘ ๋ฌธ์„œ ๋‚ด ์š”์†Œ์— ์ƒ‰์ƒ ์„ค์ •
const allTextElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, span, div, a, li, td, th');
allTextElements.forEach(el => {
el.style.color = '#ffffff';
});
}
forceTextColor(textViewer);
forceTextColor(mediaViewer);
// ์ด๋ฏธ์ง€ ํ‘œ์‹œ ํ™•์ธ
if (mediaViewer) {
const images = mediaViewer.querySelectorAll('img');
console.log(`๋ฏธ๋””์–ด ๋ทฐ์–ด์—์„œ ์ด๋ฏธ์ง€ ${images.length}๊ฐœ ๋ฐœ๊ฒฌ`);
images.forEach((img, index) => {
// ์ด๋ฏธ์ง€ ๋กœ๋“œ ์ƒํƒœ ํ™•์ธ
console.log(`์ด๋ฏธ์ง€ ${index + 1} ๋กœ๋“œ ์ƒํƒœ: ${img.complete ? '์™„๋ฃŒ' : '๋กœ๋”ฉ ์ค‘'}`);
if (img.complete && img.naturalWidth === 0) {
console.log(`์ด๋ฏธ์ง€ ${index + 1} ๋กœ๋“œ ์‹คํŒจ`);
}
});
}
}
// ํŽ˜์ด์ง€ ๋กœ๋“œ ์‹œ ๋ ˆ์ด์•„์›ƒ ์กฐ์ •
window.addEventListener('load', function() {
setTimeout(fixLayout, 1000);
setTimeout(fixLayout, 3000);
setTimeout(fixLayout, 5000); // ๋” ๊ธด ์‹œ๊ฐ„ ํ›„์—๋„ ํ•œ ๋ฒˆ ๋” ์‹คํ–‰
});
// MutationObserver๋กœ DOM ๋ณ€๊ฒฝ ๊ฐ์ง€
const observer = new MutationObserver(mutations => {
setTimeout(fixLayout, 500);
});
// ํŽ˜์ด์ง€ ๋กœ๋“œ ํ›„ Observer ์‹œ์ž‘
window.addEventListener('load', () => {
observer.observe(document.body, {
childList: true,
subtree: true,
attributes: true
});
// ์Šคํƒ€์ผ ์š”์†Œ ์ง์ ‘ ์ถ”๊ฐ€
const style = document.createElement('style');
style.textContent = `
.html-display * {
color: #ffffff !important;
}
.html-display {
background-color: #2a2a2a !important;
}
`;
document.head.appendChild(style);
});
"""
        )
    # Launch the interface: no public share link, open a browser tab, hide API docs.
    demo.launch(share=False, inbrowser=True, show_api=False)
# Start the web interface only when this file is executed as a script.
if __name__ == "__main__":
    launch_web_interface()