|
|
import os |
|
|
import gradio as gr |
|
|
from pathlib import Path |
|
|
import base64 |
|
|
|
|
|
|
|
|
from convert import PDFToHTMLConverter |
|
|
|
|
|
|
|
|
def convert_pdf_to_html(pdf_file): |
|
|
"""PDF ํ์ผ์ HTML๋ก ๋ณํํ๊ณ ๊ฒฐ๊ณผ ๋ฐํ""" |
|
|
try: |
|
|
|
|
|
current_dir = Path.cwd() |
|
|
temp_dir = current_dir / ".temp" |
|
|
|
|
|
|
|
|
if hasattr(pdf_file, "name"): |
|
|
with open(pdf_file.name, "rb") as f: |
|
|
pdf_data = f.read() |
|
|
else: |
|
|
pdf_data = pdf_file |
|
|
|
|
|
|
|
|
pdf_input_dir = temp_dir / "temp_input_pdf" |
|
|
pdf_input_dir.mkdir(exist_ok=True, parents=True) |
|
|
pdf_path = pdf_input_dir / "current.pdf" |
|
|
|
|
|
|
|
|
with open(pdf_path, "wb") as f: |
|
|
f.write(pdf_data) |
|
|
|
|
|
print(f"PDF ์ ์ฅ ์๋ฃ: {pdf_path}") |
|
|
|
|
|
|
|
|
converter = PDFToHTMLConverter(str(pdf_path)) |
|
|
text_html_path, media_html_path = converter.convert() |
|
|
|
|
|
print(f"HTML ๋ณํ ์๋ฃ: {text_html_path}, {media_html_path}") |
|
|
|
|
|
|
|
|
with open(text_html_path, "r", encoding="utf-8") as f: |
|
|
text_html_content = f.read() |
|
|
|
|
|
with open(media_html_path, "r", encoding="utf-8") as f: |
|
|
media_html_content = f.read() |
|
|
|
|
|
|
|
|
img_dir_path = temp_dir / "temp_output_html" / "images" |
|
|
if img_dir_path.exists(): |
|
|
print(f"์ด๋ฏธ์ง ๋๋ ํ ๋ฆฌ ํ์ธ: {img_dir_path}") |
|
|
for img_file in img_dir_path.glob("*.*"): |
|
|
try: |
|
|
rel_path = f"images/{img_file.name}" |
|
|
print(f"์ด๋ฏธ์ง ์ฒ๋ฆฌ ์ค: {img_file}") |
|
|
|
|
|
|
|
|
with open(img_file, "rb") as f: |
|
|
encoded_string = base64.b64encode(f.read()).decode("utf-8") |
|
|
|
|
|
|
|
|
ext = img_file.suffix.lower()[1:] |
|
|
mime_type = { |
|
|
"png": "image/png", |
|
|
"jpg": "image/jpeg", |
|
|
"jpeg": "image/jpeg", |
|
|
"gif": "image/gif", |
|
|
"svg": "image/svg+xml", |
|
|
}.get(ext, "image/png") |
|
|
|
|
|
|
|
|
data_url = f"data:{mime_type};base64,{encoded_string}" |
|
|
|
|
|
|
|
|
original_pattern = f'src="{rel_path}"' |
|
|
replacement = f'src="{data_url}"' |
|
|
|
|
|
if original_pattern in media_html_content: |
|
|
media_html_content = media_html_content.replace( |
|
|
original_pattern, replacement |
|
|
) |
|
|
print(f"์ด๋ฏธ์ง {img_file.name} Base64 ์ธ์ฝ๋ฉ ์๋ฃ") |
|
|
else: |
|
|
print( |
|
|
f"๊ฒฝ๊ณ : ์ด๋ฏธ์ง ๊ฒฝ๋ก '{rel_path}'๋ฅผ HTML์์ ์ฐพ์ ์ ์์ต๋๋ค" |
|
|
) |
|
|
|
|
|
except Exception as e: |
|
|
print(f"์ด๋ฏธ์ง {img_file.name} ์ฒ๋ฆฌ ์ค ์ค๋ฅ: {str(e)}") |
|
|
else: |
|
|
print(f"์ด๋ฏธ์ง ๋๋ ํ ๋ฆฌ๊ฐ ์กด์ฌํ์ง ์์: {img_dir_path}") |
|
|
|
|
|
|
|
|
text_html_with_style = f""" |
|
|
<div style="width: 100%; height: 800px; overflow-y: auto; border: 1px solid #444; background-color: #2a2a2a;"> |
|
|
{text_html_content} |
|
|
</div> |
|
|
""" |
|
|
|
|
|
media_html_with_style = f""" |
|
|
<div style="width: 100%; height: 800px; overflow-y: auto; border: 1px solid #444; background-color: #2a2a2a;"> |
|
|
{media_html_content} |
|
|
</div> |
|
|
""" |
|
|
|
|
|
print("HTML ๋ด์ฉ ์ค๋น ์๋ฃ") |
|
|
|
|
|
|
|
|
return text_html_with_style, media_html_with_style |
|
|
|
|
|
except Exception as e: |
|
|
import traceback |
|
|
|
|
|
error_details = traceback.format_exc() |
|
|
print(f"์ค๋ฅ ๋ฐ์: {str(e)}\n{error_details}") |
|
|
error_html = f"<h1>์ค๋ฅ ๋ฐ์</h1><p>{str(e)}</p><pre>{error_details}</pre>" |
|
|
return error_html, error_html |
|
|
|
|
|
|
|
|
def launch_web_interface(): |
|
|
"""Gradio ์น ์ธํฐํ์ด์ค ์คํ""" |
|
|
|
|
|
css = """ |
|
|
/* ์ ์ฒด ๋ ์ด์์ */ |
|
|
body, .gradio-container { |
|
|
margin: 0 !important; |
|
|
padding: 0 !important; |
|
|
width: 100% !important; |
|
|
max-width: none !important; |
|
|
background-color: #1f1f1f; |
|
|
} |
|
|
|
|
|
/* ํค๋ ์์ญ */ |
|
|
.header-area { |
|
|
background-color: #2a2a2a; |
|
|
padding: 1rem; |
|
|
border-bottom: 1px solid #444; |
|
|
margin-bottom: 1rem; |
|
|
} |
|
|
|
|
|
/* ์
๋ก๋ ์์ญ */ |
|
|
.upload-area { |
|
|
background-color: #2a2a2a; |
|
|
padding: 1rem; |
|
|
border-radius: 5px; |
|
|
margin-bottom: 1rem; |
|
|
} |
|
|
|
|
|
/* HTML ๋ทฐ์ด ์ปจํ
์ด๋ */ |
|
|
.html-columns { |
|
|
display: flex; |
|
|
gap: 20px; |
|
|
} |
|
|
|
|
|
.html-column { |
|
|
flex: 1; |
|
|
min-width: 0; |
|
|
} |
|
|
|
|
|
/* HTML ๋ทฐ์ด */ |
|
|
.html-display { |
|
|
min-height: 800px !important; |
|
|
width: 100% !important; |
|
|
background-color: #2a2a2a !important; |
|
|
} |
|
|
|
|
|
/* HTML ๋ด์ฉ์ ํ
์คํธ ์์ */ |
|
|
.html-display * { |
|
|
color: #ffffff !important; |
|
|
} |
|
|
|
|
|
/* HTML ๋ด์ ํ ์คํ์ผ */ |
|
|
.html-display table { |
|
|
background-color: #333 !important; |
|
|
border: 1px solid #555 !important; |
|
|
} |
|
|
|
|
|
.html-display td, |
|
|
.html-display th { |
|
|
border: 1px solid #555 !important; |
|
|
color: #fff !important; |
|
|
} |
|
|
|
|
|
/* ๋ฒํผ ์คํ์ผ */ |
|
|
.convert-button { |
|
|
background-color: #E67E22 !important; |
|
|
border: none !important; |
|
|
} |
|
|
|
|
|
/* ํ์ดํ ํ
์คํธ */ |
|
|
.title-text { |
|
|
color: white !important; |
|
|
margin: 0 !important; |
|
|
padding: 0 !important; |
|
|
} |
|
|
|
|
|
/* ์ค๋ช
ํ
์คํธ */ |
|
|
.description-text { |
|
|
color: #aaa !important; |
|
|
margin-top: 0.5rem !important; |
|
|
} |
|
|
|
|
|
/* ์ปฌ๋ผ ์ ๋ชฉ */ |
|
|
.column-title { |
|
|
color: white !important; |
|
|
margin-bottom: 0.5rem !important; |
|
|
} |
|
|
|
|
|
/* ํธํฐ */ |
|
|
.footer-area { |
|
|
margin-top: 2rem; |
|
|
text-align: center; |
|
|
color: #888; |
|
|
padding: 1rem; |
|
|
} |
|
|
""" |
|
|
|
|
|
|
|
|
with gr.Blocks(css=css, theme=gr.themes.Default()) as demo: |
|
|
|
|
|
with gr.Column(elem_classes="header-area"): |
|
|
gr.Markdown("# PDF to HTML ๋ณํ๊ธฐ", elem_classes="title-text") |
|
|
gr.Markdown( |
|
|
"PDF ํ์ผ์ ์
๋ก๋ํ์ฌ ํ
์คํธ์ ๋ฏธ๋์ด๋ก ๋ถ๋ฆฌ๋ HTML์ ์์ฑํฉ๋๋ค.", |
|
|
elem_classes="description-text", |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Column(elem_classes="upload-area"): |
|
|
|
|
|
with gr.Row(): |
|
|
pdf_input = gr.File( |
|
|
label="PDF ํ์ผ ์
๋ก๋", type="binary", elem_id="pdf-upload" |
|
|
) |
|
|
convert_btn = gr.Button( |
|
|
"๋ณํํ๊ธฐ", variant="primary", elem_classes="convert-button" |
|
|
) |
|
|
|
|
|
|
|
|
status_output = gr.Textbox( |
|
|
label="์ํ", value="๋๊ธฐ ์ค...", interactive=False |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Column(visible=False) as html_output_area: |
|
|
with gr.Row(elem_classes="html-columns"): |
|
|
|
|
|
with gr.Column(elem_classes="html-column"): |
|
|
gr.Markdown("### ํ
์คํธ ๋ด์ฉ", elem_classes="column-title") |
|
|
text_html_viewer = gr.HTML( |
|
|
label="ํ
์คํธ HTML", |
|
|
elem_id="text-html-viewer", |
|
|
elem_classes="html-display", |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Column(elem_classes="html-column"): |
|
|
gr.Markdown("### ํ ๋ฐ ์ด๋ฏธ์ง", elem_classes="column-title") |
|
|
media_html_viewer = gr.HTML( |
|
|
label="๋ฏธ๋์ด HTML", |
|
|
elem_id="media-html-viewer", |
|
|
elem_classes="html-display", |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Column(elem_classes="footer-area"): |
|
|
gr.Markdown("ยฉ 2025 pdf2html") |
|
|
|
|
|
|
|
|
def process_conversion(pdf_file): |
|
|
if pdf_file is None: |
|
|
return ( |
|
|
gr.update(visible=False), |
|
|
"<h1 style='color:white;'>PDF ํ์ผ์ ์
๋ก๋ํด์ฃผ์ธ์</h1>", |
|
|
"<h1 style='color:white;'>PDF ํ์ผ์ ์
๋ก๋ํด์ฃผ์ธ์</h1>", |
|
|
"PDF ํ์ผ์ด ์
๋ก๋๋์ง ์์์ต๋๋ค.", |
|
|
) |
|
|
|
|
|
try: |
|
|
|
|
|
text_html, media_html = convert_pdf_to_html(pdf_file) |
|
|
|
|
|
|
|
|
print(f"ํ
์คํธ HTML ๊ธธ์ด: {len(text_html)} ๋ฐ์ดํธ") |
|
|
print(f"๋ฏธ๋์ด HTML ๊ธธ์ด: {len(media_html)} ๋ฐ์ดํธ") |
|
|
|
|
|
|
|
|
return gr.update(visible=True), text_html, media_html, "๋ณํ ์๋ฃ!" |
|
|
except Exception as e: |
|
|
import traceback |
|
|
|
|
|
error_details = traceback.format_exc() |
|
|
print(f"์ฒ๋ฆฌ ์ค ์ค๋ฅ: {str(e)}\n{error_details}") |
|
|
error_html = f"<h1 style='color:red;'>์ค๋ฅ ๋ฐ์</h1><p style='color:red;'>{str(e)}</p>" |
|
|
return ( |
|
|
gr.update(visible=False), |
|
|
error_html, |
|
|
error_html, |
|
|
f"์ค๋ฅ: {str(e)}", |
|
|
) |
|
|
|
|
|
|
|
|
convert_btn.click( |
|
|
fn=process_conversion, |
|
|
inputs=pdf_input, |
|
|
outputs=[ |
|
|
html_output_area, |
|
|
text_html_viewer, |
|
|
media_html_viewer, |
|
|
status_output, |
|
|
], |
|
|
) |
|
|
|
|
|
|
|
|
demo.load( |
|
|
js=""" |
|
|
function fixLayout() { |
|
|
// HTML ๋ทฐ์ด ์ปจํ
์ด๋ ํ์ธ |
|
|
const htmlColumns = document.querySelector('.html-columns'); |
|
|
if (htmlColumns) { |
|
|
// ์ปจํ
์ด๋์ ๋๋น ๊ท ๋ฑํ๊ฒ ๋ง์ถ๊ธฐ |
|
|
const columns = htmlColumns.querySelectorAll('.html-column'); |
|
|
columns.forEach(column => { |
|
|
column.style.flex = '1'; |
|
|
column.style.minWidth = '0'; |
|
|
}); |
|
|
} |
|
|
|
|
|
// ํ
์คํธ ์์ ๊ฐ์ ์ค์ |
|
|
const textViewer = document.getElementById('text-html-viewer'); |
|
|
const mediaViewer = document.getElementById('media-html-viewer'); |
|
|
|
|
|
function forceTextColor(element) { |
|
|
if (!element) return; |
|
|
|
|
|
// iframe ๋ด๋ถ ๋ฌธ์์ ์ ๊ทผ |
|
|
try { |
|
|
const iframes = element.querySelectorAll('iframe'); |
|
|
iframes.forEach(iframe => { |
|
|
if (iframe.contentDocument) { |
|
|
const allElements = iframe.contentDocument.querySelectorAll('*'); |
|
|
allElements.forEach(el => { |
|
|
if (el.tagName !== 'IMG') { |
|
|
el.style.color = '#ffffff'; |
|
|
} |
|
|
}); |
|
|
|
|
|
// ๋ฐฐ๊ฒฝ์ ์ค์ |
|
|
const body = iframe.contentDocument.body; |
|
|
if (body) { |
|
|
body.style.backgroundColor = '#2a2a2a'; |
|
|
} |
|
|
} |
|
|
}); |
|
|
} catch (e) { |
|
|
console.error('iframe ์ ๊ทผ ์ค ์ค๋ฅ:', e); |
|
|
} |
|
|
|
|
|
// ์ง์ ๋ฌธ์ ๋ด ์์์ ์์ ์ค์ |
|
|
const allTextElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, span, div, a, li, td, th'); |
|
|
allTextElements.forEach(el => { |
|
|
el.style.color = '#ffffff'; |
|
|
}); |
|
|
} |
|
|
|
|
|
forceTextColor(textViewer); |
|
|
forceTextColor(mediaViewer); |
|
|
|
|
|
// ์ด๋ฏธ์ง ํ์ ํ์ธ |
|
|
if (mediaViewer) { |
|
|
const images = mediaViewer.querySelectorAll('img'); |
|
|
console.log(`๋ฏธ๋์ด ๋ทฐ์ด์์ ์ด๋ฏธ์ง ${images.length}๊ฐ ๋ฐ๊ฒฌ`); |
|
|
|
|
|
images.forEach((img, index) => { |
|
|
// ์ด๋ฏธ์ง ๋ก๋ ์ํ ํ์ธ |
|
|
console.log(`์ด๋ฏธ์ง ${index + 1} ๋ก๋ ์ํ: ${img.complete ? '์๋ฃ' : '๋ก๋ฉ ์ค'}`); |
|
|
if (img.complete && img.naturalWidth === 0) { |
|
|
console.log(`์ด๋ฏธ์ง ${index + 1} ๋ก๋ ์คํจ`); |
|
|
} |
|
|
}); |
|
|
} |
|
|
} |
|
|
|
|
|
// ํ์ด์ง ๋ก๋ ์ ๋ ์ด์์ ์กฐ์ |
|
|
window.addEventListener('load', function() { |
|
|
setTimeout(fixLayout, 1000); |
|
|
setTimeout(fixLayout, 3000); |
|
|
setTimeout(fixLayout, 5000); // ๋ ๊ธด ์๊ฐ ํ์๋ ํ ๋ฒ ๋ ์คํ |
|
|
}); |
|
|
|
|
|
// MutationObserver๋ก DOM ๋ณ๊ฒฝ ๊ฐ์ง |
|
|
const observer = new MutationObserver(mutations => { |
|
|
setTimeout(fixLayout, 500); |
|
|
}); |
|
|
|
|
|
// ํ์ด์ง ๋ก๋ ํ Observer ์์ |
|
|
window.addEventListener('load', () => { |
|
|
observer.observe(document.body, { |
|
|
childList: true, |
|
|
subtree: true, |
|
|
attributes: true |
|
|
}); |
|
|
|
|
|
// ์คํ์ผ ์์ ์ง์ ์ถ๊ฐ |
|
|
const style = document.createElement('style'); |
|
|
style.textContent = ` |
|
|
.html-display * { |
|
|
color: #ffffff !important; |
|
|
} |
|
|
.html-display { |
|
|
background-color: #2a2a2a !important; |
|
|
} |
|
|
`; |
|
|
document.head.appendChild(style); |
|
|
}); |
|
|
""" |
|
|
) |
|
|
|
|
|
|
|
|
demo.launch(share=False, inbrowser=True, show_api=False) |
|
|
|
|
|
|
|
|
# Start the web UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    launch_web_interface()
|
|
|