# pdf2html / convert.py
# jrpark's picture
# Upload folder using huggingface_hub
# d1aa69e verified
import base64
import html
import os
import re
import shutil
from pathlib import Path

import fitz  # PyMuPDF
class PDFToHTMLConverter:
    """Convert a PDF into two HTML pages: one with text, one with tables and images.

    The source PDF is copied to a fixed location under ``<cwd>/.temp`` and the
    generated ``text.html`` / ``media.html`` (plus extracted images) are written
    to ``<cwd>/.temp/temp_output_html``.

    The instance keeps the PyMuPDF document open; call :meth:`close` (or use the
    instance as a context manager) when done.
    """

    # Bits of the PyMuPDF span ``flags`` field (bit 0 is superscript, not italic).
    _FLAG_ITALIC = 2
    _FLAG_BOLD = 16

    # Two text blocks are treated as being on the same row when both their top
    # and bottom edges differ by less than this many points.
    _ROW_TOLERANCE = 5
    # A block needs at least this many same-row neighbours to count as a table row.
    _MIN_ROW_PEERS = 2

    # Image formats that are written to disk as-is; anything else is re-encoded.
    _WEB_IMAGE_EXTS = ("jpeg", "jpg", "png")

    def __init__(self, pdf_path, output_dir=None):
        """
        Set up working directories, stage the PDF and open it.

        Args:
            pdf_path (str | Path): Path of the PDF file to convert.
            output_dir (str, optional): Accepted for interface compatibility.
                NOTE(review): this value is not used; output always goes to
                ``<cwd>/.temp/temp_output_html``.
        """
        self.pdf_path = pdf_path
        source_path = Path(pdf_path)
        self.pdf_name = source_path.stem

        # Working tree lives in a ".temp" folder under the current directory.
        self.temp_dir = Path.cwd() / ".temp"
        self.pdf_dir = self.temp_dir / "temp_input_pdf"       # staged input PDF
        self.output_dir = self.temp_dir / "temp_output_html"  # generated HTML
        self.img_dir = self.output_dir / "images"             # extracted images

        for directory in (self.temp_dir, self.pdf_dir, self.output_dir, self.img_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # The input is always staged under the same fixed file name.
        self.fixed_pdf_path = self.pdf_dir / "current.pdf"

        # Compare resolved paths: a relative path to the staged file would
        # otherwise look "different" and make shutil.copy2 raise SameFileError.
        if source_path.resolve() != self.fixed_pdf_path.resolve():
            shutil.copy2(str(source_path), str(self.fixed_pdf_path))
            print(f"PDF 파일 복사됨: {source_path} -> {self.fixed_pdf_path}")
        else:
            print(f"PDF 파일이 이미 올바른 위치에 있습니다: {self.fixed_pdf_path}")

        # Open the staged copy with PyMuPDF.
        self.doc = fitz.open(str(self.fixed_pdf_path))

        # Generated HTML, filled in by convert().
        self.html_content = ""
        self.text_html_content = ""
        self.media_html_content = ""

    def close(self):
        """Close the underlying PyMuPDF document if it is still open."""
        doc = getattr(self, "doc", None)
        if doc is not None and not getattr(doc, "is_closed", False):
            doc.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    @classmethod
    def _render_span(cls, span):
        """Return one text span as escaped HTML with heading/emphasis markup.

        Font size selects the tag: > 14 gives h1 (bold) or h2, > 12 gives h3
        (bold) or h4; smaller text is wrapped in <strong>/<em> as applicable.
        """
        font_name = span["font"].lower()
        flags = span.get("flags", 0)
        is_bold = "bold" in font_name or bool(flags & cls._FLAG_BOLD)
        is_italic = "italic" in font_name or bool(flags & cls._FLAG_ITALIC)

        # PDF text is untrusted content: escape it before embedding in markup.
        text = html.escape(span["text"])
        font_size = span["size"]

        if font_size > 14:
            return f"<h1>{text}</h1>" if is_bold else f"<h2>{text}</h2>"
        if font_size > 12:
            return f"<h3>{text}</h3>" if is_bold else f"<h4>{text}</h4>"
        if is_bold:
            text = f"<strong>{text}</strong>"
        if is_italic:
            text = f"<em>{text}</em>"
        return text

    def _extract_text_with_structure(self, page):
        """
        Extract the page text while keeping a basic structure.

        Each text block becomes one ``<p>``; the block's lines are joined with
        a single space and every span is rendered by :meth:`_render_span`.

        Args:
            page (fitz.Page): PDF page object.
        Returns:
            str: HTML paragraphs separated by newlines.
        """
        paragraphs = []
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:  # only text blocks
                continue
            rendered_lines = [
                "".join(self._render_span(span) for span in line["spans"])
                for line in block["lines"]
            ]
            if rendered_lines:
                paragraph = " ".join(rendered_lines)
                paragraphs.append(f"<p>{paragraph}</p>")
        return "\n".join(paragraphs)

    def _image_as_png(self, xref):
        """Re-encode the image at ``xref`` as PNG bytes via a PyMuPDF pixmap."""
        pixmap = fitz.Pixmap(self.doc, xref)
        # PNG cannot store CMYK (4+ colour components); convert to RGB first.
        if pixmap.n - pixmap.alpha >= 4:
            pixmap = fitz.Pixmap(fitz.csRGB, pixmap)
        return pixmap.tobytes("png")

    def _extract_images(self, page, page_num):
        """
        Extract the images of a page and save them under ``self.img_dir``.

        Extraction is best-effort: an image that fails is reported on stdout
        and skipped, the remaining images are still processed.

        Args:
            page (fitz.Page): PDF page object.
            page_num (int): Zero-based page index.
        Returns:
            list: ``<img>`` container tags, one per saved image.
        """
        image_tags = []
        for img_idx, img_info in enumerate(page.get_images(full=True)):
            try:
                xref = img_info[0]
                extracted = self.doc.extract_image(xref)
                image_bytes = extracted["image"]
                image_ext = extracted["ext"].lower()

                # Formats other than JPEG/PNG are re-encoded so that the file
                # content actually matches its ".png" extension.
                if image_ext not in self._WEB_IMAGE_EXTS:
                    image_bytes = self._image_as_png(xref)
                    image_ext = "png"

                image_filename = f"page{page_num+1}_img{img_idx+1}.{image_ext}"
                image_path = self.img_dir / image_filename
                self.img_dir.mkdir(parents=True, exist_ok=True)
                image_path.write_bytes(image_bytes)
                print(f"이미지 저장: {image_path} (크기: {len(image_bytes)} 바이트)")

                # Path is relative to the HTML files, which live one level up.
                rel_img_path = f"images/{image_filename}"
                image_tags.append(
                    f'<div class="image-container"><img src="{rel_img_path}" '
                    f'alt="Page {page_num+1} Image {img_idx+1}" '
                    f'style="max-width:100%; height:auto;"/></div>'
                )
            except Exception as e:  # deliberate best-effort per image
                print(f"이미지 추출 중 오류: {str(e)}")
        return image_tags

    @staticmethod
    def _block_text(block):
        """Return the escaped, space-joined text of all spans in a text block."""
        joined = " ".join(
            span["text"] for line in block["lines"] for span in line["spans"]
        )
        return html.escape(joined.strip())

    def _extract_tables(self, page):
        """
        Try to detect table-like structures on a page.

        This is a simple heuristic, not real table recognition: text blocks
        whose top and bottom edges line up (within ``_ROW_TOLERANCE``) are
        treated as cells of one row, provided at least ``_MIN_ROW_PEERS``
        other blocks share the row. Each block is used in at most one row and
        the cells of a row are ordered left to right.

        Args:
            page (fitz.Page): PDF page object.
        Returns:
            list: Empty, or a single HTML ``<table>`` holding every detected row.
        """
        blocks = page.get_text("dict")["blocks"]
        text_indices = [i for i, blk in enumerate(blocks) if blk["type"] == 0]
        tolerance = self._ROW_TOLERANCE

        assigned = set()  # block indices already placed in a row
        rows = []
        for idx in text_indices:
            if idx in assigned:
                continue
            _, top, _, bottom = blocks[idx]["bbox"]
            peers = [
                other
                for other in text_indices
                if other != idx
                and other not in assigned
                and abs(top - blocks[other]["bbox"][1]) < tolerance
                and abs(bottom - blocks[other]["bbox"][3]) < tolerance
            ]
            if len(peers) >= self._MIN_ROW_PEERS:
                members = sorted([idx, *peers], key=lambda i: blocks[i]["bbox"][0])
                assigned.update(members)
                rows.append(members)

        if not rows:
            return []

        parts = ["<table border='1'>"]
        for members in rows:
            parts.append("<tr>")
            parts.extend(f"<td>{self._block_text(blocks[i])}</td>" for i in members)
            parts.append("</tr>")
        parts.append("</table>")
        return ["\n".join(parts)]

    def _create_html_template(self, title, content, css_additional=""):
        """
        Build a complete dark-themed HTML document.

        Args:
            title (str): Page title (plain text; it is HTML-escaped here).
            content (str): HTML markup placed inside the content container.
            css_additional (str): Extra CSS appended to the built-in stylesheet.
        Returns:
            str: The finished HTML document.
        """
        safe_title = html.escape(title)
        return f"""<!DOCTYPE html>
<html lang="ko">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{safe_title}</title>
<style>
body {{
    font-family: Arial, sans-serif;
    line-height: 1.6;
    margin: 0;
    padding: 0;
    height: 100vh;
    overflow-y: auto;
    background-color: #2a2a2a;
    color: #ffffff;
}}
.page-title {{
    padding: 10px 20px;
    margin: 0;
    background-color: #333;
    color: white;
    position: sticky;
    top: 0;
    z-index: 10;
}}
.content-container {{
    padding: 20px;
}}
.image-container {{
    text-align: center;
    margin: 20px 0;
}}
img {{
    max-width: 100%;
    height: auto;
    border: 1px solid #444;
}}
table {{
    border-collapse: collapse;
    width: 100%;
    margin: 20px 0;
    background-color: #333;
}}
td, th {{
    border: 1px solid #555;
    padding: 8px;
    color: #fff;
}}
h1, h2, h3, h4, p, span, div {{
    color: #fff;
}}
.media-item {{
    margin-bottom: 30px;
    padding-bottom: 20px;
    border-bottom: 1px solid #444;
}}
.media-item-heading {{
    background-color: #444;
    padding: 5px 10px;
    margin-bottom: 10px;
    font-weight: bold;
    border-left: 3px solid #E67E22;
    color: #fff;
}}
.page-text {{
    margin-bottom: 30px;
    border-bottom: 1px solid #444;
    padding-bottom: 20px;
}}
/* 스크롤바 스타일 */
::-webkit-scrollbar {{
    width: 8px;
}}
::-webkit-scrollbar-track {{
    background: #333;
}}
::-webkit-scrollbar-thumb {{
    background: #666;
    border-radius: 4px;
}}
::-webkit-scrollbar-thumb:hover {{
    background: #777;
}}
/* 링크 스타일 */
a {{
    color: #3498db;
    text-decoration: none;
}}
a:hover {{
    text-decoration: underline;
}}
{css_additional}
</style>
</head>
<body>
<h1 class="page-title">{safe_title}</h1>
<div class="content-container">
{content}
</div>
</body>
</html>"""

    @staticmethod
    def _media_item(kind, label, page_no, item_no, order, body):
        """Wrap a table or image in a numbered ``media-item`` container."""
        return f"""
<div class="media-item" id="{kind}-{page_no}-{item_no}" data-page="{page_no}">
<div class="media-item-heading">{label} {order} - 페이지 {page_no}</div>
{body}
</div>
"""

    def convert(self):
        """
        Convert the PDF into a text HTML file and a media HTML file.

        Writes ``text.html`` and ``media.html`` into ``self.output_dir`` and
        stores both documents on the instance (``text_html_content`` /
        ``media_html_content``).

        Returns:
            tuple: (path of the text HTML, path of the media HTML), as strings.
        """
        text_content = []
        media_content = []
        media_order = 0  # running number shared by tables and images

        for page_num, page in enumerate(self.doc):
            page_no = page_num + 1

            # Text column.
            text_html = self._extract_text_with_structure(page)
            text_content.append(f"\n<div class='page-text' id='page-text-{page_no}'>\n")
            text_content.append(f"<h3>페이지 {page_no}</h3>")
            text_content.append(text_html)
            text_content.append("\n</div>\n")

            # Media column: tables first, then images.
            for table_idx, table in enumerate(self._extract_tables(page)):
                media_order += 1
                media_content.append(
                    self._media_item("table", "표", page_no, table_idx + 1, media_order, table)
                )
            for img_idx, img_tag in enumerate(self._extract_images(page, page_num)):
                media_order += 1
                media_content.append(
                    self._media_item("image", "이미지", page_no, img_idx + 1, media_order, img_tag)
                )

        text_html_content = self._create_html_template(
            f"{self.pdf_name} - 텍스트",
            "\n".join(text_content),
        )
        media_html_content = self._create_html_template(
            f"{self.pdf_name} - 표 및 이미지",
            "\n".join(media_content),
        )

        # The output directory may have been removed since construction.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        text_output_path = self.output_dir / "text.html"
        text_output_path.write_text(text_html_content, encoding="utf-8")
        media_output_path = self.output_dir / "media.html"
        media_output_path.write_text(media_html_content, encoding="utf-8")

        self.text_html_content = text_html_content
        self.media_html_content = media_html_content
        return str(text_output_path), str(media_output_path)