import base64
import html
import os
import re
import shutil
from pathlib import Path

import fitz


class PDFToHTMLConverter:
    """Convert a PDF into two HTML files: page text and extracted media.

    The PDF is copied to a fixed staging location under ``<cwd>/.temp`` and
    opened with PyMuPDF (``fitz``). ``convert()`` then writes ``text.html``
    (per-page text) and ``media.html`` (detected tables and embedded images)
    into ``<cwd>/.temp/temp_output_html``.

    The instance can be used as a context manager so the underlying PyMuPDF
    document is closed deterministically.
    """

    # Two text blocks are considered to be on the same table row when both
    # their top and bottom edges differ by less than this many units.
    _ROW_TOLERANCE = 5
    # A row needs the anchor block plus at least two neighbours.
    _MIN_ROW_CELLS = 3
    # Image formats written to disk unchanged; others are re-encoded as PNG.
    _WEB_IMAGE_EXTS = ("jpeg", "jpg", "png")

    def __init__(self, pdf_path, output_dir=None):
        """Stage the PDF in the working directory's ``.temp`` tree and open it.

        Args:
            pdf_path (str | os.PathLike): Path of the PDF to convert.
            output_dir (str, optional): Accepted for backward compatibility
                but not used; output always goes to
                ``<cwd>/.temp/temp_output_html``.
        """
        self.pdf_path = pdf_path
        source_path = Path(pdf_path)
        self.pdf_name = source_path.stem

        # Fixed working layout below the current working directory.
        self.temp_dir = Path.cwd() / ".temp"
        self.pdf_dir = self.temp_dir / "temp_input_pdf"
        self.output_dir = self.temp_dir / "temp_output_html"
        self.img_dir = self.output_dir / "images"
        for directory in (
            self.temp_dir,
            self.pdf_dir,
            self.output_dir,
            self.img_dir,
        ):
            directory.mkdir(parents=True, exist_ok=True)

        self.fixed_pdf_path = self.pdf_dir / "current.pdf"

        # Compare resolved paths: a relative path that already points at the
        # staged file must not be copied onto itself (shutil.SameFileError).
        if source_path.resolve() != self.fixed_pdf_path.resolve():
            shutil.copy2(str(source_path), str(self.fixed_pdf_path))
            print(f"PDF 파일 복사됨: {source_path} -> {self.fixed_pdf_path}")
        else:
            print(f"PDF 파일이 이미 올바른 위치에 있습니다: {self.fixed_pdf_path}")

        self.doc = fitz.open(str(self.fixed_pdf_path))

        self.html_content = ""
        self.text_html_content = ""
        self.media_html_content = ""

    def close(self):
        """Close the underlying PyMuPDF document; safe to call repeatedly."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            doc.close()
            self.doc = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    @staticmethod
    def _format_span(span):
        """Return one text span as escaped HTML with heading/emphasis markup.

        Spans larger than 14 become ``<h1>`` (bold) or ``<h2>``; spans larger
        than 12 become ``<h3>`` (bold) or ``<h4>``; smaller spans are wrapped
        in ``<strong>`` and/or ``<em>`` as applicable.
        """
        font_name = span["font"].lower()
        flags = span.get("flags", 0)
        # PyMuPDF font flags: value 16 is bold, value 2 is italic
        # (value 1 is superscript and must not be treated as italic).
        is_bold = "bold" in font_name or bool(flags & 16)
        is_italic = "italic" in font_name or bool(flags & 2)

        # The text comes from the PDF, so escape it before embedding in HTML.
        text = html.escape(span["text"])
        size = span["size"]

        if size > 14:
            tag = "h1" if is_bold else "h2"
            return f"<{tag}>{text}</{tag}>"
        if size > 12:
            tag = "h3" if is_bold else "h4"
            return f"<{tag}>{text}</{tag}>"
        if is_bold:
            text = f"<strong>{text}</strong>"
        if is_italic:
            text = f"<em>{text}</em>"
        return text

    @staticmethod
    def _block_text(block):
        """Return the escaped, space-joined text of every span in a block."""
        joined = " ".join(
            span["text"] for line in block["lines"] for span in line["spans"]
        )
        return html.escape(joined.strip())

    def _extract_text_with_structure(self, page):
        """Extract a page's text as HTML paragraphs with basic structure.

        Args:
            page (fitz.Page): Page to read.

        Returns:
            str: One ``<p>`` element per text block, newline-separated. Lines
            inside a block are joined with a single space.
        """
        paragraphs = []
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:  # only type-0 blocks are processed as text
                continue
            text_lines = [
                "".join(self._format_span(span) for span in line["spans"])
                for line in block["lines"]
            ]
            if text_lines:
                paragraph = " ".join(text_lines)
                paragraphs.append(f"<p>{paragraph}</p>")
        return "\n".join(paragraphs)

    def _render_png(self, xref):
        """Re-encode the image at ``xref`` as PNG bytes via a PyMuPDF pixmap."""
        pix = fitz.Pixmap(self.doc, xref)
        if pix.n - pix.alpha >= 4:
            # More than three colour components (e.g. CMYK): convert to RGB
            # first, since PNG cannot store such pixmaps.
            pix = fitz.Pixmap(fitz.csRGB, pix)
        return pix.tobytes("png")

    def _extract_images(self, page, page_num):
        """Save the page's embedded images and return HTML tags for them.

        Extraction is best-effort: a failure on one image is reported on
        stdout and the remaining images are still processed.

        Args:
            page (fitz.Page): Page to read.
            page_num (int): Zero-based page index (file names are one-based).

        Returns:
            list: One ``<div class="image-container">`` snippet per image
            that was written to ``self.img_dir``.
        """
        image_tags = []
        self.img_dir.mkdir(parents=True, exist_ok=True)

        for img_idx, img_info in enumerate(page.get_images(full=True)):
            try:
                xref = img_info[0]
                base_img = self.doc.extract_image(xref)
                image_bytes = base_img["image"]
                image_ext = base_img["ext"].lower()

                if image_ext not in self._WEB_IMAGE_EXTS:
                    # Convert the data itself; renaming alone would leave
                    # non-PNG bytes behind a ".png" extension.
                    image_bytes = self._render_png(xref)
                    image_ext = "png"

                image_filename = f"page{page_num+1}_img{img_idx+1}.{image_ext}"
                image_path = self.img_dir / image_filename
                image_path.write_bytes(image_bytes)
                print(f"이미지 저장: {image_path} (크기: {len(image_bytes)} 바이트)")

                # Path is relative to the HTML files written by convert().
                rel_img_path = f"images/{image_filename}"
                image_tags.append(
                    f'<div class="image-container"><img src="{rel_img_path}" '
                    f'alt="Page {page_num+1} Image {img_idx+1}" '
                    f'style="max-width:100%; height:auto;"/></div>'
                )
            except Exception as e:  # deliberate best-effort per image
                print(f"이미지 추출 중 오류: {str(e)}")

        return image_tags

    def _extract_tables(self, page):
        """Heuristically detect table rows on a page and render them as HTML.

        Text blocks whose top and bottom edges line up (within
        ``_ROW_TOLERANCE``) form a row when at least ``_MIN_ROW_CELLS`` of
        them share it. Each block is assigned to at most one row, and cells
        are emitted left to right.

        Args:
            page (fitz.Page): Page to read.

        Returns:
            list: Empty when no row was found, otherwise a single ``<table>``
            string containing every detected row.
        """
        text_blocks = [
            block
            for block in page.get_text("dict")["blocks"]
            if block["type"] == 0
        ]
        tolerance = self._ROW_TOLERANCE
        assigned = set()
        rows = []

        for idx, block in enumerate(text_blocks):
            if idx in assigned:
                # Already part of an earlier row; do not emit it again.
                continue
            _, top, _, bottom = block["bbox"]
            row = [idx]
            for other_idx, other in enumerate(text_blocks):
                if other_idx == idx or other_idx in assigned:
                    continue
                _, other_top, _, other_bottom = other["bbox"]
                if (
                    abs(top - other_top) < tolerance
                    and abs(bottom - other_bottom) < tolerance
                ):
                    row.append(other_idx)
            if len(row) >= self._MIN_ROW_CELLS:
                assigned.update(row)
                row.sort(key=lambda k: text_blocks[k]["bbox"][0])
                rows.append(row)

        if not rows:
            return []

        parts = ["<table border='1'>\n"]
        for row in rows:
            parts.append("<tr>\n")
            parts.extend(
                f"<td>{self._block_text(text_blocks[k])}</td>\n" for k in row
            )
            parts.append("</tr>\n")
        parts.append("</table>")
        return ["".join(parts)]

    def _create_html_template(self, title, content, css_additional=""):
        """Wrap content in a complete dark-themed HTML document.

        Args:
            title (str): Document title; HTML-escaped before insertion.
            content (str): HTML markup placed in the body, inserted as-is.
            css_additional (str): Extra CSS appended to the built-in styles.

        Returns:
            str: The complete HTML document.
        """
        safe_title = html.escape(title)
        return f"""<!DOCTYPE html>
<html lang="ko">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title}</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 0;
            height: 100vh;
            overflow-y: auto;
            background-color: #2a2a2a;
            color: #ffffff;
        }}
        .page-title {{
            padding: 10px 20px;
            margin: 0;
            background-color: #333;
            color: white;
            position: sticky;
            top: 0;
            z-index: 10;
        }}
        .content-container {{
            padding: 20px;
        }}
        .image-container {{
            text-align: center;
            margin: 20px 0;
        }}
        img {{
            max-width: 100%;
            height: auto;
            border: 1px solid #444;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
            background-color: #333;
        }}
        td, th {{
            border: 1px solid #555;
            padding: 8px;
            color: #fff;
        }}
        h1, h2, h3, h4, p, span, div {{
            color: #fff;
        }}
        .media-item {{
            margin-bottom: 30px;
            padding-bottom: 20px;
            border-bottom: 1px solid #444;
        }}
        .media-item-heading {{
            background-color: #444;
            padding: 5px 10px;
            margin-bottom: 10px;
            font-weight: bold;
            border-left: 3px solid #E67E22;
            color: #fff;
        }}
        .page-text {{
            margin-bottom: 30px;
            border-bottom: 1px solid #444;
            padding-bottom: 20px;
        }}
        /* 스크롤바 스타일 */
        ::-webkit-scrollbar {{
            width: 8px;
        }}
        ::-webkit-scrollbar-track {{
            background: #333;
        }}
        ::-webkit-scrollbar-thumb {{
            background: #666;
            border-radius: 4px;
        }}
        ::-webkit-scrollbar-thumb:hover {{
            background: #777;
        }}
        /* 링크 스타일 */
        a {{
            color: #3498db;
            text-decoration: none;
        }}
        a:hover {{
            text-decoration: underline;
        }}
        {css_additional}
    </style>
</head>
<body>
    <h1 class="page-title">{safe_title}</h1>
    <div class="content-container">
        {content}
    </div>
</body>
</html>"""

    @staticmethod
    def _media_item(kind, page_no, item_no, heading, body):
        """Wrap one table or image snippet in a ``media-item`` container."""
        return f"""
<div class="media-item" id="{kind}-{page_no}-{item_no}" data-page="{page_no}">
    <div class="media-item-heading">{heading}</div>
    {body}
</div>
"""

    def convert(self):
        """Convert the PDF into separate text and media HTML files.

        Writes ``text.html`` and ``media.html`` into ``self.output_dir`` and
        stores both documents on ``self.text_html_content`` and
        ``self.media_html_content``.

        Returns:
            tuple: ``(text_html_path, media_html_path)`` as strings.
        """
        text_content = []
        media_content = []
        # Running number shared by tables and images across the document.
        media_order = 0

        for page_num, page in enumerate(self.doc):
            page_no = page_num + 1

            text_content.append(
                f"\n<div class='page-text' id='page-text-{page_no}'>\n"
            )
            text_content.append(f"<h3>페이지 {page_no}</h3>")
            text_content.append(self._extract_text_with_structure(page))
            text_content.append("\n</div>\n")

            tables = self._extract_tables(page)
            for table_no, table in enumerate(tables, start=1):
                media_order += 1
                media_content.append(
                    self._media_item(
                        "table",
                        page_no,
                        table_no,
                        f"표 {media_order} - 페이지 {page_no}",
                        table,
                    )
                )

            images = self._extract_images(page, page_num)
            for image_no, img_tag in enumerate(images, start=1):
                media_order += 1
                media_content.append(
                    self._media_item(
                        "image",
                        page_no,
                        image_no,
                        f"이미지 {media_order} - 페이지 {page_no}",
                        img_tag,
                    )
                )

        text_html_content = self._create_html_template(
            f"{self.pdf_name} - 텍스트",
            "\n".join(text_content),
        )
        media_html_content = self._create_html_template(
            f"{self.pdf_name} - 표 및 이미지",
            "\n".join(media_content),
        )

        text_output_path = self.output_dir / "text.html"
        text_output_path.write_text(text_html_content, encoding="utf-8")

        media_output_path = self.output_dir / "media.html"
        media_output_path.write_text(media_html_content, encoding="utf-8")

        self.text_html_content = text_html_content
        self.media_html_content = media_html_content

        return str(text_output_path), str(media_output_path)