Spaces:

jrpark
/

pdf2html

Paused

App Files Files Community

pdf2html / convert.py

jrpark

Upload folder using huggingface_hub

d1aa69e verified 10 months ago

raw

history blame contribute delete

15.6 kB

	import base64
	import html
	import os
	import re
	import shutil
	from pathlib import Path

	import fitz # PyMuPDF

	class PDFToHTMLConverter:
	def __init__(self, pdf_path, output_dir=None):
	    """
	    Stage the PDF in the fixed working area and open it with PyMuPDF.

	    All working files live under ``<cwd>/.temp``: the input is copied to
	    ``temp_input_pdf/current.pdf`` and the generated HTML and images go to
	    ``temp_output_html`` and ``temp_output_html/images``.

	    Args:
	        pdf_path (str | os.PathLike): Path of the PDF file to convert.
	        output_dir (str, optional): Accepted for backward compatibility
	            but not read by this method; output always goes to the
	            fixed ``.temp/temp_output_html`` directory.

	    Raises:
	        OSError: If a working directory cannot be created or the PDF
	            cannot be copied (for example, the source does not exist).
	        Exception: Whatever ``fitz.open`` raises for an unreadable file
	            propagates unchanged.
	    """
	    self.pdf_path = pdf_path
	    source_path = Path(pdf_path)
	    self.pdf_name = source_path.stem

	    # Fixed working layout below the current working directory.
	    self.temp_dir = Path.cwd() / ".temp"
	    self.pdf_dir = self.temp_dir / "temp_input_pdf"
	    self.output_dir = self.temp_dir / "temp_output_html"
	    self.img_dir = self.output_dir / "images"

	    for directory in (self.temp_dir, self.pdf_dir, self.output_dir, self.img_dir):
	        directory.mkdir(parents=True, exist_ok=True)

	    # The staged copy always has the same name.
	    self.fixed_pdf_path = self.pdf_dir / "current.pdf"

	    # Comparing path objects misses relative paths and links that already
	    # point at the staged file, so let copy2 detect "same file" itself.
	    try:
	        shutil.copy2(str(source_path), str(self.fixed_pdf_path))
	    except shutil.SameFileError:
	        print(f"PDF 파일이 이미 올바른 위치에 있습니다: {self.fixed_pdf_path}")
	    else:
	        print(f"PDF 파일 복사됨: {source_path} -> {self.fixed_pdf_path}")

	    # PyMuPDF document handle used by the extraction methods.
	    self.doc = fitz.open(self.fixed_pdf_path)

	    # Rendered HTML results, filled in by convert().
	    self.html_content = ""
	    self.text_html_content = ""
	    self.media_html_content = ""

	def _extract_text_with_structure(self, page):
	"""
	페이지에서 텍스트를 추출하고 기본 구조를 유지

	Args:
	page (fitz.Page): PDF 페이지 객체

	Returns:
	str: 구조화된 HTML 텍스트
	"""
	blocks = page.get_text("dict")["blocks"]
	html_text = []

	for block in blocks:
	if block["type"] == 0: # 텍스트 블록
	text_lines = []
	for line in block["lines"]:
	line_text = ""
	for span in line["spans"]:
	# 폰트 크기와 스타일 분석
	font_size = span["size"]
	is_bold = "bold" in span["font"].lower() or span.get("flags", 0) & 16 != 0
	is_italic = "italic" in span["font"].lower() or span.get("flags", 0) & 1 != 0

	text = span["text"]

	# 폰트 크기에 따라 제목 또는 일반 텍스트로 분류
	if font_size > 14: # 큰 폰트는 제목일 가능성이 높음
	if is_bold:
	text = f"<h1>{text}</h1>"
	else:
	text = f"<h2>{text}</h2>"
	elif font_size > 12:
	if is_bold:
	text = f"<h3>{text}</h3>"
	else:
	text = f"<h4>{text}</h4>"
	else:
	if is_bold:
	text = f"<strong>{text}</strong>"
	if is_italic:
	text = f"<em>{text}</em>"

	line_text += text

	text_lines.append(line_text)

	# 텍스트 라인을 단락으로 결합
	if text_lines:
	paragraph = " ".join(text_lines)
	html_text.append(f"<p>{paragraph}</p>")

	return "\n".join(html_text)

	def _extract_images(self, page, page_num):
	"""
	페이지에서 이미지 추출

	Args:
	page (fitz.Page): PDF 페이지 객체
	page_num (int): 페이지 번호

	Returns:
	list: 이미지 HTML 태그 목록
	"""
	image_tags = []
	image_list = page.get_images(full=True)

	for img_idx, img_info in enumerate(image_list):
	try:
	xref = img_info[0]
	base_img = self.doc.extract_image(xref)
	image_bytes = base_img["image"]

	# 이미지 포맷 확인 (기본값은 png)
	image_ext = base_img["ext"]
	if image_ext.lower() not in ["jpeg", "jpg", "png"]:
	image_ext = "png"

	# 고정된 경로에 이미지 저장
	image_filename = f"page{page_num+1}_img{img_idx+1}.{image_ext}"
	image_path = self.img_dir / image_filename

	# 이미지 디렉토리 확인
	if not self.img_dir.exists():
	self.img_dir.mkdir(parents=True, exist_ok=True)

	with open(image_path, "wb") as img_file:
	img_file.write(image_bytes)

	# 디버깅을 위한 코드
	print(f"이미지 저장: {image_path} (크기: {len(image_bytes)} 바이트)")

	# 이미지 태그 생성 (상대 경로 사용)
	# 경로가 HTML 파일에서 올바르게 참조될 수 있도록 합니다
	rel_img_path = f"images/{image_filename}"
	img_tag = f'<div class="image-container"><img src="{rel_img_path}" alt="Page {page_num+1} Image {img_idx+1}" style="max-width:100%; height:auto;"/></div>'
	image_tags.append(img_tag)

	except Exception as e:
	print(f"이미지 추출 중 오류: {str(e)}")

	return image_tags

	def _extract_tables(self, page):
	"""
	페이지에서 표 추출 시도

	Args:
	page (fitz.Page): PDF 페이지 객체

	Returns:
	list: 표 HTML 태그 목록
	"""
	# 표 감지 및 추출은 복잡한 작업입니다.
	# 이 간단한 예시에서는 테이블로 보이는 구조를 감지하는 기본적인 접근 방식을 사용합니다.
	tables = []

	# 페이지의 텍스트 블록을 분석
	blocks = page.get_text("dict")["blocks"]

	# 높이가 비슷한 텍스트 블록이 가로로 정렬된 경우 테이블 행일 가능성이 있음
	table_candidates = []

	for i, block in enumerate(blocks):
	if block["type"] == 0: # 텍스트 블록
	# 텍스트 블록의 위치 정보
	x0, y0, x1, y1 = block["bbox"]

	# 같은 행에 있는 다른 텍스트 블록 찾기
	same_row_blocks = []

	for j, other_block in enumerate(blocks):
	if i != j and other_block["type"] == 0:
	ox0, oy0, ox1, oy1 = other_block["bbox"]

	# y 좌표가 비슷하면 같은 행일 가능성이 있음
	if abs(y0 - oy0) < 5 and abs(y1 - oy1) < 5:
	same_row_blocks.append(j)

	# 같은 행에 여러 텍스트 블록이 있으면 테이블 행일 가능성이 높음
	if len(same_row_blocks) >= 2:
	table_candidates.append((i, same_row_blocks))

	# 테이블 후보가 있으면 HTML 테이블로 변환
	if table_candidates:
	table_html = "<table border='1'>\n"

	for row_idx, row_blocks in table_candidates:
	table_html += "<tr>\n"

	# 현재 블록 추가
	block_text = ""
	for line in blocks[row_idx]["lines"]:
	for span in line["spans"]:
	block_text += span["text"] + " "

	table_html += f"<td>{block_text.strip()}</td>\n"

	# 같은 행의 다른 블록 추가
	for block_idx in row_blocks:
	block_text = ""
	for line in blocks[block_idx]["lines"]:
	for span in line["spans"]:
	block_text += span["text"] + " "

	table_html += f"<td>{block_text.strip()}</td>\n"

	table_html += "</tr>\n"

	table_html += "</table>"
	tables.append(table_html)

	return tables

	def _create_html_template(self, title, content, css_additional=""):
	"""
	HTML 템플릿 생성 - 다크 테마 적용

	Args:
	title (str): HTML 제목
	content (str): HTML 본문 내용
	css_additional (str): 추가 CSS 스타일

	Returns:
	str: 완성된 HTML 문자열
	"""
	return f"""<!DOCTYPE html>
	<html lang="ko">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>{title}</title>
	<style>
	body {{
	font-family: Arial, sans-serif;
	line-height: 1.6;
	margin: 0;
	padding: 0;
	height: 100vh;
	overflow-y: auto;
	background-color: #2a2a2a;
	color: #ffffff;
	}}
	.page-title {{
	padding: 10px 20px;
	margin: 0;
	background-color: #333;
	color: white;
	position: sticky;
	top: 0;
	z-index: 10;
	}}
	.content-container {{
	padding: 20px;
	}}
	.image-container {{
	text-align: center;
	margin: 20px 0;
	}}
	img {{
	max-width: 100%;
	height: auto;
	border: 1px solid #444;
	}}
	table {{
	border-collapse: collapse;
	width: 100%;
	margin: 20px 0;
	background-color: #333;
	}}
	td, th {{
	border: 1px solid #555;
	padding: 8px;
	color: #fff;
	}}
	h1, h2, h3, h4, p, span, div {{
	color: #fff;
	}}
	.media-item {{
	margin-bottom: 30px;
	padding-bottom: 20px;
	border-bottom: 1px solid #444;
	}}
	.media-item-heading {{
	background-color: #444;
	padding: 5px 10px;
	margin-bottom: 10px;
	font-weight: bold;
	border-left: 3px solid #E67E22;
	color: #fff;
	}}
	.page-text {{
	margin-bottom: 30px;
	border-bottom: 1px solid #444;
	padding-bottom: 20px;
	}}
	/* 스크롤바 스타일 */
	::-webkit-scrollbar {{
	width: 8px;
	}}
	::-webkit-scrollbar-track {{
	background: #333;
	}}
	::-webkit-scrollbar-thumb {{
	background: #666;
	border-radius: 4px;
	}}
	::-webkit-scrollbar-thumb:hover {{
	background: #777;
	}}
	/* 링크 스타일 */
	a {{
	color: #3498db;
	text-decoration: none;
	}}
	a:hover {{
	text-decoration: underline;
	}}
	{css_additional}
	</style>
	</head>
	<body>
	<h1 class="page-title">{title}</h1>
	<div class="content-container">
	{content}
	</div>
	</body>
	</html>"""

	def convert(self):
	"""
	PDF를 텍스트 HTML과 미디어 HTML로 분리하여 변환

	Returns:
	tuple: 텍스트 HTML 경로, 미디어 HTML 경로
	"""
	# 텍스트 컬럼과 미디어 컬럼을 위한 컨텐츠 준비
	text_content = []
	media_content = []
	media_order = 0 # 미디어 아이템 순서

	# 각 페이지 처리
	for page_num, page in enumerate(self.doc):
	# 텍스트 추출
	text_html = self._extract_text_with_structure(page)
	text_content.append(f"\n<div class='page-text' id='page-text-{page_num+1}'>\n")
	text_content.append(f"<h3>페이지 {page_num+1}</h3>")
	text_content.append(text_html)
	text_content.append("\n</div>\n")

	# 표 추출
	tables = self._extract_tables(page)
	for table_idx, table in enumerate(tables):
	media_order += 1
	media_content.append(f"""
	<div class="media-item" id="table-{page_num+1}-{table_idx+1}" data-page="{page_num+1}">
	<div class="media-item-heading">표 {media_order} - 페이지 {page_num+1}</div>
	{table}
	</div>
	""")

	# 이미지 추출
	images = self._extract_images(page, page_num)
	for img_idx, img_tag in enumerate(images):
	media_order += 1
	media_content.append(f"""
	<div class="media-item" id="image-{page_num+1}-{img_idx+1}" data-page="{page_num+1}">
	<div class="media-item-heading">이미지 {media_order} - 페이지 {page_num+1}</div>
	{img_tag}
	</div>
	""")

	# 텍스트 HTML 생성
	text_html_content = self._create_html_template(
	f"{self.pdf_name} - 텍스트",
	"\n".join(text_content)
	)

	# 미디어 HTML 생성
	media_html_content = self._create_html_template(
	f"{self.pdf_name} - 표 및 이미지",
	"\n".join(media_content)
	)

	# 텍스트 HTML 파일 저장
	text_output_path = self.output_dir / "text.html"
	with open(text_output_path, "w", encoding="utf-8") as html_file:
	html_file.write(text_html_content)

	# 미디어 HTML 파일 저장
	media_output_path = self.output_dir / "media.html"
	with open(media_output_path, "w", encoding="utf-8") as html_file:
	html_file.write(media_html_content)

	# 결과 저장
	self.text_html_content = text_html_content
	self.media_html_content = media_html_content

	return str(text_output_path), str(media_output_path)