smarteye-backend / app /services /pdf_processor.py
AkJeond's picture
feat(backend): 분석 및 다운로드 API 개선
188503c
# -*- coding: utf-8 -*-
"""
SmartEyeSsen PDF ์ฒ˜๋ฆฌ ์„œ๋น„์Šค
============================
PDF ํŒŒ์ผ์„ ํŽ˜์ด์ง€๋ณ„ ์ด๋ฏธ์ง€๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ๊ธฐ๋Šฅ์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค.
PyMuPDF (fitz)๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๊ณ ํ’ˆ์งˆ ์ด๋ฏธ์ง€ ๋ณ€ํ™˜์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.
"""
import io
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF
from loguru import logger
from PIL import Image
# Rendering resolution used when neither the constructor argument nor the
# PDF_PROCESSOR_DPI environment variable supplies a positive value.
DEFAULT_PDF_DPI = 300


class PDFProcessor:
    """Convert PDF byte streams into per-page JPEG files on disk.

    Output for a project is written to ``<upload_directory>/<project_id>/``:
    the untouched upload as ``original.pdf`` plus one ``page_<n>.jpg`` per
    page, rendered with PyMuPDF and encoded with Pillow.
    """

    def __init__(self, upload_directory: str = "uploads", dpi: Optional[int] = None):
        """Create the processor and make sure the storage root exists.

        Args:
            upload_directory: Base directory for stored files; resolved to an
                absolute path and created if missing.
            dpi: Rendering resolution. When omitted or not positive, the
                value is taken from ``PDF_PROCESSOR_DPI`` or falls back to
                ``DEFAULT_PDF_DPI``.
        """
        self.upload_directory = Path(upload_directory).resolve()
        self.dpi = self._resolve_dpi(dpi)
        self.jpeg_quality = 95
        self.upload_directory.mkdir(parents=True, exist_ok=True)
        logger.info(
            f"PDFProcessor ์ดˆ๊ธฐํ™” ์™„๋ฃŒ - DPI: {self.dpi}, ์ €์žฅ ๊ฒฝ๋กœ: {self.upload_directory}"
        )

    @staticmethod
    def _resolve_dpi(provided_dpi: Optional[int]) -> int:
        """Pick the DPI from the argument, the environment, or the default.

        Precedence: a positive ``provided_dpi``; then a positive integer in
        the ``PDF_PROCESSOR_DPI`` environment variable; then
        ``DEFAULT_PDF_DPI``. A non-integer environment value is logged as a
        warning and ignored.
        """
        if provided_dpi and provided_dpi > 0:
            return int(provided_dpi)

        env_value = os.getenv("PDF_PROCESSOR_DPI")
        if env_value:
            try:
                parsed = int(env_value)
            except ValueError:
                logger.warning(
                    f"ํ™˜๊ฒฝ ๋ณ€์ˆ˜ PDF_PROCESSOR_DPI ๊ฐ’ '{env_value}'์„(๋ฅผ) ์ •์ˆ˜๋กœ ๋ณ€ํ™˜ํ•  ์ˆ˜ ์—†์–ด ๊ธฐ๋ณธ๊ฐ’ {DEFAULT_PDF_DPI}์„ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค."
                )
            else:
                if parsed > 0:
                    logger.debug(
                        f"ํ™˜๊ฒฝ ๋ณ€์ˆ˜ PDF_PROCESSOR_DPI ์ ์šฉ: {parsed} (์ธ์ž ๋ฏธ์ง€์ •)"
                    )
                    return parsed
        return DEFAULT_PDF_DPI

    def _render_page(
        self,
        document,
        page_index: int,
        page_number: int,
        total_pages: int,
        project_id: int,
        project_dir: Path,
    ) -> Dict[str, Any]:
        """Render one page of ``document`` to a JPEG file and describe it.

        Shared by the sequential and the parallel converter so both produce
        identical files and identical result dictionaries.
        """
        page = document[page_index]

        # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
        zoom = self.dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

        # Build the image straight from the raw RGB samples so the page is
        # JPEG-encoded exactly once (on save) instead of being encoded by
        # PyMuPDF and then decoded and re-encoded by Pillow.
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        width, height = img.size

        filename = f"page_{page_number}.jpg"
        full_path = project_dir / filename
        # NOTE(review): the public path always starts with "uploads", even
        # when upload_directory points elsewhere - confirm that is intended.
        public_path = Path("uploads") / str(project_id) / filename

        img.save(str(full_path), "JPEG", quality=self.jpeg_quality, optimize=True)

        logger.debug(
            f"ํŽ˜์ด์ง€ {page_index + 1}/{total_pages} ๋ณ€ํ™˜ ์™„๋ฃŒ - "
            f"ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ: {page_number}, ํฌ๊ธฐ: {width}x{height}"
        )
        return {
            'page_number': page_number,
            'image_path': str(public_path).replace("\\", "/"),
            'full_path': str(full_path),
            'width': width,
            'height': height,
            'dpi': self.dpi,
        }

    def convert_pdf_to_images(
        self,
        pdf_bytes: bytes,
        project_id: int,
        start_page_number: int
    ) -> List[Dict[str, Any]]:
        """Convert every page of a PDF to a JPEG file, one page at a time.

        Args:
            pdf_bytes: Raw bytes of the PDF file.
            project_id: Project ID; used as the sub-directory name.
            start_page_number: Page number assigned to the first PDF page;
                following pages count up from it.

        Returns:
            One dictionary per page, in page order, e.g.::

                {
                    'page_number': 1,
                    'image_path': 'uploads/123/page_1.jpg',  # forward slashes
                    'full_path': '<upload_directory>/123/page_1.jpg',
                    'width': 2480,
                    'height': 3508,
                    'dpi': 300,
                }

        Raises:
            ValueError: The PDF cannot be opened, has no pages, or a page
                fails to convert.
            Exception: Any other error (for example a failure writing
                ``original.pdf``) is re-raised unchanged.

        Page images written before a failure are deleted again.
        """
        logger.info(f"PDF ๋ณ€ํ™˜ ์‹œ์ž‘ - ProjectID: {project_id}, ์‹œ์ž‘ ํŽ˜์ด์ง€: {start_page_number}")

        project_dir = self.upload_directory / str(project_id)
        project_dir.mkdir(parents=True, exist_ok=True)

        converted_pages: List[Dict[str, Any]] = []
        pdf_document = None
        try:
            pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
            total_pages = len(pdf_document)
            logger.info(f"PDF ํŽ˜์ด์ง€ ์ˆ˜: {total_pages}")
            if total_pages == 0:
                raise ValueError("PDF ํŒŒ์ผ์— ํŽ˜์ด์ง€๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")

            # Keep the untouched upload next to the rendered pages.
            original_pdf_path = project_dir / "original.pdf"
            with open(original_pdf_path, "wb") as f:
                f.write(pdf_bytes)
            logger.info(f"PDF ์›๋ณธ ์ €์žฅ ์™„๋ฃŒ: {original_pdf_path}")

            for page_index in range(total_pages):
                page_number = start_page_number + page_index
                try:
                    page_info = self._render_page(
                        pdf_document,
                        page_index,
                        page_number,
                        total_pages,
                        project_id,
                        project_dir,
                    )
                except Exception as e:
                    logger.error(f"ํŽ˜์ด์ง€ {page_index + 1} ๋ณ€ํ™˜ ์‹คํŒจ: {str(e)}")
                    # Rollback happens once, in the outer handler below.
                    raise ValueError(
                        f"PDF ํŽ˜์ด์ง€ {page_index + 1} ๋ณ€ํ™˜ ์‹คํŒจ: {str(e)}"
                    ) from e
                converted_pages.append(page_info)

            logger.info(
                f"PDF ๋ณ€ํ™˜ ์™„๋ฃŒ - ProjectID: {project_id}, "
                f"์ด {len(converted_pages)}๊ฐœ ํŽ˜์ด์ง€ ๋ณ€ํ™˜"
            )
            return converted_pages
        except fitz.FileDataError as e:
            # fitz.FileDataError is the top-level name; the older
            # fitz.fitz.FileDataError path is missing on current PyMuPDF.
            logger.error(f"PDF ํŒŒ์ผ ์˜ค๋ฅ˜: {str(e)}")
            raise ValueError(
                f"PDF ํŒŒ์ผ์ด ์†์ƒ๋˜์—ˆ๊ฑฐ๋‚˜ ์ฝ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {str(e)}"
            ) from e
        except Exception as e:
            logger.error(f"PDF ๋ณ€ํ™˜ ์ค‘ ์˜ˆ์ƒ์น˜ ๋ชปํ•œ ์˜ค๋ฅ˜: {str(e)}")
            if converted_pages:
                self._rollback_conversion(converted_pages)
            raise
        finally:
            # Compare with None: truthiness presumably follows len(), so an
            # empty document would otherwise never be closed.
            if pdf_document is not None:
                pdf_document.close()

    def convert_pdf_to_images_parallel(
        self,
        pdf_bytes: bytes,
        project_id: int,
        start_page_number: int,
        max_workers: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """Convert every page of a PDF to a JPEG file using a thread pool.

        Each worker opens its own document from ``pdf_bytes`` and renders a
        single page.

        Args:
            pdf_bytes: Raw bytes of the PDF file.
            project_id: Project ID; used as the sub-directory name.
            start_page_number: Page number assigned to the first PDF page.
            max_workers: Number of worker threads. ``None`` uses the CPU
                count, capped at 4.

        Returns:
            Page dictionaries with the same keys as
            ``convert_pdf_to_images``, sorted by ``page_number``.

        Raises:
            ValueError: The PDF cannot be opened, has no pages, or a page
                fails to convert.
            Exception: Any other error is re-raised unchanged.

        On failure, pending pages are cancelled, running workers are allowed
        to finish, and every page image written by this call is deleted.

        NOTE(review): the PyMuPDF documentation states that the library does
        not support multi-threaded use; confirm that one document per thread
        is safe for the installed version, or move to process-based workers.
        """
        logger.info(
            f"PDF ๋ณ‘๋ ฌ ๋ณ€ํ™˜ ์‹œ์ž‘ - ProjectID: {project_id}, ์‹œ์ž‘ ํŽ˜์ด์ง€: {start_page_number}"
        )

        project_dir = self.upload_directory / str(project_id)
        project_dir.mkdir(parents=True, exist_ok=True)

        pdf_document = None
        converted_pages: List[Dict[str, Any]] = []
        try:
            # This handle is only used to validate the file and count pages.
            pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
            total_pages = len(pdf_document)
            logger.info(f"PDF ํŽ˜์ด์ง€ ์ˆ˜: {total_pages}")
            if total_pages == 0:
                raise ValueError("PDF ํŒŒ์ผ์— ํŽ˜์ด์ง€๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")

            original_pdf_path = project_dir / "original.pdf"
            with open(original_pdf_path, "wb") as f:
                f.write(pdf_bytes)
            logger.info(f"PDF ์›๋ณธ ์ €์žฅ ์™„๋ฃŒ: {original_pdf_path}")

            if max_workers is None:
                max_workers = min(os.cpu_count() or 4, 4)
            logger.info(f"๋ณ‘๋ ฌ ๋ณ€ํ™˜ ์‹œ์ž‘: {max_workers}๊ฐœ ์›Œ์ปค ์‚ฌ์šฉ")

            def convert_single_page(page_index: int) -> Dict[str, Any]:
                """Render one page with a document private to this call."""
                page_number = start_page_number + page_index
                try:
                    temp_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
                    try:
                        return self._render_page(
                            temp_doc,
                            page_index,
                            page_number,
                            total_pages,
                            project_id,
                            project_dir,
                        )
                    finally:
                        # Close the per-thread document on failure as well.
                        temp_doc.close()
                except Exception as e:
                    logger.error(f"ํŽ˜์ด์ง€ {page_index + 1} ๋ณ‘๋ ฌ ๋ณ€ํ™˜ ์‹คํŒจ: {str(e)}")
                    raise ValueError(
                        f"PDF ํŽ˜์ด์ง€ {page_index + 1} ๋ณ€ํ™˜ ์‹คํŒจ: {str(e)}"
                    ) from e

            first_error = None
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_page = {
                    executor.submit(convert_single_page, i): i
                    for i in range(total_pages)
                }
                # Drain every future, even after a failure, so that pages
                # finished by still-running workers are recorded and can be
                # removed by the rollback.
                for future in as_completed(future_to_page):
                    if future.cancelled():
                        continue
                    page_index = future_to_page[future]
                    try:
                        converted_pages.append(future.result())
                    except Exception as e:
                        logger.error(f"ํŽ˜์ด์ง€ {page_index + 1} ์ฒ˜๋ฆฌ ์‹คํŒจ: {str(e)}")
                        if first_error is None:
                            first_error = e
                            # Stop work that has not started yet.
                            for pending in future_to_page:
                                pending.cancel()

            if first_error is not None:
                # All workers have stopped; the outer handler rolls back.
                raise first_error

            # Results arrive in completion order; restore page order.
            converted_pages.sort(key=lambda x: x['page_number'])
            logger.info(
                f"PDF ๋ณ‘๋ ฌ ๋ณ€ํ™˜ ์™„๋ฃŒ - ProjectID: {project_id}, "
                f"์ด {len(converted_pages)}๊ฐœ ํŽ˜์ด์ง€ ๋ณ€ํ™˜"
            )
            return converted_pages
        except fitz.FileDataError as e:
            logger.error(f"PDF ํŒŒ์ผ ์˜ค๋ฅ˜: {str(e)}")
            raise ValueError(
                f"PDF ํŒŒ์ผ์ด ์†์ƒ๋˜์—ˆ๊ฑฐ๋‚˜ ์ฝ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {str(e)}"
            ) from e
        except Exception as e:
            logger.error(f"PDF ๋ณ‘๋ ฌ ๋ณ€ํ™˜ ์ค‘ ์˜ˆ์ƒ์น˜ ๋ชปํ•œ ์˜ค๋ฅ˜: {str(e)}")
            if converted_pages:
                self._rollback_conversion(converted_pages)
            raise
        finally:
            if pdf_document is not None:
                pdf_document.close()

    def _rollback_conversion(self, converted_pages: List[Dict[str, Any]]) -> None:
        """Delete the image files listed in ``converted_pages``.

        Best effort: a file that cannot be removed is logged and skipped so
        the remaining files are still processed.

        Args:
            converted_pages: Page dictionaries whose ``full_path`` entries
                name the files to delete.
        """
        logger.warning(f"๋ณ€ํ™˜ ๋กค๋ฐฑ ์‹œ์ž‘ - {len(converted_pages)}๊ฐœ ํŒŒ์ผ ์‚ญ์ œ")
        for page_info in converted_pages:
            # Bound before the try so the error log below can always use it.
            full_path = None
            try:
                full_path = page_info.get('full_path')
                if full_path and os.path.exists(full_path):
                    os.remove(full_path)
                    logger.debug(f"ํŒŒ์ผ ์‚ญ์ œ: {full_path}")
            except Exception as e:
                logger.error(f"๋กค๋ฐฑ ์ค‘ ํŒŒ์ผ ์‚ญ์ œ ์‹คํŒจ: {full_path}, ์˜ค๋ฅ˜: {str(e)}")
        logger.info("๋ณ€ํ™˜ ๋กค๋ฐฑ ์™„๋ฃŒ")

    def get_pdf_info(self, pdf_bytes: bytes) -> Dict[str, Any]:
        """Extract the page count and document metadata from a PDF.

        Args:
            pdf_bytes: Raw bytes of the PDF file.

        Returns:
            A dictionary with the keys ``total_pages``, ``title``,
            ``author``, ``subject``, ``creator``, ``producer`` and
            ``creation_date``.

        Raises:
            ValueError: The PDF cannot be opened or its metadata cannot be
                read.
        """
        try:
            pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                # Tolerate a document that exposes no metadata mapping.
                metadata = pdf_document.metadata or {}
                info = {
                    'total_pages': len(pdf_document),
                    'title': metadata.get('title', ''),
                    'author': metadata.get('author', ''),
                    'subject': metadata.get('subject', ''),
                    'creator': metadata.get('creator', ''),
                    'producer': metadata.get('producer', ''),
                    'creation_date': metadata.get('creationDate', '')
                }
            finally:
                # Release the document even when reading the metadata fails.
                pdf_document.close()
            logger.debug(f"PDF ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ถ”์ถœ ์™„๋ฃŒ: {info}")
            return info
        except Exception as e:
            logger.error(f"PDF ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ถ”์ถœ ์‹คํŒจ: {str(e)}")
            raise ValueError(
                f"PDF ํŒŒ์ผ ์ •๋ณด๋ฅผ ์ฝ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {str(e)}"
            ) from e
# Shared module-level processor instance. The storage root is read from the
# UPLOAD_DIR environment variable and falls back to "uploads".
UPLOAD_ROOT = os.environ.get("UPLOAD_DIR", "uploads")
pdf_processor = PDFProcessor(upload_directory=UPLOAD_ROOT)