Spaces:

Pier-Jean
/

docling-studio

Running

App Files Files Community

docling-studio / document-parser /services /document_service.py

Pier-Jean

Upload folder using huggingface_hub

cc59214 verified about 1 month ago

raw

history blame contribute delete

4.25 kB

	"""Document service — file upload, storage, and preview orchestration."""

	from __future__ import annotations

	import io
	import logging
	import os
	import uuid

	from pdf2image import convert_from_bytes, pdfinfo_from_bytes

	from domain.models import Document
	from infra.settings import settings
	from persistence import analysis_repo, document_repo

	logger = logging.getLogger(__name__)

	UPLOAD_DIR = settings.upload_dir
	MAX_FILE_SIZE = 5 * 1024 * 1024 # 5 MB
	MAX_PAGE_COUNT = settings.max_page_count # 0 = unlimited


	# PDF magic bytes: %PDF
	_PDF_MAGIC = b"%PDF"


	_UPLOAD_CHUNK_SIZE = 64 * 1024 # 64 KB chunks for streaming writes


	async def upload(filename: str, content_type: str, file_content: bytes) -> Document:
	"""Save uploaded file to disk and persist metadata.

	Writes the file in fixed-size chunks to keep peak memory usage low.
	"""
	if len(file_content) > MAX_FILE_SIZE:
	raise ValueError("File too large (max 5 MB)")

	if not file_content[:4].startswith(_PDF_MAGIC):
	raise ValueError("Invalid file: not a PDF document")

	os.makedirs(UPLOAD_DIR, exist_ok=True)

	ext = ".pdf" # Content already validated as PDF
	safe_name = f"{uuid.uuid4()}{ext}"
	file_path = os.path.join(UPLOAD_DIR, safe_name)

	# Write in chunks to avoid doubling memory usage for large files
	with open(file_path, "wb") as f:
	for offset in range(0, len(file_content), _UPLOAD_CHUNK_SIZE):
	f.write(file_content[offset : offset + _UPLOAD_CHUNK_SIZE])

	# Count PDF pages
	page_count = _count_pages(file_content)

	if MAX_PAGE_COUNT > 0 and page_count is not None and page_count > MAX_PAGE_COUNT:
	os.unlink(file_path)
	raise ValueError(f"Too many pages ({page_count}). Maximum allowed: {MAX_PAGE_COUNT}")

	doc = Document(
	filename=filename,
	content_type=content_type,
	file_size=len(file_content),
	page_count=page_count,
	storage_path=os.path.abspath(file_path),
	)
	await document_repo.insert(doc)
	return doc


	async def find_all() -> list[Document]:
	"""Return all documents, newest first."""
	return await document_repo.find_all()


	async def find_by_id(doc_id: str) -> Document \| None:
	"""Find a document by its ID, or return None."""
	return await document_repo.find_by_id(doc_id)


	async def delete(doc_id: str) -> bool:
	"""Delete document file, associated analyses, and database record."""
	doc = await document_repo.find_by_id(doc_id)
	if not doc:
	return False

	# Delete associated analyses first (cascade)
	await analysis_repo.delete_by_document(doc_id)

	# Delete file from disk (only if inside UPLOAD_DIR)
	try:
	real_path = os.path.realpath(doc.storage_path)
	real_upload_dir = os.path.realpath(UPLOAD_DIR)
	if real_path.startswith(real_upload_dir + os.sep) and os.path.exists(real_path):
	os.unlink(real_path)
	elif os.path.exists(doc.storage_path):
	logger.warning("Refused to delete file outside upload dir: %s", doc.storage_path)
	except FileNotFoundError:
	logger.info("File already removed: %s", doc.storage_path)
	except PermissionError:
	logger.error("Permission denied deleting file: %s", doc.storage_path)
	except OSError:
	logger.warning("Could not delete file: %s", doc.storage_path, exc_info=True)

	return await document_repo.delete(doc_id)


	def generate_preview(file_content: bytes, page: int = 1, dpi: int = 150) -> bytes:
	"""Generate a PNG preview of a specific PDF page."""
	images = convert_from_bytes(file_content, first_page=page, last_page=page, dpi=dpi)
	if not images:
	raise ValueError(f"Page {page} not found")

	buf = io.BytesIO()
	images[0].save(buf, format="PNG")
	return buf.getvalue()


	def _count_pages(file_content: bytes) -> int \| None:
	"""Count PDF pages using poppler via pdf2image."""
	try:
	info = pdfinfo_from_bytes(file_content)
	return info.get("Pages")
	except (FileNotFoundError, OSError) as exc:
	logger.warning("Could not count pages: %s", exc)
	return None
	except Exception:
	logger.warning("Unexpected error counting pages", exc_info=True)
	return None