Spaces:

SherlockRamos
/

docling-processor

Runtime error

App Files Files Community

docling-processor / app.py

SherlockRamos

🎨 Redesign from AnyCoder

4f3a6ee verified 8 days ago

raw

history blame

20.3 kB

	"""
	Docling Document Processor - Modern Redesigned UI
	A clean, mobile-first interface for document processing with AI.
	"""

	import os
	import sys
	import time
	import traceback
	from collections import defaultdict
	from datetime import datetime, timedelta
	from pathlib import Path
	from typing import Optional

	import gradio as gr

	# Conditional import of `spaces` (only available when running on ZeroGPU)
	try:
	import spaces
	HAS_SPACES = True
	except ImportError:
	HAS_SPACES = False

	# Add this file's directory to sys.path so the local packages can be imported
	sys.path.insert(0, str(Path(__file__).parent))

	import config
	from utils.validators import validate_files, ValidationError
	from utils.file_handler import (
	create_temp_directory,
	cleanup_old_files,
	create_zip_output,
	save_output_file,
	)
	from utils.logger import setup_logger, get_logger
	from processors.docling_processor import DoclingProcessor
	from processors.json_formatter import format_to_json, JSONFormatter
	from processors.markdown_formatter import format_to_markdown, MarkdownFormatter

	# Configure the module-level logger
	logger = setup_logger("docling_space")

	# =============================================================================
	# RATE LIMITING (in-memory)
	# =============================================================================

	# Maps a client key (an IP address, or a "session_<hash>" fallback as built by
	# check_rate_limit) to the timestamps of that client's recent requests.
	# This is a plain module-level dict, so its contents live only in this process.
	_rate_limit_store: dict[str, list[datetime]] = defaultdict(list)


	def check_rate_limit(request: gr.Request) -> bool:
	    """Tell whether the client behind *request* may issue another request.

	    The client is identified, in order of preference, by the first entry of
	    the ``x-forwarded-for`` header, the ``x-real-ip`` header, the host of
	    ``request.client``, or a prefix of ``request.session_hash``. When no
	    identifier can be derived (or *request* is ``None``) the request is
	    allowed.

	    Args:
	        request: The Gradio request associated with the current event.

	    Returns:
	        ``True`` if the request is allowed (and has been recorded),
	        ``False`` if the client already reached ``config.RATE_LIMIT_REQUESTS``
	        within the last ``config.RATE_LIMIT_WINDOW_HOURS`` hours.
	    """
	    if request is None:
	        return True

	    client_key = None

	    # 1) Proxy headers: first hop of x-forwarded-for, then x-real-ip.
	    if hasattr(request, "headers"):
	        header_map = request.headers or {}
	        forwarded_for = header_map.get("x-forwarded-for", "")
	        client_key = forwarded_for.split(",")[0].strip()
	        if not client_key:
	            client_key = header_map.get("x-real-ip", "").strip()

	    # 2) Direct peer information, which may be a dict, an object exposing
	    #    `.host`, or something else that is simply stringified.
	    if not client_key:
	        peer = getattr(request, "client", None)
	        if peer:
	            if isinstance(peer, dict):
	                client_key = peer.get("host", "")
	            elif hasattr(peer, "host"):
	                client_key = getattr(peer, "host", "")
	            else:
	                client_key = str(peer)

	    # 3) Last resort: key on the session hash; without one, do not limit.
	    if not client_key or client_key == "unknown":
	        session_hash = getattr(request, "session_hash", None)
	        if not session_hash:
	            return True
	        client_key = f"session_{session_hash[:16]}"

	    current_time = datetime.now()
	    cutoff = current_time - timedelta(hours=config.RATE_LIMIT_WINDOW_HOURS)

	    # Keep only the timestamps that still fall inside the sliding window.
	    recent_hits = [
	        stamp for stamp in _rate_limit_store[client_key]
	        if stamp > cutoff
	    ]
	    _rate_limit_store[client_key] = recent_hits

	    if len(recent_hits) >= config.RATE_LIMIT_REQUESTS:
	        logger.warning(f"Rate limit excedido para IP: {client_key}")
	        return False

	    # `recent_hits` is the list stored above, so this records the request.
	    recent_hits.append(current_time)
	    return True


	# =============================================================================
	# MAIN PROCESSING FUNCTION
	# =============================================================================

	def _report_progress(
	    progress: Optional[gr.Progress],
	    fraction: float,
	    desc: str
	) -> None:
	    """Forward a progress update to Gradio when a tracker was supplied."""
	    # Explicit None check instead of `if progress:` so the decision does not
	    # depend on how the tracker object evaluates in a boolean context.
	    # NOTE(review): gr.Progress appears to be a sized iterable, which would
	    # make its truthiness unreliable — confirm against the installed Gradio.
	    if progress is not None:
	        progress(fraction, desc=desc)


	def _unique_stem(stem: str, used_stems: set[str]) -> str:
	    """Return *stem*, suffixed with a counter if already used in this batch."""
	    candidate = stem
	    counter = 2
	    while candidate in used_stems:
	        candidate = f"{stem}_{counter}"
	        counter += 1
	    used_stems.add(candidate)
	    return candidate


	def _process_documents_internal(
	    files: list,
	    output_format: str,
	    progress: Optional[gr.Progress] = None
	) -> tuple[str | list[str], str]:
	    """Validate, convert and package the uploaded documents (no GPU decorator).

	    Args:
	        files: Uploaded files as received from the Gradio file input; they
	            are handed to ``validate_files`` unchanged.
	        output_format: ``"JSON"`` or ``"Markdown"`` for a single format; any
	            other value produces both formats.
	        progress: Optional Gradio progress tracker; updates are skipped when
	            it is ``None``.

	    Returns:
	        A ``(path, status)`` tuple: the path of the single output file, or of
	        a ZIP archive when more than one output file was produced, and a
	        Markdown status message.

	    Raises:
	        gr.Error: If validation fails, if the only submitted file cannot be
	            processed, or if no file was processed successfully.
	        TimeoutError, MemoryError: Propagated unchanged when the only
	            submitted file fails with one of them.
	    """
	    start_time = time.time()
	    cleanup_old_files()

	    _report_progress(progress, 0.1, "🔍 Validating files...")

	    try:
	        validated_files = validate_files(files)
	    except ValidationError as e:
	        logger.warning(f"Erro de validação: {e.message}")
	        raise gr.Error(e.message) from e

	    _report_progress(progress, 0.2, "⚡ Initializing Docling...")

	    processor = DoclingProcessor(
	        enable_ocr=True,
	        enable_table_detection=True,
	        use_gpu=HAS_SPACES
	    )

	    output_dir = create_temp_directory(prefix="output_")
	    output_files = []
	    processed_count = 0
	    total_files = len(validated_files)

	    # Any value other than the two single-format labels means "both".
	    want_json = output_format != "Markdown"
	    want_markdown = output_format != "JSON"

	    # Output names already taken in this batch. Uploads such as report.pdf and
	    # report.docx share a stem and would otherwise write to the same output
	    # name. (validate_files may already guarantee unique names — this is a
	    # cheap safeguard either way.)
	    used_stems: set[str] = set()

	    for i, (file_path, sanitized_name) in enumerate(validated_files):
	        progress_pct = 0.2 + (0.6 * (i / total_files))
	        _report_progress(
	            progress, progress_pct, f"📄 Processing {sanitized_name}..."
	        )

	        try:
	            processed_data = processor.process_document(file_path)
	            base_name = _unique_stem(Path(sanitized_name).stem, used_stems)

	            # Render every requested format before writing anything, so a
	            # formatting failure never leaves a partial result behind.
	            rendered = []
	            if want_json:
	                rendered.append(
	                    (f"{base_name}.json",
	                     format_to_json(processed_data, sanitized_name))
	                )
	            if want_markdown:
	                rendered.append(
	                    (f"{base_name}.md", format_to_markdown(processed_data))
	                )

	            saved = [
	                (save_output_file(content, out_name, output_dir), out_name)
	                for out_name, content in rendered
	            ]
	            output_files.extend(saved)

	            processed_count += 1
	            logger.info(f"Processado: {sanitized_name}")

	        except Exception as e:
	            logger.error(f"Erro ao processar {sanitized_name}: {e}")
	            logger.debug(traceback.format_exc())

	            # In a batch, a failing file is skipped (best effort). With a
	            # single file there is nothing else to try, so fail right away.
	            if total_files == 1:
	                # Let resource-limit errors through untouched so the caller
	                # can show its dedicated timeout / out-of-memory messages
	                # instead of a generic one.
	                if isinstance(e, (TimeoutError, MemoryError)):
	                    raise
	                raise gr.Error(
	                    f"❌ Erro ao processar {sanitized_name}: {str(e)}"
	                ) from e

	    _report_progress(progress, 0.9, "📦 Preparing download...")

	    if not output_files:
	        raise gr.Error("❌ Nenhum arquivo foi processado com sucesso.")

	    if len(output_files) > 1 or output_format == "Ambos":
	        zip_path = create_zip_output(
	            output_files,
	            output_name="documentos_processados"
	        )
	        final_output = str(zip_path)
	    else:
	        final_output = str(output_files[0][0])

	    elapsed_time = time.time() - start_time

	    _report_progress(progress, 1.0, "✅ Complete!")

	    # Two trailing spaces before "\n" form a Markdown hard line break; with a
	    # single space the three lines would be rendered as one paragraph.
	    status_msg = (
	        f"### ✅ Processing Complete!\n\n"
	        f"Files processed: {processed_count}/{total_files}  \n"
	        f"Format: {output_format}  \n"
	        f"Time: {elapsed_time:.1f}s"
	    )

	    logger.info(
	        f"Batch concluído: {processed_count}/{total_files} arquivos, "
	        f"{elapsed_time:.1f}s, formato={output_format}"
	    )

	    return final_output, status_msg


	# GPU variant: only defined when the `spaces` package could be imported.
	if HAS_SPACES:
	    @spaces.GPU(duration=config.GPU_TIMEOUT_SECONDS)
	    def process_documents_gpu(
	        files: list,
	        output_format: str,
	        progress: gr.Progress = gr.Progress()
	    ) -> tuple[str | list[str], str]:
	        """Run the processing pipeline under the ``spaces.GPU`` decorator.

	        Thin wrapper that forwards all arguments to
	        ``_process_documents_internal`` and returns its result unchanged.
	        """
	        return _process_documents_internal(files, output_format, progress)
	else:
	    # Sentinel that callers check to fall back to the undecorated pipeline.
	    process_documents_gpu = None


	def process_documents(
	    files: list,
	    output_format: str,
	    request: gr.Request,
	    progress: gr.Progress = gr.Progress()
	) -> tuple[str | list[str], str]:
	    """Gradio event handler: enforce the rate limit, then process the files.

	    Args:
	        files: Uploaded files from the file input component.
	        output_format: Output format selected in the UI.
	        request: Request of the current event, used for rate limiting.
	        progress: Gradio progress tracker forwarded to the pipeline.

	    Returns:
	        The ``(output path, status markdown)`` tuple produced by the
	        processing pipeline.

	    Raises:
	        gr.Error: When the rate limit is exceeded or processing fails; every
	            other exception is logged and converted to ``gr.Error``.
	    """
	    if not check_rate_limit(request):
	        # The window length is configurable, so build the period from it
	        # rather than always claiming "per hour".
	        window_hours = config.RATE_LIMIT_WINDOW_HOURS
	        period = "hour" if window_hours == 1 else f"{window_hours} hours"
	        raise gr.Error(
	            f"⚠️ Rate limit exceeded. "
	            f"Maximum: {config.RATE_LIMIT_REQUESTS} requests per {period}. "
	            f"Please try again later."
	        )

	    try:
	        if HAS_SPACES and process_documents_gpu is not None:
	            logger.info("Usando processamento GPU (ZeroGPU)")
	            return process_documents_gpu(files, output_format, progress)

	        logger.info("Usando processamento CPU (fallback)")
	        return _process_documents_internal(files, output_format, progress)

	    except gr.Error:
	        # Already a user-facing error: pass it through untouched.
	        raise
	    except TimeoutError as e:
	        logger.error("Timeout no processamento")
	        raise gr.Error(
	            "⏱️ Time limit exceeded. Try with smaller or fewer files."
	        ) from e
	    except MemoryError as e:
	        logger.error("Memória insuficiente")
	        raise gr.Error(
	            "💾 Insufficient memory. Try with smaller files."
	        ) from e
	    except Exception as e:
	        logger.error(f"Erro inesperado: {e}")
	        logger.debug(traceback.format_exc())
	        raise gr.Error(f"❌ Unexpected error: {str(e)}") from e


	# =============================================================================
	# INTERFACE GRADIO - MODERN REDESIGN
	# =============================================================================

	def create_interface() -> gr.Blocks:
	    """Build the Gradio Blocks UI for the document processor.

	    The layout is a single column: header, upload area, output-format
	    selector, process button, a collapsible help section, then the status
	    message, the download component and a footer. Clicking the button runs
	    ``process_documents``; changing the uploaded files clears the previous
	    status and download.

	    Returns:
	        The assembled (not yet launched) ``gr.Blocks`` application.
	    """
	    # Limits shown to the user are read from `config` in both the help text
	    # and the footer so the two can never disagree with the enforced values.
	    max_files = config.MAX_FILES_PER_SESSION
	    max_size_mb = config.MAX_FILE_SIZE_MB
	    window_hours = config.RATE_LIMIT_WINDOW_HOURS
	    rate_period = "hour" if window_hours == 1 else f"{window_hours} hours"

	    with gr.Blocks(
	        title="📄 Docling Processor",
	        fill_height=True,
	    ) as demo:

	        # Header Section
	        with gr.Row():
	            with gr.Column(scale=1):
	                gr.Markdown(
	                    """
	                    # 📄 Docling Document Processor

	                    Transform PDF, DOC, and DOCX files into structured formats using AI.

	                    Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
	                    """,
	                    elem_classes=["header-text"]
	                )

	        gr.Markdown("---")

	        # Main Content Area
	        with gr.Row():
	            with gr.Column(scale=1):

	                # Upload Section
	                file_input = gr.File(
	                    file_count="multiple",
	                    file_types=[".pdf", ".doc", ".docx"],
	                    label="📁 Upload Documents",
	                    height=200,
	                    elem_classes=["upload-area"]
	                )

	                # Format Selector
	                format_selector = gr.Radio(
	                    choices=config.OUTPUT_FORMATS,
	                    value="Markdown",
	                    label="📤 Output Format",
	                    info="Choose your preferred output format",
	                    elem_classes=["format-selector"]
	                )

	                # Process Button
	                process_btn = gr.Button(
	                    "🚀 Process Documents",
	                    variant="primary",
	                    size="lg",
	                    elem_classes=["process-button"]
	                )

	                # Info Box
	                with gr.Accordion("ℹ️ How to Use", open=False):
	                    gr.Markdown(
	                        f"""
	                        ### Quick Start

	                        1. Upload your documents (max {max_files} files, {max_size_mb}MB each)
	                        2. Select output format (JSON, Markdown, or both)
	                        3. Click Process Documents
	                        4. Download your results

	                        ### Features

	                        - 🔍 Smart text, table & metadata extraction
	                        - 🌐 Automatic language detection
	                        - 🚀 GPU acceleration for fast processing
	                        - 📊 Preserves document structure

	                        ### Supported Formats

	                        Input: PDF, DOC, DOCX
	                        Output: JSON, Markdown, or ZIP (both)
	                        """
	                    )

	        # Results Section
	        gr.Markdown("---")

	        with gr.Row():
	            with gr.Column(scale=1):
	                # Status Output
	                status_output = gr.Markdown(
	                    label="Status",
	                    elem_classes=["status-output"]
	                )

	                # File Download
	                file_output = gr.File(
	                    label="📥 Download Results",
	                    interactive=False,
	                    elem_classes=["download-area"]
	                )

	        # Footer
	        gr.Markdown("---")
	        gr.Markdown(
	            f"""
	            <div style="text-align: center; color: #666; font-size: 0.9em;">
	            <p><strong>Limits:</strong> {max_files} files per upload |
	            {max_size_mb}MB per file |
	            {config.RATE_LIMIT_REQUESTS} requests/{rate_period}</p>
	            <p>Powered by <a href="https://github.com/docling-project/docling">Docling</a> •
	            Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder">anycoder</a></p>
	            </div>
	            """,
	            elem_classes=["footer-text"]
	        )

	        # Event Handlers
	        # `request` and `progress` are not listed as inputs: process_documents
	        # declares them via type annotation / default value only.
	        process_btn.click(
	            fn=process_documents,
	            inputs=[file_input, format_selector],
	            outputs=[file_output, status_output],
	            show_progress="full",
	        )

	        # Clear status and previous download when new files are selected
	        file_input.change(
	            fn=lambda: ("", None),
	            outputs=[status_output, file_output],
	        )

	    return demo


	# =============================================================================
	# ENTRY POINT
	# =============================================================================

	if __name__ == "__main__":
	    # Make sure the working directories exist.
	    config.TEMP_DIR.mkdir(parents=True, exist_ok=True)
	    config.LOGS_DIR.mkdir(parents=True, exist_ok=True)

	    # Remove stale temporary files left over from earlier runs.
	    cleanup_old_files()

	    logger.info("Iniciando Docling Document Processor...")
	    logger.info(f"ZeroGPU disponível: {HAS_SPACES}")

	    # Build the interface.
	    demo = create_interface()

	    # Treated as "containerized" (HF Spaces) when the `spaces` package is
	    # importable or the SPACE_ID environment variable is set.
	    is_containerized = HAS_SPACES or "SPACE_ID" in os.environ

	    # Footer entry shared by the primary and the fallback launch.
	    anycoder_link = {
	        "label": "Built with anycoder",
	        "url": "https://huggingface.co/spaces/akhaliq/anycoder",
	    }

	    # Custom stylesheet for the primary launch (kept verbatim).
	    custom_css = """
	/* Mobile-First Responsive Design */
	.gradio-container {
	max-width: 1200px !important;
	margin: 0 auto !important;
	padding: 1rem !important;
	}

	/* Header Styling */
	.header-text h1 {
	font-size: 2rem !important;
	font-weight: 700 !important;
	margin-bottom: 0.5rem !important;
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	}

	.header-text p {
	font-size: 1.1rem !important;
	color: #64748b !important;
	line-height: 1.6 !important;
	}

	/* Upload Area */
	.upload-area {
	border: 2px dashed #cbd5e1 !important;
	border-radius: 12px !important;
	transition: all 0.3s ease !important;
	}

	.upload-area:hover {
	border-color: #667eea !important;
	background: #f8fafc !important;
	}

	/* Format Selector */
	.format-selector label {
	font-weight: 500 !important;
	margin-bottom: 0.5rem !important;
	}

	/* Process Button */
	.process-button {
	margin-top: 1rem !important;
	font-size: 1.1rem !important;
	padding: 0.75rem 2rem !important;
	border-radius: 8px !important;
	font-weight: 600 !important;
	box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1) !important;
	transition: all 0.3s ease !important;
	}

	.process-button:hover {
	transform: translateY(-2px) !important;
	box-shadow: 0 10px 15px -3px rgb(0 0 0 / 0.2) !important;
	}

	/* Status Output */
	.status-output {
	background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%) !important;
	border-left: 4px solid #0ea5e9 !important;
	padding: 1rem !important;
	border-radius: 8px !important;
	margin-top: 1rem !important;
	}

	/* Download Area */
	.download-area {
	margin-top: 1rem !important;
	border-radius: 8px !important;
	}

	/* Footer */
	.footer-text {
	opacity: 0.8 !important;
	}

	.footer-text a {
	color: #667eea !important;
	text-decoration: none !important;
	font-weight: 500 !important;
	}

	.footer-text a:hover {
	text-decoration: underline !important;
	}

	/* Accordion Styling */
	.accordion {
	margin-top: 1rem !important;
	}

	/* Mobile Responsiveness */
	@media (max-width: 768px) {
	.gradio-container {
	padding: 0.5rem !important;
	}

	.header-text h1 {
	font-size: 1.5rem !important;
	}

	.header-text p {
	font-size: 1rem !important;
	}

	.process-button {
	width: 100% !important;
	font-size: 1rem !important;
	}
	}

	/* Dark Mode Support */
	@media (prefers-color-scheme: dark) {
	.upload-area {
	border-color: #475569 !important;
	}

	.upload-area:hover {
	background: #1e293b !important;
	}

	.status-output {
	background: linear-gradient(135deg, #1e293b 0%, #334155 100%) !important;
	}
	}
	"""

	    try:
	        # Primary launch: full theme, custom CSS and all footer links.
	        queued_app = demo.queue()
	        soft_theme = gr.themes.Soft(
	            primary_hue="blue",
	            secondary_hue="indigo",
	            neutral_hue="slate",
	            font=gr.themes.GoogleFont("Inter"),
	            text_size="lg",
	            spacing_size="lg",
	            radius_size="md"
	        ).set(
	            button_primary_background_fill="*primary_600",
	            button_primary_background_fill_hover="*primary_700",
	            button_primary_text_color="white",
	            block_title_text_weight="600",
	            block_label_text_weight="500",
	        )
	        queued_app.launch(
	            server_name="0.0.0.0",
	            server_port=7860,
	            max_file_size=f"{config.MAX_FILE_SIZE_MB}mb",
	            show_error=True,
	            share=is_containerized,
	            theme=soft_theme,
	            css=custom_css,
	            footer_links=[dict(anycoder_link), "gradio", "api"]
	        )
	    except Exception as e:
	        logger.error(f"Erro ao iniciar aplicação: {e}")
	        logger.info("Tentando iniciar com configuração alternativa...")

	        try:
	            # Fallback launch: minimal theme, no custom CSS, sharing forced on.
	            demo.queue().launch(
	                server_name="0.0.0.0",
	                server_port=7860,
	                max_file_size=f"{config.MAX_FILE_SIZE_MB}mb",
	                show_error=True,
	                share=True,
	                theme=gr.themes.Soft(primary_hue="blue"),
	                footer_links=[dict(anycoder_link)]
	            )
	        except Exception as fallback_error:
	            # Both attempts failed: log and let the exception end the process.
	            logger.critical(f"Falha crítica ao iniciar: {fallback_error}")
	            raise