"""
Docling Document Processor - Modern Redesigned UI

A clean, mobile-first interface for document processing with AI.
"""

import os
import sys
import time
import traceback
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

import gradio as gr

# Conditional import of `spaces` (ZeroGPU); HAS_SPACES records whether it exists.
try:
    import spaces
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

# Put this file's directory on sys.path so the local imports below resolve.
sys.path.insert(0, str(Path(__file__).parent))

import config
from utils.validators import validate_files, ValidationError
from utils.file_handler import (
    create_temp_directory,
    cleanup_old_files,
    create_zip_output,
    save_output_file,
)
from utils.logger import setup_logger, get_logger
from processors.docling_processor import DoclingProcessor
from processors.json_formatter import format_to_json, JSONFormatter
from processors.markdown_formatter import format_to_markdown, MarkdownFormatter

# Module-wide logger.
logger = setup_logger("docling_space")


# =============================================================================
# RATE LIMITING (in-memory)
# =============================================================================

# Maps a client key (IP address or "session_<hash>") to the timestamps of its
# accepted requests. NOTE(review): keys of clients that never come back are
# never removed, so this grows for the lifetime of the process.
_rate_limit_store: dict[str, list[datetime]] = defaultdict(list)


def _client_key(request: gr.Request) -> Optional[str]:
    """Return the key used to bucket *request* for rate limiting, or None.

    Lookup order: first entry of the ``x-forwarded-for`` header, the
    ``x-real-ip`` header, ``request.client`` (its ``host`` when available),
    and finally ``session_`` plus the first 16 characters of
    ``request.session_hash``. ``None`` means no identifier could be derived.
    """
    # NOTE(review): both headers are client-controllable unless a trusted
    # proxy overwrites them - confirm the deployment sits behind one.
    ip = None
    if hasattr(request, "headers"):
        headers = request.headers or {}
        ip = headers.get("x-forwarded-for", "").split(",")[0].strip()
        if not ip:
            ip = headers.get("x-real-ip", "").strip()

    if not ip:
        client_info = getattr(request, "client", None)
        if client_info:
            if isinstance(client_info, dict):
                ip = client_info.get("host", "")
            elif hasattr(client_info, "host"):
                ip = getattr(client_info, "host", "")
            else:
                ip = str(client_info)

    if not ip or ip == "unknown":
        session_hash = getattr(request, "session_hash", None)
        if not session_hash:
            return None
        ip = f"session_{session_hash[:16]}"

    return ip


def check_rate_limit(request: gr.Request) -> bool:
    """Check whether the caller is still within its request quota.

    Args:
        request: The Gradio request, or ``None``.

    Returns:
        ``True`` when the request may proceed (it is then recorded), and
        ``False`` when the caller already has ``config.RATE_LIMIT_REQUESTS``
        requests inside the last ``config.RATE_LIMIT_WINDOW_HOURS`` hours.
        Requests that cannot be attributed to any client (``None`` request,
        or no IP and no session hash) are always allowed.
    """
    if request is None:
        return True

    key = _client_key(request)
    if key is None:
        return True

    now = datetime.now()
    window_start = now - timedelta(hours=config.RATE_LIMIT_WINDOW_HOURS)

    # Drop timestamps that have fallen out of the sliding window.
    _rate_limit_store[key] = [
        ts for ts in _rate_limit_store[key] if ts > window_start
    ]

    if len(_rate_limit_store[key]) >= config.RATE_LIMIT_REQUESTS:
        logger.warning(f"Rate limit excedido para IP: {key}")
        return False

    _rate_limit_store[key].append(now)
    return True


# =============================================================================
# MAIN PROCESSING FUNCTION
# =============================================================================

def _report_progress(
    progress: Optional[gr.Progress], fraction: float, desc: str
) -> None:
    """Forward a progress update when a tracker was supplied."""
    # Compare against None explicitly: the question is "was a tracker passed",
    # and the truthiness of a gr.Progress object is derived from its length
    # protocol rather than from its presence.
    if progress is not None:
        progress(fraction, desc=desc)


def _unique_stem(stem: str, used: set[str]) -> str:
    """Return *stem*, suffixed with ``_2``, ``_3``, ... if already in *used*."""
    candidate = stem
    counter = 1
    while candidate in used:
        counter += 1
        candidate = f"{stem}_{counter}"
    used.add(candidate)
    return candidate


def _render_outputs(
    processed_data, sanitized_name: str, base_name: str, output_format: str
) -> list[tuple[str, str]]:
    """Render one processed document into ``(filename, content)`` pairs.

    ``"JSON"`` yields only the JSON file, ``"Markdown"`` only the Markdown
    file, and any other value yields both (JSON first).
    """
    rendered = []
    if output_format != "Markdown":
        rendered.append(
            (f"{base_name}.json", format_to_json(processed_data, sanitized_name))
        )
    if output_format != "JSON":
        rendered.append((f"{base_name}.md", format_to_markdown(processed_data)))
    return rendered


def _process_documents_internal(
    files: list,
    output_format: str,
    progress: Optional[gr.Progress] = None,
) -> tuple[str | list[str], str]:
    """Internal processing function (without the GPU decorator).

    Args:
        files: Uploaded files, passed to ``validate_files``.
        output_format: ``"JSON"``, ``"Markdown"``, or any other value for both.
        progress: Optional Gradio progress tracker.

    Returns:
        ``(path, status_markdown)``. ``path`` is the single output file, or a
        ZIP archive when more than one output file was produced or the format
        is ``"Ambos"``.

    Raises:
        gr.Error: On validation failure, when the only submitted file fails,
            or when no file at all could be processed.
    """
    start_time = time.time()
    cleanup_old_files()

    _report_progress(progress, 0.1, "🔍 Validating files...")
    try:
        validated_files = validate_files(files)
    except ValidationError as e:
        logger.warning(f"Erro de validação: {e.message}")
        raise gr.Error(e.message) from e

    _report_progress(progress, 0.2, "⚡ Initializing Docling...")
    processor = DoclingProcessor(
        enable_ocr=True,
        enable_table_detection=True,
        use_gpu=HAS_SPACES,
    )

    output_dir = create_temp_directory(prefix="output_")
    output_files = []
    failed_names = []
    used_stems: set[str] = set()
    processed_count = 0
    total_files = len(validated_files)

    for i, (file_path, sanitized_name) in enumerate(validated_files):
        # The per-file loop covers the 20%..80% band of the progress bar.
        _report_progress(
            progress,
            0.2 + (0.6 * (i / total_files)),
            f"📄 Processing {sanitized_name}...",
        )

        try:
            processed_data = processor.process_document(file_path)

            # Uploads that differ only by extension (report.pdf, report.docx)
            # share a stem; make it unique so one output cannot overwrite the
            # other and the ZIP gets no duplicate entry names.
            base_name = _unique_stem(Path(sanitized_name).stem, used_stems)

            # Render everything before saving anything, so a formatter error
            # leaves no partial output registered for this document.
            rendered = _render_outputs(
                processed_data, sanitized_name, base_name, output_format
            )
            saved = [
                (save_output_file(content, filename, output_dir), filename)
                for filename, content in rendered
            ]
            output_files.extend(saved)

            processed_count += 1
            logger.info(f"Processado: {sanitized_name}")

        except Exception as e:
            logger.error(f"Erro ao processar {sanitized_name}: {e}")
            logger.debug(traceback.format_exc())
            # A single-file request fails loudly; in a batch the remaining
            # files are still processed and the failure is reported below.
            if total_files == 1:
                raise gr.Error(
                    f"❌ Erro ao processar {sanitized_name}: {str(e)}"
                ) from e
            failed_names.append(sanitized_name)

    _report_progress(progress, 0.9, "📦 Preparing download...")

    if not output_files:
        raise gr.Error("❌ Nenhum arquivo foi processado com sucesso.")

    if len(output_files) > 1 or output_format == "Ambos":
        zip_path = create_zip_output(
            output_files, output_name="documentos_processados"
        )
        final_output = str(zip_path)
    else:
        final_output = str(output_files[0][0])

    elapsed_time = time.time() - start_time
    _report_progress(progress, 1.0, "✅ Complete!")

    # Two trailing spaces before "\n" form a Markdown hard line break.
    status_msg = (
        "### ✅ Processing Complete!\n\n"
        f"**Files processed:** {processed_count}/{total_files}  \n"
        f"**Format:** {output_format}  \n"
        f"**Time:** {elapsed_time:.1f}s"
    )
    if failed_names:
        status_msg += f"  \n**Failed:** {', '.join(failed_names)}"

    logger.info(
        f"Batch concluído: {processed_count}/{total_files} arquivos, "
        f"{elapsed_time:.1f}s, formato={output_format}"
    )

    return final_output, status_msg


# GPU version (only defined when the `spaces` package is importable).
if HAS_SPACES:
    @spaces.GPU(duration=config.GPU_TIMEOUT_SECONDS)
    def process_documents_gpu(
        files: list,
        output_format: str,
        progress: gr.Progress = gr.Progress(),
    ) -> tuple[str | list[str], str]:
        """Run the processing pipeline under the ``spaces.GPU`` decorator."""
        return _process_documents_internal(files, output_format, progress)
else:
    process_documents_gpu = None


def process_documents(
    files: list,
    output_format: str,
    request: gr.Request,
    progress: gr.Progress = gr.Progress(),
) -> tuple[str | list[str], str]:
    """Main processing entry point wired to the "Process Documents" button.

    Applies the rate limit, dispatches to the GPU-decorated function when it
    exists (otherwise to the plain one), and converts any failure into a
    ``gr.Error``.

    Args:
        files: Uploaded files from the file input.
        output_format: Value of the output-format radio selector.
        request: The current Gradio request, used for rate limiting.
        progress: Gradio progress tracker.

    Returns:
        ``(download_path, status_markdown)``.

    Raises:
        gr.Error: On rate limiting, timeout, memory exhaustion, or any other
            processing failure.
    """
    if not check_rate_limit(request):
        # Derive the period from the configured window instead of assuming
        # one hour; the text is unchanged when the window is exactly 1.
        window_hours = config.RATE_LIMIT_WINDOW_HOURS
        period = "hour" if window_hours == 1 else f"{window_hours} hours"
        raise gr.Error(
            "⚠️ Rate limit exceeded. "
            f"Maximum: {config.RATE_LIMIT_REQUESTS} requests per {period}. "
            "Please try again later."
        )

    try:
        if HAS_SPACES and process_documents_gpu is not None:
            logger.info("Usando processamento GPU (ZeroGPU)")
            return process_documents_gpu(files, output_format, progress)
        logger.info("Usando processamento CPU (fallback)")
        return _process_documents_internal(files, output_format, progress)

    except gr.Error:
        # Already user-facing; pass through untouched.
        raise
    except TimeoutError as e:
        logger.error("Timeout no processamento")
        raise gr.Error(
            "⏱️ Time limit exceeded. Try with smaller or fewer files."
        ) from e
    except MemoryError as e:
        logger.error("Memória insuficiente")
        raise gr.Error(
            "💾 Insufficient memory. Try with smaller files."
        ) from e
    except Exception as e:
        logger.error(f"Erro inesperado: {e}")
        logger.debug(traceback.format_exc())
        raise gr.Error(f"❌ Unexpected error: {str(e)}") from e


# =============================================================================
# GRADIO INTERFACE - MODERN REDESIGN
# =============================================================================

# Footer link shared by both launch configurations.
_ANYCODER_LINK = {
    "label": "Built with anycoder",
    "url": "https://huggingface.co/spaces/akhaliq/anycoder",
}

# Stylesheet passed to launch(); selectors match the elem_classes used in
# create_interface().
CUSTOM_CSS = """
/* Mobile-First Responsive Design */
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
    padding: 1rem !important;
}

/* Header Styling */
.header-text h1 {
    font-size: 2rem !important;
    font-weight: 700 !important;
    margin-bottom: 0.5rem !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}

.header-text p {
    font-size: 1.1rem !important;
    color: #64748b !important;
    line-height: 1.6 !important;
}

/* Upload Area */
.upload-area {
    border: 2px dashed #cbd5e1 !important;
    border-radius: 12px !important;
    transition: all 0.3s ease !important;
}

.upload-area:hover {
    border-color: #667eea !important;
    background: #f8fafc !important;
}

/* Format Selector */
.format-selector label {
    font-weight: 500 !important;
    margin-bottom: 0.5rem !important;
}

/* Process Button */
.process-button {
    margin-top: 1rem !important;
    font-size: 1.1rem !important;
    padding: 0.75rem 2rem !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
    box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1) !important;
    transition: all 0.3s ease !important;
}

.process-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 10px 15px -3px rgb(0 0 0 / 0.2) !important;
}

/* Status Output */
.status-output {
    background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%) !important;
    border-left: 4px solid #0ea5e9 !important;
    padding: 1rem !important;
    border-radius: 8px !important;
    margin-top: 1rem !important;
}

/* Download Area */
.download-area {
    margin-top: 1rem !important;
    border-radius: 8px !important;
}

/* Footer */
.footer-text {
    opacity: 0.8 !important;
}

.footer-text a {
    color: #667eea !important;
    text-decoration: none !important;
    font-weight: 500 !important;
}

.footer-text a:hover {
    text-decoration: underline !important;
}

/* Accordion Styling */
.accordion {
    margin-top: 1rem !important;
}

/* Mobile Responsiveness */
@media (max-width: 768px) {
    .gradio-container {
        padding: 0.5rem !important;
    }

    .header-text h1 {
        font-size: 1.5rem !important;
    }

    .header-text p {
        font-size: 1rem !important;
    }

    .process-button {
        width: 100% !important;
        font-size: 1rem !important;
    }
}

/* Dark Mode Support */
@media (prefers-color-scheme: dark) {
    .upload-area {
        border-color: #475569 !important;
    }

    .upload-area:hover {
        background: #1e293b !important;
    }

    .status-output {
        background: linear-gradient(135deg, #1e293b 0%, #334155 100%) !important;
    }
}
"""


def create_interface() -> gr.Blocks:
    """Creates a modern, mobile-first Gradio interface.

    Returns:
        The assembled ``gr.Blocks`` app: header, upload/format/process
        controls, a usage accordion, status and download outputs, and the
        event wiring to ``process_documents``.
    """
    with gr.Blocks(
        title="📄 Docling Processor",
        fill_height=True,
    ) as demo:

        # Header Section
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(
                    """
                    # 📄 Docling Document Processor

                    Transform PDF, DOC, and DOCX files into structured formats using AI.

                    Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
                    """,
                    elem_classes=["header-text"],
                )

        gr.Markdown("---")

        # Main Content Area
        with gr.Row():
            with gr.Column(scale=1):
                # Upload Section
                file_input = gr.File(
                    file_count="multiple",
                    file_types=[".pdf", ".doc", ".docx"],
                    label="📁 Upload Documents",
                    height=200,
                    elem_classes=["upload-area"],
                )

                # Format Selector
                format_selector = gr.Radio(
                    choices=config.OUTPUT_FORMATS,
                    value="Markdown",
                    label="📤 Output Format",
                    info="Choose your preferred output format",
                    elem_classes=["format-selector"],
                )

                # Process Button
                process_btn = gr.Button(
                    "🚀 Process Documents",
                    variant="primary",
                    size="lg",
                    elem_classes=["process-button"],
                )

                # Info Box
                with gr.Accordion("ℹ️ How to Use", open=False):
                    gr.Markdown(
                        """
                        ### Quick Start
                        1. **Upload** your documents (max 5 files, 50MB each)
                        2. **Select** output format (JSON, Markdown, or both)
                        3. **Click** Process Documents
                        4. **Download** your results

                        ### Features
                        - 🔍 Smart text, table & metadata extraction
                        - 🌐 Automatic language detection
                        - 🚀 GPU acceleration for fast processing
                        - 📊 Preserves document structure

                        ### Supported Formats
                        **Input:** PDF, DOC, DOCX

                        **Output:** JSON, Markdown, or ZIP (both)
                        """
                    )

        # Results Section
        gr.Markdown("---")

        with gr.Row():
            with gr.Column(scale=1):
                # Status Output
                status_output = gr.Markdown(
                    label="Status",
                    elem_classes=["status-output"],
                )

                # File Download
                file_output = gr.File(
                    label="📥 Download Results",
                    interactive=False,
                    elem_classes=["download-area"],
                )

        # Footer
        gr.Markdown("---")
        # NOTE(review): the footer body is blank in the source as received;
        # the attribution link is supplied through launch(footer_links=...).
        gr.Markdown(
            """
            """,
            elem_classes=["footer-text"],
        )

        # Event Handlers
        # `request` and `progress` are not listed in `inputs`; they are
        # expected to be supplied by Gradio based on the handler's signature.
        process_btn.click(
            fn=process_documents,
            inputs=[file_input, format_selector],
            outputs=[file_output, status_output],
            show_progress="full",
        )

        # Clear status when new files are selected
        file_input.change(
            fn=lambda: ("", None),
            outputs=[status_output, file_output],
        )

    return demo


# =============================================================================
# ENTRY POINT
# =============================================================================

if __name__ == "__main__":
    # Create the required directories.
    config.TEMP_DIR.mkdir(parents=True, exist_ok=True)
    config.LOGS_DIR.mkdir(parents=True, exist_ok=True)

    # Remove stale temporary files.
    cleanup_old_files()

    logger.info("Iniciando Docling Document Processor...")
    logger.info(f"ZeroGPU disponível: {HAS_SPACES}")

    # Build and launch the interface.
    demo = create_interface()

    # Detect a containerized environment (HF Spaces).
    # NOTE(review): this also enables `share` wherever the `spaces` package
    # merely happens to be installed - confirm a public share link is wanted
    # in that case.
    is_containerized = HAS_SPACES or os.environ.get("SPACE_ID") is not None

    try:
        demo.queue().launch(
            server_name="0.0.0.0",
            server_port=7860,
            max_file_size=f"{config.MAX_FILE_SIZE_MB}mb",
            show_error=True,
            share=is_containerized,
            theme=gr.themes.Soft(
                primary_hue="blue",
                secondary_hue="indigo",
                neutral_hue="slate",
                font=gr.themes.GoogleFont("Inter"),
                text_size="lg",
                spacing_size="lg",
                radius_size="md",
            ).set(
                button_primary_background_fill="*primary_600",
                button_primary_background_fill_hover="*primary_700",
                button_primary_text_color="white",
                block_title_text_weight="600",
                block_label_text_weight="500",
            ),
            css=CUSTOM_CSS,
            footer_links=[_ANYCODER_LINK, "gradio", "api"],
        )
    except Exception as e:
        logger.error(f"Erro ao iniciar aplicação: {e}")
        logger.info("Tentando iniciar com configuração alternativa...")

        # Retry with a reduced configuration (default Soft theme, no custom
        # CSS) in case the full one is what failed.
        try:
            demo.queue().launch(
                server_name="0.0.0.0",
                server_port=7860,
                max_file_size=f"{config.MAX_FILE_SIZE_MB}mb",
                show_error=True,
                share=True,
                theme=gr.themes.Soft(primary_hue="blue"),
                footer_links=[_ANYCODER_LINK],
            )
        except Exception as fallback_error:
            logger.critical(f"Falha crítica ao iniciar: {fallback_error}")
            raise