# NOTE(review): removed a paste artifact ("Spaces: / Runtime error / Runtime
# error") — this was the Hugging Face Spaces error-page banner captured along
# with the file, not part of the source code.
"""
Docling Document Processor - Modern Redesigned UI
A clean, mobile-first interface for document processing with AI.
"""
import os
import sys
import time
import traceback
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

import gradio as gr

# Conditional import of `spaces` for ZeroGPU (only present on HF Spaces)
try:
    import spaces
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

# Add the current directory to sys.path so the local packages resolve
sys.path.insert(0, str(Path(__file__).parent))
import config
from utils.validators import validate_files, ValidationError
from utils.file_handler import (
    create_temp_directory,
    cleanup_old_files,
    create_zip_output,
    save_output_file,
)
from utils.logger import setup_logger, get_logger
from processors.docling_processor import DoclingProcessor
from processors.json_formatter import format_to_json, JSONFormatter
from processors.markdown_formatter import format_to_markdown, MarkdownFormatter

# Module-wide logger
logger = setup_logger("docling_space")
# =============================================================================
# RATE LIMITING (in-memory)
# =============================================================================
# Maps a client identifier (IP or session hash) to the timestamps of its
# recent requests within the sliding window.
_rate_limit_store: dict[str, list[datetime]] = defaultdict(list)


def _identify_client(request: gr.Request) -> Optional[str]:
    """Best-effort client identifier for rate limiting, or None if unknown.

    Preference order: x-forwarded-for (first hop) -> x-real-ip ->
    transport-level client address -> Gradio session hash.
    """
    ip = None
    if hasattr(request, "headers"):
        headers = request.headers or {}
        # Behind a proxy, x-forwarded-for holds "client, proxy1, proxy2".
        ip = headers.get("x-forwarded-for", "").split(",")[0].strip()
        if not ip:
            ip = headers.get("x-real-ip", "").strip()
    if not ip:
        client_info = getattr(request, "client", None)
        if client_info:
            # `client` may be a dict, an object with .host, or something else
            # depending on the server stack — handle all three shapes.
            if isinstance(client_info, dict):
                ip = client_info.get("host", "")
            elif hasattr(client_info, "host"):
                ip = getattr(client_info, "host", "")
            else:
                ip = str(client_info)
    if not ip or ip == "unknown":
        session_hash = getattr(request, "session_hash", None)
        if session_hash:
            return f"session_{session_hash[:16]}"
        return None
    return ip


def check_rate_limit(request: gr.Request) -> bool:
    """Return True if this request is within the per-client rate limit.

    Requests are counted per client in a sliding window of
    ``config.RATE_LIMIT_WINDOW_HOURS`` hours, capped at
    ``config.RATE_LIMIT_REQUESTS``. Fails open (returns True) when the
    client cannot be identified at all.
    """
    if request is None:
        return True

    client_id = _identify_client(request)
    if client_id is None:
        # No way to attribute the request to a client — fail open.
        return True

    now = datetime.now()
    window_start = now - timedelta(hours=config.RATE_LIMIT_WINDOW_HOURS)

    # Fix: drop clients whose entire window has expired. Previously only the
    # requesting client's list was pruned, so keys for one-off visitors
    # accumulated forever and memory grew unboundedly in a long-lived process.
    stale_keys = [
        key for key, timestamps in _rate_limit_store.items()
        if not any(ts > window_start for ts in timestamps)
    ]
    for key in stale_keys:
        del _rate_limit_store[key]

    # Keep only this client's requests that are still inside the window.
    recent = [ts for ts in _rate_limit_store[client_id] if ts > window_start]
    _rate_limit_store[client_id] = recent

    if len(recent) >= config.RATE_LIMIT_REQUESTS:
        logger.warning(f"Rate limit excedido para IP: {client_id}")
        return False

    _rate_limit_store[client_id].append(now)
    return True
# =============================================================================
# MAIN PROCESSING FUNCTION
# =============================================================================
def _save_formatted_outputs(
    processed_data,
    sanitized_name: str,
    output_format: str,
    output_dir,
) -> list[tuple]:
    """Serialize one processed document and write the requested file(s).

    Returns a list of (saved_path, display_name) tuples. Any value of
    ``output_format`` other than "JSON" or "Markdown" (i.e. the "Ambos"
    option) produces both files, JSON first — matching the original
    branch semantics.
    """
    base_name = Path(sanitized_name).stem
    saved = []
    want_json = output_format != "Markdown"
    want_markdown = output_format != "JSON"
    if want_json:
        json_content = format_to_json(processed_data, sanitized_name)
        json_path = save_output_file(
            json_content,
            f"{base_name}.json",
            output_dir
        )
        saved.append((json_path, f"{base_name}.json"))
    if want_markdown:
        md_content = format_to_markdown(processed_data)
        md_path = save_output_file(
            md_content,
            f"{base_name}.md",
            output_dir
        )
        saved.append((md_path, f"{base_name}.md"))
    return saved


def _process_documents_internal(
    files: list,
    output_format: str,
    progress: Optional[gr.Progress] = None
) -> tuple[str | list[str], str]:
    """Validate, process and export a batch of documents (no GPU decorator).

    Args:
        files: Uploaded file objects as delivered by gr.File.
        output_format: "JSON", "Markdown", or any other value for both.
        progress: Optional Gradio progress reporter.

    Returns:
        (path to the single result file or the bundled ZIP,
         markdown status message).

    Raises:
        gr.Error: On validation failure, when a lone submitted file fails
            to process, or when no file at all could be processed.
    """
    start_time = time.time()
    cleanup_old_files()

    if progress:
        progress(0.1, desc="🔍 Validating files...")
    try:
        validated_files = validate_files(files)
    except ValidationError as e:
        logger.warning(f"Erro de validação: {e.message}")
        raise gr.Error(e.message)

    if progress:
        progress(0.2, desc="⚡ Initializing Docling...")
    processor = DoclingProcessor(
        enable_ocr=True,
        enable_table_detection=True,
        use_gpu=HAS_SPACES
    )

    output_dir = create_temp_directory(prefix="output_")
    output_files = []
    processed_count = 0
    total_files = len(validated_files)

    for i, (file_path, sanitized_name) in enumerate(validated_files):
        # Processing spans the 0.2-0.8 range of the progress bar.
        progress_pct = 0.2 + (0.6 * (i / total_files))
        if progress:
            progress(progress_pct, desc=f"📄 Processing {sanitized_name}...")
        try:
            processed_data = processor.process_document(file_path)
            output_files.extend(
                _save_formatted_outputs(
                    processed_data, sanitized_name, output_format, output_dir
                )
            )
            processed_count += 1
            logger.info(f"Processado: {sanitized_name}")
        except Exception as e:
            logger.error(f"Erro ao processar {sanitized_name}: {e}")
            logger.debug(traceback.format_exc())
            # Best-effort for batches: one bad file must not abort the rest,
            # but a lone file failing is surfaced as a hard error.
            if total_files == 1:
                raise gr.Error(
                    f"❌ Erro ao processar {sanitized_name}: {str(e)}"
                )

    if progress:
        progress(0.9, desc="📦 Preparing download...")
    if not output_files:
        raise gr.Error("❌ Nenhum arquivo foi processado com sucesso.")

    # Multiple outputs (or the dual-format option) get bundled into a ZIP.
    if len(output_files) > 1 or output_format == "Ambos":
        zip_path = create_zip_output(
            output_files,
            output_name="documentos_processados"
        )
        final_output = str(zip_path)
    else:
        final_output = str(output_files[0][0])

    elapsed_time = time.time() - start_time
    if progress:
        progress(1.0, desc="✅ Complete!")

    status_msg = (
        f"### ✅ Processing Complete!\n\n"
        f"**Files processed:** {processed_count}/{total_files} \n"
        f"**Format:** {output_format} \n"
        f"**Time:** {elapsed_time:.1f}s"
    )
    logger.info(
        f"Batch concluído: {processed_count}/{total_files} arquivos, "
        f"{elapsed_time:.1f}s, formato={output_format}"
    )
    return final_output, status_msg
# GPU-backed variant (only defined when running on HF Spaces with ZeroGPU)
if HAS_SPACES:
    # Fix: the @spaces.GPU decorator was missing. ZeroGPU only allocates a
    # GPU to functions explicitly decorated with spaces.GPU, so despite its
    # name and docstring this previously ran on CPU like the fallback path.
    @spaces.GPU
    def process_documents_gpu(
        files: list,
        output_format: str,
        progress: gr.Progress = gr.Progress()
    ) -> tuple[str | list[str], str]:
        """GPU-accelerated processing via ZeroGPU."""
        return _process_documents_internal(files, output_format, progress)
else:
    # Sentinel so callers can test for GPU availability.
    process_documents_gpu = None
def process_documents(
    files: list,
    output_format: str,
    request: gr.Request,
    progress: gr.Progress = gr.Progress()
) -> tuple[str | list[str], str]:
    """Top-level processing entry point wired to the UI button.

    Enforces the per-client rate limit, dispatches to the GPU-backed
    implementation when available (CPU fallback otherwise), and translates
    low-level failures into user-facing ``gr.Error`` messages.
    """
    if not check_rate_limit(request):
        raise gr.Error(
            f"⚠️ Rate limit exceeded. "
            f"Maximum: {config.RATE_LIMIT_REQUESTS} requests per hour. "
            f"Please try again later."
        )

    gpu_available = HAS_SPACES and process_documents_gpu is not None
    try:
        if gpu_available:
            logger.info("Usando processamento GPU (ZeroGPU)")
            return process_documents_gpu(files, output_format, progress)
        logger.info("Usando processamento CPU (fallback)")
        return _process_documents_internal(files, output_format, progress)
    except gr.Error:
        # Already user-facing — let it propagate untouched.
        raise
    except TimeoutError:
        logger.error("Timeout no processamento")
        raise gr.Error(
            "⏱️ Time limit exceeded. Try with smaller or fewer files."
        )
    except MemoryError:
        logger.error("Memória insuficiente")
        raise gr.Error(
            "💾 Insufficient memory. Try with smaller files."
        )
    except Exception as e:
        logger.error(f"Erro inesperado: {e}")
        logger.debug(traceback.format_exc())
        raise gr.Error(f"❌ Unexpected error: {str(e)}")
# =============================================================================
# GRADIO INTERFACE - MODERN REDESIGN
# =============================================================================
# Stylesheet targeting the elem_classes set on the components below. It must
# be attached to the gr.Blocks constructor — that is where Gradio accepts
# custom CSS.
_CUSTOM_CSS = """
/* Mobile-First Responsive Design */
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
    padding: 1rem !important;
}
/* Header Styling */
.header-text h1 {
    font-size: 2rem !important;
    font-weight: 700 !important;
    margin-bottom: 0.5rem !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}
.header-text p {
    font-size: 1.1rem !important;
    color: #64748b !important;
    line-height: 1.6 !important;
}
/* Upload Area */
.upload-area {
    border: 2px dashed #cbd5e1 !important;
    border-radius: 12px !important;
    transition: all 0.3s ease !important;
}
.upload-area:hover {
    border-color: #667eea !important;
    background: #f8fafc !important;
}
/* Format Selector */
.format-selector label {
    font-weight: 500 !important;
    margin-bottom: 0.5rem !important;
}
/* Process Button */
.process-button {
    margin-top: 1rem !important;
    font-size: 1.1rem !important;
    padding: 0.75rem 2rem !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
    box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1) !important;
    transition: all 0.3s ease !important;
}
.process-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 10px 15px -3px rgb(0 0 0 / 0.2) !important;
}
/* Status Output */
.status-output {
    background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%) !important;
    border-left: 4px solid #0ea5e9 !important;
    padding: 1rem !important;
    border-radius: 8px !important;
    margin-top: 1rem !important;
}
/* Download Area */
.download-area {
    margin-top: 1rem !important;
    border-radius: 8px !important;
}
/* Footer */
.footer-text {
    opacity: 0.8 !important;
}
.footer-text a {
    color: #667eea !important;
    text-decoration: none !important;
    font-weight: 500 !important;
}
.footer-text a:hover {
    text-decoration: underline !important;
}
/* Accordion Styling */
.accordion {
    margin-top: 1rem !important;
}
/* Mobile Responsiveness */
@media (max-width: 768px) {
    .gradio-container {
        padding: 0.5rem !important;
    }
    .header-text h1 {
        font-size: 1.5rem !important;
    }
    .header-text p {
        font-size: 1rem !important;
    }
    .process-button {
        width: 100% !important;
        font-size: 1rem !important;
    }
}
/* Dark Mode Support */
@media (prefers-color-scheme: dark) {
    .upload-area {
        border-color: #475569 !important;
    }
    .upload-area:hover {
        background: #1e293b !important;
    }
    .status-output {
        background: linear-gradient(135deg, #1e293b 0%, #334155 100%) !important;
    }
}
"""


def create_interface() -> gr.Blocks:
    """Creates a modern, mobile-first Gradio interface.

    Fix: the theme and custom CSS are now attached to the gr.Blocks
    constructor. Previously no styling was attached here at all, so the
    elem_classes set on the components had no matching CSS (the stylesheet
    was being passed to launch(), which does not accept it).
    """
    with gr.Blocks(
        title="📄 Docling Processor",
        fill_height=True,
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="indigo",
            neutral_hue="slate",
            font=gr.themes.GoogleFont("Inter"),
            text_size="lg",
            spacing_size="lg",
            radius_size="md"
        ).set(
            button_primary_background_fill="*primary_600",
            button_primary_background_fill_hover="*primary_700",
            button_primary_text_color="white",
            block_title_text_weight="600",
            block_label_text_weight="500",
        ),
        css=_CUSTOM_CSS,
    ) as demo:
        # Header Section
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(
                    """
                    # 📄 Docling Document Processor
                    Transform PDF, DOC, and DOCX files into structured formats using AI.
                    Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
                    """,
                    elem_classes=["header-text"]
                )
        gr.Markdown("---")
        # Main Content Area
        with gr.Row():
            with gr.Column(scale=1):
                # Upload Section
                file_input = gr.File(
                    file_count="multiple",
                    file_types=[".pdf", ".doc", ".docx"],
                    label="📁 Upload Documents",
                    height=200,
                    elem_classes=["upload-area"]
                )
                # Format Selector
                format_selector = gr.Radio(
                    choices=config.OUTPUT_FORMATS,
                    value="Markdown",
                    label="📤 Output Format",
                    info="Choose your preferred output format",
                    elem_classes=["format-selector"]
                )
                # Process Button
                process_btn = gr.Button(
                    "🚀 Process Documents",
                    variant="primary",
                    size="lg",
                    elem_classes=["process-button"]
                )
                # Info Box
                with gr.Accordion("ℹ️ How to Use", open=False):
                    gr.Markdown(
                        """
                        ### Quick Start
                        1. **Upload** your documents (max 5 files, 50MB each)
                        2. **Select** output format (JSON, Markdown, or both)
                        3. **Click** Process Documents
                        4. **Download** your results
                        ### Features
                        - 🔍 Smart text, table & metadata extraction
                        - 🌐 Automatic language detection
                        - 🚀 GPU acceleration for fast processing
                        - 📊 Preserves document structure
                        ### Supported Formats
                        **Input:** PDF, DOC, DOCX
                        **Output:** JSON, Markdown, or ZIP (both)
                        """
                    )
        # Results Section
        gr.Markdown("---")
        with gr.Row():
            with gr.Column(scale=1):
                # Status Output
                status_output = gr.Markdown(
                    label="Status",
                    elem_classes=["status-output"]
                )
                # File Download
                file_output = gr.File(
                    label="📥 Download Results",
                    interactive=False,
                    elem_classes=["download-area"]
                )
        # Footer
        gr.Markdown("---")
        gr.Markdown(
            f"""
            <div style="text-align: center; color: #666; font-size: 0.9em;">
            <p><strong>Limits:</strong> {config.MAX_FILES_PER_SESSION} files per upload |
            {config.MAX_FILE_SIZE_MB}MB per file |
            {config.RATE_LIMIT_REQUESTS} requests/hour</p>
            <p>Powered by <a href="https://github.com/docling-project/docling">Docling</a> •
            Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder">anycoder</a></p>
            </div>
            """,
            elem_classes=["footer-text"]
        )
        # Event Handlers
        # NOTE: process_documents also takes gr.Request and gr.Progress
        # parameters; Gradio injects those automatically, so only the two
        # UI-driven values are listed in `inputs`.
        process_btn.click(
            fn=process_documents,
            inputs=[file_input, format_selector],
            outputs=[file_output, status_output],
            show_progress="full",
        )
        # Clear status when new files are selected
        file_input.change(
            fn=lambda: ("", None),
            outputs=[status_output, file_output],
        )
    return demo
# =============================================================================
# ENTRY POINT
# =============================================================================
if __name__ == "__main__":
    # Create the required working directories
    config.TEMP_DIR.mkdir(parents=True, exist_ok=True)
    config.LOGS_DIR.mkdir(parents=True, exist_ok=True)

    # Purge stale temporary files from previous runs
    cleanup_old_files()

    logger.info("Iniciando Docling Document Processor...")
    logger.info(f"ZeroGPU disponível: {HAS_SPACES}")

    # Build the UI
    demo = create_interface()

    # Detect a containerized environment (HF Spaces sets SPACE_ID)
    is_containerized = HAS_SPACES or os.environ.get("SPACE_ID") is not None

    # Fix: `theme=`, `css=` and `footer_links=` were being passed to
    # launch(). Blocks.launch() accepts none of them (theme/css belong to
    # the gr.Blocks constructor), so launch raised TypeError at startup —
    # and the fallback repeated the same invalid kwargs, making the app
    # fail with a runtime error every time. Both calls now pass only valid
    # launch() parameters.
    try:
        demo.queue().launch(
            server_name="0.0.0.0",
            server_port=7860,
            max_file_size=f"{config.MAX_FILE_SIZE_MB}mb",
            show_error=True,
            # NOTE(review): Gradio ignores share=True inside HF Spaces with
            # a warning; kept for parity with the original behavior when
            # running containerized outside Spaces.
            share=is_containerized,
        )
    except Exception as e:
        logger.error(f"Erro ao iniciar aplicação: {e}")
        logger.info("Tentando iniciar com configuração alternativa...")
        try:
            # Minimal fallback configuration
            demo.queue().launch(
                server_name="0.0.0.0",
                server_port=7860,
                max_file_size=f"{config.MAX_FILE_SIZE_MB}mb",
                show_error=True,
                share=True,
            )
        except Exception as fallback_error:
            logger.critical(f"Falha crítica ao iniciar: {fallback_error}")
            raise