SherlockRamos's picture
🎨 Redesign from AnyCoder
4f3a6ee verified
raw
history blame
20.3 kB
"""
Docling Document Processor - Modern Redesigned UI
A clean, mobile-first interface for document processing with AI.
"""
import os
import sys
import time
import traceback
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional
import gradio as gr
# Conditional import of `spaces` (only needed for ZeroGPU)
try:
import spaces
HAS_SPACES = True
except ImportError:
HAS_SPACES = False
# Add this file's directory to sys.path so the local packages can be imported
sys.path.insert(0, str(Path(__file__).parent))
import config
from utils.validators import validate_files, ValidationError
from utils.file_handler import (
create_temp_directory,
cleanup_old_files,
create_zip_output,
save_output_file,
)
from utils.logger import setup_logger, get_logger
from processors.docling_processor import DoclingProcessor
from processors.json_formatter import format_to_json, JSONFormatter
from processors.markdown_formatter import format_to_markdown, MarkdownFormatter
# Set up the logger
logger = setup_logger("docling_space")
# =============================================================================
# RATE LIMITING (in-memory)
# =============================================================================
# Maps a client key (IP address or "session_<hash>") to the timestamps of the
# requests that client was allowed to make, oldest first.
_rate_limit_store: dict[str, list[datetime]] = defaultdict(list)


def check_rate_limit(request: Optional[gr.Request]) -> bool:
    """Decide whether the caller may issue another request right now.

    The caller is identified by, in order of preference: the first entry of
    the ``x-forwarded-for`` header, the ``x-real-ip`` header, the ``host`` of
    ``request.client``, or a key built from ``request.session_hash``. When
    none of these yields an identifier the request is let through.

    Every allowed call is recorded in the module-level ``_rate_limit_store``.
    Timestamps older than ``config.RATE_LIMIT_WINDOW_HOURS`` are discarded,
    and clients with nothing left inside the window are removed from the
    store so it does not grow without bound.

    Args:
        request: The Gradio request of the current call. ``None`` is accepted
            and always allowed.

    Returns:
        ``True`` when the request is allowed; ``False`` when the caller
        already has ``config.RATE_LIMIT_REQUESTS`` requests in the window.
    """
    if request is None:
        return True

    ip = None
    if hasattr(request, "headers"):
        headers = request.headers or {}
        # NOTE(review): the first x-forwarded-for entry may be client-supplied
        # unless the fronting proxy overwrites the header - confirm for the
        # deployment, otherwise the limit can be sidestepped.
        ip = headers.get("x-forwarded-for", "").split(",")[0].strip()
        if not ip:
            ip = headers.get("x-real-ip", "").strip()

    if not ip:
        client_info = getattr(request, "client", None)
        if client_info:
            if isinstance(client_info, dict):
                ip = client_info.get("host", "")
            elif hasattr(client_info, "host"):
                ip = getattr(client_info, "host", "")
            else:
                ip = str(client_info)

    if not ip or ip == "unknown":
        session_hash = getattr(request, "session_hash", None)
        if session_hash:
            ip = f"session_{session_hash[:16]}"
        else:
            # Nothing to key the limit on: let the request through.
            return True

    now = datetime.now()
    window_start = now - timedelta(hours=config.RATE_LIMIT_WINDOW_HOURS)

    # Forget clients whose newest request has aged out of the window.
    # Timestamps are appended in call order, so the last one is the newest.
    # list() takes a snapshot so the dict is not resized while iterated.
    for key, stamps in list(_rate_limit_store.items()):
        if not stamps or stamps[-1] <= window_start:
            _rate_limit_store.pop(key, None)

    recent = [ts for ts in _rate_limit_store[ip] if ts > window_start]
    _rate_limit_store[ip] = recent

    if len(recent) >= config.RATE_LIMIT_REQUESTS:
        logger.warning(f"Rate limit excedido para IP: {ip}")
        return False

    recent.append(now)
    return True
# =============================================================================
# MAIN PROCESSING FUNCTION
# =============================================================================
def _process_documents_internal(
    files: list,
    output_format: str,
    progress: Optional[gr.Progress] = None
) -> tuple[str | list[str], str]:
    """Validate, convert and package the uploaded documents (no GPU decorator).

    Each validated file is run through ``DoclingProcessor.process_document``
    and written to a fresh temporary output directory as ``<stem>.json``
    and/or ``<stem>.md``. When more than one output file exists (or the
    format is "Ambos") the outputs are handed to ``create_zip_output`` and
    the resulting path is returned; otherwise the single output file path is
    returned.

    A file that fails is logged and skipped, unless it is the only file in
    the batch, in which case the failure is raised to the user.

    Args:
        files: Uploaded files, passed as-is to ``validate_files``.
        output_format: "JSON" or "Markdown"; any other value (the UI sends
            "Ambos") produces both formats.
        progress: Optional progress callback; skipped when ``None``.

    Returns:
        ``(path_to_download, markdown_status_message)``.

    Raises:
        gr.Error: When validation fails, when the only file of the batch
            fails, or when no output file was produced at all.
    """
    start_time = time.time()
    cleanup_old_files()

    def report(fraction: float, message: str) -> None:
        # Compare against None explicitly rather than truth-testing: a
        # progress tracker is a sized/iterable object, so its truthiness is
        # not a reliable "was one supplied" signal.
        if progress is not None:
            progress(fraction, desc=message)

    report(0.1, "🔍 Validating files...")
    try:
        validated_files = validate_files(files)
    except ValidationError as e:
        logger.warning(f"Erro de validação: {e.message}")
        raise gr.Error(e.message) from e

    report(0.2, "⚡ Initializing Docling...")
    processor = DoclingProcessor(
        enable_ocr=True,
        enable_table_detection=True,
        use_gpu=HAS_SPACES
    )
    output_dir = create_temp_directory(prefix="output_")
    output_files = []
    processed_count = 0
    total_files = len(validated_files)

    # Anything other than an explicit single format yields both outputs,
    # JSON first, then Markdown.
    want_json = output_format != "Markdown"
    want_markdown = output_format != "JSON"

    for i, (file_path, sanitized_name) in enumerate(validated_files):
        # The per-file phase spans the 20%..80% range of the progress bar.
        report(
            0.2 + (0.6 * (i / total_files)),
            f"📄 Processing {sanitized_name}..."
        )
        try:
            processed_data = processor.process_document(file_path)
            base_name = Path(sanitized_name).stem
            # NOTE(review): two inputs sharing a stem (e.g. a.pdf and a.docx)
            # map to the same output names - confirm validate_files or
            # save_output_file keeps them apart.

            # Render everything before writing anything, so a formatter
            # failure leaves no partial output for this file.
            rendered = []
            if want_json:
                rendered.append(
                    (f"{base_name}.json",
                     format_to_json(processed_data, sanitized_name))
                )
            if want_markdown:
                rendered.append(
                    (f"{base_name}.md", format_to_markdown(processed_data))
                )

            # Record the outputs only once every write has succeeded.
            saved = [
                (save_output_file(content, filename, output_dir), filename)
                for filename, content in rendered
            ]
            output_files.extend(saved)

            processed_count += 1
            logger.info(f"Processado: {sanitized_name}")
        except Exception as e:
            logger.error(f"Erro ao processar {sanitized_name}: {e}")
            logger.debug(traceback.format_exc())
            # With a single file there is nothing else to fall back on, so
            # surface the failure; in a batch, keep going with the rest.
            if total_files == 1:
                raise gr.Error(
                    f"❌ Erro ao processar {sanitized_name}: {str(e)}"
                ) from e

    report(0.9, "📦 Preparing download...")
    if not output_files:
        raise gr.Error("❌ Nenhum arquivo foi processado com sucesso.")

    if len(output_files) > 1 or output_format == "Ambos":
        zip_path = create_zip_output(
            output_files,
            output_name="documentos_processados"
        )
        final_output = str(zip_path)
    else:
        final_output = str(output_files[0][0])

    elapsed_time = time.time() - start_time
    report(1.0, "✅ Complete!")

    # Two trailing spaces before "\n" force a Markdown hard line break so
    # each field is rendered on its own line.
    status_msg = (
        f"### ✅ Processing Complete!\n\n"
        f"**Files processed:** {processed_count}/{total_files}  \n"
        f"**Format:** {output_format}  \n"
        f"**Time:** {elapsed_time:.1f}s"
    )
    logger.info(
        f"Batch concluído: {processed_count}/{total_files} arquivos, "
        f"{elapsed_time:.1f}s, formato={output_format}"
    )
    return final_output, status_msg
# GPU-backed variant (if available)
# Stays None unless the `spaces` package was importable; callers must check
# for None before calling.
process_documents_gpu = None

if HAS_SPACES:
    @spaces.GPU(duration=config.GPU_TIMEOUT_SECONDS)
    def process_documents_gpu(
        files: list,
        output_format: str,
        progress: gr.Progress = gr.Progress()
    ) -> tuple[str | list[str], str]:
        """Run the shared processing pipeline under the ``spaces.GPU`` decorator."""
        return _process_documents_internal(files, output_format, progress)
def process_documents(
    files: list,
    output_format: str,
    request: gr.Request,
    progress: gr.Progress = gr.Progress()
) -> tuple[str | list[str], str]:
    """Entry point wired to the UI: enforce the rate limit, then process.

    Delegates to ``process_documents_gpu`` when it is available and to
    ``_process_documents_internal`` otherwise, translating failures into
    ``gr.Error`` messages for the user.

    Args:
        files: Uploaded files to process.
        output_format: Output format selected in the UI.
        request: Request of the current call, used for rate limiting.
        progress: Progress tracker forwarded to the pipeline.

    Returns:
        ``(path_to_download, markdown_status_message)``.

    Raises:
        gr.Error: On rate-limit rejection, timeout, memory exhaustion, or
            any other failure while processing.
    """
    allowed = check_rate_limit(request)
    if not allowed:
        raise gr.Error(
            f"⚠️ Rate limit exceeded. "
            f"Maximum: {config.RATE_LIMIT_REQUESTS} requests per hour. "
            f"Please try again later."
        )

    try:
        gpu_ready = HAS_SPACES and process_documents_gpu is not None
        if not gpu_ready:
            logger.info("Usando processamento CPU (fallback)")
            return _process_documents_internal(files, output_format, progress)
        logger.info("Usando processamento GPU (ZeroGPU)")
        return process_documents_gpu(files, output_format, progress)
    except gr.Error:
        # Already user-facing; pass it through untouched.
        raise
    except TimeoutError:
        logger.error("Timeout no processamento")
        raise gr.Error(
            "⏱️ Time limit exceeded. Try with smaller or fewer files."
        )
    except MemoryError:
        logger.error("Memória insuficiente")
        raise gr.Error(
            "💾 Insufficient memory. Try with smaller files."
        )
    except Exception as exc:
        logger.error(f"Erro inesperado: {exc}")
        logger.debug(traceback.format_exc())
        raise gr.Error(f"❌ Unexpected error: {str(exc)}")
# =============================================================================
# INTERFACE GRADIO - MODERN REDESIGN
# =============================================================================
def create_interface() -> gr.Blocks:
    """Build the Gradio UI: upload form, format selector, results and footer.

    The layout is defined by the order in which components are created
    inside the nested ``with`` blocks, so statement order matters here.
    The limits shown in the help text and in the footer are both read from
    ``config`` so the two cannot drift apart.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    header_text = """
# 📄 Docling Document Processor
Transform PDF, DOC, and DOCX files into structured formats using AI.
Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
"""

    # Limits come from config (same source as the footer) rather than being
    # hard-coded in the prose.
    usage_help = f"""
### Quick Start
1. **Upload** your documents (max {config.MAX_FILES_PER_SESSION} files, {config.MAX_FILE_SIZE_MB}MB each)
2. **Select** output format (JSON, Markdown, or both)
3. **Click** Process Documents
4. **Download** your results
### Features
- 🔍 Smart text, table & metadata extraction
- 🌐 Automatic language detection
- 🚀 GPU acceleration for fast processing
- 📊 Preserves document structure
### Supported Formats
**Input:** PDF, DOC, DOCX
**Output:** JSON, Markdown, or ZIP (both)
"""

    footer_html = f"""
<div style="text-align: center; color: #666; font-size: 0.9em;">
<p><strong>Limits:</strong> {config.MAX_FILES_PER_SESSION} files per upload |
{config.MAX_FILE_SIZE_MB}MB per file |
{config.RATE_LIMIT_REQUESTS} requests/hour</p>
<p>Powered by <a href="https://github.com/docling-project/docling">Docling</a> •
Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder">anycoder</a></p>
</div>
"""

    with gr.Blocks(
        title="📄 Docling Processor",
        fill_height=True,
    ) as demo:
        # Header Section
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(
                    header_text,
                    elem_classes=["header-text"]
                )
        gr.Markdown("---")

        # Main Content Area
        with gr.Row():
            with gr.Column(scale=1):
                # Upload Section
                file_input = gr.File(
                    file_count="multiple",
                    file_types=[".pdf", ".doc", ".docx"],
                    label="📁 Upload Documents",
                    height=200,
                    elem_classes=["upload-area"]
                )
                # Format Selector
                format_selector = gr.Radio(
                    choices=config.OUTPUT_FORMATS,
                    value="Markdown",
                    label="📤 Output Format",
                    info="Choose your preferred output format",
                    elem_classes=["format-selector"]
                )
                # Process Button
                process_btn = gr.Button(
                    "🚀 Process Documents",
                    variant="primary",
                    size="lg",
                    elem_classes=["process-button"]
                )
                # Info Box
                with gr.Accordion("ℹ️ How to Use", open=False):
                    gr.Markdown(usage_help)

        # Results Section
        gr.Markdown("---")
        with gr.Row():
            with gr.Column(scale=1):
                # Status Output
                status_output = gr.Markdown(
                    label="Status",
                    elem_classes=["status-output"]
                )
                # File Download
                file_output = gr.File(
                    label="📥 Download Results",
                    interactive=False,
                    elem_classes=["download-area"]
                )

        # Footer
        gr.Markdown("---")
        gr.Markdown(
            footer_html,
            elem_classes=["footer-text"]
        )

        # Event Handlers
        # Only the two UI inputs are listed; process_documents' `request` and
        # `progress` parameters are not bound to components.
        process_btn.click(
            fn=process_documents,
            inputs=[file_input, format_selector],
            outputs=[file_output, status_output],
            show_progress="full",
        )
        # Clear status and any previous download when new files are selected
        file_input.change(
            fn=lambda: ("", None),
            outputs=[status_output, file_output],
        )
    return demo
# =============================================================================
# ENTRY POINT
# =============================================================================
if __name__ == "__main__":
    # Static stylesheet handed to launch(); a pure literal, so it is safe to
    # define ahead of the launch attempt.
    custom_css = """
/* Mobile-First Responsive Design */
.gradio-container {
max-width: 1200px !important;
margin: 0 auto !important;
padding: 1rem !important;
}
/* Header Styling */
.header-text h1 {
font-size: 2rem !important;
font-weight: 700 !important;
margin-bottom: 0.5rem !important;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.header-text p {
font-size: 1.1rem !important;
color: #64748b !important;
line-height: 1.6 !important;
}
/* Upload Area */
.upload-area {
border: 2px dashed #cbd5e1 !important;
border-radius: 12px !important;
transition: all 0.3s ease !important;
}
.upload-area:hover {
border-color: #667eea !important;
background: #f8fafc !important;
}
/* Format Selector */
.format-selector label {
font-weight: 500 !important;
margin-bottom: 0.5rem !important;
}
/* Process Button */
.process-button {
margin-top: 1rem !important;
font-size: 1.1rem !important;
padding: 0.75rem 2rem !important;
border-radius: 8px !important;
font-weight: 600 !important;
box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1) !important;
transition: all 0.3s ease !important;
}
.process-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 10px 15px -3px rgb(0 0 0 / 0.2) !important;
}
/* Status Output */
.status-output {
background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%) !important;
border-left: 4px solid #0ea5e9 !important;
padding: 1rem !important;
border-radius: 8px !important;
margin-top: 1rem !important;
}
/* Download Area */
.download-area {
margin-top: 1rem !important;
border-radius: 8px !important;
}
/* Footer */
.footer-text {
opacity: 0.8 !important;
}
.footer-text a {
color: #667eea !important;
text-decoration: none !important;
font-weight: 500 !important;
}
.footer-text a:hover {
text-decoration: underline !important;
}
/* Accordion Styling */
.accordion {
margin-top: 1rem !important;
}
/* Mobile Responsiveness */
@media (max-width: 768px) {
.gradio-container {
padding: 0.5rem !important;
}
.header-text h1 {
font-size: 1.5rem !important;
}
.header-text p {
font-size: 1rem !important;
}
.process-button {
width: 100% !important;
font-size: 1rem !important;
}
}
/* Dark Mode Support */
@media (prefers-color-scheme: dark) {
.upload-area {
border-color: #475569 !important;
}
.upload-area:hover {
background: #1e293b !important;
}
.status-output {
background: linear-gradient(135deg, #1e293b 0%, #334155 100%) !important;
}
}
"""
    anycoder_url = "https://huggingface.co/spaces/akhaliq/anycoder"

    # Make sure the working directories exist
    config.TEMP_DIR.mkdir(parents=True, exist_ok=True)
    config.LOGS_DIR.mkdir(parents=True, exist_ok=True)

    # Remove stale temporary files left over from earlier runs
    cleanup_old_files()

    logger.info("Iniciando Docling Document Processor...")
    logger.info(f"ZeroGPU disponível: {HAS_SPACES}")

    # Build the interface
    demo = create_interface()

    # Treated as containerized (HF Spaces) when `spaces` imported or the
    # SPACE_ID environment variable is set
    is_containerized = HAS_SPACES or "SPACE_ID" in os.environ

    try:
        # Everything that can fail is evaluated inside the try, in the same
        # order as the call's arguments, so any error reaches the fallback.
        start_app = demo.queue().launch
        upload_cap = f"{config.MAX_FILE_SIZE_MB}mb"
        app_theme = gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="indigo",
            neutral_hue="slate",
            font=gr.themes.GoogleFont("Inter"),
            text_size="lg",
            spacing_size="lg",
            radius_size="md"
        ).set(
            button_primary_background_fill="*primary_600",
            button_primary_background_fill_hover="*primary_700",
            button_primary_text_color="white",
            block_title_text_weight="600",
            block_label_text_weight="500",
        )
        start_app(
            server_name="0.0.0.0",
            server_port=7860,
            max_file_size=upload_cap,
            show_error=True,
            share=is_containerized,
            theme=app_theme,
            css=custom_css,
            footer_links=[
                {"label": "Built with anycoder", "url": anycoder_url},
                "gradio",
                "api"
            ]
        )
    except Exception as launch_error:
        logger.error(f"Erro ao iniciar aplicação: {launch_error}")
        logger.info("Tentando iniciar com configuração alternativa...")
        try:
            # Second attempt with a reduced configuration: default Soft
            # theme, no custom CSS, share always on.
            demo.queue().launch(
                server_name="0.0.0.0",
                server_port=7860,
                max_file_size=f"{config.MAX_FILE_SIZE_MB}mb",
                show_error=True,
                share=True,
                theme=gr.themes.Soft(primary_hue="blue"),
                footer_links=[
                    {"label": "Built with anycoder", "url": anycoder_url}
                ]
            )
        except Exception as fallback_error:
            logger.critical(f"Falha crítica ao iniciar: {fallback_error}")
            raise