# docling-processor/tests/test_processors.py
# Gabriel Ramos
# feat: Docling Document Processor - Gradio + ZeroGPU
# 780413d
"""
Testes unitários para os processadores e validadores.
Execute com: python -m pytest tests/test_processors.py -v
"""
import json
import os
import sys
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
# Put the directory two levels above this file at the front of sys.path,
# presumably so the project modules imported below resolve without an install.
sys.path.insert(0, str(Path(__file__).parent.parent))
import config
from utils.validators import (
ValidationError,
sanitize_filename,
validate_file_count,
validate_file_size,
)
# =============================================================================
# FIXTURES
# =============================================================================
@pytest.fixture
def temp_file():
    """Yield the path of a small temporary ``.pdf`` file and remove it afterwards.

    The file holds a minimal PDF-like payload (header line, one catalog object
    and the EOF marker). It is created with ``delete=False`` so it stays on
    disk after the handle is closed.
    """
    pdf_chunks = (
        b"%PDF-1.4\n",
        b"1 0 obj\n<< /Type /Catalog >>\nendobj\n",
        b"%%EOF\n",
    )
    handle = tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False)
    with handle:
        handle.writelines(pdf_chunks)
    pdf_path = handle.name

    yield pdf_path

    # Teardown: skip the removal if the file is already gone.
    if os.path.exists(pdf_path):
        os.unlink(pdf_path)
@pytest.fixture
def large_temp_file():
    """Yield the path of a temporary ``.pdf`` file larger than the size limit.

    The file is filled with ``config.MAX_FILE_SIZE_BYTES + 1000`` bytes of
    ``X`` and removed once the test is done.
    """
    oversize = config.MAX_FILE_SIZE_BYTES + 1000
    handle = tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False)
    with handle:
        handle.write(b"X" * oversize)
    big_path = handle.name

    yield big_path

    # Teardown: skip the removal if the file is already gone.
    if os.path.exists(big_path):
        os.unlink(big_path)
@pytest.fixture
def empty_temp_file():
    """Yield the path of a zero-byte temporary ``.pdf`` file and remove it afterwards."""
    handle = tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False)
    # Nothing is written: closing the handle right away leaves an empty file.
    handle.close()
    blank_path = handle.name

    yield blank_path

    # Teardown: skip the removal if the file is already gone.
    if os.path.exists(blank_path):
        os.unlink(blank_path)
# =============================================================================
# TESTES DE VALIDAÇÃO
# =============================================================================
class TestValidateFileCount:
    """Tests for validate_file_count()."""

    def test_valid_count_single(self):
        """A one-item list is accepted."""
        outcome = validate_file_count([1])
        assert outcome is True

    def test_valid_count_multiple(self):
        """A list with config.MAX_FILES_PER_SESSION items is accepted."""
        at_limit = list(range(config.MAX_FILES_PER_SESSION))
        outcome = validate_file_count(at_limit)
        assert outcome is True

    def test_empty_list_raises(self):
        """An empty list raises ValidationError with code NO_FILES."""
        with pytest.raises(ValidationError) as raised:
            validate_file_count([])
        # Checked after the ``with`` block; inside it, nothing after the
        # raising call would run.
        assert raised.value.error_code == "NO_FILES"

    def test_too_many_files_raises(self):
        """One item over the limit raises ValidationError with code TOO_MANY_FILES."""
        over_limit = list(range(config.MAX_FILES_PER_SESSION + 1))
        with pytest.raises(ValidationError) as raised:
            validate_file_count(over_limit)
        assert raised.value.error_code == "TOO_MANY_FILES"
class TestValidateFileSize:
    """Tests for validate_file_size()."""

    def test_valid_size(self, temp_file):
        """The small file from the temp_file fixture is accepted."""
        outcome = validate_file_size(temp_file)
        assert outcome is True

    def test_file_too_large(self, large_temp_file):
        """A file above the size limit raises ValidationError with code FILE_TOO_LARGE."""
        with pytest.raises(ValidationError) as raised:
            validate_file_size(large_temp_file)
        assert raised.value.error_code == "FILE_TOO_LARGE"

    def test_empty_file(self, empty_temp_file):
        """A zero-byte file raises ValidationError with code EMPTY_FILE."""
        with pytest.raises(ValidationError) as raised:
            validate_file_size(empty_temp_file)
        assert raised.value.error_code == "EMPTY_FILE"

    def test_file_not_found(self):
        """A path that does not exist raises ValidationError with code FILE_NOT_FOUND."""
        missing_path = "/caminho/inexistente/arquivo.pdf"
        with pytest.raises(ValidationError) as raised:
            validate_file_size(missing_path)
        assert raised.value.error_code == "FILE_NOT_FOUND"
class TestSanitizeFilename:
    """Tests for sanitize_filename()."""

    def test_normal_filename(self):
        """A plain file name comes back unchanged."""
        cleaned = sanitize_filename("documento.pdf")
        assert cleaned == "documento.pdf"

    def test_special_characters(self):
        """The characters '<', '>' and ':' are absent from the result."""
        cleaned = sanitize_filename("doc<>:test.pdf")
        for forbidden in ("<", ">", ":"):
            assert forbidden not in cleaned

    def test_spaces(self):
        """A space in the name becomes an underscore."""
        cleaned = sanitize_filename("meu documento.pdf")
        assert cleaned == "meu_documento.pdf"

    def test_multiple_underscores(self):
        """A run of three underscores does not survive."""
        cleaned = sanitize_filename("doc___test.pdf")
        assert "___" not in cleaned

    def test_empty_filename(self):
        """An empty name is replaced by the fallback name."""
        cleaned = sanitize_filename("")
        assert cleaned == "arquivo_sem_nome"

    def test_long_filename(self):
        """A 304-character name is cut to at most config.FILENAME_MAX_LENGTH."""
        oversized_name = f"{'a' * 300}.pdf"
        cleaned = sanitize_filename(oversized_name)
        assert len(cleaned) <= config.FILENAME_MAX_LENGTH
# =============================================================================
# TESTES DE FORMATAÇÃO JSON
# =============================================================================
class TestJSONFormatter:
    """Tests for json_formatter.py."""

    def test_format_to_json_basic(self):
        """format_to_json returns a JSON string carrying file name, language and timestamp."""
        from processors.json_formatter import format_to_json

        # Stand-in for the processed document; only export_to_dict is configured.
        doc_stub = MagicMock()
        doc_stub.export_to_dict.return_value = {"content": "teste"}

        payload = {
            "document": doc_stub,
            "metadata": {"nome_arquivo": "test.pdf"},
            "tables": [],
            "language": "pt",
        }

        rendered = format_to_json(payload, "test.pdf")
        assert isinstance(rendered, str)

        decoded = json.loads(rendered)
        assert decoded["arquivo"] == "test.pdf"
        assert decoded["idioma"] == "pt"
        assert "processado_em" in decoded

    def test_format_to_json_with_tables(self):
        """A single input table shows up as the single entry under "tabelas"."""
        from processors.json_formatter import format_to_json

        doc_stub = MagicMock()
        doc_stub.export_to_dict.return_value = {}

        single_table = {"indice": 1, "dados": [{"col1": "val1"}]}
        payload = {
            "document": doc_stub,
            "metadata": {},
            "tables": [single_table],
            "language": "en",
        }

        decoded = json.loads(format_to_json(payload, "test.pdf"))
        exported_tables = decoded["tabelas"]
        assert len(exported_tables) == 1
        assert exported_tables[0]["indice"] == 1
# =============================================================================
# TESTES DE FORMATAÇÃO MARKDOWN
# =============================================================================
class TestMarkdownFormatter:
    """Tests for markdown_formatter.py."""

    def test_format_to_markdown_basic(self):
        """format_to_markdown returns a string that contains a heading marker."""
        from processors.markdown_formatter import format_to_markdown

        # Stand-in for the processed document; only export_to_markdown is configured.
        doc_stub = MagicMock()
        doc_stub.export_to_markdown.return_value = "# Conteúdo\n\nTexto aqui."

        payload = {
            "document": doc_stub,
            "metadata": {"nome_arquivo": "test.pdf", "num_paginas": 3},
            "tables": [],
            "language": "pt",
        }

        rendered = format_to_markdown(payload)
        assert isinstance(rendered, str)
        # At least one heading marker must be present.
        assert any(marker in rendered for marker in ("# ", "## "))

    def test_dict_to_markdown_table(self):
        """A list of row dicts renders a header, a separator and one line per row."""
        from processors.markdown_formatter import _dict_to_markdown_table

        rows = [
            {"Nome": "Alice", "Idade": 30},
            {"Nome": "Bob", "Idade": 25},
        ]
        rendered = _dict_to_markdown_table(rows)

        expected_lines = (
            "| Nome | Idade |",
            "| --- | --- |",
            "| Alice | 30 |",
            "| Bob | 25 |",
        )
        for line in expected_lines:
            assert line in rendered

    def test_empty_table(self):
        """An empty row list yields text mentioning "vazia" (case-insensitive)."""
        from processors.markdown_formatter import _dict_to_markdown_table

        rendered = _dict_to_markdown_table([])
        assert "vazia" in rendered.lower()
# =============================================================================
# TESTES DE FILE HANDLER
# =============================================================================
class TestFileHandler:
    """Tests for file_handler.py."""

    def test_create_temp_directory(self):
        """create_temp_directory returns an existing directory whose name has the prefix."""
        import shutil

        from utils.file_handler import create_temp_directory

        workdir = create_temp_directory(prefix="test_")
        try:
            assert workdir.exists()
            assert workdir.is_dir()
            assert "test_" in workdir.name
        finally:
            # Remove the directory even when an assertion above fails.
            if workdir.exists():
                shutil.rmtree(workdir)

    def test_save_output_file(self):
        """save_output_file writes the content to a file that can be read back."""
        import shutil

        from utils.file_handler import create_temp_directory, save_output_file

        workdir = create_temp_directory(prefix="test_")
        try:
            text = "Conteúdo de teste"
            written_path = save_output_file(text, "teste.txt", workdir)
            assert written_path.exists()
            # NOTE(review): read_text() decodes with the locale's default
            # encoding and the content is non-ASCII; confirm this matches the
            # encoding save_output_file writes with.
            assert written_path.read_text() == text
        finally:
            if workdir.exists():
                shutil.rmtree(workdir)

    def test_format_size(self):
        """format_size output contains the expected unit for each magnitude."""
        from utils.file_handler import format_size

        unit_by_size = (
            (500, "B"),
            (1024 * 5, "KB"),
            (1024 * 1024 * 10, "MB"),
            (1024 * 1024 * 1024 * 2, "GB"),
        )
        for byte_count, unit in unit_by_size:
            assert unit in format_size(byte_count)
# =============================================================================
# TESTES DE INTEGRAÇÃO (MOCK)
# =============================================================================
class TestDoclingProcessorMock:
    """Tests for DoclingProcessor with DocumentConverter patched out."""

    @patch("processors.docling_processor.DocumentConverter")
    def test_processor_initialization(self, mock_converter_class):
        """The constructor stores the three option flags on the instance."""
        from processors.docling_processor import DoclingProcessor

        options = {
            "enable_ocr": True,
            "enable_table_detection": True,
            "use_gpu": False,
        }
        processor = DoclingProcessor(**options)

        for attr_name, expected in options.items():
            assert getattr(processor, attr_name) is expected

    @patch("processors.docling_processor.DocumentConverter")
    def test_processor_process_document(self, mock_converter_class):
        """process_document returns a mapping with the four expected keys."""
        from processors.docling_processor import DoclingProcessor

        # Wire the mocks from the inside out: document -> result -> converter.
        fake_document = MagicMock()
        fake_document.export_to_markdown.return_value = "# Teste"

        fake_result = MagicMock()
        fake_result.document = fake_document

        fake_converter = MagicMock()
        fake_converter.convert.return_value = fake_result
        mock_converter_class.return_value = fake_converter

        # Minimal on-disk input; delete=False keeps it after the handle closes.
        handle = tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False)
        with handle:
            handle.write(b"%PDF-1.4\n%%EOF\n")
        pdf_path = handle.name

        try:
            processor = DoclingProcessor()
            outcome = processor.process_document(pdf_path)
            for key in ("document", "metadata", "tables", "language"):
                assert key in outcome
        finally:
            os.unlink(pdf_path)
# =============================================================================
# EXECUTAR TESTES
# =============================================================================
if __name__ == "__main__":
    # pytest.main() returns the run's exit code rather than exiting. Raise it
    # as SystemExit so a failing run gives the process a non-zero exit status.
    raise SystemExit(pytest.main([__file__, "-v"]))