Spaces:

guifav
/

rag_template

Sleeping

rag_template / tests /test_document_processing.py

Guilherme Favaron

Sync: Complete project update (Phase 6) - API, Metadata, Eval, Docs

a686b1b 6 days ago

4.62 kB

	"""
	Testes para processamento de documentos.
	"""

	import pytest
	import tempfile
	from pathlib import Path
	from src.document_processing import DocumentProcessor


	class TestDocumentProcessor:
	    """Tests for the ``DocumentProcessor`` class.

	    Every test receives a fresh ``DocumentProcessor`` through the
	    ``processor`` fixture. Tests that need a real file on disk create it
	    under pytest's ``tmp_path`` directory, so pytest manages the temporary
	    location instead of the test deleting files by hand.
	    """

	    @pytest.fixture
	    def processor(self):
	        """Return a new ``DocumentProcessor`` built with no arguments."""
	        return DocumentProcessor()

	    def test_extract_text_from_txt(self, processor, tmp_path):
	        """Check that text is extracted from a TXT file."""
	        txt_file = tmp_path / "sample.txt"
	        # Explicit encoding keeps the fixture file independent of the locale.
	        txt_file.write_text(
	            "Este e um texto de teste.\nCom multiplas linhas.",
	            encoding="utf-8",
	        )

	        text = processor.extract_text(str(txt_file))

	        assert "Este e um texto de teste" in text
	        assert "Com multiplas linhas" in text

	    def test_extract_text_from_nonexistent_file(self, processor):
	        """Check that extracting from a missing file raises FileNotFoundError."""
	        with pytest.raises(FileNotFoundError):
	            processor.extract_text("/caminho/inexistente.txt")

	    def test_detect_file_type_txt(self, processor):
	        """Check detection of the TXT type for ``.txt`` and ``.text`` names."""
	        assert processor.detect_file_type("documento.txt") == "TXT"
	        assert processor.detect_file_type("arquivo.text") == "TXT"

	    def test_detect_file_type_pdf(self, processor):
	        """Check detection of the PDF type, including an upper-case name."""
	        assert processor.detect_file_type("documento.pdf") == "PDF"
	        assert processor.detect_file_type("ARQUIVO.PDF") == "PDF"

	    def test_detect_file_type_md(self, processor):
	        """Check detection of the Markdown type for ``.md`` and ``.markdown``."""
	        assert processor.detect_file_type("readme.md") == "MD"
	        assert processor.detect_file_type("docs.markdown") == "MD"

	    def test_detect_file_type_unknown(self, processor):
	        """Check that an unrecognized extension is reported as UNKNOWN."""
	        assert processor.detect_file_type("arquivo.xyz") == "UNKNOWN"

	    def test_clean_text(self, processor):
	        """Check that cleaning collapses repeated whitespace and trims edges."""
	        # The input deliberately contains runs of spaces and a triple newline;
	        # without them the assertions below would not exercise anything.
	        dirty_text = "  Texto   com    espacos  \n\n\n  multiplos  "
	        clean_text = processor.clean_text(dirty_text)

	        # A double space is the marker for "repeated spaces survived"; single
	        # spaces between words are expected to remain.
	        assert "  " not in clean_text
	        assert "\n\n\n" not in clean_text
	        assert clean_text.strip() == clean_text

	    def test_clean_text_empty(self, processor):
	        """Check that empty and whitespace-only input clean to an empty string."""
	        assert processor.clean_text("") == ""
	        assert processor.clean_text(" ") == ""

	    def test_get_text_stats(self, processor):
	        """Check the statistics computed for a non-empty text."""
	        text = "Este e um texto de teste. Tem varias palavras e caracteres."

	        stats = processor.get_text_stats(text)

	        assert stats['num_chars'] > 0
	        assert stats['num_words'] > 0
	        assert stats['num_lines'] >= 1
	        assert stats['num_chars'] == len(text)

	    def test_get_text_stats_empty(self, processor):
	        """Check that every statistic is zero for an empty text."""
	        stats = processor.get_text_stats("")

	        assert stats['num_chars'] == 0
	        assert stats['num_words'] == 0
	        assert stats['num_lines'] == 0

	    def test_split_into_sentences(self, processor):
	        """Check that a three-sentence text is split into three sentences."""
	        text = "Esta e a primeira sentenca. Esta e a segunda. E esta e a terceira!"

	        sentences = processor.split_into_sentences(text)

	        assert len(sentences) == 3
	        assert "primeira" in sentences[0]
	        assert "segunda" in sentences[1]
	        assert "terceira" in sentences[2]

	    def test_extract_metadata_from_filename(self, processor):
	        """Check the metadata derived from a file name."""
	        metadata = processor.extract_metadata_from_filename("documento_importante_2026.pdf")

	        assert metadata['file_type'] == "PDF"
	        assert '2026' in metadata.get('filename', '')

	    def test_process_file_txt(self, processor, tmp_path):
	        """Check the full processing pipeline on a TXT file."""
	        txt_file = tmp_path / "sample.txt"
	        txt_file.write_text("Conteudo do arquivo de teste.", encoding="utf-8")

	        result = processor.process_file(str(txt_file))

	        assert result['text'] is not None
	        assert result['file_type'] == "TXT"
	        assert result['stats']['num_chars'] > 0
	        assert 'Conteudo do arquivo' in result['text']


	if __name__ == "__main__":
	    # Allow running this file directly. pytest.main returns the run's exit
	    # code; raising SystemExit with it makes a failing run exit non-zero
	    # instead of always reporting success to the shell.
	    raise SystemExit(pytest.main([__file__, "-v"]))