"""Tests for document ingestion.""" import pytest from pathlib import Path import tempfile from core.ingest import DocumentLoader, HierarchicalClassifier, DocumentProcessor def test_document_loader_txt(): """Test loading text files.""" with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: f.write("Test content for document loader.") temp_path = f.name try: loader = DocumentLoader() content, metadata = loader.load_txt(temp_path) assert "Test content" in content assert metadata["format"] == "txt" assert "source_name" in metadata finally: Path(temp_path).unlink() def test_document_loader_unsupported(): """Test handling of unsupported file formats.""" with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as f: temp_path = f.name try: loader = DocumentLoader() with pytest.raises(ValueError): loader.load(temp_path) finally: Path(temp_path).unlink() def test_hierarchical_classifier(): """Test hierarchical classification.""" classifier = HierarchicalClassifier("hospital") text = "Patient admission procedures and clinical protocols for emergency care." classification = classifier.classify_text(text) assert "level1" in classification assert "level2" in classification assert "level3" in classification assert "doc_type" in classification def test_document_processor(): """Test document processing pipeline.""" with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: f.write("Medical policy for patient care. " * 100) # Create substantial content temp_path = f.name try: processor = DocumentProcessor( hierarchy_name="hospital", chunk_size=256, chunk_overlap=50 ) chunks = processor.process_document(temp_path) assert len(chunks) > 0 assert "text" in chunks[0] assert "metadata" in chunks[0] assert "chunk_id" in chunks[0]["metadata"] assert "level1" in chunks[0]["metadata"] finally: Path(temp_path).unlink() def test_chunk_consistency(): """Test that chunks maintain metadata consistency.""" with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: f.write("Hospital emergency procedures. " * 50) temp_path = f.name try: processor = DocumentProcessor("hospital", chunk_size=200, chunk_overlap=20) chunks = processor.process_document(temp_path) # All chunks should have same doc_id doc_ids = [c["metadata"]["doc_id"] for c in chunks] assert len(set(doc_ids)) == 1 # Chunk indices should be sequential indices = [c["metadata"]["chunk_index"] for c in chunks] assert indices == list(range(len(chunks))) finally: Path(temp_path).unlink()