Spaces:
Sleeping
Sleeping
"""Tests for document ingestion."""
import pytest
from pathlib import Path
import tempfile
from core.ingest import DocumentLoader, HierarchicalClassifier, DocumentProcessor
def test_document_loader_txt():
    """Loading a .txt file yields its content plus format metadata."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as tmp:
        tmp.write("Test content for document loader.")
        txt_path = tmp.name
    try:
        content, metadata = DocumentLoader().load_txt(txt_path)
        assert "Test content" in content
        assert metadata["format"] == "txt"
        assert "source_name" in metadata
    finally:
        # delete=False above means we own cleanup of the temp file.
        Path(txt_path).unlink()
def test_document_loader_unsupported():
    """An unsupported file extension must make load() raise ValueError."""
    with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
        bad_path = tmp.name
    try:
        loader = DocumentLoader()
        with pytest.raises(ValueError):
            loader.load(bad_path)
    finally:
        # delete=False above means we own cleanup of the temp file.
        Path(bad_path).unlink()
def test_hierarchical_classifier():
    """classify_text returns every hierarchy level plus a doc type."""
    sample = "Patient admission procedures and clinical protocols for emergency care."
    result = HierarchicalClassifier("hospital").classify_text(sample)
    # All four classification keys must be present in the result.
    for key in ("level1", "level2", "level3", "doc_type"):
        assert key in result
def test_document_processor():
    """End-to-end processing yields chunks carrying the expected keys."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as tmp:
        # Repeat the sentence so the document is long enough to chunk.
        tmp.write("Medical policy for patient care. " * 100)
        doc_path = tmp.name
    try:
        pipeline = DocumentProcessor(
            hierarchy_name="hospital",
            chunk_size=256,
            chunk_overlap=50,
        )
        chunks = pipeline.process_document(doc_path)
        assert chunks
        first = chunks[0]
        assert "text" in first
        assert "metadata" in first
        assert "chunk_id" in first["metadata"]
        assert "level1" in first["metadata"]
    finally:
        # delete=False above means we own cleanup of the temp file.
        Path(doc_path).unlink()
def test_chunk_consistency():
    """All chunks of one document share a doc_id and index sequentially."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as tmp:
        tmp.write("Hospital emergency procedures. " * 50)
        doc_path = tmp.name
    try:
        processor = DocumentProcessor("hospital", chunk_size=200, chunk_overlap=20)
        chunks = processor.process_document(doc_path)
        # A single source document must map to exactly one doc_id.
        unique_ids = {chunk["metadata"]["doc_id"] for chunk in chunks}
        assert len(unique_ids) == 1
        # chunk_index values must form 0..len-1 in order.
        observed = [chunk["metadata"]["chunk_index"] for chunk in chunks]
        assert observed == list(range(len(chunks)))
    finally:
        # delete=False above means we own cleanup of the temp file.
        Path(doc_path).unlink()