# Source: hierarchical-rag-eval/tests/test_ingest.py
# Commit c54dcef (hh786): Deployment of Hierarchical RAG system
"""Tests for document ingestion."""
import pytest
from pathlib import Path
import tempfile
from core.ingest import DocumentLoader, HierarchicalClassifier, DocumentProcessor
def test_document_loader_txt():
    """Check that ``DocumentLoader.load_txt`` returns text and metadata.

    A small ``.txt`` file is written into a throwaway directory and loaded.
    The returned ``(content, metadata)`` pair must contain the written text,
    report ``"txt"`` under ``metadata["format"]``, and carry a
    ``"source_name"`` key.
    """
    # The directory (and the file in it) is removed on exit even when writing
    # or loading fails, so no manual try/finally cleanup is required.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir) / "sample.txt"
        temp_path.write_text(
            "Test content for document loader.", encoding="utf-8"
        )

        loader = DocumentLoader()
        # The loader is handed a plain string path.
        content, metadata = loader.load_txt(str(temp_path))

        assert "Test content" in content
        assert metadata["format"] == "txt"
        assert "source_name" in metadata
def test_document_loader_unsupported():
    """Check that ``DocumentLoader.load`` rejects an unsupported format.

    An empty file with a ``.docx`` suffix is created in a throwaway directory;
    loading it is expected to raise ``ValueError``.
    """
    # The context manager cleans up the directory regardless of the outcome.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir) / "sample.docx"
        temp_path.touch()  # empty file; only the suffix matters to this test

        loader = DocumentLoader()
        with pytest.raises(ValueError):
            loader.load(str(temp_path))
def test_hierarchical_classifier():
    """Verify the classifier result exposes every hierarchy field.

    Classifies a short hospital-themed sentence with the ``"hospital"``
    hierarchy and checks that the result contains the three level keys and
    the document-type key.
    """
    sample = "Patient admission procedures and clinical protocols for emergency care."
    result = HierarchicalClassifier("hospital").classify_text(sample)

    # Checked in the same order as the individual assertions they replace.
    expected_keys = ("level1", "level2", "level3", "doc_type")
    for key in expected_keys:
        assert key in result
def test_document_processor():
    """Check the end-to-end document processing pipeline.

    Writes a repeated sentence to a ``.txt`` file, runs it through
    ``DocumentProcessor.process_document`` with the ``"hospital"`` hierarchy,
    and checks that at least one chunk is produced and that the first chunk
    has ``"text"`` and ``"metadata"`` entries, with ``"chunk_id"`` and
    ``"level1"`` present in the metadata.
    """
    # The directory is removed on exit even if writing or processing fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir) / "policy.txt"
        # Repeat the sentence to create substantial content.
        temp_path.write_text(
            "Medical policy for patient care. " * 100, encoding="utf-8"
        )

        processor = DocumentProcessor(
            hierarchy_name="hospital",
            chunk_size=256,
            chunk_overlap=50,
        )
        chunks = processor.process_document(str(temp_path))

        assert len(chunks) > 0
        first_chunk = chunks[0]
        assert "text" in first_chunk
        assert "metadata" in first_chunk
        assert "chunk_id" in first_chunk["metadata"]
        assert "level1" in first_chunk["metadata"]
def test_chunk_consistency():
    """Check that chunks of one document keep consistent metadata.

    Processes a single generated ``.txt`` file and verifies that every chunk
    reports the same ``doc_id`` and that ``chunk_index`` values run
    ``0, 1, 2, ...`` in the order the chunks are returned.
    """
    # The directory is removed on exit even if writing or processing fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir) / "procedures.txt"
        temp_path.write_text(
            "Hospital emergency procedures. " * 50, encoding="utf-8"
        )

        processor = DocumentProcessor("hospital", chunk_size=200, chunk_overlap=20)
        chunks = processor.process_document(str(temp_path))

        # An empty result would already fail the doc_id check below; asserting
        # it first gives a clear failure message instead of "0 == 1".
        assert chunks, "process_document returned no chunks"

        # All chunks should have the same doc_id.
        doc_ids = {chunk["metadata"]["doc_id"] for chunk in chunks}
        assert len(doc_ids) == 1

        # Chunk indices should be sequential, starting at zero.
        indices = [chunk["metadata"]["chunk_index"] for chunk in chunks]
        assert indices == list(range(len(chunks)))