""" Test cases for the document text extraction system. """ import unittest import json from pathlib import Path import tempfile import os from src.data_preparation import DocumentProcessor, NERDatasetCreator from src.model import ModelConfig, create_model_and_trainer from src.inference import DocumentInference class TestDocumentProcessor(unittest.TestCase): """Test cases for document processing.""" def setUp(self): """Set up test fixtures.""" self.processor = DocumentProcessor() def test_clean_text(self): """Test text cleaning functionality.""" dirty_text = " This is a test text!!! " clean_text = self.processor.clean_text(dirty_text) self.assertEqual(clean_text, "This is a test text!") def test_entity_patterns(self): """Test entity pattern matching.""" test_text = "Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00" # Test that patterns exist self.assertIn('NAME', self.processor.entity_patterns) self.assertIn('DATE', self.processor.entity_patterns) self.assertIn('INVOICE_NO', self.processor.entity_patterns) self.assertIn('AMOUNT', self.processor.entity_patterns) class TestNERDatasetCreator(unittest.TestCase): """Test cases for NER dataset creation.""" def setUp(self): """Set up test fixtures.""" self.processor = DocumentProcessor() self.dataset_creator = NERDatasetCreator(self.processor) def test_auto_label_text(self): """Test automatic text labeling.""" test_text = "Invoice sent to Robert White on 15/09/2025 Amount: $1,250" labeled_tokens = self.dataset_creator.auto_label_text(test_text) # Check that we get tokens and labels self.assertIsInstance(labeled_tokens, list) self.assertGreater(len(labeled_tokens), 0) # Check that each item is a (token, label) tuple for token, label in labeled_tokens: self.assertIsInstance(token, str) self.assertIsInstance(label, str) def test_create_training_example(self): """Test training example creation.""" test_text = "Invoice INV-1001 for $500" example = self.dataset_creator.create_training_example(test_text) # Check required fields self.assertIn('tokens', example) self.assertIn('labels', example) self.assertIn('text', example) # Check that tokens and labels have the same length self.assertEqual(len(example['tokens']), len(example['labels'])) def test_create_sample_dataset(self): """Test sample dataset creation.""" dataset = self.dataset_creator.create_sample_dataset() # Check that we get a non-empty dataset self.assertIsInstance(dataset, list) self.assertGreater(len(dataset), 0) # Check first example structure first_example = dataset[0] self.assertIn('tokens', first_example) self.assertIn('labels', first_example) self.assertIn('text', first_example) class TestModelConfig(unittest.TestCase): """Test cases for model configuration.""" def test_default_config(self): """Test default configuration creation.""" config = ModelConfig() # Check default values self.assertEqual(config.model_name, "distilbert-base-uncased") self.assertEqual(config.max_length, 512) self.assertEqual(config.batch_size, 16) # Check entity labels self.assertIsInstance(config.entity_labels, list) self.assertGreater(len(config.entity_labels), 0) self.assertIn('O', config.entity_labels) # Check label mappings self.assertIsInstance(config.label2id, dict) self.assertIsInstance(config.id2label, dict) self.assertEqual(len(config.label2id), len(config.entity_labels)) def test_custom_config(self): """Test custom configuration.""" custom_labels = ['O', 'B-TEST', 'I-TEST'] config = ModelConfig( batch_size=32, learning_rate=1e-5, entity_labels=custom_labels ) self.assertEqual(config.batch_size, 32) self.assertEqual(config.learning_rate, 1e-5) self.assertEqual(config.entity_labels, custom_labels) self.assertEqual(config.num_labels, 3) class TestModelCreation(unittest.TestCase): """Test cases for model creation.""" def test_create_model_and_trainer(self): """Test model and trainer creation.""" config = ModelConfig( batch_size=4, # Small batch for testing num_epochs=1, entity_labels=['O', 'B-TEST', 'I-TEST'] ) model, trainer = create_model_and_trainer(config) # Check that objects are created self.assertIsNotNone(model) self.assertIsNotNone(trainer) # Check configuration self.assertEqual(trainer.config.batch_size, 4) self.assertEqual(trainer.config.num_epochs, 1) class TestInference(unittest.TestCase): """Test cases for inference pipeline.""" @classmethod def setUpClass(cls): """Set up class-level fixtures.""" # Create a minimal trained model for testing # This is a placeholder - in real testing, you'd use a pre-trained model cls.model_path = "test_model" cls.test_text = "Invoice sent to John Doe on 01/15/2025 Amount: $500.00" def test_entity_validation(self): """Test entity validation patterns.""" # We can test the patterns without loading a full model test_patterns = { 'DATE': ['01/15/2025', '2025-01-15', 'January 15, 2025'], 'AMOUNT': ['$500.00', '$1,250.50', '1000.00 USD'], 'EMAIL': ['test@email.com', 'user.name@domain.co.uk'], 'PHONE': ['(555) 123-4567', '+1-555-987-6543', '555-123-4567'] } # This test checks that our regex patterns work import re date_pattern = r'\b\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}\b' self.assertTrue(re.search(date_pattern, '01/15/2025')) amount_pattern = r'\$\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?' self.assertTrue(re.search(amount_pattern, '$1,250.50')) email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' self.assertTrue(re.search(email_pattern, 'test@email.com')) class TestEndToEnd(unittest.TestCase): """End-to-end integration tests.""" def test_data_preparation_flow(self): """Test the complete data preparation flow.""" # Create processor and dataset creator processor = DocumentProcessor() dataset_creator = NERDatasetCreator(processor) # Create sample dataset dataset = dataset_creator.create_sample_dataset() # Verify dataset structure self.assertIsInstance(dataset, list) self.assertGreater(len(dataset), 0) for example in dataset: self.assertIn('tokens', example) self.assertIn('labels', example) self.assertIn('text', example) self.assertEqual(len(example['tokens']), len(example['labels'])) def test_model_config_flow(self): """Test model configuration and creation flow.""" # Create configuration config = ModelConfig(batch_size=4, num_epochs=1) # Create model and trainer model, trainer = create_model_and_trainer(config) # Verify objects exist and have correct configuration self.assertIsNotNone(model) self.assertIsNotNone(trainer) self.assertEqual(trainer.config.batch_size, 4) self.assertEqual(trainer.config.num_epochs, 1) def test_save_and_load_dataset(self): """Test saving and loading dataset.""" # Create dataset processor = DocumentProcessor() dataset_creator = NERDatasetCreator(processor) dataset = dataset_creator.create_sample_dataset() # Save to temporary file with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: temp_path = f.name json.dump(dataset, f, indent=2) try: # Load and verify with open(temp_path, 'r') as f: loaded_dataset = json.load(f) self.assertEqual(len(loaded_dataset), len(dataset)) self.assertEqual(loaded_dataset[0]['text'], dataset[0]['text']) finally: # Clean up os.unlink(temp_path) def run_tests(): """Run all tests.""" print("Running Document Text Extraction Tests") print("=" * 50) # Create test suite test_suite = unittest.TestSuite() # Add test classes test_classes = [ TestDocumentProcessor, TestNERDatasetCreator, TestModelConfig, TestModelCreation, TestInference, TestEndToEnd ] for test_class in test_classes: tests = unittest.TestLoader().loadTestsFromTestCase(test_class) test_suite.addTests(tests) # Run tests runner = unittest.TextTestRunner(verbosity=2) result = runner.run(test_suite) # Print summary if result.wasSuccessful(): print(f"\nAll tests passed! ({result.testsRun} tests)") else: print(f"\n{len(result.failures)} failures, {len(result.errors)} errors") if result.failures: print("\nFailures:") for test, failure in result.failures: print(f" {test}: {failure}") if result.errors: print("\nErrors:") for test, error in result.errors: print(f" {test}: {error}") return result.wasSuccessful() if __name__ == "__main__": run_tests()