Spaces:
Sleeping
Sleeping
# test_file_processing_service.py
"""
Unit tests for FileProcessingService.

Tests core functionality of file processing, including CSV/XLSX parsing,
validation, and template generation.
"""
| import csv | |
| import io | |
| import tempfile | |
| import pytest | |
| from pathlib import Path | |
| from datetime import datetime | |
| import pandas as pd | |
| from src.core.file_processing_service import FileProcessingService | |
| from src.core.verification_models import TestMessage, FileUploadResult | |
class TestFileProcessingService:
    """Test cases for FileProcessingService.

    Covers extension validation, CSV delimiter detection, column-name
    normalization, row validation, CSV/XLSX parsing, template generation and
    error reporting. A fresh service instance is created for every test.
    """

    @staticmethod
    def _write_temp_file(content: str, suffix: str) -> str:
        """Write *content* to a new temporary file and return its path.

        The file is created with ``delete=False`` and is already closed when
        this returns, so the code under test can reopen it by name on every
        platform. The caller is responsible for unlinking it.
        """
        # Explicit encoding/newline keep the bytes on disk identical across
        # platforms instead of depending on the locale and newline translation.
        with tempfile.NamedTemporaryFile(
            mode="w",
            suffix=suffix,
            delete=False,
            encoding="utf-8",
            newline="",
        ) as handle:
            handle.write(content)
        return handle.name

    def setup_method(self) -> None:
        """Set up test fixtures."""
        self.service = FileProcessingService()

    def test_validate_file_format_csv(self) -> None:
        """Test CSV file extension validation (case-insensitive)."""
        assert self.service.validate_file_extension("test.csv") is True
        assert self.service.validate_file_extension("test.CSV") is True

    def test_validate_file_format_xlsx(self) -> None:
        """Test XLSX file extension validation (case-insensitive)."""
        assert self.service.validate_file_extension("test.xlsx") is True
        assert self.service.validate_file_extension("test.XLSX") is True

    def test_validate_file_format_invalid(self) -> None:
        """Test that unsupported or missing extensions are rejected."""
        assert self.service.validate_file_extension("test.txt") is False
        assert self.service.validate_file_extension("test.doc") is False
        assert self.service.validate_file_extension("test") is False

    def test_detect_csv_delimiter_comma(self) -> None:
        """Test CSV delimiter detection for comma."""
        content = "message,expected_classification\nHello,green\nWorld,red"
        delimiter = self.service._detect_csv_delimiter(content)
        assert delimiter == ","

    def test_detect_csv_delimiter_semicolon(self) -> None:
        """Test CSV delimiter detection for semicolon."""
        content = "message;expected_classification\nHello;green\nWorld;red"
        delimiter = self.service._detect_csv_delimiter(content)
        assert delimiter == ";"

    def test_detect_csv_delimiter_tab(self) -> None:
        """Test CSV delimiter detection for tab."""
        content = "message\texpected_classification\nHello\tgreen\nWorld\tred"
        delimiter = self.service._detect_csv_delimiter(content)
        assert delimiter == "\t"

    def test_normalize_column_names_standard(self) -> None:
        """Test column name normalization with standard names."""
        columns = ["message", "expected_classification"]
        normalized = self.service._normalize_column_names(columns)
        assert normalized["message"] == "message"
        assert normalized["expected_classification"] == "expected_classification"

    def test_normalize_column_names_alternatives(self) -> None:
        """Test column name normalization with alternative names."""
        columns = ["text", "label"]
        normalized = self.service._normalize_column_names(columns)
        assert normalized["message"] == "text"
        assert normalized["expected_classification"] == "label"

    def test_validate_test_cases_data_valid(self) -> None:
        """Test validation of valid test case data."""
        data = [
            {"message": "Hello world", "expected_classification": "green"},
            {"message": "I'm worried", "expected_classification": "yellow"},
        ]
        errors = self.service._validate_test_cases_data(data)
        assert len(errors) == 0

    def test_validate_test_cases_data_empty_message(self) -> None:
        """Test validation with empty message."""
        data = [
            {"message": "", "expected_classification": "green"},
        ]
        errors = self.service._validate_test_cases_data(data)
        assert len(errors) == 1
        assert "message text is empty" in errors[0]

    def test_validate_test_cases_data_invalid_classification(self) -> None:
        """Test validation with invalid classification."""
        data = [
            {"message": "Hello", "expected_classification": "blue"},
        ]
        errors = self.service._validate_test_cases_data(data)
        assert len(errors) == 1
        assert "invalid classification" in errors[0]

    def test_parse_csv_file_valid(self) -> None:
        """Test parsing a valid CSV file."""
        csv_content = (
            "message,expected_classification\n"
            "Hello world,green\n"
            "I'm worried,yellow\n"
        )
        temp_path = self._write_temp_file(csv_content, ".csv")
        try:
            result = self.service.parse_csv_file(temp_path)
            assert result.file_format == "csv"
            assert result.total_rows == 2
            assert result.valid_rows == 2
            assert len(result.validation_errors) == 0
            assert len(result.parsed_test_cases) == 2
            # Check first test case
            first_case = result.parsed_test_cases[0]
            assert first_case.text == "Hello world"
            assert first_case.pre_classified_label == "green"
        finally:
            Path(temp_path).unlink()

    def test_parse_csv_file_missing_columns(self) -> None:
        """Test parsing a CSV whose headers use alternative column names.

        The canonical 'message' / 'expected_classification' headers are
        absent, but 'text' and 'label' are alternative names, so parsing is
        expected to succeed.
        """
        csv_content = "text,label\nHello world,green\n"
        temp_path = self._write_temp_file(csv_content, ".csv")
        try:
            result = self.service.parse_csv_file(temp_path)
            # Should still work because 'text' and 'label' are alternative names
            assert result.file_format == "csv"
            assert result.total_rows == 1
            assert result.valid_rows == 1
            assert len(result.parsed_test_cases) == 1
        finally:
            Path(temp_path).unlink()

    def test_parse_xlsx_file_valid(self) -> None:
        """Test parsing a valid XLSX file."""
        data = {
            "message": ["Hello world", "I'm worried"],
            "expected_classification": ["green", "yellow"],
        }
        df = pd.DataFrame(data)
        # Only reserve a unique path here; the handle is closed before pandas
        # writes to it so the file can be reopened by name on every platform.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as handle:
            temp_path = handle.name
        try:
            # Writing inside the try guarantees cleanup even if to_excel fails.
            df.to_excel(temp_path, index=False)
            result = self.service.parse_xlsx_file(temp_path)
            assert result.file_format == "xlsx"
            assert result.total_rows == 2
            assert result.valid_rows == 2
            assert len(result.validation_errors) == 0
            assert len(result.parsed_test_cases) == 2
            # Check first test case
            first_case = result.parsed_test_cases[0]
            assert first_case.text == "Hello world"
            assert first_case.pre_classified_label == "green"
        finally:
            Path(temp_path).unlink()

    def test_convert_to_test_messages(self) -> None:
        """Test converting parsed data to TestMessage objects."""
        data = [
            {"message": "Hello world", "expected_classification": "green"},
            {"message": "I'm worried", "expected_classification": "yellow"},
        ]
        messages = self.service.convert_to_test_messages(data)
        assert len(messages) == 2
        assert messages[0].text == "Hello world"
        assert messages[0].pre_classified_label == "green"
        assert messages[1].text == "I'm worried"
        assert messages[1].pre_classified_label == "yellow"

    def test_generate_csv_template(self) -> None:
        """Test CSV template generation."""
        template = self.service.generate_csv_template()
        # Parse the template to verify structure
        rows = list(csv.reader(io.StringIO(template)))
        assert len(rows) >= 2  # Header + at least one data row
        assert rows[0] == ["message", "expected_classification"]
        # Check that all data rows have valid classifications
        for row in rows[1:]:
            if len(row) >= 2:
                assert row[1].lower() in ["green", "yellow", "red"]

    def test_generate_xlsx_template(self) -> None:
        """Test XLSX template generation."""
        template_bytes = self.service.generate_xlsx_template()
        assert isinstance(template_bytes, bytes)
        assert len(template_bytes) > 0
        # Verify we can read the generated template. Reading from memory
        # avoids reopening a still-open NamedTemporaryFile by name, which is
        # not possible on Windows.
        df = pd.read_excel(io.BytesIO(template_bytes))
        assert "message" in df.columns
        assert "expected_classification" in df.columns
        assert len(df) > 0

    def test_get_validation_error_details(self) -> None:
        """Test validation error details generation."""
        errors = [
            "Missing required columns: message",
            "Row 1: invalid classification 'blue'",
            "Row 2: message text is empty",
        ]
        details = self.service.get_validation_error_details(errors)
        assert details["total_errors"] == 3
        assert details["errors"] == errors
        assert len(details["suggestions"]) > 0
        assert "format_help" in details

    def test_suggest_format_corrections(self) -> None:
        """Test format correction suggestions."""
        content = "text;label\nHello;green\nWorld;red"
        suggestions = self.service.suggest_format_corrections(content)
        # Should suggest something about semicolon delimiter or column names;
        # only the presence of at least one suggestion is asserted here.
        assert len(suggestions) > 0

    def test_process_uploaded_file_invalid_format(self) -> None:
        """Test processing file with invalid format."""
        temp_path = self._write_temp_file("Hello world", ".txt")
        try:
            result = self.service.process_uploaded_file(temp_path)
            assert result.file_format == "unknown"
            assert len(result.validation_errors) > 0
            assert "Unsupported file format" in result.validation_errors[0]
        finally:
            Path(temp_path).unlink()