# test_file_processing_service.py
"""
Unit tests for FileProcessingService.

Tests core functionality of file processing including CSV/XLSX parsing,
validation, and template generation.
"""

import csv
import io
import tempfile
from datetime import datetime
from pathlib import Path

import pandas as pd
import pytest

from src.core.file_processing_service import FileProcessingService
from src.core.verification_models import TestMessage, FileUploadResult


class TestFileProcessingService:
    """Test cases for FileProcessingService."""

    def setup_method(self):
        """Set up test fixtures."""
        self.service = FileProcessingService()

    @staticmethod
    def _make_temp_file(suffix, content=None):
        """Create a closed temporary file and return its path as a string.

        Args:
            suffix: File name suffix, e.g. ``".csv"``.
            content: Optional ``str`` or ``bytes`` payload. Text is encoded
                as UTF-8 so the bytes on disk do not depend on the platform
                locale or newline translation.

        Returns:
            Path of the created file. The caller is responsible for
            deleting it (the file is created with ``delete=False``).
        """
        if isinstance(content, str):
            content = content.encode("utf-8")
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
            if content:
                handle.write(content)
        # The handle is closed here, so the path can be reopened on any OS.
        return handle.name

    # ------------------------------------------------------------------
    # File extension validation
    # ------------------------------------------------------------------

    def test_validate_file_format_csv(self):
        """Test CSV file extension validation."""
        assert self.service.validate_file_extension("test.csv") is True
        assert self.service.validate_file_extension("test.CSV") is True

    def test_validate_file_format_xlsx(self):
        """Test XLSX file extension validation."""
        assert self.service.validate_file_extension("test.xlsx") is True
        assert self.service.validate_file_extension("test.XLSX") is True

    def test_validate_file_format_invalid(self):
        """Test invalid file extension validation."""
        assert self.service.validate_file_extension("test.txt") is False
        assert self.service.validate_file_extension("test.doc") is False
        assert self.service.validate_file_extension("test") is False

    # ------------------------------------------------------------------
    # CSV delimiter detection
    # ------------------------------------------------------------------

    def test_detect_csv_delimiter_comma(self):
        """Test CSV delimiter detection for comma."""
        content = "message,expected_classification\nHello,green\nWorld,red"
        delimiter = self.service._detect_csv_delimiter(content)
        assert delimiter == ","

    def test_detect_csv_delimiter_semicolon(self):
        """Test CSV delimiter detection for semicolon."""
        content = "message;expected_classification\nHello;green\nWorld;red"
        delimiter = self.service._detect_csv_delimiter(content)
        assert delimiter == ";"

    def test_detect_csv_delimiter_tab(self):
        """Test CSV delimiter detection for tab."""
        content = "message\texpected_classification\nHello\tgreen\nWorld\tred"
        delimiter = self.service._detect_csv_delimiter(content)
        assert delimiter == "\t"

    # ------------------------------------------------------------------
    # Column name normalization
    # ------------------------------------------------------------------

    def test_normalize_column_names_standard(self):
        """Test column name normalization with standard names."""
        columns = ["message", "expected_classification"]
        normalized = self.service._normalize_column_names(columns)
        assert normalized["message"] == "message"
        assert normalized["expected_classification"] == "expected_classification"

    def test_normalize_column_names_alternatives(self):
        """Test column name normalization with alternative names."""
        columns = ["text", "label"]
        normalized = self.service._normalize_column_names(columns)
        assert normalized["message"] == "text"
        assert normalized["expected_classification"] == "label"

    # ------------------------------------------------------------------
    # Row-level validation
    # ------------------------------------------------------------------

    def test_validate_test_cases_data_valid(self):
        """Test validation of valid test case data."""
        data = [
            {"message": "Hello world", "expected_classification": "green"},
            {"message": "I'm worried", "expected_classification": "yellow"},
        ]
        errors = self.service._validate_test_cases_data(data)
        assert len(errors) == 0

    def test_validate_test_cases_data_empty_message(self):
        """Test validation with empty message."""
        data = [
            {"message": "", "expected_classification": "green"},
        ]
        errors = self.service._validate_test_cases_data(data)
        assert len(errors) == 1
        assert "message text is empty" in errors[0]

    def test_validate_test_cases_data_invalid_classification(self):
        """Test validation with invalid classification."""
        data = [
            {"message": "Hello", "expected_classification": "blue"},
        ]
        errors = self.service._validate_test_cases_data(data)
        assert len(errors) == 1
        assert "invalid classification" in errors[0]

    # ------------------------------------------------------------------
    # File parsing
    # ------------------------------------------------------------------

    def test_parse_csv_file_valid(self):
        """Test parsing a valid CSV file."""
        csv_content = (
            "message,expected_classification\n"
            "Hello world,green\n"
            "I'm worried,yellow\n"
        )
        temp_path = self._make_temp_file(".csv", csv_content)

        try:
            result = self.service.parse_csv_file(temp_path)

            assert result.file_format == "csv"
            assert result.total_rows == 2
            assert result.valid_rows == 2
            assert len(result.validation_errors) == 0
            assert len(result.parsed_test_cases) == 2

            # Check first test case
            first_case = result.parsed_test_cases[0]
            assert first_case.text == "Hello world"
            assert first_case.pre_classified_label == "green"
        finally:
            Path(temp_path).unlink()

    def test_parse_csv_file_missing_columns(self):
        """Test parsing a CSV whose headers use alternative column names.

        The canonical ``message`` / ``expected_classification`` headers are
        absent, but ``text`` / ``label`` are expected to be accepted in
        their place, so parsing should still succeed.
        """
        csv_content = "text,label\nHello world,green\n"
        temp_path = self._make_temp_file(".csv", csv_content)

        try:
            result = self.service.parse_csv_file(temp_path)

            # Should still work because 'text' and 'label' are alternative names
            assert result.file_format == "csv"
            assert result.total_rows == 1
            assert result.valid_rows == 1
            assert len(result.parsed_test_cases) == 1
        finally:
            Path(temp_path).unlink()

    def test_parse_xlsx_file_valid(self):
        """Test parsing a valid XLSX file."""
        data = {
            "message": ["Hello world", "I'm worried"],
            "expected_classification": ["green", "yellow"],
        }
        df = pd.DataFrame(data)
        temp_path = self._make_temp_file(".xlsx")

        try:
            # Written inside the try block so the temp file is removed even
            # if the Excel export itself fails.
            df.to_excel(temp_path, index=False)

            result = self.service.parse_xlsx_file(temp_path)

            assert result.file_format == "xlsx"
            assert result.total_rows == 2
            assert result.valid_rows == 2
            assert len(result.validation_errors) == 0
            assert len(result.parsed_test_cases) == 2

            # Check first test case
            first_case = result.parsed_test_cases[0]
            assert first_case.text == "Hello world"
            assert first_case.pre_classified_label == "green"
        finally:
            Path(temp_path).unlink()

    def test_convert_to_test_messages(self):
        """Test converting parsed data to TestMessage objects."""
        data = [
            {"message": "Hello world", "expected_classification": "green"},
            {"message": "I'm worried", "expected_classification": "yellow"},
        ]
        messages = self.service.convert_to_test_messages(data)

        assert len(messages) == 2
        assert messages[0].text == "Hello world"
        assert messages[0].pre_classified_label == "green"
        assert messages[1].text == "I'm worried"
        assert messages[1].pre_classified_label == "yellow"

    # ------------------------------------------------------------------
    # Template generation
    # ------------------------------------------------------------------

    def test_generate_csv_template(self):
        """Test CSV template generation."""
        template = self.service.generate_csv_template()

        # Parse the template to verify structure
        reader = csv.reader(io.StringIO(template))
        rows = list(reader)

        assert len(rows) >= 2  # Header + at least one data row
        assert rows[0] == ["message", "expected_classification"]

        # Check that all data rows have valid classifications
        for row in rows[1:]:
            if len(row) >= 2:
                assert row[1].lower() in ["green", "yellow", "red"]

    def test_generate_xlsx_template(self):
        """Test XLSX template generation."""
        template_bytes = self.service.generate_xlsx_template()

        assert isinstance(template_bytes, bytes)
        assert len(template_bytes) > 0

        # Verify we can read the generated template. Reading from an
        # in-memory buffer avoids reopening a still-open NamedTemporaryFile
        # by name, which is not portable (it fails on Windows).
        df = pd.read_excel(io.BytesIO(template_bytes))
        assert "message" in df.columns
        assert "expected_classification" in df.columns
        assert len(df) > 0

    # ------------------------------------------------------------------
    # Error reporting and suggestions
    # ------------------------------------------------------------------

    def test_get_validation_error_details(self):
        """Test validation error details generation."""
        errors = [
            "Missing required columns: message",
            "Row 1: invalid classification 'blue'",
            "Row 2: message text is empty",
        ]
        details = self.service.get_validation_error_details(errors)

        assert details["total_errors"] == 3
        assert details["errors"] == errors
        assert len(details["suggestions"]) > 0
        assert "format_help" in details

    def test_suggest_format_corrections(self):
        """Test format correction suggestions."""
        content = "text;label\nHello;green\nWorld;red"
        suggestions = self.service.suggest_format_corrections(content)

        assert len(suggestions) > 0
        # Should suggest something about semicolon delimiter or column names

    # ------------------------------------------------------------------
    # Upload entry point
    # ------------------------------------------------------------------

    def test_process_uploaded_file_invalid_format(self):
        """Test processing file with invalid format."""
        temp_path = self._make_temp_file(".txt", b"Hello world")

        try:
            result = self.service.process_uploaded_file(temp_path)

            assert result.file_format == "unknown"
            assert len(result.validation_errors) > 0
            assert "Unsupported file format" in result.validation_errors[0]
        finally:
            Path(temp_path).unlink()