# tests/test_data_processor.py
"""Unit tests for ``clean_and_prepare_data`` from the data processor module."""

import os
import tempfile
import unittest

import pandas as pd

from src.book_recommender.core.exceptions import DataNotFoundError, FileProcessingError
from src.book_recommender.data.processor import clean_and_prepare_data


class TestDataProcessor(unittest.TestCase):
    """Exercise ``clean_and_prepare_data`` against small CSV fixtures on disk.

    Every test gets its own temporary directory holding a raw CSV
    (``self.raw_path``) and a target location for the processed output
    (``self.processed_path``).
    """

    def setUp(self):
        """Create a temporary directory and write the sample raw CSV into it."""
        self.test_dir = tempfile.TemporaryDirectory()
        # Register cleanup right away so the directory is removed even when
        # the rest of setUp fails (tearDown is skipped in that situation).
        self.addCleanup(self.test_dir.cleanup)

        self.raw_path = os.path.join(self.test_dir.name, "test_books.csv")
        self.processed_path = os.path.join(
            self.test_dir.name, "test_books_cleaned.parquet"
        )

        # Four rows: the third has no title and the fourth has NaN genres.
        self.sample_data = {
            "title": ["Book A", "Book B", None, "Book D"],
            "authors": ["Author 1", "Author 2", "Author 3", "Author 4"],
            "genres": [
                "['Fiction']",
                "['Sci-Fi', 'Fantasy']",
                "['History']",
                float("nan"),
            ],
            "description": ["Desc A", "Desc B", "Desc C", "Desc D"],
            "tags": ["['tag1']", "['tag2']", "['tag3']", "['tag4']"],
        }
        pd.DataFrame(self.sample_data).to_csv(self.raw_path, index=False)

    def test_clean_and_prepare_data(self):
        """Check the output file, dropped rows, filled NaNs and combined text."""
        processed_df = clean_and_prepare_data(self.raw_path, self.processed_path)

        # 1. The processed output file is written to disk.
        self.assertTrue(os.path.exists(self.processed_path))

        # 2. The row without a title is dropped, leaving three of four rows.
        self.assertEqual(len(processed_df), 3)
        # ``None in series`` tests the index labels, not the values, so the
        # values are inspected explicitly for missing entries instead.
        self.assertFalse(processed_df["title_lower"].isna().any())

        # 3. NaN genres are filled and list-like strings are parsed.
        self.assertFalse(processed_df["genres"].isnull().any())
        self.assertEqual(processed_df.iloc[0]["genres"], "fiction")

        # 4. The 'combined_text' column exists and matches for the first book.
        self.assertIn("combined_text", processed_df.columns)
        expected_text = (
            "book a book a book a by author 1. genres: fiction. "
            "description: desc a. tags: tag1"
        )
        self.assertEqual(processed_df.iloc[0]["combined_text"], expected_text)

    def test_missing_raw_data_file(self):
        """Test that DataNotFoundError is raised if the raw data file is missing."""
        # Point inside the fresh temporary directory so the path is guaranteed
        # not to exist, regardless of the current working directory.
        missing_path = os.path.join(self.test_dir.name, "non_existent_file.csv")
        with self.assertRaises(DataNotFoundError):
            clean_and_prepare_data(
                raw_path=missing_path, processed_path=self.processed_path
            )

    def test_empty_dataframe(self):
        """Test that a ValueError is raised if the dataframe is empty after cleaning."""
        # Overwrite the fixture with a header-only CSV (no data rows).
        empty_df = pd.DataFrame(
            columns=["title", "authors", "genres", "description", "tags"]
        )
        empty_df.to_csv(self.raw_path, index=False)

        with self.assertRaises(ValueError):
            clean_and_prepare_data(
                raw_path=self.raw_path, processed_path=self.processed_path
            )

    def test_malformed_csv(self):
        """Test that FileProcessingError is raised for a malformed CSV."""
        # The last record opens a quoted field that is never closed.
        with open(self.raw_path, "w", encoding="utf-8") as f:
            f.write('title,authors\n"Book","Author"\n"Another Book')

        with self.assertRaises(FileProcessingError):
            clean_and_prepare_data(
                raw_path=self.raw_path, processed_path=self.processed_path
            )


if __name__ == "__main__":
    unittest.main()