Spaces:
Running
Running
File size: 3,668 Bytes
# tests/test_data_processor.py
import os
import tempfile
import unittest
import pandas as pd
from src.book_recommender.core.exceptions import DataNotFoundError, FileProcessingError
from src.book_recommender.data.processor import clean_and_prepare_data
class TestDataProcessor(unittest.TestCase):
    """Tests for ``clean_and_prepare_data``.

    Every test gets its own temporary directory containing a small raw CSV
    fixture (four books: one with no title, one with no genres) and a target
    path for the processed output file.
    """

    def setUp(self):
        """Create the temporary workspace and write the raw CSV fixture."""
        self.test_dir = tempfile.TemporaryDirectory()
        # Registered right away so the directory is removed even if a later
        # step of setUp fails (tearDown is not run when setUp raises).
        self.addCleanup(self.test_dir.cleanup)

        self.raw_path = os.path.join(self.test_dir.name, "test_books.csv")
        self.processed_path = os.path.join(self.test_dir.name, "test_books_cleaned.parquet")

        # Dummy raw data: the third row has no title, the fourth has no genres.
        self.sample_data = {
            "title": ["Book A", "Book B", None, "Book D"],
            "authors": ["Author 1", "Author 2", "Author 3", "Author 4"],
            "genres": ["['Fiction']", "['Sci-Fi', 'Fantasy']", "['History']", float("nan")],
            "description": ["Desc A", "Desc B", "Desc C", "Desc D"],
            "tags": ["['tag1']", "['tag2']", "['tag3']", "['tag4']"],
        }
        pd.DataFrame(self.sample_data).to_csv(self.raw_path, index=False)

    def test_clean_and_prepare_data(self):
        """Test the happy path: rows are cleaned, text is combined, output is written."""
        processed_df = clean_and_prepare_data(self.raw_path, self.processed_path)

        # 1. The output file is created.
        self.assertTrue(os.path.exists(self.processed_path))

        # 2. Rows with no title are dropped.
        self.assertEqual(len(processed_df), 3)
        # ``None in series`` tests the index labels, not the values, so the
        # values are checked explicitly instead.
        self.assertFalse(processed_df["title_lower"].isnull().any())

        # 3. NaN values are filled and list-strings are parsed.
        self.assertFalse(processed_df["genres"].isnull().any())
        self.assertEqual(processed_df.iloc[0]["genres"], "fiction")

        # 4. The 'combined_text' column is created and is correct.
        self.assertIn("combined_text", processed_df.columns)
        # Check the content for the first valid book.
        expected_text = "book a book a book a by author 1. genres: fiction. description: desc a. tags: tag1"
        self.assertEqual(processed_df.iloc[0]["combined_text"], expected_text)

    def test_missing_raw_data_file(self):
        """Test that DataNotFoundError is raised if the raw data file is missing."""
        # Point inside the fresh temporary directory so the path is guaranteed
        # not to exist, regardless of the current working directory.
        missing_path = os.path.join(self.test_dir.name, "non_existent_file.csv")
        with self.assertRaises(DataNotFoundError):
            clean_and_prepare_data(raw_path=missing_path, processed_path=self.processed_path)

    def test_empty_dataframe(self):
        """Test that a ValueError is raised if the dataframe is empty after cleaning."""
        # Overwrite the fixture with a header-only CSV.
        empty_df = pd.DataFrame(columns=["title", "authors", "genres", "description", "tags"])
        empty_df.to_csv(self.raw_path, index=False)
        with self.assertRaises(ValueError):
            clean_and_prepare_data(raw_path=self.raw_path, processed_path=self.processed_path)

    def test_malformed_csv(self):
        """Test that FileProcessingError is raised for a malformed CSV."""
        # Overwrite the fixture with a CSV whose last field never closes its quote.
        with open(self.raw_path, "w", encoding="utf-8") as f:
            f.write('title,authors\n"Book","Author"\n"Another Book')  # Missing closing quote
        with self.assertRaises(FileProcessingError):
            clean_and_prepare_data(raw_path=self.raw_path, processed_path=self.processed_path)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|