File size: 3,569 Bytes
cdb73a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# tests/test_integration.py

import os
import tempfile
import unittest

import pandas as pd

# Project modules under test: configuration, data processing, embedding, and recommendation
import src.book_recommender.core.config as config
from src.book_recommender.data.processor import process_dataframe
from src.book_recommender.ml.embedder import generate_embeddings
from src.book_recommender.ml.recommender import BookRecommender


class TestIntegration(unittest.TestCase):
    """End-to-end check of the pipeline: raw CSV -> processed frame -> embeddings -> recommendations."""

    def setUp(self) -> None:
        """Create a temporary directory containing a four-row sample books CSV."""
        self.test_dir = tempfile.TemporaryDirectory()
        # Register the cleanup right away: unittest skips tearDown() when setUp()
        # raises, but registered cleanups still run, so the directory is never leaked.
        self.addCleanup(self.test_dir.cleanup)

        self.raw_csv_path = os.path.join(self.test_dir.name, "test_books.csv")

        # Create a dummy CSV file for the test
        sample_data = {
            "title": ["The Sun Also Rises", "A Farewell to Arms", "For Whom the Bell Tolls", "The Old Man and the Sea"],
            "authors": ["Ernest Hemingway", "Ernest Hemingway", "Ernest Hemingway", "Ernest Hemingway"],
            "genres": ["Fiction", "War, Fiction", "War, Fiction", "Fiction"],
            "description": [
                "A story of American and British expatriates in Paris.",
                "A love story during World War I.",
                "An American in the Spanish Civil War.",
                "An old fisherman struggles with a giant marlin.",
            ],
            "tags": ["lost generation", "war", "spain", "cuba"],
        }
        pd.DataFrame(sample_data).to_csv(self.raw_csv_path, index=False)

    def test_full_pipeline(self) -> None:
        """
        Tests the full data processing and recommendation pipeline end-to-end.

        Loads the sample CSV, processes it, generates embeddings, builds a
        recommender, and checks that the single top recommendation for
        "A Farewell to Arms" is another book from the sample set with a
        similarity score above 0.3.
        """
        # --- 1. Load Raw Data ---
        raw_df = pd.read_csv(self.raw_csv_path)
        self.assertEqual(len(raw_df), 4)

        # --- 2. Process DataFrame ---
        processed_df = process_dataframe(raw_df)
        self.assertEqual(len(processed_df), 4)
        self.assertIn("combined_text", processed_df.columns)

        # --- 3. Generate Embeddings ---
        embeddings = generate_embeddings(df=processed_df, model_name=config.EMBEDDING_MODEL)
        self.assertEqual(embeddings.shape, (4, config.EMBEDDING_DIMENSION))

        # --- 4. Initialize Recommender ---
        recommender = BookRecommender(book_data=processed_df, embeddings=embeddings)
        self.assertIsNotNone(recommender)

        # --- 5. Get Recommendations ---
        # "A Farewell to Arms" and "For Whom the Bell Tolls" are both war novels by the same author,
        # so they should be highly similar.
        recommendations = recommender.get_recommendations("A Farewell to Arms", top_k=1)

        # --- 6. Assert Results ---
        self.assertEqual(len(recommendations), 1)

        top_recommendation = recommendations[0]

        # The exact top book can vary with model updates, so we check for reasonableness.
        # It should be another Hemingway novel from our list (the query title itself is
        # deliberately absent from the accepted titles).
        expected_titles = ["The Sun Also Rises", "For Whom the Bell Tolls", "The Old Man and the Sea"]
        self.assertIn(top_recommendation["title"], expected_titles)

        # The similarity should indicate a meaningful match.
        # NOTE: The threshold is set lower (e.g., > 0.3) because with normalized
        # text and the 'all-MiniLM-L6-v2' model, scores for even closely
        # related documents might not be extremely high. This ensures the test
        # is robust to minor model variations.
        # assertGreater reports both operands on failure, unlike assertTrue(a > b).
        self.assertGreater(
            top_recommendation["similarity"], 0.3, f"Similarity score {top_recommendation['similarity']} is not > 0.3"
        )


# Run this module's tests via unittest when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()