File size: 7,650 Bytes
20ce80b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import streamlit as st
import pickle
import polars as pl
import re
import pandas as pd
import numpy as np
from collections import Counter

# Set the page title and use Streamlit's wide layout for the whole app.
st.set_page_config(page_title="Book Recommendation Engine", layout="wide")

@st.cache_resource
def load_models():
    """Unpickle the TF-IDF vectorizer and the KNN model.

    Both files are opened by relative path, so they are resolved against the
    current working directory.

    Returns:
        A ``(tfidf, knn_model)`` tuple read from ``tfidf_vectorizer.pkl`` and
        ``knn_model.pkl`` respectively.
    """
    def _unpickle(path):
        # NOTE: unpickling can execute arbitrary code; only ship pickle files
        # that come from a trusted source.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    vectorizer = _unpickle('tfidf_vectorizer.pkl')
    neighbors = _unpickle('knn_model.pkl')
    return vectorizer, neighbors

@st.cache_data
def load_data():
    """Read ``goodreadsV5.csv`` and derive the text feature columns.

    Rows with a null ``name``, ``summary`` or ``genres`` are dropped.  Two
    columns are then added: ``combined_features`` (summary and genres joined
    by a space) and ``processed_features`` (``combined_features`` passed
    through ``preprocess_text``).

    Returns:
        A ``(polars_frame, pandas_frame)`` tuple holding the same cleaned data
        in both libraries' formats.
    """
    required_columns = ['name', 'summary', 'genres']
    combined = (pl.col('summary') + ' ' + pl.col('genres')).alias('combined_features')
    processed = (
        pl.col('combined_features')
        .map_elements(preprocess_text, return_dtype=pl.Utf8)
        .alias('processed_features')
    )

    # Lazy scan: filter and combine first, materialise once.
    books = (
        pl.scan_csv('goodreadsV5.csv')
        .drop_nulls(subset=required_columns)
        .with_columns([combined])
        .collect()
    )
    books = books.with_columns([processed])

    # The pandas copy is what gets indexed positionally with the KNN results.
    return books, books.to_pandas()

def preprocess_text(text):
    """Return *text* lower-cased with everything except a-z, A-Z, 0-9 and whitespace removed."""
    lowered = text.lower()
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', lowered)

def recommend_books_knn_out_of_dataset(df_pandas, tfidf, knn_model, input_summary, input_genres, top_n=5):
    """Recommend dataset books closest to a free-text summary and genre list.

    The summary and genres are joined with a space, cleaned with
    ``preprocess_text``, vectorised with ``tfidf.transform`` and passed to
    ``knn_model.kneighbors``.  The returned neighbour indices are used as
    positional row numbers into ``df_pandas``.

    Args:
        df_pandas: pandas DataFrame with ``name``, ``summary`` and ``genres``
            columns.  Its row order is assumed to match the data the KNN
            model was fitted on -- confirm against the training code.
        tfidf: Vectorizer exposing ``transform(list_of_str)``.
        knn_model: Model exposing ``kneighbors(X, n_neighbors=...)`` and
            returning ``(distances, indices)`` arrays.
        input_summary: Free-text description of the book to match.
        input_genres: Genre text for the book to match.
        top_n: Maximum number of recommendations.  Values larger than the
            number of rows in ``df_pandas`` are capped at that number.

    Returns:
        A list of dicts with keys ``title``, ``summary``, ``genres`` and
        ``similarity_score``, in the order the model returned them.  The list
        is empty when ``top_n`` is below 1 or ``df_pandas`` has no rows.
    """
    # Asking for more neighbours than there are rows can never succeed: either
    # the model rejects the request or a returned index falls outside the
    # frame.  Cap the request instead, and skip the model entirely when there
    # is nothing to ask for.
    neighbor_count = min(top_n, len(df_pandas))
    if neighbor_count < 1:
        return []

    processed_input = preprocess_text(f"{input_summary} {input_genres}")
    input_vector = tfidf.transform([processed_input])
    distances, indices = knn_model.kneighbors(input_vector, n_neighbors=neighbor_count)

    recommendations = []
    # Flatten once and walk both arrays in lockstep; one row lookup per hit.
    for idx, distance in zip(indices.flatten(), distances.flatten()):
        row = df_pandas.iloc[idx]
        recommendations.append({
            "title": row['name'],
            "summary": row['summary'],
            "genres": row['genres'],
            # NOTE(review): 1 - distance is only a meaningful similarity if the
            # model's metric is bounded by 1 (e.g. cosine) -- confirm.
            "similarity_score": 1 - distance,
        })

    return recommendations

def main():
    """Render the recommendation page and react to user input.

    The page shows a summary text area, a genres text input and a slider for
    the number of results.  Recommendations are computed when the
    "Get Recommendations" button is pressed or when one of the example
    buttons was pressed on the previous run.  Any exception raised while
    loading or rendering is reported on the page together with a reminder of
    the files the app reads.
    """
    st.title("📚 Book Recommendation Engine")

    # Seed session state on first run.  'summary_input' and 'genres_input'
    # back the two keyed input widgets; the 'example_*' keys and 'run_example'
    # carry an example selection across the rerun triggered by its button.
    session_defaults = {
        'example_summary': "",
        'example_genres': "",
        'run_example': False,
        'summary_input': "A fantasy adventure about a young wizard learning magic.",
        'genres_input': "fantasy, adventure, magic",
    }
    for state_key, initial in session_defaults.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = initial

    try:
        # Load models and data (the polars frame is not needed here).
        tfidf, knn_model = load_models()
        _, df_pandas = load_data()

        # Consume the example flag immediately so that a failure further down
        # cannot make every later rerun retry the same example.
        run_example = st.session_state['run_example']
        st.session_state['run_example'] = False

        if run_example:
            # Widget values may only be set through session state before the
            # widget is created in a run, so copy the example in here.  Keyed
            # widgets keep their value on later reruns, which means the
            # example text (and any edits the user makes to it) is no longer
            # reset to the hard-coded default.
            st.session_state['summary_input'] = st.session_state['example_summary']
            st.session_state['genres_input'] = st.session_state['example_genres']

        # Main content
        st.subheader("Find Book Recommendations")
        st.write("Enter a book summary and genres to get personalized recommendations.")

        col1, col2 = st.columns(2)

        with col1:
            input_summary = st.text_area("Book Summary", key='summary_input', height=150)

        with col2:
            input_genres = st.text_input("Genres (comma-separated)", key='genres_input')
            num_recommendations = st.slider("Number of Recommendations",
                                            min_value=1, max_value=20, value=5)

        # Always render the button; evaluating it inside an `or` after the
        # example flag would skip rendering it on example runs.
        clicked = st.button("Get Recommendations", type="primary")

        if run_example or clicked:
            with st.spinner("Finding the best book matches for you..."):
                recommendations = recommend_books_knn_out_of_dataset(
                    df_pandas, tfidf, knn_model, input_summary, input_genres, num_recommendations
                )

            st.subheader("📚 Your Recommended Books")

            for position, book in enumerate(recommendations, start=1):
                with st.expander(f"{position}. {book['title']}"):
                    st.markdown(f"**Summary:** {book['summary']}")
                    st.markdown(f"**Genres:** {book['genres']}")

        # Example tabs section
        st.subheader("Try these examples")

        # (tab label, summary, genres); the position fixes the button keys ex1..ex4.
        examples = (
            ("Fantasy Adventure",
             "A magical journey through enchanted lands with dragons and wizards.",
             "fantasy, adventure, magic"),
            ("Romance",
             "A love story between two people from different worlds who meet by chance.",
             "romance, contemporary, drama"),
            ("Science Fiction",
             "Space explorers discover an alien civilization that challenges their understanding of humanity.",
             "science fiction, space, aliens"),
            ("Mystery",
             "A detective investigates a series of mysterious disappearances in a small town.",
             "mystery, thriller, crime"),
        )
        example_tabs = st.tabs([label for label, _, _ in examples])

        def set_example(summary, genres):
            # Remember the selection and rerun; the next run copies it into
            # the input widgets and computes recommendations for it.
            st.session_state['example_summary'] = summary
            st.session_state['example_genres'] = genres
            st.session_state['run_example'] = True
            st.rerun()

        for position, (tab, (_, summary, genres)) in enumerate(zip(example_tabs, examples), start=1):
            with tab:
                st.write(summary)
                st.write(f"Genres: {genres}")
                if st.button("Use this example", key=f"ex{position}"):
                    set_example(summary, genres)

    except Exception as e:
        st.error(f"An error occurred: {e}")
        st.info("Make sure you have the required model files (tfidf_vectorizer.pkl, knn_model.pkl) and dataset (goodreadsV5.csv) in the same directory as this app.")
        st.code("""
# Files needed:
- tfidf_vectorizer.pkl: Your trained TF-IDF vectorizer
- knn_model.pkl: Your trained KNN model
- goodreadsV5.csv: Your dataset with book information
        """)

# Build the page when this file is executed as a script.
if __name__ == "__main__":
    main()