"""Streamlit app that recommends books for a free-text summary and genre list.

The app loads a pickled TF-IDF vectorizer and a pickled KNN model, vectorizes
the user's input with the same preprocessing that is applied to the dataset,
and shows the nearest books from the dataset.
"""
import pickle
import re
from collections import Counter

import numpy as np
import pandas as pd
import polars as pl
import streamlit as st

# Artifact locations, shared by the loaders and by the troubleshooting help so
# the help text can never name a different file than the one actually opened.
TFIDF_PATH = 'tfidf_vectorizer.pkl'
KNN_MODEL_PATH = 'knn_model.pkl'
DATASET_PATH = 'goodreadsV5.csv'

# Text shown in the input widgets on first load.
DEFAULT_SUMMARY = "A fantasy adventure about a young wizard learning magic."
DEFAULT_GENRES = "fantasy, adventure, magic"

# (tab label, summary, genres) for each one-click example.
EXAMPLES = (
    (
        "Fantasy Adventure",
        "A magical journey through enchanted lands with dragons and wizards.",
        "fantasy, adventure, magic",
    ),
    (
        "Romance",
        "A love story between two people from different worlds who meet by chance.",
        "romance, contemporary, drama",
    ),
    (
        "Science Fiction",
        "Space explorers discover an alien civilization that challenges their understanding of humanity.",
        "science fiction, space, aliens",
    ),
    (
        "Mystery",
        "A detective investigates a series of mysterious disappearances in a small town.",
        "mystery, thriller, crime",
    ),
)

# Compiled once: preprocess_text runs once per dataset row in load_data().
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')

st.set_page_config(page_title="Book Recommendation Engine", layout="wide")


def preprocess_text(text):
    """Return *text* lowercased with everything but a-z, 0-9 and whitespace removed."""
    return _NON_ALNUM_RE.sub('', text.lower())


@st.cache_resource
def load_models():
    """Load the pickled TF-IDF vectorizer and KNN model from the working directory.

    Returns:
        A ``(tfidf, knn_model)`` tuple.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    # NOTE(security): pickle.load can execute arbitrary code. Only ship model
    # files that were produced by a trusted training run.
    with open(TFIDF_PATH, 'rb') as f:
        tfidf = pickle.load(f)

    with open(KNN_MODEL_PATH, 'rb') as f:
        knn_model = pickle.load(f)

    return tfidf, knn_model


@st.cache_data
def load_data():
    """Load the book dataset and derive the text features used for matching.

    Rows with a null ``name``, ``summary`` or ``genres`` are dropped. A
    ``combined_features`` column (summary + space + genres) and a
    ``processed_features`` column (``preprocess_text`` of the former) are added.

    Returns:
        A ``(polars_frame, pandas_frame)`` tuple holding the same rows; the
        pandas copy is used for positional lookups of KNN results.
    """
    df_cleaned = (
        pl.scan_csv(DATASET_PATH)
        .drop_nulls(subset=['name', 'summary', 'genres'])
        .with_columns([
            (pl.col('summary') + ' ' + pl.col('genres')).alias('combined_features')
        ])
    ).collect()

    df_cleaned = df_cleaned.with_columns([
        pl.col('combined_features')
        .map_elements(preprocess_text, return_dtype=pl.Utf8)
        .alias('processed_features')
    ])

    df_pandas = df_cleaned.to_pandas()

    return df_cleaned, df_pandas


def recommend_books_knn_out_of_dataset(df_pandas, tfidf, knn_model, input_summary, input_genres, top_n=5):
    """Recommend dataset books for a book described only by summary and genres.

    Args:
        df_pandas: pandas DataFrame with ``name``, ``summary`` and ``genres``
            columns. Neighbor indices are used as row positions into it
            (assumes the model was fit on these rows in this order -- TODO confirm).
        tfidf: Vectorizer object exposing ``transform``.
        knn_model: Neighbors model exposing ``kneighbors``.
        input_summary: Free-text summary of the query book.
        input_genres: Genre text of the query book.
        top_n: Number of neighbors to request.

    Returns:
        A list of dicts with keys ``title``, ``summary``, ``genres`` and
        ``similarity_score``, in the order the model returned the neighbors.
    """
    # The query goes through the same preprocessing as the dataset rows.
    processed_input = preprocess_text(f"{input_summary} {input_genres}")
    input_vector = tfidf.transform([processed_input])

    distances, indices = knn_model.kneighbors(input_vector, n_neighbors=top_n)

    recommendations = []
    for idx, distance in zip(indices.flatten(), distances.flatten()):
        row = df_pandas.iloc[idx]  # one positional lookup per neighbor
        recommendations.append({
            "title": row['name'],
            "summary": row['summary'],
            "genres": row['genres'],
            # Distance turned into a score; presumably the model uses a
            # distance bounded by 1 (e.g. cosine) -- verify against training.
            "similarity_score": 1 - distance,
        })

    return recommendations


def _apply_example(summary, genres):
    """Button callback: load an example into the input widgets and request a run."""
    # Callbacks run before the script reruns, which is the point at which a
    # keyed widget's value may be assigned through session state.
    st.session_state['input_summary'] = summary
    st.session_state['input_genres'] = genres
    st.session_state['run_example'] = True


def main():
    """Render the page: input form, recommendation results and example tabs."""
    st.title("📚 Book Recommendation Engine")

    # Seed session state on first run only.
    initial_state = (
        ('input_summary', DEFAULT_SUMMARY),
        ('input_genres', DEFAULT_GENRES),
        ('run_example', False),
    )
    for state_key, initial_value in initial_state:
        if state_key not in st.session_state:
            st.session_state[state_key] = initial_value

    try:
        tfidf, knn_model = load_models()
        df_cleaned, df_pandas = load_data()

        st.subheader("Find Book Recommendations")
        st.write("Enter a book summary and genres to get personalized recommendations.")

        # The inputs are keyed so their value lives in session state. Deriving
        # an un-keyed widget's default from a one-shot flag made the fields
        # snap back to the initial text on the next interaction.
        col1, col2 = st.columns(2)
        with col1:
            input_summary = st.text_area("Book Summary", key='input_summary', height=150)
        with col2:
            input_genres = st.text_input("Genres (comma-separated)", key='input_genres')
            num_recommendations = st.slider("Number of Recommendations", min_value=1, max_value=20, value=5)

        # Create the button unconditionally; putting st.button() on the right
        # of `or` would skip rendering it whenever an example triggers the run.
        button_clicked = st.button("Get Recommendations", type="primary")

        if st.session_state['run_example'] or button_clicked:
            # One-shot flag: consume it so the example does not rerun itself.
            st.session_state['run_example'] = False

            if not (input_summary.strip() or input_genres.strip()):
                st.warning("Enter a summary or at least one genre to get recommendations.")
            else:
                with st.spinner("Finding the best book matches for you..."):
                    recommendations = recommend_books_knn_out_of_dataset(
                        df_pandas,
                        tfidf,
                        knn_model,
                        input_summary,
                        input_genres,
                        num_recommendations,
                    )

                st.subheader("📚 Your Recommended Books")
                for rank, book in enumerate(recommendations, start=1):
                    with st.expander(f"{rank}. {book['title']}"):
                        st.markdown(f"**Summary:** {book['summary']}")
                        st.markdown(f"**Genres:** {book['genres']}")

        st.subheader("Try these examples")
        example_tabs = st.tabs([label for label, _, _ in EXAMPLES])
        for number, (tab, (_, summary, genres)) in enumerate(zip(example_tabs, EXAMPLES), start=1):
            with tab:
                st.write(summary)
                st.write(f"Genres: {genres}")
                st.button(
                    "Use this example",
                    key=f"ex{number}",
                    on_click=_apply_example,
                    args=(summary, genres),
                )

    except Exception as e:
        # Top-level UI boundary: report the failure on the page instead of
        # letting the script crash, and point at the most common cause.
        st.error(f"An error occurred: {e}")
        st.info(
            f"Make sure you have the required model files ({TFIDF_PATH}, {KNN_MODEL_PATH}) "
            f"and dataset ({DATASET_PATH}) in the same directory as this app."
        )
        st.code("\n".join([
            "# Files needed:",
            f"- {TFIDF_PATH}: Your trained TF-IDF vectorizer",
            f"- {KNN_MODEL_PATH}: Your trained KNN model",
            f"- {DATASET_PATH}: Your dataset with book information",
        ]))


if __name__ == "__main__":
    main()