Spaces:

CSAT
/

bookengine

Sleeping

File size: 7,650 Bytes

20ce80b

import streamlit as st
import pickle
import polars as pl
import re
import pandas as pd
import numpy as np
from collections import Counter

# Page-level settings: set the page title and use the wide layout.
st.set_page_config(page_title="Book Recommendation Engine", layout="wide")

@st.cache_resource
def load_models():
    """Unpickle the TF-IDF vectorizer and the KNN model.

    Both files are opened by relative path ('tfidf_vectorizer.pkl' and
    'knn_model.pkl'). The function is wrapped in ``st.cache_resource``.

    Returns:
        A ``(tfidf, knn_model)`` tuple, in that order.
    """
    # NOTE: pickle.load executes code embedded in the file; only ship
    # model files that come from a trusted source.
    def _unpickle(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    tfidf = _unpickle('tfidf_vectorizer.pkl')
    knn_model = _unpickle('knn_model.pkl')
    return tfidf, knn_model

@st.cache_data
def load_data():
    """Read the book dataset and derive the text feature columns.

    Rows of 'goodreadsV5.csv' with a null 'name', 'summary' or 'genres' are
    dropped. Two columns are then added: 'combined_features' (summary and
    genres joined by a single space) and 'processed_features' (the result
    of ``preprocess_text`` on 'combined_features').

    Returns:
        A ``(polars_frame, pandas_frame)`` tuple holding the same data; the
        pandas copy is used for positional lookup of KNN results.
    """
    required_columns = ['name', 'summary', 'genres']
    combined_expr = (pl.col('summary') + ' ' + pl.col('genres')).alias('combined_features')

    # Null filtering and column concatenation run lazily, in one pass.
    books = (
        pl.scan_csv('goodreadsV5.csv')
        .drop_nulls(subset=required_columns)
        .with_columns([combined_expr])
        .collect()
    )

    # The Python-level cleaner is applied element by element after collect().
    processed_expr = (
        pl.col('combined_features')
        .map_elements(preprocess_text, return_dtype=pl.Utf8)
        .alias('processed_features')
    )
    books = books.with_columns([processed_expr])

    return books, books.to_pandas()

# Matches every character that is NOT an ASCII letter, an ASCII digit or whitespace.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9\s]')


def preprocess_text(text):
    """Normalise *text* for feature extraction.

    The text is lower-cased, then every character other than a-z, 0-9 and
    whitespace is deleted (not replaced by a space, so "sci-fi" becomes
    "scifi").
    """
    lowered = text.lower()
    return _DISALLOWED_CHARS.sub('', lowered)

# Recommendation function for out-of-dataset books
def recommend_books_knn_out_of_dataset(df_pandas, tfidf, knn_model, input_summary, input_genres, top_n=5):
    """Recommend books for a summary/genre description that is not in the dataset.

    The summary and genres are joined by a space, cleaned with
    ``preprocess_text``, vectorised with ``tfidf.transform`` and passed to
    ``knn_model.kneighbors``. The returned neighbour indices are used as
    positional row numbers into ``df_pandas``.

    Args:
        df_pandas: DataFrame with 'name', 'summary' and 'genres' columns.
            Presumably its row order matches the data the KNN model was
            fitted on -- verify against the training script.
        tfidf: Fitted vectorizer exposing ``transform``.
        knn_model: Fitted neighbours model exposing ``kneighbors``.
        input_summary: Free-text summary of the query book.
        input_genres: Genres of the query book as text.
        top_n: Number of neighbours to request.

    Returns:
        A list of dicts with keys 'title', 'summary', 'genres' and
        'similarity_score', in the order returned by the model.
    """
    query_text = preprocess_text(f"{input_summary} {input_genres}")
    query_vector = tfidf.transform([query_text])
    distances, indices = knn_model.kneighbors(query_vector, n_neighbors=top_n)

    recommendations = []
    for row_position, distance in zip(indices.flatten(), distances.flatten()):
        row = df_pandas.iloc[row_position]
        recommendations.append({
            "title": row['name'],
            "summary": row['summary'],
            "genres": row['genres'],
            # 1 - distance; reads as a similarity assuming the model uses a
            # distance bounded by 1 such as cosine -- TODO confirm.
            "similarity_score": 1 - distance,
        })

    return recommendations

# Text pre-filled in the inputs until the user picks one of the examples.
_DEFAULT_SUMMARY = "A fantasy adventure about a young wizard learning magic."
_DEFAULT_GENRES = "fantasy, adventure, magic"

# (tab title, summary, genres) for each ready-made example, in tab order.
_EXAMPLES = (
    (
        "Fantasy Adventure",
        "A magical journey through enchanted lands with dragons and wizards.",
        "fantasy, adventure, magic",
    ),
    (
        "Romance",
        "A love story between two people from different worlds who meet by chance.",
        "romance, contemporary, drama",
    ),
    (
        "Science Fiction",
        "Space explorers discover an alien civilization that challenges their understanding of humanity.",
        "science fiction, space, aliens",
    ),
    (
        "Mystery",
        "A detective investigates a series of mysterious disappearances in a small town.",
        "mystery, thriller, crime",
    ),
)


def _set_example(summary, genres):
    """Remember the chosen example in session state and rerun the script."""
    st.session_state['example_summary'] = summary
    st.session_state['example_genres'] = genres
    st.session_state['run_example'] = True
    st.rerun()


def _render_examples():
    """Draw one tab per entry of ``_EXAMPLES`` with a 'Use this example' button."""
    st.subheader("Try these examples")
    tabs = st.tabs([title for title, _, _ in _EXAMPLES])

    for number, (tab, (_, summary, genres)) in enumerate(zip(tabs, _EXAMPLES), start=1):
        with tab:
            st.write(summary)
            st.write(f"Genres: {genres}")
            # Keys ex1..ex4 keep the identically-labelled buttons distinct.
            if st.button("Use this example", key=f"ex{number}"):
                _set_example(summary, genres)


def _render_recommendations(recommendations):
    """Show each recommended book in its own numbered expander."""
    st.subheader("📚 Your Recommended Books")

    for rank, book in enumerate(recommendations, start=1):
        with st.expander(f"{rank}. {book['title']}"):
            st.markdown(f"**Summary:** {book['summary']}")
            st.markdown(f"**Genres:** {book['genres']}")


def main():
    """Render the recommendation page.

    Loads the models and dataset, shows the summary/genres inputs and the
    recommendation results, and offers ready-made examples. Load failures
    are reported together with a reminder of the files the app needs; any
    other error is reported on its own.
    """
    st.title("📚 Book Recommendation Engine")

    # Initialize session state variables if they don't exist
    for key, initial in (
        ('example_summary', ""),
        ('example_genres', ""),
        ('run_example', False),
    ):
        if key not in st.session_state:
            st.session_state[key] = initial

    # Only the loading step gets the "missing files" hint, so unrelated
    # errors further down are not misreported as a missing file.
    try:
        tfidf, knn_model = load_models()
        _, df_pandas = load_data()
    except Exception as e:
        st.error(f"An error occurred: {e}")
        st.info("Make sure you have the required model files (tfidf_vectorizer.pkl, knn_model.pkl) and dataset (goodreadsV5.csv) in the same directory as this app.")
        st.code("""
# Files needed:
- tfidf_vectorizer.pkl: Your trained TF-IDF vectorizer
- knn_model.pkl: Your trained KNN model
- goodreadsV5.csv: Your dataset with book information
        """)
        return

    try:
        # Consume the one-shot flag up front so a failure below cannot leave
        # it set and re-trigger the same failing run on every rerun.
        run_example = st.session_state['run_example']
        st.session_state['run_example'] = False

        # Keep the last chosen example as the default for the rest of the
        # session. These widgets have no key, so changing their default
        # value makes Streamlit treat them as new widgets and discards
        # whatever the user typed.
        default_summary = st.session_state['example_summary'] or _DEFAULT_SUMMARY
        default_genres = st.session_state['example_genres'] or _DEFAULT_GENRES

        # Main content
        st.subheader("Find Book Recommendations")
        st.write("Enter a book summary and genres to get personalized recommendations.")

        col1, col2 = st.columns(2)

        with col1:
            input_summary = st.text_area("Book Summary", default_summary, height=150)

        with col2:
            input_genres = st.text_input("Genres (comma-separated)", default_genres)
            num_recommendations = st.slider("Number of Recommendations",
                                            min_value=1, max_value=20, value=5)

        # Always render the button; evaluating it inside an `or` would hide
        # it on runs triggered by an example.
        clicked = st.button("Get Recommendations", type="primary")

        if run_example or clicked:
            if not (input_summary.strip() or input_genres.strip()):
                # A blank query carries no terms for the vectorizer to match.
                st.warning("Please enter a book summary or at least one genre.")
            else:
                with st.spinner("Finding the best book matches for you..."):
                    recommendations = recommend_books_knn_out_of_dataset(
                        df_pandas, tfidf, knn_model, input_summary, input_genres, num_recommendations
                    )
                _render_recommendations(recommendations)

        _render_examples()

    except Exception as e:
        st.error(f"An error occurred: {e}")

if __name__ == "__main__":
    main()