# bookengine / app.py
# CSAT's picture
# Update app.py
# 20ce80b verified
import streamlit as st
import pickle
import polars as pl
import re
import pandas as pd
import numpy as np
from collections import Counter
# Page-level Streamlit settings: set the page title and use the wide layout.
st.set_page_config(page_title="Book Recommendation Engine", layout="wide")
@st.cache_resource
def load_models():
    """Load the pickled TF-IDF vectorizer and KNN model.

    Both files are read from the current working directory. The function is
    wrapped in ``st.cache_resource``.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file,
    so both ``.pkl`` files must come from a trusted source.

    Returns:
        tuple: ``(tfidf, knn_model)`` in that order.
    """
    def _unpickle(path):
        # Read a single pickled object; the file is closed on exit.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    vectorizer = _unpickle('tfidf_vectorizer.pkl')
    neighbours = _unpickle('knn_model.pkl')
    return vectorizer, neighbours
@st.cache_data
def load_data():
    """Read the book dataset and derive the text feature columns.

    Steps:
      1. Lazily scan ``goodreadsV5.csv`` and drop rows where ``name``,
         ``summary`` or ``genres`` is null.
      2. Add ``combined_features`` = ``summary`` + ``' '`` + ``genres``.
      3. Materialise the frame, then add ``processed_features`` by running
         ``preprocess_text`` over ``combined_features``.

    Returns:
        tuple: ``(polars_frame, pandas_frame)`` holding the same rows; the
        pandas copy is what the recommender indexes by position.
    """
    required_columns = ['name', 'summary', 'genres']
    combined = (pl.col('summary') + ' ' + pl.col('genres')).alias('combined_features')
    processed = (
        pl.col('combined_features')
        .map_elements(preprocess_text, return_dtype=pl.Utf8)
        .alias('processed_features')
    )

    books = (
        pl.scan_csv('goodreadsV5.csv')
        .drop_nulls(subset=required_columns)
        .with_columns([combined])
        .collect()
    )
    # The Python-level text cleanup runs on the collected (eager) frame.
    books = books.with_columns([processed])

    return books, books.to_pandas()
# Text normalisation applied to both the dataset rows and the user's query
def preprocess_text(text):
    """Lowercase ``text`` and delete every character that is not an ASCII
    letter, a digit, or whitespace.

    Whitespace is kept exactly as given (no collapsing or trimming).

    Args:
        text: The string to normalise.

    Returns:
        str: The lowercased, filtered string.
    """
    lowered = text.lower()
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', lowered)
# Recommend dataset books for a free-text description that is not itself in the dataset
def recommend_books_knn_out_of_dataset(df_pandas, tfidf, knn_model, input_summary, input_genres, top_n=5):
    """Return the ``top_n`` nearest dataset books for a summary/genre query.

    The summary and genres are joined with a space, normalised with
    ``preprocess_text``, vectorised with ``tfidf.transform`` and passed to
    ``knn_model.kneighbors``.

    Args:
        df_pandas: pandas DataFrame with ``name``, ``summary`` and ``genres``
            columns. Rows are looked up by position, so the row order is
            presumably the one the KNN model was fitted on (verify against
            the training code).
        tfidf: Fitted vectorizer exposing ``transform``.
        knn_model: Fitted neighbour model exposing ``kneighbors``.
        input_summary: Free-text summary of the query book.
        input_genres: Genre text of the query book.
        top_n: Number of neighbours to request.

    Returns:
        list[dict]: One dict per neighbour, in the order returned by the
        model, with keys ``title``, ``summary``, ``genres`` and
        ``similarity_score`` (``1 - distance``; NOTE(review): this is a
        similarity only if the model uses a distance bounded by 1, e.g.
        cosine, so confirm the metric).
    """
    query_text = preprocess_text(f"{input_summary} {input_genres}")
    query_vector = tfidf.transform([query_text])
    distances, indices = knn_model.kneighbors(query_vector, n_neighbors=top_n)

    def _as_record(row_position, distance):
        # Fetch the row once and project it onto the recommendation schema.
        row = df_pandas.iloc[row_position]
        return {
            "title": row['name'],
            "summary": row['summary'],
            "genres": row['genres'],
            "similarity_score": 1 - distance,
        }

    return [
        _as_record(row_position, distance)
        for row_position, distance in zip(indices.flatten(), distances.flatten())
    ]
# Book-stack emoji, written as an escape so it survives re-encoding of this file.
_BOOKS_EMOJI = "\U0001F4DA"

# Text shown in the two input widgets before the user types or picks an example.
_DEFAULT_SUMMARY = "A fantasy adventure about a young wizard learning magic."
_DEFAULT_GENRES = "fantasy, adventure, magic"

# One-click examples as (tab label, summary, genres); button keys are ex1..exN.
_EXAMPLES = (
    (
        "Fantasy Adventure",
        "A magical journey through enchanted lands with dragons and wizards.",
        "fantasy, adventure, magic",
    ),
    (
        "Romance",
        "A love story between two people from different worlds who meet by chance.",
        "romance, contemporary, drama",
    ),
    (
        "Science Fiction",
        "Space explorers discover an alien civilization that challenges their understanding of humanity.",
        "science fiction, space, aliens",
    ),
    (
        "Mystery",
        "A detective investigates a series of mysterious disappearances in a small town.",
        "mystery, thriller, crime",
    ),
)

# Shown when the model or dataset files cannot be loaded.
_REQUIRED_FILES_HELP = """
# Files needed:
- tfidf_vectorizer.pkl: Your trained TF-IDF vectorizer
- knn_model.pkl: Your trained KNN model
- goodreadsV5.csv: Your dataset with book information
"""


def _init_session_state():
    """Seed the session-state keys used by the input widgets and the example flag."""
    defaults = {
        'input_summary': _DEFAULT_SUMMARY,
        'input_genres': _DEFAULT_GENRES,
        'run_example': False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _apply_example(summary, genres):
    """Button callback: copy an example into the inputs and request a run.

    Runs as an ``on_click`` callback, i.e. before the script reruns, which is
    the point at which Streamlit allows writing to keys owned by widgets.
    """
    st.session_state['input_summary'] = summary
    st.session_state['input_genres'] = genres
    st.session_state['run_example'] = True


def _render_search(df_pandas, tfidf, knn_model):
    """Render the query form and, when requested, the recommendation list."""
    st.subheader("Find Book Recommendations")
    st.write("Enter a book summary and genres to get personalized recommendations.")

    col1, col2 = st.columns(2)
    with col1:
        # Keyed widgets keep their content across reruns; their initial value
        # comes from session state rather than from a changing default.
        input_summary = st.text_area("Book Summary", key='input_summary', height=150)
    with col2:
        input_genres = st.text_input("Genres (comma-separated)", key='input_genres')
        num_recommendations = st.slider(
            "Number of Recommendations", min_value=1, max_value=20, value=5
        )

    # Render the button unconditionally so it stays on screen on
    # example-triggered runs as well.
    clicked = st.button("Get Recommendations", type="primary")

    # Consume the one-shot example flag before doing any work, so a failing
    # query is not retried on every subsequent rerun.
    example_requested = st.session_state['run_example']
    st.session_state['run_example'] = False

    if not (clicked or example_requested):
        return
    if not (input_summary.strip() or input_genres.strip()):
        # A blank query has no terms to match against.
        st.warning("Enter a summary or at least one genre to get recommendations.")
        return

    with st.spinner("Finding the best book matches for you..."):
        recommendations = recommend_books_knn_out_of_dataset(
            df_pandas, tfidf, knn_model, input_summary, input_genres, num_recommendations
        )

    st.subheader(f"{_BOOKS_EMOJI} Your Recommended Books")
    for position, book in enumerate(recommendations, start=1):
        with st.expander(f"{position}. {book['title']}"):
            st.markdown(f"**Summary:** {book['summary']}")
            st.markdown(f"**Genres:** {book['genres']}")


def _render_examples():
    """Render one tab per entry in ``_EXAMPLES`` with a button that applies it."""
    st.subheader("Try these examples")
    tabs = st.tabs([label for label, _, _ in _EXAMPLES])
    for number, (tab, (_, summary, genres)) in enumerate(zip(tabs, _EXAMPLES), start=1):
        with tab:
            st.write(summary)
            st.write(f"Genres: {genres}")
            st.button(
                "Use this example",
                key=f"ex{number}",
                on_click=_apply_example,
                args=(summary, genres),
            )


def main():
    """Entry point of the Streamlit app.

    Loads the models and dataset, renders the query form with its results,
    and renders the example tabs. A load failure is reported together with
    the list of required files; any later failure is reported as a plain
    error message.
    """
    st.title(f"{_BOOKS_EMOJI} Book Recommendation Engine")
    _init_session_state()

    try:
        tfidf, knn_model = load_models()
        _, df_pandas = load_data()
    except Exception as e:
        st.error(f"An error occurred: {e}")
        st.info("Make sure you have the required model files (tfidf_vectorizer.pkl, knn_model.pkl) and dataset (goodreadsV5.csv) in the same directory as this app.")
        st.code(_REQUIRED_FILES_HELP)
        return

    try:
        _render_search(df_pandas, tfidf, knn_model)
        _render_examples()
    except Exception as e:
        # Top-level boundary: show the message instead of a raw traceback.
        st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()