import streamlit as st
import pickle
import polars as pl
import re
import requests
from io import BytesIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

# Set page configuration
st.set_page_config(
    page_title="Book Recommendation System",
    page_icon="📚",
    layout="wide"
)

# GitHub URLs for model files and dataset
GITHUB_CSV_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/goodreadsV2.csv"
GITHUB_KNN_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/knn_model.pkl"
GITHUB_TFIDF_URL = "https://raw.githubusercontent.com/Manithj/bookRecEngine/main/tfidf_vectorizer.pkl"

# Local file paths for saved models and dataset
MODEL_DIR = "models"
DATA_DIR = "data"
KNN_PATH = os.path.join(MODEL_DIR, "knn_model.pkl")
TFIDF_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
CSV_PATH = os.path.join(DATA_DIR, "goodreadsV2.csv")

# Create directories if they don't exist
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)

# Define the preprocessing function
def preprocess_text(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

# Download and save files if they don't exist locally
def download_and_save_file(url, save_path, is_binary=True):
    if not os.path.exists(save_path):
        with st.spinner(f"Downloading {os.path.basename(save_path)}..."):
            response = requests.get(url)
            if response.status_code == 200:
                mode = "wb" if is_binary else "w"
                with open(save_path, mode) as f:
                    f.write(response.content)
                st.success(f"Downloaded {os.path.basename(save_path)}")
                # Add a small delay to ensure the file is completely written
                time.sleep(1)
            else:
                st.error(f"Failed to download from {url}, status code: {response.status_code}")
                return False
    return True

# Load models from local storage or download if needed
@st.cache_resource
def load_models():
    try:
        # Download models if they don't exist locally
        tfidf_downloaded = download_and_save_file(GITHUB_TFIDF_URL, TFIDF_PATH)
        knn_downloaded = download_and_save_file(GITHUB_KNN_URL, KNN_PATH)

        if not (tfidf_downloaded and knn_downloaded):
            return None, None

        # Load models from local storage
        with open(TFIDF_PATH, 'rb') as f:
            tfidf = pickle.load(f)
        with open(KNN_PATH, 'rb') as f:
            knn_model = pickle.load(f)

        return tfidf, knn_model
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None

# Load the dataset from local storage or download if needed
@st.cache_data
def load_data():
    try:
        # Download dataset if it doesn't exist locally
        csv_downloaded = download_and_save_file(GITHUB_CSV_URL, CSV_PATH, is_binary=True)

        if not csv_downloaded:
            return None

        # Load CSV from local storage
        df_cleaned = pl.read_csv(CSV_PATH)

        # Clean and prepare the data
        df_cleaned = df_cleaned.drop_nulls(subset=['name', 'summary', 'genres'])
        df_cleaned = df_cleaned.with_columns([
            (pl.col('summary') + ' ' + pl.col('genres')).alias('combined_features')
        ])

        # Apply preprocessing
        df_cleaned = df_cleaned.with_columns([
            pl.col('combined_features')
            .map_elements(preprocess_text, return_dtype=pl.Utf8)
            .alias('processed_features')
        ])

        return df_cleaned
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return None

# Load models and data at startup - this happens only once due to caching
with st.spinner("Loading models and data (this will only happen once)..."):
    tfidf, knn_model = load_models()
    df_cleaned = load_data()

if tfidf is not None and knn_model is not None and df_cleaned is not None:
    models_loaded = True
else:
    models_loaded = False

# App title and description
st.title("📚 Book Recommendation System")
st.markdown("Enter a book summary and genres to get personalized book recommendations!")

if not models_loaded:
    st.error("Failed to load models or data. Please check the file paths and URLs.")
else:
    st.success("Models and data loaded successfully!")

# Recommendation function for out-of-dataset books
def recommend_books_knn_out_of_dataset(input_summary, input_genres, top_n=5):
    # Combine and preprocess the input book's features
    combined_input = f"{input_summary} {input_genres}"
    processed_input = preprocess_text(combined_input)

    # Transform the input book's features using the loaded TF-IDF vectorizer
    input_vector = tfidf.transform([processed_input])

    # Find the nearest neighbors using the loaded KNN model
    distances, indices = knn_model.kneighbors(input_vector, n_neighbors=top_n)

    # Retrieve the recommended book titles and additional information
    recommendations = []
    for i, idx in enumerate(indices.flatten()):
        book_info = {
            "title": df_cleaned['name'][idx],
            "summary": df_cleaned['summary'][idx],
            "genres": df_cleaned['genres'][idx],
            "similarity_score": 1 - distances.flatten()[i]  # Convert distance to similarity
        }
        recommendations.append(book_info)

    return recommendations

# Sidebar for inputs
st.sidebar.header("Input Parameters")

# Input fields (the keys let the example buttons below pre-fill these widgets)
input_summary = st.sidebar.text_area(
    "Book Summary",
    placeholder="Enter a brief summary of the book...",
    height=150,
    key="input_summary"
)
input_genres = st.sidebar.text_input(
    "Genres",
    placeholder="E.g., fantasy, adventure, mystery",
    key="input_genres"
)

# Number of recommendations slider
num_recommendations = st.sidebar.slider("Number of Recommendations", min_value=1, max_value=10, value=5)

# Get recommendations button
if st.sidebar.button("Get Recommendations") and models_loaded:
    if input_summary and input_genres:
        with st.spinner("Finding the perfect books for you..."):
            # Get recommendations
            recommendations = recommend_books_knn_out_of_dataset(
                input_summary, input_genres, top_n=num_recommendations
            )

            # Display recommendations
            st.header("Recommended Books")

            # Create columns for book cards
            cols = st.columns(min(3, num_recommendations))

            for i, book in enumerate(recommendations):
                col_idx = i % 3
                with cols[col_idx]:
                    st.subheader(book["title"])
                    st.markdown(f"**Genres:** {book['genres']}")
                    st.markdown(f"**Similarity Score:** {book['similarity_score']:.2f}")
                    with st.expander("Summary"):
                        st.write(book["summary"])
                    st.divider()

            # Visualization of similarity scores
            st.header("Similarity Scores")
            fig, ax = plt.subplots(figsize=(10, 5))

            book_titles = [book["title"] for book in recommendations]
            similarity_scores = [book["similarity_score"] for book in recommendations]

            # Create horizontal bar chart
            sns.barplot(x=similarity_scores, y=book_titles, palette="viridis", ax=ax)
            ax.set_xlabel("Similarity Score")
            ax.set_ylabel("Book Title")
            ax.set_title("Book Recommendation Similarity Scores")

            st.pyplot(fig)
    else:
        st.warning("Please enter both a summary and genres to get recommendations.")

# Add some information about the app
st.sidebar.markdown("---")
st.sidebar.header("About")
st.sidebar.info(
    """
This app uses TF-IDF vectorization and K-Nearest Neighbors to recommend books
based on your input summary and genres.

The recommendations are based on textual similarity between your input and our
database of books from Goodreads.

Models and data are stored locally on the server after initial download.
""" ) # Add example inputs for quick testing st.sidebar.markdown("---") st.sidebar.header("Try these examples") if st.sidebar.button("Example 1: Fantasy Adventure"): st.sidebar.text_area("Book Summary", value="A young wizard discovers his magical powers and embarks on a journey to defeat a dark lord threatening the world.", height=150, key="example1_summary") st.sidebar.text_input("Genres", value="fantasy, adventure, magic", key="example1_genres") if st.sidebar.button("Example 2: Mystery Thriller"): st.sidebar.text_area("Book Summary", value="A detective investigates a series of murders that seem to be connected to an unsolved case from decades ago.", height=150, key="example2_summary") st.sidebar.text_input("Genres", value="mystery, thriller, crime", key="example2_genres") # Add a footer st.markdown("---") st.markdown("📚 Book Recommendation System | Created with Streamlit")