import os
import sys
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import TruncatedSVD, IncrementalPCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("🎬 Movie Recommender System User-Based")

# ====================== Paths ======================
sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'
cache_dir = data_dir / 'cache'
cache_dir.mkdir(exist_ok=True)

# NOTE(review): movies_path is relative to the working directory while
# ratings_path is anchored to the script directory — confirm intended.
movies_path = 'movies_final.csv'
ratings_path = data_dir / 'ratings.csv'

# Minimum number of ratings for a user / movie to survive filtering.
# (Keep in sync with the hard-coded "450" in the summary markdown below.)
MIN_USER_RATINGS = 450
MIN_MOVIE_RATINGS = 450

# ====================== Load Data ======================
# st.cache_data memoizes the function output so the (large) dataset is not
# reloaded / reprocessed on every Streamlit rerun.
@st.cache_data
def load_data():
    """Load movie metadata from CSV and ratings from Streamlit session state.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (movies, ratings); ratings has
        duplicate (userId, movieId) pairs dropped (first kept) and the id
        columns cast to int.

    NOTE(review): this reads ``st.session_state['ratings_df']`` from inside a
    cached function — the cache key does not include that DataFrame, so a
    changed ratings_df may return a stale cached result. Confirm intended.
    """
    movies = pd.read_csv(movies_path)
    ratings = st.session_state['ratings_df']
    # ratings = ratings.drop(columns=['timestamp'], errors='ignore')
    # Keep only the first rating each user gave to each movie.
    ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
    ratings['userId'] = ratings['userId'].astype(int)
    ratings['movieId'] = ratings['movieId'].astype(int)
    return movies, ratings


movies, ratings = load_data()
# Keep only ratings for movies we actually have metadata for.
ratings = ratings[ratings['movieId'].isin(movies['movieId'].unique())]
st.text(f"Initial ratings shape: {ratings.shape}")

# ====================== Data Filtering ======================
# Step 1: keep only "active" users (at least MIN_USER_RATINGS ratings).
user_counts = ratings['userId'].value_counts()
active_users = user_counts[user_counts >= MIN_USER_RATINGS].index
ratings = ratings[ratings['userId'].isin(active_users)]

# Step 2: keep only "popular" movies (at least MIN_MOVIE_RATINGS ratings,
# counted after the active-user filter).
movie_counts = ratings['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= MIN_MOVIE_RATINGS].index
ratings = ratings[ratings['movieId'].isin(popular_movies)].copy()
st.text(f"Filtered ratings shape: {ratings.shape}")

st.markdown("#### 🎯 Dataset Filtering Summary")
st.markdown("""
We filtered the dataset in **two steps** to focus on active users and popular movies:

1. **Active Users**
   - Kept users who have rated **at least 450 movies**.
   - Ensures we focus on users who are actively engaged.

2. **Popular Movies**
   - Kept movies that have **at least 450 ratings**.
   - Ensures we focus on movies with enough feedback to be meaningful.

**Result:** The final dataset contains ratings from active users on popular movies,
improving the quality of recommendations and decreasing the size of the dataset.
""")

formatted_num = f"{ratings.shape[0]:,}"  # thousands separator, e.g. "1,234,567"

# Show numbers dynamically.
st.markdown(
    f"**📊 Ratings after filtering:** {formatted_num} ratings from "
    f"{ratings['userId'].nunique()} users on {ratings['movieId'].nunique()} movies."
)

# ====================== Index Mapping ======================
# Assign each unique userId / movieId a dense integer code starting at 0 so
# they can be used directly as sparse-matrix row / column indices.
ratings['user_idx'] = ratings['userId'].astype('category').cat.codes
ratings['movie_idx'] = ratings['movieId'].astype('category').cat.codes
num_users = ratings['user_idx'].nunique()
num_movies = ratings['movie_idx'].nunique()

# Reverse maps: dense code -> original id, e.g. {0: 100, 1: 101, 2: 105}.
user_idx_to_id = dict(enumerate(ratings['userId'].astype('category').cat.categories))
movie_idx_to_id = dict(enumerate(ratings['movieId'].astype('category').cat.categories))

# ====================== Sparse Matrix ======================
# Rows -> users, columns -> movies, values -> ratings. Most entries are
# missing (not every user rates every movie), so a sparse matrix stores only
# the non-zero ratings instead of the full dense user x movie grid — a huge
# memory saving over a dense matrix.
user_item_matrix = coo_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(num_users, num_movies),
).tocsr()  # CSR format: fast row slicing for the per-user lookups below.

# ====================== Incremental SVD Caching ======================
SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"


def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
    """Memory-efficient low-rank factorization using IncrementalPCA.

    Processes the user-item matrix in row chunks so the full dense matrix
    never has to fit in memory; the result is cached on disk with joblib and
    reloaded on subsequent runs.

    Args:
        matrix: user x movie ratings matrix (CSR).
        n_components: dimensionality of the latent space.
        batch_size: number of rows densified per partial_fit / transform chunk.

    Returns:
        tuple (U, Sigma, VT):
            U     -- (n_users, n_components) user embeddings.
            Sigma -- vector of ones (placeholder, see NOTE).
            VT    -- (n_components, n_movies) component matrix.

    NOTE(review): IncrementalPCA mean-centers the data, so this is PCA rather
    than a true SVD, and Sigma is a dummy filled with ones. Downstream code
    only uses U — confirm before relying on Sigma/VT elsewhere.
    """
    if SVD_CACHE_PATH.exists():
        U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
        return U, Sigma, VT

    st.text("⚙️ Computing Incremental SVD... (this may take several minutes)")
    ipca = IncrementalPCA(n_components=n_components)
    n_samples = matrix.shape[0]

    # Pass 1: fit the model chunk by chunk (only one chunk is dense at a time).
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        ipca.partial_fit(matrix[start_idx:end_idx].toarray())
        st.text(f"Processed rows {start_idx} to {end_idx}")

    # Pass 2: project every chunk into the latent space.
    U = []
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        U.append(ipca.transform(matrix[start_idx:end_idx].toarray()))
    U = np.vstack(U)

    VT = ipca.components_
    Sigma = np.ones(ipca.n_components_)  # placeholder — not real singular values

    joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
    st.text("✅ SVD computation cached successfully.")
    return U, Sigma, VT


U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
# L2-normalize user embeddings so a plain dot product equals cosine similarity.
U_normalized = normalize(U)


# ====================== Recommendation Function ======================
def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
    """Recommend movies for ``user_id`` via user-based CF in the latent space.

    1. Works in the latent space -> captures hidden user preferences.
    2. Finds the ``n_similar_users`` nearest users by cosine similarity.
    3. Aggregates their ratings, weighted by similarity, so more similar
       users influence recommendations more.
    4. Skips movies the target user has already rated.

    Returns:
        (pd.DataFrame with a 'title' column ranked best-first, float seconds),
        or the string "User not found." and 0 when ``user_id`` is unknown.
    """
    start = time.time()
    if user_id not in ratings['userId'].values:
        return "User not found.", 0

    user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]

    # Cosine similarity against all users (vectors are pre-normalized).
    user_vector = U_normalized[user_idx]
    similarities = np.dot(U_normalized, user_vector)

    # Top similar users, skipping rank 0 (the user themself).
    similar_user_indices = similarities.argsort()[::-1][1:n_similar_users + 1]
    similar_scores = similarities[similar_user_indices]

    user_rated_movie_set = set(user_item_matrix.getrow(user_idx).nonzero()[1])

    # Similarity-weighted sum of neighbors' ratings over unseen movies.
    scores = {}
    for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
        sim_user_ratings = user_item_matrix.getrow(sim_user_idx)
        movie_indices = sim_user_ratings.nonzero()[1]
        for m_idx, rating in zip(movie_indices, sim_user_ratings.data):
            if m_idx not in user_rated_movie_set:
                scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating

    # Sort by aggregated score and take the top N.
    recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
    recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]
    end = time.time()

    # BUGFIX: the previous `movies[movies['movieId'].isin(ids)]` returned
    # titles in movies-table order, silently discarding the ranking. Reindex
    # by the ranked ids so the best recommendation comes first.
    recommendations = (
        movies.set_index('movieId')
        .loc[recommended_movie_ids, ['title']]
        .reset_index(drop=True)
    )
    return recommendations, end - start


# ====================== Streamlit UI ===========================================
user_id_input = st.number_input(
    "Enter User ID",
    min_value=int(ratings['user_idx'].min()),
    max_value=int(ratings['user_idx'].max()),
    step=1,
)
# The widget selects a dense user index; map it back to the original userId.
user_id_input = ratings.loc[ratings['user_idx'] == user_id_input, 'userId'].iloc[0]

if st.button("🎥 Recommend Movies"):
    col1, col2 = st.columns(2)
    with col1:
        st.text('User Activity')
        current_user = ratings[ratings['userId'] == user_id_input]
        merge_results = pd.merge(current_user, movies, how='left', on='movieId')
        st.dataframe(merge_results[['title', 'rating']])
    with col2:
        st.text('Recommended Movies')
        recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
        st.dataframe(recs)
        st.text(f"Computed in {duration:.2f} seconds.")