# collaborative_user.py - Movie Recommender System (user-based collaborative filtering page)
# NOTE(review): removed non-Python residue pasted from the hosting page
# ("SnehaJais's picture / Initial Commit / c37645b verified"), which was a
# SyntaxError at import time.
import pandas as pd
import numpy as np
import streamlit as st
from pathlib import Path
import sys
import os
import time
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import TruncatedSVD, IncrementalPCA
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import joblib
# ====================== Streamlit Setup ======================
# Must run before any other Streamlit call in the script.
st.set_page_config(layout="wide")
st.title("🎬 Movie Recommender System User-Based")
# ====================== Paths ======================
# Make sibling modules importable when this script is run directly.
sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'
cache_dir = data_dir / 'cache'
cache_dir.mkdir(exist_ok=True)
# NOTE(review): movies_final.csv is resolved against the current working
# directory, while ratings.csv is anchored to this file's folder — confirm
# this asymmetry is intentional.
movies_path = 'movies_final.csv'
ratings_path = data_dir / 'ratings.csv'
# The @st.cache_data decorator below caches the output of the data-loading
# function so it is not recomputed on every Streamlit rerun — important
# because the ratings dataset is large.
@st.cache_data
def load_data():
    """Load the movies CSV and the ratings DataFrame.

    Returns:
        (movies, ratings): ``movies`` read from ``movies_final.csv``;
        ``ratings`` taken from Streamlit session state, de-duplicated per
        (userId, movieId) pair and with integer id columns.
    """
    movies = pd.read_csv(movies_path)
    # Assumes an upstream page stored the ratings DataFrame in session state
    # under 'ratings_df' — TODO confirm against the app's entry page.
    ratings = st.session_state['ratings_df']
    # ratings = ratings.drop(columns=['timestamp'], errors='ignore')
    # Keep only the first rating a user gave a movie.
    ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
    ratings['userId'] = ratings['userId'].astype(int)
    ratings['movieId'] = ratings['movieId'].astype(int)
    return movies, ratings
movies, ratings = load_data()
# Restrict ratings to movies we actually have metadata for.
known_movie_ids = movies['movieId'].unique().tolist()
ratings = ratings[ratings['movieId'].isin(known_movie_ids)]
st.text(f"Initial ratings shape: {ratings.shape}")
# ====================== Data Filtering ======================
# Step 1: keep only "active" users — at least 450 ratings each.
per_user = ratings['userId'].value_counts()
keep_users = per_user[per_user >= 450].index
filtered = ratings[ratings['userId'].isin(keep_users)]
# Step 2: among those users, keep only "popular" movies — at least 450 ratings.
per_movie = filtered['movieId'].value_counts()
keep_movies = per_movie[per_movie >= 450].index
filtered = filtered[filtered['movieId'].isin(keep_movies)]
# Work on an independent copy and release the intermediate frame.
ratings = filtered.copy()
del filtered
st.text(f"Filtered ratings shape: {ratings.shape}")
# Describe the two-step filtering to the user; numbers below are dynamic.
st.markdown("#### 🎯 Dataset Filtering Summary")
st.markdown("""
We filtered the dataset in **two steps** to focus on active users and popular movies:
1. **Active Users**
- Kept users who have rated **at least 450 movies**.
- Ensures we focus on users who are actively engaged.
2. **Popular Movies**
- Kept movies that have **at least 450 ratings**.
- Ensures we focus on movies with enough feedback to be meaningful.
**Result:**
The final dataset contains ratings from active users on popular movies, improving the quality of recommendations and decreasing the size of the dataset.
""")
# Thousands-separated rating count. Uses an f-string instead of str.format;
# the leftover debug print() to the server console has been removed.
formatted_num = f"{ratings.shape[0]:,}"
# Optional: Show numbers dynamically
st.markdown(f"**πŸ“Š Ratings after filtering:** {formatted_num} ratings from {ratings['userId'].nunique()} users on {ratings['movieId'].nunique()} movies.")
# ====================== Index Mapping ======================
# Give every unique userId/movieId a dense integer code starting at 0, so
# they can index rows/columns of the sparse matrix built below.
user_cats = ratings['userId'].astype('category')
movie_cats = ratings['movieId'].astype('category')
ratings['user_idx'] = user_cats.cat.codes
ratings['movie_idx'] = movie_cats.cat.codes
num_users = ratings['user_idx'].nunique()
num_movies = ratings['movie_idx'].nunique()
# Reverse lookup tables from dense code back to original id,
# e.g. {0: 100, 1: 101, 2: 105}
user_idx_to_id = dict(enumerate(user_cats.cat.categories))
movie_idx_to_id = dict(enumerate(movie_cats.cat.categories))
# ====================== Sparse Matrix ======================
# Users on the rows, movies on the columns, ratings as the stored values.
# Almost every cell is empty (no user rates every movie), so a sparse
# matrix stores only the observed ratings — a huge memory saving over the
# dense users x movies array.
user_item_matrix = csr_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(num_users, num_movies)
)
# CSR gives the fast row slicing needed for the factorization and
# nearest-neighbour lookups below (other sparse layouts: COO, CSC).
# ====================== Incremental SVD Caching ======================
SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"

def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
    """Memory-efficient incremental decomposition via IncrementalPCA.

    Fits on row chunks of ``matrix`` so the dense form never has to fit in
    memory at once, then transforms all rows chunk by chunk.

    Args:
        matrix: user x item ratings matrix (CSR).
        n_components: number of latent dimensions to keep.
        batch_size: rows per chunk when fitting/transforming.

    Returns:
        (U, Sigma, VT): ``U`` is the (n_users, n_components) latent user
        matrix and ``VT`` the components. ``Sigma`` is a placeholder of
        ones — this is a PCA projection rather than a true SVD, and only
        ``U`` is used downstream.
    """
    if SVD_CACHE_PATH.exists():
        U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
        # BUGFIX: only reuse the cache when it matches the requested shape.
        # Previously a stale cache (different data size or n_components)
        # was returned unconditionally.
        if U.shape == (matrix.shape[0], n_components):
            return U, Sigma, VT
    st.text("βš™οΈ Computing Incremental SVD... (this may take several minutes)")
    ipca = IncrementalPCA(n_components=n_components)
    n_samples = matrix.shape[0]
    # BUGFIX: partial_fit() requires each chunk to contain at least
    # n_components rows; a short trailing chunk used to raise. Merge it
    # into the previous chunk instead.
    starts = list(range(0, n_samples, batch_size))
    if len(starts) > 1 and n_samples - starts[-1] < n_components:
        starts.pop()
    for i, start_idx in enumerate(starts):
        end_idx = starts[i + 1] if i + 1 < len(starts) else n_samples
        ipca.partial_fit(matrix[start_idx:end_idx].toarray())
        st.text(f"Processed rows {start_idx} to {end_idx}")
    # Transform the full data in chunks (transform has no minimum size).
    U_parts = []
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        U_parts.append(ipca.transform(matrix[start_idx:end_idx].toarray()))
    U = np.vstack(U_parts)
    VT = ipca.components_
    Sigma = np.ones(ipca.n_components_)  # placeholder; see docstring
    joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
    st.text("βœ… SVD computation cached successfully.")
    return U, Sigma, VT

U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
# Row-normalize so the dot products below equal cosine similarity.
U_normalized = normalize(U)
# ====================== Recommendation Function ======================
def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
    """Recommend movies for ``user_id`` via user-based CF in latent space.

    1. Looks up the user's latent vector; since U_normalized rows are
       unit-length, a dot product against all rows is cosine similarity.
    2. Takes the ``n_similar_users`` most similar users (position 0 of the
       ranking is the user themselves and is skipped) and aggregates their
       ratings, weighted by similarity — more similar users count for more.
    3. Drops movies the user has already rated and returns the ``top_n``
       highest-scoring titles plus the elapsed time in seconds.

    Returns:
        (DataFrame[['title']] in descending score order, elapsed_seconds),
        or ("User not found.", 0) for an unknown user id.
    """
    start = time.time()
    if user_id not in ratings['userId'].values:
        return "User not found.", 0
    user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]
    # Cosine similarity of this user against every user in latent space.
    user_vector = U_normalized[user_idx]
    similarities = np.dot(U_normalized, user_vector)
    # Most similar first; index 0 is the user themselves, so skip it.
    similar_user_indices = similarities.argsort()[::-1][1:n_similar_users + 1]
    similar_scores = similarities[similar_user_indices]
    user_rated_movie_set = set(user_item_matrix.getrow(user_idx).nonzero()[1])
    # Similarity-weighted aggregation of the neighbours' ratings.
    scores = {}
    for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
        sim_user_row = user_item_matrix.getrow(sim_user_idx)
        for m_idx, rating in zip(sim_user_row.nonzero()[1], sim_user_row.data):
            if m_idx not in user_rated_movie_set:
                scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating
    # Sort candidate movies by aggregated score and take the top N.
    recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
    recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]
    end = time.time()
    # BUGFIX: .isin() returns rows in the movies table's own order, which
    # discarded the score ranking; re-sort titles by recommendation rank.
    rank = {mid: pos for pos, mid in enumerate(recommended_movie_ids)}
    recommendations = movies[movies['movieId'].isin(recommended_movie_ids)].copy()
    recommendations['rank'] = recommendations['movieId'].map(rank)
    recommendations = recommendations.sort_values('rank')[['title']]
    return recommendations, end - start
# 1. Works in latent space -> captures hidden user preferences.
# 2. Uses top similar users -> collaborative filtering logic.
# 3. Avoids recommending already rated movies.
# 4. Weighted aggregation ensures more similar users influence recommendations more.
# 5. Returns top N movies along with computation time.
# ====================== Streamlit UI ===========================================
# st.dataframe(ratings.head(1000))
# st.text('unique')
# st.text(ratings['user_idx'].nunique())
# st.text(ratings['userId'].nunique())
# NOTE(review): despite the label, this input takes the internal dense
# user_idx (0..num_users-1), not the raw userId — consider relabelling.
user_id_input = st.number_input(
    "Enter User ID",
    min_value=int(ratings['user_idx'].min()),
    max_value=int(ratings['user_idx'].max()),
    step=1
)
# Map the dense index back to the original userId. user_idx codes are
# contiguous, so every in-range value has at least one matching row.
user_id_input = ratings[ratings['user_idx'] == user_id_input]['userId'].unique()[0]
# st.dataframe(ratings)
# st.text(user_id_input)
if st.button("πŸŽ₯ Recommend Movies"):
    col1, col2 = st.columns(2)
    with col1:
        # Left column: everything this user has already rated.
        st.text('User Activity')
        current_user = ratings[ratings['userId']==user_id_input]
        merge_results = pd.merge(current_user,movies, how='left', on = 'movieId')
        merge_results = merge_results[['title','rating']]
        st.dataframe(merge_results)
    with col2:
        # Right column: top-5 recommendations and how long they took.
        st.text('Recommended Movies')
        recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
        st.dataframe(recs)
        st.text(f"Computed in {duration:.2f} seconds.")