import pandas as pd
import numpy as np
import streamlit as st
from pathlib import Path
import sys
import os
import time
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import TruncatedSVD, IncrementalPCA
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import joblib
# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("π¬ Movie Recommender System User-Based")
# ====================== Paths ======================
sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'
cache_dir = data_dir / 'cache'
cache_dir.mkdir(exist_ok=True)
movies_path = 'movies_final.csv'
ratings_path = data_dir / 'ratings.csv'
# ====================== Load Data ======================
# @st.cache_data caches the return value of a data-loading/processing function
# so it doesn't get recomputed every time the app reruns. This is especially
# useful for large datasets or expensive computations; since our ratings data
# is huge, caching avoids reloading it on every interaction.
@st.cache_data
def load_data():
    movies = pd.read_csv(movies_path)
    ratings = st.session_state['ratings_df']
    # ratings = ratings.drop(columns=['timestamp'], errors='ignore')
    ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
    ratings['userId'] = ratings['userId'].astype(int)
    ratings['movieId'] = ratings['movieId'].astype(int)
    return movies, ratings
movies, ratings = load_data()
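# If the underlying CSVs change during development, the cached result can be
# dropped explicitly (illustrative; these are Streamlit's documented cache APIs):
# load_data.clear()      # clear just this function's cache
# st.cache_data.clear()  # or clear every @st.cache_data cache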
ratings = ratings[ratings['movieId'].isin(movies['movieId'].unique())]
st.text(f"Initial ratings shape: {ratings.shape}")
# ====================== Data Filtering ======================
user_counts = ratings['userId'].value_counts()
active_users = user_counts[user_counts >= 450].index
ratings_filtered = ratings[ratings['userId'].isin(active_users)]
movie_counts = ratings_filtered['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 450].index
ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(popular_movies)]
ratings = ratings_filtered.copy()
del ratings_filtered
st.text(f"Filtered ratings shape: {ratings.shape}")
st.markdown("#### π― Dataset Filtering Summary")
st.markdown("""
We filtered the dataset in **two steps** to focus on active users and popular movies:
1. **Active Users**
- Kept users who have rated **at least 450 movies**.
- Ensures we focus on users who are actively engaged.
2. **Popular Movies**
- Kept movies that have **at least 450 ratings**.
- Ensures we focus on movies with enough feedback to be meaningful.
**Result:**
The final dataset contains ratings from active users on popular movies, improving the quality of recommendations and decreasing the size of the dataset.
""")
formatted_num = "{:,}".format(ratings.shape[0])
# Show the post-filter numbers dynamically
st.markdown(f"**📊 Ratings after filtering:** {formatted_num} ratings from {ratings['userId'].nunique()} users on {ratings['movieId'].nunique()} movies.")
# ====================== Index Mapping ======================
# Assign each unique userId (and movieId) a contiguous integer code starting from 0.
ratings['user_idx'] = ratings['userId'].astype('category').cat.codes
ratings['movie_idx'] = ratings['movieId'].astype('category').cat.codes
num_users = ratings['user_idx'].nunique()
num_movies = ratings['movie_idx'].nunique()
# Map integer indices back to the original IDs,
# e.g. {0: 100, 1: 101, 2: 105}
user_idx_to_id = dict(enumerate(ratings['userId'].astype('category').cat.categories))
movie_idx_to_id = dict(enumerate(ratings['movieId'].astype('category').cat.categories))
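# Quick sanity check (illustrative): the code -> ID mapping built above should
# round-trip against the original columns.
_sample = ratings[['user_idx', 'userId']].head(100)
assert (_sample['user_idx'].map(user_idx_to_id) == _sample['userId']).all()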
# ====================== Sparse Matrix ======================
# Rows    → users
# Columns → movies
# Values  → ratings
# Most entries are 0 (or missing) because not all users rate all movies.
user_item_matrix = coo_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(num_users, num_movies)
).tocsr()
# CSR format allows fast row slicing and is efficient for matrix factorization
# and nearest-neighbor style lookups.
# The full matrix would have ~50 million cells, but maybe only ~1 million
# ratings exist. A dense matrix stores all 50 million entries → huge memory
# usage; a sparse matrix stores only the non-zero ratings → huge memory savings.
# There are different sparse formats, e.g., COO, CSR, CSC.
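# Illustrative footprint comparison for the matrix built above: a dense float64
# matrix would store every cell, while CSR stores only the non-zero ratings
# plus two index arrays.
_density = user_item_matrix.nnz / (num_users * num_movies)
_dense_mb = num_users * num_movies * 8 / 1e6  # 8 bytes per float64 cell
_sparse_mb = (user_item_matrix.data.nbytes
              + user_item_matrix.indices.nbytes
              + user_item_matrix.indptr.nbytes) / 1e6
st.text(f"Density: {_density:.2%} | dense ≈ {_dense_mb:,.0f} MB vs CSR ≈ {_sparse_mb:,.0f} MB")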
# ====================== Incremental SVD Caching ======================
SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"
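# Note: the cache file is keyed only by this path, so changing n_components or
# re-filtering the dataset will NOT trigger a recompute automatically. Delete
# the file to force a rebuild, e.g.:
# SVD_CACHE_PATH.unlink(missing_ok=True)  # Python 3.8+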
def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
    """
    Memory-efficient incremental SVD using IncrementalPCA.
    Works even when the dense form of the matrix cannot fit in memory,
    because only one batch of rows is densified at a time.
    """
    if SVD_CACHE_PATH.exists():
        # st.text("✅ Loading cached SVD decomposition...")
        U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
        return U, Sigma, VT
    st.text("⚙️ Computing Incremental SVD... (this may take several minutes)")
    ipca = IncrementalPCA(n_components=n_components)
    n_samples = matrix.shape[0]
    # Fit in chunks
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        ipca.partial_fit(batch)
        st.text(f"Processed rows {start_idx} to {end_idx}")
    # Transform the full data in the same chunked fashion
    U = []
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        U.append(ipca.transform(batch))
    U = np.vstack(U)
    VT = ipca.components_
    # Placeholder singular values: only U is used downstream (after row
    # normalization), so Sigma is kept for interface symmetry only.
    Sigma = np.ones(ipca.n_components_)
    joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
    st.text("✅ SVD computation cached successfully.")
    return U, Sigma, VT
U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
U_normalized = normalize(U)
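# Sanity check (illustrative): rows of U_normalized have unit L2 norm, so a
# plain dot product equals cosine similarity in the latent space. This is what
# the recommender below relies on.
assert np.isclose(U_normalized[0] @ U_normalized[1],
                  cosine_similarity(U[0:1], U[1:2])[0, 0])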
# ====================== Recommendation Function ======================
def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
    start = time.time()
    if user_id not in ratings['userId'].values:
        return "User not found.", 0
    user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]
    # Cosine similarity in latent space (rows of U_normalized are unit-length)
    user_vector = U_normalized[user_idx]
    similarities = np.dot(U_normalized, user_vector)
    # Select top similar users (position 0 is the user itself, so skip it)
    similar_user_indices = similarities.argsort()[::-1][1:n_similar_users+1]
    similar_scores = similarities[similar_user_indices]
    user_rated_movie_indices = user_item_matrix.getrow(user_idx).nonzero()[1]
    user_rated_movie_set = set(user_rated_movie_indices)
    # Aggregate ratings of similar users, weighted by similarity
    scores = {}
    for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
        sim_user_ratings = user_item_matrix.getrow(sim_user_idx)
        movie_indices = sim_user_ratings.nonzero()[1]
        sim_ratings = sim_user_ratings.data
        for m_idx, rating in zip(movie_indices, sim_ratings):
            if m_idx not in user_rated_movie_set:
                scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating
    # Sort by aggregated score and recommend the top N unseen movies
    recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
    recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]
    end = time.time()
    recommendations = movies[movies['movieId'].isin(recommended_movie_ids)][['title']]
    return recommendations, end - start
# 1. Works in latent space → captures hidden user preferences.
# 2. Uses top similar users → collaborative filtering logic.
# 3. Avoids recommending already rated movies.
# 4. Weighted aggregation ensures more similar users influence recommendations more.
# 5. Returns top N movies along with computation time.
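# Example usage outside the UI (hypothetical ID — any userId present in the
# filtered ratings works):
# recs, secs = recommend_for_user_svd(user_id=some_user_id, top_n=10, n_similar_users=50)
# print(recs, f"computed in {secs:.2f}s")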
# ====================== Streamlit UI ======================
user_idx_input = st.number_input(
    "Enter User Index",
    min_value=int(ratings['user_idx'].min()),
    max_value=int(ratings['user_idx'].max()),
    step=1
)
# The widget collects the contiguous 0-based user index (its bounds come from
# user_idx); map it back to the original userId for all downstream lookups.
user_id_input = ratings[ratings['user_idx'] == user_idx_input]['userId'].unique()[0]
if st.button("π₯ Recommend Movies"):
col1, col2 = st.columns(2)
with col1:
st.text('User Activity')
current_user = ratings[ratings['userId']==user_id_input]
merge_results = pd.merge(current_user,movies, how='left', on = 'movieId')
merge_results = merge_results[['title','rating']]
st.dataframe(merge_results)
with col2:
st.text('Recommended Movies')
recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
st.dataframe(recs)
st.text(f"Computed in {duration:.2f} seconds.") |