import os
import sys
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import normalize

st.set_page_config(layout="wide")
st.title("🎬 Movie Recommender System (User-Based)")

# Make sibling modules importable and resolve data paths relative to this script.
sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'
cache_dir = data_dir / 'cache'
cache_dir.mkdir(exist_ok=True)

movies_path = 'movies_final.csv'  # expected alongside the script, unlike ratings.csv
ratings_path = data_dir / 'ratings.csv'

@st.cache_data
def load_data():
    """Load movies and ratings, dropping duplicate (userId, movieId) pairs.

    Ratings are taken from st.session_state['ratings_df'] when a previous page
    has stored them there; otherwise we fall back to the CSV on disk. Note that
    @st.cache_data memoizes the first result, so later session-state changes
    are not picked up.
    """
    movies = pd.read_csv(movies_path)
    if 'ratings_df' in st.session_state:
        ratings = st.session_state['ratings_df']
    else:
        ratings = pd.read_csv(ratings_path)

    ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
    ratings['userId'] = ratings['userId'].astype(int)
    ratings['movieId'] = ratings['movieId'].astype(int)
    return movies, ratings

movies, ratings = load_data()

# Keep only ratings for movies that exist in the movies table.
ratings = ratings[ratings['movieId'].isin(movies['movieId'])]

st.text(f"Initial ratings shape: {ratings.shape}")

# Step 1: keep only users with at least 450 ratings.
user_counts = ratings['userId'].value_counts()
active_users = user_counts[user_counts >= 450].index
ratings_filtered = ratings[ratings['userId'].isin(active_users)]

# Step 2: keep only movies with at least 450 ratings among those users.
movie_counts = ratings_filtered['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 450].index
ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(popular_movies)]
ratings = ratings_filtered.copy()
del ratings_filtered

st.text(f"Filtered ratings shape: {ratings.shape}")

st.markdown("#### 🎯 Dataset Filtering Summary")
st.markdown("""
We filtered the dataset in **two steps** to focus on active users and popular movies:

1. **Active Users**
   - Kept users who have rated **at least 450 movies**.
   - Ensures we focus on users who are actively engaged.

2. **Popular Movies**
   - Kept movies that have **at least 450 ratings** (counted after the user filter).
   - Ensures we focus on movies with enough feedback to be meaningful.

**Result:**
The final dataset contains ratings from active users on popular movies, which improves recommendation quality and shrinks the dataset.
""")

# Thousands-separated count for display, e.g. 1234567 -> "1,234,567".
formatted_num = "{:,}".format(ratings.shape[0])

st.markdown(f"**📊 Ratings after filtering:** {formatted_num} ratings from {ratings['userId'].nunique()} users on {ratings['movieId'].nunique()} movies.")

# Map raw ids to contiguous 0-based indices for the sparse matrix.
ratings['user_idx'] = ratings['userId'].astype('category').cat.codes
ratings['movie_idx'] = ratings['movieId'].astype('category').cat.codes

num_users = ratings['user_idx'].nunique()
num_movies = ratings['movie_idx'].nunique()

# Reverse lookups: contiguous index -> original id (cat.codes enumerates the
# categories in sorted order, so enumerate() reproduces the same mapping).
user_idx_to_id = dict(enumerate(ratings['userId'].astype('category').cat.categories))
movie_idx_to_id = dict(enumerate(ratings['movieId'].astype('category').cat.categories))
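
# Round-trip example: if the filtered userIds are e.g. [3, 7, 42], cat.codes
# assigns user_idx [0, 1, 2] and user_idx_to_id recovers {0: 3, 1: 7, 2: 42},
# so matrix rows can always be mapped back to real users.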

# Build the sparse user-item ratings matrix (users as rows, movies as columns).
user_item_matrix = coo_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(num_users, num_movies)
).tocsr()
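
# coo_matrix builds from (data, (row, col)) triplets: a 4.0 rating by user_idx 2
# on movie_idx 5 becomes entry (2, 5) = 4.0. Converting to CSR afterwards makes
# the row slicing and getrow() calls used below efficient.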

SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"


def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
    """
    Memory-efficient low-rank factorization using IncrementalPCA.

    Fits and transforms the matrix in row batches, so it works even when a
    dense copy of the full matrix cannot fit in memory. Note that
    IncrementalPCA mean-centers the data, so this is PCA rather than a pure
    truncated SVD; only the user factors U are used downstream.
    """
    # Reuse a previously computed factorization if one is cached on disk.
    if SVD_CACHE_PATH.exists():
        U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
        return U, Sigma, VT

    st.text("⚙️ Computing Incremental SVD... (this may take several minutes)")
    ipca = IncrementalPCA(n_components=n_components)
    n_samples = matrix.shape[0]

    # First pass: fit the model batch by batch, densifying one slice at a time.
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        ipca.partial_fit(batch)
        st.text(f"Processed rows {start_idx} to {end_idx}")

    # Second pass: project each batch into the latent space.
    U = []
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        U.append(ipca.transform(batch))
    U = np.vstack(U)

    VT = ipca.components_
    Sigma = ipca.singular_values_  # kept for completeness; unused downstream

    joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
    st.text("✅ SVD computation cached successfully.")
    return U, Sigma, VT
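
# Caveat: the cache key is just the file path, so delete the cached .joblib file
# after changing n_components, batch_size, or the filtering thresholds above;
# otherwise stale factors are silently reloaded.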

U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
U_normalized = normalize(U)  # L2-normalize rows so dot products equal cosine similarities
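
# Because the rows of U_normalized have unit L2 norm, the plain dot product in
# recommend_for_user_svd below is exactly cosine similarity: for any users i, j,
#     np.dot(U_normalized[i], U_normalized[j])
# matches the cosine of U[i] and U[j] up to floating-point error, so no
# pairwise-similarity helper is needed.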

def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
    """Recommend unseen movies by aggregating the ratings of similar users."""
    start = time.time()
    if user_id not in ratings['userId'].values:
        return "User not found.", 0

    user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]

    # Cosine similarity between this user and every user (rows are L2-normalized).
    user_vector = U_normalized[user_idx]
    similarities = np.dot(U_normalized, user_vector)

    # Take the n most similar users, skipping position 0 (the user themself).
    similar_user_indices = similarities.argsort()[::-1][1:n_similar_users + 1]
    similar_scores = similarities[similar_user_indices]

    user_rated_movie_indices = user_item_matrix.getrow(user_idx).nonzero()[1]
    user_rated_movie_set = set(user_rated_movie_indices)

    # Score each unseen movie by the similarity-weighted ratings of the neighbors.
    scores = {}
    for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
        sim_user_ratings = user_item_matrix.getrow(sim_user_idx)
        movie_indices = sim_user_ratings.nonzero()[1]
        sim_ratings = sim_user_ratings.data
        for m_idx, rating in zip(movie_indices, sim_ratings):
            if m_idx not in user_rated_movie_set:
                scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating

    recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
    recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]

    end = time.time()
    recommendations = movies[movies['movieId'].isin(recommended_movie_ids)][['title']]
    return recommendations, end - start
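
# A quick way to exercise the recommender outside the UI (sample_user and the
# surrounding names are illustrative only; uncomment to try):
#     sample_user = ratings['userId'].iloc[0]
#     sample_recs, sample_secs = recommend_for_user_svd(sample_user, top_n=10)
#     print(sample_recs, f"({sample_secs:.2f}s)")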

user_id_input = st.number_input(
    "Enter User ID",
    min_value=int(ratings['user_idx'].min()),
    max_value=int(ratings['user_idx'].max()),
    step=1,
)

# The widget collects a contiguous user index; map it back to the original userId.
user_id_input = user_idx_to_id[user_id_input]

if st.button("🎥 Recommend Movies"):
    col1, col2 = st.columns(2)
    with col1:
        st.text('User Activity')
        # Show everything this user has already rated, with titles merged in.
        current_user = ratings[ratings['userId'] == user_id_input]
        merge_results = pd.merge(current_user, movies, how='left', on='movieId')
        merge_results = merge_results[['title', 'rating']]
        st.dataframe(merge_results)
    with col2:
        st.text('Recommended Movies')
        recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
        st.dataframe(recs)
        st.text(f"Computed in {duration:.2f} seconds.")