File size: 9,170 Bytes
c37645b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import pandas as pd
import numpy as np
import streamlit as st
from pathlib import Path
import sys
import os
import time
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import TruncatedSVD, IncrementalPCA
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")  # use the full browser width for the two-column UI below
st.title("🎬 Movie Recommender System User-Based")

# ====================== Paths ======================
# Make sibling modules next to this script importable, and anchor data paths
# at the script's own directory so the app works regardless of launch CWD.
sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'
cache_dir = data_dir / 'cache'
cache_dir.mkdir(exist_ok=True)  # `data/` itself must already exist (no parents=True)

# NOTE(review): movies_final.csv is resolved relative to the process CWD,
# unlike ratings.csv which is anchored at data_dir — confirm this is intended.
movies_path = 'movies_final.csv'
ratings_path = data_dir / 'ratings.csv'

# ====================== Load Data ======================

# used to **cache the output of a function that loads or processes data** so that it doesn’t get recomputed every time the app reruns. 
# This is especially useful when loading large datasets or performing expensive computations.Since our data is huge

@st.cache_data
def load_data(ratings_df=None):
    """Load the movies catalogue and a cleaned copy of the ratings table.

    Parameters
    ----------
    ratings_df : pd.DataFrame, optional
        Raw ratings with at least ``userId``, ``movieId``, ``rating`` columns.
        When omitted, falls back to ``st.session_state['ratings_df']`` — but
        note that with no argument the ``st.cache_data`` key never changes,
        so the first result would be served forever even if the session's
        ratings change. Callers should pass the frame explicitly (as done
        below) so the cache is keyed on its contents.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(movies, ratings)`` with duplicate (user, movie) pairs dropped and
        integer id columns.
    """
    movies = pd.read_csv(movies_path)
    ratings = st.session_state['ratings_df'] if ratings_df is None else ratings_df
    # One rating per (user, movie) pair; keep the first occurrence.
    ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
    ratings['userId'] = ratings['userId'].astype(int)
    ratings['movieId'] = ratings['movieId'].astype(int)
    return movies, ratings

# Pass the session frame explicitly so st.cache_data hashes it as the key.
movies, ratings = load_data(st.session_state['ratings_df'])

# Keep only ratings whose movie exists in the catalogue (isin accepts the
# Series directly; no need to materialise a list).
ratings = ratings[ratings['movieId'].isin(movies['movieId'])]

st.text(f"Initial ratings shape: {ratings.shape}")

# ====================== Data Filtering ======================
# Thresholds for the two-step densification (see the summary markdown below).
MIN_RATINGS_PER_USER = 450   # a user must have rated at least this many movies
MIN_RATINGS_PER_MOVIE = 450  # a movie must have received at least this many ratings

# Step 1: keep only "active" users.
user_counts = ratings['userId'].value_counts()
active_users = user_counts[user_counts >= MIN_RATINGS_PER_USER].index
ratings_filtered = ratings[ratings['userId'].isin(active_users)]

# Step 2: among those users' ratings, keep only "popular" movies.
movie_counts = ratings_filtered['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= MIN_RATINGS_PER_MOVIE].index
ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(popular_movies)]
ratings = ratings_filtered.copy()
del ratings_filtered  # release the intermediate frame promptly

st.text(f"Filtered ratings shape: {ratings.shape}")

st.markdown("#### 🎯 Dataset Filtering Summary")
st.markdown("""

We filtered the dataset in **two steps** to focus on active users and popular movies:



1. **Active Users**  

   - Kept users who have rated **at least 450 movies**.  

   - Ensures we focus on users who are actively engaged.



2. **Popular Movies**  

   - Kept movies that have **at least 450 ratings**.  

   - Ensures we focus on movies with enough feedback to be meaningful.



 **Result:**  

The final dataset contains ratings from active users on popular movies, improving the quality of recommendations and decreasing the size of the dataset.

""")

# Thousands-separated rating count for display (e.g. "1,234,567").
# (Removed a leftover debug `print` that wrote this to the server console.)
formatted_num = f"{ratings.shape[0]:,}"

# Optional: Show numbers dynamically
st.markdown(f"**πŸ“Š Ratings after filtering:** {formatted_num} ratings from {ratings['userId'].nunique()} users on {ratings['movieId'].nunique()} movies.")


# ====================== Index Mapping ======================
# Encode userId/movieId as dense 0-based integer codes for the sparse matrix.
# Build each categorical ONCE and reuse it for both the codes and the reverse
# lookup tables — the original recomputed `astype('category')` per lookup,
# which is wasteful and relies on the two computations ordering identically.
user_cat = ratings['userId'].astype('category')
movie_cat = ratings['movieId'].astype('category')
ratings['user_idx'] = user_cat.cat.codes
ratings['movie_idx'] = movie_cat.cat.codes

num_users = ratings['user_idx'].nunique()
num_movies = ratings['movie_idx'].nunique()

# Reverse maps: dense index -> original ID, e.g. {0: 100, 1: 101, 2: 105}
user_idx_to_id = dict(enumerate(user_cat.cat.categories))
movie_idx_to_id = dict(enumerate(movie_cat.cat.categories))

# ====================== Sparse Matrix ======================

# Rows    -> users
# Columns -> movies
# Values  -> ratings
# Most cells are empty because each user rates only a tiny fraction of movies,
# so a sparse representation stores only the observed ratings.

# Build the CSR matrix straight from (value, (row, col)) triplets; the
# coordinate-style constructor handles this directly, so no intermediate
# COO object is needed.
user_item_matrix = csr_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(num_users, num_movies),
)

# CSR supports fast row slicing, which the factorization and the
# neighbour-lookup code below both depend on. A dense matrix of this shape
# would store every empty cell too — sparse storage keeps only non-zeros.

# ====================== Incremental SVD Caching ======================
SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"

def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
    """Memory-efficient low-rank factorisation using IncrementalPCA.

    Fits the model in row batches so the dense form of `matrix` never has to
    exist in memory all at once, then transforms all rows to get the latent
    user factors.

    Parameters
    ----------
    matrix : csr_matrix
        User-item rating matrix (users x movies).
    n_components : int
        Number of latent dimensions.
    batch_size : int
        Rows densified per `partial_fit`/`transform` chunk.

    Returns
    -------
    (U, Sigma, VT)
        U: (n_users, n_components) latent user factors; VT: principal
        components. NOTE(review): IncrementalPCA mean-centers the data, so
        this is PCA rather than a true SVD, and Sigma is a placeholder of
        ones kept only for API symmetry — confirm downstream code never
        relies on real singular values (here only U is used).
    """
    if SVD_CACHE_PATH.exists():
        U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
        # Reuse the cache only if it matches the current matrix and
        # parameters; a stale cache (produced under different filtering
        # thresholds or n_components) would silently corrupt the row/column
        # indexing used by the recommender.
        if (U.shape == (matrix.shape[0], n_components)
                and VT.shape[1] == matrix.shape[1]):
            return U, Sigma, VT
        st.text("♻️ Cached SVD is stale (shape mismatch); recomputing...")

    st.text("βš™οΈ Computing Incremental SVD... (this may take several minutes)")
    ipca = IncrementalPCA(n_components=n_components)
    n_samples = matrix.shape[0]

    # Pass 1: fit the model chunk by chunk (only one chunk is dense at a time).
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        ipca.partial_fit(batch)
        st.text(f"Processed rows {start_idx} to {end_idx}")

    # Pass 2: project every row into the latent space.
    U = []
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        U.append(ipca.transform(batch))
    U = np.vstack(U)

    VT = ipca.components_
    Sigma = np.ones(ipca.n_components_)  # placeholder; see docstring

    joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
    st.text("βœ… SVD computation cached successfully.")
    return U, Sigma, VT

U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
# L2-normalise rows so that a plain dot product below equals cosine similarity.
U_normalized = normalize(U)

# ====================== Recommendation Function ======================
def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
    """Recommend movies for `user_id` via user-based CF in the latent space.

    Finds the `n_similar_users` nearest users by cosine similarity of their
    latent vectors, aggregates their ratings weighted by similarity, and
    returns the `top_n` unseen movies.

    Parameters
    ----------
    user_id : int
        Original userId (not the dense index).
    top_n : int
        Number of recommendations to return.
    n_similar_users : int
        Size of the neighbourhood used for aggregation.

    Returns
    -------
    (recommendations, seconds)
        DataFrame of movie titles ordered best-first, and elapsed time.
        Returns ("User not found.", 0) for an unknown user.
    """
    start = time.time()
    if user_id not in ratings['userId'].values:
        return "User not found.", 0

    user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]

    # Rows of U_normalized are unit-length, so a dot product is cosine similarity.
    user_vector = U_normalized[user_idx]
    similarities = np.dot(U_normalized, user_vector)

    # Top similar users, skipping rank 0 (the query user matches itself).
    similar_user_indices = similarities.argsort()[::-1][1:n_similar_users + 1]
    similar_scores = similarities[similar_user_indices]

    # Movies the target user already rated are excluded from candidates.
    user_rated_movie_set = set(user_item_matrix.getrow(user_idx).nonzero()[1])

    # Weighted aggregation: more similar neighbours contribute more.
    scores = {}
    for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
        sim_user_ratings = user_item_matrix.getrow(sim_user_idx)
        movie_indices = sim_user_ratings.nonzero()[1]
        sim_ratings = sim_user_ratings.data
        for m_idx, rating in zip(movie_indices, sim_ratings):
            if m_idx not in user_rated_movie_set:
                scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating

    # Best-scored candidates first, mapped back to original movie ids.
    recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
    recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]

    # BUG FIX: `isin` filtering returns rows in catalogue order, discarding
    # the computed ranking. Reorder so the highest-scored movie comes first.
    rank = {mid: pos for pos, mid in enumerate(recommended_movie_ids)}
    recommendations = movies[movies['movieId'].isin(recommended_movie_ids)].copy()
    recommendations = recommendations.sort_values(
        'movieId', key=lambda s: s.map(rank)
    )[['title']]

    end = time.time()
    return recommendations, end - start


# 1. Works in latent space β†’ captures hidden user preferences.
# 2. Uses top similar users β†’ collaborative filtering logic.
# 3. Avoids recommending already rated movies.
# 4. Weighted aggregation ensures more similar users influence recommendations more.
# 5. Returns top N movies along with computation time.

# ====================== Streamlit UI ===========================================

# st.dataframe(ratings.head(1000))
# st.text('unique')
# st.text(ratings['user_idx'].nunique())
# st.text(ratings['userId'].nunique())

# NOTE(review): the widget is labelled "Enter User ID" but its bounds come
# from the dense `user_idx` codes (0..num_users-1), so the typed value is
# really an index — consider relabelling or bounding by real userId values.
user_id_input = st.number_input(
    "Enter User ID",
    min_value=int(ratings['user_idx'].min()),
    max_value=int(ratings['user_idx'].max()),
    step=1
)


# Translate the dense index back to the original userId. Codes are the
# contiguous range 0..num_users-1, so every in-bounds value matches a row.
user_id_input = ratings[ratings['user_idx'] == user_id_input]['userId'].unique()[0]

# st.dataframe(ratings)

# st.text(user_id_input)


# Two-column layout: the user's rating history on the left, the model's
# suggestions (with timing) on the right.
if st.button("πŸŽ₯ Recommend Movies"):

    history_col, recs_col = st.columns(2)

    with history_col:
        st.text('User Activity')
        # Join the selected user's ratings with the catalogue to show titles.
        history = pd.merge(
            ratings[ratings['userId'] == user_id_input],
            movies,
            how='left',
            on='movieId',
        )[['title', 'rating']]
        st.dataframe(history)

    with recs_col:
        st.text('Recommended Movies')
        recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
        st.dataframe(recs)

    st.text(f"Computed in {duration:.2f} seconds.")