SnehaJais committed on
Commit c37645b · verified · 1 Parent(s): 4e35fb5

Initial Commit

.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/grouped_tags.csv filter=lfs diff=lfs merge=lfs -text
+ data/merger.csv filter=lfs diff=lfs merge=lfs -text
+ data/ratings.csv filter=lfs diff=lfs merge=lfs -text
+ data/tags.csv filter=lfs diff=lfs merge=lfs -text
+ movies_final.csv filter=lfs diff=lfs merge=lfs -text
about_me.py ADDED
@@ -0,0 +1,162 @@
1
+ import streamlit as st
2
+
3
+ #===============================
4
+ import pandas as pd
5
+ from pathlib import Path
6
+ import os
7
+ import sys
8
+
9
+ # Define the data directory outside the function
10
+ data_dir = Path(__file__).parent / 'data'
11
+
12
+ # 1. Use st.cache_data to run this function only once
13
+ @st.cache_data
14
+ def load_ratings_data():
15
+ chunks = []
16
+ # Using nullable integers ('Int32') is correct
17
+ dtype_ratings = {'userId': 'Int32', 'movieId': 'Int32', 'rating': 'float32'}
18
+
19
+ # Add a spinner to show the user the file is loading
20
+ with st.spinner('Loading ratings data (please bear with me)...'):
21
+ for chunk in pd.read_csv(
22
+ data_dir / 'ratings.csv',
23
+ dtype=dtype_ratings,
24
+ chunksize=1_000_000
25
+ ):
26
+ # Drop rows where 'userId' or 'movieId' is NaN
27
+ chunk = chunk.dropna(subset=['userId', 'movieId'])
28
+ chunks.append(chunk)
29
+
30
+ # Concatenate all chunks into the final dataframe
31
+ ratings = pd.concat(chunks, ignore_index=True)
32
+ return ratings
33
+
34
+ # 2. Call the function to load and cache the data.
35
+ # This DataFrame is now available globally or can be passed to functions.
36
+ ratings_df = load_ratings_data()
37
+ st.session_state['ratings_df'] = ratings_df # Use session_state to share it easily
38
+
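+ # Illustrative sketch of how another page can reuse the cached frame (guarding against the
+ # case where that page is opened before this one has run):
+ #
+ #   ratings = st.session_state.get('ratings_df')
+ #   if ratings is None:
+ #       st.warning("Open the About page first so the ratings data gets loaded.")
+ #       st.stop()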
39
+ #==============================
40
+
41
+
42
+
43
+ # Basic page styling
44
+ st.markdown(
45
+ """
46
+ <style>
47
+ .stApp {
48
+ background-color: white;
49
+ color: #116A91;
50
+ }
51
+ .title {
52
+ text-align: center;
53
+ color: #116A91;
54
+ font-size: 36px;
55
+ margin-bottom: 10px;
56
+ }
57
+ .subtitle {
58
+ text-align: center;
59
+ color: #444444;
60
+ font-size: 18px;
61
+ margin-bottom: 30px;
62
+ }
63
+ .section {
64
+ background-color: #f9f9f9;
65
+ padding: 20px;
66
+ margin-bottom: 20px;
67
+ border-radius: 6px;
68
+ }
69
+ .header {
70
+ color: #116A91;
71
+ font-size: 20px;
72
+ margin-bottom: 10px;
73
+ }
74
+ .content {
75
+ color: #333333;
76
+ font-size: 16px;
77
+ line-height: 1.6;
78
+ }
79
+ </style>
80
+ """, unsafe_allow_html=True
81
+ )
82
+
83
+ # Page Title and Subtitle
84
+ st.markdown("<div class='title'>Movie Recommender System</div>", unsafe_allow_html=True)
85
+ st.markdown("<div class='subtitle'>Discover movies you'll love using different recommendation techniques 🎬</div>", unsafe_allow_html=True)
86
+
87
+ # Creator Section
88
+ st.markdown("""
89
+ <div class='section'>
90
+ <div class='header'>👤 About the Creator</div>
91
+ <div class='content'>
92
+ <strong>SJ</strong> is the creator of this Movie Recommender System.
93
+ <br><br>
94
+ Currently working as a <strong>Product Manager</strong>, SJ specializes in leading cross-functional teams to build and launch user-centric digital products. With a strong foundation in both business strategy and data, SJ is passionate about solving real-world problems through thoughtful product design.
95
+ <br><br>
96
+ Prior to becoming a Product Manager, SJ held roles as:
97
+ <ul>
98
+ <li><strong>Data Analyst</strong> – Experienced in SQL, Excel, Tableau, Python, Consumer Bureau data, and Microfinance analytics</li>
99
+ <li><strong>Data Engineer</strong> – Worked with IBM DataStage, Informatica, Ataccama, and Collibra</li>
100
+ </ul>
101
+ This diverse background is what inspired SJ to build tools like this — combining the power of data with simple, intuitive products.
102
+ </div>
103
+ </div>
104
+ """, unsafe_allow_html=True)
105
+
106
+ # Purpose Section
107
+ st.markdown("""
108
+ <div class='section'>
109
+ <div class='header'>📌 Purpose of This App</div>
110
+ <div class='content'>
111
+ This Movie Recommender System helps users discover movies they might enjoy based on different recommendation techniques:
112
+ <ul>
113
+ <li>🔥 Popularity Based Filtering – Recommend trending movies</li>
114
+ <li>🎬 Content-Based Filtering – Recommend movies similar to your favorites</li>
115
+ <li>👥 Collaborative User-Based – Recommend movies liked by users similar to you</li>
116
+ <li>🛍️ Collaborative Item-Based – Recommend movies similar to ones you've rated highly</li>
117
+ </ul>
118
+ The goal is to provide a simple, interactive way to explore movie recommendations tailored to your preferences.
119
+ </div>
120
+ </div>
121
+ """, unsafe_allow_html=True)
122
+
123
+ # Technology Section
124
+ st.markdown("""
125
+ <div class='section'>
126
+ <div class='header'>🛠️ Technology Used</div>
127
+ <div class='content'>
128
+ The application is developed using <strong>Streamlit</strong>, a lightweight and efficient Python framework for building data apps.
129
+ <br><br>
130
+ Key Python libraries used include <strong>Pandas</strong>, <strong>NumPy</strong>, <strong>scikit-learn</strong>, and <strong>SciPy</strong> for data processing and recommendation algorithms.
131
+ </div>
132
+ </div>
133
+ """, unsafe_allow_html=True)
134
+
135
+ # Features Section
136
+ st.markdown("""
137
+ <div class='section'>
138
+ <div class='header'>🔍 What You Can Do with It</div>
139
+ <div class='content'>
140
+ With this app, users can:
141
+ <ul>
142
+ <li>Explore movie recommendations based on popularity, content similarity, or collaborative filtering</li>
143
+ <li>View detailed recommendations for a specific user</li>
144
+ <li>Filter movies based on ratings and user activity</li>
145
+ <li>Understand hidden patterns in user preferences using latent features</li>
146
+ </ul>
147
+ The system helps both casual viewers and movie enthusiasts find movies they are likely to enjoy.
148
+ </div>
149
+ </div>
150
+ """, unsafe_allow_html=True)
151
+
152
+ # Footer / Call-to-action
153
+ st.markdown("""
154
+ <div class='section'>
155
+ <div class='header'>🎯 Ready to Discover Movies?</div>
156
+ <div class='content'>
157
+ Choose a recommendation technique from the sidebar and start exploring movies tailored to your tastes.
158
+ <br><br>
159
+ Enjoy discovering new favorites!
160
+ </div>
161
+ </div>
162
+ """, unsafe_allow_html=True)
base.py ADDED
@@ -0,0 +1,603 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+ import sys
5
+ import os
6
+ import time
7
+ from pathlib import Path
8
+ import io
9
+ import nltk
10
+ import string
11
+ from nltk.corpus import stopwords
12
+ from nltk.tokenize import word_tokenize
13
+ from nltk.stem import WordNetLemmatizer
14
+
15
+ # nltk.download('punkt')
16
+ # nltk.download('stopwords')
17
+ # nltk.download('wordnet')
18
+
19
+ nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
20
+ os.makedirs(nltk_data_dir, exist_ok=True)
21
+ nltk.data.path.append(nltk_data_dir)
22
+
23
+ def download_nltk_resources():
24
+ resources = ['wordnet', 'stopwords', 'punkt']
25
+ for res in resources:
26
+ try:
27
+ nltk.data.find(res)
28
+ except LookupError:
29
+ try:
30
+ nltk.download(res, download_dir=nltk_data_dir)
31
+ except Exception as e:
32
+ st.error(f"Error downloading {res}: {e}")
+
+ # Fetch the resources up front so the tokenizer, stopwords and lemmatizer used below can find them
+ download_nltk_resources()
+
34
+ # ====================== Streamlit Setup ======================
35
+ st.set_page_config(layout="wide")
36
+ st.title("Data Preprocessing")
37
+ st.markdown('---')
38
+
39
+ # ====================== Importing the datasets ===========================
40
+ sys.path.append(os.path.dirname(__file__))
41
+ data_dir = Path(__file__).parent / 'data'
42
+
43
+ # links dataset
44
+ links = data_dir / 'links.csv'
45
+ links = pd.read_csv(links)
46
+
47
+ # tags dataset
48
+ tags = data_dir / 'tags.csv'
49
+ tags = pd.read_csv(tags)
50
+
51
+ # movies dataset
52
+ movies = data_dir / 'movies.csv'
53
+ movies = pd.read_csv(movies)
54
+
55
+ # ratings dataset
56
+ # chunks = []
57
+ # dtype_ratings = {'userId': 'Int32', 'movieId': 'Int32', 'rating': 'float32'}
58
+
59
+ # for chunk in pd.read_csv(data_dir / 'ratings.csv', dtype=dtype_ratings, chunksize=1_000_000):
60
+ # chunk = chunk.dropna(subset=['userId', 'movieId'])
61
+ # chunks.append(chunk)
62
+
63
+ # # Concatenate all chunks into the final dataframe
64
+ # ratings = pd.concat(chunks, ignore_index=True)
65
+
66
+
67
+ ratings = st.session_state['ratings_df']
68
+
69
+ # ratings = ratings[ratings['movieId'].isin(movies['movieId'])]
70
+ # ratings = ratings.drop(columns=['timestamp'], errors='ignore')
71
+ # loc = data_dir / 'ratings.csv'
72
+ # ratings.to_csv(loc,index=False)
73
+
74
+ #============================ Descriptions of the datasets ===========================
75
+ st.subheader("ℹ️ MovieLens Datasets")
76
+
77
+ st.markdown("""
78
+ The **MovieLens dataset** is a benchmark dataset widely used for building and testing movie recommendation systems.
79
+ Below is a brief overview of the four main data files included:
80
+ """)
81
+
82
+ # --- Row 1: Movies and Tags ---
83
+ row1_col1, row1_col2 = st.columns(2)
84
+
85
+ with row1_col1:
86
+ st.subheader("🎬 movies.csv")
87
+ st.markdown("""
88
+ Contains metadata about each movie.
89
+ **Columns:**
90
+ - 'movieId': Unique identifier for each movie
91
+ - 'title': Movie title (usually includes release year)
92
+ - 'genres': Pipe-separated list of genres (e.g., *Action|Comedy|Drama*)
93
+ """)
94
+
95
+ with row1_col2:
96
+ st.subheader("🏷️ tags.csv")
97
+ st.markdown("""
98
+ Contains user-generated movie tags.
99
+ **Columns:**
100
+ - 'userId': User who applied the tag
101
+ - 'movieId': Tagged movie
102
+ - 'tag': User-generated keyword or phrase
103
+ - 'timestamp': When the tag was added (UNIX time)
104
+ """)
105
+
106
+ # --- Row 2: Ratings and Links ---
107
+ row2_col1, row2_col2 = st.columns(2)
108
+
109
+ with row2_col1:
110
+ st.subheader("⭐ ratings.csv")
111
+ st.markdown("""
112
+ Contains user ratings for movies.
113
+ **Columns:**
114
+ - 'userId': Unique identifier for each user
115
+ - 'movieId': Rated movie
116
+ - 'rating': Rating given (typically from 0.5 to 5.0)
117
+ - 'timestamp': When the rating was recorded (UNIX time)
118
+ """)
119
+ st.markdown('*Please note that the "ratings" file is quite large, and I have already performed some preprocessing on it. In this section, I will only be demonstrating the steps involved in that preprocessing.*')
120
+
121
+ with row2_col2:
122
+ st.subheader("🔗 links.csv")
123
+ st.markdown("""
124
+ Maps MovieLens movies to external movie databases.
125
+ **Columns:**
126
+ - 'movieId': Unique identifier for each movie
127
+ - 'imdbId': IMDb movie ID (for cross-referencing)
128
+ - 'tmdbId': TMDb movie ID (for fetching posters or metadata)
129
+ """)
130
+
131
+ st.markdown('---')
132
+
133
+ # ========================= Displaying the dataset ===================================
134
+ st.subheader('📊 Sample Datasets Preview')
135
+ col1, col2 = st.columns(2)
136
+ with col1:
137
+ st.subheader('Movies dataset')
138
+ st.text(f"Shape : {movies.shape}")
139
+ st.dataframe(movies.head())
140
+ with col2:
141
+ st.subheader('Tags dataset')
142
+ st.text(f"Shape : {tags.shape}")
143
+ st.dataframe(tags.head())
144
+ col1, col2 = st.columns(2)
145
+ with col1:
146
+ st.subheader('Ratings dataset')
147
+ st.text(f"Shape : {ratings.shape}")
148
+ st.dataframe(ratings.head())
149
+ with col2:
150
+ st.subheader('Links dataset')
151
+ st.text(f"Shape : {links.shape}")
152
+ st.dataframe(links.head())
153
+
154
+ st.markdown('---')
155
+
156
+ # ====================================================== Preprocessing ================================================================
157
+
158
+ # =============================dropping not required columns=============================
159
+
160
+ st.subheader('📉 Dropping the columns that are not required')
161
+ st.text("Dropped 'timestamp' and 'tmdbId' columns from Ratings and Links datasets")
162
+
163
+ links = links.drop(columns = ['tmdbId'])
164
+
165
+
166
+ saved_ratings_movie_unique = ratings['movieId'].nunique()
167
+ saved_ratings_duplicates = ratings[ratings.duplicated()].shape[0]
168
+
169
+ st.text('After Removal')
170
+ col1, col2 = st.columns(2)
171
+ with col1:
172
+ st.subheader('Ratings dataset')
173
+ st.text(f"Shape : {ratings.shape}")
174
+ st.dataframe(ratings.head())
175
+ with col2:
176
+ st.subheader('Links dataset')
177
+ st.text(f"Shape : {links.shape}")
178
+ st.dataframe(links.head())
179
+ st.markdown('---')
180
+
181
+ # =============================Information about the dataset=============================
182
+
183
+ st.subheader('ℹ️ Dataset Information')
184
+ col1, col2 = st.columns(2)
185
+ with col1:
186
+ st.text('Movies')
187
+ movies_info = pd.DataFrame({
188
+ "Column Name": movies.columns,
189
+ "Non-Null Count": movies.notnull().sum().values,
190
+ "Data Type": movies.dtypes.values
191
+ })
192
+ st.dataframe(movies_info)
193
+
194
+ st.text('Ratings')
195
+ ratings_info = pd.DataFrame({
196
+ "Column Name": ratings.columns,
197
+ "Non-Null Count": ratings.notnull().sum().values,
198
+ "Data Type": ratings.dtypes.values
199
+ })
200
+ st.dataframe(ratings_info)
201
+
202
+ with col2:
203
+ st.text('Tags')
204
+ tags_info = pd.DataFrame({
205
+ "Column Name": tags.columns,
206
+ "Non-Null Count": tags.notnull().sum().values,
207
+ "Data Type": tags.dtypes.values
208
+ })
209
+ st.dataframe(tags_info)
210
+
211
+ st.text('Links')
212
+ links_info = pd.DataFrame({
213
+ "Column Name": links.columns,
214
+ "Non-Null Count": links.notnull().sum().values,
215
+ "Data Type": links.dtypes.values
216
+ })
217
+ st.dataframe(links_info)
218
+
219
+ st.markdown('---')
220
+
221
+ del(ratings)
222
+
223
+ # ============================= Checking whether duplicates exist =============================
224
+ st.subheader('🧐 Checking for Duplicates')
225
+ col1, col2, col3, col4 = st.columns(4)
226
+ with col1:
227
+ st.text('Movies')
228
+ st.text(f"# Duplicates found : {movies[movies.duplicated()].shape[0]}")
229
+ with col2:
230
+ st.text('Ratings')
231
+ st.text(f"# Duplicates found : {saved_ratings_duplicates}")
232
+ with col3:
233
+ st.text('Tags')
234
+ st.text(f"# Duplicates found : {tags[tags.duplicated()].shape[0]}")
235
+ st.table(tags[tags.duplicated()])
236
+ with col4:
237
+ st.text('Links')
238
+ st.text(f"# Duplicates found : {links[links.duplicated()].shape[0]}")
239
+
240
+ st.markdown("*Removing duplicate rows detected in the dataset.*")
241
+
242
+ st.markdown("---")
243
+
244
+ # ============================= UNique counts =============================
245
+
246
+ st.subheader('🔍 Checking for consistency across Dataframes')
247
+ st.text(f"Unique movies in Movies Dataset : {movies['movieId'].nunique()}")
248
+ st.text(f"Unique movies in Tags Dataset : {tags['movieId'].nunique()}")
249
+ st.text(f"Unique movies in Ratings Dataset : {saved_ratings_movie_unique}")
250
+ st.text(f"Unique movies in Links Dataset : {links['movieId'].nunique()}")
251
+ st.text(f"Unique imdbId in Links Dataset : {links['imdbId'].nunique()}")
252
+
253
+ st.markdown("*The Tags dataset has tags missing for most of the movies*")
254
+ st.markdown("---")
255
+
256
+
257
+
258
+ # ============================= Grouping the tags data ====================================
259
+
260
+ st.subheader('Merging Tags and Links Datasets')
261
+ st.markdown("""
262
+ - Grouping the Tags dataset based on 'movieId' and merging with the Movies dataset
263
+ - Merging the Links dataset with the Movies dataset based on 'movieId'
264
+ """)
265
+
266
+ # grouping tags data
267
+ grouped_tags = tags.groupby('movieId')['tag'].agg(list).reset_index()
268
+
269
+ del(tags)
270
+
271
+ # Merging the datasets
272
+ movies_tag = pd.merge(movies,grouped_tags, how = 'left', on= 'movieId')
273
+ movies_tag = pd.merge(movies_tag,links, how = 'left', on= 'movieId')
274
+
275
+ # movies_tag.to_csv('tags_check.csv')
276
+
277
+ st.dataframe(movies_tag.head())
278
+
279
+ del(links)
280
+ del(movies)
281
+
282
+ st.markdown('---')
283
+
284
+ # ============================= Extracting Year and Title =============================
285
+
286
+ # st.text('Extracting the Year from the Title of the movies')
287
+
288
+ import re
289
+ def extract_year_and_clean_title(title):
290
+ # Use regex to find all years in parentheses and extract the last one
291
+ years = re.findall(r'\((\d{4})\)', title)
292
+
293
+ if years:
294
+ # The last year is the one we want
295
+ year = int(years[-1])
296
+ # Remove the last year (and parentheses) from the title
297
+ cleaned_title = title.replace(f"({years[-1]})", "").strip()
298
+ # Optionally, remove any extra parentheses and clean up
299
+ cleaned_title = re.sub(r'\(.*\)', '', cleaned_title).strip()
300
+ return year, cleaned_title
301
+ return 0, title # Return 0 and the original title if no year is found
302
+
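+ # Illustrative behaviour of the function above (hypothetical titles):
+ #   extract_year_and_clean_title("Matrix, The (1999)")  -> (1999, "Matrix, The")
+ #   extract_year_and_clean_title("Some Title")          -> (0, "Some Title")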
303
+
304
+ # Apply the function to the 'title' column to create 'year' and 'cleaned_title' columns
305
+ movies_tag[['year', 'title']] = movies_tag['title'].apply(lambda x: pd.Series(extract_year_and_clean_title(x)))
306
+
307
+
308
+ # ============================= Removing the data after the parenthesis. trimming the title =============================
309
+
310
+ def bracket_title(title):
311
+ sp_title = title.split(' (')
312
+ return sp_title[0]
313
+
314
+ movies_tag['bracket_title'] = movies_tag['title'].apply(bracket_title)
315
+
316
+
317
+ def imp_title(title):
318
+ sp_title = title.split(',')
319
+ return len(sp_title)
320
+
321
+ movies_tag['imp_title'] = movies_tag['bracket_title'].apply(imp_title)
322
+
323
+
324
+
325
+ def rearrange_last_word(text):
326
+ # Split by comma, preserving the commas
327
+ parts = text.split(', ')
328
+
329
+ # If there is more than one part, rearrange
330
+ if len(parts) > 1:
331
+ last_word = parts[-1] # Get the last part after the last comma
332
+ remaining_parts = parts[:-1] # Get the rest of the parts
333
+ # Move the last part to the front and join them with commas, no extra comma before first part
334
+ rearranged_text = f'{last_word} ' + ', '.join(remaining_parts)
335
+ else:
336
+ rearranged_text = text # If there's only one part, return as is
337
+
338
+ return rearranged_text
339
+
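+ # Illustrative behaviour: MovieLens moves leading articles to the end of a title, and this
+ # puts them back in front, e.g. rearrange_last_word("Matrix, The") -> "The Matrix".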
340
+
341
+ movies_tag['clean_title'] = movies_tag['bracket_title'].apply(rearrange_last_word)
342
+
343
+
344
+ # st.text(movies_tag['year'].unique())
345
+
346
+ movies_tag['year'] = movies_tag['year'].astype(int)
347
+
348
+ movies_tag = movies_tag[movies_tag['imp_title'].isin([1,2])]
349
+
350
+ movies_tag['genres'] = movies_tag['genres'].str.replace('|', ',', regex=False)  # treat '|' literally, not as a regex
351
+ import ast
352
+ def to_list_conv(row):
353
+ return row.split(',')
354
+
355
+
356
+ movies_tag['genres'] = movies_tag['genres'].apply(to_list_conv)
357
+
358
+ st.subheader('✅ After cleaning & merging the datasets')
359
+ st.markdown("""
360
+ - Extracting the Year from the Title of the movies
361
+ - Converting the Year column to integer data type
362
+ - Replacing non-integer values in the Year column with 0
363
+ - Converting the Genres column to a list after replacing the '|' separator
364
+ """)
365
+
366
+ st.dataframe(movies_tag[['movieId','clean_title','year','genres','tag']].head(100))
367
+
368
+ st.markdown("""- 📌 Number of movies with missing tags: {0}""".format(movies_tag['tag'].isna().sum()))
369
+
370
+
371
+ st.markdown('---')
372
+ # ============================================ Plot Dataset extraction =========================================
373
+
374
+ st.subheader("🎬 Extracting Movie Details from IMDb")
375
+ st.text("Extracting the Plot, Year & Genres of the movies from the 'imdbinfo' package with the help of the 'imdbId' column in the Links dataset")
376
+
377
+ movies_merge = data_dir / 'merger.csv'
378
+ movies_merge = pd.read_csv(movies_merge)
379
+ st.dataframe(movies_merge[['title','year','plot','genres']])
380
+
381
+ st.text(movies_merge.shape)
382
+
383
+ # ============================================ Plot Dataset Check =========================================
384
+
385
+ st.markdown('- **Checking for NULL values in the Dataset - Match not found during the extraction process**')
386
+ col1,col2 = st.columns(2)
387
+ with col1:
388
+ st.text('# Empty fields')
389
+ st.dataframe(movies_merge[['title','year','plot','genres']].isna().sum())
390
+ with col2:
391
+ st.text('After removing the rows with empty title - no match found')
392
+ movies_merge = movies_merge.dropna(subset=['title'])
393
+ st.dataframe(movies_merge[['title','year','plot','genres']].isna().sum())
394
+
395
+
396
+
397
+ st.markdown("- **As with the Movies dataset, converting the Year column to integer and replacing NULLs with 0**")
398
+ # st.text(movies_merge['year'].unique())
399
+ movies_merge['year'] = movies_merge['year'].fillna(0)
400
+ movies_merge['year'] = movies_merge['year'].astype(int)
401
+
402
+ movies_merge['title'] = movies_merge['title'].str.strip()
403
+
404
+
405
+
406
+ # - Null IMDB Ids (No match found) : {movies_merge['imdbId'].isna().sum()}
407
+ st.markdown(f"""
408
+ - **Checking for duplicates in the extracted file keeping the NULLs aside**
409
+ - Duplicate Title : {movies_merge['title'].dropna().duplicated().sum()}
410
+ - Duplicate Title + Year : {movies_merge.dropna(subset=['title', 'year']).duplicated(['title', 'year']).sum()}
411
+ - Duplicate Title + Year + Plot : {movies_merge.dropna(subset=['title', 'year', 'plot']).duplicated(['title', 'year', 'plot']).sum()}
412
+ - Duplicate Plot : {movies_merge['plot'].dropna().duplicated().sum()}
413
+ """)
414
+
415
+
416
+ st.markdown('- **Checking why duplicates are present in the title column**')
417
+ st.dataframe(movies_merge[movies_merge[['title', 'year']].duplicated(keep=False)].sort_values('title').reset_index(drop=True))
418
+ st.text('Titles are duplicated because the year differs or because there are multiple descriptions for the plot field')
419
+
420
+ st.markdown("- **Removing duplicates based on the Title and Year columns**")
421
+
422
+
423
+ movies_merge = movies_merge.drop_duplicates(subset=['title', 'year'], keep='first')
424
+
425
+ st.markdown(f"""
426
+ - **Checking for duplicates in the extracted file keeping the NULLs aside**
427
+ - Duplicate Title : {movies_merge['title'].dropna().duplicated().sum()}
428
+ - Duplicate Title + Year : {movies_merge.dropna(subset=['title', 'year']).duplicated(['title', 'year']).sum()}
429
+ - Duplicate Title + Year + Plot : {movies_merge.dropna(subset=['title', 'year', 'plot']).duplicated(['title', 'year', 'plot']).sum()}
430
+ - Duplicate Plot : {movies_merge['plot'].dropna().duplicated().sum()}
431
+ """)
432
+
433
+ st.markdown('- **Checking why duplicates are present in the plot column, apart from NULL values**')
434
+ st.dataframe(movies_merge[movies_merge['plot'].notna() & movies_merge['plot'].duplicated(keep=False)].sort_values('plot').reset_index(drop=True))
435
+
436
+
437
+ movies_merge = movies_merge[~movies_merge['plot'].duplicated(keep=False)]
438
+ st.markdown("- **Removing duplicates based on the Plot column, as we don't know which movie the plot belongs to**")
439
+ # movies_merge.drop_duplicates(subset='plot', keep='first', inplace=True)
440
+
441
+
442
+ st.markdown(f"""
443
+ - **Checking for duplicates in the extracted file keeping the NULLs aside**
444
+ - Duplicate Title : {movies_merge['title'].dropna().duplicated().sum()}
445
+ - Duplicate Title + Year : {movies_merge.dropna(subset=['title', 'year']).duplicated(['title', 'year']).sum()}
446
+ - Duplicate Title + Year + Plot : {movies_merge.dropna(subset=['title', 'year', 'plot']).duplicated(['title', 'year', 'plot']).sum()}
447
+ - Duplicate Plot : {movies_merge['plot'].dropna().duplicated().sum()}
448
+ """)
449
+
450
+
451
+
452
+
453
+
454
+ st.markdown('---')
455
+ # ========================================= merging with the main dataset =================================
456
+
457
+ st.subheader('🧠 Merging the Dataset for the Plot and Genres columns')
458
+ st.text('Joined on Title and Year column')
459
+
460
+ # movies_tag.to_csv('chek_movies_tag.csv')
461
+ # movies_merge.to_csv('chek_movies_merge.csv')
462
+
463
+ movies_tag = pd.merge(movies_tag,movies_merge, how = 'left', left_on=['clean_title', 'year'], right_on=['title', 'year'])
464
+ movies_tag = movies_tag.reset_index(drop=True)
465
+
466
+ # movies_tag.to_csv('chek_movies_tag_after_join.csv')
467
+
468
+ # ========================================= Converting to meaningfull tags =================================
469
+
470
+ st.subheader("Cleaning and Merging Movie Tags & Genres")
471
+ st.markdown("""
472
+ We are performing the following preprocessing steps on the dataset:
473
+
474
+ - **Standardizing genres and tags**
475
+ - Convert all entries in 'genres_x', 'genres_y', and 'tag' columns to lowercase.
476
+ - Ensure only valid string items are kept and handle empty or missing lists.
477
+ """)
478
+
479
+ movies_tag['genres_x'] = movies_tag['genres_x'].apply(
480
+ lambda x: [item.lower() for item in x if isinstance(item, str)] if isinstance(x, list) and x else []
481
+ )
482
+ movies_tag['tag'] = movies_tag['tag'].apply(
483
+ lambda x: [item.lower() for item in x if isinstance(item, str)] if isinstance(x, list) and x else []
484
+ )
485
+
486
+ st.markdown("""
487
+ - **Converting string representations to lists for 'genres_y'**
488
+ - Some entries might be stored as strings, so we safely convert them to Python lists.
489
+ - Lowercase all valid genre entries.
490
+ """)
491
+ def to_list_conv(x):
492
+ if pd.isna(x):
493
+ return []
494
+ try:
495
+ return ast.literal_eval(x)
496
+ except (ValueError, SyntaxError):
497
+ return []
498
+
499
+ movies_tag['genres_y'] = movies_tag['genres_y'].apply(to_list_conv)
500
+ movies_tag['genres_y'] = movies_tag['genres_y'].apply(
501
+ lambda x: [item.lower() for item in x if isinstance(item, str)] if isinstance(x, list) and x else []
502
+ )
503
+
504
+ st.markdown("""
505
+ - **Combining all genres into a single column**
506
+ - Merge 'genres_x' and 'genres_y' into a unified 'genres' column.
507
+ - Remove duplicate genres per movie.
508
+ - Drop the original 'genres_x' and 'genres_y' columns.
509
+ """)
510
+ movies_tag['genres'] = movies_tag.apply(lambda row: list(set(row['genres_x'] + row['genres_y'])), axis=1)
511
+ movies_tag['tag'] = movies_tag.apply(lambda row: list(set(row['tag'])), axis=1)
512
+ movies_tag = movies_tag.drop(columns=['genres_x','genres_y'])
513
+
514
+ st.subheader("Preprocessing Movie Plots")
515
+ st.markdown("""
516
+ - Clean the 'plot' text by:
517
+ 1. Lowercasing all words
518
+ 2. Removing punctuation
519
+ 3. Tokenizing into words
520
+ 4. Removing stopwords
521
+ 5. Lemmatizing words
522
+ """)
523
+ lemmatizer = WordNetLemmatizer()
524
+ def preprocess_sentence(sentence):
525
+ if not isinstance(sentence, str):
526
+ return []
527
+ stop_words = set(stopwords.words('english'))
528
+ sentence = re.sub(r'[^\w\s]', '', sentence)
529
+ sentence = sentence.lower()
530
+ tokens = word_tokenize(sentence)
531
+ tokens = [word for word in tokens if word not in stop_words]
532
+ lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
533
+ return lemmatized_tokens
534
+
535
+ movies_tag['plot'] = movies_tag['plot'].apply(preprocess_sentence)
536
+
537
+ st.subheader("Cleaning Columns and Removing Duplicates")
538
+ st.markdown("""
539
+ - Drop unnecessary columns from the merged dataset.
540
+ - Rename 'clean_title' to 'title'.
541
+ - Reorder columns for clarity: 'movieId', 'title', 'year', 'tag', 'genres', 'plot'.
542
+ """)
543
+ movies_tag = movies_tag.drop(columns=['title_x', 'imdbId', 'title_y','bracket_title','imp_title'])
544
+ movies_tag.rename(columns = {'clean_title':'title'}, inplace=True)
545
+ # movies_tag.drop_duplicates(subset='title', keep='first', inplace=True)
546
+
547
+ # movies_tag = movies_tag.sort_values(by=['title', 'year'], ascending=[True, False])
548
+ # movies_tag = movies_tag.drop_duplicates(subset='title', keep='first')
549
+
550
+ movies_tag = movies_tag[['movieId','title','year','tag','genres','plot']]
551
+
552
+ st.markdown('---')
553
+
554
+ # ============================== Displaying final movies dataset =====================================
555
+ item_to_remove = '(no genres listed)'
556
+ movies_tag['genres'] = movies_tag['genres'].apply(lambda x: [item for item in x if item != item_to_remove])
557
+
558
+
559
+ st.subheader("🎬 Movies Dataset After Merging and Cleaning")
560
+ st.dataframe(movies_tag)
561
+
562
+ st.markdown(f"**Total unique movies after filtering:** {movies_tag['movieId'].nunique()}")
563
+
564
+
565
+ # ================================================ removing duplicates =============================================================
566
+
567
+
568
+ def check_empty_lists(row):
569
+ return isinstance(row['tag'], list) and len(row['tag']) == 0 and \
570
+ isinstance(row['genres'], list) and len(row['genres']) == 0 and \
571
+ isinstance(row['plot'], list) and len(row['plot']) == 0
572
+
573
+ # Step 1: Count rows where all three columns ('tag', 'genres', 'plot') are empty lists
574
+ empty_rows_count = movies_tag[movies_tag.apply(check_empty_lists, axis=1)].shape[0]
575
+ st.markdown(f"**Number of rows where 'tag', 'genres', and 'plot' are empty lists:** {empty_rows_count}")
576
+
577
+ # Step 2: Drop rows where all three columns are empty lists
578
+ movies_tag_cleaned = movies_tag[~movies_tag.apply(check_empty_lists, axis=1)]
579
+
580
+
581
+ #================ extracting the ratings dataset=============================
582
+
583
+ movies_tag_cleaned = movies_tag_cleaned.drop_duplicates(subset=['title', 'year'], keep='first')
584
+
585
+ # ratings = ratings[ratings['movieId'].isin(movies_tag_cleaned['movieId'])]
586
+ # ratings = ratings.drop_duplicates()
587
+ # ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='last')
588
+
589
+ # ratings.to_csv(loc, index=False)
590
+
591
+ # Correct way to save without creating Unnamed columns
592
+
593
+ # del(ratings)
594
+
595
+ st.subheader('Dropping the rows with empty Tag, Genres and Plot')
596
+ # movies_tag_cleaned.to_csv('movies_final.csv', index=False)
597
+
598
+ st.markdown(f"**Total unique movies after filtering:** {movies_tag_cleaned['movieId'].nunique()}")
599
+
600
+ st.markdown('---')
601
+
602
+ st.subheader("🎬 Final Movies Dataset")
603
+ st.dataframe(movies_tag_cleaned)
collaborative_item.py ADDED
@@ -0,0 +1,211 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+ from pathlib import Path
5
+ import sys
6
+ import os
7
+ import time
8
+ from scipy.sparse import coo_matrix, csr_matrix
9
+ from sklearn.decomposition import IncrementalPCA
10
+ from sklearn.preprocessing import normalize
11
+ import joblib
12
+
13
+ # ====================== Streamlit Setup ======================
14
+ st.set_page_config(layout="wide")
15
+ st.title("🎬 Movie Recommender System Item-Based")
16
+
17
+ # ====================== Paths ======================
18
+ sys.path.append(os.path.dirname(__file__))
19
+ data_dir = Path(__file__).parent / 'data'
20
+ cache_dir = data_dir / 'cache'
21
+ cache_dir.mkdir(exist_ok=True)
22
+
23
+ movies_path = data_dir / 'movies.csv'
24
+ ratings_path = data_dir / 'ratings.csv'
25
+
26
+ # ====================== Load Data ======================
27
+ @st.cache_data
28
+ def load_data():
29
+ movies = pd.read_csv(movies_path)
30
+ ratings = st.session_state['ratings_df']
31
+ ratings = ratings.drop(columns=['timestamp'], errors='ignore')
32
+ ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
33
+ ratings['userId'] = ratings['userId'].astype(int)
34
+ ratings['movieId'] = ratings['movieId'].astype(int)
35
+ # Keep only movies present in movies.csv
36
+ ratings = ratings[ratings['movieId'].isin(movies['movieId'].unique().tolist())]
37
+ return movies, ratings
38
+
39
+ movies, ratings = load_data()
40
+ st.text(f"Initial ratings shape: {ratings.shape}")
41
+
42
+ # ====================== Data Filtering ======================
43
+
44
+ user_counts = ratings['userId'].value_counts()
45
+ active_users = user_counts[user_counts >= 450].index
46
+ ratings_filtered = ratings[ratings['userId'].isin(active_users)]
47
+
48
+ movie_counts = ratings_filtered['movieId'].value_counts()
49
+ popular_movies = movie_counts[movie_counts >= 450].index
50
+ ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(popular_movies)]
51
+ ratings = ratings_filtered.copy()
52
+ del ratings_filtered
53
+
54
+ st.text(f"Filtered ratings shape: {ratings.shape}")
55
+
56
+ #===================summary=================================
57
+ st.markdown("#### 🎯 Dataset Filtering Summary")
58
+ st.markdown("""
59
+ We filtered the dataset in **two steps** to focus on active users and popular movies:
60
+
61
+ 1. **Active Users**
62
+ - Kept users who have rated **at least 450 movies**.
63
+ - Ensures we focus on users who are actively engaged.
64
+
65
+ 2. **Popular Movies**
66
+ - Kept movies that have **at least 450 ratings**.
67
+ - Ensures we focus on movies with enough feedback to be meaningful.
68
+
69
+ **Result:**
70
+ The final dataset contains ratings from active users on popular movies, improving the quality of recommendations and decreasing the size of the dataset.
71
+ """)
72
+
73
+
74
+ # ====================== Index Mapping ======================
75
+ ratings['user_idx'] = ratings['userId'].astype('category').cat.codes
76
+ ratings['movie_idx'] = ratings['movieId'].astype('category').cat.codes
77
+
78
+ num_users = ratings['user_idx'].nunique()
79
+ num_movies = ratings['movie_idx'].nunique()
80
+
81
+ user_idx_to_id = dict(enumerate(ratings['userId'].astype('category').cat.categories))
82
+ movie_idx_to_id = dict(enumerate(ratings['movieId'].astype('category').cat.categories))
83
+
84
+ # ====================== Sparse Matrix ======================
85
+ user_item_matrix = coo_matrix(
86
+ (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
87
+ shape=(num_users, num_movies)
88
+ ).tocsr()
89
+
90
+ # ====================== Incremental SVD Caching ======================
91
+
92
+ SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"
93
+
94
+ def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
95
+ if SVD_CACHE_PATH.exists():
96
+ # st.text("✅ Loading cached SVD decomposition...")
97
+ U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
98
+ return U, Sigma, VT
99
+
100
+ st.text("⚙️ Computing Incremental SVD... (may take a few minutes)")
101
+ ipca = IncrementalPCA(n_components=n_components)
102
+ n_samples = matrix.shape[0]
103
+
104
+ # Fit in chunks
105
+ for start_idx in range(0, n_samples, batch_size):
106
+ end_idx = min(start_idx + batch_size, n_samples)
107
+ batch = matrix[start_idx:end_idx].toarray()
108
+ ipca.partial_fit(batch)
109
+ st.text(f"Processed rows {start_idx} to {end_idx}")
110
+
111
+ # Transform full data
112
+ U = []
113
+ for start_idx in range(0, n_samples, batch_size):
114
+ end_idx = min(start_idx + batch_size, n_samples)
115
+ batch = matrix[start_idx:end_idx].toarray()
116
+ U.append(ipca.transform(batch))
117
+ U = np.vstack(U)
118
+
119
+ VT = ipca.components_
120
+ Sigma = np.ones(ipca.n_components_)
121
+
122
+ joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
123
+ st.text("✅ SVD computation cached successfully.")
124
+ return U, Sigma, VT
125
+
126
+ U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
127
+ U_normalized = normalize(U)
128
+
129
+ # ====================== Recommendation Function ======================
130
+ def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
131
+ start = time.time()
132
+ if user_id not in ratings['userId'].values:
133
+ return "User not found.", 0
134
+
135
+ user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]
136
+
137
+ # Cosine similarity in latent space
138
+ user_vector = U_normalized[user_idx]
139
+ similarities = np.dot(U_normalized, user_vector)
140
+
141
+ # Top similar users
142
+ similar_user_indices = similarities.argsort()[::-1][1:n_similar_users+1]
143
+ similar_scores = similarities[similar_user_indices]
144
+
145
+ # Movies already rated
146
+ user_rated_movie_indices = user_item_matrix.getrow(user_idx).nonzero()[1]
147
+ user_rated_movie_set = set(user_rated_movie_indices)
148
+
149
+ # Aggregate weighted ratings
150
+ scores = {}
151
+ for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
152
+ sim_user_ratings = user_item_matrix.getrow(sim_user_idx)
153
+ movie_indices = sim_user_ratings.nonzero()[1]
154
+ sim_ratings = sim_user_ratings.data
155
+ for m_idx, rating in zip(movie_indices, sim_ratings):
156
+ if m_idx not in user_rated_movie_set:
157
+ scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating
158
+
159
+ # Sort and recommend
160
+ recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
161
+ recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]
162
+
163
+ end = time.time()
164
+ recommendations = movies[movies['movieId'].isin(recommended_movie_ids)][['title','genres']]
165
+ return recommendations, end - start
166
+
167
+ # ====================== Streamlit UI ======================
168
+
169
+ user_id_input = st.number_input(
170
+ "Enter User ID",
171
+ min_value=int(ratings['user_idx'].min()),
172
+ max_value=int(ratings['user_idx'].max()),
173
+ step=1
174
+ )
175
+
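+ # The number input above collects the internal user index (0 .. num_users-1); the line
+ # below maps it back to the original MovieLens userId before it is used for lookups.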
176
+ user_id_input = ratings[ratings['user_idx'] == user_id_input]['userId'].unique()[0]
177
+
178
+
179
+ duration = 0
180
+ if st.button("🎥 Recommend Movies"):
181
+ col1, col2 = st.columns(2)
182
+ with col1:
183
+ st.text('User Activity')
184
+ current_user = ratings[ratings['userId']==user_id_input]
185
+ merge_results = pd.merge(current_user,movies, how='left', on = 'movieId')
186
+ merge_results = merge_results[['title','rating']]
187
+ st.dataframe(merge_results)
188
+ with col2:
189
+ st.text('Recommended Movies')
190
+ recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
191
+ st.dataframe(recs['title'])
192
+ st.text(f"Computed in {duration:.2f} seconds.")
193
+
194
+
195
+ # ====================== Explanation of CF ======================
196
+ # st.markdown("""
197
+ # ## 🤝 Collaborative Filtering Types
198
+
199
+ # **User-Based CF**:
200
+ # - Focus on users similar to you
201
+ # - Recommend movies liked by similar users
202
+
203
+ # **Item-Based CF**:
204
+ # - Focus on movies similar to ones you liked
205
+ # - Recommend similar movies
206
+
207
+ # Example:
208
+ # - You liked *Inception* and *Matrix*
209
+ # - User-Based: "People like you also liked *Avengers*"
210
+ # - Item-Based: "Since you liked *Inception*, try *Interstellar*"
211
+ # """)
collaborative_user.py ADDED
@@ -0,0 +1,235 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+ from pathlib import Path
5
+ import sys
6
+ import os
7
+ import time
8
+ from scipy.sparse import coo_matrix, csr_matrix
9
+ from sklearn.decomposition import TruncatedSVD, IncrementalPCA
10
+ from sklearn.preprocessing import normalize
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+ import joblib
13
+
14
+ # ====================== Streamlit Setup ======================
15
+ st.set_page_config(layout="wide")
16
+ st.title("🎬 Movie Recommender System User-Based")
17
+
18
+ # ====================== Paths ======================
19
+ sys.path.append(os.path.dirname(__file__))
20
+ data_dir = Path(__file__).parent / 'data'
21
+ cache_dir = data_dir / 'cache'
22
+ cache_dir.mkdir(exist_ok=True)
23
+
24
+ movies_path = 'movies_final.csv'
25
+ ratings_path = data_dir / 'ratings.csv'
26
+
27
+ # ====================== Load Data ======================
28
+
29
+ # st.cache_data caches the output of a function that loads or processes data so that it
+ # doesn't get recomputed every time the app reruns. This is especially useful when loading
+ # large datasets or performing expensive computations, since our data is huge.
31
+
32
+ @st.cache_data
33
+ def load_data():
34
+ movies = pd.read_csv(movies_path)
35
+ ratings = st.session_state['ratings_df']
36
+ # ratings = ratings.drop(columns=['timestamp'], errors='ignore')
37
+ ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
38
+ ratings['userId'] = ratings['userId'].astype(int)
39
+ ratings['movieId'] = ratings['movieId'].astype(int)
40
+ return movies, ratings
41
+
42
+ movies, ratings = load_data()
43
+
44
+ ratings = ratings[ratings['movieId'].isin(movies['movieId'].unique().tolist())]
45
+
46
+ st.text(f"Initial ratings shape: {ratings.shape}")
47
+
48
+ # ====================== Data Filtering ======================
49
+ user_counts = ratings['userId'].value_counts()
50
+ active_users = user_counts[user_counts >= 450].index
51
+ ratings_filtered = ratings[ratings['userId'].isin(active_users)]
52
+
53
+ movie_counts = ratings_filtered['movieId'].value_counts()
54
+ popular_movies = movie_counts[movie_counts >= 450].index
55
+ ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(popular_movies)]
56
+ ratings = ratings_filtered.copy()
57
+ del ratings_filtered
58
+
59
+ st.text(f"Filtered ratings shape: {ratings.shape}")
60
+
61
+ st.markdown("#### 🎯 Dataset Filtering Summary")
62
+ st.markdown("""
63
+ We filtered the dataset in **two steps** to focus on active users and popular movies:
64
+
65
+ 1. **Active Users**
66
+ - Kept users who have rated **at least 450 movies**.
67
+ - Ensures we focus on users who are actively engaged.
68
+
69
+ 2. **Popular Movies**
70
+ - Kept movies that have **at least 450 ratings**.
71
+ - Ensures we focus on movies with enough feedback to be meaningful.
72
+
73
+ **Result:**
74
+ The final dataset contains ratings from active users on popular movies, improving the quality of recommendations and decreasing the size of the dataset.
75
+ """)
76
+
77
+ formatted_num = "{:,}".format(ratings.shape[0])
78
+ print(formatted_num)
79
+
80
+ # Optional: Show numbers dynamically
81
+ st.markdown(f"**📊 Ratings after filtering:** {formatted_num} ratings from {ratings['userId'].nunique()} users on {ratings['movieId'].nunique()} movies.")
82
+
83
+
84
+ # ====================== Index Mapping ======================
85
+ # For each unique userId, assigns a unique integer code starting from 0.
86
+ ratings['user_idx'] = ratings['userId'].astype('category').cat.codes
87
+ ratings['movie_idx'] = ratings['movieId'].astype('category').cat.codes
88
+
89
+ num_users = ratings['user_idx'].nunique()
90
+ num_movies = ratings['movie_idx'].nunique()
91
+
92
+ # Map indices back to original IDs
93
+ # {0: 100, 1: 101, 2: 105}
94
+ user_idx_to_id = dict(enumerate(ratings['userId'].astype('category').cat.categories))
95
+ movie_idx_to_id = dict(enumerate(ratings['movieId'].astype('category').cat.categories))
96
+
97
+ # ====================== Sparse Matrix ======================
98
+
99
+ # Rows → users
100
+ # Columns → movies
101
+ # Values → ratings
102
+ # Most entries are **0 (or missing)** because not all users rate all movies.
103
+
104
+ user_item_matrix = coo_matrix(
105
+ (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
106
+ shape=(num_users, num_movies)
107
+ ).tocsr()
108
+
109
+ # CSR format allows fast operations for matrix factorization or nearest-neighbor search
110
+ # The full matrix would have 50 million cells. But maybe only 1 million ratings exist.
111
+ # Dense matrix: stores all 50 million entries → huge memory usage. Sparse matrix: stores only the non-zero ratings → huge memory savings.
112
+ # There are different sparse formats, e.g., COO, CSR, CSC.
113
+
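+ # Toy illustration of the COO -> CSR construction used above, with 3 users, 3 movies and only
+ # four observed ratings stored (hypothetical values, shown here as a commented sketch):
+ #
+ #   import numpy as np
+ #   from scipy.sparse import coo_matrix
+ #   rows = np.array([0, 0, 1, 2])            # user indices
+ #   cols = np.array([0, 2, 1, 2])            # movie indices
+ #   vals = np.array([5.0, 3.0, 4.0, 2.0])    # ratings
+ #   toy = coo_matrix((vals, (rows, cols)), shape=(3, 3)).tocsr()
+ #   toy.toarray()
+ #   # -> [[5., 0., 3.],
+ #   #     [0., 4., 0.],
+ #   #     [0., 0., 2.]]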
114
+ # ====================== Incremental SVD Caching ======================
115
+ SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"
116
+
117
+ def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
118
+ """
119
+ Memory-efficient incremental SVD using IncrementalPCA.
120
+ Works even when matrix cannot fit entirely in memory.
121
+ """
122
+ if SVD_CACHE_PATH.exists():
123
+ # st.text("✅ Loading cached SVD decomposition...")
124
+ U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
125
+ return U, Sigma, VT
126
+
127
+ st.text("⚙️ Computing Incremental SVD... (this may take several minutes)")
128
+ ipca = IncrementalPCA(n_components=n_components)
129
+ n_samples = matrix.shape[0]
130
+
131
+ # Fit in chunks
132
+ for start_idx in range(0, n_samples, batch_size):
133
+ end_idx = min(start_idx + batch_size, n_samples)
134
+ batch = matrix[start_idx:end_idx].toarray()
135
+ ipca.partial_fit(batch)
136
+ st.text(f"Processed rows {start_idx} to {end_idx}")
137
+
138
+ # Transform full data
139
+ U = []
140
+ for start_idx in range(0, n_samples, batch_size):
141
+ end_idx = min(start_idx + batch_size, n_samples)
142
+ batch = matrix[start_idx:end_idx].toarray()
143
+ U.append(ipca.transform(batch))
144
+ U = np.vstack(U)
145
+
146
+ VT = ipca.components_
147
+ Sigma = np.ones(ipca.n_components_)
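+ # Sigma here is only a placeholder of ones: the recommendation step further down uses just
+ # the normalized user factors U for cosine similarity, so the true singular values are not needed.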
148
+
149
+ joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
150
+ st.text("✅ SVD computation cached successfully.")
151
+ return U, Sigma, VT
152
+
153
+ U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
154
+ U_normalized = normalize(U)
155
+
156
+ # ====================== Recommendation Function ======================
157
+ def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
158
+ start = time.time()
159
+ if user_id not in ratings['userId'].values:
160
+ return "User not found.", 0
161
+
162
+ user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]
163
+
164
+ # Cosine similarity in latent space
165
+ user_vector = U_normalized[user_idx]
166
+ similarities = np.dot(U_normalized, user_vector)
167
+
168
+ # Select top similar users
169
+ similar_user_indices = similarities.argsort()[::-1][1:n_similar_users+1]
170
+ similar_scores = similarities[similar_user_indices]
171
+
172
+ user_rated_movie_indices = user_item_matrix.getrow(user_idx).nonzero()[1]
173
+ user_rated_movie_set = set(user_rated_movie_indices)
174
+
175
+ # Aggregate weighted ratings
176
+ scores = {}
177
+ for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
178
+ sim_user_ratings = user_item_matrix.getrow(sim_user_idx)
179
+ movie_indices = sim_user_ratings.nonzero()[1]
180
+ sim_ratings = sim_user_ratings.data
181
+ for m_idx, rating in zip(movie_indices, sim_ratings):
182
+ if m_idx not in user_rated_movie_set:
183
+ scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating
184
+
185
+ # Sort and recommend
186
+ recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
187
+ recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]
188
+
189
+ end = time.time()
190
+ recommendations = movies[movies['movieId'].isin(recommended_movie_ids)][['title']]
191
+ return recommendations, end - start
192
+
193
+
194
+ # 1. Works in latent space → captures hidden user preferences.
195
+ # 2. Uses top similar users → collaborative filtering logic.
196
+ # 3. Avoids recommending already rated movies.
197
+ # 4. Weighted aggregation ensures more similar users influence recommendations more.
198
+ # 5. Returns top N movies along with computation time.
199
+
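+ # Worked example (illustrative numbers): if two similar users have similarities 0.9 and 0.8,
+ # and they rated an unseen movie 5.0 and 4.0 respectively, that movie's aggregate score is
+ # 0.9*5.0 + 0.8*4.0 = 7.7; candidates are then ranked by this score.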
200
+ # ====================== Streamlit UI ===========================================
201
+
202
+ # st.dataframe(ratings.head(1000))
203
+ # st.text('unique')
204
+ # st.text(ratings['user_idx'].nunique())
205
+ # st.text(ratings['userId'].nunique())
206
+
207
+ user_id_input = st.number_input(
208
+ "Enter User ID",
209
+ min_value=int(ratings['user_idx'].min()),
210
+ max_value=int(ratings['user_idx'].max()),
211
+ step=1
212
+ )
213
+
214
+
215
+ user_id_input = ratings[ratings['user_idx'] == user_id_input]['userId'].unique()[0]
216
+
217
+ # st.dataframe(ratings)
218
+
219
+ # st.text(user_id_input)
220
+
221
+
222
+ if st.button("🎥 Recommend Movies"):
223
+
224
+ col1, col2 = st.columns(2)
225
+ with col1:
226
+ st.text('User Activity')
227
+ current_user = ratings[ratings['userId']==user_id_input]
228
+ merge_results = pd.merge(current_user,movies, how='left', on = 'movieId')
229
+ merge_results = merge_results[['title','rating']]
230
+ st.dataframe(merge_results)
231
+ with col2:
232
+ st.text('Recommended Movies')
233
+ recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
234
+ st.dataframe(recs)
235
+ st.text(f"Computed in {duration:.2f} seconds.")
content_based.py ADDED
@@ -0,0 +1,157 @@
1
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
2
+ from sklearn.metrics.pairwise import cosine_similarity
3
+ import pandas as pd
4
+ import numpy as np
5
+ import streamlit as st
6
+ from gensim.models import Word2Vec
7
+
8
+ # ====================== Streamlit Setup ======================
9
+ st.set_page_config(layout="wide")
10
+ st.title("Content Based Filtering")
11
+ st.markdown('---')
12
+
13
+ # ======================== Load Data ==========================
14
+ movies = pd.read_csv('movies_final.csv')
15
+ movies = movies.drop(columns=['movieId'])
16
+
17
+ # ======================== Helper Function ====================
18
+ def combine_tokens(row, cols):
19
+ combined = []
20
+ for col in cols:
21
+ val = row[col]
22
+ if isinstance(val, list):
23
+ combined.extend(val)
24
+ elif isinstance(val, str):
25
+ combined.append(val)
26
+ return list(set(combined))
27
+
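+ # Note: the list-valued columns ('genres', 'tag', 'plot') typically come back from
+ # movies_final.csv as strings such as "['action', 'comedy']", so the isinstance(val, list)
+ # branch above rarely fires. A hypothetical variant that parses such strings first could be:
+ #
+ #   import ast
+ #   def parse_list(val):
+ #       try:
+ #           parsed = ast.literal_eval(val) if isinstance(val, str) else val
+ #       except (ValueError, SyntaxError):
+ #           return [val]
+ #       return parsed if isinstance(parsed, list) else [parsed]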
28
+ # ======================== BOW ================================
29
+ def recommend_bow(title, year, top_n=5):
30
+ if title not in movies['title'].values:
31
+ return "Movie not found."
32
+
33
+ # Get index of the queried movie
34
+ idx = movies[(movies['title'] == title) & (movies['year'] == year)].index[0]
35
+
36
+ # Compute cosine similarity scores
37
+ query_vector = bow_matrix[idx]
38
+ sim_scores = cosine_similarity(query_vector, bow_matrix).flatten()
39
+
40
+ # Get indices of similar movies (excluding the query itself)
41
+ similar_indices = sim_scores.argsort()[::-1][1:top_n + 1]
42
+
43
+ # Create a DataFrame for top recommendations with similarity scores
+     recommendations = pd.DataFrame({
+         'title': movies.iloc[similar_indices]['title'].values,
+         'year': movies.iloc[similar_indices]['year'].values,
+         'similarity_score': sim_scores[similar_indices]
+     })
+
+     # Sort for clarity (highest similarity first)
+     recommendations = recommendations.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)
+
+     return recommendations
+
+
+ # ======================== TF-IDF ==============================
+ def recommend_tfidf(title, year, top_n=5):
+     if title not in movies['title'].values:
+         return "Movie not found."
+
+     idx = movies[(movies['title'] == title) & (movies['year'] == year)].index[0]
+     query_vector = tfidf_matrix[idx]
+     sim_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
+
+     # Get the indices of similar movies (excluding the movie itself)
+     similar_indices = sim_scores.argsort()[::-1][1:]
+
+     # Ensure you don't get more than `top_n` recommendations
+     similar_indices = similar_indices[:top_n]
+
+     # Get the movie titles
+     top_movies = movies.iloc[similar_indices]['title'].tolist()
+
+     return top_movies
+
+ # ======================== WORD2VEC ============================
+ def build_word2vec_vectors():
+     tokenized_corpus = [text.split() for text in movies['tokens']]
+
+     # Train Word2Vec model
+     w2v_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)
+
+     # Function to compute average vector for each movie
+     def get_avg_vector(tokens):
+         vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
+         if len(vectors) == 0:
+             return np.zeros(w2v_model.vector_size)
+         return np.mean(vectors, axis=0)
+
+     # Compute average vector for each movie
+     movies['w2v_vector'] = [get_avg_vector(text.split()) for text in movies['tokens']]
+     w2v_matrix = np.vstack(movies['w2v_vector'].values)
+     return w2v_model, w2v_matrix
+
+ def recommend_w2v(title, year, w2v_matrix, top_n=5):
+     if title not in movies['title'].values:
+         return "Movie not found."
+
+     idx = movies[(movies['title'] == title) & (movies['year'] == year)].index[0]
+     query_vec = w2v_matrix[idx].reshape(1, -1)
+     sims = cosine_similarity(query_vec, w2v_matrix).flatten()
+
+     # Get the indices of similar movies (excluding the movie itself)
+     similar_indices = sims.argsort()[::-1][1:]
+
+     # Ensure you don't get more than `top_n` recommendations
+     similar_indices = similar_indices[:top_n]
+
+     # Get the movie titles
+     top_movies = movies.iloc[similar_indices]['title'].tolist()
+
+     return top_movies
+
+ # ======================== UI ========================
+ st.header('🎬 Choose an Algorithm and Movie for Content-Based Filtering')
+
+ select_algo = st.selectbox('Select the algorithm', ['Bag of Words', 'TF-IDF', 'Word2Vec'])
+ selected_movie = st.selectbox('Select a movie', movies['title'].unique())
+ selected_year = st.selectbox('Select the year', movies[movies['title'] == selected_movie]['year'])
+
+ all_columns = movies.columns.tolist()
+ selected_cols = st.multiselect("Select columns to combine for tokens", all_columns, default=['genres', 'tag', 'plot'])
+
+ # Combine selected columns into tokens
+ movies['tokens'] = movies.apply(lambda row: combine_tokens(row, selected_cols), axis=1)
+ movies['tokens'] = movies['tokens'].apply(lambda x: ' '.join(x))
+
+ if st.button('Recommend'):
+     # Display the selected movie and its columns
+     st.dataframe(movies[(movies['title'] == selected_movie) & (movies['year'] == selected_year)][['title'] + selected_cols])
+
+     # Recommendation based on the selected algorithm
+     if select_algo == 'Bag of Words':
+         with st.spinner("Building Bag of Words model..."):
+             vectorizer = CountVectorizer()
+             bow_matrix = vectorizer.fit_transform(movies['tokens'])
+             output = recommend_bow(selected_movie, selected_year, top_n=5)
+         st.success("Bag of Words model ready ✅")
+
+         output_display = pd.merge(output, movies[['title', 'year'] + selected_cols], on=['title', 'year'], how='left').reset_index(drop=True)
+         st.subheader("🎯 Top Recommendations (Bag of Words)")
+         st.dataframe(output_display[['title', 'year', 'similarity_score'] + selected_cols])
+
+     elif select_algo == 'TF-IDF':
+         with st.spinner("Building TF-IDF model..."):
+             vectorizer = TfidfVectorizer()
+             tfidf_matrix = vectorizer.fit_transform(movies['tokens'])
+             output = recommend_tfidf(selected_movie, selected_year, top_n=5)
+         st.success("TF-IDF model ready ✅")
+         st.dataframe(movies[movies['title'].isin(output)][['title'] + selected_cols].reset_index(drop=True))
+
+     elif select_algo == 'Word2Vec':
+         with st.spinner("Training Word2Vec model..."):
+             w2v_model, w2v_matrix = build_word2vec_vectors()
+             output = recommend_w2v(selected_movie, selected_year, w2v_matrix, top_n=5)
+         st.success("Word2Vec model ready ✅")
+         st.dataframe(movies[movies['title'].isin(output)][['title'] + selected_cols].reset_index(drop=True))
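
Note on the content-based page above: the CountVectorizer/TfidfVectorizer fit and the Word2Vec training run from scratch on every click of the Recommend button. A possible refinement (not part of this commit) is to cache the fitted matrix per token configuration with Streamlit's caching API, so it is only rebuilt when the selected columns change; the helper name build_tfidf_matrix below is illustrative, not from the repo, and the same pattern would apply to the CountVectorizer and the Word2Vec training.

    import streamlit as st
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Hypothetical helper: fit once per unique token configuration and reuse
    # the matrix across reruns instead of refitting on every button click.
    @st.cache_resource
    def build_tfidf_matrix(token_texts: tuple):
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(list(token_texts))
        return vectorizer, matrix

    # Usage sketch: vectorizer, tfidf_matrix = build_tfidf_matrix(tuple(movies['tokens']))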
data/cache/svd_incremental_cache.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58176c8e3545d4ade2701a7fdeca9d1495b864fa2a77bbc5db4272dfacb01eda
+ size 13812355
data/grouped_tags.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9269b5d520d4063ae57c3027ce0402dc755927ac11db46ea835c2321f6f071e
+ size 31065818
data/links.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/merger.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d566674daf655758a3c7d01e1ec31ff6a63064d8f1b6fa67f693e2badd8347e
+ size 21817265
data/movies.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/ratings.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e0ab9d46f7ba3cdb7a00602f608491bcae6d479f1c5649d291d07eccb0289870
+ size 529039466
data/tags.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e4b043d9088ce7bdad9f4ba06901330f3c4adec6e448da83d9a5d32fc756d55
+ size 25847889
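
The CSV entries above (and movies_final.csv further down) are stored with Git LFS, so the diff shows only the three-line pointer stanza (version / oid / size); the actual data blobs live in LFS storage and are typically fetched with `git lfs pull` after cloning the Space.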
item_based.py ADDED
@@ -0,0 +1,49 @@
+ from sentence_transformers import SentenceTransformer
+ import sys
+ import os
+ import pandas as pd
+ from pathlib import Path
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+ import streamlit as st
+ import time
+
+
+ # sys.path.append(os.path.dirname(__file__))
+ # data_dir = Path(__file__).parent.parent / 'data'
+
+ # # links dataset
+ # movies = data_dir / 'movies_final.csv'
+
+
+ movies = pd.read_csv('movies_final.csv')
+ st.text(movies.shape)
+ st.dataframe(movies.head())
+
+ start = time.time()
+
+ model = SentenceTransformer('all-MiniLM-L6-v2')
+ movies['bert_vector'] = model.encode(movies['tokens'].tolist(), show_progress_bar=True).tolist()
+
+ def recommend_bert(title, top_n=5):
+     if title not in movies['title'].values:
+         return "Movie not found."
+
+     idx = movies[movies['title'] == title].index[0]
+     query_vector = np.array(movies.loc[idx, 'bert_vector']).reshape(1, -1)
+     all_vectors = np.vstack(movies['bert_vector'].values)
+
+     similarities = cosine_similarity(query_vector, all_vectors)[0]
+     sim_scores = list(enumerate(similarities))
+     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
+
+     # Skip the first result (it's the movie itself)
+     recommended = [movies.iloc[i]['title'] for i, score in sim_scores[1:top_n+1]]
+     return recommended
+
+ out = recommend_bert('Schindler\'s List')
+ st.text(out)
+
+ end = time.time()
+
+ st.text(end-start)
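
item_based.py above loads the SentenceTransformer and re-encodes every movie at import time, which repeats on each Streamlit rerun (the script even times the cost with time.time()). A hedged sketch of how the model and the embedding matrix could be cached between reruns; load_encoder and encode_movies are illustrative names, not functions from the repo.

    import numpy as np
    import streamlit as st
    from sentence_transformers import SentenceTransformer

    @st.cache_resource  # keep one model instance per server process
    def load_encoder(name: str = 'all-MiniLM-L6-v2') -> SentenceTransformer:
        return SentenceTransformer(name)

    @st.cache_data  # recompute embeddings only when the token texts change
    def encode_movies(token_texts: tuple) -> np.ndarray:
        return load_encoder().encode(list(token_texts), show_progress_bar=False)

    # Usage sketch: bert_matrix = encode_movies(tuple(movies['tokens']))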
main.py ADDED
@@ -0,0 +1,16 @@
+ import streamlit as st
+
+ # Define pages (by file or function)
+ page1 = st.Page("about_me.py", title="About Me", icon="👤")
+ page2 = st.Page("base.py", title="Home Page", icon="🏠")
+ page3 = st.Page("popularity.py", title="Popularity Based Filtering", icon="🔥")
+ page4 = st.Page("content_based.py", title="Content Based Filtering", icon="🎬")
+ page5 = st.Page("collaborative_user.py", title="Collaborative User based", icon="👥")
+ page6 = st.Page("collaborative_item.py", title="Collaborative Item based", icon="🛍️")
+
+
+ # Create navigation
+ pg = st.navigation([page1, page2, page3, page4, page5, page6])
+
+ # Run navigation
+ pg.run()
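
One thing worth flagging about the navigation setup: Streamlit has traditionally required st.set_page_config to be the first Streamlit command executed in a run, and in this commit it is called inside popularity.py (below) rather than in the entrypoint. A possible arrangement, assuming the wide layout should apply to every page, is to move it into main.py; the page_title value here is illustrative.

    import streamlit as st

    # Configure the app once in the entrypoint, before any page script runs.
    st.set_page_config(page_title="Movie Recommender System", layout="wide")

    pg = st.navigation([
        st.Page("about_me.py", title="About Me", icon="👤"),
        st.Page("popularity.py", title="Popularity Based Filtering", icon="🔥"),
        # ... remaining pages as defined in main.py above ...
    ])
    pg.run()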
movies_final.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:77a03ffbfcc887eef7e34b750ec446233b09b2e70471eae138bbb094c0c09cda
+ size 24885151
nltk_data/corpora/stopwords.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:174365e185ab7b343a4ca41ac9d1b44f03c928a226eb720afb027882865e07c0
+ size 823
nltk_data/corpora/stopwords/english ADDED
@@ -0,0 +1,198 @@
+ a
+ about
+ above
+ after
+ again
+ against
+ ain
+ all
+ am
+ an
+ and
+ any
+ are
+ aren
+ aren't
+ as
+ at
+ be
+ because
+ been
+ before
+ being
+ below
+ between
+ both
+ but
+ by
+ can
+ couldn
+ couldn't
+ d
+ did
+ didn
+ didn't
+ do
+ does
+ doesn
+ doesn't
+ doing
+ don
+ don't
+ down
+ during
+ each
+ few
+ for
+ from
+ further
+ had
+ hadn
+ hadn't
+ has
+ hasn
+ hasn't
+ have
+ haven
+ haven't
+ having
+ he
+ he'd
+ he'll
+ her
+ here
+ hers
+ herself
+ he's
+ him
+ himself
+ his
+ how
+ i
+ i'd
+ if
+ i'll
+ i'm
+ in
+ into
+ is
+ isn
+ isn't
+ it
+ it'd
+ it'll
+ it's
+ its
+ itself
+ i've
+ just
+ ll
+ m
+ ma
+ me
+ mightn
+ mightn't
+ more
+ most
+ mustn
+ mustn't
+ my
+ myself
+ needn
+ needn't
+ no
+ nor
+ not
+ now
+ o
+ of
+ off
+ on
+ once
+ only
+ or
+ other
+ our
+ ours
+ ourselves
+ out
+ over
+ own
+ re
+ s
+ same
+ shan
+ shan't
+ she
+ she'd
+ she'll
+ she's
+ should
+ shouldn
+ shouldn't
+ should've
+ so
+ some
+ such
+ t
+ than
+ that
+ that'll
+ the
+ their
+ theirs
+ them
+ themselves
+ then
+ there
+ these
+ they
+ they'd
+ they'll
+ they're
+ they've
+ this
+ those
+ through
+ to
+ too
+ under
+ until
+ up
+ ve
+ very
+ was
+ wasn
+ wasn't
+ we
+ we'd
+ we'll
+ we're
+ were
+ weren
+ weren't
+ we've
+ what
+ when
+ where
+ which
+ while
+ who
+ whom
+ why
+ will
+ with
+ won
+ won't
+ wouldn
+ wouldn't
+ y
+ you
+ you'd
+ you'll
+ your
+ you're
+ yours
+ yourself
+ yourselves
+ you've
nltk_data/corpora/wordnet.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cbda5ea6eef7f36a97a43d4a75f85e07fccbb4f23657d27b4ccbc93e2646ab59
+ size 10775600
nltk_data/tokenizers/punkt.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec
+ size 13905355
nltk_data/tokenizers/punkt/.DS_Store ADDED
Binary file (6.15 kB). View file
 
nltk_data/tokenizers/punkt/PY3/README ADDED
@@ -0,0 +1,98 @@
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+ been contributed by various people using NLTK for sentence boundary detection.
+
+ For information about how to use these models, please confer the tokenization HOWTO:
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+ and chapter 3.8 of the NLTK book:
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+ There are pretrained tokenizers for the following languages:
+
+ File Language Source Contents Size of training corpus(in tokens) Model contributed by
+ =======================================================================================================
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
+ Literarni Noviny
+ -------------------------------------------------------------------------------------------------------
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
+ -------------------------------------------------------------------------------------------------------
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
+ -------------------------------------------------------------------------------------------------------
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
+ (American)
+ -------------------------------------------------------------------------------------------------------
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
+ -------------------------------------------------------------------------------------------------------
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
+ Text Bank (Suomen Kielen newspapers
+ Tekstipankki)
+ Finnish Center for IT Science
+ (CSC)
+ -------------------------------------------------------------------------------------------------------
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
+ (European)
+ -------------------------------------------------------------------------------------------------------
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
+ (Switzerland) CD-ROM
+ (Uses "ss"
+ instead of "ß")
+ -------------------------------------------------------------------------------------------------------
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
+ -------------------------------------------------------------------------------------------------------
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
+ -------------------------------------------------------------------------------------------------------
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
+ (Bokmål and Information Technologies,
+ Nynorsk) Bergen
+ -------------------------------------------------------------------------------------------------------
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
+ (http://www.nkjp.pl/)
+ -------------------------------------------------------------------------------------------------------
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
+ (Brazilian) (Linguateca)
+ -------------------------------------------------------------------------------------------------------
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
+ Slovene Academy for Arts
+ and Sciences
+ -------------------------------------------------------------------------------------------------------
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
+ (European)
+ -------------------------------------------------------------------------------------------------------
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
+ (and some other texts)
+ -------------------------------------------------------------------------------------------------------
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
+ (Türkçe Derlem Projesi)
+ University of Ankara
+ -------------------------------------------------------------------------------------------------------
+
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+ Unicode using the codecs module.
+
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+ Computational Linguistics 32: 485-525.
+
+ ---- Training Code ----
+
+ # import punkt
+ import nltk.tokenize.punkt
+
+ # Make a new Tokenizer
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+ # Read in training corpus (one example: Slovene)
+ import codecs
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+ # Train tokenizer
+ tokenizer.train(text)
+
+ # Dump pickled tokenizer
+ import pickle
+ out = open("slovene.pickle","wb")
+ pickle.dump(tokenizer, out)
+ out.close()
+
+ ---------
nltk_data/tokenizers/punkt/PY3/english.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
+ size 406697
nltk_data/tokenizers/punkt/README ADDED
@@ -0,0 +1,98 @@
(content identical to nltk_data/tokenizers/punkt/PY3/README above; duplicate text omitted here)
nltk_data/tokenizers/punkt/english.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa
+ size 433305
popularity.py ADDED
@@ -0,0 +1,129 @@
+ import pandas as pd
+ import numpy as np
+ import streamlit as st
+ from pathlib import Path
+ import sys
+ import os
+ import time
+ import matplotlib.pyplot as plt
+
+
+ # ====================== Streamlit Setup ======================
+ st.set_page_config(layout="wide")
+ st.title("Popularity Based Filtering")
+ st.markdown('---')
+
+ # ====================== Data Extraction ======================
+
+ sys.path.append(os.path.dirname(__file__))
+ data_dir = Path(__file__).parent / 'data'
+
+ movies = 'movies_final.csv'
+ movies = pd.read_csv(movies)
+ movies = movies[['movieId','title']]
+
+ # ratings dataset
+ ratings = data_dir / 'ratings.csv'
+ ratings = st.session_state['ratings_df']
+ # ratings = ratings.drop(columns=['timestamp'])
+ ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
+
+
+ st.subheader('👀 Glance of the dataset')
+ col1, col2 = st.columns(2)
+ with col1:
+     st.markdown("Ratings Dataset")
+     st.dataframe(ratings.head())
+ with col2:
+     st.markdown("Movies Dataset")
+     st.dataframe(movies.head())
+ st.markdown('---')
+
+ # ============================= Most Rated Movies ====================================================
+ col1, col2 = st.columns(2)
+
+ most_rated = ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
+ most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
+ most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
+ movie_ids = most_rated_final['movieId'].tolist()
+
+ avg_ratings = ratings[ratings['movieId'].isin(movie_ids)]
+ avg_ratings = avg_ratings.groupby('movieId')['rating'].mean().reset_index().rename(columns={'rating': 'Avg_Rating'})
+ most_rated_final = pd.merge(most_rated_final, avg_ratings, how='left', on='movieId')
+
+ with col1:
+     st.subheader('📈 Top 5 Most Rated Movies')
+     st.dataframe(most_rated_final[['title', 'No_of_Ratings', 'Avg_Rating']])
+
+ # ============================= Least Rated Movies ====================================================
+ most_rated = ratings.groupby('movieId').agg({'rating': 'mean', 'userId': 'count'}).rename(
+     columns={'rating': 'Avg_Rating', 'userId': 'No_of_Ratings'}
+ )
+ most_rated = most_rated.sort_values(['No_of_Ratings', 'Avg_Rating']).head()
+ most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
+
+ with col2:
+     st.subheader('📉 Top 5 Least Rated Movies')
+     st.dataframe(most_rated_final[['title', 'No_of_Ratings', 'Avg_Rating']])
+
+ # ============================= Most 5⭐ and Most 0.5⭐ Rated Movies ===================================
+ st.markdown('---')
+
+ col1, col2 = st.columns(2)
+
+ filter_ratings = ratings[ratings['rating'] == 5]
+ most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
+ most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
+ most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
+
+ filter_ratings = ratings[ratings['rating'] == 0.5]
+ most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
+ most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
+ least_most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
+
+
+ with col1:
+     st.subheader('Movies with the Most 5⭐ Ratings')
+     st.dataframe(most_rated_final[['title', 'No_of_Ratings']])
+     st.subheader('Movies with the Most 0.5⭐ Ratings')
+     st.dataframe(least_most_rated_final[['title', 'No_of_Ratings']])
+
+ # ============================= Ratings Distribution Graph ============================================
+
+ # col1, col2 = st.columns(2)
+
+ with col2:
+     st.subheader('📊 Ratings Distribution')
+     graph = ratings.groupby('rating').count()['userId']
+     fig, ax = plt.subplots()
+     ax.bar(graph.index, graph.values)
+     ax.set_xlabel('Rating')
+     ax.set_ylabel('Number of Ratings')
+     ax.set_title('Number of Ratings by Rating Value')
+     st.pyplot(fig)
+
+ st.markdown('---')
+
+ # ============================= User Selected Ratings ===================================================
+ st.subheader('🎯 Movies Filtered by Selected Rating')
+ ratings_selected = st.selectbox(
+     'Select the Rating',
+     pd.Series(ratings['rating'].unique()).sort_values(ascending=False).tolist()
+ )
+ filter_ratings = ratings[ratings['rating'] == ratings_selected]
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+     st.markdown('**Most Rated for Selected Rating**')
+     most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
+     most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
+     most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
+     st.dataframe(most_rated_final[['title', 'No_of_Ratings']])
+ with col2:
+     st.markdown('**Least Rated for Selected Rating**')
+     most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values().head()
+     most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
+     most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
+     st.dataframe(most_rated_final[['title', 'No_of_Ratings']])
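
popularity.py repeats the same groupby, rename, and merge sequence for each of its tables. A small helper along these lines (illustrative, not part of the commit) would keep the page shorter and make the most-rated and least-rated variants one-liners:

    import pandas as pd

    def top_movies_by_count(ratings: pd.DataFrame, movies: pd.DataFrame,
                            rating_value=None, ascending=False, n=5) -> pd.DataFrame:
        # Optionally restrict to a single rating value (e.g. 5.0 or 0.5),
        # count ratings per movie, keep the top/bottom n, and attach titles.
        subset = ratings if rating_value is None else ratings[ratings['rating'] == rating_value]
        counts = (subset.groupby('movieId')['userId'].count()
                        .rename('No_of_Ratings')
                        .sort_values(ascending=ascending)
                        .head(n)
                        .reset_index())
        return counts.merge(movies[['movieId', 'title']], on='movieId', how='left')

    # Usage sketch:
    #   top_movies_by_count(ratings, movies)        # most rated overall
    #   top_movies_by_count(ratings, movies, 5.0)   # movies with the most 5-star ratings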
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ joblib==1.4.2
+ matplotlib==3.10.7
+ nltk==3.9.2
+ pandas==2.3.3
+ scikit_learn==1.7.2
+ sentence_transformers==5.1.1
+ streamlit==1.50.0
+ gensim==4.3.3
+ numpy==1.23.5
+ scipy==1.13.0