Janesh1 committed on
Commit
f15b668
·
verified ·
1 Parent(s): 0a4866d

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +123 -0
  3. movies_metadata.csv +3 -0
  4. ratings_small.csv +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ movies_metadata.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.metrics import mean_squared_error
8
+
9
+ # Cache data loading to make it faster
10
@st.cache_data
def load_data():
    """Load the movie metadata sample and the ratings table.

    Reads ``movies_metadata.csv`` and ``ratings_small.csv`` from the working
    directory, keeps a reproducible random sample of at most 15000 movies,
    fills missing overviews with an empty string and converts ``id`` to a
    nullable integer column (values that are not numeric become ``<NA>``).

    Returns:
        tuple: ``(movies, ratings)`` as pandas DataFrames. ``movies`` has a
        fresh 0..n-1 index so row labels and row positions coincide.
    """
    movies = pd.read_csv('movies_metadata.csv', low_memory=False)

    # sample() raises ValueError when n exceeds the number of rows, so cap it.
    sample_size = min(15000, len(movies))

    # sample() keeps the original row labels; reset them so that anything
    # derived from ``movies.index`` can be used as a positional index into
    # matrices built from this frame.
    movies = movies.sample(n=sample_size, random_state=42).reset_index(drop=True)

    ratings = pd.read_csv('ratings_small.csv')
    movies['overview'] = movies['overview'].fillna('')
    movies['id'] = pd.to_numeric(movies['id'], errors='coerce').astype('Int64')
    return movies, ratings
18
+
19
+ # Cache TF-IDF and similarity matrix for content-based
20
@st.cache_data
def compute_content_based_matrix(movies):
    """Build the genre TF-IDF matrix and the movie-to-movie similarity matrix.

    Args:
        movies: DataFrame with at least ``genres`` and ``title`` columns.

    Returns:
        tuple: ``(tfidf_matrix, similarity_matrix, title_to_index)`` where
        ``tfidf_matrix`` has one row per movie, ``similarity_matrix`` is the
        dense pairwise cosine similarity of those rows, and
        ``title_to_index`` is a Series mapping each title to the *row
        position* of that movie in both matrices.
    """
    # Build the text locally instead of adding a column to the caller's
    # frame. Missing genres become empty documents rather than crashing on
    # ``float.split``; '|' separators are turned into spaces as before.
    genres_text = (
        movies['genres']
        .fillna('')
        .astype(str)
        .str.replace('|', ' ', regex=False)
    )

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(genres_text)

    # NOTE: this is a dense n_movies x n_movies array.
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Matrix rows are positional, so map titles to 0..n-1 rather than to the
    # frame's index labels (which are arbitrary after sampling/filtering).
    title_to_index = pd.Series(np.arange(len(movies)), index=movies['title'])
    return tfidf_matrix, similarity_matrix, title_to_index
28
+
29
+ # Cache user profiles for user-based
30
@st.cache_data
def compute_user_profiles(ratings, movies):
    """Build one rating-weighted genre profile vector per user.

    The ratings are split 80/20 into train and test parts (fixed seed); only
    the training part is used to build profiles. A profile is the
    rating-weighted average of the genre TF-IDF vectors of the movies the
    user rated that are present in ``movies``.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        movies: DataFrame with ``genres`` and ``id`` columns.

    Returns:
        tuple: ``(user_profiles, tfidf_matrix, movie_id_to_idx,
        train_ratings, test_ratings)``. ``user_profiles`` maps user id to a
        1-D numpy vector; users with no rated movie found in ``movies`` are
        omitted. ``movie_id_to_idx`` maps movie id to row position in
        ``tfidf_matrix``.
    """
    train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['genres'].fillna(''))

    # Skip missing ids so that <NA> never becomes a lookup key.
    # NOTE(review): this assumes ratings['movieId'] and movies['id'] share
    # the same id space -- confirm against the dataset.
    movie_id_to_idx = {
        mid: idx for idx, mid in enumerate(movies['id']) if pd.notna(mid)
    }

    def build_user_profile(ratings_df):
        """Return {user_id: profile_vector} for the given ratings."""
        profiles = {}
        for user_id, group in ratings_df.groupby('userId'):
            # Keep each rating paired with its own movie row. Filtering the
            # movies separately from the ratings would misalign the weights
            # whenever a rated movie is absent from ``movies``.
            known = [
                (movie_id_to_idx[movie_id], rating)
                for movie_id, rating in zip(group['movieId'].values,
                                            group['rating'].values)
                if movie_id in movie_id_to_idx
            ]
            if not known:
                continue
            weighted_vectors = np.sum(
                [rating * tfidf_matrix[row].toarray().flatten()
                 for row, rating in known],
                axis=0,
            )
            # Normalise by the ratings that actually contributed.
            rating_sum = sum(rating for _, rating in known)
            profiles[user_id] = (
                weighted_vectors / rating_sum if rating_sum > 0 else weighted_vectors
            )
        return profiles

    user_profiles = build_user_profile(train_ratings)
    return user_profiles, tfidf_matrix, movie_id_to_idx, train_ratings, test_ratings
53
+
54
+ # Content-based recommendation function
55
def get_similar_movies(title, similarity_matrix, title_to_index, movies, N=5):
    """Return the ``N`` movies most similar to ``title``.

    Args:
        title: Movie title to look up in ``title_to_index``.
        similarity_matrix: Square array; row ``i`` holds the similarity of
            movie ``i`` to every movie.
        title_to_index: Mapping (e.g. a pandas Series) from title to the row
            position of that movie in ``similarity_matrix``.
        movies: DataFrame whose ``title`` column is in the same row order as
            ``similarity_matrix``.
        N: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, score)`` tuples ordered by descending score, or
        ``None`` when the title is unknown or maps to a row outside the
        similarity matrix.
    """
    try:
        index = title_to_index[title]
    except KeyError:
        return None

    # A title that occurs more than once yields several positions; use the
    # first one so the row lookup below stays one-dimensional.
    if np.ndim(index) > 0:
        candidates = np.asarray(index).ravel()
        if candidates.size == 0:
            return None
        index = candidates[0]
    index = int(index)

    if not 0 <= index < similarity_matrix.shape[0]:
        return None

    similarity_scores = np.asarray(similarity_matrix[index])

    # Stable descending order, then drop the query movie explicitly: other
    # movies can tie with it at the top score, so it is not guaranteed to be
    # the first entry.
    ranked = np.argsort(-similarity_scores, kind='stable')
    similar_indices = ranked[ranked != index][:N]

    similar_movies = movies['title'].iloc[similar_indices]
    similar_scores = similarity_scores[similar_indices]
    return list(zip(similar_movies, similar_scores))
65
+
66
+ # User profile-based recommendation function
67
def get_top_n_recommendations(user_id, user_profiles, tfidf_matrix, movie_id_to_idx, movies, train_ratings, n=5):
    """Recommend the ``n`` unseen movies closest to a user's profile.

    Args:
        user_id: Key into ``user_profiles``.
        user_profiles: Mapping from user id to a 1-D profile vector.
        tfidf_matrix: One row per movie, in the same row order as ``movies``.
        movie_id_to_idx: Unused here; kept for interface compatibility.
        movies: DataFrame with ``id`` and ``title`` columns.
        train_ratings: DataFrame with ``userId`` and ``movieId`` columns;
            movies the user rated here are excluded from the result.
        n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, predicted_rating)`` tuples, best first, where the
        predicted rating is ``1 + 4 * cosine_similarity``; or ``None`` when
        the user has no profile.
    """
    if user_id not in user_profiles:
        return None

    user_profile = user_profiles[user_id]
    similarities = cosine_similarity(user_profile.reshape(1, -1), tfidf_matrix).flatten()
    ranked_indices = np.argsort(similarities)[::-1]

    user_rows = train_ratings[train_ratings['userId'] == user_id]
    rated_movies = set(user_rows['movieId'].values)

    # Walk the ranking only until n unseen movies are found instead of
    # testing every movie before slicing.
    movie_ids = movies['id']
    top_n_indices = []
    for idx in ranked_indices:
        if len(top_n_indices) >= n:
            break
        if movie_ids.iloc[idx] not in rated_movies:
            top_n_indices.append(idx)

    titles = movies['title']
    return [(titles.iloc[idx], 1 + 4 * similarities[idx]) for idx in top_n_indices]
76
+
77
+ # Streamlit app
78
# ---- Page header ----
st.title("🎥 Movie Recommender System")
st.write("Pick a way to find awesome movies! Either choose a movie you like or enter your user ID for personalized picks.")

# ---- Data ----
movies, ratings = load_data()

# ---- Mode selection (sidebar) ----
recommendation_modes = ["Content-Based", "User Profile-Based"]
recommendation_type = st.sidebar.selectbox("Choose Recommendation Type", recommendation_modes)

if recommendation_type == "Content-Based":
    st.header("Content-Based Movie Recommendations")
    st.write("Enter a movie title to find similar movies based on genres.")

    # Genre TF-IDF matrix, pairwise similarities and title lookup.
    tfidf_matrix, similarity_matrix, title_to_index = compute_content_based_matrix(movies)

    # The leading empty option means "nothing selected yet".
    known_titles = movies['title'].dropna().unique()
    movie_title = st.selectbox("Select a Movie", options=["", *known_titles])

    if movie_title:
        recommendations = get_similar_movies(movie_title, similarity_matrix, title_to_index, movies, N=5)
        if not recommendations:
            st.error(f"Oops! Movie '{movie_title}' not found. Try another title!")
        else:
            st.write(f"**Movies similar to '{movie_title}':**")
            for i, (movie, score) in enumerate(recommendations, start=1):
                st.write(f"{i}. {movie} (Similarity Score: {score:.2f})")

else:
    st.header("User Profile-Based Movie Recommendations")
    st.write("Enter your user ID to get personalized movie picks based on your ratings.")

    # Per-user genre profiles built from the training split of the ratings.
    (
        user_profiles,
        tfidf_matrix,
        movie_id_to_idx,
        train_ratings,
        test_ratings,
    ) = compute_user_profiles(ratings, movies)

    user_id = st.number_input("Enter User ID", min_value=1, step=1, value=1)

    # Recommendations are only computed when the button is pressed.
    if st.button("Get Recommendations"):
        recommendations = get_top_n_recommendations(
            user_id,
            user_profiles,
            tfidf_matrix,
            movie_id_to_idx,
            movies,
            train_ratings,
            n=5,
        )
        if not recommendations:
            st.error(f"Oops! User ID {user_id} not found or hasn't rated enough movies. Try another ID!")
        else:
            st.write(f"**Top 5 recommendations for User {user_id}:**")
            for i, (movie, pred_rating) in enumerate(recommendations, start=1):
                st.write(f"{i}. {movie} (Predicted Rating: {pred_rating:.2f})")
movies_metadata.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37ec322786b136a24604564e55160be75f8937546f11cb1a990076971fa895fd
3
+ size 34445126
ratings_small.csv ADDED
The diff for this file is too large to render. See raw diff