Upload 3 files
Browse files- .gitattributes +1 -0
- app.py +123 -0
- movies_metadata.csv +3 -0
- ratings_small.csv +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
movies_metadata.csv filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
+
from sklearn.model_selection import train_test_split
|
| 7 |
+
from sklearn.metrics import mean_squared_error
|
| 8 |
+
|
| 9 |
+
# Cache data loading to make it faster
|
| 10 |
+
@st.cache_data
def load_data():
    """Load the movie metadata and the ratings used by the app.

    Reads ``movies_metadata.csv`` and ``ratings_small.csv`` from the
    working directory. The movie table is down-sampled to at most 15,000
    rows with a fixed seed so repeated runs see the same catalogue.

    Returns:
        tuple: ``(movies, ratings)`` DataFrames. ``movies`` has a fresh
        ``0..n-1`` index, ``overview`` with missing values replaced by
        ``''`` and ``id`` coerced to nullable ``Int64`` (unparseable ids
        become ``<NA>``).
    """
    movies = pd.read_csv('movies_metadata.csv', low_memory=False)
    ratings = pd.read_csv('ratings_small.csv')

    # ``sample`` raises ValueError when asked for more rows than exist, so
    # clamp the request to the size of the file.
    sample_size = min(15000, len(movies))

    # Drop the original row labels: downstream code addresses matrix rows
    # by position, so labels and positions must agree after sampling.
    movies = movies.sample(n=sample_size, random_state=42).reset_index(drop=True)

    movies['overview'] = movies['overview'].fillna('')
    movies['id'] = pd.to_numeric(movies['id'], errors='coerce').astype('Int64')
    return movies, ratings
|
| 18 |
+
|
| 19 |
+
# Cache TF-IDF and similarity matrix for content-based
|
| 20 |
+
@st.cache_data
def compute_content_based_matrix(movies):
    """Build the genre TF-IDF matrix and the movie-to-movie similarity matrix.

    Args:
        movies: DataFrame with a ``genres`` column (``|`` separators are
            turned into spaces before vectorising) and a ``title`` column.

    Returns:
        tuple: ``(tfidf_matrix, similarity_matrix, title_to_index)`` where
        ``similarity_matrix[i, j]`` is the cosine similarity between the
        movies at row positions ``i`` and ``j`` of ``movies``, and
        ``title_to_index`` is a Series mapping each title to its row
        position.
    """
    # Work on a derived Series instead of adding a column to ``movies``:
    # mutating the argument of a cached function is a side effect on the
    # caller's frame. Missing genres become '' rather than crashing on
    # ``float.split``.
    genres_text = (
        movies['genres'].fillna('').astype(str).str.replace('|', ' ', regex=False)
    )

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(genres_text)

    # NOTE(review): this is a dense n x n array; for the 15,000-row sample
    # it is large and is also stored by the cache -- confirm memory budget.
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Map titles to row *positions*. Using ``movies.index`` here would store
    # index labels, which do not match matrix rows for a sampled or
    # filtered frame.
    title_to_index = pd.Series(np.arange(len(movies)), index=movies['title'])
    return tfidf_matrix, similarity_matrix, title_to_index
|
| 28 |
+
|
| 29 |
+
# Cache user profiles for user-based
|
| 30 |
+
@st.cache_data
def compute_user_profiles(ratings, movies):
    """Build a rating-weighted genre profile vector for every training user.

    The ratings are split 80/20 into train/test with a fixed seed; profiles
    are built from the training part only.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns.
        movies: DataFrame with ``genres`` and ``id`` columns.

    Returns:
        tuple: ``(user_profiles, tfidf_matrix, movie_id_to_idx,
        train_ratings, test_ratings)``. ``user_profiles`` maps a user id to
        a 1-D array in TF-IDF feature space; users none of whose rated
        movies appear in ``movies`` are omitted.
    """
    train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['genres'].fillna(''))

    # NOTE(review): ratings' ``movieId`` is matched directly against the
    # metadata ``id`` column -- confirm both files use the same id space.
    movie_id_to_idx = {mid: idx for idx, mid in enumerate(movies['id'])}

    def build_user_profile(ratings_df, tfidf_matrix, movie_id_to_idx):
        """Return ``{user_id: weighted mean TF-IDF vector}`` for ``ratings_df``."""
        user_profiles = {}
        for user_id, group in ratings_df.groupby('userId'):
            # Keep each rating paired with the row of the movie it belongs
            # to. Filtering the movies alone and then indexing the ratings
            # by list position pairs ratings with the wrong movies as soon
            # as one rated movie is missing from the catalogue.
            matched = [
                (movie_id_to_idx[movie_id], rating)
                for movie_id, rating in zip(group['movieId'].values, group['rating'].values)
                if movie_id in movie_id_to_idx
            ]
            if not matched:
                continue

            rows, weights = zip(*matched)
            weights = np.asarray(weights, dtype=float)

            # One sparse product replaces densifying every row separately:
            # (features x k) . (k,) -> rating-weighted sum of the k rows.
            weighted_sum = np.asarray(tfidf_matrix[list(rows)].T.dot(weights)).ravel()

            # Normalise by the ratings that actually contributed.
            rating_sum = weights.sum()
            user_profiles[user_id] = weighted_sum / rating_sum if rating_sum > 0 else weighted_sum
        return user_profiles

    user_profiles = build_user_profile(train_ratings, tfidf_matrix, movie_id_to_idx)
    return user_profiles, tfidf_matrix, movie_id_to_idx, train_ratings, test_ratings
|
| 53 |
+
|
| 54 |
+
# Content-based recommendation function
|
| 55 |
+
def get_similar_movies(title, similarity_matrix, title_to_index, movies, N=5):
    """Return the ``N`` movies most similar to ``title``.

    Args:
        title: Title to look up in ``title_to_index``.
        similarity_matrix: Square array; row ``i`` holds the similarity of
            movie ``i`` to every movie.
        title_to_index: Series mapping titles to row positions.
        movies: DataFrame whose ``title`` column is in the same row order
            as ``similarity_matrix``.
        N: Maximum number of results.

    Returns:
        A list of ``(title, score)`` tuples ordered by descending score
        (ties keep catalogue order), never containing the query movie
        itself; or ``None`` when the title is unknown or maps to a row
        outside the matrix.
    """
    try:
        index = title_to_index[title]
    except KeyError:
        return None

    # A title shared by several movies yields several positions; use the
    # first so the matrix lookup stays one-dimensional.
    if np.ndim(index) > 0:
        index = np.asarray(index).ravel()[0]
    index = int(index)

    if not 0 <= index < len(similarity_matrix):
        return None

    similarity_scores = np.asarray(similarity_matrix[index]).ravel()

    # Stable sort on the negated scores: descending order, ties resolved by
    # row position so the output is deterministic.
    ranked = np.argsort(-similarity_scores, kind='stable')

    # Remove the query movie explicitly. Skipping "the first result" is
    # wrong when other movies tie with it at the top score.
    ranked = ranked[ranked != index][:max(N, 0)]

    similar_movies = movies['title'].iloc[ranked]
    similar_scores = similarity_scores[ranked]
    return list(zip(similar_movies, similar_scores))
|
| 65 |
+
|
| 66 |
+
# User profile-based recommendation function
|
| 67 |
+
def get_top_n_recommendations(user_id, user_profiles, tfidf_matrix, movie_id_to_idx, movies, train_ratings, n=5):
    """Recommend up to ``n`` movies the user has not rated in the training set.

    Args:
        user_id: Key into ``user_profiles``.
        user_profiles: Mapping of user id to a 1-D profile vector.
        tfidf_matrix: Movie feature matrix, one row per row of ``movies``.
        movie_id_to_idx: Unused here; kept so existing callers keep working.
        movies: DataFrame with ``id`` and ``title`` columns.
        train_ratings: DataFrame with ``userId`` and ``movieId`` columns.
        n: Maximum number of recommendations.

    Returns:
        ``None`` when ``user_id`` has no profile; otherwise a list of
        ``(title, score)`` tuples in descending similarity order, where
        ``score`` is ``1 + 4 * cosine_similarity``.
    """
    if user_id not in user_profiles:
        return None
    if n <= 0:
        return []

    user_profile = user_profiles[user_id]
    similarities = cosine_similarity(user_profile.reshape(1, -1), tfidf_matrix).flatten()

    user_rows = train_ratings['userId'] == user_id
    rated_movies = set(train_ratings.loc[user_rows, 'movieId'].values)

    # Pull the columns out once; a pandas ``.iloc`` call per candidate row
    # is far slower than plain list indexing.
    movie_ids = movies['id'].tolist()
    titles = movies['title'].tolist()

    recommendations = []
    for idx in np.argsort(similarities)[::-1]:
        if movie_ids[idx] in rated_movies:
            continue
        recommendations.append((titles[idx], 1 + 4 * similarities[idx]))
        # Stop as soon as enough picks are found instead of filtering the
        # whole catalogue and slicing afterwards.
        if len(recommendations) >= n:
            break
    return recommendations
|
| 76 |
+
|
| 77 |
+
# Streamlit app
|
| 78 |
+
# ---- Page header ----
st.title("🎥 Movie Recommender System")
st.write("Pick a way to find awesome movies! Either choose a movie you like or enter your user ID for personalized picks.")

# ---- Data (cached after the first run) ----
movies, ratings = load_data()

# ---- Mode selection in the sidebar ----
recommendation_type = st.sidebar.selectbox(
    "Choose Recommendation Type",
    ["Content-Based", "User Profile-Based"],
)

if recommendation_type == "Content-Based":
    st.header("Content-Based Movie Recommendations")
    st.write("Enter a movie title to find similar movies based on genres.")

    # Genre TF-IDF and pairwise similarity for the loaded catalogue.
    tfidf_matrix, similarity_matrix, title_to_index = compute_content_based_matrix(movies)

    # The leading empty option means "nothing selected yet".
    title_options = [""] + list(movies['title'].dropna().unique())
    movie_title = st.selectbox("Select a Movie", options=title_options)

    if movie_title:
        recommendations = get_similar_movies(movie_title, similarity_matrix, title_to_index, movies, N=5)
        if not recommendations:
            st.error(f"Oops! Movie '{movie_title}' not found. Try another title!")
        else:
            st.write(f"**Movies similar to '{movie_title}':**")
            for i, (movie, score) in enumerate(recommendations, start=1):
                st.write(f"{i}. {movie} (Similarity Score: {score:.2f})")

else:
    st.header("User Profile-Based Movie Recommendations")
    st.write("Enter your user ID to get personalized movie picks based on your ratings.")

    # Per-user genre profiles built from the training split of the ratings.
    (
        user_profiles,
        tfidf_matrix,
        movie_id_to_idx,
        train_ratings,
        test_ratings,
    ) = compute_user_profiles(ratings, movies)

    user_id = st.number_input("Enter User ID", min_value=1, step=1, value=1)

    if st.button("Get Recommendations"):
        recommendations = get_top_n_recommendations(
            user_id, user_profiles, tfidf_matrix, movie_id_to_idx, movies, train_ratings, n=5
        )
        if not recommendations:
            st.error(f"Oops! User ID {user_id} not found or hasn't rated enough movies. Try another ID!")
        else:
            st.write(f"**Top 5 recommendations for User {user_id}:**")
            for i, (movie, pred_rating) in enumerate(recommendations, start=1):
                st.write(f"{i}. {movie} (Predicted Rating: {pred_rating:.2f})")
|
movies_metadata.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:37ec322786b136a24604564e55160be75f8937546f11cb1a990076971fa895fd
|
| 3 |
+
size 34445126
|
ratings_small.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|