# File size: 5,991 Bytes
# f15b668
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Cache data loading so Streamlit reruns do not re-read the CSV files
@st.cache_data
def load_data():
    """Load a sample of the movie metadata together with the user ratings.

    Reads ``movies_metadata.csv`` and ``ratings_small.csv`` from the current
    working directory.

    Returns:
        tuple: ``(movies, ratings)``.

        ``movies`` is a reproducible random sample (``random_state=42``) of at
        most 15,000 rows with a fresh ``0..n-1`` index, ``overview`` NaNs
        replaced by empty strings, and ``id`` converted to nullable ``Int64``
        (values that cannot be parsed as numbers become ``<NA>``).

        ``ratings`` is the ratings file exactly as read.
    """
    movies = pd.read_csv('movies_metadata.csv', low_memory=False)
    # DataFrame.sample raises when n exceeds the number of rows, so cap it.
    movies = movies.sample(n=min(15000, len(movies)), random_state=42)
    # Sampling keeps the original row labels. Renumber them so that code using
    # ``movies.index`` as a row position (e.g. into the similarity matrix)
    # agrees with code using ``iloc``/``enumerate``.
    movies = movies.reset_index(drop=True)
    ratings = pd.read_csv('ratings_small.csv')
    movies['overview'] = movies['overview'].fillna('')
    movies['id'] = pd.to_numeric(movies['id'], errors='coerce').astype('Int64')
    return movies, ratings
# Cache TF-IDF and similarity matrix for content-based
@st.cache_data
def compute_content_based_matrix(movies):
    """Build the genre TF-IDF matrix and the pairwise similarity matrix.

    Args:
        movies: DataFrame with ``genres`` and ``title`` columns. It is not
            modified.

    Returns:
        tuple: ``(tfidf_matrix, similarity_matrix, title_to_index)``.

        ``tfidf_matrix`` has one row per row of ``movies`` (same order).
        ``similarity_matrix`` holds the cosine similarity between every pair
        of rows of ``tfidf_matrix``.
        ``title_to_index`` is a Series mapping each title to the *position* of
        its row in the two matrices; when a title occurs more than once only
        its first occurrence is kept, so a lookup always yields one integer.
    """
    # Turn '|' separators into spaces for the tokenizer. Missing genres become
    # an empty document instead of raising on ``NaN.split``.
    genres_text = (
        movies['genres'].fillna('').astype(str).str.replace('|', ' ', regex=False)
    )
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(genres_text)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    # The matrices are addressed by row position, so store positions rather
    # than ``movies.index`` labels (which differ after sampling/filtering).
    title_to_index = pd.Series(np.arange(len(movies)), index=movies['title'])
    title_to_index = title_to_index[~title_to_index.index.duplicated(keep='first')]
    return tfidf_matrix, similarity_matrix, title_to_index
# Cache user profiles for user-based
@st.cache_data
def compute_user_profiles(ratings, movies):
    """Build a genre-preference vector for every user in the training split.

    The ratings are split 80/20 (``random_state=42``). For each user in the
    training part, the profile is the rating-weighted average of the TF-IDF
    genre vectors of the movies that user rated and that exist in ``movies``.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        movies: DataFrame with ``genres`` and ``id`` columns.

    Returns:
        tuple: ``(user_profiles, tfidf_matrix, movie_id_to_idx, train_ratings,
        test_ratings)``.

        ``user_profiles`` maps user id to a 1-D array with one entry per
        TF-IDF feature. Users with no rated movie present in ``movies`` are
        omitted.
        ``tfidf_matrix`` has one row per row of ``movies`` (same order).
        ``movie_id_to_idx`` maps a movie id to its row position in
        ``tfidf_matrix``.
    """
    train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['genres'].fillna(''))
    # Missing ids cannot be matched by any rating, so leave them out.
    movie_id_to_idx = {
        mid: idx for idx, mid in enumerate(movies['id']) if pd.notna(mid)
    }
    # NOTE(review): this assumes ``ratings['movieId']`` and ``movies['id']``
    # share one id space — confirm against the datasets in use.

    user_profiles = {}
    for user_id, group in train_ratings.groupby('userId'):
        # Keep each rating paired with the matrix row of its movie. Filtering
        # the movies alone would shift the ratings onto the wrong movies.
        rows = []
        weights = []
        for movie_id, rating in zip(group['movieId'].values, group['rating'].values):
            row = movie_id_to_idx.get(movie_id)
            if row is not None:
                rows.append(row)
                weights.append(rating)
        if not rows:
            continue
        weights = np.asarray(weights, dtype=float)
        # One sparse product instead of densifying every row:
        # sum_i weights[i] * tfidf_matrix[rows[i]].
        weighted_sum = np.asarray(tfidf_matrix[rows].T.dot(weights)).ravel()
        # Normalize only by the ratings that actually contributed.
        rating_sum = weights.sum()
        user_profiles[user_id] = (
            weighted_sum / rating_sum if rating_sum > 0 else weighted_sum
        )
    return user_profiles, tfidf_matrix, movie_id_to_idx, train_ratings, test_ratings
# Content-based recommendation function
def get_similar_movies(title, similarity_matrix, title_to_index, movies, N=5):
    """Return the ``N`` movies most similar to ``title``.

    Args:
        title: Movie title to look up in ``title_to_index``.
        similarity_matrix: Square array; row ``i`` holds the similarity of
            movie ``i`` to every movie.
        title_to_index: Series mapping title to row position in
            ``similarity_matrix``. If a title maps to several rows, the first
            one is used.
        movies: DataFrame whose ``title`` column is in the same row order as
            ``similarity_matrix``.
        N: Maximum number of results.

    Returns:
        list[tuple] | None: ``(title, score)`` pairs ordered by descending
        score, never containing the queried row itself; ``None`` if ``title``
        is not in ``title_to_index``.
    """
    # Only the lookup can legitimately raise KeyError; keep the try minimal.
    try:
        index = title_to_index[title]
    except KeyError:
        return None
    # Duplicate titles make the lookup return a Series; use the first match.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    similarity_scores = np.asarray(similarity_matrix[index])
    # Stable descending order keeps ties deterministic (lowest row first).
    ranked = np.argsort(-similarity_scores, kind='stable')
    # Drop the query row explicitly: with tied scores it is not guaranteed to
    # sort first, so slicing off position 0 could keep it in the results.
    similar_indices = ranked[ranked != index][:max(N, 0)]
    similar_movies = movies['title'].iloc[similar_indices]
    similar_scores = similarity_scores[similar_indices]
    return list(zip(similar_movies, similar_scores))
# User profile-based recommendation function
def get_top_n_recommendations(user_id, user_profiles, tfidf_matrix, movie_id_to_idx, movies, train_ratings, n=5):
    """Recommend up to ``n`` movies the user has not rated in the training data.

    Args:
        user_id: Key into ``user_profiles``.
        user_profiles: Mapping of user id to a 1-D profile vector with one
            entry per column of ``tfidf_matrix``.
        tfidf_matrix: Movie feature matrix, rows in the same order as
            ``movies``.
        movie_id_to_idx: Unused here; kept so existing callers keep working.
        movies: DataFrame with ``id`` and ``title`` columns.
        train_ratings: DataFrame with ``userId`` and ``movieId`` columns.
        n: Maximum number of recommendations.

    Returns:
        list[tuple] | None: ``(title, predicted_rating)`` pairs ordered by
        descending similarity, where the predicted rating is
        ``1 + 4 * cosine_similarity``; ``None`` if ``user_id`` has no profile.
    """
    if user_id not in user_profiles:
        return None

    user_profile = user_profiles[user_id]
    similarities = cosine_similarity(user_profile.reshape(1, -1), tfidf_matrix).flatten()
    ranked_indices = np.argsort(similarities)[::-1]
    rated_movies = set(train_ratings[train_ratings['userId'] == user_id]['movieId'].values)

    # Walk the ranking and stop as soon as n unseen movies are collected,
    # instead of testing every movie in the catalogue before slicing.
    movie_ids = movies['id']
    top_n_indices = []
    for idx in ranked_indices:
        if len(top_n_indices) >= n:
            break
        if movie_ids.iloc[idx] not in rated_movies:
            top_n_indices.append(idx)

    titles = movies['title']
    return [(titles.iloc[idx], 1 + 4 * similarities[idx]) for idx in top_n_indices]
# Streamlit app: page header, data loading, then one of two recommendation
# modes chosen in the sidebar.
st.title("🎥 Movie Recommender System")
st.write("Pick a way to find awesome movies! Either choose a movie you like or enter your user ID for personalized picks.")

# Load data (cached across reruns)
movies, ratings = load_data()

# Sidebar for selecting recommendation type
recommendation_type = st.sidebar.selectbox("Choose Recommendation Type", ["Content-Based", "User Profile-Based"])

if recommendation_type == "Content-Based":
    st.header("Content-Based Movie Recommendations")
    st.write("Enter a movie title to find similar movies based on genres.")

    # Compute content-based matrices (cached)
    tfidf_matrix, similarity_matrix, title_to_index = compute_content_based_matrix(movies)

    # Movie title input; the leading empty option means "nothing selected yet"
    movie_title = st.selectbox("Select a Movie", options=[""] + list(movies['title'].dropna().unique()))

    if movie_title:
        recommendations = get_similar_movies(movie_title, similarity_matrix, title_to_index, movies, N=5)
        if recommendations:
            st.write(f"**Movies similar to '{movie_title}':**")
            for i, (movie, score) in enumerate(recommendations, 1):
                st.write(f"{i}. {movie} (Similarity Score: {score:.2f})")
        else:
            # Reached for an unknown title (None) and for an empty result list.
            st.error(f"Oops! Movie '{movie_title}' not found. Try another title!")
else:
    st.header("User Profile-Based Movie Recommendations")
    st.write("Enter your user ID to get personalized movie picks based on your ratings.")

    # Compute user profiles (cached)
    user_profiles, tfidf_matrix, movie_id_to_idx, train_ratings, test_ratings = compute_user_profiles(ratings, movies)

    # User ID input
    user_id = st.number_input("Enter User ID", min_value=1, step=1, value=1)

    if st.button("Get Recommendations"):
        recommendations = get_top_n_recommendations(user_id, user_profiles, tfidf_matrix, movie_id_to_idx, movies, train_ratings, n=5)
        if recommendations:
            st.write(f"**Top 5 recommendations for User {user_id}:**")
            for i, (movie, pred_rating) in enumerate(recommendations, 1):
                st.write(f"{i}. {movie} (Predicted Rating: {pred_rating:.2f})")
        else:
            st.error(f"Oops! User ID {user_id} not found or hasn't rated enough movies. Try another ID!")