File size: 5,991 Bytes
f15b668
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Cache data loading to make it faster
@st.cache_data
def load_data(movies_path='movies_metadata.csv', ratings_path='ratings_small.csv', sample_size=15000):
    """Load the movie metadata and the ratings table from CSV files.

    Args:
        movies_path: Path to the movie metadata CSV. It must provide the
            ``overview`` and ``id`` columns that are cleaned below.
        ratings_path: Path to the ratings CSV.
        sample_size: Upper bound on the number of movies kept. The sample is
            drawn with a fixed seed so reruns see the same subset.

    Returns:
        A ``(movies, ratings)`` tuple of DataFrames. ``movies`` carries a
        fresh ``0..n-1`` index so that index labels and row positions agree.
    """
    movies = pd.read_csv(movies_path, low_memory=False)
    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so never ask for more rows than the file contains.
    n_rows = min(sample_size, len(movies))
    # Resetting the index keeps labels aligned with the positional rows of any
    # matrix later built from this frame (TF-IDF / similarity matrices).
    movies = movies.sample(n=n_rows, random_state=42).reset_index(drop=True)
    ratings = pd.read_csv(ratings_path)
    movies['overview'] = movies['overview'].fillna('')
    # Non-numeric ids become <NA> instead of aborting the load.
    movies['id'] = pd.to_numeric(movies['id'], errors='coerce').astype('Int64')
    return movies, ratings

# Cache TF-IDF and similarity matrix for content-based
@st.cache_data
def compute_content_based_matrix(movies):
    """Build the genre TF-IDF matrix and the pairwise cosine similarities.

    Args:
        movies: DataFrame with ``genres`` and ``title`` columns.

    Returns:
        A ``(tfidf_matrix, similarity_matrix, title_to_index)`` tuple.
        ``title_to_index`` is a Series indexed by title whose values are the
        row positions of each movie inside both matrices.
    """
    # Work on a derived Series instead of adding a column to the caller's
    # frame; missing genres are treated as an empty document.
    genres_text = (
        movies['genres']
        .fillna('')
        .astype(str)
        .str.replace('|', ' ', regex=False)
    )
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(genres_text)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    # The matrices are addressed by row position, so map titles to positions
    # rather than to index labels (which differ after sampling/filtering).
    title_to_index = pd.Series(np.arange(len(movies)), index=movies['title'])
    return tfidf_matrix, similarity_matrix, title_to_index

# Cache user profiles for user-based
@st.cache_data
def compute_user_profiles(ratings, movies):
    """Build a rating-weighted genre profile for every user in the train split.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        movies: DataFrame with ``genres`` and ``id`` columns.

    Returns:
        A ``(user_profiles, tfidf_matrix, movie_id_to_idx, train_ratings,
        test_ratings)`` tuple. ``user_profiles`` maps a user id to a 1-D
        vector in the TF-IDF feature space; users none of whose rated movies
        appear in ``movies`` are omitted.
    """
    train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['genres'].fillna(''))
    movie_id_to_idx = {mid: idx for idx, mid in enumerate(movies['id'])}

    user_profiles = {}
    for user_id, group in train_ratings.groupby('userId'):
        # Keep each rating paired with its own movie row. Filtering the movie
        # list alone would shift the ratings against the movies whenever a
        # rated movie is missing from the catalogue.
        matched = [
            (movie_id_to_idx[movie_id], rating)
            for movie_id, rating in zip(group['movieId'].values, group['rating'].values)
            if movie_id in movie_id_to_idx
        ]
        if not matched:
            continue
        rows, weights = zip(*matched)
        weights = np.asarray(weights, dtype=float)
        # Sparse (features x k) times (k,) gives the rating-weighted sum of
        # the k movie vectors without densifying each row separately.
        weighted_sum = np.asarray(tfidf_matrix[list(rows)].T.dot(weights)).ravel()
        # Normalise by the ratings that actually contributed to the sum.
        weight_total = weights.sum()
        user_profiles[user_id] = weighted_sum / weight_total if weight_total > 0 else weighted_sum

    return user_profiles, tfidf_matrix, movie_id_to_idx, train_ratings, test_ratings

# Content-based recommendation function
def get_similar_movies(title, similarity_matrix, title_to_index, movies, N=5):
    """Return the ``N`` movies most similar to ``title``.

    Args:
        title: Title of the query movie.
        similarity_matrix: Square matrix of pairwise similarities, addressed
            by row position.
        title_to_index: Series mapping a title to its row position in
            ``similarity_matrix`` / ``movies``.
        movies: DataFrame with a ``title`` column, in matrix row order.
        N: Number of neighbours to return.

    Returns:
        A list of ``(title, score)`` tuples ordered by decreasing score, or
        ``None`` when ``title`` is not present in ``title_to_index``.
    """
    try:
        index = title_to_index[title]
    except KeyError:
        return None

    # A title shared by several movies yields a Series; use the first match.
    if isinstance(index, pd.Series):
        if index.empty:
            return None
        index = index.iloc[0]
    index = int(index)

    similarity_scores = np.asarray(similarity_matrix[index]).ravel()
    # Stable descending order, then drop the query movie explicitly: with
    # tied scores the query is not guaranteed to sort first, so slicing off
    # position 0 could discard a real neighbour and keep the query itself.
    ranked = np.argsort(-similarity_scores, kind='stable')
    similar_indices = ranked[ranked != index][:N]

    similar_movies = movies['title'].iloc[similar_indices]
    similar_scores = similarity_scores[similar_indices]
    return list(zip(similar_movies, similar_scores))

# User profile-based recommendation function
def get_top_n_recommendations(user_id, user_profiles, tfidf_matrix, movie_id_to_idx, movies, train_ratings, n=5):
    """Recommend the ``n`` unrated movies closest to a user's profile.

    Args:
        user_id: Key into ``user_profiles``.
        user_profiles: Mapping of user id to a 1-D profile vector in the same
            feature space as ``tfidf_matrix``.
        tfidf_matrix: Movie feature matrix, one row per row of ``movies``.
        movie_id_to_idx: Unused here; kept so existing callers keep working.
        movies: DataFrame with ``id`` and ``title`` columns, in matrix row order.
        train_ratings: DataFrame with ``userId`` and ``movieId`` columns; the
            movies listed for ``user_id`` are excluded from the result.
        n: Number of recommendations to return.

    Returns:
        A list of ``(title, score)`` tuples where ``score`` is
        ``1 + 4 * cosine_similarity``, ordered by decreasing similarity, or
        ``None`` when ``user_id`` has no profile.
    """
    if user_id not in user_profiles:
        return None
    user_profile = user_profiles[user_id]
    similarities = cosine_similarity(user_profile.reshape(1, -1), tfidf_matrix).flatten()
    ranked = np.argsort(similarities)[::-1]

    # One vectorised membership test replaces a Python-level .iloc lookup for
    # every row of the catalogue.
    rated_ids = train_ratings.loc[train_ratings['userId'] == user_id, 'movieId'].unique()
    already_rated = movies['id'].isin(rated_ids).to_numpy(dtype=bool)
    top_n_indices = ranked[~already_rated[ranked]][:n]

    return [(movies['title'].iloc[idx], 1 + 4 * similarities[idx]) for idx in top_n_indices]

# Streamlit app: the statements below run top to bottom and emit the page
# header, a sidebar selector, and one of two recommendation views.
st.title("🎥 Movie Recommender System")
st.write("Pick a way to find awesome movies! Either choose a movie you like or enter your user ID for personalized picks.")

# Load data (load_data is wrapped in st.cache_data)
movies, ratings = load_data()

# Sidebar for selecting recommendation type
recommendation_type = st.sidebar.selectbox("Choose Recommendation Type", ["Content-Based", "User Profile-Based"])

if recommendation_type == "Content-Based":
    st.header("Content-Based Movie Recommendations")
    st.write("Enter a movie title to find similar movies based on genres.")
    
    # Compute content-based matrices; tfidf_matrix is not used in this branch
    tfidf_matrix, similarity_matrix, title_to_index = compute_content_based_matrix(movies)
    
    # Movie title input; the leading "" option is falsy, so nothing is
    # looked up until a real title is chosen
    movie_title = st.selectbox("Select a Movie", options=[""] + list(movies['title'].dropna().unique()))
    
    if movie_title:
        recommendations = get_similar_movies(movie_title, similarity_matrix, title_to_index, movies, N=5)
        # None (title lookup failed) and an empty list both take the error branch
        if recommendations:
            st.write(f"**Movies similar to '{movie_title}':**")
            for i, (movie, score) in enumerate(recommendations, 1):
                st.write(f"{i}. {movie} (Similarity Score: {score:.2f})")
        else:
            st.error(f"Oops! Movie '{movie_title}' not found. Try another title!")

else:
    # Any selection other than "Content-Based" lands here, i.e. the
    # "User Profile-Based" option
    st.header("User Profile-Based Movie Recommendations")
    st.write("Enter your user ID to get personalized movie picks based on your ratings.")
    
    # Compute user profiles; test_ratings is returned but not used below
    user_profiles, tfidf_matrix, movie_id_to_idx, train_ratings, test_ratings = compute_user_profiles(ratings, movies)
    
    # User ID input
    # NOTE(review): presumably yields an int because min_value/step/value are ints — confirm
    user_id = st.number_input("Enter User ID", min_value=1, step=1, value=1)
    
    # Recommendations are only computed on the run in which the button is pressed
    if st.button("Get Recommendations"):
        recommendations = get_top_n_recommendations(user_id, user_profiles, tfidf_matrix, movie_id_to_idx, movies, train_ratings, n=5)
        # None (user has no profile) and an empty list both take the error branch
        if recommendations:
            st.write(f"**Top 5 recommendations for User {user_id}:**")
            for i, (movie, pred_rating) in enumerate(recommendations, 1):
                st.write(f"{i}. {movie} (Predicted Rating: {pred_rating:.2f})")
        else:
            st.error(f"Oops! User ID {user_id} not found or hasn't rated enough movies. Try another ID!")