# content_based.py — Content-based movie recommender (Bag of Words / TF-IDF / Word2Vec).
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import streamlit as st
from gensim.models import Word2Vec
# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("Content Based Filtering")
st.markdown('---')

# ======================== Load Data ==========================
# movieId is an internal identifier not used by any content-based step below.
movies = pd.read_csv('movies_final.csv').drop(columns=['movieId'])
# ======================== Helper Function ====================
def combine_tokens(row, cols):
    """Collect the unique tokens found in the given columns of one movie row.

    Parameters
    ----------
    row : pandas.Series or mapping
        A single movie record. Each selected column is expected to hold
        either a list of tokens or a single string token; any other value
        (e.g. NaN for missing data) is skipped.
    cols : iterable of str
        Column names to pull tokens from.

    Returns
    -------
    list of str
        Unique tokens in first-seen order.
    """
    combined = []
    for col in cols:
        val = row[col]
        if isinstance(val, list):
            combined.extend(val)
        elif isinstance(val, str):
            combined.append(val)
    # Ordered de-duplication. The original list(set(...)) produced a
    # hash-randomized order, so the combined token string (and therefore the
    # vectorizer vocabulary/input) changed between runs; dict.fromkeys keeps
    # the result deterministic.
    return list(dict.fromkeys(combined))
# ======================== BOW ================================
def recommend_bow(title, year, top_n=5):
    """Recommend the movies most similar to (title, year) via Bag of Words.

    Relies on the module-level globals ``movies`` (DataFrame) and
    ``bow_matrix`` (CountVectorizer output, aligned row-for-row with
    ``movies`` — built in the UI section before this is called).

    Parameters
    ----------
    title : str
        Exact title as it appears in ``movies['title']``.
    year :
        Release year; same dtype as ``movies['year']``.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``/``year``/``similarity_score``, highest similarity
        first — or the string ``"Movie not found."`` when the (title, year)
        pair is absent.
    """
    # Guard on BOTH title and year: the original checked only the title, so a
    # valid title paired with a wrong year fell through to an IndexError at
    # .index[0].
    matches = movies[(movies['title'] == title) & (movies['year'] == year)]
    if matches.empty:
        return "Movie not found."
    idx = matches.index[0]

    # Cosine similarity of the query row against every movie.
    sim_scores = cosine_similarity(bow_matrix[idx], bow_matrix).flatten()

    # Rank all movies, then drop the query itself explicitly. Slicing [1:]
    # assumed the query always ranks first, which argsort does not guarantee
    # when other movies tie at similarity 1.0.
    ranked = sim_scores.argsort()[::-1]
    similar_indices = [i for i in ranked if i != idx][:top_n]

    recommendations = pd.DataFrame({
        'title': movies.iloc[similar_indices]['title'].values,
        'year': movies.iloc[similar_indices]['year'].values,
        'similarity_score': sim_scores[similar_indices],
    })
    # Sort for clarity (highest similarity first).
    return recommendations.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)
# ======================== TF-IDF ==============================
def recommend_tfidf(title, year, top_n=5):
    """Recommend the movies most similar to (title, year) via TF-IDF.

    Relies on the module-level globals ``movies`` (DataFrame) and
    ``tfidf_matrix`` (TfidfVectorizer output, aligned row-for-row with
    ``movies`` — built in the UI section before this is called).

    Returns
    -------
    list of str
        Titles of the ``top_n`` most similar movies — or the string
        ``"Movie not found."`` when the (title, year) pair is absent.
    """
    # Guard on BOTH title and year (title-only check previously allowed an
    # IndexError at .index[0] for a wrong year).
    matches = movies[(movies['title'] == title) & (movies['year'] == year)]
    if matches.empty:
        return "Movie not found."
    idx = matches.index[0]

    sim_scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()

    # Exclude the query movie itself explicitly instead of assuming it always
    # occupies rank 0 of the argsort (ties can break that assumption).
    ranked = sim_scores.argsort()[::-1]
    similar_indices = [i for i in ranked if i != idx][:top_n]

    return movies.iloc[similar_indices]['title'].tolist()
# ======================== WORD2VEC ============================
def build_word2vec_vectors():
    """Train Word2Vec on the movie token strings and build a vector matrix.

    Side effect: adds a ``w2v_vector`` column to the module-level ``movies``
    DataFrame holding each movie's averaged word vector.

    Returns
    -------
    (Word2Vec, numpy.ndarray)
        The trained model and a ``(n_movies, 100)`` matrix whose rows align
        with ``movies``.
    """
    tokenized_corpus = [text.split() for text in movies['tokens']]
    # seed=42 makes vector initialization repeatable so recommendations do
    # not change on every rerun. NOTE(review): fully deterministic training
    # would also require workers=1 (gensim docs); kept at 4 for speed.
    w2v_model = Word2Vec(sentences=tokenized_corpus, vector_size=100,
                         window=5, min_count=1, workers=4, seed=42)

    def _avg_vector(tokens):
        # Mean of the in-vocabulary word vectors; zero vector when none of
        # the tokens are known (e.g. a movie with an empty token string).
        vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
        if not vectors:
            return np.zeros(w2v_model.vector_size)
        return np.mean(vectors, axis=0)

    movies['w2v_vector'] = [_avg_vector(text.split()) for text in movies['tokens']]
    w2v_matrix = np.vstack(movies['w2v_vector'].values)
    return w2v_model, w2v_matrix
def recommend_w2v(title, year, w2v_matrix, top_n=5):
    """Recommend the movies most similar to (title, year) via Word2Vec vectors.

    Parameters
    ----------
    title : str
        Exact title as it appears in ``movies['title']``.
    year :
        Release year; same dtype as ``movies['year']``.
    w2v_matrix : numpy.ndarray
        ``(n_movies, dim)`` matrix of averaged word vectors, row-aligned
        with the module-level ``movies`` DataFrame.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    list of str
        Titles of the ``top_n`` most similar movies — or the string
        ``"Movie not found."`` when the (title, year) pair is absent.
    """
    # Guard on BOTH title and year (title-only check previously allowed an
    # IndexError at .index[0] for a wrong year).
    matches = movies[(movies['title'] == title) & (movies['year'] == year)]
    if matches.empty:
        return "Movie not found."
    idx = matches.index[0]

    query_vec = w2v_matrix[idx].reshape(1, -1)
    sims = cosine_similarity(query_vec, w2v_matrix).flatten()

    # Exclude the query movie itself explicitly instead of assuming it always
    # occupies rank 0 of the argsort (ties can break that assumption).
    ranked = sims.argsort()[::-1]
    similar_indices = [i for i in ranked if i != idx][:top_n]

    return movies.iloc[similar_indices]['title'].tolist()
# ======================== UI ========================
st.header('🎬 Choose an Algorithm and Movie for Content-Based Filtering')
select_algo = st.selectbox('Select the algorithm', ['Bag of Words', 'TF-IDF', 'Word2Vec'])
selected_movie = st.selectbox('Select a movie', movies['title'].unique())
selected_year = st.selectbox('Select the year', movies[movies['title'] == selected_movie]['year'])
all_columns = movies.columns.tolist()
selected_cols = st.multiselect("Select columns to combine for tokens", all_columns, default=['genres', 'tag', 'plot'])

# Build the whitespace-joined token string each vectorizer consumes.
movies['tokens'] = movies.apply(lambda row: combine_tokens(row, selected_cols), axis=1)
movies['tokens'] = movies['tokens'].apply(lambda x: ' '.join(x))

if st.button('Recommend'):
    # Show the query movie alongside the columns used for matching.
    st.dataframe(movies[(movies['title'] == selected_movie) & (movies['year'] == selected_year)][['title'] + selected_cols])

    # Recommendation based on the selected algorithm. Each recommend_*
    # function returns the plain string "Movie not found." on a miss —
    # previously that string was fed straight into pd.merge / .isin and
    # crashed the app, so every branch now checks for it first.
    if select_algo == 'Bag of Words':
        with st.spinner("Building Bag of Words model..."):
            vectorizer = CountVectorizer()
            bow_matrix = vectorizer.fit_transform(movies['tokens'])
            output = recommend_bow(selected_movie, selected_year, top_n=5)
        st.success("Bag of Words model ready ✅")
        if isinstance(output, str):
            st.warning(output)
        else:
            # Attach the token source columns to each recommendation row.
            output_display = pd.merge(output, movies[['title', 'year'] + selected_cols],
                                      on=['title', 'year'], how='left').reset_index(drop=True)
            st.subheader("🎯 Top Recommendations (Bag of Words)")
            st.dataframe(output_display[['title', 'year', 'similarity_score'] + selected_cols])

    elif select_algo == 'TF-IDF':
        with st.spinner("Building TF-IDF model..."):
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(movies['tokens'])
            output = recommend_tfidf(selected_movie, selected_year, top_n=5)
        st.success("TF-IDF model ready ✅")
        if isinstance(output, str):
            st.warning(output)
        else:
            st.dataframe(movies[movies['title'].isin(output)][['title'] + selected_cols].reset_index(drop=True))

    elif select_algo == 'Word2Vec':
        with st.spinner("Training Word2Vec model..."):
            w2v_model, w2v_matrix = build_word2vec_vectors()
            output = recommend_w2v(selected_movie, selected_year, w2v_matrix, top_n=5)
        st.success("Word2Vec model ready ✅")
        if isinstance(output, str):
            st.warning(output)
        else:
            st.dataframe(movies[movies['title'].isin(output)][['title'] + selected_cols].reset_index(drop=True))