import numpy as np
import pandas as pd
import streamlit as st

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
|
# --- Streamlit page setup and data loading --------------------------------

st.set_page_config(layout="wide")
st.title("Content Based Filtering")
st.markdown('---')

# Load the preprocessed movie catalogue; movieId is not used for
# content-based similarity, so drop it up front.
movies = pd.read_csv('movies_final.csv').drop(columns=['movieId'])
|
def combine_tokens(row, cols):
    """Collect the unique token values found in *cols* of a DataFrame row.

    List-valued cells contribute each of their elements; string cells
    contribute the string itself; any other cell type (e.g. NaN) is
    ignored. Returns the de-duplicated tokens as a list (order
    unspecified, since the values pass through a set).
    """
    seen = set()
    for column in cols:
        cell = row[column]
        if isinstance(cell, list):
            seen.update(cell)
        elif isinstance(cell, str):
            seen.add(cell)
    return list(seen)
|
def recommend_bow(title, year, top_n=5):
    """Recommend movies similar to (*title*, *year*) using Bag-of-Words.

    Relies on the module-level ``movies`` DataFrame and ``bow_matrix``
    (CountVectorizer output, one row per movie, row-aligned with
    ``movies``' default RangeIndex).

    Returns a DataFrame with columns title / year / similarity_score in
    descending similarity order, or the string "Movie not found." when
    the (title, year) pair is not in the catalogue.
    """
    # Match on BOTH title and year: the original title-only existence check
    # let a valid title with a non-matching year fall through to an
    # IndexError at .index[0].
    matches = movies[(movies['title'] == title) & (movies['year'] == year)]
    if matches.empty:
        return "Movie not found."
    # Index label doubles as the positional row because movies keeps its
    # default RangeIndex after read_csv.
    idx = matches.index[0]

    query_vector = bow_matrix[idx]
    sim_scores = cosine_similarity(query_vector, bow_matrix).flatten()

    # Skip position 0 of the descending sort: it is the query movie itself.
    similar_indices = sim_scores.argsort()[::-1][1:top_n + 1]

    recommendations = pd.DataFrame({
        'title': movies.iloc[similar_indices]['title'].values,
        'year': movies.iloc[similar_indices]['year'].values,
        'similarity_score': sim_scores[similar_indices],
    })
    return recommendations.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)
|
def recommend_tfidf(title, year, top_n=5):
    """Recommend up to *top_n* movie titles similar to (*title*, *year*)
    using TF-IDF vectors.

    Relies on the module-level ``movies`` DataFrame and ``tfidf_matrix``
    (TfidfVectorizer output, row-aligned with ``movies``' RangeIndex).

    Returns a list of titles, or the string "Movie not found." when the
    (title, year) pair is not in the catalogue.
    """
    # Guard on the full (title, year) key; a title-only check would still
    # crash with IndexError when the year does not match.
    matches = movies[(movies['title'] == title) & (movies['year'] == year)]
    if matches.empty:
        return "Movie not found."
    idx = matches.index[0]

    query_vector = tfidf_matrix[idx]
    sim_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Descending similarity; drop the first hit (the query movie itself).
    similar_indices = sim_scores.argsort()[::-1][1:1 + top_n]

    return movies.iloc[similar_indices]['title'].tolist()
|
def build_word2vec_vectors():
    """Train a Word2Vec model on the movie token corpus and return it
    together with a dense matrix of per-movie averaged word vectors.

    Uses the module-level ``movies`` DataFrame (``movies['tokens']`` holds
    one whitespace-joined token string per movie) and, as a side effect,
    stores each movie's averaged vector in a new ``movies['w2v_vector']``
    column.
    """
    corpus = [doc.split() for doc in movies['tokens']]

    model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)

    def average_vector(words):
        # Mean of the in-vocabulary word vectors; zero vector when none match.
        known = [model.wv[word] for word in words if word in model.wv]
        if not known:
            return np.zeros(model.vector_size)
        return np.mean(known, axis=0)

    movies['w2v_vector'] = [average_vector(doc.split()) for doc in movies['tokens']]
    matrix = np.vstack(movies['w2v_vector'].values)
    return model, matrix
|
def recommend_w2v(title, year, w2v_matrix, top_n=5):
    """Recommend up to *top_n* movie titles similar to (*title*, *year*)
    using averaged Word2Vec vectors.

    *w2v_matrix* is the dense (n_movies, vector_size) matrix produced by
    ``build_word2vec_vectors``, row-aligned with the module-level
    ``movies`` DataFrame.

    Returns a list of titles, or the string "Movie not found." when the
    (title, year) pair is not in the catalogue.
    """
    # Guard on both keys; a title-only check would raise IndexError for a
    # valid title paired with the wrong year.
    matches = movies[(movies['title'] == title) & (movies['year'] == year)]
    if matches.empty:
        return "Movie not found."
    idx = matches.index[0]

    query_vec = w2v_matrix[idx].reshape(1, -1)
    sims = cosine_similarity(query_vec, w2v_matrix).flatten()

    # Descending similarity, excluding the query movie at position 0.
    similar_indices = sims.argsort()[::-1][1:1 + top_n]

    return movies.iloc[similar_indices]['title'].tolist()
|
# --- User controls --------------------------------------------------------

st.header('🎬 Choose an Algorithm and Movie for Content-Based Filtering')

select_algo = st.selectbox('Select the algorithm', ['Bag of Words', 'TF-IDF', 'Word2Vec'])
selected_movie = st.selectbox('Select a movie', movies['title'].unique())
# Years are narrowed to the chosen title so remakes can be disambiguated.
selected_year = st.selectbox('Select the year', movies[movies['title'] == selected_movie]['year'])

all_columns = movies.columns.tolist()
selected_cols = st.multiselect("Select columns to combine for tokens", all_columns, default=['genres', 'tag', 'plot'])

# Build one whitespace-joined token string per movie from the chosen columns.
movies['tokens'] = movies.apply(lambda row: ' '.join(combine_tokens(row, selected_cols)), axis=1)
|
if st.button('Recommend'):
    # Echo the selected movie's raw feature columns for reference.
    st.dataframe(movies[(movies['title'] == selected_movie) & (movies['year'] == selected_year)][['title'] + selected_cols])

    if select_algo == 'Bag of Words':
        with st.spinner("Building Bag of Words model..."):
            vectorizer = CountVectorizer()
            bow_matrix = vectorizer.fit_transform(movies['tokens'])
            output = recommend_bow(selected_movie, selected_year, top_n=5)
        st.success("Bag of Words model ready ✅")

        # recommend_bow signals a missing (title, year) with a message
        # string; merging a string would crash, so surface it instead.
        if isinstance(output, str):
            st.warning(output)
        else:
            output_display = pd.merge(output, movies[['title', 'year'] + selected_cols], on=['title', 'year'], how='left').reset_index(drop=True)
            st.subheader("🎯 Top Recommendations (Bag of Words)")
            st.dataframe(output_display[['title', 'year', 'similarity_score'] + selected_cols])

    elif select_algo == 'TF-IDF':
        with st.spinner("Building TF-IDF model..."):
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(movies['tokens'])
            output = recommend_tfidf(selected_movie, selected_year, top_n=5)
        st.success("TF-IDF model ready ✅")

        # A string output means "Movie not found." — don't feed it to isin().
        if isinstance(output, str):
            st.warning(output)
        else:
            st.dataframe(movies[movies['title'].isin(output)][['title'] + selected_cols].reset_index(drop=True))

    elif select_algo == 'Word2Vec':
        with st.spinner("Training Word2Vec model..."):
            w2v_model, w2v_matrix = build_word2vec_vectors()
            output = recommend_w2v(selected_movie, selected_year, w2v_matrix, top_n=5)
        st.success("Word2Vec model ready ✅")

        # Same string-vs-list contract as the other recommenders.
        if isinstance(output, str):
            st.warning(output)
        else:
            st.dataframe(movies[movies['title'].isin(output)][['title'] + selected_cols].reset_index(drop=True))