"""Streamlit app: content-based movie recommendation.

Lets the user pick an algorithm (Bag of Words, TF-IDF, or Word2Vec), a
movie (title + year), and which text columns to combine into the token
corpus, then shows the top-N most similar movies by cosine similarity.
"""

import numpy as np
import pandas as pd
import streamlit as st
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("Content Based Filtering")
st.markdown('---')

# ======================== Load Data ==========================
# NOTE(review): assumes movies_final.csv has at least columns
# movieId, title, year, genres, tag, plot — confirm against the data file.
movies = pd.read_csv('movies_final.csv')
movies = movies.drop(columns=['movieId'])


# ======================== Helper Functions ====================
def combine_tokens(row, cols):
    """Merge the values of *cols* from *row* into one deduplicated list.

    List-valued cells are flattened; string cells are appended; anything
    else (e.g. NaN) is skipped.  Deduplication preserves first-seen order
    via dict.fromkeys — list(set(...)) would reorder nondeterministically
    across runs because of string-hash randomization, which would make the
    vectorizer/Word2Vec corpus unstable.
    """
    combined = []
    for col in cols:
        val = row[col]
        if isinstance(val, list):
            combined.extend(val)
        elif isinstance(val, str):
            combined.append(val)
    return list(dict.fromkeys(combined))


def _query_position(title, year):
    """Return the positional row index of the (title, year) movie, or None.

    Fixes a crash in the original code: it only checked that *title*
    existed somewhere, then did `.index[0]` on the (title & year) filter —
    an IndexError if the title exists under a different year.  Also
    converts the index *label* to a *position* explicitly, since the
    similarity matrices and `iloc` are positional.
    """
    mask = (movies['title'] == title) & (movies['year'] == year)
    labels = movies.index[mask]
    if len(labels) == 0:
        return None
    return movies.index.get_loc(labels[0])


def _top_similar(sim_scores, query_pos, top_n):
    """Positions of the *top_n* most similar rows, excluding the query itself.

    The query is masked out explicitly rather than assuming it always
    ranks first (ties with identical token sets can break that assumption).
    """
    order = sim_scores.argsort()[::-1]
    order = order[order != query_pos]
    return order[:top_n]


# ======================== BOW ================================
def recommend_bow(title, year, top_n=5):
    """Top-N recommendations via Bag of Words cosine similarity.

    Returns a DataFrame with columns title / year / similarity_score
    (highest similarity first), or the string "Movie not found." when
    the (title, year) pair is absent.  Reads the module-level
    `bow_matrix` built in the UI section.
    """
    pos = _query_position(title, year)
    if pos is None:
        return "Movie not found."

    sim_scores = cosine_similarity(bow_matrix[pos], bow_matrix).flatten()
    similar = _top_similar(sim_scores, pos, top_n)

    recommendations = pd.DataFrame({
        'title': movies.iloc[similar]['title'].values,
        'year': movies.iloc[similar]['year'].values,
        'similarity_score': sim_scores[similar],
    })
    # Sort for clarity (highest similarity first).
    return recommendations.sort_values(
        by='similarity_score', ascending=False
    ).reset_index(drop=True)


# ======================== TF-IDF ==============================
def recommend_tfidf(title, year, top_n=5):
    """Top-N recommended titles via TF-IDF cosine similarity.

    Returns a list of titles, or "Movie not found." when the
    (title, year) pair is absent.  Reads the module-level `tfidf_matrix`.
    """
    pos = _query_position(title, year)
    if pos is None:
        return "Movie not found."

    sim_scores = cosine_similarity(tfidf_matrix[pos], tfidf_matrix).flatten()
    similar = _top_similar(sim_scores, pos, top_n)
    return movies.iloc[similar]['title'].tolist()


# ======================== WORD2VEC ============================
def build_word2vec_vectors():
    """Train Word2Vec on the token corpus and return (model, dense matrix).

    Each movie is represented by the mean of its word vectors; movies
    whose tokens are all out-of-vocabulary get a zero vector.  Also
    stores the per-movie vector in movies['w2v_vector'] as a side effect.
    NOTE(review): training with workers=4 and no fixed seed is
    nondeterministic across runs.
    """
    tokenized_corpus = [text.split() for text in movies['tokens']]
    w2v_model = Word2Vec(
        sentences=tokenized_corpus,
        vector_size=100,
        window=5,
        min_count=1,
        workers=4,
    )

    def get_avg_vector(tokens):
        # Mean-pool the in-vocabulary word vectors; zeros for empty docs.
        vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
        if len(vectors) == 0:
            return np.zeros(w2v_model.vector_size)
        return np.mean(vectors, axis=0)

    movies['w2v_vector'] = [get_avg_vector(text.split()) for text in movies['tokens']]
    w2v_matrix = np.vstack(movies['w2v_vector'].values)
    return w2v_model, w2v_matrix


def recommend_w2v(title, year, w2v_matrix, top_n=5):
    """Top-N recommended titles via averaged-Word2Vec cosine similarity.

    Returns a list of titles, or "Movie not found." when the
    (title, year) pair is absent.
    """
    pos = _query_position(title, year)
    if pos is None:
        return "Movie not found."

    query_vec = w2v_matrix[pos].reshape(1, -1)
    sim_scores = cosine_similarity(query_vec, w2v_matrix).flatten()
    similar = _top_similar(sim_scores, pos, top_n)
    return movies.iloc[similar]['title'].tolist()


# ======================== UI ========================
st.header('🎬 Choose an Algorithm and Movie for Content-Based Filtering')

select_algo = st.selectbox('Select the algorithm', ['Bag of Words', 'TF-IDF', 'Word2Vec'])
selected_movie = st.selectbox('Select a movie', movies['title'].unique())
selected_year = st.selectbox(
    'Select the year', movies[movies['title'] == selected_movie]['year']
)

all_columns = movies.columns.tolist()
selected_cols = st.multiselect(
    "Select columns to combine for tokens",
    all_columns,
    default=['genres', 'tag', 'plot'],
)

# Combine the selected columns into a single whitespace-joined token string.
movies['tokens'] = movies.apply(lambda row: combine_tokens(row, selected_cols), axis=1)
movies['tokens'] = movies['tokens'].apply(lambda x: ' '.join(x))

if st.button('Recommend'):
    # Show the queried movie alongside the columns used for similarity.
    st.dataframe(
        movies[(movies['title'] == selected_movie) & (movies['year'] == selected_year)]
        [['title'] + selected_cols]
    )

    if select_algo == 'Bag of Words':
        with st.spinner("Building Bag of Words model..."):
            vectorizer = CountVectorizer()
            bow_matrix = vectorizer.fit_transform(movies['tokens'])
            output = recommend_bow(selected_movie, selected_year, top_n=5)
            st.success("Bag of Words model ready ✅")

        # Enrich the recommendations with the selected token columns.
        output_display = pd.merge(
            output,
            movies[['title', 'year'] + selected_cols],
            on=['title', 'year'],
            how='left',
        ).reset_index(drop=True)

        st.subheader("🎯 Top Recommendations (Bag of Words)")
        st.dataframe(output_display[['title', 'year', 'similarity_score'] + selected_cols])

    elif select_algo == 'TF-IDF':
        with st.spinner("Building TF-IDF model..."):
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(movies['tokens'])
            output = recommend_tfidf(selected_movie, selected_year, top_n=5)
            st.success("TF-IDF model ready ✅")

        st.dataframe(
            movies[movies['title'].isin(output)][['title'] + selected_cols]
            .reset_index(drop=True)
        )

    elif select_algo == 'Word2Vec':
        with st.spinner("Training Word2Vec model..."):
            w2v_model, w2v_matrix = build_word2vec_vectors()
            output = recommend_w2v(selected_movie, selected_year, w2v_matrix, top_n=5)
            st.success("Word2Vec model ready ✅")

        st.dataframe(
            movies[movies['title'].isin(output)][['title'] + selected_cols]
            .reset_index(drop=True)
        )