import pandas as pd import numpy as np from sklearn.metrics.pairwise import cosine_similarity from sklearn.model_selection import train_test_split from sklearn.feature_extraction.text import TfidfVectorizer import requests import gradio as gr import os ratings = pd.read_csv("ratings.csv") movies = pd.read_csv("movies.csv") OMDB_API_KEY = os.environ.get("omdbapikey") movie_lookup = movies.set_index("movieId")["title"].to_dict() reverse_movie_lookup = {v.lower(): k for k, v in movie_lookup.items()} train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42) train_matrix = train_df.pivot_table(index='userId', columns='movieId', values='rating') train_matrix_filled = train_matrix.fillna(0) user_similarity = cosine_similarity(train_matrix_filled) user_similarity_df = pd.DataFrame(user_similarity, index=train_matrix_filled.index, columns=train_matrix_filled.index) item_rating_matrix = train_matrix_filled.T item_similarity = cosine_similarity(item_rating_matrix) item_similarity_df = pd.DataFrame(item_similarity, index=item_rating_matrix.index, columns=item_rating_matrix.index) data = pd.merge(ratings, movies, on='movieId') data['genres'] = data['genres'].fillna('') vectorizer = TfidfVectorizer(token_pattern=r'[a-zA-Z0-9\-]+') tfidf_matrix = vectorizer.fit_transform(data['genres'].values) movie_ids = data['movieId'].values unique_movie_ids, indices = np.unique(movie_ids, return_index=True) movie_id_to_index = {mid: idx for idx, mid in enumerate(unique_movie_ids)} movie_genre_matrix = tfidf_matrix[indices] def get_movie_poster(title): if not OMDB_API_KEY: return '' try: response = requests.get(f"http://www.omdbapi.com/?t={title}&apikey={OMDB_API_KEY}") data = response.json() return data.get('Poster', '') except: return '' def user_cf_recommend(user_id): try: user_id = int(user_id) if user_id not in user_similarity_df.index: return "User ID not found." similar_users = user_similarity_df[user_id].drop(user_id) top_similar_users = similar_users.sort_values(ascending=False).head(10) scores = {} sim_sums = {} for other_user, similarity in top_similar_users.items(): other_ratings = train_matrix.loc[other_user].dropna() for movie_id, rating in other_ratings.items(): if movie_id not in train_matrix.loc[user_id] or pd.isna(train_matrix.loc[user_id, movie_id]): scores[movie_id] = scores.get(movie_id, 0) + similarity * rating sim_sums[movie_id] = sim_sums.get(movie_id, 0) + abs(similarity) ranked_movies = sorted([(movie_id, score / sim_sums[movie_id]) for movie_id, score in scores.items() if sim_sums[movie_id] > 0], key=lambda x: x[1], reverse=True)[:5] result = [] for movie_id, score in ranked_movies: title = movie_lookup.get(movie_id, 'Unknown') poster = get_movie_poster(title) result.append((title, round(score, 2), poster)) return result except: return "Invalid input." def item_cf_recommend(movie_title): movie_title = movie_title.lower().strip() if movie_title not in reverse_movie_lookup: return "Movie not found." target_movie_id = reverse_movie_lookup[movie_title] if target_movie_id not in item_similarity_df: return "No similarity data available." similar_scores = item_similarity_df[target_movie_id].drop(target_movie_id) top_similar_ids = similar_scores.sort_values(ascending=False).head(5).index result = [] for mid in top_similar_ids: title = movie_lookup.get(mid, 'Unknown') poster = get_movie_poster(title) result.append((title, poster)) return result def cb_recommend(movie_title): movie_title = movie_title.strip().lower() movies['title_lower'] = movies['title'].str.lower() if movie_title not in movies['title_lower'].values: return "Movie not found." input_index = movies[movies['title_lower'] == movie_title].index[0] movie_id = movies.loc[input_index, 'movieId'] if movie_id not in movie_id_to_index: return "No genre data available." input_vec = movie_genre_matrix[movie_id_to_index[movie_id]] sims = cosine_similarity(input_vec, movie_genre_matrix).flatten() sim_indices = sims.argsort()[::-1] seen = set() result = [] for i in sim_indices: rec_movie_id = unique_movie_ids[i] title = movies[movies['movieId'] == rec_movie_id]['title'].values[0] if title.lower() != movie_title and title not in seen: poster = get_movie_poster(title) result.append((title, poster)) seen.add(title) if len(result) == 5: break return result def format_recommendations(recommendations): if isinstance(recommendations, str): return recommendations formatted = [] for item in recommendations: if len(item) == 3: title, score, poster = item if poster: formatted.append(f"