"""Content-based movie recommender served through a Gradio interface.

The script reads two CSV files ('tmdb_5000_movies.csv' and
'tmdb_5000_credits.csv'), builds one bag-of-words "tags" string per movie
from its overview, genres, keywords, first three cast entries and director,
vectorizes those strings, and precomputes a cosine-similarity matrix.
`recommend` then looks up the five most similar movies for a given title.
"""

import ast
import difflib

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

# Load data
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')
# NOTE(review): joining on 'title' pairs up every combination of rows that
# share a title; confirm whether the data contains duplicate titles.
movies = movies.merge(credits, on='title')
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
# Rebind instead of dropna(inplace=True): the frame above is a column
# selection, and mutating it in place can trigger SettingWithCopyWarning.
movies = movies.dropna()


# Process genres, keywords
def convert(obj: str) -> list:
    """Parse a stringified list of dicts and return each 'name', spaces removed.

    Spaces are stripped so that a multi-word name becomes a single token.
    """
    return [entry['name'].replace(" ", "") for entry in ast.literal_eval(obj)]


movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)


# Top 3 cast
def convert3(obj: str) -> list:
    """Like `convert`, but keep only the first three entries of the list."""
    return [entry['name'].replace(" ", "") for entry in ast.literal_eval(obj)[:3]]


movies['cast'] = movies['cast'].apply(convert3)


# Director
def fetch_director(obj: str) -> list:
    """Return a one-element list with the first 'Director' name, or [] if none.

    The name has its spaces removed, matching `convert`.
    """
    for entry in ast.literal_eval(obj):
        if entry['job'] == 'Director':
            return [entry['name'].replace(" ", "")]
    return []


movies['crew'] = movies['crew'].apply(fetch_director)

# Overview processing
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# Create tags
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# reset_index(drop=True) returns a fresh frame (so the assignments below do
# not write into a slice of `movies`) and makes index labels equal to row
# positions, which is what the similarity matrix is addressed by.
new_df = movies[['movie_id', 'title', 'tags']].reset_index(drop=True)
new_df['tags'] = new_df['tags'].apply(lambda tokens: " ".join(tokens).lower())

# Stemming
ps = PorterStemmer()


def stem(text: str) -> str:
    """Return `text` with every whitespace-separated word Porter-stemmed."""
    return " ".join(ps.stem(word) for word in text.split())


new_df['tags'] = new_df['tags'].apply(stem)

# Vectorization
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

# Similarity: row i / column j is the cosine similarity between the tag
# vectors of the movies at positions i and j of `new_df`.
similarity = cosine_similarity(vectors)

_NOT_FOUND_MESSAGE = "Movie not found"

# Lower-cased title -> row position in `new_df` / `similarity`. Built once so
# `recommend` does not rescan the title column on every call. setdefault keeps
# the first occurrence when several rows share a title.
_title_to_position = {}
for _position, _title in enumerate(new_df['title'].str.lower()):
    _title_to_position.setdefault(_title, _position)


# Recommendation function
def recommend(movie):
    """Return the titles of the five movies most similar to `movie`.

    Args:
        movie: Movie title; matched case-insensitively against the dataset.

    Returns:
        A list of up to five titles ordered from most to least similar, or
        ``["Movie not found"]`` when `movie` is not a string (e.g. ``None``
        from an empty dropdown) or does not match any title.
    """
    if not isinstance(movie, str):
        return [_NOT_FOUND_MESSAGE]

    index = _title_to_position.get(movie.lower())
    if index is None:
        return [_NOT_FOUND_MESSAGE]

    distances = similarity[index]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda pair: pair[1])
    # Drop the queried movie by position rather than assuming it sorts first:
    # another row with similarity 1.0 could otherwise push it into the results.
    top_positions = [pos for pos, _ in ranked if pos != index][:5]
    return [new_df['title'].iloc[pos] for pos in top_positions]


movie_list = new_df['title'].tolist()


# Gradio interface
def recommend_interface(movie_name):
    """Gradio callback: forward the selected title to `recommend`."""
    return recommend(movie_name)


demo = gr.Interface(
    fn=recommend_interface,
    inputs=gr.Dropdown(movie_list, label="Select a movie..."),
    outputs=gr.List(label="Top 5 Recommendations"),
    title="Movie Recommender",
)

if __name__ == "__main__":
    demo.launch()