Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import pandas as pd | |
| import ast | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from nltk.stem.porter import PorterStemmer | |
| import gradio as gr | |
| import nltk | |
| import difflib | |
# NOTE(review): nothing visible in this file tokenizes with NLTK, so the
# 'punkt' resource may be unnecessary; the download is kept so start-up
# behaviour is unchanged.
nltk.download('punkt')

# Load data
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# NOTE(review): the join key is the title; if two films share a title this
# merge pairs every movie row with every credits row of that title, which
# yields duplicate rows. Confirm against the data set.
movies = movies.merge(credits, on='title')
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop incomplete rows, then renumber the index 0..n-1. The similarity matrix
# built later is addressed by row position, so index labels must not keep the
# gaps that dropna() leaves behind.
movies = movies.dropna().reset_index(drop=True)
# Process genres, keywords
def convert(obj):
    """Parse a stringified list of dicts and collect each entry's name.

    ``obj`` is evaluated with ``ast.literal_eval``; every element of the
    result is read through its ``'name'`` key. Spaces inside a name are
    removed so that a multi-word name becomes a single token.

    Returns:
        The list of space-free names, in their original order.
    """
    names = []
    for entry in ast.literal_eval(obj):
        names.append(entry['name'].replace(" ", ""))
    return names
# Replace the raw genre and keyword strings with their parsed name lists.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)
# Top 3 cast
def convert3(obj):
    """Parse a stringified list of dicts and keep the first three names.

    ``obj`` is evaluated with ``ast.literal_eval``; only the first three
    elements are used, each read through its ``'name'`` key with spaces
    removed.

    Returns:
        A list of at most three space-free names, in their original order.
    """
    parsed = ast.literal_eval(obj)
    top_names = []
    for member in parsed[:3]:
        top_names.append(member['name'].replace(" ", ""))
    return top_names
# Reduce each raw cast string to the names of its first three entries.
leading_cast = movies['cast'].apply(convert3)
movies['cast'] = leading_cast
# Director
def fetch_director(obj):
    """Return the first director found in a stringified crew list.

    ``obj`` is evaluated with ``ast.literal_eval``; entries are scanned in
    order and the first one whose ``'job'`` equals ``'Director'`` wins.

    Returns:
        A one-element list with that entry's name (spaces removed), or an
        empty list when no entry has the job ``'Director'``.
    """
    director_names = (
        [member['name'].replace(" ", "")]
        for member in ast.literal_eval(obj)
        if member['job'] == 'Director'
    )
    # next() stops at the first match; the default covers "no director".
    return next(director_names, [])
# Reduce each raw crew string to a list holding the director's name (or []).
movies['crew'] = movies['crew'].apply(fetch_director)

# Overview processing: split the free text into a list of words so it can be
# concatenated with the other list-valued columns below.
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Create tags: every column here holds a list per row, so "+" concatenates
# the lists row by row into a single token list.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Take an explicit copy: assigning into a column subset of ``movies`` would
# otherwise trigger pandas' SettingWithCopyWarning on the next line.
new_df = movies[['movie_id', 'title', 'tags']].copy()

# Collapse each token list into one lower-cased, space-separated string.
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower())
# Stemming
ps = PorterStemmer()


def stem(text):
    """Stem every whitespace-separated word of ``text`` with ``ps``.

    Returns:
        The stemmed words joined back together with single spaces.
    """
    return " ".join(map(ps.stem, text.split()))
# Stem every tag string so different forms of a word share one token.
stemmed_tags = new_df['tags'].apply(stem)
new_df['tags'] = stemmed_tags

# Vectorization: word counts over the tag strings, limited to 5000 features
# with English stop words removed.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(new_df['tags'])
vectors = tag_counts.toarray()

# Similarity: pairwise cosine similarity between the count vectors; row i
# corresponds to the i-th row of new_df.
similarity = cosine_similarity(vectors)
# Recommendation function
def recommend(movie):
    """Return the titles of the five movies most similar to ``movie``.

    The title is matched case-insensitively against ``new_df['title']``;
    when several rows share the title, the first one is used.

    Args:
        movie: Title to look up. Anything that is not a string (for example
            ``None`` when nothing was selected) is treated as "not found".

    Returns:
        A list of up to five titles ordered from most to least similar, or
        a one-element list with a "not found" message when no title matches.
    """
    not_found = ["Movie not found in database :("]
    if not isinstance(movie, str):
        return not_found

    # Work with row *positions*, not index labels: ``similarity`` is a plain
    # array whose rows follow the row order of ``new_df``, while the labels
    # of ``new_df`` may have gaps after rows were dropped.
    is_match = (new_df['title'].str.lower() == movie.lower()).to_numpy()
    positions = np.flatnonzero(is_match)
    if positions.size == 0:
        return not_found
    position = int(positions[0])

    scores = np.asarray(similarity[position])
    # A stable sort on the negated scores gives descending order while
    # keeping tied movies in their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the queried movie explicitly instead of assuming it sorts first;
    # another row with an equal score could otherwise take its place.
    top = ranked[ranked != position][:5]
    return new_df['title'].iloc[top].tolist()
# Titles offered in the dropdown, in the row order of new_df.
movie_list = new_df['title'].tolist()


# Gradio interface
def recommend_interface(movie_name):
    """Gradio callback: pass the selected title on to ``recommend``."""
    recommendations = recommend(movie_name)
    return recommendations


movie_dropdown = gr.Dropdown(movie_list, label="Select a movie...")
recommendation_list = gr.List(label="Top 5 Recommendations")
demo = gr.Interface(
    fn=recommend_interface,
    inputs=movie_dropdown,
    outputs=recommendation_list,
    title="Movie Recommender",
)

if __name__ == "__main__":
    demo.launch()