import requests
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import streamlit as st

# 🔹 TMDB API configuration.
# NOTE(review): the API key is hard-coded in source; move it to an environment
# variable or st.secrets before publishing this file.
API_KEY = "bbb69cf69be036e363d9ab8996f7f4ee"
BASE_URL = "https://api.themoviedb.org/3"

# 🔹 TMDB Image Base URL for posters
IMAGE_BASE_URL = "https://image.tmdb.org/t/p/w500"

# Timeout (seconds) applied to every TMDB request so a stalled call
# cannot hang the Streamlit app indefinitely.
REQUEST_TIMEOUT = 10

# Load Spacy English NLP model.
# NOTE(review): `nlp` is never used by the pipeline below; kept in case other
# code (or a future feature) depends on it.
nlp = spacy.load("en_core_web_sm")


# ===========================
# STEP 1: FETCH MOVIE DATA
# ===========================
def fetch_movies(num_pages=2):
    """Fetch popular movies from the TMDB discover endpoint.

    Args:
        num_pages: Number of result pages to pull (TMDB returns 20 per page).

    Returns:
        DataFrame with columns: id, title, overview, vote_average,
        release_date, genre_ids.
    """
    all_movies = []
    for page in range(1, num_pages + 1):
        url = (
            f"{BASE_URL}/discover/movie?api_key={API_KEY}"
            f"&language=en-US&sort_by=popularity.desc&page={page}"
        )
        data = requests.get(url, timeout=REQUEST_TIMEOUT).json()
        # Use .get() for every field: TMDB omits some fields for some
        # titles, and direct indexing raised KeyError in the original.
        for movie in data.get("results", []):
            all_movies.append({
                "id": movie["id"],
                "title": movie.get("title", ""),
                "overview": movie.get("overview", ""),
                "vote_average": movie.get("vote_average"),
                "release_date": movie.get("release_date", ""),
                # Kept so enhance_movie_data can map genre ids to names.
                "genre_ids": movie.get("genre_ids", []),
            })
    return pd.DataFrame(all_movies)


# ===========================
# STEP 2: FETCH ADDITIONAL DETAILS (GENRES, CAST, DIRECTOR)
# ===========================
def fetch_genres():
    """Return a dict mapping TMDB genre IDs to genre names."""
    url = f"{BASE_URL}/genre/movie/list?api_key={API_KEY}&language=en-US"
    data = requests.get(url, timeout=REQUEST_TIMEOUT).json()
    return {genre["id"]: genre["name"] for genre in data.get("genres", [])}


def fetch_movie_details(movie_id):
    """Return (top-3 cast string, director name) for a movie.

    BUG FIX: the original fetched the credits URL twice (the first response
    was discarded and the request repeated); only one request is made now.
    """
    url = f"{BASE_URL}/movie/{movie_id}/credits?api_key={API_KEY}"
    data = requests.get(url, timeout=REQUEST_TIMEOUT).json()

    # Top 3 billed cast members, comma-separated.
    cast = ", ".join(member["name"] for member in data.get("cast", [])[:3])

    # First crew entry whose job is "Director", or "Unknown".
    director = next(
        (crew["name"] for crew in data.get("crew", [])
         if crew.get("job") == "Director"),
        "Unknown",
    )
    return cast, director


# ===========================
# STEP 3: ENRICH MOVIE DATA WITH GENRES, CAST, DIRECTOR
# ===========================
def enhance_movie_data(movies_df):
    """Add genres, top cast, and director columns to the movie dataset.

    BUG FIX: the original fetched the genre mapping but never applied it,
    so the promised genre information was silently dropped. Genre names
    are now derived from each movie's genre_ids.
    """
    genre_dict = fetch_genres()
    if "genre_ids" in movies_df.columns:
        movies_df["genres"] = movies_df["genre_ids"].apply(
            lambda ids: ", ".join(
                genre_dict[g] for g in ids if g in genre_dict
            )
        )
    # One credits request per movie; zip(*) splits (cast, director) pairs
    # into two columns.
    movies_df["cast"], movies_df["director"] = zip(
        *movies_df["id"].apply(fetch_movie_details)
    )
    return movies_df


# ===========================
# STEP 4: FEATURE ENGINEERING (KEYWORDS & SENTIMENT)
# ===========================
def extract_keywords(text, num_keywords=5):
    """Extract up to *num_keywords* salient terms from *text* via TF-IDF.

    NOTE: with a single document the IDF term is constant, so this
    effectively ranks terms by frequency.

    Returns a comma-separated keyword string ("" if no usable terms).
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=50)
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # Empty or all-stopword text yields an empty vocabulary and
        # raised ValueError in the original; return no keywords instead.
        return ""
    feature_names = vectorizer.get_feature_names_out()
    sorted_indices = tfidf_matrix.toarray().argsort()[0][-num_keywords:]
    return ", ".join(feature_names[i] for i in sorted_indices)


def get_sentiment(text):
    """Return TextBlob sentiment polarity in [-1, 1] for *text*."""
    return TextBlob(text).sentiment.polarity


# ===========================
# STEP 5: BUILD RECOMMENDER SYSTEM (CONTENT-BASED FILTERING)
# ===========================
def recommend_movies(movie_title, num_recommendations=5):
    """Recommend similar movies based on content (cosine) similarity.

    Returns a list of titles, or an error string when the title is not in
    the dataset (the UI distinguishes the two with isinstance).
    """
    if movie_title not in movies_df["title"].values:
        return "Movie not found in dataset!"

    # Index of the selected movie in the DataFrame / similarity matrix.
    movie_index = movies_df[movies_df["title"] == movie_title].index[0]

    # Rank all movies by similarity; skip index 0 of the sorted list
    # because the movie is always most similar to itself.
    similarity_scores = list(enumerate(cosine_sim[movie_index]))
    similarity_scores = sorted(
        similarity_scores, key=lambda x: x[1], reverse=True
    )[1:num_recommendations + 1]

    return [movies_df.iloc[i[0]]["title"] for i in similarity_scores]


# ===========================
# STEP 6: FETCH MOVIE POSTER
# ===========================
def get_movie_poster(movie_title):
    """Return the full poster URL for *movie_title*, or None if unavailable.

    BUG FIX: TMDB returns poster_path as an explicit null for some titles;
    the original's `response.get("poster_path", "")` then yielded None and
    `IMAGE_BASE_URL + None` raised TypeError. Falsy paths now return None.
    """
    movie = movies_df[movies_df["title"] == movie_title]
    if movie.empty:
        return None
    movie_id = movie.iloc[0]["id"]
    url = f"{BASE_URL}/movie/{movie_id}?api_key={API_KEY}"
    response = requests.get(url, timeout=REQUEST_TIMEOUT).json()
    poster_path = response.get("poster_path")
    return IMAGE_BASE_URL + poster_path if poster_path else None


# ===========================
# STEP 7: LOAD & PROCESS MOVIE DATA
# ===========================
movies_df = fetch_movies(num_pages=2)        # Fetch movie data
movies_df = enhance_movie_data(movies_df)    # Add genres, cast, director info

# Apply feature extraction (str() guards against NaN overviews).
movies_df["keywords"] = movies_df["overview"].apply(
    lambda x: extract_keywords(str(x))
)
movies_df["sentiment"] = movies_df["overview"].apply(
    lambda x: get_sentiment(str(x))
)

# Combine relevant text features for recommendation.
movies_df["combined_features"] = (
    movies_df["overview"].fillna("") + " " + movies_df["keywords"].fillna("")
)

# Convert text into numerical vectors using TF-IDF.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df["combined_features"])

# Pairwise similarity matrix used by recommend_movies.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


# ===========================
# STEP 8: STREAMLIT APP UI
# ===========================
st.title("🎬 Movie Recommendation System")

# Dropdown to select a movie.
selected_movie = st.selectbox("Select a Movie", movies_df["title"].values)

# Recommend button.
if st.button("Recommend"):
    recommendations = recommend_movies(selected_movie)

    # Guard against an empty list as well: st.columns(0) raises.
    if isinstance(recommendations, list) and recommendations:
        st.subheader(f"Movies similar to {selected_movie}:")

        # Display recommended movies in a horizontal layout.
        cols = st.columns(len(recommendations))
        for col, movie in zip(cols, recommendations):
            poster_url = get_movie_poster(movie)
            with col:
                if poster_url:
                    st.image(poster_url, width=150)
                st.write(f"**{movie}**")
    else:
        st.error("No recommendations found.")