File size: 6,382 Bytes
ff74c4e
 
 
 
 
 
279fe6d
ff74c4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0aa905c
 
ff74c4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0aa905c
ff74c4e
 
 
 
 
 
024d3eb
ff74c4e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import os

import pandas as pd
import requests
import spacy
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob

# TMDB API key. Prefer the TMDB_API_KEY environment variable so the credential
# does not have to live in source control; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): the fallback key is committed in plain text - rotate it and
# remove the default once deployments set TMDB_API_KEY.
API_KEY = os.environ.get("TMDB_API_KEY") or "bbb69cf69be036e363d9ab8996f7f4ee"
BASE_URL = "https://api.themoviedb.org/3"

# TMDB image base URL; a poster path is appended to it to build the poster URL
IMAGE_BASE_URL = "https://image.tmdb.org/t/p/w500"

# Load the spaCy English NLP model
nlp = spacy.load("en_core_web_sm")


# ===========================
#  STEP 1: FETCH MOVIE DATA
# ===========================
def fetch_movies(num_pages=2):
    """Fetch popular movies from the TMDB discover endpoint.

    Args:
        num_pages: Number of result pages to request, starting at page 1.
            A value of 0 or less performs no request.

    Returns:
        A pandas DataFrame with the columns ``id``, ``title``, ``overview``,
        ``vote_average`` and ``release_date``, one row per movie. The columns
        are present even when no movie could be fetched, so callers can
        always index them.
    """
    columns = ["id", "title", "overview", "vote_average", "release_date"]
    all_movies = []

    for page in range(1, num_pages + 1):
        # Let requests build and encode the query string; the timeout keeps a
        # stalled connection from hanging the app indefinitely.
        response = requests.get(
            f"{BASE_URL}/discover/movie",
            params={
                "api_key": API_KEY,
                "language": "en-US",
                "sort_by": "popularity.desc",
                "page": page,
            },
            timeout=10,
        )
        data = response.json()

        # Payloads without a "results" list (e.g. an error body) are skipped.
        results = data.get("results", []) if isinstance(data, dict) else []
        for movie in results:
            all_movies.append({
                "id": movie["id"],
                "title": movie["title"],
                # Optional fields: tolerate a missing or null value.
                "overview": movie.get("overview") or "",
                "vote_average": movie.get("vote_average"),
                "release_date": movie.get("release_date") or "",
            })

    return pd.DataFrame(all_movies, columns=columns)


# ===========================
#  STEP 2: FETCH ADDITIONAL DETAILS (GENRES, CAST, DIRECTOR)
# ===========================
def fetch_genres():
    """Retrieve the movie genre list from the TMDB API.

    Returns:
        A dict mapping each genre ID to its name. The dict is empty when the
        response carries no ``genres`` list (e.g. an error payload).
    """
    response = requests.get(
        f"{BASE_URL}/genre/movie/list",
        params={"api_key": API_KEY, "language": "en-US"},
        timeout=10,  # avoid blocking forever on a stalled connection
    )
    data = response.json()

    return {genre["id"]: genre["name"] for genre in data.get("genres", [])}


def fetch_movie_details(movie_id):
    """Fetch the top 3 cast members and the director for a given movie.

    Args:
        movie_id: TMDB identifier of the movie.

    Returns:
        A ``(cast, director)`` tuple. ``cast`` is a comma-separated string of
        up to three names taken from the start of the credits' ``cast`` list
        (empty when there is none); ``director`` is the name of the first crew
        entry whose job is ``"Director"``, or ``"Unknown"`` when none exists.
    """
    # Issue the credits request exactly once and reuse the response.
    response = requests.get(
        f"{BASE_URL}/movie/{movie_id}/credits",
        params={"api_key": API_KEY},
        timeout=10,
    )
    data = response.json()

    # Get top 3 cast members
    cast = ", ".join(member["name"] for member in data.get("cast", [])[:3])

    # Get director; crew entries without a "job" field are simply skipped
    director = next(
        (crew["name"] for crew in data.get("crew", []) if crew.get("job") == "Director"),
        "Unknown",
    )

    return cast, director


# ===========================
#  STEP 3: ENRICH MOVIE DATA WITH CAST, DIRECTOR
# ===========================
def enhance_movie_data(movies_df):
    """Add top cast and director information to the movie dataset.

    Args:
        movies_df: DataFrame of movies. When it is non-empty it must have an
            ``id`` column holding TMDB movie IDs.

    Returns:
        The same DataFrame, modified in place, with two added columns:
        ``cast`` and ``director`` (see ``fetch_movie_details``). An empty
        input gets the two columns added, empty, without any network request.
    """
    if movies_df.empty:
        # Nothing to look up; still provide the columns callers expect.
        movies_df["cast"] = pd.Series(dtype="object")
        movies_df["director"] = pd.Series(dtype="object")
        return movies_df

    # One credits lookup per movie, in row order.
    details = [fetch_movie_details(movie_id) for movie_id in movies_df["id"]]
    movies_df["cast"] = [cast for cast, _ in details]
    movies_df["director"] = [director for _, director in details]
    return movies_df


# ===========================
#  STEP 4: FEATURE ENGINEERING (KEYWORDS & SENTIMENT)
# ===========================
def extract_keywords(text, num_keywords=5):
    """Extract top keywords from text using TF-IDF.

    The text is vectorised on its own (a one-document corpus) with English
    stop words removed and at most 50 features kept.

    Args:
        text: The text to analyse.
        num_keywords: Maximum number of keywords to return.

    Returns:
        A comma-separated string of up to ``num_keywords`` terms, ordered from
        lowest to highest score. An empty string is returned when
        ``num_keywords`` is not positive, or when the text is empty or
        contains nothing but stop words / non-word characters.
    """
    # A slice of [-0:] would return every term, so handle 0 explicitly; blank
    # text has no vocabulary to extract.
    if num_keywords <= 0 or not text or not text.strip():
        return ""

    vectorizer = TfidfVectorizer(stop_words="english", max_features=50)
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when
        # the text consists solely of stop words.
        return ""
    feature_names = vectorizer.get_feature_names_out()

    sorted_indices = tfidf_matrix.toarray().argsort()[0][-num_keywords:]
    return ", ".join(feature_names[i] for i in sorted_indices)


def get_sentiment(text):
    """Return the TextBlob sentiment polarity (-1 to 1) of a movie description."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity


# ===========================
#  STEP 5: BUILD RECOMMENDER SYSTEM (CONTENT-BASED FILTERING)
# ===========================
def recommend_movies(movie_title, num_recommendations=5, movies=None, similarity=None):
    """Recommend similar movies based on content similarity.

    Args:
        movie_title: Title of the movie to find neighbours for. When several
            rows share the title, the first one is used.
        num_recommendations: Maximum number of titles to return.
        movies: Optional DataFrame with a ``title`` column. Defaults to the
            module-level ``movies_df``.
        similarity: Optional square similarity matrix whose row/column ``i``
            corresponds to row position ``i`` of ``movies``. Defaults to the
            module-level ``cosine_sim``.

    Returns:
        A list of recommended titles, most similar first, or the string
        ``"Movie not found in dataset!"`` when the title is unknown.
    """
    if movies is None:
        movies = movies_df
    if similarity is None:
        similarity = cosine_sim

    titles = movies["title"].tolist()
    try:
        # Positional index, so it lines up with the similarity matrix rows
        # regardless of the DataFrame's index labels.
        movie_index = titles.index(movie_title)
    except ValueError:
        return "Movie not found in dataset!"

    # Exclude the selected movie explicitly rather than assuming it sorts
    # first: another movie with an equal top score could otherwise push it
    # into the recommendations.
    scored = [
        (position, score)
        for position, score in enumerate(similarity[movie_index])
        if position != movie_index
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    limit = max(num_recommendations, 0)
    return [titles[position] for position, _ in scored[:limit]]


# ===========================
#  STEP 6: FETCH MOVIE POSTER
# ===========================
def get_movie_poster(movie_title):
    """Fetch the poster URL of a movie from the TMDB API.

    Args:
        movie_title: Title to look up in the module-level ``movies_df``; the
            first matching row supplies the TMDB movie ID.

    Returns:
        The full poster URL (``IMAGE_BASE_URL`` + the movie's ``poster_path``),
        or ``None`` when the title is not in the dataset or the movie details
        carry no poster path.
    """
    movie = movies_df[movies_df["title"] == movie_title]
    if movie.empty:
        return None

    movie_id = movie.iloc[0]["id"]
    response = requests.get(
        f"{BASE_URL}/movie/{movie_id}",
        params={"api_key": API_KEY},
        timeout=10,
    )
    # "poster_path" may be absent or null; concatenating it blindly would
    # either raise TypeError or yield a URL that points at no image.
    poster_path = response.json().get("poster_path")
    if not poster_path:
        return None
    return IMAGE_BASE_URL + poster_path


# ===========================
#  STEP 7: LOAD & PROCESS MOVIE DATA
# ===========================
@st.cache_data(ttl=3600)
def _load_movie_data(num_pages=2):
    """Fetch the movie dataset and build the TF-IDF similarity model.

    Streamlit re-executes the whole script on every widget interaction; the
    cache keeps the TMDB requests and the vectorisation from being repeated on
    each rerun (entries expire after one hour).

    Args:
        num_pages: Number of TMDB result pages to fetch.

    Returns:
        A tuple ``(movies, vectorizer, matrix, similarity)``: the enriched
        DataFrame, the fitted ``TfidfVectorizer``, the TF-IDF matrix of the
        combined text features, and the pairwise cosine similarity matrix.

    Raises:
        RuntimeError: If no movie could be fetched.
    """
    movies = fetch_movies(num_pages=num_pages)  # Fetch movie data
    if movies.empty:
        # Fail with a clear message instead of an obscure downstream error.
        raise RuntimeError(
            "No movies could be fetched from TMDB; check the API key and network connection."
        )
    movies = enhance_movie_data(movies)  # Add cast, director info

    # Apply feature extraction
    movies["keywords"] = movies["overview"].apply(lambda x: extract_keywords(str(x)))
    movies["sentiment"] = movies["overview"].apply(lambda x: get_sentiment(str(x)))

    # Combine relevant text features for recommendation
    movies["combined_features"] = (
        movies["overview"].fillna("") + " " +
        movies["keywords"].fillna("")
    )

    # Convert text into numerical vectors using TF-IDF
    vectorizer = TfidfVectorizer(stop_words="english")
    matrix = vectorizer.fit_transform(movies["combined_features"])

    # Compute similarity scores between movies
    similarity = cosine_similarity(matrix, matrix)
    return movies, vectorizer, matrix, similarity


movies_df, tfidf_vectorizer, tfidf_matrix, cosine_sim = _load_movie_data(num_pages=2)

# ===========================
#  STEP 8: STREAMLIT APP UI
# ===========================
st.title("🎬 Movie Recommendation System")

# Dropdown to select a movie
selected_movie = st.selectbox("Select a Movie", movies_df["title"].values)

# Recommend button
if st.button("Recommend"):
    recommendations = recommend_movies(selected_movie)

    # recommend_movies returns a message string when the title is unknown.
    # An empty list must also take the error path: st.columns() cannot be
    # called with zero columns.
    if isinstance(recommendations, list) and recommendations:
        st.subheader(f"Movies similar to {selected_movie}:")

        # Display recommended movies in a horizontal layout
        cols = st.columns(len(recommendations))

        for col, movie in zip(cols, recommendations):
            poster_url = get_movie_poster(movie)
            with col:
                if poster_url:
                    st.image(poster_url, width=150)
                st.write(f"**{movie}**")
    else:
        st.error("No recommendations found.")