Update app.py
Browse files
app.py
CHANGED
|
@@ -1,46 +1,82 @@
|
|
|
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
| 2 |
import kagglehub
|
| 3 |
-
import os
|
| 4 |
-
import random
|
| 5 |
-
from PIL import Image
|
| 6 |
|
| 7 |
# Step 1: Download the latest version of the dataset and get the path
|
| 8 |
-
path = kagglehub.
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
#
|
| 26 |
-
if
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
import streamlit as st
|
| 3 |
+
from surprise import Dataset, Reader, SVD
|
| 4 |
+
from surprise.model_selection import train_test_split
|
| 5 |
+
from collections import defaultdict
|
| 6 |
import kagglehub
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# Step 1: Download the latest version of the dataset and get the path.
# kagglehub exposes this as `dataset_download`; it returns the local directory
# that contains the downloaded dataset files.
path = kagglehub.dataset_download("ashpalsingh1525/imdb-movies-dataset")

# Step 2: Define the dataset folder path.
# Use the directory reported by kagglehub instead of a hard-coded cache
# location, which is only valid for one particular user and dataset version.
dataset_folder = path

# Step 3: Define the CSV file path (update if the filename is different)
dataset_path = f"{dataset_folder}/imdb_movies.csv"

# Load the dataset
df = pd.read_csv(dataset_path)

# Ensure all categorical columns are strings so the `.str` accessor and the
# equality filters used further down work uniformly.
categorical_columns = ['genre', 'orig_title', 'orig_lang', 'country', 'crew']

for col in categorical_columns:
    df[col] = df[col].astype(str)  # Convert to string explicitly

# If every genre value is purely numeric, replace it with a readable label.
if df['genre'].str.isnumeric().all():
    print("Warning: Genre column is numeric. Mapping needed.")
    genre_mapping = {i: f"Genre_{i}" for i in df['genre'].unique()}
    df['genre'] = df['genre'].map(genre_mapping)

# Prepare dataset for Surprise. The rating scale is taken from the observed
# minimum and maximum of the `score` column.
# NOTE(review): `Dataset.load_from_df` interprets the three columns, in order,
# as user id, item id and rating. Here that makes `orig_title` the "user" and
# `orig_lang` the "item" -- confirm this is the intended modelling.
reader = Reader(rating_scale=(df['score'].min(), df['score'].max()))
data = Dataset.load_from_df(df[['orig_title', 'orig_lang', 'score']], reader)

# Train collaborative filtering model (fixed seeds keep the split and the
# factorisation reproducible between app reruns).
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
model = SVD(n_factors=50, random_state=42)
model.fit(trainset)
|
| 40 |
+
|
| 41 |
+
# Function to get movie recommendations
|
| 42 |
+
def get_recommendations(selected_movies, genre, top_n=5):
    """Return the best-scoring titles of a genre, excluding the selected ones.

    Args:
        selected_movies: Titles the user already picked. They are only used
            to exclude those titles from the result; the prediction itself
            is always requested for the fixed user id ``'user'``.
        genre: Genre value compared for exact equality with ``df['genre']``.
        top_n: Maximum number of titles to return. Defaults to 5, which is
            the number the function returned before it was configurable.

    Returns:
        A list of at most ``top_n`` titles ordered by the model's estimated
        score, highest first. Titles with equal estimates keep the order in
        which they appear in ``df``. If ``selected_movies`` is empty, a
        one-element list holding a prompt message is returned instead.
    """
    if not selected_movies:
        return ["Please select at least one movie."]

    # Drop the user's own picks before predicting, so no estimate is computed
    # for a title that would be discarded afterwards.
    already_selected = set(selected_movies)
    genre_titles = df.loc[df['genre'] == genre, 'orig_title'].unique()
    candidates = [title for title in genre_titles if title not in already_selected]

    # Estimated score for every remaining candidate title.
    predicted = {
        title: model.predict(uid='user', iid=title).est
        for title in candidates
    }

    # sorted() is stable, so ties keep their dataset order exactly as when
    # the exclusion was applied after sorting.
    ranked = sorted(candidates, key=predicted.__getitem__, reverse=True)
    return ranked[:max(top_n, 0)]
|
| 64 |
+
|
| 65 |
+
# Streamlit UI: page title, genre picker, movie picker and result list.
st.title("🎬 Movie Recommendation System")

# Genre selection, offered in alphabetical order
genre_options = sorted(df['genre'].unique().tolist())
selected_genre = st.selectbox("Select a Genre", genre_options)

# Titles available for the chosen genre
genre_mask = df['genre'] == selected_genre
movies_in_genre = df.loc[genre_mask, 'orig_title'].unique().tolist()

# Movie selection (the widget itself caps the choice at three titles)
selected_movies = st.multiselect(
    "Select Up to 3 Movies",
    movies_in_genre,
    max_selections=3,
)

# Recommendations are computed only when the button is pressed
if st.button("Get Recommendations"):
    recommendations = get_recommendations(selected_movies, selected_genre)
    st.subheader("Recommended Movies:")
    for movie in recommendations:
        st.write(f"- {movie}")
|