File size: 5,691 Bytes
01ce151
 
 
 
7dde2e4
 
 
 
 
 
 
01ce151
7dde2e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01ce151
7dde2e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01ce151
7dde2e4
 
 
 
 
 
 
 
 
 
01ce151
7dde2e4
 
 
 
 
 
 
 
 
01ce151
7dde2e4
 
01ce151
7dde2e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01ce151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dde2e4
01ce151
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# === Install Required Libraries === #
# !pip install pandas numpy sentence-transformers faiss-cpu streamlit

# === Import Required Libraries === #
import os
import subprocess

import faiss
import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer

# === Dataset Loading Function === #
def load_dataset():
    """
    Load the movie dataset from one of three user-selected sources.

    Renders a Streamlit radio selector offering:
      * Manual Upload - a CSV uploaded through the browser.
      * Download from Kaggle - fetches a dataset with the Kaggle CLI using
        the API-key JSON the user pastes in.
      * Specify Local Path - reads a CSV from a path on the server.

    Returns:
        pandas.DataFrame | None: the loaded dataset, or None if nothing
        has been provided/loaded yet (Streamlit re-runs the script on
        every interaction, so returning None is the normal idle state).
    """
    st.write("### Dataset Upload Options")
    upload_option = st.radio(
        "Choose how to provide the dataset:",
        ("Manual Upload", "Download from Kaggle", "Specify Local Path")
    )

    # Manual Upload
    if upload_option == "Manual Upload":
        st.write("#### Upload the file below:")
        uploaded_file = st.file_uploader("Upload your CSV file", type="csv")
        if uploaded_file is not None:
            st.success("File uploaded successfully!")
            return pd.read_csv(uploaded_file)

    # Kaggle Download
    elif upload_option == "Download from Kaggle":
        st.write("#### Enter your Kaggle Dataset Path and API Key")
        kaggle_dataset = st.text_input("Kaggle Dataset Path (e.g., `thedevastator/hydra-movies-dataset-directors-writers-cast-and`):")
        kaggle_api_key = st.text_area("Enter your Kaggle API Key JSON content:")
        if st.button("Download Dataset"):
            if kaggle_dataset and kaggle_api_key:
                # Write the credentials where the Kaggle CLI expects them
                # (~/.kaggle/kaggle.json), readable only by the owner.
                kaggle_dir = os.path.expanduser("~/.kaggle")
                os.makedirs(kaggle_dir, exist_ok=True)
                cred_path = os.path.join(kaggle_dir, "kaggle.json")
                with open(cred_path, "w") as f:
                    f.write(kaggle_api_key)
                os.chmod(cred_path, 0o600)

                # BUGFIX: the original called os.system("!kaggle ...") -- the
                # leading "!" is IPython/notebook syntax, not shell, so the
                # command never executed. Use subprocess.run with an argument
                # list, which also avoids shell injection from user input.
                subprocess.run(
                    ["kaggle", "datasets", "download", "-d", kaggle_dataset, "--unzip"],
                    check=False,
                )
                # NOTE(review): assumes the unzipped archive contains a CSV
                # named after the dataset slug -- TODO confirm for other datasets.
                dataset_name = kaggle_dataset.split("/")[-1] + ".csv"
                if os.path.exists(dataset_name):
                    st.success(f"Dataset {dataset_name} downloaded successfully!")
                    return pd.read_csv(dataset_name)
                else:
                    st.error("Failed to download dataset. Please check your inputs.")
            else:
                st.warning("Please provide both the dataset path and your API key.")

    # Specify Local Path
    elif upload_option == "Specify Local Path":
        local_path = st.text_input("Specify the full local path of your CSV file:")
        if st.button("Load Dataset"):
            if os.path.exists(local_path):
                st.success("Dataset loaded successfully from the specified path!")
                return pd.read_csv(local_path)
            else:
                st.error("File not found. Please check the path and try again.")

    # Nothing selected / nothing loaded yet.
    return None

# === Preprocess Data === #
def preprocess_data(df):
    """
    Normalize column names and build the 'text' field used for embeddings.

    Column names are stripped and lower-cased. Every column the embedding
    text needs ('title', 'summary', 'genres', 'cast') is created with a
    placeholder if absent, so the concatenation below cannot raise a
    KeyError on datasets with different schemas.

    Args:
        df (pandas.DataFrame): raw dataset (modified in place).

    Returns:
        pandas.DataFrame: the same frame with normalized column names and
        a new 'text' column.
    """
    # Normalize column names
    df.columns = df.columns.str.strip().str.lower()

    # Ensure every column used below exists. 'genres' keeps the original
    # "Unknown" placeholder; the others default to an empty string.
    if 'genres' not in df.columns:
        print("Warning: 'genres' column missing! Adding a placeholder.")
        df['genres'] = "Unknown"
    for col in ('title', 'summary', 'cast'):
        if col not in df.columns:
            print(f"Warning: '{col}' column missing! Adding a placeholder.")
            df[col] = ""

    # BUGFIX: the original concatenated df['genres'] without fillna(); a
    # single NaN genre propagates through string concatenation and turns
    # the entire 'text' entry into NaN, which breaks model.encode().
    df['text'] = (
        df['title'].fillna('') + " "
        + df['summary'].fillna('') + " "
        + df['genres'].fillna('') + " "
        + df['cast'].fillna('')
    )

    return df

# === Create Embeddings and FAISS Index === #
def create_faiss_index(df, model):
    """
    Encode each row's 'text' with a sentence-transformer and build an
    exact (flat) L2 FAISS index over the embeddings.

    Args:
        df (pandas.DataFrame): dataset with a 'text' column.
        model: a SentenceTransformer (any object exposing .encode()).

    Returns:
        faiss.IndexFlatL2: index holding one vector per dataframe row,
        in row order.
    """
    embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)
    # FAISS only accepts C-contiguous float32 arrays; encode() usually
    # returns float32 already, but coerce defensively so a model that
    # returns float64 (or a list) does not crash index.add().
    embeddings = np.ascontiguousarray(np.asarray(embeddings, dtype=np.float32))
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

# === Define Retrieval Function === #
def retrieve(query, model, index, df, top_k=5):
    """
    Return the rows most similar to `query` using the FAISS index.

    Args:
        query (str): free-text search query.
        model: sentence-transformer used to embed the query.
        index: FAISS index built over the same dataframe (row-aligned).
        df (pandas.DataFrame): source rows, positionally aligned with
            the vectors in `index`.
        top_k (int): maximum number of results to return.

    Returns:
        list[dict]: matching rows as records, best match first.
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    # BUGFIX: FAISS pads the result with -1 when the index holds fewer
    # than top_k vectors; df.iloc[-1] would then silently return the
    # *last* row as a bogus match, so drop the padding entries.
    hits = [i for i in indices[0] if i >= 0]
    return df.iloc[hits].to_dict(orient="records")

# === Main Function === #
if __name__ == "__main__":
    # Streamlit page setup
    st.title("Movie Recommendation Application with FAISS and Sentence-Transformers")

    # Step 1: Load dataset (None until the user has provided one)
    df = load_dataset()

    if df is not None:
        st.write("### Preview of Loaded Dataset")
        st.dataframe(df.head())

        # Step 2: Preprocess data (normalizes columns, builds 'text')
        df = preprocess_data(df)

        # Step 3: Create embeddings and FAISS index
        st.write("### Creating Embeddings and Index...")
        model = SentenceTransformer('all-MiniLM-L6-v2')
        index = create_faiss_index(df, model)

        # Step 4: Query Input and Result Display
        query = st.text_input("Enter a movie name or keyword for recommendations:")

        if query:
            st.write("### Query Results")
            results = retrieve(query, model, index, df)
            response = ""
            for i, res in enumerate(results):
                # BUGFIX: the original indexed res['year'], res['short summary'],
                # res['director'] and res['rating'] directly and crashed with a
                # KeyError on datasets lacking those columns (preprocessing only
                # guarantees 'genres' and 'text'). Fall back to 'N/A', and accept
                # either 'short summary' or 'summary' for the synopsis column.
                response += f"**{i+1}. {res.get('title', 'N/A')} ({res.get('year', 'N/A')})**\n"
                response += f"- **Genres**: {res.get('genres', 'N/A')}\n"
                response += f"- **Summary**: {res.get('short summary', res.get('summary', 'N/A'))}\n"
                response += f"- **Director**: {res.get('director', 'N/A')}\n"
                response += f"- **Cast**: {res.get('cast', 'N/A')}\n"
                response += f"- **Rating**: {res.get('rating', 'N/A')}\n\n"
            st.write(response)

    else:
        st.write("### Please load the dataset to proceed.")