import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import os
import logging
import re

# Setup logging
logger = logging.getLogger(__name__)

def normalize_title(title):
    """Convert title to lowercase, strip spaces, and normalize '&' for consistency."""
    title = title.lower().strip()
    title = re.sub(r"[^\w\s&]", "", title)  # Keep '&' but remove other special characters
    title = re.sub(r"\s+", " ", title)  # Replace multiple spaces with a single space
    title = title.replace("&", "and")  # Normalize '&' to 'and'
    return title
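
# Illustrative examples of the normalization above (inputs are assumptions,
# shown only for clarity):
#   normalize_title("Carole & Tuesday")  -> "carole and tuesday"
#   normalize_title("  Money  Heist! ")  -> "money heist"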

def load_and_preprocess_data(csv_file="netflix_titles.csv"):
    """Loads Netflix dataset, cleans, and prepares it for TF-IDF."""
    try:
        df = pd.read_csv(csv_file)
        logger.info(f"Loaded dataset from {csv_file} with {len(df)} rows.")
        
        # Drop duplicate titles and reset the index so row positions stay
        # aligned with the TF-IDF matrix rows built from this DataFrame
        df = df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)
        
        # Fill missing text fields with 'unknown'
        text_cols = ['director', 'cast', 'country', 'listed_in', 'description']
        for col in text_cols:
            df[col] = df[col].fillna('unknown').astype(str).str.lower()
        
        # Combine text features for recommendations
        df['combined_features'] = (
            df['director'] + ' ' +
            df['cast'] + ' ' +
            df['listed_in'] + ' ' +
            df['description']
        )

        return df
    
    except FileNotFoundError:
        logger.error(f"Dataset file '{csv_file}' not found.")
        raise
    except Exception as e:
        logger.error(f"Error loading data from {csv_file}: {str(e)}")
        raise RuntimeError(f"Error loading data: {str(e)}") from e

def build_or_load_model(df, cache_file="/tmp/cosine_sim_cache.pkl"):
    """Builds or loads TF-IDF matrix and cosine similarity, with caching."""
    if os.path.exists(cache_file):
        try:
            tfidf_matrix, cosine_sim_matrix, title_to_index = joblib.load(cache_file)
            logger.info(f"Loaded cached model from {cache_file}.")
            return tfidf_matrix, cosine_sim_matrix, title_to_index
        except Exception as e:
            logger.warning(f"Failed to load cache from {cache_file}: {str(e)}. Rebuilding model.")
    
    # Build model if cache doesn’t exist or fails
    try:
        tfidf = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),  # Capture word pairs (bigrams) for better similarity
            min_df=2  # Ignore rare words appearing in only 1 document
        )
        tfidf_matrix = tfidf.fit_transform(df['combined_features'])
        
        # Ensure matrix is valid
        if tfidf_matrix.shape[0] == 0 or tfidf_matrix.shape[1] == 0:
            raise ValueError("TF-IDF matrix is empty! Check feature extraction.")

        # Compute cosine similarity; dense_output=False keeps the result as a
        # SciPy sparse matrix instead of materializing a dense n x n array first
        cosine_sim_matrix = cosine_similarity(tfidf_matrix, dense_output=False)

        # Map normalized titles -> index
        df["normalized_title"] = df["title"].apply(normalize_title)
        title_to_index = pd.Series(df.index, index=df["normalized_title"]).drop_duplicates()

        # Debug-level logging so normal runs stay quiet
        logger.debug(f"Sample normalized titles in title_to_index: {list(title_to_index.index[:20])}")
        
        # Cache the results
        joblib.dump((tfidf_matrix, cosine_sim_matrix, title_to_index), cache_file)
        logger.info(f"Built and cached model to {cache_file}.")
        
        return tfidf_matrix, cosine_sim_matrix, title_to_index
    
    except Exception as e:
        logger.error(f"Error building model: {str(e)}")
        raise
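
# Note: the cache is keyed only by its file path, so a stale cache will be
# loaded even after the source CSV changes; delete the cache file (default:
# /tmp/cosine_sim_cache.pkl) to force a rebuild when the dataset is updated.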

def get_recommendations(title, df, title_to_index, cosine_sim_matrix, top_n=10, content_type=None, fields=None):
    """Returns a list of recommendation dictionaries based on cosine similarity."""
    
    if df is None or title_to_index is None or cosine_sim_matrix is None:
        logger.error("One or more critical components (df, title_to_index, cosine_sim_matrix) are None!")
        raise ValueError("DataFrame, title_to_index, and cosine_sim_matrix must not be None.")

    if not isinstance(top_n, int) or top_n <= 0:
        raise ValueError("top_n must be a positive integer.")

    if not isinstance(title, str) or not title.strip():
        raise ValueError("Title must be a non-empty string.")

    # Normalize title for lookup
    title = normalize_title(title)

    # Ensure title exists
    if title not in title_to_index:
        logger.warning(f"'{title}' NOT found in title_to_index!")
        return []

    idx = title_to_index[title]
    
    # Get similarity scores
    try:
        sim_scores = list(enumerate(cosine_sim_matrix[idx].toarray()[0]))
    except Exception as e:
        logger.error(f"Error computing similarity scores for '{title}': {str(e)}")
        return []

    logger.info(f"Raw similarity scores for '{title}': {sim_scores[:10]}")

    # Sort by similarity and drop the input title itself (matched by index,
    # which is safer than assuming it always sorts first)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [(i, score) for i, score in sim_scores if i != idx]

    logger.debug(f"Top similarity scores for '{title}': {sim_scores[:10]}")

    # If all similarity scores are 0, issue a warning
    if all(score[1] == 0 for score in sim_scores):
        logger.warning(f"⚠️ All similarity scores for '{title}' are 0! No recommendations possible.")
        return []

    # Build recommendations list
    recommendations = []
    for movie_idx, score in sim_scores:
        if content_type and df['type'].iloc[movie_idx].lower() != content_type.lower():
            continue

        fields_to_include = fields or ['title']
        recommendation = {
            field: df[field].iloc[movie_idx]
            for field in fields_to_include
            if field in df.columns
        }
        recommendation['similarity'] = float(score)
        recommendations.append(recommendation)

        if len(recommendations) >= top_n:
            break

    logger.info(f"Found {len(recommendations)} recommendations for '{title}'")
    return recommendations
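
# Minimal usage sketch (illustrative, not part of the module's public API).
# Assumes netflix_titles.csv is in the working directory and that INFO-level
# console logging is acceptable.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    movies = load_and_preprocess_data("netflix_titles.csv")
    _, sim_matrix, titles = build_or_load_model(movies)

    # Raw title is fine here; get_recommendations normalizes it internally.
    results = get_recommendations(
        "Carole & Tuesday",
        movies,
        titles,
        sim_matrix,
        top_n=5,
        fields=["title", "listed_in"],
    )
    for rec in results:
        print(f"{rec['title']} (similarity: {rec['similarity']:.3f})")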