Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import joblib | |
| import os | |
| import logging | |
| import re | |
| from scipy.sparse import csr_matrix | |
| # Setup logging | |
| logger = logging.getLogger(__name__) | |
def normalize_title(title):
    """Return a canonical lookup key for a title.

    The title is lower-cased, every '&' is spelled out as the word 'and',
    all remaining characters that are neither word characters nor
    whitespace are removed, and whitespace is collapsed and trimmed.

    Args:
        title: Title text to normalize (must be a ``str``).

    Returns:
        The normalized title, e.g. ``"Carole & Tuesday"`` ->
        ``"carole and tuesday"``.
    """
    title = title.lower()
    # Pad the replacement so "Tom&Jerry" and "Tom & Jerry" yield the same key;
    # the surplus spaces are collapsed below.
    title = title.replace("&", " and ")
    title = re.sub(r"[^\w\s]", "", title)  # Drop remaining special characters
    # Collapse and trim LAST: removing punctuation can leave stray whitespace
    # at the edges (e.g. "wow !" -> "wow "), which would break key equality.
    title = re.sub(r"\s+", " ", title).strip()
    return title
def load_and_preprocess_data(csv_file="netflix_titles.csv"):
    """Load the titles dataset and prepare it for TF-IDF vectorization.

    Steps performed:
      * read ``csv_file`` with pandas;
      * drop rows whose ``title`` repeats (first occurrence wins) and
        renumber the index so labels equal row positions;
      * fill missing values in the text columns with ``'unknown'`` and
        lower-case them;
      * add a ``combined_features`` column built from director, cast,
        listed_in and description.

    Args:
        csv_file: Path to the CSV file.

    Returns:
        The cleaned ``pandas.DataFrame`` with a 0..n-1 index.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        Exception: For any other failure while loading or preparing the
            data (for example a missing expected column); the original
            error is attached as ``__cause__``.
    """
    try:
        df = pd.read_csv(csv_file)
        logger.info(f"Loaded dataset from {csv_file} with {len(df)} rows.")

        # Drop duplicates by title. The index is reset because consumers of
        # this frame use index values as row positions (similarity-matrix rows
        # and .iloc lookups); gaps left by dropped rows would misalign them.
        df = df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)

        # Fill missing text fields with 'unknown'
        text_cols = ['director', 'cast', 'country', 'listed_in', 'description']
        for col in text_cols:
            df[col] = df[col].fillna('unknown').astype(str).str.lower()

        # Combine text features for recommendations
        df['combined_features'] = (
            df['director'] + ' ' +
            df['cast'] + ' ' +
            df['listed_in'] + ' ' +
            df['description']
        )
        return df
    except FileNotFoundError as e:
        logger.error(f"Dataset file '{csv_file}' not found.")
        raise FileNotFoundError(f"Dataset file '{csv_file}' not found.") from e
    except Exception as e:
        logger.error(f"Error loading data from {csv_file}: {str(e)}")
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Error loading data: {str(e)}") from e
def build_or_load_model(df, cache_file="/tmp/cosine_sim_cache.pkl"):
    """Build (or load from cache) the TF-IDF matrix and cosine similarities.

    If ``cache_file`` exists and its contents match the number of rows in
    ``df``, the cached objects are returned. Otherwise the model is built
    from ``df['combined_features']`` and written to ``cache_file`` on a
    best-effort basis.

    Side effect: a ``normalized_title`` column is added to ``df`` when the
    model is built.

    Args:
        df: DataFrame with ``combined_features`` and ``title`` columns.
        cache_file: Path of the joblib cache file.

    Returns:
        A tuple ``(tfidf_matrix, cosine_sim_matrix, title_to_index)`` where
        ``cosine_sim_matrix`` is a CSR sparse matrix and ``title_to_index``
        is a ``pandas.Series`` mapping each normalized title to the row
        position of its first occurrence in ``df``.

    Raises:
        ValueError: If the TF-IDF matrix comes out empty.
        Exception: Any other error raised while building the model is
            logged and re-raised unchanged.
    """
    if os.path.exists(cache_file):
        try:
            # SECURITY NOTE: joblib.load unpickles the file, which can execute
            # arbitrary code. The default location is under /tmp; only use a
            # cache path that untrusted users cannot write to.
            tfidf_matrix, cosine_sim_matrix, title_to_index = joblib.load(cache_file)
            # A cache built for a dataset of a different size would make every
            # row position wrong, so treat it as unusable.
            if tfidf_matrix.shape[0] != len(df) or cosine_sim_matrix.shape[0] != len(df):
                raise ValueError(
                    f"cached model has {cosine_sim_matrix.shape[0]} rows but dataset has {len(df)}"
                )
            logger.info(f"Loaded cached model from {cache_file}.")
            return tfidf_matrix, cosine_sim_matrix, title_to_index
        except Exception as e:
            logger.warning(f"Failed to load cache from {cache_file}: {str(e)}. Rebuilding model.")

    # Build model if cache doesn't exist or fails
    try:
        tfidf = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),  # Capture word pairs (bigrams) for better similarity
            min_df=2  # Ignore rare words appearing in only 1 document
        )
        tfidf_matrix = tfidf.fit_transform(df['combined_features'])

        # Ensure matrix is valid
        if tfidf_matrix.shape[0] == 0 or tfidf_matrix.shape[1] == 0:
            raise ValueError("TF-IDF matrix is empty! Check feature extraction.")

        # Compute cosine similarity directly as a sparse result instead of
        # materializing the full dense n x n array and converting afterwards.
        cosine_sim_matrix = csr_matrix(
            cosine_similarity(tfidf_matrix, tfidf_matrix, dense_output=False)
        )

        # Map normalized titles -> ROW POSITION (not index label): the
        # positions are what index the similarity matrix, and they differ from
        # the labels whenever df's index is not 0..n-1.
        df["normalized_title"] = df["title"].apply(normalize_title)
        title_to_index = pd.Series(range(len(df)), index=df["normalized_title"])
        # Keep only the first row for each normalized title so a lookup always
        # yields a single position (de-duplicate the keys, not the values).
        title_to_index = title_to_index[~title_to_index.index.duplicated(keep="first")]

        logger.debug(f"Sample normalized titles in title_to_index: {title_to_index.index[:20].tolist()}")
    except Exception as e:
        logger.error(f"Error building model: {str(e)}")
        raise

    # Caching is an optimization only: a failed write (read-only filesystem,
    # full disk, ...) must not discard a model that was built successfully.
    try:
        joblib.dump((tfidf_matrix, cosine_sim_matrix, title_to_index), cache_file)
        logger.info(f"Built and cached model to {cache_file}.")
    except Exception as e:
        logger.warning(f"Built model but could not write cache to {cache_file}: {str(e)}")

    return tfidf_matrix, cosine_sim_matrix, title_to_index
def get_recommendations(title, df, title_to_index, cosine_sim_matrix, top_n=10, content_type=None, fields=None):
    """Return the items most similar to ``title`` by cosine similarity.

    Args:
        title: Title to look up; it is normalized with ``normalize_title``
            before the lookup.
        df: DataFrame whose row positions correspond to the rows of
            ``cosine_sim_matrix``.
        title_to_index: Mapping from normalized title to row position.
        cosine_sim_matrix: Square similarity matrix (sparse or dense).
        top_n: Maximum number of recommendations to return.
        content_type: If given, only rows whose ``type`` column equals this
            value (case-insensitive) are returned.
        fields: Column names to copy into each result; defaults to
            ``['title']``. Names that are not columns of ``df`` are ignored.

    Returns:
        A list of dicts, best match first, each holding the requested
        fields plus a float ``similarity``. An empty list is returned when
        the title is unknown, the similarity row cannot be read, or every
        other item has similarity 0.

    Raises:
        ValueError: If a required argument is ``None``, ``top_n`` is not a
            positive integer, or ``title`` is not a non-empty string.
    """
    if df is None or title_to_index is None or cosine_sim_matrix is None:
        logger.error("One or more critical components (df, title_to_index, cosine_sim_matrix) are None!")
        raise ValueError("DataFrame, title_to_index, and cosine_sim_matrix must not be None.")
    if not isinstance(top_n, int) or top_n <= 0:
        raise ValueError("top_n must be a positive integer.")
    if not isinstance(title, str) or not title.strip():
        raise ValueError("Title must be a non-empty string.")

    # Normalize title for lookup
    title = normalize_title(title)

    # Ensure title exists
    if title not in title_to_index:
        logger.warning(f"'{title}' NOT found in title_to_index!")
        return []

    # Get similarity scores
    try:
        idx = title_to_index[title]
        # A Series with repeated keys returns a Series here; use the first hit.
        if hasattr(idx, "iloc"):
            idx = idx.iloc[0]
        idx = int(idx)
        row = cosine_sim_matrix[idx]
        # Sparse rows need densifying; a dense matrix already yields a 1-D row.
        if hasattr(row, "toarray"):
            row = row.toarray()[0]
        sim_scores = list(enumerate(row))
    except Exception as e:
        logger.error(f"Error computing similarity scores for '{title}': {str(e)}")
        return []

    logger.info(f"Raw similarity scores for '{title}': {sim_scores[:10]}")

    # Exclude the input title itself by position. Dropping "the first sorted
    # entry" is wrong when an earlier row has an equally high score (e.g. a
    # content duplicate), because the stable sort would put that row first.
    sim_scores = [pair for pair in sim_scores if pair[0] != idx]

    # Sort by similarity (stable, so ties keep ascending row order)
    sim_scores.sort(key=lambda x: x[1], reverse=True)
    logger.info(f"Sorted similarity scores for '{title}': {sim_scores[:10]}")

    # If all similarity scores are 0, issue a warning
    if all(score == 0 for _, score in sim_scores):
        logger.warning(f"⚠️ All similarity scores for '{title}' are 0! No recommendations possible.")
        return []

    # Loop-invariant pieces, computed once.
    wanted_fields = [field for field in (fields or ['title']) if field in df.columns]
    wanted_type = content_type.lower() if content_type else None
    type_column = df['type'] if wanted_type is not None else None

    # Build recommendations list
    recommendations = []
    for movie_idx, score in sim_scores:
        # str() guards against a missing (NaN) type value, which has no .lower().
        if wanted_type is not None and str(type_column.iloc[movie_idx]).lower() != wanted_type:
            continue
        recommendation = {field: df[field].iloc[movie_idx] for field in wanted_fields}
        recommendation['similarity'] = float(score)
        recommendations.append(recommendation)
        if len(recommendations) >= top_n:
            break

    logger.info(f"Found {len(recommendations)} recommendations for '{title}'")
    return recommendations