import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import os
import logging
import re
from scipy.sparse import csr_matrix
# Setup logging
logger = logging.getLogger(__name__)
def normalize_title(title):
    """Normalize a title for consistent, case-insensitive lookup.

    Lowercases and strips the title, removes punctuation other than '&',
    collapses runs of whitespace to a single space, and finally rewrites
    '&' as 'and' so both spellings map to the same key.
    """
    cleaned = title.strip().lower()
    cleaned = re.sub(r"[^\w\s&]", "", cleaned)  # drop punctuation, keep '&'
    cleaned = re.sub(r"\s+", " ", cleaned)      # collapse whitespace runs
    return cleaned.replace("&", "and")          # unify ampersand spelling
def load_and_preprocess_data(csv_file="netflix_titles.csv"):
    """Load the Netflix dataset and prepare the text column used for TF-IDF.

    Drops duplicate titles, resets to a contiguous index, fills missing text
    fields, lowercases them, and concatenates director/cast/genres/description
    into a single 'combined_features' column for vectorization.

    Args:
        csv_file: Path to the Netflix titles CSV file.

    Returns:
        A DataFrame with a contiguous 0..n-1 index and a 'combined_features'
        column.

    Raises:
        FileNotFoundError: If csv_file does not exist.
        Exception: For any other load/processing failure (original chained).
    """
    try:
        df = pd.read_csv(csv_file)
        logger.info(f"Loaded dataset from {csv_file} with {len(df)} rows.")
        # Drop duplicates by title
        df.drop_duplicates(subset='title', keep='first', inplace=True)
        # Reset to a contiguous 0..n-1 index: downstream code builds the
        # TF-IDF/similarity matrices positionally from this frame, and the
        # title->index map must line up with those matrix rows. Without this
        # reset, dropped duplicates leave gaps and the labels no longer match
        # matrix row positions.
        df.reset_index(drop=True, inplace=True)
        # Fill missing text fields with 'unknown'
        text_cols = ['director', 'cast', 'country', 'listed_in', 'description']
        for col in text_cols:
            df[col] = df[col].fillna('unknown').astype(str).str.lower()
        # Combine text features for recommendations
        df['combined_features'] = (
            df['director'] + ' ' +
            df['cast'] + ' ' +
            df['listed_in'] + ' ' +
            df['description']
        )
        return df
    except FileNotFoundError:
        logger.error(f"Dataset file '{csv_file}' not found.")
        # Re-raise the original exception to preserve its traceback.
        raise
    except Exception as e:
        logger.error(f"Error loading data from {csv_file}: {str(e)}")
        # Chain the cause so the original failure is not lost.
        raise Exception(f"Error loading data: {str(e)}") from e
def build_or_load_model(df, cache_file="/tmp/cosine_sim_cache.pkl"):
    """Build (or load from cache) the TF-IDF model artifacts.

    Args:
        df: Preprocessed DataFrame with 'title' and 'combined_features' columns.
        cache_file: Path of the joblib cache for the three artifacts.

    Returns:
        Tuple of (tfidf_matrix, cosine_sim_matrix, title_to_index) where
        cosine_sim_matrix is a sparse CSR matrix and title_to_index maps
        normalized title -> positional row of the matrices.

    Raises:
        ValueError: If the TF-IDF matrix comes out empty.
    """
    if os.path.exists(cache_file):
        try:
            tfidf_matrix, cosine_sim_matrix, title_to_index = joblib.load(cache_file)
            logger.info(f"Loaded cached model from {cache_file}.")
            return tfidf_matrix, cosine_sim_matrix, title_to_index
        except Exception as e:
            logger.warning(f"Failed to load cache from {cache_file}: {str(e)}. Rebuilding model.")
    # Build model if cache doesn't exist or fails
    try:
        tfidf = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),  # Capture word pairs (bigrams) for better similarity
            min_df=2  # Ignore rare words appearing in only 1 document
        )
        tfidf_matrix = tfidf.fit_transform(df['combined_features'])
        # Ensure matrix is valid
        if tfidf_matrix.shape[0] == 0 or tfidf_matrix.shape[1] == 0:
            raise ValueError("TF-IDF matrix is empty! Check feature extraction.")
        # Compute cosine similarity; dense_output=False keeps the result sparse
        # so we never materialize the dense n x n intermediate.
        cosine_sim_matrix = csr_matrix(cosine_similarity(tfidf_matrix, dense_output=False))
        # Map normalized titles -> POSITIONAL row number. The similarity matrix
        # rows are positional, so the map must not depend on df.index labels
        # (which may have gaps after deduplication).
        df["normalized_title"] = df["title"].apply(normalize_title)
        title_to_index = pd.Series(range(len(df)), index=df["normalized_title"])
        # Keep only the first row for any normalized title that occurs more than
        # once, so lookups always return a scalar. (Note: dropping duplicate
        # *values* would be a no-op here since positions are unique; we must
        # deduplicate the index labels.)
        title_to_index = title_to_index[~title_to_index.index.duplicated(keep='first')]
        # Debugging logs
        logger.info(f"Sample normalized titles in title_to_index: {list(title_to_index.keys())[:20]}")
        logger.info(f"Checking if 'carole and tuesday' exists in title_to_index: {'carole and tuesday' in title_to_index}")
        # Cache the results
        joblib.dump((tfidf_matrix, cosine_sim_matrix, title_to_index), cache_file)
        logger.info(f"Built and cached model to {cache_file}.")
        return tfidf_matrix, cosine_sim_matrix, title_to_index
    except Exception as e:
        logger.error(f"Error building model: {str(e)}")
        raise
def get_recommendations(title, df, title_to_index, cosine_sim_matrix, top_n=10, content_type=None, fields=None):
    """Return up to top_n recommendation dicts ranked by cosine similarity.

    Args:
        title: Title to look up (normalized internally).
        df: Preprocessed DataFrame aligned positionally with the matrices.
        title_to_index: Mapping of normalized title -> matrix row.
        cosine_sim_matrix: Sparse pairwise cosine-similarity matrix.
        top_n: Maximum number of recommendations to return.
        content_type: Optional filter on df['type'] (case-insensitive).
        fields: DataFrame columns to copy into each result (default ['title']).

    Returns:
        List of dicts, each with the requested fields plus 'similarity'.
        Empty list when the title is unknown or no similar items exist.

    Raises:
        ValueError: On missing components or invalid title/top_n arguments.
    """
    if not all([df is not None, title_to_index is not None, cosine_sim_matrix is not None]):
        logger.error("One or more critical components (df, title_to_index, cosine_sim_matrix) are None!")
        raise ValueError("DataFrame, title_to_index, and cosine_sim_matrix must not be None.")
    if not isinstance(top_n, int) or top_n <= 0:
        raise ValueError("top_n must be a positive integer.")
    if not isinstance(title, str) or not title.strip():
        raise ValueError("Title must be a non-empty string.")
    # Normalize title for lookup
    title = normalize_title(title)
    # Ensure title exists
    if title not in title_to_index:
        logger.warning(f"'{title}' NOT found in title_to_index!")
        return []
    idx = title_to_index[title]
    # If the index contains duplicate normalized titles, the lookup returns a
    # Series; fall back to the first match so indexing below stays scalar.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)
    # Get similarity scores
    try:
        sim_scores = list(enumerate(cosine_sim_matrix[idx].toarray()[0]))
    except Exception as e:
        logger.error(f"Error computing similarity scores for '{title}': {str(e)}")
        return []
    logger.info(f"Raw similarity scores for '{title}': {sim_scores[:10]}")
    # Sort by similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Exclude the queried title by its row index rather than assuming it sorts
    # first: with tied scores, position 0 may hold a different title and the
    # query itself would leak into the results.
    sim_scores = [(i, s) for i, s in sim_scores if i != idx]
    logger.info(f"Sorted similarity scores for '{title}': {sim_scores[:10]}")
    # If all similarity scores are 0, issue a warning
    if all(score[1] == 0 for score in sim_scores):
        logger.warning(f"⚠️ All similarity scores for '{title}' are 0! No recommendations possible.")
        return []
    # Build recommendations list
    recommendations = []
    for movie_idx, score in sim_scores:
        if content_type and df['type'].iloc[movie_idx].lower() != content_type.lower():
            continue
        recommendation = {field: df[field].iloc[movie_idx] for field in (fields or ['title']) if field in df.columns}
        recommendation['similarity'] = float(score)
        recommendations.append(recommendation)
        if len(recommendations) >= top_n:
            break
    logger.info(f"Found {len(recommendations)} recommendations for '{title}'")
    return recommendations