# finalcapstone/recommendation_engine.py
# Author: Eric Hierholzer
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import os
import logging
import re
from scipy.sparse import csr_matrix
# Setup logging: module-level logger named after this module so log records
# can be filtered/configured per-module by the application's logging config.
logger = logging.getLogger(__name__)
def normalize_title(title):
    """Normalize a title string into a canonical lookup key.

    Lowercases and trims the input, strips punctuation (everything that is
    not a word character, whitespace, or '&'), collapses runs of whitespace
    into single spaces, and rewrites '&' as 'and' so that e.g.
    "Carole & Tuesday" and "carole and tuesday" produce the same key.
    """
    cleaned = re.sub(r"[^\w\s&]", "", title.lower().strip())
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.replace("&", "and")
def load_and_preprocess_data(csv_file="netflix_titles.csv"):
    """Load the Netflix dataset, clean it, and prepare it for TF-IDF.

    Drops duplicate titles, fills missing text fields with 'unknown', and
    builds a 'combined_features' column (director + cast + genres +
    description) used downstream for similarity scoring.

    Args:
        csv_file: Path to the Netflix titles CSV file.

    Returns:
        A cleaned DataFrame with a contiguous 0..n-1 integer index and a
        'combined_features' text column.

    Raises:
        FileNotFoundError: If csv_file does not exist.
        Exception: For any other read/processing failure.
    """
    try:
        df = pd.read_csv(csv_file)
        logger.info(f"Loaded dataset from {csv_file} with {len(df)} rows.")
        # Drop duplicates by title
        df.drop_duplicates(subset='title', keep='first', inplace=True)
        # BUGFIX: restore a contiguous 0..n-1 index after dropping rows.
        # Downstream code mixes label-based lookups (title_to_index values)
        # with positional indexing (cosine_sim_matrix rows, .iloc); a gapped
        # index would silently point at the wrong rows.
        df.reset_index(drop=True, inplace=True)
        # Fill missing text fields with 'unknown' and lowercase them
        text_cols = ['director', 'cast', 'country', 'listed_in', 'description']
        for col in text_cols:
            df[col] = df[col].fillna('unknown').astype(str).str.lower()
        # Combine text features for recommendations. NOTE: 'country' is
        # cleaned above but intentionally excluded from the similarity text.
        df['combined_features'] = (
            df['director'] + ' ' +
            df['cast'] + ' ' +
            df['listed_in'] + ' ' +
            df['description']
        )
        return df
    except FileNotFoundError as e:
        logger.error(f"Dataset file '{csv_file}' not found.")
        # Chain the original error so the real traceback is preserved.
        raise FileNotFoundError(f"Dataset file '{csv_file}' not found.") from e
    except Exception as e:
        logger.error(f"Error loading data from {csv_file}: {str(e)}")
        raise Exception(f"Error loading data: {str(e)}") from e
def build_or_load_model(df, cache_file="/tmp/cosine_sim_cache.pkl"):
    """Build or load the TF-IDF model and cosine-similarity matrix, with caching.

    Args:
        df: Preprocessed DataFrame containing 'combined_features' and 'title'.
        cache_file: Path used to persist (tfidf_matrix, cosine_sim_matrix,
            title_to_index) via joblib.

    Returns:
        Tuple of (tfidf_matrix, cosine_sim_matrix, title_to_index) where
        cosine_sim_matrix is a sparse CSR matrix and title_to_index maps
        normalized titles to row indices of df.

    NOTE(review): the cache is keyed only by file path, not by df contents,
    so a stale cache may be returned after the dataset changes — delete
    cache_file to force a rebuild.
    SECURITY: joblib.load unpickles arbitrary objects; only load cache files
    written by this application, never untrusted input.
    """
    if os.path.exists(cache_file):
        try:
            tfidf_matrix, cosine_sim_matrix, title_to_index = joblib.load(cache_file)
            logger.info(f"Loaded cached model from {cache_file}.")
            return tfidf_matrix, cosine_sim_matrix, title_to_index
        except Exception as e:
            logger.warning(f"Failed to load cache from {cache_file}: {str(e)}. Rebuilding model.")
    # Build model if cache doesn't exist or fails to load.
    try:
        tfidf = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),  # Capture word pairs (bigrams) for better similarity
            min_df=2             # Ignore rare words appearing in only 1 document
        )
        tfidf_matrix = tfidf.fit_transform(df['combined_features'])
        # Ensure matrix is valid
        if tfidf_matrix.shape[0] == 0 or tfidf_matrix.shape[1] == 0:
            raise ValueError("TF-IDF matrix is empty! Check feature extraction.")
        # PERF: request sparse output directly instead of materializing the
        # dense n x n similarity array and then converting it to CSR.
        cosine_sim_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
        if not isinstance(cosine_sim_matrix, csr_matrix):
            cosine_sim_matrix = csr_matrix(cosine_sim_matrix)
        # Map normalized titles -> row index
        df["normalized_title"] = df["title"].apply(normalize_title)
        title_to_index = pd.Series(df.index, index=df["normalized_title"]).drop_duplicates()
        # Debugging logs
        logger.info(f"Sample normalized titles in title_to_index: {list(title_to_index.keys())[:20]}")
        logger.info(f"Checking if 'carole and tuesday' exists in title_to_index: {'carole and tuesday' in title_to_index}")
        # Cache the results
        joblib.dump((tfidf_matrix, cosine_sim_matrix, title_to_index), cache_file)
        logger.info(f"Built and cached model to {cache_file}.")
        return tfidf_matrix, cosine_sim_matrix, title_to_index
    except Exception as e:
        logger.error(f"Error building model: {str(e)}")
        raise
def get_recommendations(title, df, title_to_index, cosine_sim_matrix, top_n=10, content_type=None, fields=None):
    """Return up to top_n recommendation dicts ranked by cosine similarity.

    Args:
        title: Title to find recommendations for (normalized before lookup).
        df: Preprocessed DataFrame of titles.
        title_to_index: Series mapping normalized titles to row indices.
        cosine_sim_matrix: Sparse CSR pairwise-similarity matrix.
        top_n: Maximum number of recommendations to return (positive int).
        content_type: Optional filter on df['type'] (case-insensitive).
        fields: Optional list of df columns to include per recommendation;
            defaults to ['title']. Each dict also gets a 'similarity' key.

    Returns:
        List of recommendation dicts; empty list if the title is unknown,
        scoring fails, or every similarity score is zero.

    Raises:
        ValueError: On missing components or invalid title/top_n.
    """
    if not all([df is not None, title_to_index is not None, cosine_sim_matrix is not None]):
        logger.error("One or more critical components (df, title_to_index, cosine_sim_matrix) are None!")
        raise ValueError("DataFrame, title_to_index, and cosine_sim_matrix must not be None.")
    if not isinstance(top_n, int) or top_n <= 0:
        raise ValueError("top_n must be a positive integer.")
    if not isinstance(title, str) or not title.strip():
        raise ValueError("Title must be a non-empty string.")
    # Normalize title for lookup
    title = normalize_title(title)
    # Ensure title exists
    if title not in title_to_index:
        logger.warning(f"'{title}' NOT found in title_to_index!")
        return []
    idx = title_to_index[title]
    # Get similarity scores (densify the single sparse row)
    try:
        sim_scores = list(enumerate(cosine_sim_matrix[idx].toarray()[0]))
    except Exception as e:
        logger.error(f"Error computing similarity scores for '{title}': {str(e)}")
        return []
    logger.info(f"Raw similarity scores for '{title}': {sim_scores[:10]}")
    # BUGFIX: exclude the query title by its index instead of slicing off the
    # top sorted entry. The old `sim_scores[1:]` assumed the query always
    # sorts first; when another title ties at similarity 1.0, that entry —
    # not the query — could be dropped.
    sim_scores = [pair for pair in sim_scores if pair[0] != idx]
    sim_scores.sort(key=lambda x: x[1], reverse=True)
    logger.info(f"Sorted similarity scores for '{title}': {sim_scores[:10]}")
    # If all similarity scores are 0, issue a warning
    if all(score[1] == 0 for score in sim_scores):
        logger.warning(f"⚠️ All similarity scores for '{title}' are 0! No recommendations possible.")
        return []
    # Build recommendations list, applying the optional content-type filter.
    recommendations = []
    for movie_idx, score in sim_scores:
        if content_type and df['type'].iloc[movie_idx].lower() != content_type.lower():
            continue
        recommendation = {field: df[field].iloc[movie_idx] for field in (fields or ['title']) if field in df.columns}
        recommendation['similarity'] = float(score)
        recommendations.append(recommendation)
        if len(recommendations) >= top_n:
            break
    logger.info(f"Found {len(recommendations)} recommendations for '{title}'")
    return recommendations