recipe-rover-api / app /utils /feature_engineering.py
garvitcpp's picture
Upload 48 files
c30b4ba verified
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
import pandas as pd
import numpy as np
def _join_tokens(tokens):
    """Join an iterable of string tokens into one space-separated document.

    Missing cells (``None`` or a float such as NaN) and empty containers
    yield ``''``; a plain string is returned unchanged.
    """
    # Avoid `if tokens`: truth-testing a multi-element array raises, and NaN
    # is truthy but cannot be joined.
    if tokens is None or isinstance(tokens, float):
        return ''
    if isinstance(tokens, str):
        return tokens
    return ' '.join(tokens)


def create_feature_matrices(df, feature_weights):
    """
    Create feature matrices for the recommendation system.

    The combined matrix stacks these column blocks, in this order:
    ingredient TF-IDF, one-hot ``RecipeCategory``, the seven dietary flag
    columns, min-max scaled ``Calories``, min-max scaled
    ``TotalTime_minutes``, ``Keywords`` TF-IDF, ``keywords_name`` TF-IDF and
    min-max scaled ``AggregatedRating`` (fixed weight 0.05).

    Args:
        df: DataFrame with the columns ``RecipeIngredientParts``,
            ``Keywords``, ``keywords_name``, ``RecipeCategory``, the seven
            ``is_*`` dietary columns, ``Calories``, ``TotalTime_minutes``
            and ``AggregatedRating``.
        feature_weights: Mapping with the keys ``ingredients``, ``category``,
            ``dietary``, ``calories``, ``time``, ``keywords`` and
            ``keywords_name``; each block is multiplied by its weight.

    Returns:
        A 6-tuple ``(combined_matrix, tfidf_vectorizer_ingredients,
        tfidf_vectorizer_keywords, tfidf_vectorizer_keywords_name,
        category_dummies, scaler)``. ``combined_matrix`` is the result of
        ``scipy.sparse.hstack``. ``scaler`` is the ``MinMaxScaler`` fitted on
        ``Calories``; the scalers fitted on ``TotalTime_minutes`` and
        ``AggregatedRating`` are attached to it as ``scaler.time_scaler``
        and ``scaler.rating_scaler``.
    """
    rating_weight = 0.05  # Small weight for ratings in base similarity

    # --- Text features -------------------------------------------------
    tfidf_vectorizer_ingredients = TfidfVectorizer(
        stop_words='english',
        max_features=5000,
        ngram_range=(1, 2),
        min_df=1
    )
    tfidf_vectorizer_keywords = TfidfVectorizer(stop_words='english', max_features=3000)
    tfidf_vectorizer_keywords_name = TfidfVectorizer(stop_words='english', max_features=3000)

    ingredients_text = df['RecipeIngredientParts'].apply(_join_tokens)
    keywords_text = df['Keywords'].apply(_join_tokens)
    keywords_name_text = df['keywords_name'].apply(_join_tokens)

    tfidf_matrix_ingredients = tfidf_vectorizer_ingredients.fit_transform(ingredients_text)
    tfidf_matrix_keywords = tfidf_vectorizer_keywords.fit_transform(keywords_text)
    tfidf_matrix_keywords_name = tfidf_vectorizer_keywords_name.fit_transform(keywords_name_text)

    # --- Categorical / boolean features --------------------------------
    category_dummies = pd.get_dummies(df['RecipeCategory'])
    category_matrix = category_dummies.values

    dietary_columns = ['is_vegetarian', 'is_vegan', 'is_gluten free', 'is_dairy free',
                       'is_low carb', 'is_keto', 'is_paleo']
    dietary_matrix = df[dietary_columns].values

    # --- Numeric features ----------------------------------------------
    # One scaler per column: re-fitting a single MinMaxScaler would leave it
    # holding only the range of the last column it saw (AggregatedRating).
    calories_scaler = MinMaxScaler()
    time_scaler = MinMaxScaler()
    rating_scaler = MinMaxScaler()
    calories_matrix = calories_scaler.fit_transform(df[['Calories']].values)
    time_matrix = time_scaler.fit_transform(df[['TotalTime_minutes']].values)
    rating_matrix = rating_scaler.fit_transform(df[['AggregatedRating']].values)

    # The return tuple has a single scaler slot, so return the calories
    # scaler and hang the other fitted scalers on it as plain attributes.
    scaler = calories_scaler
    scaler.time_scaler = time_scaler
    scaler.rating_scaler = rating_scaler

    combined_matrix = hstack([
        tfidf_matrix_ingredients * feature_weights['ingredients'],
        category_matrix * feature_weights['category'],
        dietary_matrix * feature_weights['dietary'],
        calories_matrix * feature_weights['calories'],
        time_matrix * feature_weights['time'],
        tfidf_matrix_keywords * feature_weights['keywords'],
        tfidf_matrix_keywords_name * feature_weights['keywords_name'],
        rating_matrix * rating_weight
    ])

    return (combined_matrix, tfidf_vectorizer_ingredients, tfidf_vectorizer_keywords,
            tfidf_vectorizer_keywords_name, category_dummies, scaler)
def _tfidf_query_block(vectorizer, tokens, feature_weights, weight_key):
    """Return the dense (1, n_terms) TF-IDF row for *tokens*, weighted.

    When *tokens* is empty or missing, an empty document is transformed
    instead, which yields a row of the right width without touching the weight.
    """
    if not tokens:
        return vectorizer.transform(['']).toarray()
    return vectorizer.transform([' '.join(tokens)]).toarray() * feature_weights[weight_key]


def create_query_vector(combined_matrix, tfidf_vectorizer_ingredients, tfidf_vectorizer_keywords,
                        tfidf_vectorizer_keywords_name, category_dummies, scaler, feature_weights, **kwargs):
    """
    Create a query vector based on user input.

    The vector has one row and ``combined_matrix.shape[1]`` columns. Blocks
    are written left to right in this order: ingredients, category, dietary,
    calories, time, keywords, keywords_name. Every block always occupies its
    full width, so omitting an optional input leaves zeros in that block
    instead of shifting the blocks after it. Columns to the right of the
    last block stay zero.

    Args:
        combined_matrix: Matrix whose column count defines the query width.
        tfidf_vectorizer_ingredients: Fitted vectorizer for ingredients.
        tfidf_vectorizer_keywords: Fitted vectorizer for keywords.
        tfidf_vectorizer_keywords_name: Fitted vectorizer for keywords_name.
        category_dummies: DataFrame whose columns are the known categories.
        scaler: Object with ``transform`` used to scale the calories value.
            If it exposes a ``time_scaler`` attribute, that object scales the
            time value; otherwise ``scaler`` itself is used for time as well.
        feature_weights: Mapping with the keys ``ingredients``, ``category``,
            ``dietary``, ``calories``, ``time``, ``keywords`` and
            ``keywords_name``.
        **kwargs: Optional ``ingredients``, ``keywords``, ``keywords_name``
            (iterables of strings), ``category``, ``dietary_preference``
            (one of the ``is_*`` column names), ``calories`` and ``time``.

    Returns:
        ``numpy.ndarray`` of shape ``(1, combined_matrix.shape[1])``.
    """
    dietary_columns = ['is_vegetarian', 'is_vegan', 'is_gluten free', 'is_dairy free',
                       'is_low carb', 'is_keto', 'is_paleo']

    # One-hot category row; stays all-zero for a missing or unknown category.
    category_vector = np.zeros((1, category_dummies.shape[1]))
    category = kwargs.get('category')
    if category and category in category_dummies.columns:
        category_vector[0, category_dummies.columns.get_loc(category)] = 1

    # One-hot dietary row; stays all-zero for a missing or unknown preference.
    dietary_vector = np.zeros((1, len(dietary_columns)))
    dietary_preference = kwargs.get('dietary_preference')
    if dietary_preference in dietary_columns:
        dietary_vector[0, dietary_columns.index(dietary_preference)] = 1

    # Raw value 0 is scaled when calories / time are not supplied.
    calories_vector = np.zeros((1, 1))
    time_vector = np.zeros((1, 1))
    if kwargs.get('calories'):
        calories_vector[0, 0] = kwargs['calories']
    if kwargs.get('time'):
        time_vector[0, 0] = kwargs['time']
    time_scaler = getattr(scaler, 'time_scaler', scaler)
    calories_vector = scaler.transform(calories_vector)
    time_vector = time_scaler.transform(time_vector)

    # Blocks in the column order documented above.
    blocks = [
        _tfidf_query_block(tfidf_vectorizer_ingredients, kwargs.get('ingredients'),
                           feature_weights, 'ingredients'),
        category_vector * feature_weights['category'],
        dietary_vector * feature_weights['dietary'],
        calories_vector * feature_weights['calories'],
        time_vector * feature_weights['time'],
        _tfidf_query_block(tfidf_vectorizer_keywords, kwargs.get('keywords'),
                           feature_weights, 'keywords'),
        _tfidf_query_block(tfidf_vectorizer_keywords_name, kwargs.get('keywords_name'),
                           feature_weights, 'keywords_name'),
    ]

    query_vector = np.zeros((1, combined_matrix.shape[1]))
    current_position = 0
    for block in blocks:
        width = block.shape[1]
        query_vector[:, current_position:current_position + width] = block
        # Advance unconditionally so later blocks keep their column offsets.
        current_position += width

    return query_vector