from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
import pandas as pd
import numpy as np
def _join_tokens(value):
    """Collapse one cell of a token column into a single space-separated string."""
    if isinstance(value, str):
        # Already a document; joining a str would split it into characters.
        return value
    if value is None or (isinstance(value, float) and value != value):
        # None / NaN mark a missing cell (NaN is truthy, so `if x` misses it).
        return ''
    # Works for lists, tuples and NumPy arrays alike; an empty one yields ''.
    return ' '.join(str(token) for token in value)


def create_feature_matrices(df, feature_weights):
    """
    Create feature matrices for the recommendation system.

    Args:
        df: DataFrame providing the columns 'RecipeIngredientParts',
            'Keywords', 'keywords_name', 'RecipeCategory', 'Calories',
            'TotalTime_minutes', 'AggregatedRating' and the seven dietary
            flag columns listed in ``dietary_columns`` below.
        feature_weights: Mapping with the keys 'ingredients', 'category',
            'dietary', 'calories', 'time', 'keywords' and 'keywords_name';
            each value multiplies the corresponding feature group.

    Returns:
        A tuple ``(combined_matrix, tfidf_vectorizer_ingredients,
        tfidf_vectorizer_keywords, tfidf_vectorizer_keywords_name,
        category_dummies, scaler)``. The columns of ``combined_matrix`` are,
        in order: ingredient TF-IDF, category dummies, dietary flags,
        calories, time, keyword TF-IDF, keyword-name TF-IDF, rating.
    """
    tfidf_vectorizer_ingredients = TfidfVectorizer(
        stop_words='english',
        max_features=5000,
        ngram_range=(1, 2),
        min_df=1
    )
    tfidf_vectorizer_keywords = TfidfVectorizer(stop_words='english', max_features=3000)
    tfidf_vectorizer_keywords_name = TfidfVectorizer(stop_words='english', max_features=3000)

    # Each row's token collection becomes one document for its vectorizer.
    ingredients_text = df['RecipeIngredientParts'].apply(_join_tokens)
    keywords_text = df['Keywords'].apply(_join_tokens)
    keywords_name_text = df['keywords_name'].apply(_join_tokens)

    tfidf_matrix_ingredients = tfidf_vectorizer_ingredients.fit_transform(ingredients_text)
    tfidf_matrix_keywords = tfidf_vectorizer_keywords.fit_transform(keywords_text)
    tfidf_matrix_keywords_name = tfidf_vectorizer_keywords_name.fit_transform(keywords_name_text)

    # One indicator column per distinct category value.
    category_dummies = pd.get_dummies(df['RecipeCategory'])
    category_matrix = category_dummies.values

    dietary_columns = ['is_vegetarian', 'is_vegan', 'is_gluten free', 'is_dairy free',
                       'is_low carb', 'is_keto', 'is_paleo']
    dietary_matrix = df[dietary_columns].values

    # NOTE(review): the single scaler is refit for every numeric column, so
    # the instance returned below ends up fitted on 'AggregatedRating' (the
    # last fit). create_query_vector in this file applies that scaler to
    # calorie and time inputs -- confirm whether per-column scalers should
    # be exposed instead; doing so would change the return contract.
    scaler = MinMaxScaler()
    calories_matrix = scaler.fit_transform(df[['Calories']].values)
    time_matrix = scaler.fit_transform(df[['TotalTime_minutes']].values)
    rating_matrix = scaler.fit_transform(df[['AggregatedRating']].values)

    # The block order here defines the column layout that query vectors
    # must reproduce.
    combined_matrix = hstack([
        tfidf_matrix_ingredients * feature_weights['ingredients'],
        category_matrix * feature_weights['category'],
        dietary_matrix * feature_weights['dietary'],
        calories_matrix * feature_weights['calories'],
        time_matrix * feature_weights['time'],
        tfidf_matrix_keywords * feature_weights['keywords'],
        tfidf_matrix_keywords_name * feature_weights['keywords_name'],
        rating_matrix * 0.05  # Small weight for ratings in base similarity
    ])

    return (combined_matrix, tfidf_vectorizer_ingredients, tfidf_vectorizer_keywords,
            tfidf_vectorizer_keywords_name, category_dummies, scaler)
def _weighted_text_block(vectorizer, tokens, feature_weights, weight_key):
    """Vectorise *tokens* (or an empty document) into a dense, weighted row."""
    # Transforming an empty document still yields a row of the vectorizer's
    # full width, so the segment width is known even without user input.
    document = ' '.join(tokens) if tokens else ''
    block = vectorizer.transform([document]).toarray()
    if tokens:
        block = block * feature_weights[weight_key]
    return block


def create_query_vector(combined_matrix, tfidf_vectorizer_ingredients, tfidf_vectorizer_keywords,
                        tfidf_vectorizer_keywords_name, category_dummies, scaler, feature_weights, **kwargs):
    """
    Create a query vector based on user input.

    The vector has one row and as many columns as ``combined_matrix``. It is
    filled segment by segment in this order: ingredient TF-IDF, category
    dummies, dietary flags, calories, time, keyword TF-IDF, keyword-name
    TF-IDF. Columns after the last segment are left at zero.

    Every segment always occupies its full width, whether or not the user
    supplied a value for it, so later segments keep the same column offsets
    regardless of which inputs are present.

    Args:
        combined_matrix: Feature matrix; only ``shape[1]`` is read.
        tfidf_vectorizer_ingredients: Fitted vectorizer for ingredients.
        tfidf_vectorizer_keywords: Fitted vectorizer for keywords.
        tfidf_vectorizer_keywords_name: Fitted vectorizer for keyword names.
        category_dummies: DataFrame whose columns are the known categories.
        scaler: Fitted scaler applied to the calories and time values.
        feature_weights: Mapping of feature-group name to multiplier.
        **kwargs: Optional user input -- 'ingredients', 'keywords' and
            'keywords_name' (iterables of strings), 'category',
            'dietary_preference' (one of the dietary column names),
            'calories' and 'time' (numbers).

    Returns:
        A NumPy array of shape ``(1, combined_matrix.shape[1])``.

    NOTE(review): the same ``scaler`` transforms both calories and time. In
    this file create_feature_matrices refits a single scaler per numeric
    column and returns it after the last fit, so verify that it matches the
    ranges used for the calorie and time columns of ``combined_matrix``.
    """
    dietary_columns = ['is_vegetarian', 'is_vegan', 'is_gluten free', 'is_dairy free',
                       'is_low carb', 'is_keto', 'is_paleo']

    ingredients_block = _weighted_text_block(
        tfidf_vectorizer_ingredients, kwargs.get('ingredients'), feature_weights, 'ingredients')

    # One-hot category; stays all-zero for a missing or unknown category.
    category_vector = np.zeros((1, category_dummies.shape[1]))
    if kwargs.get('category') and kwargs['category'] in category_dummies.columns:
        category_vector[0, category_dummies.columns.get_loc(kwargs['category'])] = 1
    category_block = category_vector * feature_weights['category']

    # One-hot dietary preference; must match a column name exactly.
    dietary_vector = np.zeros((1, len(dietary_columns)))
    if kwargs.get('dietary_preference') in dietary_columns:
        dietary_vector[0, dietary_columns.index(kwargs['dietary_preference'])] = 1
    dietary_block = dietary_vector * feature_weights['dietary']

    # Absent (or zero) values are passed through the scaler as 0.
    calories_vector = np.zeros((1, 1))
    time_vector = np.zeros((1, 1))
    if kwargs.get('calories'):
        calories_vector[0, 0] = kwargs['calories']
    if kwargs.get('time'):
        time_vector[0, 0] = kwargs['time']
    calories_block = scaler.transform(calories_vector) * feature_weights['calories']
    time_block = scaler.transform(time_vector) * feature_weights['time']

    keywords_block = _weighted_text_block(
        tfidf_vectorizer_keywords, kwargs.get('keywords'), feature_weights, 'keywords')
    keywords_name_block = _weighted_text_block(
        tfidf_vectorizer_keywords_name, kwargs.get('keywords_name'), feature_weights, 'keywords_name')

    query_vector = np.zeros((1, combined_matrix.shape[1]))
    current_position = 0
    segments = (ingredients_block, category_block, dietary_block, calories_block,
                time_block, keywords_block, keywords_name_block)
    for block in segments:
        width = block.shape[1]
        query_vector[:, current_position:current_position + width] = block
        # Advance unconditionally so an empty segment still reserves its columns.
        current_position += width

    return query_vector