File size: 5,428 Bytes
c30b4ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
import pandas as pd
import numpy as np

# Dietary flag columns, in the order they occupy in the combined feature matrix.
_DIETARY_COLUMNS = ['is_vegetarian', 'is_vegan', 'is_gluten free', 'is_dairy free',
                    'is_low carb', 'is_keto', 'is_paleo']

# Weight applied to the rating column when feature_weights has no 'rating' entry.
_DEFAULT_RATING_WEIGHT = 0.05


def _join_tokens(tokens):
    """Return *tokens* as one space-separated document string.

    A plain string is returned unchanged (joining it would split it into
    single characters). Non-iterable values such as None or NaN yield ''.
    """
    if isinstance(tokens, str):
        return tokens
    try:
        return ' '.join(str(token) for token in tokens)
    except TypeError:
        # None, NaN and other non-iterables mean "no tokens".
        return ''


def _place_tfidf_segment(query_vector, start, vectorizer, tokens, feature_weights, weight_key):
    """Write one weighted TF-IDF segment into *query_vector* and return its end column.

    The vectorizer is always invoked, even when no tokens were supplied, so
    the segment width is known and the returned offset always moves past the
    segment. Columns are only written when there is text to encode.
    """
    text = _join_tokens(tokens)
    encoded = vectorizer.transform([text])
    stop = start + encoded.shape[1]
    if text:
        query_vector[:, start:stop] = encoded.toarray() * feature_weights[weight_key]
    return stop


def create_feature_matrices(df, feature_weights):
    """
    Create feature matrices for the recommendation system.

    Args:
        df: DataFrame providing the columns 'RecipeIngredientParts',
            'Keywords', 'keywords_name', 'RecipeCategory', the seven dietary
            flag columns, 'Calories', 'TotalTime_minutes' and
            'AggregatedRating'.
        feature_weights: Mapping with the keys 'ingredients', 'category',
            'dietary', 'calories', 'time', 'keywords' and 'keywords_name'.
            An optional 'rating' key overrides the default rating weight
            of 0.05.

    Returns:
        Tuple of (combined_matrix, tfidf_vectorizer_ingredients,
        tfidf_vectorizer_keywords, tfidf_vectorizer_keywords_name,
        category_dummies, scaler). ``scaler`` is fitted on the
        ['Calories', 'TotalTime_minutes'] columns, in that order, so that
        create_query_vector can scale a query with the same ranges that
        were used for the recipes.
    """
    tfidf_vectorizer_ingredients = TfidfVectorizer(
        stop_words='english',
        max_features=5000,
        ngram_range=(1, 2),
        min_df=1
    )
    tfidf_vectorizer_keywords = TfidfVectorizer(stop_words='english', max_features=3000)
    tfidf_vectorizer_keywords_name = TfidfVectorizer(stop_words='english', max_features=3000)

    tfidf_matrix_ingredients = tfidf_vectorizer_ingredients.fit_transform(
        df['RecipeIngredientParts'].apply(_join_tokens)
    )
    tfidf_matrix_keywords = tfidf_vectorizer_keywords.fit_transform(
        df['Keywords'].apply(_join_tokens)
    )
    tfidf_matrix_keywords_name = tfidf_vectorizer_keywords_name.fit_transform(
        df['keywords_name'].apply(_join_tokens)
    )

    category_dummies = pd.get_dummies(df['RecipeCategory'])
    # Cast to float so the dense blocks have a numeric dtype for sparse hstack.
    category_matrix = category_dummies.values.astype(float)
    dietary_matrix = df[_DIETARY_COLUMNS].values.astype(float)

    # MinMaxScaler scales every column independently, so fitting calories and
    # time together gives the same values as two separate fits while leaving
    # ONE fitted scaler that is valid for both query inputs. Re-fitting a
    # single scaler per column would leave it fitted on the last column only.
    scaler = MinMaxScaler()
    scaled_calories_time = scaler.fit_transform(df[['Calories', 'TotalTime_minutes']].values)
    calories_matrix = scaled_calories_time[:, [0]]
    time_matrix = scaled_calories_time[:, [1]]
    # Ratings never appear in a query, so their scaler is not returned.
    rating_matrix = MinMaxScaler().fit_transform(df[['AggregatedRating']].values)

    rating_weight = feature_weights.get('rating', _DEFAULT_RATING_WEIGHT)

    # create_query_vector relies on this exact block order.
    combined_matrix = hstack([
        tfidf_matrix_ingredients * feature_weights['ingredients'],
        category_matrix * feature_weights['category'],
        dietary_matrix * feature_weights['dietary'],
        calories_matrix * feature_weights['calories'],
        time_matrix * feature_weights['time'],
        tfidf_matrix_keywords * feature_weights['keywords'],
        tfidf_matrix_keywords_name * feature_weights['keywords_name'],
        rating_matrix * rating_weight  # Small weight for ratings in base similarity
    ])

    return (combined_matrix, tfidf_vectorizer_ingredients, tfidf_vectorizer_keywords,
            tfidf_vectorizer_keywords_name, category_dummies, scaler)


def create_query_vector(combined_matrix, tfidf_vectorizer_ingredients, tfidf_vectorizer_keywords,
                        tfidf_vectorizer_keywords_name, category_dummies, scaler, feature_weights, **kwargs):
    """
    Create a query vector based on user input.

    The vector has one row and as many columns as ``combined_matrix``. Its
    segments are laid out in the order used by create_feature_matrices:
    ingredients, category, dietary flags, calories, time, keywords,
    keywords_name. The trailing rating column is left at zero.

    Args:
        combined_matrix: Recipe feature matrix; only its column count is read.
        tfidf_vectorizer_ingredients: Fitted vectorizer for ingredients.
        tfidf_vectorizer_keywords: Fitted vectorizer for keywords.
        tfidf_vectorizer_keywords_name: Fitted vectorizer for name keywords.
        category_dummies: One-hot category frame; its columns define the
            category segment.
        scaler: Scaler returned by create_feature_matrices. A scaler fitted
            on two columns is treated as [calories, time]; a single-column
            scaler is applied to each value separately (legacy behaviour).
        feature_weights: Mapping of per-segment weights.
        **kwargs: Optional 'ingredients', 'category', 'dietary_preference',
            'calories', 'time', 'keywords' and 'keywords_name'.

    Returns:
        numpy array of shape (1, combined_matrix.shape[1]).
    """
    query_vector = np.zeros((1, combined_matrix.shape[1]))

    # Each segment advances the offset by its full width whether or not the
    # user supplied a value, so later segments always land in their columns.
    position = _place_tfidf_segment(
        query_vector, 0, tfidf_vectorizer_ingredients,
        kwargs.get('ingredients'), feature_weights, 'ingredients'
    )

    n_categories = category_dummies.shape[1]
    category_vector = np.zeros((1, n_categories))
    category = kwargs.get('category')
    if category and category in category_dummies.columns:
        category_vector[0, category_dummies.columns.get_loc(category)] = 1
    query_vector[:, position:position + n_categories] = (
        category_vector * feature_weights['category']
    )
    position += n_categories

    dietary_vector = np.zeros((1, len(_DIETARY_COLUMNS)))
    dietary_preference = kwargs.get('dietary_preference')
    if dietary_preference in _DIETARY_COLUMNS:
        dietary_vector[0, _DIETARY_COLUMNS.index(dietary_preference)] = 1
    query_vector[:, position:position + len(_DIETARY_COLUMNS)] = (
        dietary_vector * feature_weights['dietary']
    )
    position += len(_DIETARY_COLUMNS)

    # Missing calories/time are scaled from a raw value of 0.
    calories = kwargs.get('calories') or 0
    minutes = kwargs.get('time') or 0
    if getattr(scaler, 'n_features_in_', 1) == 2:
        scaled = scaler.transform(np.array([[calories, minutes]], dtype=float))
        calories_scaled, time_scaled = scaled[0, 0], scaled[0, 1]
    else:
        # Legacy single-column scaler: keep accepting it so previously
        # stored scalers do not start raising.
        calories_scaled = scaler.transform(np.array([[calories]], dtype=float))[0, 0]
        time_scaled = scaler.transform(np.array([[minutes]], dtype=float))[0, 0]

    query_vector[0, position] = calories_scaled * feature_weights['calories']
    position += 1
    query_vector[0, position] = time_scaled * feature_weights['time']
    position += 1

    position = _place_tfidf_segment(
        query_vector, position, tfidf_vectorizer_keywords,
        kwargs.get('keywords'), feature_weights, 'keywords'
    )
    _place_tfidf_segment(
        query_vector, position, tfidf_vectorizer_keywords_name,
        kwargs.get('keywords_name'), feature_weights, 'keywords_name'
    )

    return query_vector