# NOTE: the three lines below are Hugging Face Spaces page residue from the
# export ("Spaces: Sleeping"); commented out so the module remains valid Python.
# Spaces:
# Sleeping
# Sleeping
| import pandas as pd | |
| from ast import literal_eval | |
| from transformers import BertTokenizer, BertModel | |
| from torch import nn | |
| from torch.utils.data import Dataset, DataLoader | |
| import torch | |
| import os | |
| from sklearn.model_selection import train_test_split | |
| import random | |
| import re | |
def clean_text(text):
    """Normalize a text field: lowercase and collapse all whitespace runs.

    Non-string inputs (e.g. NaN floats coming from pandas) yield ''.
    """
    if not isinstance(text, str):
        return ''
    # split() with no arguments both trims the ends and collapses interior
    # whitespace, so a single join produces the normalized form.
    return ' '.join(text.lower().split()).strip()
def setup_tag_categories():
    """Return the keyword lists used to bucket food.com tags by theme.

    Keys are category names; values are lowercase keywords matched as
    substrings against recipe tags.
    """
    categories = {}
    categories['cuisine'] = [
        'italian', 'chinese', 'mexican', 'indian', 'french', 'greek', 'thai',
        'japanese', 'american', 'european', 'asian', 'mediterranean', 'spanish',
        'german', 'korean', 'vietnamese', 'turkish', 'moroccan', 'lebanese'
    ]
    categories['course'] = [
        'main-dish', 'side-dishes', 'appetizers', 'desserts', 'breakfast',
        'lunch', 'dinner', 'snacks', 'beverages', 'salads', 'soups'
    ]
    categories['main_ingredient'] = [
        'chicken', 'beef', 'pork', 'fish', 'seafood', 'vegetables', 'fruit',
        'pasta', 'rice', 'cheese', 'chocolate', 'potato', 'lamb', 'turkey',
        'beans', 'nuts', 'eggs', 'tofu'
    ]
    categories['dietary'] = [
        'vegetarian', 'vegan', 'gluten-free', 'low-carb', 'healthy', 'low-fat',
        'diabetic', 'dairy-free', 'keto', 'paleo', 'whole30'
    ]
    categories['cooking_method'] = [
        'oven', 'stove-top', 'no-cook', 'microwave', 'slow-cooker', 'grilling',
        'baking', 'roasting', 'frying', 'steaming', 'braising'
    ]
    categories['difficulty'] = ['easy', 'beginner-cook', 'advanced', 'intermediate', 'quick']
    categories['time'] = [
        '15-minutes-or-less', '30-minutes-or-less', '60-minutes-or-less',
        '4-hours-or-less', 'weeknight'
    ]
    categories['occasion'] = [
        'holiday-event', 'christmas', 'thanksgiving', 'valentines-day',
        'summer', 'winter', 'spring', 'fall', 'party', 'picnic'
    ]
    return categories
def setup_ingredient_groups():
    """Return keyword lists grouping common ingredients by food type.

    Used to rank cleaned ingredients (proteins first, then vegetables,
    grains/starches and dairy).
    """
    groups = {}
    groups['proteins'] = [
        'chicken', 'beef', 'pork', 'fish', 'salmon', 'tuna', 'shrimp', 'turkey',
        'lamb', 'bacon', 'ham', 'sausage', 'eggs', 'tofu', 'beans', 'lentils'
    ]
    groups['vegetables'] = [
        'onion', 'garlic', 'tomato', 'carrot', 'celery', 'pepper', 'mushroom',
        'spinach', 'broccoli', 'zucchini', 'potato', 'sweet potato'
    ]
    groups['grains_starches'] = [
        'rice', 'pasta', 'bread', 'flour', 'oats', 'quinoa', 'barley', 'noodles'
    ]
    groups['dairy'] = [
        'milk', 'butter', 'cheese', 'cream', 'yogurt', 'sour cream', 'cream cheese'
    ]
    return groups
def categorize_recipe_tags(recipe_tags, tag_categories):
    """Bucket each recipe tag into every category whose keywords it matches.

    A tag matches a category when any of that category's keywords appears as
    a substring of the lowercased tag; a single tag may therefore land in
    several buckets.

    Args:
        recipe_tags: iterable of raw tag strings.
        tag_categories: dict mapping category name -> list of keywords.

    Returns:
        Dict mapping every category name to the (possibly empty) list of
        matching tags, preserving the original tag text and order.
    """
    buckets = {name: [] for name in tag_categories}
    for tag in recipe_tags:
        lowered = tag.lower()
        for name, keywords in tag_categories.items():
            if any(keyword in lowered for keyword in keywords):
                buckets[name].append(tag)
    return buckets
# Loop-invariant cleaning vocabularies, hoisted out of the per-ingredient loop.
_DESCRIPTOR_WORDS = ('fresh', 'dried', 'chopped', 'minced', 'sliced', 'diced',
                     'ground', 'large', 'small', 'medium')
# Sorted longest-first: previously 'cup' was stripped before 'cups', turning
# "cups" into a stray "s" (same for the other singular/plural pairs).
_MEASUREMENT_WORDS = tuple(sorted(
    ('cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
     'pound', 'pounds', 'ounce', 'ounces'),
    key=len, reverse=True))
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def extract_main_ingredients(ingredients_list, ingredient_groups):
    """Clean raw ingredient strings and order them by importance.

    Each ingredient is lowercased, stripped of descriptor words, digits and
    measurement units, then ordered: proteins first, then vegetables, grains
    and dairy, then everything else.

    Args:
        ingredients_list: list of raw ingredient strings (anything else,
            including None, yields []).
        ingredient_groups: dict with 'proteins', 'vegetables',
            'grains_starches' and 'dairy' keyword lists.

    Returns:
        Ordered, de-duplicated list of cleaned ingredient strings.
    """
    if not ingredients_list or not isinstance(ingredients_list, list):
        return []

    cleaned_ingredients = []
    for ingredient in ingredients_list:
        text = str(ingredient) if ingredient is not None else ''
        if not text or text == 'nan':
            continue
        text = text.lower()
        # NOTE: plain substring removal, so e.g. 'ground' also hits
        # 'background'; kept to match the established cleaning behavior.
        for word in _DESCRIPTOR_WORDS:
            text = text.replace(word, '')
        text = _DIGITS_RE.sub('', text)
        for measurement in _MEASUREMENT_WORDS:
            text = text.replace(measurement, '')
        text = _WHITESPACE_RE.sub(' ', text).strip()
        # Discard empties and very short fragments left over after cleaning.
        if len(text) > 2:
            cleaned_ingredients.append(text)

    ordered_ingredients = []
    seen = set()  # O(1) membership instead of rescanning the list each time

    # Proteins first (most informative for retrieval).
    for ingredient in cleaned_ingredients:
        if any(protein in ingredient for protein in ingredient_groups['proteins']):
            ordered_ingredients.append(ingredient)
            seen.add(ingredient)

    # Then vegetables, grains/starches and dairy, skipping anything already placed.
    for group_name in ('vegetables', 'grains_starches', 'dairy'):
        keywords = ingredient_groups[group_name]
        for ingredient in cleaned_ingredients:
            if ingredient not in seen and any(k in ingredient for k in keywords):
                ordered_ingredients.append(ingredient)
                seen.add(ingredient)

    # Finally everything that matched no group.
    for ingredient in cleaned_ingredients:
        if ingredient not in seen:
            ordered_ingredients.append(ingredient)
            seen.add(ingredient)

    return ordered_ingredients
def create_structured_recipe_text(recipe, tag_categories, ingredient_groups):
    """Build the 'Recipe: ... Ingredients: ... Style: ...' text for one recipe.

    Tags are drawn from the high-signal categories first (two per category),
    topped up with up to three leftover tags that contain an important
    keyword; ingredients come from extract_main_ingredients.
    """
    tags = recipe['tags'] if isinstance(recipe['tags'], list) else []
    by_category = categorize_recipe_tags(tags, tag_categories)

    # Up to two tags from each high-signal category, in priority order.
    selected_tags = []
    for category in ('main_ingredient', 'cuisine', 'course', 'dietary', 'cooking_method'):
        if category in by_category:
            selected_tags.extend(by_category[category][:2])

    # Top up with leftover tags that carry an important keyword (max 3).
    important_keywords = ['easy', 'quick', 'healthy', 'spicy', 'sweet']
    extras = [
        tag for tag in tags
        if tag not in selected_tags
        and any(keyword in tag.lower() for keyword in important_keywords)
    ]
    selected_tags.extend(extras[:3])

    ingredients = recipe['ingredients'] if isinstance(recipe['ingredients'], list) else []
    main_ingredients = extract_main_ingredients(ingredients, ingredient_groups)

    ingredients_text = ', '.join(main_ingredients[:8])  # cap at 8 ingredients
    tags_text = ', '.join(selected_tags[:10])           # cap at 10 tags
    # NOTE(review): replace(' ', ' ') is a no-op as written — possibly meant
    # to collapse double spaces; kept as-is to preserve behavior.
    recipe_name = str(recipe['name']).replace(' ', ' ').strip()
    return f"Recipe: {recipe_name}. Ingredients: {ingredients_text}. Style: {tags_text}"
def create_pair_data(recipes_df: pd.DataFrame, interactions_df: pd.DataFrame, num_pairs: int = 15000):
    """Build (anchor, positive, negative) triplets for triplet-loss training.

    For each pair a random recipe is drawn; up to five of its tags joined by
    spaces form the anchor query, its structured text is the positive, and a
    recipe sharing at most two tags with the anchor is rejection-sampled as
    the negative (bounded attempts to avoid an infinite loop).

    Args:
        recipes_df: recipes with list-valued 'tags' and 'ingredients' columns.
        interactions_df: accepted for interface compatibility; currently unused.
        num_pairs: number of triplets to attempt.

    Returns:
        DataFrame with columns ['anchor', 'positive', 'negative'].
    """
    tag_categories = setup_tag_categories()
    ingredient_groups = setup_ingredient_groups()
    pair_data_list = []
    max_attempts_allowed = 100  # bound on the negative search per pair
    num_recipes = len(recipes_df)

    for pair_number in range(num_pairs):
        recipe = recipes_df.iloc[random.randrange(num_recipes)]
        recipe_tags = recipe['tags']

        # A random subset of up to 5 tags becomes the anchor query string.
        selected_tags = random.sample(recipe_tags, min(5, len(recipe_tags)))
        anchor = ' '.join(selected_tags)
        anchor_tags_set = set(anchor.split())

        positive_text = create_structured_recipe_text(recipe, tag_categories, ingredient_groups)

        # Rejection-sample a negative: accept the first candidate recipe that
        # shares at most 2 tags with the anchor.
        negative_text = None
        for _ in range(max_attempts_allowed):
            candidate = recipes_df.iloc[random.randrange(num_recipes)]
            overlap_count = len(anchor_tags_set & set(candidate['tags']))
            if overlap_count <= 2:
                negative_text = create_structured_recipe_text(candidate, tag_categories, ingredient_groups)
                break

        if negative_text is not None:
            pair_data_list.append((anchor, positive_text, negative_text))
        else:
            print(f"Could not find negative recipe for anchor '{anchor}' after {max_attempts_allowed} attempts")

        # Progress every 1000 pairs (the previous per-pair prints spammed the
        # log and the "Found all negative recipes" message was misleading).
        if (pair_number + 1) % 1000 == 0:
            print(f"Progress: Created {pair_number + 1}/{num_pairs} pairs")

    result_dataframe = pd.DataFrame(pair_data_list, columns=['anchor', 'positive', 'negative'])
    print(f"Final result: Created {len(result_dataframe)} pairs total")
    return result_dataframe
class pos_neg_pair_dataset(Dataset):
    """Dataset of (anchor, positive, negative) texts tokenized for BERT.

    Each item is a dict of fixed-length input_ids / attention_mask tensors
    for all three roles, ready for triplet-loss training.
    """

    def __init__(self, pair_data, tokenizer, max_length=128):
        # pair_data: DataFrame with 'anchor', 'positive', 'negative' columns.
        self.pair_data = pair_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.pair_data)

    def _encode(self, text):
        # Tokenize one string, padded/truncated to max_length, as pt tensors.
        return self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            padding='max_length')

    def __getitem__(self, idx):
        row = self.pair_data.iloc[idx]
        item = {}
        for role in ('anchor', 'positive', 'negative'):
            encoded = self._encode(row[role])
            # squeeze() drops the batch dimension added by return_tensors='pt'
            item[f'{role}_input_ids'] = encoded['input_ids'].squeeze()
            item[f'{role}_attention_mask'] = encoded['attention_mask'].squeeze()
        return item
def evaluate_model(model, val_loader):
    """Compute the average triplet-margin loss over a validation loader.

    Mirrors the training forward pass ([CLS] embeddings for anchor, positive
    and negative) but with gradients disabled.

    Args:
        model: BERT-style model whose output exposes `last_hidden_state`.
        val_loader: iterable of batches as produced by pos_neg_pair_dataset.

    Returns:
        float: the average per-batch loss. Previously only printed; returning
        it is backward compatible and lets callers use the value.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    criterion = nn.TripletMarginLoss(margin=1.0)
    total_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            anchor_outputs = model(batch['anchor_input_ids'].to(device),
                                   batch['anchor_attention_mask'].to(device))
            positive_outputs = model(batch['positive_input_ids'].to(device),
                                     batch['positive_attention_mask'].to(device))
            negative_outputs = model(batch['negative_input_ids'].to(device),
                                     batch['negative_attention_mask'].to(device))
            # [CLS] token embedding (position 0) represents the whole sequence.
            anchor_emb = anchor_outputs.last_hidden_state[:, 0, :]
            positive_emb = positive_outputs.last_hidden_state[:, 0, :]
            negative_emb = negative_outputs.last_hidden_state[:, 0, :]
            total_loss += criterion(anchor_emb, positive_emb, negative_emb).item()
    average_loss = total_loss / len(val_loader)
    print(f"Average loss on validation set: {average_loss:.4f}")
    return average_loss
def train_model(train_loader, num_epochs=3):
    """Fine-tune bert-base-uncased with a triplet-margin loss.

    Anchor/positive/negative batches are each run through BERT; the [CLS]
    embeddings feed a TripletMarginLoss optimized with AdamW.

    Args:
        train_loader: DataLoader yielding batches from pos_neg_pair_dataset.
        num_epochs: number of full passes over the training data.

    Returns:
        The trained BertModel.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    criterion = nn.TripletMarginLoss(margin=1.0)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            # Move ids and masks to the training device.
            anchor_input_ids = batch['anchor_input_ids'].to(device)
            anchor_attention_mask = batch['anchor_attention_mask'].to(device)
            positive_input_ids = batch['positive_input_ids'].to(device)
            positive_attention_mask = batch['positive_attention_mask'].to(device)
            negative_input_ids = batch['negative_input_ids'].to(device)
            negative_attention_mask = batch['negative_attention_mask'].to(device)
            # One forward pass per branch. (A redundant duplicate anchor pass
            # was removed here: its output was discarded, so it only doubled
            # anchor compute and activation memory without affecting training.)
            anchor_outputs = model(anchor_input_ids, anchor_attention_mask)
            positive_outputs = model(positive_input_ids, positive_attention_mask)
            negative_outputs = model(negative_input_ids, negative_attention_mask)
            # [CLS] token embedding (position 0) represents the whole sequence.
            anchor_emb = anchor_outputs.last_hidden_state[:, 0, :]
            positive_emb = positive_outputs.last_hidden_state[:, 0, :]
            negative_emb = negative_outputs.last_hidden_state[:, 0, :]
            loss = criterion(anchor_emb, positive_emb, negative_emb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Per-epoch average loss: total loss / number of batches.
        print(f'Epoch {epoch+1}, Average Loss: {total_loss/len(train_loader):.4f}')
    return model
if __name__ == '__main__':
    # Reuse the cached triplet dataset if it exists; otherwise build it.
    if os.path.exists('pair_data.parquet'):
        pair_data = pd.read_parquet('pair_data.parquet')
        print('Data loaded from pair_data.parquet')
    else:
        print("Loading recipe data")
        recipes_df = pd.read_csv('RAW_recipes.csv')
        recipes_df['name'] = recipes_df['name'].apply(clean_text)
        # 'tags' and 'ingredients' are stored as stringified Python lists.
        recipes_df['tags'] = recipes_df['tags'].apply(literal_eval)
        recipes_df['ingredients'] = recipes_df['ingredients'].apply(literal_eval)
        # Keep only recipes that have at least one tag.
        recipes_df = recipes_df[recipes_df['tags'].str.len() > 0]
        print("Loading interaction data")
        interactions_df = pd.read_csv('RAW_interactions.csv')
        # Drop missing ratings, coerce the rest to numeric, drop coercion failures.
        interactions_df = interactions_df.dropna(subset=['rating'])
        interactions_df['rating'] = pd.to_numeric(interactions_df['rating'], errors='coerce')
        interactions_df = interactions_df.dropna(subset=['rating'])
        pair_data = create_pair_data(recipes_df, interactions_df, num_pairs=15000)
        pair_data.to_parquet('pair_data.parquet', index=False)
        print('Data saved to pair_data.parquet')

    # 80/20 train/validation split with a fixed seed for reproducibility.
    train_data, val_data = train_test_split(pair_data, test_size=0.2, random_state=42)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_dataset = pos_neg_pair_dataset(train_data, tokenizer, max_length=128)
    val_dataset = pos_neg_pair_dataset(val_data, tokenizer, max_length=128)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    print("Starting training...")
    model = train_model(train_loader, num_epochs=3)

    print("Evaluating model...")
    evaluate_model(model, val_loader)

    torch.save(model.state_dict(), 'tag_based_bert_model.pth')
    print("Model saved to tag_based_bert_model.pth")
    print("Training Complete")