kingking111009 committed on
Commit
be9504b
·
1 Parent(s): b079cdb

Remove all fallback data and add rating-based recipe filtering

Browse files

- Remove all mock/fallback recipe data
- Add proper rating filter using RAW_interactions.csv
- Only return recipes with rating >= 4.0 and >= 2 reviews
- Service fails cleanly if database cannot be loaded
- No fake data - real database or nothing

🤖 Generated with [Claude Code](https://claude.ai/code)

Files changed (1) hide show
  1. app.py +58 -63
app.py CHANGED
@@ -34,6 +34,7 @@ app.add_middleware(
34
  tokenizer = None
35
  model = None
36
  recipes_df = None
 
37
  vectorizer = None
38
  recipe_vectors = None
39
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -77,9 +78,49 @@ def safe_eval_list(x):
77
  return [item.strip() for item in x.split(',') if item.strip()]
78
  return []
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def load_recipes():
81
- """Load and process the RAW_recipes.csv file from Hugging Face dataset"""
82
- global recipes_df, vectorizer, recipe_vectors
83
 
84
  try:
85
  # Try to load from Hugging Face dataset directly
@@ -115,6 +156,15 @@ def load_recipes():
115
 
116
  df = pd.read_csv(local_csv)
117
  print(f"βœ… Successfully downloaded and loaded {len(df)} recipes from CSV!")
 
 
 
 
 
 
 
 
 
118
  except Exception as hf_error:
119
  print(f"⚠️ Both Hugging Face methods failed: {hf_error}")
120
 
@@ -149,6 +199,11 @@ def load_recipes():
149
  missing_cols = [col for col in required_cols if col not in df.columns]
150
  if missing_cols:
151
  raise ValueError(f"Missing required columns: {missing_cols}")
 
 
 
 
 
152
 
153
  # Parse string lists
154
  df['ingredients'] = df['ingredients'].apply(safe_eval_list)
@@ -191,67 +246,7 @@ def load_recipes():
191
  except Exception as e:
192
  print(f"❌ Error loading recipes: {e}")
193
  print(f"πŸ“ Error details: {type(e).__name__}: {str(e)}")
194
-
195
- # Create a more comprehensive fallback dataset
196
- print("πŸ”„ Creating fallback recipe dataset...")
197
- recipes_df = pd.DataFrame({
198
- 'id': [234567, 458976, 123789, 345678, 567890],
199
- 'name': [
200
- '15-Minute Pasta Aglio e Olio',
201
- 'Lemon Herb Grilled Chicken',
202
- 'Rainbow Buddha Bowl',
203
- 'Mediterranean Quinoa Salad',
204
- 'Classic Caesar Salad'
205
- ],
206
- 'minutes': [15, 25, 30, 20, 10],
207
- 'ingredients': [
208
- ['1 lb spaghetti', '6 cloves garlic (sliced)', '1/2 cup olive oil', '1/4 cup fresh parsley', 'red pepper flakes'],
209
- ['4 chicken breasts', '2 lemons (juiced)', '2 tbsp olive oil', '2 tsp dried herbs', 'salt and pepper'],
210
- ['1 cup quinoa', '2 cups mixed vegetables', '3 tbsp tahini', '1 lemon (juiced)', '2 tbsp olive oil'],
211
- ['2 cups cooked quinoa', '1 cup cherry tomatoes', '1 cucumber (diced)', '1/2 cup olives', '3 tbsp olive oil'],
212
- ['1 large romaine lettuce', '1/2 cup parmesan cheese', '1/4 cup caesar dressing', '1/2 cup croutons', 'black pepper']
213
- ],
214
- 'steps': [
215
- ['Cook pasta until al dente', 'Heat oil and sautΓ© garlic until golden', 'Toss pasta with oil and garlic', 'Add parsley and pepper flakes'],
216
- ['Marinate chicken in lemon juice and herbs for 30 minutes', 'Heat grill to medium-high heat', 'Grill chicken 6-8 minutes per side', 'Rest for 5 minutes before serving'],
217
- ['Cook quinoa according to package directions', 'Roast vegetables at 400Β°F for 25 minutes', 'Whisk tahini with lemon juice', 'Assemble bowl and drizzle with dressing'],
218
- ['Cool cooked quinoa completely', 'Dice all vegetables', 'Combine quinoa and vegetables', 'Dress with olive oil and lemon'],
219
- ['Wash and chop romaine lettuce', 'Toss with caesar dressing', 'Top with parmesan and croutons', 'Season with black pepper']
220
- ],
221
- 'tags': [['quick', 'italian', 'pasta'], ['healthy', 'protein', 'grilled'], ['vegetarian', 'healthy', 'bowl'], ['vegetarian', 'mediterranean', 'salad'], ['salad', 'classic', 'vegetarian']],
222
- 'nutrition': [[], [], [], [], []],
223
- 'description': [
224
- 'A classic Italian dish that\'s simple yet delicious.',
225
- 'Fresh and flavorful grilled chicken with herbs and bright lemon flavor.',
226
- 'A nutritious and colorful bowl packed with healthy ingredients.',
227
- 'A protein-rich salad with fresh vegetables and herbs.',
228
- 'A classic caesar salad with crisp romaine and parmesan.'
229
- ]
230
- })
231
-
232
- # Process the fallback dataset the same way
233
- recipes_df['ingredients_text'] = recipes_df['ingredients'].apply(lambda x: ' '.join(x).lower())
234
- recipes_df['steps_text'] = recipes_df['steps'].apply(lambda x: ' '.join(x).lower())
235
- recipes_df['tags_text'] = recipes_df['tags'].apply(lambda x: ' '.join(x).lower())
236
- recipes_df['search_text'] = (
237
- recipes_df['name'].str.lower() + ' ' +
238
- recipes_df['ingredients_text'] + ' ' +
239
- recipes_df['tags_text'] + ' ' +
240
- recipes_df['description'].fillna('').str.lower()
241
- )
242
-
243
- # Create simple vectorizer for fallback
244
- print("πŸ” Building fallback search index...")
245
- vectorizer = TfidfVectorizer(
246
- max_features=1000,
247
- stop_words='english',
248
- ngram_range=(1, 2),
249
- min_df=1
250
- )
251
- recipe_vectors = vectorizer.fit_transform(recipes_df['search_text'])
252
-
253
- print(f"βœ… Fallback dataset ready with {len(recipes_df)} recipes!")
254
- return # Exit early for fallback dataset
255
 
256
  @torch.inference_mode()
257
  def extract_query_features_with_gpt2(query_text, preferences="", max_minutes=30):
 
34
  tokenizer = None
35
  model = None
36
  recipes_df = None
37
+ interactions_df = None
38
  vectorizer = None
39
  recipe_vectors = None
40
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
78
  return [item.strip() for item in x.split(',') if item.strip()]
79
  return []
80
 
81
def filter_by_ratings(recipes_df, interactions_df, min_rating=4.0, min_reviews=2):
    """Filter recipes down to those the community rated well.

    Args:
        recipes_df: DataFrame of recipes; must contain an 'id' column.
        interactions_df: DataFrame of user interactions; must contain
            'recipe_id' and 'rating' columns (one row per rating).
        min_rating: minimum average rating a recipe must reach (default 4.0).
        min_reviews: minimum number of ratings a recipe must have (default 2).

    Returns:
        A new DataFrame with only the qualifying recipes, plus two extra
        columns: 'avg_rating' (rounded to 1 decimal) and 'rating_count'.
        Recipes that have no interactions at all are dropped (inner join).

    Raises:
        Exception: if the aggregation or merge fails (e.g. a missing
            column); the original error is chained as the cause.
    """
    try:
        print(f"πŸ“Š Processing {len(interactions_df)} interactions for rating filter...")

        # Per-recipe average rating and number of ratings. Named aggregation
        # keeps the columns flat (no MultiIndex to rename afterwards).
        # NOTE: the old per-group 'review'-length lambda was removed — its
        # result was never used by the filter or returned to the caller,
        # and it was by far the most expensive part of this groupby.
        recipe_stats = (
            interactions_df.groupby('recipe_id')['rating']
            .agg(avg_rating='mean', rating_count='count')
            .reset_index()
        )

        # Keep only recipes that are both well-rated and rated often enough.
        high_quality = recipe_stats[
            (recipe_stats['avg_rating'] >= min_rating) &
            (recipe_stats['rating_count'] >= min_reviews)
        ]

        print(f"πŸ† Found {len(high_quality)} recipes with rating >= {min_rating} and >= {min_reviews} reviews")

        # Inner join: recipes without qualifying stats are discarded.
        filtered_recipes = recipes_df.merge(
            high_quality[['recipe_id', 'avg_rating', 'rating_count']],
            left_on='id',
            right_on='recipe_id',
            how='inner'
        )

        # Present a tidy one-decimal rating to downstream consumers.
        filtered_recipes['avg_rating'] = filtered_recipes['avg_rating'].round(1)

        print(f"βœ… Quality filter complete: {len(filtered_recipes)} highly-rated recipes")
        return filtered_recipes

    except Exception as e:
        print(f"⚠️ Rating filter failed: {e}")
        # Chain the cause so the full original traceback survives the re-raise.
        raise Exception(f"Failed to apply rating filter: {e}") from e
121
  def load_recipes():
122
+ """Load and process both RAW_recipes.csv and RAW_interactions.csv with rating filtering"""
123
+ global recipes_df, interactions_df, vectorizer, recipe_vectors
124
 
125
  try:
126
  # Try to load from Hugging Face dataset directly
 
156
 
157
  df = pd.read_csv(local_csv)
158
  print(f"βœ… Successfully downloaded and loaded {len(df)} recipes from CSV!")
159
+
160
+ # Also download interactions CSV for rating filtering
161
+ interactions_url = "https://huggingface.co/datasets/nutrientartcd/recipe-dataset/resolve/main/RAW_interactions.csv"
162
+ local_interactions = "/tmp/RAW_interactions_downloaded.csv"
163
+
164
+ print("πŸ“Š Downloading interactions data for rating filtering...")
165
+ urllib.request.urlretrieve(interactions_url, local_interactions)
166
+ interactions_df = pd.read_csv(local_interactions)
167
+ print(f"βœ… Loaded {len(interactions_df)} interactions for rating filtering!")
168
  except Exception as hf_error:
169
  print(f"⚠️ Both Hugging Face methods failed: {hf_error}")
170
 
 
199
  missing_cols = [col for col in required_cols if col not in df.columns]
200
  if missing_cols:
201
  raise ValueError(f"Missing required columns: {missing_cols}")
202
+
203
+ # Filter recipes based on ratings from interactions
204
+ if interactions_df is not None:
205
+ df = filter_by_ratings(df, interactions_df)
206
+ print(f"πŸ“ˆ After rating filter: {len(df)} high-quality recipes remaining")
207
 
208
  # Parse string lists
209
  df['ingredients'] = df['ingredients'].apply(safe_eval_list)
 
246
  except Exception as e:
247
  print(f"❌ Error loading recipes: {e}")
248
  print(f"πŸ“ Error details: {type(e).__name__}: {str(e)}")
249
+ raise Exception(f"Failed to load recipe database: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  @torch.inference_mode()
252
  def extract_query_features_with_gpt2(query_text, preferences="", max_minutes=30):