Liori25 committed on
Commit
dab9d95
·
verified ·
1 Parent(s): 35b320e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -53
app.py CHANGED
@@ -1,48 +1,63 @@
1
  import gradio as gr
2
  import pandas as pd
 
3
  import numpy as np
4
  import os
5
  import random
6
  import base64
7
  from huggingface_hub import InferenceClient
8
  from sklearn.metrics.pairwise import cosine_similarity
9
- from datasets import load_dataset # Added for HF Dataset loading
10
  from IO_pipeline import RecipeDigitalizerPipeline
11
 
12
  # ==========================================
13
- # 1. SETUP & DATA LOADING (UPDATED)
14
  # ==========================================
15
  hf_token = os.getenv("HF_TOKEN")
16
  API_MODEL = "BAAI/bge-small-en-v1.5"
17
  client = InferenceClient(token=hf_token) if hf_token else None
18
 
19
- print("⏳ Loading Data from Hugging Face...")
 
 
20
  try:
21
- # Load dataset from Hugging Face
22
- # We load the 'train' split by default.
23
  dataset = load_dataset("Liori25/10k_recipes", split="train")
24
-
25
- # Convert to Pandas DataFrame
26
  df_recipes = dataset.to_pandas()
 
 
 
 
27
 
28
- # Extract Embeddings
29
- # We look for common names for the embedding column
30
- target_col = next((c for c in ['embedding', 'embeddings', 'vectors'] if c in df_recipes.columns), None)
31
-
32
- if target_col:
33
- # Convert the column of lists into a 2D numpy array
34
- # This handles the conversion from the HF list format to the numpy matrix required for cosine_similarity
35
- stored_embeddings = np.vstack(df_recipes[target_col].values)
36
- print(f"✅ Data Loaded from HF! Shape: {stored_embeddings.shape}")
 
 
 
 
 
 
 
37
  else:
38
- print("⚠️ No embedding column found in dataset.")
39
  stored_embeddings = None
40
 
41
  except Exception as e:
42
- print(f"❌ Error loading data: {e}")
43
- df_recipes = pd.DataFrame({'Title': [], 'Raw_Output': []})
44
  stored_embeddings = None
45
 
 
 
 
 
 
46
  # ==========================================
47
  # 2. HELPER: IMAGE TO BASE64
48
  # ==========================================
@@ -65,14 +80,17 @@ def get_embedding_via_api(text):
65
  return np.array(response)
66
 
67
  def find_similar_recipes_list(query_text):
68
- if stored_embeddings is None: return ["Database error."] * 3
 
 
69
  query_vec = get_embedding_via_api("Represent this recipe for retrieving similar dishes: " + query_text)
70
  if len(query_vec.shape) == 1: query_vec = query_vec.reshape(1, -1)
71
 
 
72
  scores = cosine_similarity(query_vec, stored_embeddings)[0]
73
  top_indices = scores.argsort()[-3:][::-1]
74
 
75
- # Identify the correct column names dynamically
76
  cols = df_recipes.columns
77
  ing_col = next((c for c in cols if 'ingredient' in c.lower()), None)
78
  inst_col = next((c for c in cols if 'instruction' in c.lower()), None)
@@ -81,28 +99,12 @@ def find_similar_recipes_list(query_text):
81
 
82
  # --- HELPER TO CHECK FOR ERRORS ---
83
  def clean_and_validate(raw_text):
84
- """
85
- Returns cleaned text if valid, None if empty/error/nan.
86
- """
87
  val = str(raw_text).strip()
88
-
89
- # 1. Clean list syntax if present like "['item']"
90
  if val.startswith("[") and val.endswith("]"):
91
- val = val[1:-1].replace("'", "").replace('"', "")
92
- val = val.strip()
93
-
94
- # 2. Check for invalid states
95
  val_lower = val.lower()
96
- invalid_markers = ['nan', 'none', 'null', '[]', '']
97
-
98
- # If it matches a standard "empty" marker
99
- if val_lower in invalid_markers:
100
  return None
101
-
102
- # If it explicitly says "error" or "parse" (catch scraper errors)
103
- if "error" in val_lower or "parse" in val_lower:
104
- return None
105
-
106
  return val
107
 
108
  for idx in top_indices:
@@ -111,29 +113,19 @@ def find_similar_recipes_list(query_text):
111
  title = row.get('Title', 'Unknown Recipe')
112
  score_display = f"{score:.3%}"
113
 
114
- # Build the content block
115
  content_parts = []
116
 
117
- # 1. Ingredients
118
  if ing_col:
119
  cleaned_ing = clean_and_validate(row[ing_col])
120
- if cleaned_ing:
121
- content_parts.append(f"<b>🛒 INGREDIENTS:</b><br>{cleaned_ing}")
122
 
123
- # 2. Instructions
124
  if inst_col:
125
  cleaned_inst = clean_and_validate(row[inst_col])
126
- if cleaned_inst:
127
- content_parts.append(f"<b>🍳 INSTRUCTIONS:</b><br>{cleaned_inst}")
128
 
129
- # Fallback if neither found (or both were errors)
130
  if not content_parts:
131
- # We also validate the raw output fallback to ensure we don't show an error dump
132
  raw_out = str(row.get('Raw_Output', 'No details available.'))
133
- if "error" not in raw_out.lower():
134
- display_text = raw_out
135
- else:
136
- display_text = "<i>Details unavailable for this recipe.</i>"
137
  else:
138
  display_text = "<br><br>".join(content_parts)
139
 
@@ -455,7 +447,6 @@ with gr.Blocks(title="CookBook AI") as demo:
455
  time_options = ["2h", "3h", "4h", "6h", "9h", "12h", "a day ago", "2 days ago"]
456
  post_time = random.choice(time_options)
457
 
458
- # Fallback for feed display
459
  raw_desc = str(row.get('Raw_Output', 'Delicious recipe...'))[:250]
460
  title_feed = row.get('Title', 'Recipe')
461
 
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import pickle
4
  import numpy as np
5
  import os
6
  import random
7
  import base64
8
  from huggingface_hub import InferenceClient
9
  from sklearn.metrics.pairwise import cosine_similarity
10
+ from datasets import load_dataset
11
  from IO_pipeline import RecipeDigitalizerPipeline
12
 
13
  # ==========================================
14
+ # 1. SETUP & DATA LOADING (HYBRID)
15
  # ==========================================
16
  hf_token = os.getenv("HF_TOKEN")
17
  API_MODEL = "BAAI/bge-small-en-v1.5"
18
  client = InferenceClient(token=hf_token) if hf_token else None
19
 
20
+ print("⏳ Initializing Data Loading...")
21
+
22
+ # --- A. Load Text Data from Hugging Face Dataset ---
23
  try:
24
+ print(" ...Downloading recipes from HF Dataset (Liori25/10k_recipes)")
 
25
  dataset = load_dataset("Liori25/10k_recipes", split="train")
 
 
26
  df_recipes = dataset.to_pandas()
27
+ print(f"✅ Recipes Loaded! Count: {len(df_recipes)}")
28
+ except Exception as e:
29
+ print(f"❌ Error loading HF Dataset: {e}")
30
+ df_recipes = pd.DataFrame({'Title': [], 'Raw_Output': []})
31
 
32
+ # --- B. Load Embeddings from Local File (Space Repo) ---
33
+ try:
34
+ print(" ...Loading embeddings from local 'recipe_embeddings.pkl'")
35
+ if os.path.exists('recipe_embeddings.pkl'):
36
+ with open('recipe_embeddings.pkl', 'rb') as f:
37
+ data = pickle.load(f)
38
+
39
+ # Logic to handle different pickle formats
40
+ if isinstance(data, dict):
41
+ stored_embeddings = np.array(data['embeddings'])
42
+ elif isinstance(data, pd.DataFrame):
43
+ target_col = next((c for c in ['embedding', 'embeddings', 'vectors'] if c in data.columns), None)
44
+ stored_embeddings = np.vstack(data[target_col].values) if target_col else data
45
+ else:
46
+ stored_embeddings = data
47
+ print(f"✅ Embeddings Loaded! Shape: {stored_embeddings.shape}")
48
  else:
49
+ print("❌ 'recipe_embeddings.pkl' not found locally.")
50
  stored_embeddings = None
51
 
52
  except Exception as e:
53
+ print(f"❌ Error loading pickle file: {e}")
 
54
  stored_embeddings = None
55
 
56
+ # --- C. Safety Check ---
57
+ if stored_embeddings is not None and not df_recipes.empty:
58
+ if len(stored_embeddings) != len(df_recipes):
59
+ print(f"⚠️ WARNING: Row mismatch! Recipes: {len(df_recipes)}, Embeddings: {len(stored_embeddings)}")
60
+
61
  # ==========================================
62
  # 2. HELPER: IMAGE TO BASE64
63
  # ==========================================
 
80
  return np.array(response)
81
 
82
  def find_similar_recipes_list(query_text):
83
+ if stored_embeddings is None: return ["Database error: Embeddings missing."] * 3
84
+ if df_recipes.empty: return ["Database error: Recipes missing."] * 3
85
+
86
  query_vec = get_embedding_via_api("Represent this recipe for retrieving similar dishes: " + query_text)
87
  if len(query_vec.shape) == 1: query_vec = query_vec.reshape(1, -1)
88
 
89
+ # Calculate Similarity
90
  scores = cosine_similarity(query_vec, stored_embeddings)[0]
91
  top_indices = scores.argsort()[-3:][::-1]
92
 
93
+ # Identify column names
94
  cols = df_recipes.columns
95
  ing_col = next((c for c in cols if 'ingredient' in c.lower()), None)
96
  inst_col = next((c for c in cols if 'instruction' in c.lower()), None)
 
99
 
100
  # --- HELPER TO CHECK FOR ERRORS ---
101
  def clean_and_validate(raw_text):
 
 
 
102
  val = str(raw_text).strip()
 
 
103
  if val.startswith("[") and val.endswith("]"):
104
+ val = val[1:-1].replace("'", "").replace('"', "").strip()
 
 
 
105
  val_lower = val.lower()
106
+ if val_lower in ['nan', 'none', 'null', '[]', '', 'error']:
 
 
 
107
  return None
 
 
 
 
 
108
  return val
109
 
110
  for idx in top_indices:
 
113
  title = row.get('Title', 'Unknown Recipe')
114
  score_display = f"{score:.3%}"
115
 
 
116
  content_parts = []
117
 
 
118
  if ing_col:
119
  cleaned_ing = clean_and_validate(row[ing_col])
120
+ if cleaned_ing: content_parts.append(f"<b>🛒 INGREDIENTS:</b><br>{cleaned_ing}")
 
121
 
 
122
  if inst_col:
123
  cleaned_inst = clean_and_validate(row[inst_col])
124
+ if cleaned_inst: content_parts.append(f"<b>🍳 INSTRUCTIONS:</b><br>{cleaned_inst}")
 
125
 
 
126
  if not content_parts:
 
127
  raw_out = str(row.get('Raw_Output', 'No details available.'))
128
+ display_text = raw_out if "error" not in raw_out.lower() else "<i>Details unavailable.</i>"
 
 
 
129
  else:
130
  display_text = "<br><br>".join(content_parts)
131
 
 
447
  time_options = ["2h", "3h", "4h", "6h", "9h", "12h", "a day ago", "2 days ago"]
448
  post_time = random.choice(time_options)
449
 
 
450
  raw_desc = str(row.get('Raw_Output', 'Delicious recipe...'))[:250]
451
  title_feed = row.get('Title', 'Recipe')
452