Spaces:

Liori25
/

CookBookAI

Sleeping

App Files Files Community

Liori25 committed on Jan 21

Commit

dab9d95

verified ·

1 Parent(s): 35b320e

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -53

app.py CHANGED Viewed

@@ -1,48 +1,63 @@
 import gradio as gr
 import pandas as pd
 import numpy as np
 import os
 import random
 import base64
 from huggingface_hub import InferenceClient
 from sklearn.metrics.pairwise import cosine_similarity
-from datasets import load_dataset  # Added for HF Dataset loading
 from IO_pipeline import RecipeDigitalizerPipeline
 # ==========================================
-# 1. SETUP & DATA LOADING (UPDATED)
 # ==========================================
 hf_token = os.getenv("HF_TOKEN")
 API_MODEL = "BAAI/bge-small-en-v1.5"
 client = InferenceClient(token=hf_token) if hf_token else None
-print("⏳ Loading Data from Hugging Face...")
 try:
-    # Load dataset from Hugging Face
-    # We load the 'train' split by default.
     dataset = load_dataset("Liori25/10k_recipes", split="train")
-    # Convert to Pandas DataFrame
     df_recipes = dataset.to_pandas()
-    # Extract Embeddings
-    # We look for common names for the embedding column
-    target_col = next((c for c in ['embedding', 'embeddings', 'vectors'] if c in df_recipes.columns), None)
-    if target_col:
-        # Convert the column of lists into a 2D numpy array
-        # This handles the conversion from the HF list format to the numpy matrix required for cosine_similarity
-        stored_embeddings = np.vstack(df_recipes[target_col].values)
-        print(f"✅ Data Loaded from HF! Shape: {stored_embeddings.shape}")
     else:
-        print("⚠️ No embedding column found in dataset.")
         stored_embeddings = None
 except Exception as e:
-    print(f"❌ Error loading data: {e}")
-    df_recipes = pd.DataFrame({'Title': [], 'Raw_Output': []})
     stored_embeddings = None
 # ==========================================
 # 2. HELPER: IMAGE TO BASE64
 # ==========================================
@@ -65,14 +80,17 @@ def get_embedding_via_api(text):
     return np.array(response)
 def find_similar_recipes_list(query_text):
-    if stored_embeddings is None: return ["Database error."] * 3
     query_vec = get_embedding_via_api("Represent this recipe for retrieving similar dishes: " + query_text)
     if len(query_vec.shape) == 1: query_vec = query_vec.reshape(1, -1)
     scores = cosine_similarity(query_vec, stored_embeddings)[0]
     top_indices = scores.argsort()[-3:][::-1]
-    # Identify the correct column names dynamically
     cols = df_recipes.columns
     ing_col = next((c for c in cols if 'ingredient' in c.lower()), None)
     inst_col = next((c for c in cols if 'instruction' in c.lower()), None)
@@ -81,28 +99,12 @@ def find_similar_recipes_list(query_text):
     # --- HELPER TO CHECK FOR ERRORS ---
     def clean_and_validate(raw_text):
-        """
-        Returns cleaned text if valid, None if empty/error/nan.
-        """
         val = str(raw_text).strip()
-        # 1. Clean list syntax if present like "['item']"
         if val.startswith("[") and val.endswith("]"):
-            val = val[1:-1].replace("'", "").replace('"', "")
-            val = val.strip()
-        # 2. Check for invalid states
         val_lower = val.lower()
-        invalid_markers = ['nan', 'none', 'null', '[]', '']
-        # If it matches a standard "empty" marker
-        if val_lower in invalid_markers:
             return None
-        # If it explicitly says "error" or "parse" (catch scraper errors)
-        if "error" in val_lower or "parse" in val_lower:
-            return None
         return val
     for idx in top_indices:
@@ -111,29 +113,19 @@ def find_similar_recipes_list(query_text):
         title = row.get('Title', 'Unknown Recipe')
         score_display = f"{score:.3%}"
-        # Build the content block
         content_parts = []
-        # 1. Ingredients
         if ing_col:
             cleaned_ing = clean_and_validate(row[ing_col])
-            if cleaned_ing:
-                content_parts.append(f"<b>🛒 INGREDIENTS:</b><br>{cleaned_ing}")
-        # 2. Instructions
         if inst_col:
             cleaned_inst = clean_and_validate(row[inst_col])
-            if cleaned_inst:
-                content_parts.append(f"<b>🍳 INSTRUCTIONS:</b><br>{cleaned_inst}")
-        # Fallback if neither found (or both were errors)
         if not content_parts:
-            # We also validate the raw output fallback to ensure we don't show an error dump
             raw_out = str(row.get('Raw_Output', 'No details available.'))
-            if "error" not in raw_out.lower():
-                display_text = raw_out
-            else:
-                display_text = "<i>Details unavailable for this recipe.</i>"
         else:
             display_text = "<br><br>".join(content_parts)
@@ -455,7 +447,6 @@ with gr.Blocks(title="CookBook AI") as demo:
                         time_options = ["2h", "3h", "4h", "6h", "9h", "12h", "a day ago", "2 days ago"]
                         post_time = random.choice(time_options)
-                        # Fallback for feed display
                         raw_desc = str(row.get('Raw_Output', 'Delicious recipe...'))[:250]
                         title_feed = row.get('Title', 'Recipe')

 import gradio as gr
 import pandas as pd
+import pickle
 import numpy as np
 import os
 import random
 import base64
 from huggingface_hub import InferenceClient
 from sklearn.metrics.pairwise import cosine_similarity
+from datasets import load_dataset
 from IO_pipeline import RecipeDigitalizerPipeline
 # ==========================================
+# 1. SETUP & DATA LOADING (HYBRID)
 # ==========================================
 hf_token = os.getenv("HF_TOKEN")
 API_MODEL = "BAAI/bge-small-en-v1.5"
 client = InferenceClient(token=hf_token) if hf_token else None
+print("⏳ Initializing Data Loading...")
+# --- A. Load Text Data from Hugging Face Dataset ---
 try:
+    print("   ...Downloading recipes from HF Dataset (Liori25/10k_recipes)")
     dataset = load_dataset("Liori25/10k_recipes", split="train")
     df_recipes = dataset.to_pandas()
+    print(f"✅ Recipes Loaded! Count: {len(df_recipes)}")
+except Exception as e:
+    print(f"❌ Error loading HF Dataset: {e}")
+    df_recipes = pd.DataFrame({'Title': [], 'Raw_Output': []})
+# --- B. Load Embeddings from Local File (Space Repo) ---
+try:
+    print("   ...Loading embeddings from local 'recipe_embeddings.pkl'")
+    if os.path.exists('recipe_embeddings.pkl'):
+        with open('recipe_embeddings.pkl', 'rb') as f:
+            data = pickle.load(f)
+            # Logic to handle different pickle formats
+            if isinstance(data, dict):
+                stored_embeddings = np.array(data['embeddings'])
+            elif isinstance(data, pd.DataFrame):
+                target_col = next((c for c in ['embedding', 'embeddings', 'vectors'] if c in data.columns), None)
+                stored_embeddings = np.vstack(data[target_col].values) if target_col else data
+            else:
+                stored_embeddings = data
+        print(f"✅ Embeddings Loaded! Shape: {stored_embeddings.shape}")
     else:
+        print("❌ 'recipe_embeddings.pkl' not found locally.")
         stored_embeddings = None
 except Exception as e:
+    print(f"❌ Error loading pickle file: {e}")
     stored_embeddings = None
+# --- C. Safety Check ---
+if stored_embeddings is not None and not df_recipes.empty:
+    if len(stored_embeddings) != len(df_recipes):
+        print(f"⚠️ WARNING: Row mismatch! Recipes: {len(df_recipes)}, Embeddings: {len(stored_embeddings)}")
 # ==========================================
 # 2. HELPER: IMAGE TO BASE64
 # ==========================================
     return np.array(response)
 def find_similar_recipes_list(query_text):
+    if stored_embeddings is None: return ["Database error: Embeddings missing."] * 3
+    if df_recipes.empty: return ["Database error: Recipes missing."] * 3
     query_vec = get_embedding_via_api("Represent this recipe for retrieving similar dishes: " + query_text)
     if len(query_vec.shape) == 1: query_vec = query_vec.reshape(1, -1)
+    # Calculate Similarity
     scores = cosine_similarity(query_vec, stored_embeddings)[0]
     top_indices = scores.argsort()[-3:][::-1]
+    # Identify column names
     cols = df_recipes.columns
     ing_col = next((c for c in cols if 'ingredient' in c.lower()), None)
     inst_col = next((c for c in cols if 'instruction' in c.lower()), None)
     # --- HELPER TO CHECK FOR ERRORS ---
     def clean_and_validate(raw_text):
         val = str(raw_text).strip()
         if val.startswith("[") and val.endswith("]"):
+            val = val[1:-1].replace("'", "").replace('"', "").strip()
         val_lower = val.lower()
+        if val_lower in ['nan', 'none', 'null', '[]', '', 'error']:
             return None
         return val
     for idx in top_indices:
         title = row.get('Title', 'Unknown Recipe')
         score_display = f"{score:.3%}"
         content_parts = []
         if ing_col:
             cleaned_ing = clean_and_validate(row[ing_col])
+            if cleaned_ing: content_parts.append(f"<b>🛒 INGREDIENTS:</b><br>{cleaned_ing}")
         if inst_col:
             cleaned_inst = clean_and_validate(row[inst_col])
+            if cleaned_inst: content_parts.append(f"<b>🍳 INSTRUCTIONS:</b><br>{cleaned_inst}")
         if not content_parts:
             raw_out = str(row.get('Raw_Output', 'No details available.'))
+            display_text = raw_out if "error" not in raw_out.lower() else "<i>Details unavailable.</i>"
         else:
             display_text = "<br><br>".join(content_parts)
                         time_options = ["2h", "3h", "4h", "6h", "9h", "12h", "a day ago", "2 days ago"]
                         post_time = random.choice(time_options)
                         raw_desc = str(row.get('Raw_Output', 'Delicious recipe...'))[:250]
                         title_feed = row.get('Title', 'Recipe')