Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,48 +1,63 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import os
|
| 5 |
import random
|
| 6 |
import base64
|
| 7 |
from huggingface_hub import InferenceClient
|
| 8 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
-
from datasets import load_dataset
|
| 10 |
from IO_pipeline import RecipeDigitalizerPipeline
|
| 11 |
|
| 12 |
# ==========================================
|
| 13 |
-
# 1. SETUP & DATA LOADING (
|
| 14 |
# ==========================================
|
| 15 |
hf_token = os.getenv("HF_TOKEN")
|
| 16 |
API_MODEL = "BAAI/bge-small-en-v1.5"
|
| 17 |
client = InferenceClient(token=hf_token) if hf_token else None
|
| 18 |
|
| 19 |
-
print("β³
|
|
|
|
|
|
|
| 20 |
try:
|
| 21 |
-
|
| 22 |
-
# We load the 'train' split by default.
|
| 23 |
dataset = load_dataset("Liori25/10k_recipes", split="train")
|
| 24 |
-
|
| 25 |
-
# Convert to Pandas DataFrame
|
| 26 |
df_recipes = dataset.to_pandas()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
else:
|
| 38 |
-
print("
|
| 39 |
stored_embeddings = None
|
| 40 |
|
| 41 |
except Exception as e:
|
| 42 |
-
print(f"β Error loading
|
| 43 |
-
df_recipes = pd.DataFrame({'Title': [], 'Raw_Output': []})
|
| 44 |
stored_embeddings = None
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
# ==========================================
|
| 47 |
# 2. HELPER: IMAGE TO BASE64
|
| 48 |
# ==========================================
|
|
@@ -65,14 +80,17 @@ def get_embedding_via_api(text):
|
|
| 65 |
return np.array(response)
|
| 66 |
|
| 67 |
def find_similar_recipes_list(query_text):
|
| 68 |
-
if stored_embeddings is None: return ["Database error."] * 3
|
|
|
|
|
|
|
| 69 |
query_vec = get_embedding_via_api("Represent this recipe for retrieving similar dishes: " + query_text)
|
| 70 |
if len(query_vec.shape) == 1: query_vec = query_vec.reshape(1, -1)
|
| 71 |
|
|
|
|
| 72 |
scores = cosine_similarity(query_vec, stored_embeddings)[0]
|
| 73 |
top_indices = scores.argsort()[-3:][::-1]
|
| 74 |
|
| 75 |
-
# Identify
|
| 76 |
cols = df_recipes.columns
|
| 77 |
ing_col = next((c for c in cols if 'ingredient' in c.lower()), None)
|
| 78 |
inst_col = next((c for c in cols if 'instruction' in c.lower()), None)
|
|
@@ -81,28 +99,12 @@ def find_similar_recipes_list(query_text):
|
|
| 81 |
|
| 82 |
# --- HELPER TO CHECK FOR ERRORS ---
|
| 83 |
def clean_and_validate(raw_text):
|
| 84 |
-
"""
|
| 85 |
-
Returns cleaned text if valid, None if empty/error/nan.
|
| 86 |
-
"""
|
| 87 |
val = str(raw_text).strip()
|
| 88 |
-
|
| 89 |
-
# 1. Clean list syntax if present like "['item']"
|
| 90 |
if val.startswith("[") and val.endswith("]"):
|
| 91 |
-
val = val[1:-1].replace("'", "").replace('"', "")
|
| 92 |
-
val = val.strip()
|
| 93 |
-
|
| 94 |
-
# 2. Check for invalid states
|
| 95 |
val_lower = val.lower()
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
# If it matches a standard "empty" marker
|
| 99 |
-
if val_lower in invalid_markers:
|
| 100 |
return None
|
| 101 |
-
|
| 102 |
-
# If it explicitly says "error" or "parse" (catch scraper errors)
|
| 103 |
-
if "error" in val_lower or "parse" in val_lower:
|
| 104 |
-
return None
|
| 105 |
-
|
| 106 |
return val
|
| 107 |
|
| 108 |
for idx in top_indices:
|
|
@@ -111,29 +113,19 @@ def find_similar_recipes_list(query_text):
|
|
| 111 |
title = row.get('Title', 'Unknown Recipe')
|
| 112 |
score_display = f"{score:.3%}"
|
| 113 |
|
| 114 |
-
# Build the content block
|
| 115 |
content_parts = []
|
| 116 |
|
| 117 |
-
# 1. Ingredients
|
| 118 |
if ing_col:
|
| 119 |
cleaned_ing = clean_and_validate(row[ing_col])
|
| 120 |
-
if cleaned_ing:
|
| 121 |
-
content_parts.append(f"<b>π INGREDIENTS:</b><br>{cleaned_ing}")
|
| 122 |
|
| 123 |
-
# 2. Instructions
|
| 124 |
if inst_col:
|
| 125 |
cleaned_inst = clean_and_validate(row[inst_col])
|
| 126 |
-
if cleaned_inst:
|
| 127 |
-
content_parts.append(f"<b>🍳 INSTRUCTIONS:</b><br>{cleaned_inst}")
|
| 128 |
|
| 129 |
-
# Fallback if neither found (or both were errors)
|
| 130 |
if not content_parts:
|
| 131 |
-
# We also validate the raw output fallback to ensure we don't show an error dump
|
| 132 |
raw_out = str(row.get('Raw_Output', 'No details available.'))
|
| 133 |
-
if "error" not in raw_out.lower():
|
| 134 |
-
display_text = raw_out
|
| 135 |
-
else:
|
| 136 |
-
display_text = "<i>Details unavailable for this recipe.</i>"
|
| 137 |
else:
|
| 138 |
display_text = "<br><br>".join(content_parts)
|
| 139 |
|
|
@@ -455,7 +447,6 @@ with gr.Blocks(title="CookBook AI") as demo:
|
|
| 455 |
time_options = ["2h", "3h", "4h", "6h", "9h", "12h", "a day ago", "2 days ago"]
|
| 456 |
post_time = random.choice(time_options)
|
| 457 |
|
| 458 |
-
# Fallback for feed display
|
| 459 |
raw_desc = str(row.get('Raw_Output', 'Delicious recipe...'))[:250]
|
| 460 |
title_feed = row.get('Title', 'Recipe')
|
| 461 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
+
import pickle
|
| 4 |
import numpy as np
|
| 5 |
import os
|
| 6 |
import random
|
| 7 |
import base64
|
| 8 |
from huggingface_hub import InferenceClient
|
| 9 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 10 |
+
from datasets import load_dataset
|
| 11 |
from IO_pipeline import RecipeDigitalizerPipeline
|
| 12 |
|
| 13 |
# ==========================================
|
| 14 |
+
# 1. SETUP & DATA LOADING (HYBRID)
|
| 15 |
# ==========================================
|
| 16 |
hf_token = os.getenv("HF_TOKEN")
|
| 17 |
API_MODEL = "BAAI/bge-small-en-v1.5"
|
| 18 |
client = InferenceClient(token=hf_token) if hf_token else None
|
| 19 |
|
| 20 |
+
print("⏳ Initializing Data Loading...")
|
| 21 |
+
|
| 22 |
+
# --- A. Load Text Data from Hugging Face Dataset ---
|
| 23 |
try:
|
| 24 |
+
print(" ...Downloading recipes from HF Dataset (Liori25/10k_recipes)")
|
|
|
|
| 25 |
dataset = load_dataset("Liori25/10k_recipes", split="train")
|
|
|
|
|
|
|
| 26 |
df_recipes = dataset.to_pandas()
|
| 27 |
+
print(f"✅ Recipes Loaded! Count: {len(df_recipes)}")
|
| 28 |
+
except Exception as e:
|
| 29 |
+
print(f"❌ Error loading HF Dataset: {e}")
|
| 30 |
+
df_recipes = pd.DataFrame({'Title': [], 'Raw_Output': []})
|
| 31 |
|
| 32 |
+
# --- B. Load Embeddings from Local File (Space Repo) ---
|
| 33 |
+
try:
|
| 34 |
+
print(" ...Loading embeddings from local 'recipe_embeddings.pkl'")
|
| 35 |
+
if os.path.exists('recipe_embeddings.pkl'):
|
| 36 |
+
with open('recipe_embeddings.pkl', 'rb') as f:
|
| 37 |
+
data = pickle.load(f)
|
| 38 |
+
|
| 39 |
+
# Logic to handle different pickle formats
|
| 40 |
+
if isinstance(data, dict):
|
| 41 |
+
stored_embeddings = np.array(data['embeddings'])
|
| 42 |
+
elif isinstance(data, pd.DataFrame):
|
| 43 |
+
target_col = next((c for c in ['embedding', 'embeddings', 'vectors'] if c in data.columns), None)
|
| 44 |
+
stored_embeddings = np.vstack(data[target_col].values) if target_col else data
|
| 45 |
+
else:
|
| 46 |
+
stored_embeddings = data
|
| 47 |
+
print(f"✅ Embeddings Loaded! Shape: {stored_embeddings.shape}")
|
| 48 |
else:
|
| 49 |
+
print("❌ 'recipe_embeddings.pkl' not found locally.")
|
| 50 |
stored_embeddings = None
|
| 51 |
|
| 52 |
except Exception as e:
|
| 53 |
+
print(f"❌ Error loading pickle file: {e}")
|
|
|
|
| 54 |
stored_embeddings = None
|
| 55 |
|
| 56 |
+
# --- C. Safety Check ---
|
| 57 |
+
if stored_embeddings is not None and not df_recipes.empty:
|
| 58 |
+
if len(stored_embeddings) != len(df_recipes):
|
| 59 |
+
print(f"⚠️ WARNING: Row mismatch! Recipes: {len(df_recipes)}, Embeddings: {len(stored_embeddings)}")
|
| 60 |
+
|
| 61 |
# ==========================================
|
| 62 |
# 2. HELPER: IMAGE TO BASE64
|
| 63 |
# ==========================================
|
|
|
|
| 80 |
return np.array(response)
|
| 81 |
|
| 82 |
def find_similar_recipes_list(query_text):
|
| 83 |
+
if stored_embeddings is None: return ["Database error: Embeddings missing."] * 3
|
| 84 |
+
if df_recipes.empty: return ["Database error: Recipes missing."] * 3
|
| 85 |
+
|
| 86 |
query_vec = get_embedding_via_api("Represent this recipe for retrieving similar dishes: " + query_text)
|
| 87 |
if len(query_vec.shape) == 1: query_vec = query_vec.reshape(1, -1)
|
| 88 |
|
| 89 |
+
# Calculate Similarity
|
| 90 |
scores = cosine_similarity(query_vec, stored_embeddings)[0]
|
| 91 |
top_indices = scores.argsort()[-3:][::-1]
|
| 92 |
|
| 93 |
+
# Identify column names
|
| 94 |
cols = df_recipes.columns
|
| 95 |
ing_col = next((c for c in cols if 'ingredient' in c.lower()), None)
|
| 96 |
inst_col = next((c for c in cols if 'instruction' in c.lower()), None)
|
|
|
|
| 99 |
|
| 100 |
# --- HELPER TO CHECK FOR ERRORS ---
|
| 101 |
def clean_and_validate(raw_text):
|
|
|
|
|
|
|
|
|
|
| 102 |
val = str(raw_text).strip()
|
|
|
|
|
|
|
| 103 |
if val.startswith("[") and val.endswith("]"):
|
| 104 |
+
val = val[1:-1].replace("'", "").replace('"', "").strip()
|
|
|
|
|
|
|
|
|
|
| 105 |
val_lower = val.lower()
|
| 106 |
+
if val_lower in ['nan', 'none', 'null', '[]', '', 'error']:
|
|
|
|
|
|
|
|
|
|
| 107 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
return val
|
| 109 |
|
| 110 |
for idx in top_indices:
|
|
|
|
| 113 |
title = row.get('Title', 'Unknown Recipe')
|
| 114 |
score_display = f"{score:.3%}"
|
| 115 |
|
|
|
|
| 116 |
content_parts = []
|
| 117 |
|
|
|
|
| 118 |
if ing_col:
|
| 119 |
cleaned_ing = clean_and_validate(row[ing_col])
|
| 120 |
+
if cleaned_ing: content_parts.append(f"<b>π INGREDIENTS:</b><br>{cleaned_ing}")
|
|
|
|
| 121 |
|
|
|
|
| 122 |
if inst_col:
|
| 123 |
cleaned_inst = clean_and_validate(row[inst_col])
|
| 124 |
+
if cleaned_inst: content_parts.append(f"<b>🍳 INSTRUCTIONS:</b><br>{cleaned_inst}")
|
|
|
|
| 125 |
|
|
|
|
| 126 |
if not content_parts:
|
|
|
|
| 127 |
raw_out = str(row.get('Raw_Output', 'No details available.'))
|
| 128 |
+
display_text = raw_out if "error" not in raw_out.lower() else "<i>Details unavailable.</i>"
|
|
|
|
|
|
|
|
|
|
| 129 |
else:
|
| 130 |
display_text = "<br><br>".join(content_parts)
|
| 131 |
|
|
|
|
| 447 |
time_options = ["2h", "3h", "4h", "6h", "9h", "12h", "a day ago", "2 days ago"]
|
| 448 |
post_time = random.choice(time_options)
|
| 449 |
|
|
|
|
| 450 |
raw_desc = str(row.get('Raw_Output', 'Delicious recipe...'))[:250]
|
| 451 |
title_feed = row.get('Title', 'Recipe')
|
| 452 |
|