BiteWiseFinal / services /dataset.py
anaygupta's picture
Upload 12 files
9373226 verified
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional
import re
import pandas as pd
from .text_utils import as_aliases, dedupe_preserve_order, ingredient_variants, normalize_text
# Column headers the dataset CSV must contain; loading fails with a ValueError
# naming whichever of these are absent. Order is kept stable because the
# error message reports missing names in this order.
REQUIRED_COLUMNS: List[str] = [
    # Identity and classification of the ingredient.
    "Ingredient", "Context", "Aliases", "Category",
    # Per-diet compatibility flags.
    "Is_Keto_Friendly", "Is_Vegan_Friendly",
    # Replacement and how-to text for each diet.
    "Keto_Substitution", "Keto_Instruction",
    "Vegan_Substitution", "Vegan_Instruction",
    "Vegan_Keto_Substitution", "Vegan_Keto_Instruction",
]
# "Context" labels that are preferred when the recipe type is "baked".
BAKE_CONTEXTS: set[str] = {
    # General baking roles.
    "Baking & Desserts", "Baking (Binder)", "Baking (Leavening)",
    # Specific baked goods and desserts.
    "Pastries", "Bagels", "Puddings", "Tiramisu",
}
# "Context" labels that are preferred for every recipe type other than "baked".
COOK_CONTEXTS: set[str] = {
    # Broad cooking categories.
    "Main Course & Heavy Cooking", "Sauces, Dips & Dressings",
    "Soups & Savory Liquids", "Cold Prep & Light Meals",
    # Dishes, techniques and cuisines.
    "Pasta", "Lasagna", "Roast", "Stir-Fry", "Appetizer", "Indian",
    # Everything else that is not baking.
    "Beverages", "Cheese Making", "Processed", "Technical & Additives",
}
@dataclass(frozen=True)
class IngredientRow:
    """Immutable record for one ingredient/context entry of the dataset.

    The ``*_substitution`` / ``*_instruction`` fields are optional text; the
    two ``is_*_friendly`` flags say whether the ingredient already fits the
    corresponding diet.
    """

    ingredient: str  # display name of the ingredient
    context: str  # context label (compared against BAKE_CONTEXTS / COOK_CONTEXTS)
    aliases: List[str]  # alternative names for the ingredient
    category: str
    is_keto_friendly: bool
    is_vegan_friendly: bool
    keto_substitution: Optional[str]
    keto_instruction: Optional[str]
    vegan_substitution: Optional[str]
    vegan_instruction: Optional[str]
    vegan_keto_substitution: Optional[str]
    vegan_keto_instruction: Optional[str]

    # (pattern, extra lookup terms) pairs, checked in this order. Patterns
    # match whole words only (with an optional plural "s") so that compound
    # words such as "eggplant" or "cheesecake" do not inherit the synonyms of
    # "egg" or "cheese". Deliberately unannotated so that ``dataclass`` does
    # not treat it as a field.
    _FAMILY_TERMS = (
        (
            re.compile(r"(?<!\w)eggs?(?!\w)"),
            (
                "egg", "eggs", "whole egg", "whole eggs", "egg white", "egg whites",
                "egg yolk", "egg yolks",
            ),
        ),
        (
            re.compile(r"(?<!\w)(?:pancetta|bacon)s?(?!\w)"),
            ("pancetta", "bacon", "guanciale", "prosciutto", "cured pork"),
        ),
        (
            re.compile(r"(?<!\w)chickens?(?!\w)"),
            ("chicken", "chicken thighs", "chicken breast", "chicken pieces", "chicken drumsticks"),
        ),
        (
            re.compile(r"(?<!\w)milks?(?!\w)"),
            ("milk", "whole milk", "dairy milk"),
        ),
        (
            re.compile(r"(?<!\w)cheeses?(?!\w)"),
            ("cheese", "hard cheese", "soft cheese"),
        ),
    )

    @staticmethod
    def _family_terms(normalized_name: str) -> List[str]:
        """Return the generic synonyms implied by an already-normalized name.

        Args:
            normalized_name: Ingredient name after ``normalize_text``.

        Returns:
            The extra terms of every family whose keyword occurs in the name
            as a whole word, in table order. Empty when nothing matches.
        """
        extra: List[str] = []
        for pattern, terms in IngredientRow._FAMILY_TERMS:
            if pattern.search(normalized_name):
                extra.extend(terms)
        return extra

    @property
    def lookup_terms(self) -> List[str]:
        """All names this row can be found under.

        Returns:
            The ingredient name, then its aliases, then any family synonyms,
            de-duplicated with first-occurrence order preserved.
        """
        terms = [self.ingredient, *self.aliases]
        terms.extend(self._family_terms(normalize_text(self.ingredient)))
        return dedupe_preserve_order(terms)
class SubstitutionDatabase:
    """In-memory view of the ingredient substitution CSV.

    The file is read once at construction; each surviving data row becomes an
    ``IngredientRow``. ``find_rows`` looks ingredients up by name and
    ``pick_substitution`` selects the replacement for a given diet.
    """

    # Text spellings accepted as "true" for the friendliness flags.
    _TRUE_FLAG_STRINGS = frozenset({"true", "t", "yes", "y", "1"})
    # Text (after strip/lower) that counts as "no value" for substitution cells.
    _BLANK_STRINGS = frozenset({"", "nan", "none"})

    def __init__(self, csv_path: str | Path):
        """Load the dataset and build the row objects.

        Args:
            csv_path: Location of the substitution CSV.

        Raises:
            FileNotFoundError: If ``csv_path`` does not exist.
            ValueError: If the CSV lacks any of ``REQUIRED_COLUMNS``.
        """
        self.csv_path = Path(csv_path)
        self.df = self._load()
        self.rows = [self._row_from_series(series) for _, series in self.df.iterrows()]
        # Both caches are keyed by recipe type and filled lazily.
        self._preferred_rows_cache: Dict[str, List[IngredientRow]] = {}
        self._semantic_terms_cache: Dict[str, List[str]] = {}

    def _load(self) -> pd.DataFrame:
        """Read and validate the CSV, returning a cleaned frame.

        Rows without an ``Ingredient`` or ``Context`` are dropped, the index is
        renumbered, and two helper columns (``_ingredient_norm`` and
        ``_aliases_norm``) are added.

        Raises:
            FileNotFoundError: If the CSV path does not exist.
            ValueError: If required columns are missing.
        """
        if not self.csv_path.exists():
            raise FileNotFoundError(
                f"Could not find substitution database at {self.csv_path}. "
                "Place united_master_database_corrected.csv in data/ or set BITEWISE_DATASET_PATH."
            )
        raw = pd.read_csv(self.csv_path)
        missing = [column for column in REQUIRED_COLUMNS if column not in raw.columns]
        if missing:
            raise ValueError(f"Dataset missing required columns: {missing}")
        # Renumber first so the helper columns are added to a fresh frame
        # rather than to the direct result of ``dropna``.
        df = raw.dropna(subset=["Ingredient", "Context"]).reset_index(drop=True)
        df["_ingredient_norm"] = df["Ingredient"].astype(str).map(normalize_text)
        df["_aliases_norm"] = df["Aliases"].apply(as_aliases)
        return df

    @classmethod
    def _coerce_flag(cls, value: object) -> bool:
        """Interpret a raw flag cell as a boolean.

        Plain ``bool(value)`` is wrong for CSV data: ``bool(float("nan"))`` and
        ``bool("False")`` are both True. Missing values therefore map to False
        and text is compared against a list of accepted "true" spellings.
        """
        if isinstance(value, str):
            return value.strip().lower() in cls._TRUE_FLAG_STRINGS
        if value is None or pd.isna(value):
            return False
        return bool(value)

    @staticmethod
    def _optional_text(value: object) -> Optional[str]:
        """Return ``str(value)``, or None when the cell is missing (NaN/None)."""
        return None if pd.isna(value) else str(value)

    @classmethod
    def _is_blank(cls, value: Optional[str]) -> bool:
        """True when ``value`` is None, empty/whitespace, or a "nan"/"none" placeholder."""
        return value is None or str(value).strip().lower() in cls._BLANK_STRINGS

    def _row_from_series(self, row: pd.Series) -> IngredientRow:
        """Convert one DataFrame row into an ``IngredientRow``.

        Missing text cells become None (``category`` becomes ""), and the
        friendliness flags go through ``_coerce_flag`` so that a missing or
        textual "false" flag is never read as True.
        """
        text = self._optional_text
        return IngredientRow(
            ingredient=str(row["Ingredient"]),
            context=str(row["Context"]),
            aliases=as_aliases(row.get("Aliases")),
            category=text(row.get("Category")) or "",
            is_keto_friendly=self._coerce_flag(row.get("Is_Keto_Friendly", False)),
            is_vegan_friendly=self._coerce_flag(row.get("Is_Vegan_Friendly", False)),
            keto_substitution=text(row.get("Keto_Substitution")),
            keto_instruction=text(row.get("Keto_Instruction")),
            vegan_substitution=text(row.get("Vegan_Substitution")),
            vegan_instruction=text(row.get("Vegan_Instruction")),
            vegan_keto_substitution=text(row.get("Vegan_Keto_Substitution")),
            vegan_keto_instruction=text(row.get("Vegan_Keto_Instruction")),
        )

    def contexts_for_recipe_type(self, recipe_type: str) -> set[str]:
        """Return ``BAKE_CONTEXTS`` for ``"baked"``, ``COOK_CONTEXTS`` for anything else.

        The module-level set itself is returned, not a copy.
        """
        return BAKE_CONTEXTS if recipe_type == "baked" else COOK_CONTEXTS

    def preferred_rows(self, recipe_type: str) -> List[IngredientRow]:
        """Rows whose context belongs to the given recipe type (cached per type)."""
        if recipe_type not in self._preferred_rows_cache:
            contexts = self.contexts_for_recipe_type(recipe_type)
            self._preferred_rows_cache[recipe_type] = [
                row for row in self.rows if row.context in contexts
            ]
        return self._preferred_rows_cache[recipe_type]

    def _normalize_terms(self, row: IngredientRow) -> List[str]:
        """Normalized lookup terms of ``row``, with empty results removed."""
        normalized = (normalize_text(term) for term in row.lookup_terms)
        return [term for term in normalized if term]

    def _match_exact(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Rows whose normalized name or any normalized lookup term equals ``query``.

        Rows with no usable lookup terms are skipped.
        """
        query = normalize_text(query)
        exact_rows = []
        for row in rows:
            terms = self._normalize_terms(row)
            if not terms:
                continue
            if normalize_text(row.ingredient) == query or query in terms:
                exact_rows.append(row)
        return exact_rows

    def _match_partial(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Rows where ``query`` appears as a whole word/phrase inside a lookup term.

        Queries shorter than four characters (after normalization) never match
        partially.
        """
        query = normalize_text(query)
        if len(query) < 4:
            return []
        # Lookarounds keep the query from matching inside a longer word.
        pattern = re.compile(rf"(?<!\w){re.escape(query)}(?!\w)")
        partial_rows = []
        for row in rows:
            candidates = [normalize_text(row.ingredient), *self._normalize_terms(row)]
            if any(pattern.search(candidate) for candidate in candidates):
                partial_rows.append(row)
        return partial_rows

    def _rank_rows(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Order matches from best to worst.

        Rows are sorted descending by: exact ingredient-name match, exact alias
        match, having any aliases at all, then length of the normalized name
        (longer names first).
        """
        q = normalize_text(query)

        def score(row: IngredientRow) -> tuple[int, int, int, int]:
            ingredient_norm = normalize_text(row.ingredient)
            alias_norms = [normalize_text(alias) for alias in row.aliases]
            return (
                int(ingredient_norm == q),
                int(q in alias_norms),
                int(len(row.aliases) > 0),
                len(ingredient_norm),
            )

        return sorted(rows, key=score, reverse=True)

    def find_rows(self, query: str, recipe_type: str) -> List[IngredientRow]:
        """Find dataset rows for an ingredient name.

        Every spelling variant of the query is tried in four stages, stopping
        at the first stage/variant that yields matches: exact match among the
        rows preferred for ``recipe_type``, exact match among all rows, then
        partial match among preferred rows, then partial match among all rows.

        Returns:
            Ranked matching rows, or an empty list when the query is blank or
            nothing matches.
        """
        query = normalize_text(query)
        if not query:
            return []
        variants = ingredient_variants(query)
        preferred = self.preferred_rows(recipe_type)
        stages = (
            (self._match_exact, preferred),
            (self._match_exact, self.rows),
            (self._match_partial, preferred),
            (self._match_partial, self.rows),
        )
        for matcher, pool in stages:
            for candidate in variants:
                hits = matcher(pool, candidate)
                if hits:
                    return self._rank_rows(hits, candidate)
        return []

    def semantic_terms(self, recipe_type: str) -> List[str]:
        """De-duplicated lookup terms for a recipe type (cached per type).

        Uses the preferred rows for the type, or all rows when there are none.
        """
        if recipe_type not in self._semantic_terms_cache:
            rows = self.preferred_rows(recipe_type) or self.rows
            terms = []
            for row in rows:
                terms.extend(row.lookup_terms)
            self._semantic_terms_cache[recipe_type] = dedupe_preserve_order(terms)
        return self._semantic_terms_cache[recipe_type]

    def pick_substitution(self, row: IngredientRow, diet: str) -> tuple[str, str, bool]:
        """Choose what to use in place of ``row`` for the given diet.

        Args:
            row: The matched dataset row.
            diet: ``"vegan"`` or ``"keto"``; any other value selects the
                combined vegan+keto columns.

        Returns:
            ``(name, instruction, unchanged)``. When the ingredient is kept,
            ``name`` is the original ingredient and ``unchanged`` is True;
            otherwise ``name`` is the substitute, ``instruction`` may be ""
            and ``unchanged`` is False.
        """
        if diet == "vegan":
            sub = row.vegan_substitution
            instr = row.vegan_instruction
            compatible = row.is_vegan_friendly
        elif diet == "keto":
            sub = row.keto_substitution
            instr = row.keto_instruction
            compatible = row.is_keto_friendly
        else:
            sub = row.vegan_keto_substitution
            instr = row.vegan_keto_instruction
            compatible = row.is_vegan_friendly and row.is_keto_friendly
        # NOTE(review): an incompatible ingredient with no substitution on file
        # is also reported as "already compatible" here. Kept as-is because
        # callers may rely on it — confirm whether that is intended.
        if compatible or self._is_blank(sub):
            return row.ingredient, "Already compatible — no substitution needed.", True
        instruction = "" if self._is_blank(instr) else str(instr)
        return str(sub), instruction, False