BiteWiseFinal

Sleeping

App Files Files Community

BiteWiseFinal / services /dataset.py

anaygupta

Upload 12 files

9373226 verified 14 days ago

raw

history blame contribute delete

9.23 kB

	from __future__ import annotations

	from dataclasses import dataclass
	from pathlib import Path
	from typing import Dict, List, Optional
	import re

	import pandas as pd

	from .text_utils import as_aliases, dedupe_preserve_order, ingredient_variants, normalize_text

	# Column headers the dataset CSV must provide; the loader raises ValueError
	# listing whichever of these are absent.
	REQUIRED_COLUMNS = [
	    # Ingredient identity and classification.
	    "Ingredient", "Context", "Aliases", "Category",
	    # Per-diet compatibility flags.
	    "Is_Keto_Friendly", "Is_Vegan_Friendly",
	    # Substitution / instruction column pairs, one pair per diet.
	    "Keto_Substitution", "Keto_Instruction",
	    "Vegan_Substitution", "Vegan_Instruction",
	    "Vegan_Keto_Substitution", "Vegan_Keto_Instruction",
	]

	# "Context" values selected when the recipe type is "baked".
	BAKE_CONTEXTS = {
	    "Baking & Desserts", "Baking (Binder)", "Baking (Leavening)",
	    "Pastries", "Bagels", "Puddings", "Tiramisu",
	}

	# "Context" values selected for every recipe type other than "baked".
	COOK_CONTEXTS = {
	    "Main Course & Heavy Cooking", "Sauces, Dips & Dressings",
	    "Soups & Savory Liquids", "Cold Prep & Light Meals",
	    "Pasta", "Lasagna", "Roast", "Stir-Fry", "Appetizer", "Indian",
	    "Beverages", "Cheese Making", "Processed", "Technical & Additives",
	}


	@dataclass(frozen=True)
	class IngredientRow:
	    """Immutable record for one ingredient/context entry of the substitution dataset.

	    The ``*_substitution`` / ``*_instruction`` fields are ``None`` when the
	    dataset has no value for that diet.
	    """

	    ingredient: str
	    context: str
	    aliases: List[str]
	    category: str
	    is_keto_friendly: bool
	    is_vegan_friendly: bool
	    keto_substitution: Optional[str]
	    keto_instruction: Optional[str]
	    vegan_substitution: Optional[str]
	    vegan_instruction: Optional[str]
	    vegan_keto_substitution: Optional[str]
	    vegan_keto_instruction: Optional[str]

	    # (pattern, extra lookup terms) pairs, applied in this order to the
	    # normalized ingredient name.  Patterns match whole words only: a plain
	    # substring test would also fire for "eggplant" or "veggie", attaching egg
	    # terms to unrelated ingredients.  Not annotated, so it is not a dataclass
	    # field.
	    _EXPANSION_RULES = (
	        (
	            re.compile(r"\beggs?\b"),
	            (
	                "egg", "eggs", "whole egg", "whole eggs", "egg white", "egg whites",
	                "egg yolk", "egg yolks",
	            ),
	        ),
	        (
	            re.compile(r"\b(?:pancetta|bacon)\b"),
	            ("pancetta", "bacon", "guanciale", "prosciutto", "cured pork"),
	        ),
	        (
	            re.compile(r"\bchickens?\b"),
	            ("chicken", "chicken thighs", "chicken breast", "chicken pieces", "chicken drumsticks"),
	        ),
	        (
	            re.compile(r"\bmilks?\b"),
	            ("milk", "whole milk", "dairy milk"),
	        ),
	        (
	            re.compile(r"\bcheeses?\b"),
	            ("cheese", "hard cheese", "soft cheese"),
	        ),
	    )

	    @staticmethod
	    def _expansions_for(normalized_ingredient: str) -> List[str]:
	        """Return the extra lookup terms triggered by an already-normalized name.

	        Rules are evaluated in declaration order and every matching rule
	        contributes its terms; duplicates are not removed here.
	        """
	        extra: List[str] = []
	        for pattern, terms in IngredientRow._EXPANSION_RULES:
	            if pattern.search(normalized_ingredient):
	                extra.extend(terms)
	        return extra

	    @property
	    def lookup_terms(self) -> List[str]:
	        """All strings this row can be looked up by.

	        Returns the ingredient name, then its aliases, then any family
	        expansions (egg, cured pork, chicken, milk, cheese) whose keyword
	        appears as a whole word in the normalized ingredient name, passed
	        through ``dedupe_preserve_order``.
	        """
	        extra = self._expansions_for(normalize_text(self.ingredient))
	        return dedupe_preserve_order([self.ingredient, *self.aliases, *extra])


	class SubstitutionDatabase:
	    """In-memory view of the ingredient substitution CSV.

	    The CSV is read once at construction, checked against
	    ``REQUIRED_COLUMNS`` and converted into ``IngredientRow`` objects.
	    ``find_rows`` resolves a free-text ingredient to candidate rows and
	    ``pick_substitution`` selects the replacement for a given diet.
	    """

	    # Text spellings read as "false" when a flag column was not parsed as bool.
	    _FALSE_TOKENS = frozenset({"", "0", "0.0", "f", "false", "n", "no", "nan", "none"})

	    def __init__(self, csv_path: str | Path):
	        """Load the dataset at ``csv_path`` and build the row list.

	        Raises:
	            FileNotFoundError: if ``csv_path`` does not exist.
	            ValueError: if the CSV lacks any of ``REQUIRED_COLUMNS``.
	        """
	        self.csv_path = Path(csv_path)
	        self.df = self._load()
	        self.rows = [self._row_from_series(row) for _, row in self.df.iterrows()]
	        # Both caches are keyed by recipe type and filled lazily.
	        self._preferred_rows_cache: Dict[str, List[IngredientRow]] = {}
	        self._semantic_terms_cache: Dict[str, List[str]] = {}

	    def _load(self) -> pd.DataFrame:
	        """Read and validate the CSV, returning a cleaned, re-indexed frame.

	        Rows without an ``Ingredient`` or ``Context`` value are dropped, and
	        two helper columns (``_ingredient_norm``, ``_aliases_norm``) are added.
	        """
	        if not self.csv_path.exists():
	            raise FileNotFoundError(
	                f"Could not find substitution database at {self.csv_path}. "
	                "Place united_master_database_corrected.csv in data/ or set BITEWISE_DATASET_PATH."
	            )

	        df = pd.read_csv(self.csv_path)
	        missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
	        if missing:
	            raise ValueError(f"Dataset missing required columns: {missing}")

	        df = df.copy()
	        df = df.dropna(subset=["Ingredient", "Context"])
	        df["_ingredient_norm"] = df["Ingredient"].astype(str).map(normalize_text)
	        df["_aliases_norm"] = df["Aliases"].apply(as_aliases)
	        return df.reset_index(drop=True)

	    @staticmethod
	    def _coerce_flag(value: object) -> bool:
	        """Interpret a raw flag cell as a boolean.

	        A plain ``bool(cell)`` reads NaN and text such as ``"No"`` or
	        ``"False"`` as ``True``.  Here missing values and the spellings in
	        ``_FALSE_TOKENS`` are ``False``; any other text is ``True`` and any
	        other value follows its normal truthiness.
	        """
	        if isinstance(value, str):
	            return value.strip().lower() not in SubstitutionDatabase._FALSE_TOKENS
	        if value is None or pd.isna(value):
	            return False
	        return bool(value)

	    @staticmethod
	    def _optional_text(value: object) -> Optional[str]:
	        """Return ``None`` for a missing cell, otherwise the cell as ``str``."""
	        return None if pd.isna(value) else str(value)

	    def _row_from_series(self, row: pd.Series) -> IngredientRow:
	        """Convert one DataFrame row into an ``IngredientRow``."""
	        text = self._optional_text
	        return IngredientRow(
	            ingredient=str(row["Ingredient"]),
	            context=str(row["Context"]),
	            aliases=as_aliases(row.get("Aliases")),
	            category=str(row.get("Category", "")),
	            is_keto_friendly=self._coerce_flag(row.get("Is_Keto_Friendly", False)),
	            is_vegan_friendly=self._coerce_flag(row.get("Is_Vegan_Friendly", False)),
	            keto_substitution=text(row.get("Keto_Substitution")),
	            keto_instruction=text(row.get("Keto_Instruction")),
	            vegan_substitution=text(row.get("Vegan_Substitution")),
	            vegan_instruction=text(row.get("Vegan_Instruction")),
	            vegan_keto_substitution=text(row.get("Vegan_Keto_Substitution")),
	            vegan_keto_instruction=text(row.get("Vegan_Keto_Instruction")),
	        )

	    def contexts_for_recipe_type(self, recipe_type: str) -> set[str]:
	        """Return ``BAKE_CONTEXTS`` for ``"baked"``, ``COOK_CONTEXTS`` otherwise."""
	        return BAKE_CONTEXTS if recipe_type == "baked" else COOK_CONTEXTS

	    def preferred_rows(self, recipe_type: str) -> List[IngredientRow]:
	        """Rows whose context belongs to ``recipe_type`` (cached per recipe type)."""
	        if recipe_type not in self._preferred_rows_cache:
	            contexts = self.contexts_for_recipe_type(recipe_type)
	            self._preferred_rows_cache[recipe_type] = [row for row in self.rows if row.context in contexts]
	        return self._preferred_rows_cache[recipe_type]

	    def _normalize_terms(self, row: IngredientRow) -> List[str]:
	        """Normalized lookup terms of ``row``, with empty results dropped."""
	        # Normalize each term once, then filter (assumes normalize_text is
	        # deterministic, which the double call it replaces also relied on).
	        normalized = (normalize_text(term) for term in row.lookup_terms)
	        return [term for term in normalized if term]

	    def _match_exact(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
	        """Rows whose normalized ingredient or any lookup term equals ``query``."""
	        query = normalize_text(query)
	        if not query:
	            # An empty query can only "match" rows whose name normalizes to
	            # nothing, which is never a meaningful hit.
	            return []
	        exact_rows = []
	        for row in rows:
	            terms = self._normalize_terms(row)
	            if not terms:
	                continue
	            if normalize_text(row.ingredient) == query or query in terms:
	                exact_rows.append(row)
	        return exact_rows

	    def _match_partial(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
	        """Rows where ``query`` occurs as a whole word/phrase inside a term.

	        Queries shorter than four characters after normalization never match.
	        """
	        query = normalize_text(query)
	        if len(query) < 4:
	            return []
	        # Lookarounds instead of \b so queries that start or end with a
	        # non-word character are still delimited correctly.
	        pattern = re.compile(rf"(?<!\w){re.escape(query)}(?!\w)")
	        partial_rows = []
	        for row in rows:
	            candidates = [normalize_text(row.ingredient), *self._normalize_terms(row)]
	            if any(pattern.search(candidate) for candidate in candidates):
	                partial_rows.append(row)
	        return partial_rows

	    def _rank_rows(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
	        """Order ``rows`` best-first for ``query``.

	        Sort key, highest first: exact ingredient match, exact alias match,
	        whether the row has any aliases, then normalized ingredient length.
	        NOTE(review): longer ingredient names rank ahead of shorter ones;
	        confirm that this is the intended tie-break.
	        """
	        q = normalize_text(query)

	        def score(row: IngredientRow) -> tuple[int, int, int, int]:
	            ingredient_norm = normalize_text(row.ingredient)
	            alias_norms = [normalize_text(a) for a in row.aliases]
	            exact_ingredient = int(ingredient_norm == q)
	            exact_alias = int(q in alias_norms)
	            alias_specificity = int(len(row.aliases) > 0)
	            length = len(ingredient_norm)
	            return (exact_ingredient, exact_alias, alias_specificity, length)

	        return sorted(rows, key=score, reverse=True)

	    def find_rows(self, query: str, recipe_type: str) -> List[IngredientRow]:
	        """Resolve ``query`` to ranked candidate rows, or ``[]`` if none match.

	        Every variant of the query is tried against each stage before moving
	        to the next stage: exact match in the preferred rows, exact match in
	        all rows, partial match in the preferred rows, partial match in all
	        rows.  The first non-empty result is ranked and returned.
	        """
	        query = normalize_text(query)
	        if not query:
	            return []

	        variants = ingredient_variants(query)
	        preferred = self.preferred_rows(recipe_type)

	        search_plan = (
	            (self._match_exact, preferred),
	            (self._match_exact, self.rows),
	            (self._match_partial, preferred),
	            (self._match_partial, self.rows),
	        )
	        for matcher, pool in search_plan:
	            for candidate in variants:
	                matches = matcher(pool, candidate)
	                if matches:
	                    return self._rank_rows(matches, candidate)

	        return []

	    def semantic_terms(self, recipe_type: str) -> List[str]:
	        """De-duplicated lookup terms for ``recipe_type`` (cached per recipe type).

	        Falls back to every row when no row matches the recipe type's contexts.
	        """
	        if recipe_type not in self._semantic_terms_cache:
	            rows = self.preferred_rows(recipe_type) or self.rows
	            terms = []
	            for row in rows:
	                terms.extend(row.lookup_terms)
	            self._semantic_terms_cache[recipe_type] = dedupe_preserve_order(terms)
	        return self._semantic_terms_cache[recipe_type]

	    def pick_substitution(self, row: IngredientRow, diet: str) -> tuple[str, str, bool]:
	        """Choose the replacement for ``row`` under ``diet``.

	        ``diet`` is ``"vegan"`` or ``"keto"``; any other value selects the
	        combined vegan+keto columns.

	        Returns:
	            ``(name, instruction, unchanged)``.  When ``unchanged`` is ``True``
	            the original ingredient name is returned with a fixed message;
	            otherwise ``name`` is the substitution and ``instruction`` may be
	            an empty string.
	        """
	        if diet == "vegan":
	            sub = row.vegan_substitution
	            instr = row.vegan_instruction
	            compatible = row.is_vegan_friendly
	        elif diet == "keto":
	            sub = row.keto_substitution
	            instr = row.keto_instruction
	            compatible = row.is_keto_friendly
	        else:
	            sub = row.vegan_keto_substitution
	            instr = row.vegan_keto_instruction
	            compatible = row.is_vegan_friendly and row.is_keto_friendly

	        # NOTE(review): an incompatible ingredient with no substitution listed
	        # is also reported as "already compatible"; confirm this is intended.
	        if compatible or not sub or str(sub).strip().lower() in {"nan", "none"}:
	            return row.ingredient, "Already compatible — no substitution needed.", True

	        return str(sub), (str(instr) if instr and str(instr).lower() != "nan" else ""), False