epicure-explorer / epicure.py
josefchen's picture
Initial release: Epicure Explorer Gradio demo. Paper arxiv 2605.22391.
7200047 verified
"""
Epicure: minimal loader for the three sibling ingredient embeddings.
Usage
-----
from epicure import Epicure
m = Epicure.from_pretrained("Kaikaku/epicure-cooc")
m.neighbors("chicken", k=5)
m.slerp("rice", "cuisine:South_Asian/South Asian", theta_deg=30, k=5)
m.closest_mode("miso", kind="factor", k=3)
The three repos (epicure-cooc, epicure-core, epicure-chem) ship the same loader.
Paper: https://arxiv.org/abs/2605.22391
"""
from __future__ import annotations
import json
import os
from dataclasses import dataclass
from typing import Iterable
import numpy as np
def _try_hf_download(repo_id: str, filename: str, revision: str | None = None) -> str:
    """Fetch one file from a Hugging Face Hub repo and return its local path.

    ``huggingface_hub`` is imported lazily so that the module itself can be
    imported without it; the dependency is only needed when a file actually
    has to be downloaded.

    Raises:
        ImportError: if ``huggingface_hub`` is not installed.
    """
    try:
        from huggingface_hub import hf_hub_download as fetch
    except ImportError as err:
        message = (
            "huggingface_hub is required for from_pretrained(). "
            "Install with: pip install huggingface_hub safetensors numpy"
        )
        raise ImportError(message) from err
    local_path = fetch(repo_id=repo_id, filename=filename, revision=revision)
    return local_path
def _load_safetensors(path: str) -> np.ndarray:
    """Read a safetensors file and return the tensor stored under ``"embeddings"``.

    ``safetensors`` is imported lazily, so it is only required when a file
    is actually loaded.

    Raises:
        ImportError: if ``safetensors`` is not installed.
        KeyError: if the file holds no ``"embeddings"`` entry.
    """
    try:
        from safetensors.numpy import load_file as read_tensors
    except ImportError as err:
        raise ImportError("safetensors required. pip install safetensors") from err
    tensors = read_tensors(path)
    return tensors["embeddings"]
def _unit(v: np.ndarray, axis: int = -1, eps: float = 1e-9) -> np.ndarray:
n = np.linalg.norm(v, axis=axis, keepdims=True)
return v / np.maximum(n, eps)
@dataclass
class ModeEntry:
    """One entry of ``modes.json``: a labelled direction in embedding space.

    Equality is field-wise. ``pole`` is compared with ``np.array_equal``
    because the dataclass-generated ``__eq__`` would take the truth value of
    an element-wise array comparison and raise ``ValueError`` for any pole
    with more than one element.
    """

    mode_id: str  # identifier used to look a mode up (see Epicure.mode_members)
    kind: str  # category used for filtering, e.g. 'factor' or 'cuisine'
    property: str  # carried through from modes.json; not read by the loader itself
    label: str  # human-readable name returned alongside mode_id
    n_members: int  # as stored in modes.json; presumably len(members) -- not enforced here
    members: list[str]  # names belonging to this mode
    pole: np.ndarray  # (d_model,) expected unit-normalised; closest_mode re-normalises anyway

    def __eq__(self, other: object) -> bool:
        """Compare all fields, using array-aware equality for ``pole``."""
        if other.__class__ is not self.__class__:
            return NotImplemented
        mine = (
            self.mode_id,
            self.kind,
            self.property,
            self.label,
            self.n_members,
            self.members,
        )
        theirs = (
            other.mode_id,
            other.kind,
            other.property,
            other.label,
            other.n_members,
            other.members,
        )
        return mine == theirs and bool(np.array_equal(self.pole, other.pole))
class Epicure:
    """Lookup-table embedding with neighbour, SLERP, and closest-mode operators.

    Attributes:
        E_raw: the embedding matrix as float32, one row per vocabulary entry.
        E: ``E_raw`` with every row scaled to unit L2 norm, so a dot product
            between rows is a cosine similarity.
        vocab: name -> row index.
        itos: row index -> name (inverse of ``vocab``).
        modes: list of :class:`ModeEntry`.
        supervised_poles: pole key -> direction vector.
        config: the parsed ``config.json``.
    """

    def __init__(
        self,
        E: np.ndarray,
        vocab: dict[str, int],
        modes: list[ModeEntry],
        supervised_poles: dict[str, np.ndarray],
        config: dict,
    ):
        self.E_raw = E.astype(np.float32)
        self.E = _unit(self.E_raw)
        self.vocab = vocab
        self.itos = {i: n for n, i in vocab.items()}
        self.modes = modes
        self.supervised_poles = supervised_poles
        self.config = config

    # ----- constructors -----
    @classmethod
    def from_pretrained(cls, repo_id_or_path: str, revision: str | None = None) -> "Epicure":
        """Build a model from a local directory or a Hugging Face Hub repo.

        If ``repo_id_or_path`` is an existing directory the files are read
        from it directly; otherwise each file is downloaded from the Hub
        (``revision`` is only used in that case). The files read are
        ``embeddings.safetensors``, ``vocab.json``, ``modes.json``,
        ``supervised_poles.json`` and ``config.json``.
        """
        if os.path.isdir(repo_id_or_path):
            def getp(fn: str) -> str:
                return os.path.join(repo_id_or_path, fn)
        else:
            def getp(fn: str) -> str:
                return _try_hf_download(repo_id_or_path, fn, revision=revision)

        def load_json(fn: str):
            # JSON is UTF-8 by specification; without an explicit encoding the
            # platform locale decides, which breaks non-ASCII names on some OSes.
            with open(getp(fn), encoding="utf-8") as f:
                return json.load(f)

        E = _load_safetensors(getp("embeddings.safetensors"))
        vocab = load_json("vocab.json")
        modes_raw = load_json("modes.json")
        sup_raw = load_json("supervised_poles.json")
        config = load_json("config.json")

        modes = [
            ModeEntry(
                mode_id=m["mode_id"],
                kind=m["kind"],
                property=m["property"],
                label=m["label"],
                n_members=m["n_members"],
                members=m["members"],
                pole=np.array(m["pole"], dtype=np.float32),
            )
            for m in modes_raw
        ]
        supervised_poles = {k: np.array(v, dtype=np.float32) for k, v in sup_raw.items()}
        return cls(E, vocab, modes, supervised_poles, config)

    # ----- internal helpers -----
    def _top_k(
        self,
        sims: np.ndarray,
        k: int,
        exclude_idx: int | None = None,
    ) -> list[tuple[str, float]]:
        """Return up to ``k`` (name, similarity) pairs, best first.

        ``exclude_idx`` removes one row by index rather than by rank, so the
        excluded item can never leak into the results on ties or when ``k``
        reaches the vocabulary size.
        """
        # Stable sort keeps tie order deterministic (lower row index first).
        order = np.argsort(-sims, kind="stable")
        if exclude_idx is not None:
            order = order[order != exclude_idx]
        limit = max(int(k), 0)
        return [(self.itos[int(i)], float(sims[i])) for i in order[:limit]]

    # ----- core operators -----
    def vec(self, name: str, normalised: bool = True) -> np.ndarray:
        """Return the embedding row for ``name`` (unit-norm unless ``normalised=False``).

        Raises:
            KeyError: if ``name`` is not in the vocabulary.
        """
        i = self.vocab[name]
        return self.E[i] if normalised else self.E_raw[i]

    def neighbors(self, name: str, k: int = 5, exclude_self: bool = True) -> list[tuple[str, float]]:
        """Return the ``k`` nearest vocabulary entries to ``name`` by cosine similarity.

        With ``exclude_self=True`` (default) the query itself is removed by
        index, so duplicate or tied rows cannot cause the query to be
        returned in place of a genuine neighbour.

        Raises:
            KeyError: if ``name`` is not in the vocabulary.
        """
        idx = self.vocab[name]
        sims = self.E @ self.E[idx]
        return self._top_k(sims, k, exclude_idx=idx if exclude_self else None)

    def slerp(
        self,
        seed: str,
        direction: str | np.ndarray,
        theta_deg: float,
        k: int = 5,
        exclude_seed: bool = True,
    ) -> list[tuple[str, float]]:
        """Rotate the seed vector toward a unit direction by angle theta on the unit sphere.

        ``direction`` is either a supervised pole key (e.g.
        ``"cuisine:South_Asian"``) or a raw (d_model,) np.ndarray.
        At theta=0 the query is the seed. At theta=60deg cosine to seed = 0.5.
        With ``exclude_seed=True`` (default) the seed ingredient is removed from results
        (the paper's reported tables also exclude it).

        If ``direction`` is colinear with the seed there is no rotation plane;
        the seed's plain neighbours are returned, still honouring
        ``exclude_seed``.

        Raises:
            KeyError: if ``seed`` is not in the vocabulary, or ``direction``
                is a string that is not a supervised pole key.
        """
        seed_idx = self.vocab[seed]
        v = self.E[seed_idx]
        d = self.supervised_poles[direction] if isinstance(direction, str) else direction
        d = _unit(np.asarray(d, dtype=np.float32))

        # Gram-Schmidt: orthogonal component of d relative to v
        d_perp = d - (d @ v) * v
        n_perp = np.linalg.norm(d_perp)
        if n_perp < 1e-9:
            # d is colinear with v: rotation has no defined plane; return seed neighbours
            return self.neighbors(seed, k=k, exclude_self=exclude_seed)
        d_perp = d_perp / n_perp

        theta = np.deg2rad(float(theta_deg))
        q = _unit(np.cos(theta) * v + np.sin(theta) * d_perp)
        sims = self.E @ q
        return self._top_k(sims, k, exclude_idx=seed_idx if exclude_seed else None)

    def closest_mode(
        self,
        name: str,
        kind: str | None = None,
        k: int = 3,
    ) -> list[tuple[str, str, float]]:
        """Return the top-k closest modes to the named ingredient.

        ``kind`` filters by mode kind: 'factor', 'cuisine', 'food_group',
        'nova_level', 'cf_sensory', 'usda_nutrient' or None for all.

        Each result is ``(mode_id, label, cosine)``, best first. Poles are
        re-normalised here, so the score is a cosine even if a stored pole is
        not exactly unit length.

        Raises:
            KeyError: if ``name`` is not in the vocabulary.
        """
        v = self.vec(name)
        scored = [
            (m.mode_id, m.label, float(_unit(m.pole) @ v))
            for m in self.modes
            if kind is None or m.kind == kind
        ]
        scored.sort(key=lambda x: -x[2])
        return scored[:k]

    def mode_members(self, mode_id: str, k: int | None = None) -> list[str]:
        """Return the members of mode ``mode_id`` (the first ``k`` if ``k`` is given).

        Raises:
            KeyError: if no mode has this ``mode_id``.
        """
        for m in self.modes:
            if m.mode_id == mode_id:
                return m.members[:k] if k is not None else m.members
        raise KeyError(mode_id)

    # ----- introspection -----
    def list_supervised_poles(self, prefix: str | None = None) -> list[str]:
        """Return supervised pole keys, optionally only those starting with ``prefix``."""
        if prefix is None:
            return list(self.supervised_poles.keys())
        return [k for k in self.supervised_poles if k.startswith(prefix)]

    def list_modes(self, kind: str | None = None) -> list[tuple[str, str]]:
        """Return ``(mode_id, label)`` pairs, optionally only those of the given ``kind``."""
        if kind is None:
            return [(m.mode_id, m.label) for m in self.modes]
        return [(m.mode_id, m.label) for m in self.modes if m.kind == kind]

    def __repr__(self) -> str:
        return (
            f"Epicure(schema={self.config.get('schema')!r}, "
            f"d_model={self.config.get('d_model')}, "
            f"vocab_size={self.config.get('vocab_size')}, "
            f"modes={len(self.modes)}, "
            f"supervised_poles={len(self.supervised_poles)})"
        )