cove-api / src /models /meta_encoder.py
mickey1976's picture
Deploy: Minimal FastAPI backend for CoVE Space
549c270
# src/models/meta_encoder.py
from __future__ import annotations
import json
import hashlib
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional
import numpy as np
def _stable_hash(s: str, mod: int) -> int:
    """Map a string to a reproducible bucket index in ``[0, mod)``.

    The index is taken from the leading 32 bits of the SHA-1 digest of the
    UTF-8 encoded string. A modulus below 1 is treated as 1, in which case
    the result is always 0.
    """
    digest = hashlib.sha1(s.encode("utf-8")).digest()
    # First four digest bytes, big-endian == first eight hex characters.
    leading_bits = int.from_bytes(digest[:4], byteorder="big")
    buckets = mod if mod > 1 else 1
    return leading_bits % buckets
def _l2norm(x: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Scale ``x`` to unit Euclidean length along its last axis.

    ``eps`` is added to the norm before dividing, so an all-zero vector
    stays all-zero instead of producing a division by zero.
    """
    magnitude = np.linalg.norm(x, axis=-1, keepdims=True)
    return x / (magnitude + eps)
@dataclass
class MetaEncoderConfig:
    """Sizes, seed and price range used to build a ``MetaEncoder``."""

    # Number of rows in each lookup table.
    brand_vocab_size: int = 5000
    cat_vocab_size: int = 10000
    price_bins: int = 10

    # Width of each embedding segment.
    dim_brand: int = 32
    dim_cat: int = 32
    dim_price: int = 16

    # Seed for the random generator that fills the lookup tables.
    seed: int = 42

    # Price range used for bucketing; prices are clamped into it.
    price_min: float = 0.0
    price_max: float = 200.0
class MetaEncoder:
    """
    Stateless, deterministic metadata embedder:
    - brand: hashed lookup (dim_brand)
    - categories (iterable of str, or a single str): mean of hashed lookups (dim_cat)
    - price: clamp to [price_min, price_max] -> bucket -> embedding (dim_price)
    Missing or blank fields contribute an all-zero segment.
    Final item_vec = concat([brand, categories, price]) -> L2-normalized.

    The lookup tables are drawn from ``np.random.default_rng(cfg.seed)`` and
    depend only on the config.
    """

    def __init__(self, cfg: MetaEncoderConfig):
        """Build the three row-normalized lookup tables described by ``cfg``."""
        self.cfg = cfg
        rng = np.random.default_rng(cfg.seed)
        # The draw order (brand, categories, price) determines the table
        # contents for a given seed, so it must not be rearranged.
        self.E_brand = self._init_table(rng, cfg.brand_vocab_size, cfg.dim_brand)
        self.E_cat = self._init_table(rng, cfg.cat_vocab_size, cfg.dim_cat)
        self.E_price = self._init_table(rng, cfg.price_bins, cfg.dim_price)

    @staticmethod
    def _init_table(rng, rows: int, dim: int) -> np.ndarray:
        """Draw a ``(rows, dim)`` standard-normal float32 table with unit-norm rows."""
        table = rng.normal(0, 1, size=(rows, dim)).astype(np.float32)
        return _l2norm(table)

    @staticmethod
    def _normalize_key(value) -> str:
        """Return the canonical hashing key: ``str(value)``, stripped and lower-cased."""
        return str(value).strip().lower()

    def _embed_brand(self, brand: Optional[str]) -> np.ndarray:
        """Return the brand embedding, or zeros when the brand is missing or blank."""
        if not brand:
            return np.zeros((self.cfg.dim_brand,), dtype=np.float32)
        key = self._normalize_key(brand)
        if not key:
            # Whitespace-only brand: treat as missing rather than hashing "".
            return np.zeros((self.cfg.dim_brand,), dtype=np.float32)
        return self.E_brand[_stable_hash(key, self.cfg.brand_vocab_size)]

    def _embed_categories(self, cats: Optional[Iterable[str]]) -> np.ndarray:
        """Return the mean embedding of all non-blank categories, or zeros if none."""
        if isinstance(cats, str):
            # A bare string is one category, not an iterable of characters.
            cats = [cats]
        if not cats:
            return np.zeros((self.cfg.dim_cat,), dtype=np.float32)
        keys = [self._normalize_key(c) for c in cats if c]
        vecs: List[np.ndarray] = [
            self.E_cat[_stable_hash(key, self.cfg.cat_vocab_size)]
            for key in keys
            if key  # skip entries that were whitespace only
        ]
        if not vecs:
            return np.zeros((self.cfg.dim_cat,), dtype=np.float32)
        mean_vec = np.mean(np.stack(vecs, axis=0), axis=0)
        return mean_vec.astype(np.float32)

    def _embed_price(self, price: Optional[float]) -> np.ndarray:
        """Return the embedding of the price bucket, or zeros for ``None``/NaN.

        The price is clamped into ``[price_min, price_max]`` and mapped
        linearly onto bin indices ``0 .. price_bins - 1``; a clamped price
        equal to ``price_max`` selects the last bin.
        """
        cfg = self.cfg
        if price is None or np.isnan(price):
            return np.zeros((cfg.dim_price,), dtype=np.float32)
        p = max(cfg.price_min, min(cfg.price_max, float(price)))
        span = cfg.price_max - cfg.price_min
        last_bin = cfg.price_bins - 1
        if span <= 0 or last_bin <= 0:
            # Degenerate range or a single bin: everything shares bin 0.
            bin_idx = 0
        else:
            # Dividing by the exact span (no epsilon) lets p == price_max
            # reach the last bin; an epsilon in the denominator would keep
            # the scaled value strictly below last_bin and leave the final
            # table row unused.
            bin_idx = min(int((p - cfg.price_min) / span * last_bin), last_bin)
        return self.E_price[bin_idx]

    def encode_item(self, brand: Optional[str], categories: Optional[Iterable[str]], price: Optional[float]) -> np.ndarray:
        """Return the L2-normalized concatenation of brand, category and price segments."""
        b = self._embed_brand(brand)
        c = self._embed_categories(categories)
        p = self._embed_price(price)
        fused = np.concatenate([b, c, p], axis=0)
        return _l2norm(fused)

    @property
    def dim(self) -> int:
        """Length of the vector returned by ``encode_item``."""
        return self.cfg.dim_brand + self.cfg.dim_cat + self.cfg.dim_price

    def save_report(self, path: Path):
        """Write the configuration and total dimension as JSON to ``path``.

        Missing parent directories are created. ``path`` may be a ``Path``
        or a plain string.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        cfg = self.cfg
        report = {
            "brand_vocab_size": cfg.brand_vocab_size,
            "cat_vocab_size": cfg.cat_vocab_size,
            "price_bins": cfg.price_bins,
            "dim_brand": cfg.dim_brand,
            "dim_cat": cfg.dim_cat,
            "dim_price": cfg.dim_price,
            "total_dim": self.dim,
            "seed": cfg.seed,
            "price_min": cfg.price_min,
            "price_max": cfg.price_max,
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
# Public API: the encoder and the config object its constructor requires.
__all__ = ["MetaEncoder", "MetaEncoderConfig"]