Spaces:

mickey1976
/

cove-api

Sleeping

App Files Files Community

cove-api / src /models /meta_encoder.py

mickey1976

Deploy: Minimal FastAPI backend for CoVE Space

549c270 4 months ago

raw

history blame contribute delete

4.17 kB

	# src/models/meta_encoder.py
	from __future__ import annotations
	import json
	import hashlib
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Iterable, List, Optional

	import numpy as np


	def _stable_hash(s: str, mod: int) -> int:
	"""Deterministic hash -> [0, mod)."""
	h = hashlib.sha1(s.encode("utf-8")).hexdigest()
	return int(h[:8], 16) % max(1, mod)


	def _l2norm(x: np.ndarray, eps: float = 1e-12) -> np.ndarray:
	n = np.linalg.norm(x, axis=-1, keepdims=True) + eps
	return x / n


	@dataclass
	class MetaEncoderConfig:
	    """Sizes, seed and price range used to build a ``MetaEncoder``."""

	    # Number of rows in the hashed brand and category lookup tables.
	    brand_vocab_size: int = 5000
	    cat_vocab_size: int = 10000
	    # Number of rows in the price embedding table.
	    price_bins: int = 10
	    # Width of each segment of the fused item vector.
	    dim_brand: int = 32
	    dim_cat: int = 32
	    dim_price: int = 16
	    # Seed for the random generator that draws the embedding tables.
	    seed: int = 42
	    # price stats for bucketing: prices are clamped to [price_min, price_max]
	    price_min: float = 0.0
	    price_max: float = 200.0


	class MetaEncoder:
	    """Stateless, deterministic metadata embedder.

	    Each item is described by three optional fields:

	    - brand: hashed lookup (dim_brand)
	    - categories (list[str]): mean of hashed lookups (dim_cat)
	    - price: bucket -> embedding (dim_price)

	    Final item_vec = concat([brand, categories, price]) -> L2-normalized.
	    A missing field contributes an all-zero segment. The lookup tables are
	    drawn from a generator seeded with ``cfg.seed``.
	    """

	    def __init__(self, cfg: "MetaEncoderConfig"):
	        """Build the three row-normalized embedding tables described by *cfg*."""
	        self.cfg = cfg
	        rng = np.random.default_rng(cfg.seed)

	        # The draw order (brand, category, price) determines the table
	        # contents for a given seed, so it must not be reordered.
	        self.E_brand = rng.normal(0, 1, size=(cfg.brand_vocab_size, cfg.dim_brand)).astype(np.float32)
	        self.E_cat = rng.normal(0, 1, size=(cfg.cat_vocab_size, cfg.dim_cat)).astype(np.float32)
	        self.E_price = rng.normal(0, 1, size=(cfg.price_bins, cfg.dim_price)).astype(np.float32)

	        # pre-normalize rows
	        self.E_brand = _l2norm(self.E_brand)
	        self.E_cat = _l2norm(self.E_cat)
	        self.E_price = _l2norm(self.E_price)

	    def _embed_brand(self, brand: Optional[str]) -> np.ndarray:
	        """Return the table row for *brand*, or zeros when it is missing.

	        Brands are matched case-insensitively with surrounding whitespace
	        removed. ``None``, ``""`` and whitespace-only strings all count as
	        missing and produce a zero vector of length ``dim_brand``.
	        """
	        # Normalize before the emptiness test so "   " is not hashed as "".
	        key = brand.strip().lower() if brand else ""
	        if not key:
	            return np.zeros((self.cfg.dim_brand,), dtype=np.float32)
	        idx = _stable_hash(key, self.cfg.brand_vocab_size)
	        return self.E_brand[idx]

	    def _embed_categories(self, cats: Optional[Iterable[str]]) -> np.ndarray:
	        """Return the mean of the table rows for every usable category.

	        Falsy entries and entries that are empty after stripping are skipped.
	        A bare string is treated as a single category. ``None``, an empty
	        iterable, or an iterable with no usable entry gives a zero vector of
	        length ``dim_cat``.
	        """
	        # ``is None`` instead of truthiness: truth-testing a multi-element
	        # NumPy array raises, and emptiness is handled after the loop anyway.
	        if cats is None:
	            return np.zeros((self.cfg.dim_cat,), dtype=np.float32)
	        if isinstance(cats, str):
	            # Iterating a string would hash its individual characters.
	            cats = (cats,)
	        vecs: List[np.ndarray] = []
	        for c in cats:
	            if not c:
	                continue
	            key = str(c).strip().lower()
	            if not key:
	                continue
	            idx = _stable_hash(key, self.cfg.cat_vocab_size)
	            vecs.append(self.E_cat[idx])
	        if not vecs:
	            return np.zeros((self.cfg.dim_cat,), dtype=np.float32)
	        v = np.mean(np.stack(vecs, axis=0), axis=0)
	        return v.astype(np.float32)

	    def _embed_price(self, price: Optional[float]) -> np.ndarray:
	        """Return the table row of the bucket that *price* falls into.

	        *price* is converted with ``float()``, so numeric strings are
	        accepted. ``None``, NaN and values that cannot be converted are
	        treated as missing and give a zero vector of length ``dim_price``.
	        Prices outside ``[price_min, price_max]`` are clamped to that range.
	        """
	        if price is None:
	            return np.zeros((self.cfg.dim_price,), dtype=np.float32)
	        try:
	            # Convert first: np.isnan() raises TypeError on str input.
	            p = float(price)
	        except (TypeError, ValueError):
	            return np.zeros((self.cfg.dim_price,), dtype=np.float32)
	        if np.isnan(p):
	            return np.zeros((self.cfg.dim_price,), dtype=np.float32)
	        p = max(self.cfg.price_min, min(self.cfg.price_max, p))
	        # linear bucket; the 1e-9 term avoids dividing by zero when
	        # price_min == price_max.
	        span = self.cfg.price_max - self.cfg.price_min + 1e-9
	        frac = (p - self.cfg.price_min) / span
	        # NOTE(review): while the 1e-9 term keeps frac below 1, int()
	        # truncation never yields price_bins - 1 (for price_bins > 1), so
	        # the last table row appears unused. Left unchanged because a
	        # different bucket mapping would alter vectors already produced
	        # by this encoder.
	        bin_idx = int(frac * (self.cfg.price_bins - 1))
	        return self.E_price[bin_idx]

	    def encode_item(self, brand: Optional[str], categories: Optional[Iterable[str]], price: Optional[float]) -> np.ndarray:
	        """Return the L2-normalized concatenation of the three field vectors."""
	        b = self._embed_brand(brand)
	        c = self._embed_categories(categories)
	        p = self._embed_price(price)
	        fused = np.concatenate([b, c, p], axis=0)
	        return _l2norm(fused)

	    @property
	    def dim(self) -> int:
	        """Length of the vector returned by :meth:`encode_item`."""
	        return self.cfg.dim_brand + self.cfg.dim_cat + self.cfg.dim_price

	    def save_report(self, path: "Path | str"):
	        """Write the configuration and ``total_dim`` to *path* as JSON.

	        Missing parent directories are created. *path* may be a ``Path`` or
	        a plain string.
	        """
	        path = Path(path)
	        path.parent.mkdir(parents=True, exist_ok=True)
	        # Explicit encoding so the output does not depend on the locale.
	        with open(path, "w", encoding="utf-8") as f:
	            json.dump({
	                "brand_vocab_size": self.cfg.brand_vocab_size,
	                "cat_vocab_size": self.cfg.cat_vocab_size,
	                "price_bins": self.cfg.price_bins,
	                "dim_brand": self.cfg.dim_brand,
	                "dim_cat": self.cfg.dim_cat,
	                "dim_price": self.cfg.dim_price,
	                "total_dim": self.dim,
	                "seed": self.cfg.seed,
	                "price_min": self.cfg.price_min,
	                "price_max": self.cfg.price_max
	            }, f, indent=2)

	# Public API: MetaEncoder's constructor takes a MetaEncoderConfig, so the
	# config class is exported alongside it.
	__all__ = ["MetaEncoder", "MetaEncoderConfig"]