github-actions[bot]
Deploy hyper3labs/HyperView-ABO-Catalog from Hyper3Labs/hyperview-spaces@0affea7
bc4ab71
#!/usr/bin/env python
"""ABO product-catalog comparison demo for CLIP vs Hyper3-CLIP in HyperView."""
from __future__ import annotations
import os
import re
import shutil
import urllib.request
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any
from datasets import load_dataset
from hyperview.core.sample import Sample
from PIL import Image, ImageOps
import hyperview as hv
# --- Space location and HyperView server settings ---------------------------
# Every setting below can be overridden through the named environment variable.
SPACE_DIR = Path(__file__).resolve().parent
SPACE_HOST = os.getenv("HYPERVIEW_HOST", "127.0.0.1")
SPACE_PORT = int(os.getenv("HYPERVIEW_PORT", "6262"))
WORKSPACE_ID = os.getenv("HYPERVIEW_WORKSPACE_ID", "abo-catalog-clip-hyper3clip-split")
DATASET_NAME = os.getenv("HYPERVIEW_DATASET_NAME", "abo_catalog_clip_hyper3clip_side_by_side")
EXTENSION_DIR = SPACE_DIR / ".hyperview" / "extensions" / "abo-catalog-readout"

# --- Source dataset on the Hugging Face Hub ----------------------------------
HF_ABO_DATASET = os.getenv("ABO_HF_DATASET", "hyper3labs/amazon-berkeley-objects")
HF_ABO_CONFIG = os.getenv("ABO_HF_CONFIG", "listings")
HF_ABO_SPLIT = os.getenv("ABO_HF_SPLIT", "train")

# --- Sampling limits used when picking a balanced subset ---------------------
MAX_PRODUCT_TYPES = int(os.getenv("ABO_MAX_PRODUCT_TYPES", "20"))
SAMPLES_PER_PRODUCT_TYPE = int(os.getenv("ABO_SAMPLES_PER_PRODUCT_TYPE", "4"))
MIN_PRODUCT_TYPE_COUNT = int(os.getenv("ABO_MIN_PRODUCT_TYPE_COUNT", "10"))

# --- Image handling -----------------------------------------------------------
IMAGE_MAX_SIZE = (768, 768)
IMAGE_DOWNLOAD_TIMEOUT_SEC = int(os.getenv("ABO_IMAGE_DOWNLOAD_TIMEOUT_SEC", "20"))

# Truthy spellings ("1", "true", "yes", case-insensitive) enable a forced refresh.
FORCE_SAMPLE_REFRESH = os.getenv("HYPERVIEW_ABO_FORCE_REFRESH", "").lower() in {"1", "true", "yes"}

# Comma-separated country codes; blank entries are dropped. An empty set
# disables the country filter in hf_catalog_records().
ALLOWED_COUNTRIES = {
    code.strip()
    for code in os.getenv("ABO_ALLOWED_COUNTRIES", "US,GB,AU,CA,AE,SG,IN").split(",")
    if code.strip()
}
# The two embedding models compared side by side. Each field can be overridden
# through the environment variable named next to it.
#
# Field usage elsewhere in this file:
#   key               - index into the layouts mapping built by ensure_layouts()
#   display_name      - shown in log lines and sent to the readout extension
#   provider / model  - passed to dataset.compute_embeddings()
#   layout / metric   - passed to dataset.compute_visualization()
#   geometry, layout_dimension, panel_title - passed to hv.ui.Scatter()
MODEL_SPECS = [
    # Baseline model.
    {
        "key": "clip",
        "display_name": os.environ.get("ABO_BASELINE_DISPLAY_NAME", "CLIP"),
        "provider": os.environ.get("ABO_BASELINE_PROVIDER", "embed-anything"),
        "model": os.environ.get("ABO_BASELINE_MODEL", "openai/clip-vit-base-patch32"),
        "layout": os.environ.get("ABO_BASELINE_LAYOUT", "euclidean:2d"),
        "geometry": os.environ.get("ABO_BASELINE_GEOMETRY", "euclidean"),
        "layout_dimension": int(os.environ.get("ABO_BASELINE_LAYOUT_DIMENSION", "2")),
        "metric": os.environ.get("ABO_BASELINE_METRIC", "cosine"),
        "panel_title": os.environ.get("ABO_BASELINE_PANEL_TITLE", "CLIP - Euclidean Catalog Map"),
    },
    # Candidate model.
    {
        "key": "candidate",
        "display_name": os.environ.get("ABO_CANDIDATE_DISPLAY_NAME", "Hyper3-CLIP"),
        "provider": os.environ.get("ABO_CANDIDATE_PROVIDER", "hyper-models"),
        "model": os.environ.get("ABO_CANDIDATE_MODEL", "hyper3-clip-v0.5"),
        "layout": os.environ.get("ABO_CANDIDATE_LAYOUT", "poincare:2d"),
        "geometry": os.environ.get("ABO_CANDIDATE_GEOMETRY", "poincare"),
        "layout_dimension": int(os.environ.get("ABO_CANDIDATE_LAYOUT_DIMENSION", "2")),
        "metric": os.environ.get("ABO_CANDIDATE_METRIC", "cosine"),
        "panel_title": os.environ.get("ABO_CANDIDATE_PANEL_TITLE", "Hyper3-CLIP - Poincare Catalog Map"),
    },
]
# Hand-written walkthrough examples for the guided readout panel. Nothing here
# is computed at runtime: the list is passed unchanged to the
# "abo-catalog-readout" extension as its "examples" prop, and it is scanned to
# build DEMO_REQUIRED_SAMPLE_IDS so every referenced sample is loaded.
#
# Two entry shapes appear, distinguished by "mode":
#   * "image-neighborhood": "queryId" / "queryLabel" plus, per model key, a
#     "hits" count and a short text summary.
#   * "text-to-product": a text "query", the expected "targetTitle" and
#     "targetSampleId", per-model "rank" / "typePrecision" summaries, and a
#     ranked list of result sample IDs per model under "results".
#
# The "clip" and "candidate" keys match the "key" values in MODEL_SPECS.
# Sample IDs use the "<item_id>_<image_id>" form produced by safe_sample_id().
# NOTE(review): "hits" presumably counts same-family neighbours in a fixed-size
# neighbourhood - confirm against the extension that renders it.
DEMO_EXAMPLES = [
    # --- Image-neighbourhood examples ---
    {
        "id": "lighting",
        "mode": "image-neighborhood",
        "title": "Lighting fixture",
        "family": "Lighting",
        "guide": "Look for category drift: the same fixture should retrieve lights and lamps, not jewelry, shoes, or office products.",
        "queryId": "B07HK5WXQP_510lSNJKiyL",
        "queryLabel": "LIGHT_FIXTURE",
        "summaries": {
            "clip": {
                "hits": 4,
                "text": "Also returns earrings, sandals, office products, table, and sofa.",
            },
            "candidate": {
                "hits": 8,
                "text": "Mostly returns light fixtures and lamps.",
            },
        },
    },
    {
        "id": "chandelier",
        "mode": "image-neighborhood",
        "title": "Chandelier-style fixture",
        "family": "Lighting",
        "guide": "Look for hierarchy consistency: a chandelier-like fixture should stay inside the lighting neighborhood.",
        "queryId": "B07MF1RNWQ_51Vei4EHzBL",
        "queryLabel": "LIGHT_FIXTURE",
        "summaries": {
            "clip": {
                "hits": 4,
                "text": "Also returns earrings, accessories, office products, sofa, and home.",
            },
            "candidate": {
                "hits": 9,
                "text": "Mostly returns light fixtures and lamps.",
            },
        },
    },
    {
        "id": "footwear",
        "mode": "image-neighborhood",
        "title": "Sandal",
        "family": "Footwear",
        "guide": "Look for product-family boundaries: a sandal query should stay with sandals and shoes instead of handbags or home goods.",
        "queryId": "B07WHRRNQK_61_LTvw9qDL",
        "queryLabel": "SANDAL",
        "summaries": {
            "clip": {
                "hits": 3,
                "text": "Also returns handbags, jewelry, home goods, and accessories.",
            },
            "candidate": {
                "hits": 8,
                "text": "Mostly returns sandals, shoes, and boots.",
            },
        },
    },
    # --- Text-to-product examples ---
    {
        "id": "grey-velvet-sofa",
        "mode": "text-to-product",
        "title": "Grey velvet sofa",
        "family": "Furniture / sofa",
        "query": "a product photo of grey velvet sofa: Frederick Mid-Century Modern Tufted Velvet Sofa Couch 77.5 W Grey, with metal, brass finish, wood, velvet upholstery, tufted",
        "targetTitle": "Rivet Frederick Mid-Century Modern Tufted Velvet Sofa Couch, 77.5 W, Grey",
        "targetSampleId": "B082VLTCM4_816VP-arPsL",
        "summaries": {
            "clip": {
                "rank": 20,
                "typePrecision": 0.7,
                "text": "CLIP stays in the sofa family, but the exact grey velvet target first appears at rank 20.",
            },
            "candidate": {
                "rank": 1,
                "typePrecision": 1.0,
                "text": "Hyper3 ranks the exact grey velvet target first and keeps the sofa set coherent.",
            },
        },
        "results": {
            "candidate": [
                {"id": "B082VLTCM4_816VP-arPsL", "rank": 1, "target": True},
                {"id": "B075X4VM73_81oPx1e8s_L", "rank": 2},
                {"id": "B07B4DBBPG_81dwblf8ogL", "rank": 3},
                {"id": "B07B4N29DG_81f4K_PGY-L", "rank": 4},
                {"id": "B075X2X4GY_81RzuSu3GLL", "rank": 5},
                {"id": "B082VLYCPC_71pzr1IKkIL", "rank": 6},
            ],
            "clip": [
                {"id": "B075X4VM73_81oPx1e8s_L", "rank": 1},
                {"id": "B082VLYCPC_71pzr1IKkIL", "rank": 2},
                {"id": "B075X4F56V_81Oy-OJ68KL", "rank": 3},
                {"id": "B07J2Z2BS5_81xPV3Ey1kL", "rank": 4},
                {"id": "B07J2R9TVF_A1cLonmKWvL", "rank": 5},
                {"id": "B075X4N4W9_81Z8ICQ9dUL", "rank": 6},
            ],
        },
    },
    {
        "id": "chrome-clear-glass-lamp",
        "mode": "text-to-product",
        "title": "Chrome clear-glass lamp",
        "family": "Home / lamp",
        "query": "a product photo of chrome with clear glass lamp: Ravenna Home Modern Round Table Lamp With LED Light Bulb - Chrome with Clear Glass, with table lamp, shade, metal, chrome finish, glass",
        "targetTitle": "Ravenna Home Modern Round Table Lamp With LED Light Bulb, Chrome with Clear Glass",
        "targetSampleId": "B07DBHB6B5_41SHn0jzPOL",
        "summaries": {
            "clip": {
                "rank": 31,
                "typePrecision": 0.7,
                "text": "CLIP recognizes lamps, but loses the requested chrome and clear-glass variant.",
            },
            "candidate": {
                "rank": 3,
                "typePrecision": 0.9,
                "text": "Hyper3 keeps the requested material and finish in the first few results.",
            },
        },
        "results": {
            "candidate": [
                {"id": "B001CD1GC0_71oh4KcBXJS", "rank": 1},
                {"id": "B07DBK2P2G_41TnBzn9tFL", "rank": 2},
                {"id": "B07DBHB6B5_41SHn0jzPOL", "rank": 3, "target": True},
                {"id": "B07DBJQC4S_41Jv3-wdZ3L", "rank": 4},
                {"id": "B07DTFBDTL_414HtkR3QZL", "rank": 5},
                {"id": "B0742DR7C2_81jtRdRtkPL", "rank": 6},
            ],
            "clip": [
                {"id": "B0742DR7C2_81jtRdRtkPL", "rank": 1},
                {"id": "B07374K538_71XEFMKYAGL", "rank": 2},
                {"id": "B001CD1GFM_61lepTd-EmS", "rank": 3},
                {"id": "B07DBK2P2G_41TnBzn9tFL", "rank": 4},
                {"id": "B07HKF59YX_41yzx8pNH6L", "rank": 5},
                {"id": "B07DT153M1_418DZS6SpTL", "rank": 6},
            ],
        },
    },
    {
        "id": "silver-mid-century-chair",
        "mode": "text-to-product",
        "title": "Silver mid-century chair",
        "family": "Furniture / chair",
        "query": "a product photo of silver mid-century modern chair: Logan Mid-Century Modern Dining Chair Set of 2 20.1 W Silver, with metal, silver tone, wood, mid-century style, modern style",
        "targetTitle": "Rivet Logan Mid-Century Modern Dining Chair, Set of 2, 20.1 W, Silver",
        "targetSampleId": "B0853L3W72_81mvF2gJy5L",
        "summaries": {
            "clip": {
                "rank": 28,
                "typePrecision": 0.3,
                "text": "CLIP drifts across nearby furniture and misses the exact silver chair until rank 28.",
            },
            "candidate": {
                "rank": 2,
                "typePrecision": 0.8,
                "text": "Hyper3 brings the requested chair variant to rank 2 while preserving product type.",
            },
        },
        "results": {
            "candidate": [
                {"id": "B0746HFVWY_91-Pi95Hy9L", "rank": 1},
                {"id": "B0853L3W72_81mvF2gJy5L", "rank": 2, "target": True},
                {"id": "B0853KZW7Z_71LY8AELiuL", "rank": 3},
                {"id": "B07B7B244W_71DNWOd-5bL", "rank": 4},
                {"id": "B07B7B3RVS_7196wQbG7KL", "rank": 5},
                {"id": "B075YN3HN9_81Rub8QQCvL", "rank": 6},
            ],
            "clip": [
                {"id": "B075YPTG2Q_710tYWSG8SL", "rank": 1},
                {"id": "B07HSB4WV6_71k4-giCqJL", "rank": 2},
                {"id": "B075YP3C53_81tDegaK7zL", "rank": 3},
                {"id": "B07HSLKKXG_81MUeTeLXVL", "rank": 4},
                {"id": "B07HSK3534_81WypZMR_2L", "rank": 5},
                {"id": "B0853KZW7Z_71LY8AELiuL", "rank": 6},
            ],
        },
    },
]
# Query samples referenced by the examples that carry a "queryId".
DEMO_QUERY_IDS = {entry["queryId"] for entry in DEMO_EXAMPLES if "queryId" in entry}

# Every sample ID mentioned anywhere in DEMO_EXAMPLES (queries, targets and
# ranked results). select_balanced() appends these to the balanced selection.
DEMO_REQUIRED_SAMPLE_IDS = set(DEMO_QUERY_IDS)
for example in DEMO_EXAMPLES:
    if target_sample_id := example.get("targetSampleId"):
        DEMO_REQUIRED_SAMPLE_IDS.add(str(target_sample_id))
    # "results" maps a model key to its ranked result entries; it may be absent.
    for model_results in (example.get("results") or {}).values():
        for result in model_results or ():
            sample_id = result.get("id")
            if sample_id:
                DEMO_REQUIRED_SAMPLE_IDS.add(str(sample_id))
def media_root() -> Path:
    """Return the media directory for this dataset, creating it if missing.

    The base directory comes from ``HYPERVIEW_MEDIA_DIR`` and defaults to
    ``<SPACE_DIR>/demo_data/media``; the dataset name is appended as a
    subdirectory.
    """
    default_base = SPACE_DIR / "demo_data" / "media"
    base_dir = Path(os.environ.get("HYPERVIEW_MEDIA_DIR", str(default_base)))
    dataset_dir = base_dir / DATASET_NAME
    dataset_dir.mkdir(parents=True, exist_ok=True)
    return dataset_dir
def readable_product_type(label: str | None) -> str:
if not label:
return ""
text = label.replace("_", " ").replace("-", " ").lower()
return re.sub(r"\s+", " ", text).strip()
def safe_sample_id(item_id: str, image_id: str) -> str:
    """Build a filesystem-friendly sample ID from an item ID and an image ID.

    The two IDs are joined with an underscore. Each run of characters outside
    ``A-Z a-z 0-9 _ . -`` becomes a single underscore, leading and trailing
    underscores are removed, and the result is cut to at most 96 characters.
    """
    sanitized = re.sub(r"[^A-Za-z0-9_.-]+", "_", f"{item_id}_{image_id}")
    trimmed = sanitized.strip("_")
    return trimmed[:96]
def select_balanced(records: list[dict]) -> list[dict]:
    """Pick a per-product-type balanced subset, plus every demo-required record.

    Product types with fewer than ``MIN_PRODUCT_TYPE_COUNT`` records are
    ignored. The remaining types are ordered by size (largest first, then by
    name); the first ``MAX_PRODUCT_TYPES`` each contribute their first
    ``SAMPLES_PER_PRODUCT_TYPE`` records. Any record whose sample ID is in
    ``DEMO_REQUIRED_SAMPLE_IDS`` and was not already chosen is appended.
    """
    by_type: dict[str, list[dict]] = defaultdict(list)
    for rec in records:
        by_type[rec["product_type"]].append(rec)

    ranked = sorted(
        (
            (type_name, members)
            for type_name, members in by_type.items()
            if len(members) >= MIN_PRODUCT_TYPE_COUNT
        ),
        key=lambda pair: (-len(pair[1]), pair[0]),
    )

    chosen: list[dict] = []
    for _type_name, members in ranked[:MAX_PRODUCT_TYPES]:
        chosen.extend(members[:SAMPLES_PER_PRODUCT_TYPE])

    def sample_id_of(rec: dict) -> str:
        return safe_sample_id(str(rec["item_id"]), str(rec["image_id"]))

    chosen_ids = {sample_id_of(rec) for rec in chosen}
    for rec in records:
        rec_id = sample_id_of(rec)
        if rec_id not in DEMO_REQUIRED_SAMPLE_IDS or rec_id in chosen_ids:
            continue
        # Required by a demo example but not picked by the balanced pass.
        chosen.append(rec)
        chosen_ids.add(rec_id)
    return chosen
def download_product_image(record: dict, destination: Path) -> bool:
    """Download a product image, normalize it, and store it as a JPEG.

    Args:
        record: Catalog record; its ``"image_url"`` entry is the download URL.
        destination: Final path of the normalized JPEG.

    Returns:
        ``True`` when ``destination`` already exists with content or was
        written successfully; ``False`` when the record has no URL or any
        step of the download or conversion failed.
    """
    # A non-empty file from an earlier run is treated as a cache hit.
    if destination.exists() and destination.stat().st_size > 0:
        return True

    url = record.get("image_url")
    if not url:
        return False

    raw_path = destination.with_suffix(destination.suffix + ".download")
    tmp_path = destination.with_suffix(destination.suffix + ".tmp")
    try:
        with urllib.request.urlopen(url, timeout=IMAGE_DOWNLOAD_TIMEOUT_SEC) as response, raw_path.open(
            "wb"
        ) as handle:
            shutil.copyfileobj(response, handle)

        # Close the source file deterministically. Image.open() is lazy, and an
        # open handle would leak and could block the unlink below on Windows.
        with Image.open(raw_path) as raw_image:
            image = ImageOps.exif_transpose(raw_image).convert("RGB")

        image.thumbnail(IMAGE_MAX_SIZE, Image.Resampling.LANCZOS)
        image.save(tmp_path, format="JPEG", quality=90, optimize=True)
        # Write to a temp file, then rename, so a partial JPEG never sits at
        # the destination path.
        tmp_path.replace(destination)
        return True
    except Exception as exc:
        # Deliberately best-effort: one bad image must not abort the whole run.
        print(f"Skipping image {url}: {exc}", flush=True)
        return False
    finally:
        raw_path.unlink(missing_ok=True)
        tmp_path.unlink(missing_ok=True)
def hf_catalog_records() -> list[dict]:
    """Load ABO listings from the Hugging Face dataset as catalog records.

    Rows are dropped when their country is outside ``ALLOWED_COUNTRIES`` (if
    that set is non-empty) or when any of title, product type, main image ID,
    department or main image URL is missing or empty.
    """
    print(f"Loading ABO listings from Hugging Face dataset {HF_ABO_DATASET}...", flush=True)
    listings = load_dataset(HF_ABO_DATASET, HF_ABO_CONFIG, split=HF_ABO_SPLIT)

    required_fields = ("title", "product_type", "main_image_id", "department", "main_image_url")
    catalog: list[dict] = []
    for row in listings:
        country_allowed = not ALLOWED_COUNTRIES or row.get("country") in ALLOWED_COUNTRIES
        if not country_allowed:
            continue
        if not all(row.get(field) for field in required_fields):
            continue

        product_type = row.get("product_type")
        # Prefer the dataset's readable label; otherwise derive one.
        readable = row.get("product_type_readable") or readable_product_type(product_type)
        catalog.append(
            {
                "item_id": row.get("item_id"),
                "title": row.get("title"),
                "product_type": product_type,
                "product_type_readable": readable,
                "department": row.get("department"),
                "country": row.get("country"),
                "brand": row.get("brand"),
                "color": row.get("color"),
                "style": row.get("style"),
                "image_id": row.get("main_image_id"),
                "image_url": row.get("main_image_url"),
                "source": HF_ABO_DATASET,
            }
        )
    return catalog
def prepare_catalog_records() -> list[dict]:
    """Load the catalog, reduce it to the balanced demo subset, and log its size."""
    selection = select_balanced(hf_catalog_records())
    distinct_types = {entry["product_type"] for entry in selection}
    print(
        f"Selected {len(selection)} ABO products across "
        f"{len(distinct_types)} product types.",
        flush=True,
    )
    return selection
def add_abo_samples(dataset: hv.Dataset) -> None:
    """Download media for the selected ABO products and add them to ``dataset``.

    Each record becomes a ``Sample`` whose ID comes from ``safe_sample_id``,
    whose label is the product type, and whose metadata is the record plus a
    ``"hierarchy"`` string. Records whose image cannot be prepared are counted
    as skipped and left out.

    Existing rows are passed over (``skip_existing``) unless
    ``FORCE_SAMPLE_REFRESH`` is set. ``dataset.add_samples`` is expected to
    return a pair, unpacked here as ``(upserted, skipped_existing)``.

    Args:
        dataset: HyperView dataset that receives the samples.
    """
    existing_ids = {sample.id for sample in dataset.samples}
    media_dir = media_root()
    records = prepare_catalog_records()

    # Compute each sample ID once; it is needed for the cache check and the loop.
    keyed_records = [
        (safe_sample_id(str(record["item_id"]), str(record["image_id"])), record)
        for record in records
    ]
    expected_ids = {sample_id for sample_id, _record in keyed_records}
    missing_ids = expected_ids - existing_ids
    has_missing_media = any(
        not (media_dir / f"{sample_id}.jpg").exists() for sample_id in expected_ids
    )
    if not FORCE_SAMPLE_REFRESH and not missing_ids and not has_missing_media:
        # Informational only: the loop below still runs, but cached media and
        # skip_existing turn it into a cheap pass.
        print(
            f"ABO samples already prepared ({len(records)} products). "
            "Existing sample rows will be reused.",
            flush=True,
        )

    skipped = 0
    product_counts: Counter[str] = Counter()
    samples: list[Sample] = []
    total = len(keyed_records)
    for index, (sample_id, record) in enumerate(keyed_records, start=1):
        destination = media_dir / f"{sample_id}.jpg"
        if download_product_image(record, destination):
            metadata = dict(record)
            metadata["hierarchy"] = f"{record['department']} -> {record['product_type_readable']}"
            samples.append(
                Sample(
                    id=sample_id,
                    filepath=str(destination),
                    label=record["product_type"],
                    metadata=metadata,
                )
            )
            product_counts[record["product_type"]] += 1
        else:
            skipped += 1

        # Report progress for skipped records too, so the first, every 50th
        # and the final line are never silently dropped.
        if index == 1 or index % 50 == 0 or index == total:
            print(
                f"Prepared media for {index}/{total} products "
                f"({skipped} skipped).",
                flush=True,
            )

    upserted, skipped_existing = dataset.add_samples(
        samples,
        skip_existing=not FORCE_SAMPLE_REFRESH,
    )
    # Rows can only be updated in place when a refresh is forced.
    updated = (
        sum(1 for sample in samples if sample.id in existing_ids)
        if FORCE_SAMPLE_REFRESH
        else 0
    )
    added = upserted - updated
    if skipped_existing:
        print(f"Skipped {skipped_existing} existing ABO sample rows.", flush=True)
    print(f"Prepared ABO samples ({added} added, {updated} updated, {skipped} media skipped).", flush=True)
    print(f"Product-type counts: {dict(product_counts)}", flush=True)
def ensure_layouts(dataset: hv.Dataset) -> dict[str, str]:
    """Compute embeddings and a 2D layout for every model in ``MODEL_SPECS``.

    Returns:
        A mapping from each spec's ``"key"`` to the value returned by
        ``dataset.compute_visualization`` for that model.
    """
    layout_keys: dict[str, str] = {}
    for spec in MODEL_SPECS:
        model_name = spec["display_name"]

        print(f"Ensuring {model_name} embeddings...", flush=True)
        embedding_space = dataset.compute_embeddings(
            model=spec["model"],
            provider=spec["provider"],
            batch_size=32,
            show_progress=True,
        )

        print(f"Ensuring {model_name} layout...", flush=True)
        layout_key = dataset.compute_visualization(
            space_key=embedding_space,
            layout=spec["layout"],
            n_neighbors=20,
            min_dist=0.08,
            metric=spec["metric"],
        )
        layout_keys[spec["key"]] = layout_key
    return layout_keys
def build_dataset() -> tuple[hv.Dataset, dict[str, str]]:
    """Create the HyperView dataset, fill it with ABO samples, and compute layouts."""
    abo_dataset = hv.Dataset(DATASET_NAME)
    add_abo_samples(abo_dataset)
    return abo_dataset, ensure_layouts(abo_dataset)
def model_panel_props(layouts: dict[str, str]) -> list[dict[str, Any]]:
    """Describe each model for the readout extension.

    Returns one dict per entry in ``MODEL_SPECS`` with the keys ``"key"``,
    ``"displayName"`` and ``"layoutKey"``.
    """

    def panel_entry(spec: dict[str, Any]) -> dict[str, Any]:
        layout_key = layouts[spec["key"]]
        return {
            "key": spec["key"],
            "displayName": spec["display_name"],
            "layoutKey": layout_key,
        }

    return [panel_entry(spec) for spec in MODEL_SPECS]
def build_demo_view(layouts: dict[str, str]) -> hv.ui.View:
    """Assemble the demo view: sample grid, one scatter map per model, readout.

    The first scatter panel is placed to the right of the sample grid; each
    later one is placed below the first scatter panel. The readout extension
    panel receives the model descriptions and ``DEMO_EXAMPLES`` as props.
    """
    grid_width = int(os.environ.get("ABO_SAMPLES_PANEL_WIDTH", "220"))
    samples_panel = hv.ui.Samples(
        id="grid",
        title="Samples",
        position="center",
        layout=hv.ui.PanelLayout(width=grid_width, min_width=180, min_height=360),
    )

    scatter_panels = []
    for spec in MODEL_SPECS:
        if scatter_panels:
            # Later maps stack underneath the first map.
            anchor_id, direction = scatter_panels[0].id, "below"
        else:
            # The first map is anchored to the sample grid.
            anchor_id, direction = "grid", "right"
        scatter_panels.append(
            hv.ui.Scatter(
                id=f"{spec['key']}-catalog-map",
                title=spec["panel_title"],
                layout_key=layouts[spec["key"]],
                geometry=spec["geometry"],
                layout_dimension=spec["layout_dimension"],
                reference_panel_id=anchor_id,
                direction=direction,
                layout=hv.ui.PanelLayout(min_width=420, min_height=240),
            )
        )

    readout_width = int(os.environ.get("ABO_READOUT_PANEL_WIDTH", "320"))
    readout_panel = hv.ui.ExtensionPanel(
        id="catalog-hierarchy-readout",
        extension="abo-catalog-readout",
        panel="catalog-comparison",
        position="right",
        layout=hv.ui.PanelLayout(width=readout_width, min_width=300),
        props={
            "models": model_panel_props(layouts),
            "examples": DEMO_EXAMPLES,
        },
    )
    return hv.ui.View(samples_panel, *scatter_panels, readout_panel)
def launch_demo(dataset: hv.Dataset, layouts: dict[str, str]) -> hv.Session:
    """Launch HyperView, install the demo extension, and apply the demo view.

    After the view is applied, the active layout is set to ``None`` and the
    selection to an empty list so the demo starts without a query. The session
    is launched with ``block=False`` and returned to the caller.
    """
    launch_options = {
        "host": SPACE_HOST,
        "port": SPACE_PORT,
        "open_browser": False,
        "workspace_id": WORKSPACE_ID,
        "block": False,
    }
    demo_session = hv.launch(dataset, **launch_options)

    print("Installing ABO demo extension...", flush=True)
    demo_session.ui.add_extension(EXTENSION_DIR, workspace_id=WORKSPACE_ID)

    print("Applying ABO stacked comparison demo view...", flush=True)
    demo_view = build_demo_view(layouts)
    demo_session.ui.apply_view(demo_view, workspace_id=WORKSPACE_ID)

    print("Clearing initial query state...", flush=True)
    demo_session.ui.set_active_layout(None, workspace_id=WORKSPACE_ID)
    demo_session.ui.set_selection([], workspace_id=WORKSPACE_ID)

    print(f"\nHyperView ABO catalog demo is running at {demo_session.url}", flush=True)
    display_names = [spec["display_name"] for spec in MODEL_SPECS]
    model_names = " and ".join(display_names)
    print(
        f" Samples, {model_names} stacked maps, and the guided readout are pinned.",
        flush=True,
    )
    print(" Press Ctrl+C to stop.\n", flush=True)
    return demo_session
def main() -> None:
    """Build the dataset, print the layout key per model, then serve the demo."""
    dataset, layouts = build_dataset()

    print("Layouts:", flush=True)
    for spec in MODEL_SPECS:
        name, layout_key = spec["display_name"], layouts[spec["key"]]
        print(f" {name}: {layout_key}", flush=True)

    # Wait on the launched session until it ends.
    launch_demo(dataset, layouts).wait()


if __name__ == "__main__":
    main()