"""Build the candidate model pool consumed by the recommendation web app. The output is a single .npz that bundles, for every candidate model: - model_name (str) - size_id (int, bucket id matching the trained MLPMetric) - family_id (int) - popularity (int, HF downloads in the last 30d; 0 if unknown) - hf_url (str, https://huggingface.co/ if name looks like a repo id) Run from the project root: python web/build_model_pool.py \ --data-dir data/unified_augmented \ --args checkpoint/mlp/unified_augmented/ablation_no_model_id_no_dataset_id/args.json \ --out web/assets/model_pool.npz """ from __future__ import annotations import argparse import json import os import numpy as np SIZE_EDGES_DEFAULT = [ 0.001, 0.003, 0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1, 3, 7, 14, 35, 70, 100, 1000, ] def assign_size_bucket(size_b: float, size_edges: np.ndarray, unknown_id: int) -> int: try: x = float(size_b) except (TypeError, ValueError): return unknown_id if not np.isfinite(x) or x == 0.0: return unknown_id return int(np.searchsorted(size_edges, x, side="right")) def get_size_b(profile_entry) -> float: if not isinstance(profile_entry, dict): return float("nan") size = profile_entry.get("size") try: if isinstance(size, str) and size.strip().lower() == "unknown": return float("nan") x = float(size) return x if x != 0.0 else float("nan") except Exception: return float("nan") def hf_url_for(name: str) -> str: return f"https://huggingface.co/{name}" if "/" in name else "" def main(argv=None): p = argparse.ArgumentParser() p.add_argument("--data-dir", default="data/unified_augmented") p.add_argument( "--args", default="checkpoint/mlp/unified_augmented/ablation_no_model_id_no_dataset_id/args.json", help="Path to the training args.json — used to read size_bucket so bucket ids align with the checkpoint.", ) p.add_argument("--out", default="web/assets/model_pool.npz") p.add_argument( "--min-popularity", type=int, default=0, help="Drop candidate models with HF download count below this. 0 keeps all.", ) args = p.parse_args(argv) os.makedirs(os.path.dirname(args.out), exist_ok=True) with open(os.path.join(args.data_dir, "model2id.json")) as f: model2id = json.load(f) with open(os.path.join(args.data_dir, "model2family.json")) as f: model2family = json.load(f) with open(os.path.join(args.data_dir, "family2id.json")) as f: family2id = json.load(f) with open(os.path.join(args.data_dir, "model_profile.json")) as f: model_profile = json.load(f) pop_path = os.path.join(args.data_dir, "model_popularity.json") pop_map = {} if os.path.exists(pop_path): pop_doc = json.load(open(pop_path)) # Doc shape: {fetched_at, source, num_models, status_counts, models: {name: {downloads, status}}} models_field = pop_doc.get("models", pop_doc) for name, entry in models_field.items(): if isinstance(entry, dict): pop_map[name] = int(entry.get("downloads", 0) or 0) else: try: pop_map[name] = int(entry) except Exception: pop_map[name] = 0 if os.path.exists(args.args): train_args = json.load(open(args.args)) size_edges = np.array(train_args.get("size_bucket", SIZE_EDGES_DEFAULT), dtype=float) else: size_edges = np.array(SIZE_EDGES_DEFAULT, dtype=float) unknown_size_id = len(size_edges) + 1 unknown_family_id = family2id.get("unknown", len(family2id) - 1) names = [] size_ids = [] sizes_b = [] family_ids = [] popularities = [] urls = [] dropped_pop = 0 for name in model2id.keys(): pop = pop_map.get(name, 0) if pop < args.min_popularity: dropped_pop += 1 continue size_b = get_size_b(model_profile.get(name)) sid = assign_size_bucket(size_b, size_edges, unknown_size_id) fam = model2family.get(name, "unknown") fid = family2id.get(fam, unknown_family_id) names.append(name) size_ids.append(sid) sizes_b.append(size_b) # NaN means unknown family_ids.append(fid) popularities.append(pop) urls.append(hf_url_for(name)) names_arr = np.array(names, dtype=object) size_arr = np.array(size_ids, dtype=np.int64) sizes_b_arr = np.array(sizes_b, dtype=np.float32) fam_arr = np.array(family_ids, dtype=np.int64) pop_arr = np.array(popularities, dtype=np.int64) url_arr = np.array(urls, dtype=object) np.savez( args.out, names=names_arr, size_ids=size_arr, sizes_b=sizes_b_arr, family_ids=fam_arr, popularities=pop_arr, urls=url_arr, ) print(f"Wrote {len(names)} models to {args.out} (dropped {dropped_pop} below min-popularity={args.min_popularity})") print(f" unique families: {len(set(family_ids))}, unique size buckets: {len(set(size_ids))}") print(f" models with HF URL: {sum(1 for u in urls if u)} / {len(urls)}") if __name__ == "__main__": main()