# ModelLens / build_model_pool.py
# Author: luisrui
# Commit c330598 — Deploy ModelLens v1: BYOK OpenAI key, size filter,
# official-only filter, 47k HF model pool
"""Build the candidate model pool consumed by the recommendation web app.
The output is a single .npz that bundles, for every candidate model:
- model_name (str)
- size_id (int, bucket id matching the trained MLPMetric)
- family_id (int)
- popularity (int, HF downloads in the last 30d; 0 if unknown)
- hf_url (str, https://huggingface.co/<name> if name looks like a repo id)
Run from the project root:
python web/build_model_pool.py \
--data-dir data/unified_augmented \
--args checkpoint/mlp/unified_augmented/ablation_no_model_id_no_dataset_id/args.json \
--out web/assets/model_pool.npz
"""
from __future__ import annotations
import argparse
import json
import os
import numpy as np
# Fallback size-bucket edges, in billions of parameters. Used only when the
# training args.json is unavailable; otherwise edges are read from that file
# so bucket ids stay aligned with the trained MLPMetric checkpoint.
SIZE_EDGES_DEFAULT = [
    0.001, 0.003, 0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.3, 0.4,
    0.5, 0.6, 0.8, 1, 3, 7, 14, 35, 70, 100, 1000,
]
def assign_size_bucket(size_b: float, size_edges: np.ndarray, unknown_id: int) -> int:
    """Map a model size (billions of parameters) to its size-bucket id.

    A size equal to an edge lands in the bucket to its right (``side="right"``).
    Non-numeric, non-finite, or exactly-zero sizes map to ``unknown_id``.
    """
    try:
        value = float(size_b)
    except (TypeError, ValueError):
        return unknown_id
    if np.isfinite(value) and value != 0.0:
        return int(np.searchsorted(size_edges, value, side="right"))
    return unknown_id
def get_size_b(profile_entry) -> float:
    """Extract a model's size in billions of parameters from its profile entry.

    Args:
        profile_entry: One value from model_profile.json; expected to be a
            dict with a "size" key, but any shape is tolerated.

    Returns:
        The size as a float, or NaN when the entry is not a dict, the size is
        the literal string "unknown" (case/whitespace-insensitive), the size
        cannot be parsed as a number, or the recorded size is zero.
    """
    if not isinstance(profile_entry, dict):
        return float("nan")
    size = profile_entry.get("size")
    if isinstance(size, str) and size.strip().lower() == "unknown":
        return float("nan")
    try:
        # Narrowed from a bare `except Exception`: float() only raises
        # TypeError (e.g. None, dict) or ValueError (unparseable string) here.
        x = float(size)
    except (TypeError, ValueError):
        return float("nan")
    # A recorded size of 0 means "unknown" upstream, not a 0-parameter model.
    return x if x != 0.0 else float("nan")
def hf_url_for(name: str) -> str:
    """Return the Hugging Face URL for *name*, or "" if it doesn't look like a repo id."""
    if "/" not in name:
        return ""
    return f"https://huggingface.co/{name}"
def main(argv=None):
    """Build the candidate model pool and save it as a single .npz archive.

    Reads the id/family/profile mappings (and optional popularity counts)
    from --data-dir, buckets each model's size using the edges from the
    training args.json (so ids align with the checkpoint), and writes
    parallel arrays (names, size_ids, sizes_b, family_ids, popularities,
    urls) to --out.

    Args:
        argv: Optional argument list for argparse; defaults to sys.argv.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--data-dir", default="data/unified_augmented")
    p.add_argument(
        "--args",
        default="checkpoint/mlp/unified_augmented/ablation_no_model_id_no_dataset_id/args.json",
        help="Path to the training args.json — used to read size_bucket so bucket ids align with the checkpoint.",
    )
    p.add_argument("--out", default="web/assets/model_pool.npz")
    p.add_argument(
        "--min-popularity",
        type=int,
        default=0,
        help="Drop candidate models with HF download count below this. 0 keeps all.",
    )
    args = p.parse_args(argv)

    # dirname is "" when --out has no directory component, and
    # os.makedirs("") raises FileNotFoundError — only create when non-empty.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    def _load_json(path):
        # Context manager so handles are closed deterministically
        # (the original `json.load(open(path))` leaked file descriptors).
        with open(path) as f:
            return json.load(f)

    model2id = _load_json(os.path.join(args.data_dir, "model2id.json"))
    model2family = _load_json(os.path.join(args.data_dir, "model2family.json"))
    family2id = _load_json(os.path.join(args.data_dir, "family2id.json"))
    model_profile = _load_json(os.path.join(args.data_dir, "model_profile.json"))

    # Optional popularity file; missing file means every model gets 0 downloads.
    pop_path = os.path.join(args.data_dir, "model_popularity.json")
    pop_map = {}
    if os.path.exists(pop_path):
        pop_doc = _load_json(pop_path)
        # Doc shape: {fetched_at, source, num_models, status_counts, models: {name: {downloads, status}}}
        # — but tolerate a flat {name: downloads} mapping as well.
        models_field = pop_doc.get("models", pop_doc)
        for name, entry in models_field.items():
            if isinstance(entry, dict):
                pop_map[name] = int(entry.get("downloads", 0) or 0)
            else:
                try:
                    pop_map[name] = int(entry)
                except (TypeError, ValueError):
                    pop_map[name] = 0

    # Size-bucket edges must match the checkpoint; fall back to the module
    # defaults only when the training args.json is unavailable.
    if os.path.exists(args.args):
        train_args = _load_json(args.args)
        size_edges = np.array(train_args.get("size_bucket", SIZE_EDGES_DEFAULT), dtype=float)
    else:
        size_edges = np.array(SIZE_EDGES_DEFAULT, dtype=float)
    # searchsorted can return at most len(size_edges), so len+1 is a free id.
    unknown_size_id = len(size_edges) + 1
    unknown_family_id = family2id.get("unknown", len(family2id) - 1)

    names, size_ids, sizes_b = [], [], []
    family_ids, popularities, urls = [], [], []
    dropped_pop = 0
    for name in model2id:
        pop = pop_map.get(name, 0)
        if pop < args.min_popularity:
            dropped_pop += 1
            continue
        size_b = get_size_b(model_profile.get(name))
        fam = model2family.get(name, "unknown")
        names.append(name)
        size_ids.append(assign_size_bucket(size_b, size_edges, unknown_size_id))
        sizes_b.append(size_b)  # NaN means unknown
        family_ids.append(family2id.get(fam, unknown_family_id))
        popularities.append(pop)
        urls.append(hf_url_for(name))

    np.savez(
        args.out,
        names=np.array(names, dtype=object),
        size_ids=np.array(size_ids, dtype=np.int64),
        sizes_b=np.array(sizes_b, dtype=np.float32),
        family_ids=np.array(family_ids, dtype=np.int64),
        popularities=np.array(popularities, dtype=np.int64),
        urls=np.array(urls, dtype=object),
    )
    print(f"Wrote {len(names)} models to {args.out} (dropped {dropped_pop} below min-popularity={args.min_popularity})")
    print(f" unique families: {len(set(family_ids))}, unique size buckets: {len(set(size_ids))}")
    print(f" models with HF URL: {sum(1 for u in urls if u)} / {len(urls)}")
# Entry-point guard: allows importing this module without side effects.
if __name__ == "__main__":
    main()