darkolorin committed · verified
Commit 2d9cea5 · 1 Parent(s): e407ec3

Upload ModernBERT router checkpoint (PID loss, utility=0.9762)

README.md ADDED
@@ -0,0 +1,117 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model: answerdotai/ModernBERT-base
+ tags:
+ - router
+ - llm-routing
+ - modernbert
+ - text-classification
+ - on-device
+ pipeline_tag: text-classification
+ datasets:
+ - custom
+ metrics:
+ - accuracy
+ language:
+ - en
+ ---
+
+ # Vibe Router — ModernBERT
+
+ A tiny LLM router that decides whether a chat request should run **locally** (on-device) or in the **cloud**, built on [ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base).
+
+ ## How it works
+
+ Given a user prompt, the model outputs a single logit. After a sigmoid, probabilities above the threshold (0.371) route the request to the cloud; at or below, it stays on device.
+
+ - **Device model**: [LiquidAI/LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct) (runs locally via MLX)
+ - **Cloud model**: GPT-5.2
+
+ ## Training
+
+ Fine-tuned end-to-end from `answerdotai/ModernBERT-base` using **Privileged Information Distillation (PID)** loss on 5,318 labeled prompt pairs with soft teacher labels derived from a GPT-4o judge.
+
+ | Hyperparameter | Value |
+ |----------------|-------|
+ | Learning rate | 2e-5 |
+ | β_kl | 0.05 |
+ | Weight decay | 0.01 |
+ | Warmup ratio | 0.1 |
+ | Epochs | 3 (early stopping) |
+ | Batch size | 32 |
+ | Hardware | NVIDIA H100 80GB |
+
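+ The exact form of the PID objective is not spelled out in this repo; below is a minimal sketch of one plausible formulation, assuming a binary cross-entropy term on the hard route label plus a β_kl-weighted KL term pulling the student toward the judge's soft label (the `pid_loss` name and tensor shapes are illustrative, not the training code):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def pid_loss(logits, hard_labels, soft_labels, beta_kl=0.05):
+     """Hypothetical PID sketch: BCE on hard labels + beta_kl * KL(teacher || student)."""
+     logits = logits.squeeze(-1)                        # single-logit head
+     bce = F.binary_cross_entropy_with_logits(logits, hard_labels.float())
+     p = torch.sigmoid(logits).clamp(1e-6, 1 - 1e-6)   # student p(cloud)
+     q = soft_labels.clamp(1e-6, 1 - 1e-6)             # GPT-4o judge soft label
+     kl = (q * (q / p).log() + (1 - q) * ((1 - q) / (1 - p)).log()).mean()
+     return bce + beta_kl * kl
+ ```
+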
+ ## Performance
+
+ | Metric | Value |
+ |--------|-------|
+ | Utility | 0.9762 |
+ | Cloud rate | 79.4% |
+ | Regret | 0.0064 |
+ | Catastrophic miss rate | 0.0% |
+ | ECE | 0.173 |
+ | Best threshold | 0.371 |
+
+ ### Baselines
+
+ | Model | Utility | Cloud% | Regret |
+ |-------|---------|--------|--------|
+ | Always device | 0.879 | 0% | 0.104 |
+ | Always cloud | 0.894 | 100% | 0.089 |
+ | **ModernBERT (PID)** | **0.976** | **79.4%** | **0.006** |
+
+ ## Latency
+
+ ~7 ms per inference on an NVIDIA GPU; ~10 ms on Apple Silicon (MPS backend).
+
+ ## Usage
+
+ ```python
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ import torch
+
+ model_id = "trymirai/vibe-router-modernbert"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)
+ model.eval()
+
+ prompt = "Write a Python B-tree implementation"
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+
+ with torch.no_grad():
+     logits = model(**inputs).logits          # single-logit head
+     p_cloud = torch.sigmoid(logits).item()   # probability of routing to cloud
+
+ threshold = 0.371  # best_threshold from router_config.json
+ decision = "cloud" if p_cloud > threshold else "device"
+ print(f"p(cloud)={p_cloud:.3f} → {decision}")
+ ```
+
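+ To sanity-check the latency figures above on your own hardware, a rough timing loop continuing from the snippet (illustrative only; move `model` and `inputs` to `"cuda"` or `"mps"` to approximate the GPU numbers):
+
+ ```python
+ import time
+
+ with torch.no_grad():
+     for _ in range(10):    # warmup
+         model(**inputs)
+     t0 = time.perf_counter()
+     for _ in range(100):
+         model(**inputs)
+ print(f"{(time.perf_counter() - t0) / 100 * 1e3:.1f} ms/inference")
+ ```
+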
+ ## Routing examples
+
+ | Prompt | p(cloud) | Decision |
+ |--------|----------|----------|
+ | hi | 0.011 | device |
+ | 2+2 | 0.009 | device |
+ | tell me a joke | 0.012 | device |
+ | hello | 0.011 | device |
+ | Write a Python B-tree with insert, delete, search | 0.911 | cloud |
+ | Implement a REST API with auth and rate limiting | 0.762 | cloud |
+ | Derive the volume of a sphere using integration | 0.900 | cloud |
+ | Who was the first host of Top Chef? | 0.946 | cloud |
+
+ ## License
+
+ Apache 2.0
+
+ ## Citation
+
+ ```bibtex
+ @misc{vibe-router-2026,
+   title={Vibe Router: On-Device LLM Routing with Privileged Information Distillation},
+   author={Mirai},
+   year={2026},
+   url={https://github.com/trymirai/vibe_router}
+ }
+ ```
config.json ADDED
@@ -0,0 +1,83 @@
+ {
+   "architectures": [
+     "ModernBertForSequenceClassification"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 50281,
+   "classifier_activation": "gelu",
+   "classifier_bias": false,
+   "classifier_dropout": 0.0,
+   "classifier_pooling": "mean",
+   "cls_token_id": 50281,
+   "decoder_bias": true,
+   "deterministic_flash_attn": false,
+   "dtype": "float32",
+   "embedding_dropout": 0.0,
+   "eos_token_id": 50282,
+   "global_attn_every_n_layers": 3,
+   "gradient_checkpointing": false,
+   "hidden_activation": "gelu",
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_cutoff_factor": 2.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 1152,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_eps": 1e-05,
+   "layer_types": [
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention"
+   ],
+   "local_attention": 128,
+   "max_position_embeddings": 8192,
+   "mlp_bias": false,
+   "mlp_dropout": 0.0,
+   "model_type": "modernbert",
+   "norm_bias": false,
+   "norm_eps": 1e-05,
+   "num_attention_heads": 12,
+   "num_hidden_layers": 22,
+   "pad_token_id": 50283,
+   "position_embedding_type": "absolute",
+   "rope_parameters": {
+     "full_attention": {
+       "rope_theta": 160000.0,
+       "rope_type": "default"
+     },
+     "sliding_attention": {
+       "rope_theta": 10000.0,
+       "rope_type": "default"
+     }
+   },
+   "sep_token_id": 50282,
+   "sparse_pred_ignore_index": -100,
+   "sparse_prediction": false,
+   "tie_word_embeddings": true,
+   "transformers_version": "5.2.0",
+   "vocab_size": 50368
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb128103dab9e2938e447b4079d4f0bb3034e2f26cfd2668159f37aeaa54f67f
+ size 598436708
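A quick consistency check: at float32 (per `config.json`, 4 bytes per parameter), 598,436,708 bytes works out to roughly 149.6M parameters, in line with ModernBERT-base plus the single-logit classification head.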
router_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "base_model": "answerdotai/ModernBERT-base",
+   "best_threshold": 0.37105263157894736,
+   "loss": "PID",
+   "hp": {
+     "lr": 2e-05,
+     "beta_kl": 0.05,
+     "weight_decay": 0.01,
+     "warmup_ratio": 0.1
+   },
+   "device_model": "LiquidAI/LFM2.5-1.2B-Instruct",
+   "cloud_model": "gpt-5.2",
+   "test_results": {
+     "utility": 0.9762406349182129,
+     "cloud_rate": 0.7944862155388471,
+     "regret": 0.006434837356209755,
+     "cat_miss": 0.0
+   }
+ }
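The README's 0.371 is this `best_threshold` rounded; rather than hard-coding it, the shipped config can be read at runtime. A minimal sketch, assuming only `huggingface_hub` and the repo id from the Usage section:

```python
import json
from huggingface_hub import hf_hub_download

# Fetch the router config shipped with the checkpoint and read its threshold.
path = hf_hub_download("trymirai/vibe-router-modernbert", "router_config.json")
with open(path) as f:
    cfg = json.load(f)
threshold = cfg["best_threshold"]  # 0.37105..., rounded to 0.371 in the README
```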
sweep_results.json ADDED
@@ -0,0 +1,62 @@
+ [
+   {
+     "hp": {
+       "lr": 1e-05,
+       "beta_kl": 0.05,
+       "weight_decay": 0.01,
+       "warmup_ratio": 0.1
+     },
+     "val_loss": 0.05074503788000751,
+     "time_s": 95.0573191291187
+   },
+   {
+     "hp": {
+       "lr": 1e-05,
+       "beta_kl": 0.1,
+       "weight_decay": 0.01,
+       "warmup_ratio": 0.1
+     },
+     "val_loss": 0.0569811669310373,
+     "time_s": 107.19165365281515
+   },
+   {
+     "hp": {
+       "lr": 2e-05,
+       "beta_kl": 0.05,
+       "weight_decay": 0.01,
+       "warmup_ratio": 0.1
+     },
+     "val_loss": 0.04958628546137836,
+     "time_s": 106.77600225992501
+   },
+   {
+     "hp": {
+       "lr": 2e-05,
+       "beta_kl": 0.1,
+       "weight_decay": 0.01,
+       "warmup_ratio": 0.1
+     },
+     "val_loss": 0.05651537539578055,
+     "time_s": 145.05425760895014
+   },
+   {
+     "hp": {
+       "lr": 5e-05,
+       "beta_kl": 0.05,
+       "weight_decay": 0.01,
+       "warmup_ratio": 0.1
+     },
+     "val_loss": 0.04995061208804449,
+     "time_s": 89.70805354882032
+   },
+   {
+     "hp": {
+       "lr": 5e-05,
+       "beta_kl": 0.1,
+       "weight_decay": 0.01,
+       "warmup_ratio": 0.1
+     },
+     "val_loss": 0.05411159153170129,
+     "time_s": 125.99459161888808
+   }
+ ]
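The hyperparameters recorded in `router_config.json` (lr=2e-5, β_kl=0.05) correspond to the sweep's lowest validation loss (0.0496). A quick check, assuming the file has been downloaded locally:

```python
import json

with open("sweep_results.json") as f:
    runs = json.load(f)

# Lowest validation loss wins: {'lr': 2e-05, 'beta_kl': 0.05, ...}
best = min(runs, key=lambda r: r["val_loss"])
print(best["hp"], best["val_loss"])
```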
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "backend": "tokenizers",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "is_local": false,
+   "mask_token": "[MASK]",
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 8192,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "TokenizersBackend",
+   "unk_token": "[UNK]"
+ }