upload trained ModernBERT capability classifier

Browse files

Files changed (5) hide show

README.md +91 -0
config.json +101 -0
model.safetensors +3 -0
tokenizer.json +0 -0
tokenizer_config.json +17 -0

README.md ADDED Viewed

	@@ -0,0 +1,91 @@

+---
+base_model: answerdotai/ModernBERT-large
+library_name: transformers
+license: apache-2.0
+datasets:
+  - massaindustries/dataset-B-modernbert-train
+tags:
+  - text-classification
+  - multi-label
+  - modernbert
+  - capability-classifier
+  - routing
+---
+# ModernBERT capability classifier (6 dimensions)
+Fine-tuned on [`massaindustries/dataset-B-modernbert-train`](https://huggingface.co/datasets/massaindustries/dataset-B-modernbert-train).
+Outputs one logit per capability dimension; applying a sigmoid (as in the inference example below) yields independent scores in [0, 1] for the 6 dimensions:
+1. `instruction_following`
+2. `coding`
+3. `math_reasoning`
+4. `world_knowledge`
+5. `planning_agentic`
+6. `creative_synthesis`
+Designed for downstream routing in the Brick semantic router as a drop-in replacement for the domain classifier.
+## Training
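+- Architecture: ModernBERT encoder + mean pooling + classification head ending in Linear(hidden→6); the model returns logits, and sigmoid is applied at inference time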
+- Loss: BCEWithLogitsLoss on soft float labels (judge mean)
+- Precision: bf16 + FlashAttention-2
+- HF problem_type: `multi_label_classification`
+## Inference example
+```python
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import torch
+m = AutoModelForSequenceClassification.from_pretrained('massaindustries/modernbert-capability-classifier')
+t = AutoTokenizer.from_pretrained('massaindustries/modernbert-capability-classifier')
+inp = t('write a python sort function', return_tensors='pt')
+scores = torch.sigmoid(m(**inp).logits)[0]
+for i, d in enumerate(m.config.id2label.values()):
+    print(f'{d}: {scores[i].item():.3f}')
+```
+## Evaluation (`human_eval` split, 200 Claude-annotated examples)
+```json
+{
+  "eval_loss": 0.42123839259147644,
+  "eval_model_preparation_time": 0.0022,
+  "eval_mae_instruction_following": 0.24792593717575073,
+  "eval_rmse_instruction_following": 0.30881765484809875,
+  "eval_brier_instruction_following": 0.09536834806203842,
+  "eval_pearson_instruction_following": 0.8270609378814697,
+  "eval_spearman_instruction_following": 0.8144904545331433,
+  "eval_mae_coding": 0.07370934635400772,
+  "eval_rmse_coding": 0.18934082984924316,
+  "eval_brier_coding": 0.03584995120763779,
+  "eval_pearson_coding": 0.9140766263008118,
+  "eval_spearman_coding": 0.8615511297152596,
+  "eval_mae_math_reasoning": 0.10867060720920563,
+  "eval_rmse_math_reasoning": 0.1694405972957611,
+  "eval_brier_math_reasoning": 0.02871011756360531,
+  "eval_pearson_math_reasoning": 0.9191069602966309,
+  "eval_spearman_math_reasoning": 0.8252107128077218,
+  "eval_mae_world_knowledge": 0.13477517664432526,
+  "eval_rmse_world_knowledge": 0.1875971555709839,
+  "eval_brier_world_knowledge": 0.03519269451498985,
+  "eval_pearson_world_knowledge": 0.8357715606689453,
+  "eval_spearman_world_knowledge": 0.8138721105892404,
+  "eval_mae_planning_agentic": 0.19774200022220612,
+  "eval_rmse_planning_agentic": 0.2537391781806946,
+  "eval_brier_planning_agentic": 0.06438356637954712,
+  "eval_pearson_planning_agentic": 0.8233083486557007,
+  "eval_spearman_planning_agentic": 0.7674644757779185,
+  "eval_mae_creative_synthesis": 0.08937528729438782,
+  "eval_rmse_creative_synthesis": 0.16472801566123962,
+  "eval_brier_creative_synthesis": 0.027135320007801056,
+  "eval_pearson_creative_synthesis": 0.9154033660888672,
+  "eval_spearman_creative_synthesis": 0.8138763391203128,
+  "eval_pearson_macro": 0.8724546333154043,
+  "eval_mae_macro": 0.14203305914998055,
+  "eval_spearman_macro": 0.8160775370905994,
+  "eval_f1_macro_t3": 0.8775192561604114,
+  "eval_f1_macro_t5": 0.8368971405647821,
+  "eval_f1_macro_t7": 0.8287502804667367,
+  "eval_runtime": 1.384,
+  "eval_samples_per_second": 144.51
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,101 @@

+{
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "dtype": "bfloat16",
+  "embedding_dropout": 0.0,
+  "eos_token_id": null,
+  "global_attn_every_n_layers": 3,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "instruction_following",
+    "1": "coding",
+    "2": "math_reasoning",
+    "3": "world_knowledge",
+    "4": "planning_agentic",
+    "5": "creative_synthesis"
+  },
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 2624,
+  "label2id": {
+    "instruction_following": 0,
+    "coding": 1,
+    "math_reasoning": 2,
+    "world_knowledge": 3,
+    "planning_agentic": 4,
+    "creative_synthesis": 5
+  },
+  "layer_norm_eps": 1e-05,
+  "layer_types": [
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "local_attention": 128,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "multi_label_classification",
+  "rope_parameters": {
+    "full_attention": {
+      "rope_theta": 160000.0,
+      "rope_type": "default"
+    },
+    "sliding_attention": {
+      "rope_theta": 10000.0,
+      "rope_type": "default"
+    }
+  },
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.7.0",
+  "vocab_size": 50368,
+  "num_labels": 6
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d94439d1ee1ad4370c1dc3ba905b5ef71720e1ab85d5e9ee335db3ffa44cced
+size 791693180

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "backend": "tokenizers",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "is_local": true,
+  "local_files_only": false,
+  "mask_token": "[MASK]",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 8192,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "[UNK]"
+}