Upload folder using huggingface_hub
Browse files- README.md +4 -5
- config.json +11 -1
- configuration_cloverlm.py +40 -1
- lm_eval/test_eval.log +94 -0
- lm_eval/test_eval2.log +130 -0
- vllm_plugin/SERVING.md +116 -0
- vllm_plugin/__init__.py +0 -0
- vllm_plugin/cloverlm_vllm.py +370 -0
- vllm_plugin/quartet2_quant.py +135 -0
- vllm_plugin/serve.py +109 -0
README.md
CHANGED
|
@@ -122,7 +122,7 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
| 122 |
"daslab-testing/CloverLM",
|
| 123 |
trust_remote_code=True,
|
| 124 |
dtype="bfloat16",
|
| 125 |
-
quartet_2_impl="
|
| 126 |
).to("cuda") # for GPU usage or "cpu" for CPU usage
|
| 127 |
|
| 128 |
tokenizer = AutoTokenizer.from_pretrained(
|
|
@@ -134,7 +134,6 @@ input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
|
|
| 134 |
output = model.generate(input_ids.to(model.device), max_new_tokens=32)
|
| 135 |
print(tokenizer.decode(output[0]))
|
| 136 |
```
|
| 137 |
-
Note that `quartet_2_impl="quartet2"` only supports inputs with `(micro_batch_size * seq_length) % 128 == 0`.
|
| 138 |
|
| 139 |
### Running Evaluations
|
| 140 |
|
|
@@ -165,7 +164,7 @@ Attention backend options: `pytorch` (default), `flash2`, `flash3`, `flash4`.
|
|
| 165 |
- PyTorch 2.10+ with CUDA 13.0
|
| 166 |
- `transformers ≥ 5.3.0`
|
| 167 |
- `tokenmonster ≥ 1.1.12`
|
| 168 |
-
- [Quartet II kernels](https://github.com/IST-DASLab/Quartet-II)
|
| 169 |
|
| 170 |
## Architecture Details
|
| 171 |
|
|
@@ -191,8 +190,8 @@ The model uses 264 weight tensors totaling ~4.14 B parameters.
|
|
| 191 |
@article{cloverlm2026,
|
| 192 |
title = {Speedrunning GPT3: Pretraining an OPT-175B-Quality Model Cheaply
|
| 193 |
by Leveraging Native NVFP4},
|
| 194 |
-
author = {Erik Schultheis and
|
| 195 |
-
|
| 196 |
year = {2026},
|
| 197 |
}
|
| 198 |
```
|
|
|
|
| 122 |
"daslab-testing/CloverLM",
|
| 123 |
trust_remote_code=True,
|
| 124 |
dtype="bfloat16",
|
| 125 |
+
quartet_2_impl="quartet2", # native NVFP4 kernel or "pseudoquant" on non-Blackwell GPUs
|
| 126 |
).to("cuda") # for GPU usage or "cpu" for CPU usage
|
| 127 |
|
| 128 |
tokenizer = AutoTokenizer.from_pretrained(
|
|
|
|
| 134 |
output = model.generate(input_ids.to(model.device), max_new_tokens=32)
|
| 135 |
print(tokenizer.decode(output[0]))
|
| 136 |
```
|
|
|
|
| 137 |
|
| 138 |
### Running Evaluations
|
| 139 |
|
|
|
|
| 164 |
- PyTorch 2.10+ with CUDA 13.0
|
| 165 |
- `transformers ≥ 5.3.0`
|
| 166 |
- `tokenmonster ≥ 1.1.12`
|
| 167 |
+
- [Quartet II kernels](https://github.com/IST-DASLab/Quartet-II) (for native FP4; `pseudoquant` mode works without them)
|
| 168 |
|
| 169 |
## Architecture Details
|
| 170 |
|
|
|
|
| 190 |
@article{cloverlm2026,
|
| 191 |
title = {Speedrunning GPT3: Pretraining an OPT-175B-Quality Model Cheaply
|
| 192 |
by Leveraging Native NVFP4},
|
| 193 |
+
author = {Erik Schultheis and Matin Ansaripour and Andrei Panferov and
|
| 194 |
+
Georgios Vlassis and Dan Alistarh},
|
| 195 |
year = {2026},
|
| 196 |
}
|
| 197 |
```
|
config.json
CHANGED
|
@@ -13,14 +13,24 @@
|
|
| 13 |
},
|
| 14 |
"d_head": 128,
|
| 15 |
"heads": 28,
|
|
|
|
|
|
|
| 16 |
"max_context": 1024,
|
|
|
|
| 17 |
"model_type": "cloverlm",
|
|
|
|
| 18 |
"num_blocks": 29,
|
| 19 |
"num_hidden_layers": 29,
|
|
|
|
| 20 |
"quartet_2_impl": "pseudoquant",
|
| 21 |
"ratio": 4,
|
| 22 |
"scale_type": "1/sqrt(d)",
|
|
|
|
|
|
|
| 23 |
"transformers_version": "5.3.0",
|
| 24 |
"vocab_size": 32000,
|
| 25 |
-
"weight_tying": true
|
|
|
|
|
|
|
|
|
|
| 26 |
}
|
|
|
|
| 13 |
},
|
| 14 |
"d_head": 128,
|
| 15 |
"heads": 28,
|
| 16 |
+
"hidden_size": 3584,
|
| 17 |
+
"intermediate_size": 14336,
|
| 18 |
"max_context": 1024,
|
| 19 |
+
"max_position_embeddings": 1024,
|
| 20 |
"model_type": "cloverlm",
|
| 21 |
+
"num_attention_heads": 28,
|
| 22 |
"num_blocks": 29,
|
| 23 |
"num_hidden_layers": 29,
|
| 24 |
+
"num_key_value_heads": 7,
|
| 25 |
"quartet_2_impl": "pseudoquant",
|
| 26 |
"ratio": 4,
|
| 27 |
"scale_type": "1/sqrt(d)",
|
| 28 |
+
"head_dim": 128,
|
| 29 |
+
"tie_word_embeddings": true,
|
| 30 |
"transformers_version": "5.3.0",
|
| 31 |
"vocab_size": 32000,
|
| 32 |
+
"weight_tying": true,
|
| 33 |
+
"quantization_config": {
|
| 34 |
+
"quant_method": "quartet2"
|
| 35 |
+
}
|
| 36 |
}
|
configuration_cloverlm.py
CHANGED
|
@@ -16,6 +16,14 @@ class CloverLMConfig(PretrainedConfig):
|
|
| 16 |
quartet_2_impl="pseudoquant",
|
| 17 |
weight_tying=True,
|
| 18 |
attn_backend="pytorch",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
**kwargs,
|
| 20 |
):
|
| 21 |
self.num_blocks = num_blocks
|
|
@@ -28,4 +36,35 @@ class CloverLMConfig(PretrainedConfig):
|
|
| 28 |
self.quartet_2_impl = quartet_2_impl
|
| 29 |
self.weight_tying = weight_tying
|
| 30 |
self.attn_backend = attn_backend
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
quartet_2_impl="pseudoquant",
|
| 17 |
weight_tying=True,
|
| 18 |
attn_backend="pytorch",
|
| 19 |
+
# Optional: HuggingFace / vLLM tooling (defaults derived from shape)
|
| 20 |
+
hidden_size=None,
|
| 21 |
+
intermediate_size=None,
|
| 22 |
+
max_position_embeddings=None,
|
| 23 |
+
num_attention_heads=None,
|
| 24 |
+
num_key_value_heads=None,
|
| 25 |
+
head_dim=None,
|
| 26 |
+
quantization_config=None,
|
| 27 |
**kwargs,
|
| 28 |
):
|
| 29 |
self.num_blocks = num_blocks
|
|
|
|
| 36 |
self.quartet_2_impl = quartet_2_impl
|
| 37 |
self.weight_tying = weight_tying
|
| 38 |
self.attn_backend = attn_backend
|
| 39 |
+
|
| 40 |
+
d_model = heads * d_head
|
| 41 |
+
self.hidden_size = hidden_size if hidden_size is not None else d_model
|
| 42 |
+
self.intermediate_size = (
|
| 43 |
+
intermediate_size if intermediate_size is not None else 4 * d_model
|
| 44 |
+
)
|
| 45 |
+
self.max_position_embeddings = (
|
| 46 |
+
max_position_embeddings
|
| 47 |
+
if max_position_embeddings is not None
|
| 48 |
+
else max_context
|
| 49 |
+
)
|
| 50 |
+
self.num_attention_heads = (
|
| 51 |
+
num_attention_heads if num_attention_heads is not None else heads
|
| 52 |
+
)
|
| 53 |
+
self.num_key_value_heads = (
|
| 54 |
+
num_key_value_heads
|
| 55 |
+
if num_key_value_heads is not None
|
| 56 |
+
else heads // ratio
|
| 57 |
+
)
|
| 58 |
+
self.head_dim = head_dim if head_dim is not None else d_head
|
| 59 |
+
self.quantization_config = (
|
| 60 |
+
quantization_config
|
| 61 |
+
if quantization_config is not None
|
| 62 |
+
else {"quant_method": "quartet2"}
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
kwargs.pop("tie_word_embeddings", None)
|
| 66 |
+
super().__init__(
|
| 67 |
+
vocab_size=vocab_size,
|
| 68 |
+
tie_word_embeddings=weight_tying,
|
| 69 |
+
**kwargs,
|
| 70 |
+
)
|
lm_eval/test_eval.log
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 0 |
0%| | 0/919 [00:00<?, ?it/s]
|
| 1 |
10%|▉ | 91/919 [00:00<00:00, 900.62it/s]2026-03-19:14:15:45 INFO [tasks:700] Selected tasks:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
20%|█▉ | 182/919 [00:00<00:00, 905.61it/s]
|
| 3 |
0%| | 0/919 [00:00<?, ?it/s]
|
| 4 |
30%|██▉ | 274/919 [00:00<00:00, 910.65it/s]
|
| 5 |
10%|▉ | 91/919 [00:00<00:00, 907.36it/s]
|
| 6 |
40%|███▉ | 366/919 [00:00<00:00, 911.36it/s]
|
| 7 |
20%|█▉ | 183/919 [00:00<00:00, 911.26it/s]
|
| 8 |
50%|████▉ | 458/919 [00:00<00:00, 911.45it/s]
|
| 9 |
30%|██▉ | 275/919 [00:00<00:00, 915.08it/s]
|
| 10 |
60%|█████▉ | 550/919 [00:00<00:00, 911.49it/s]
|
| 11 |
40%|███▉ | 367/919 [00:00<00:00, 915.11it/s]
|
| 12 |
70%|██████▉ | 642/919 [00:00<00:00, 912.08it/s]
|
| 13 |
50%|████▉ | 459/919 [00:00<00:00, 916.54it/s]
|
| 14 |
80%|███████▉ | 734/919 [00:00<00:00, 911.69it/s]
|
| 15 |
60%|█████▉ | 551/919 [00:00<00:00, 914.63it/s]
|
| 16 |
90%|████████▉ | 826/919 [00:00<00:00, 912.45it/s]
|
| 17 |
70%|██████▉ | 643/919 [00:00<00:00, 914.29it/s]
|
|
|
|
| 18 |
80%|███████▉ | 735/919 [00:00<00:00, 912.73it/s]
|
| 19 |
90%|████████▉ | 827/919 [00:00<00:00, 913.03it/s]
|
|
|
|
|
|
|
|
|
|
| 20 |
0%| | 0/5021 [00:00<?, ?it/s]
|
| 21 |
6%|▌ | 278/5021 [00:00<00:01, 2773.97it/s]
|
| 22 |
11%|█ | 564/5021 [00:00<00:01, 2824.23it/s]
|
| 23 |
0%| | 0/5021 [00:00<?, ?it/s]
|
| 24 |
17%|█▋ | 852/5021 [00:00<00:01, 2845.58it/s]
|
| 25 |
4%|▎ | 178/5021 [00:00<00:02, 1779.16it/s]
|
| 26 |
23%|██▎ | 1140/5021 [00:00<00:01, 2855.94it/s]
|
| 27 |
7%|▋ | 362/5021 [00:00<00:02, 1813.26it/s]
|
| 28 |
28%|██▊ | 1427/5021 [00:00<00:01, 2860.60it/s]
|
| 29 |
11%|█ | 547/5021 [00:00<00:02, 1826.42it/s]
|
| 30 |
34%|███▍ | 1714/5021 [00:00<00:01, 2863.10it/s]
|
| 31 |
15%|█▍ | 730/5021 [00:00<00:02, 1826.29it/s]
|
| 32 |
40%|███▉ | 2001/5021 [00:00<00:01, 2863.11it/s]
|
| 33 |
18%|█▊ | 913/5021 [00:00<00:02, 1824.48it/s]
|
| 34 |
46%|████▌ | 2289/5021 [00:00<00:00, 2868.32it/s]
|
| 35 |
22%|██▏ | 1096/5021 [00:00<00:02, 1819.99it/s]
|
| 36 |
51%|█████▏ | 2577/5021 [00:00<00:00, 2871.27it/s]
|
| 37 |
25%|██▌ | 1279/5021 [00:00<00:02, 1815.56it/s]
|
| 38 |
29%|██▉ | 1461/5021 [00:00<00:01, 1802.42it/s]
|
| 39 |
57%|█████▋ | 2865/5021 [00:01<00:01, 1693.34it/s]
|
| 40 |
63%|██████▎ | 3149/5021 [00:01<00:00, 1928.96it/s]
|
| 41 |
68%|██████▊ | 3433/5021 [00:01<00:00, 2135.30it/s]
|
| 42 |
33%|███▎ | 1642/5021 [00:01<00:03, 1001.41it/s]
|
| 43 |
74%|███████▍ | 3718/5021 [00:01<00:00, 2308.62it/s]
|
| 44 |
36%|███▋ | 1823/5021 [00:01<00:02, 1159.85it/s]
|
| 45 |
80%|███████▉ | 4004/5021 [00:01<00:00, 2449.72it/s]
|
| 46 |
40%|███▉ | 2002/5021 [00:01<00:02, 1297.94it/s]
|
| 47 |
85%|████████▌ | 4290/5021 [00:01<00:00, 2558.71it/s]
|
| 48 |
43%|████▎ | 2183/5021 [00:01<00:02, 1418.79it/s]
|
| 49 |
91%|█████████ | 4577/5021 [00:01<00:00, 2643.21it/s]
|
| 50 |
47%|████▋ | 2364/5021 [00:01<00:01, 1516.12it/s]
|
| 51 |
97%|█████████▋| 4864/5021 [00:01<00:00, 2705.33it/s]
|
| 52 |
51%|█████ | 2544/5021 [00:01<00:01, 1591.38it/s]
|
|
|
|
| 53 |
54%|█████▍ | 2725/5021 [00:01<00:01, 1650.39it/s]
|
| 54 |
58%|█████▊ | 2905/5021 [00:01<00:01, 1692.57it/s]
|
| 55 |
61%|██████▏ | 3085/5021 [00:01<00:01, 1722.37it/s]
|
| 56 |
65%|██████▌ | 3266/5021 [00:02<00:01, 1745.32it/s]
|
| 57 |
69%|██████▊ | 3447/5021 [00:02<00:00, 1762.66it/s]
|
| 58 |
72%|███████▏ | 3627/5021 [00:02<00:00, 1772.66it/s]
|
| 59 |
76%|███████▌ | 3807/5021 [00:02<00:00, 1779.65it/s]
|
| 60 |
79%|███████▉ | 3987/5021 [00:02<00:00, 1781.31it/s]
|
| 61 |
83%|████████▎ | 4168/5021 [00:02<00:00, 1787.31it/s]
|
| 62 |
87%|████████▋ | 4349/5021 [00:02<00:00, 1792.50it/s]
|
| 63 |
90%|█████████ | 4529/5021 [00:02<00:00, 1790.62it/s]
|
| 64 |
94%|█████████▍| 4709/5021 [00:02<00:00, 1790.61it/s]
|
| 65 |
97%|█████████▋| 4890/5021 [00:02<00:00, 1793.65it/s]
|
|
|
|
|
|
|
|
|
|
| 66 |
0%| | 0/586 [00:00<?, ?it/s]
|
| 67 |
0%| | 0/586 [00:00<?, ?it/s]
|
| 68 |
17%|█▋ | 98/586 [00:00<00:00, 971.82it/s]
|
| 69 |
11%|█ | 62/586 [00:00<00:00, 613.37it/s]
|
| 70 |
34%|███▎ | 197/586 [00:00<00:00, 976.00it/s]
|
| 71 |
21%|██▏ | 125/586 [00:00<00:00, 619.71it/s]
|
| 72 |
51%|█████ | 297/586 [00:00<00:00, 985.47it/s]
|
| 73 |
32%|███▏ | 189/586 [00:00<00:00, 624.63it/s]
|
| 74 |
68%|██████▊ | 396/586 [00:00<00:00, 986.66it/s]
|
| 75 |
43%|████▎ | 253/586 [00:00<00:00, 627.18it/s]
|
| 76 |
85%|████████▍ | 496/586 [00:00<00:00, 989.81it/s]
|
| 77 |
54%|█████▍ | 317/586 [00:00<00:00, 628.58it/s]
|
|
|
|
| 78 |
65%|██████▌ | 381/586 [00:00<00:00, 629.34it/s]
|
| 79 |
76%|███████▌ | 445/586 [00:00<00:00, 630.51it/s]
|
| 80 |
87%|████████▋ | 509/586 [00:00<00:00, 632.25it/s]
|
| 81 |
98%|█████████▊| 573/586 [00:00<00:00, 632.82it/s]
|
|
|
|
|
|
|
|
|
|
| 82 |
0%| | 0/1188 [00:00<?, ?it/s]
|
| 83 |
0%| | 0/1188 [00:00<?, ?it/s]
|
| 84 |
8%|▊ | 100/1188 [00:00<00:01, 993.05it/s]
|
| 85 |
5%|▌ | 63/1188 [00:00<00:01, 626.46it/s]
|
| 86 |
17%|█▋ | 200/1188 [00:00<00:00, 991.06it/s]
|
| 87 |
11%|█ | 127/1188 [00:00<00:01, 629.71it/s]
|
| 88 |
25%|██▌ | 300/1188 [00:00<00:00, 994.71it/s]
|
| 89 |
16%|█▌ | 191/1188 [00:00<00:01, 630.93it/s]
|
| 90 |
34%|███▎ | 400/1188 [00:00<00:00, 993.35it/s]
|
| 91 |
21%|██▏ | 255/1188 [00:00<00:01, 633.00it/s]
|
| 92 |
42%|████▏ | 500/1188 [00:00<00:00, 993.59it/s]
|
| 93 |
27%|██▋ | 319/1188 [00:00<00:01, 634.81it/s]
|
| 94 |
51%|█████ | 600/1188 [00:00<00:00, 993.86it/s]
|
| 95 |
32%|███▏ | 383/1188 [00:00<00:01, 636.25it/s]
|
| 96 |
59%|█████▉ | 700/1188 [00:00<00:00, 992.08it/s]
|
| 97 |
38%|███▊ | 447/1188 [00:00<00:01, 636.11it/s]
|
| 98 |
67%|██████▋ | 800/1188 [00:00<00:00, 988.12it/s]
|
| 99 |
43%|████▎ | 511/1188 [00:00<00:01, 634.75it/s]
|
| 100 |
76%|███████▌ | 899/1188 [00:00<00:00, 988.66it/s]
|
| 101 |
48%|████▊ | 575/1188 [00:00<00:00, 634.67it/s]
|
| 102 |
84%|████████▍ | 999/1188 [00:01<00:00, 991.15it/s]
|
| 103 |
54%|█████▍ | 639/1188 [00:01<00:00, 633.38it/s]
|
| 104 |
93%|█████████▎| 1099/1188 [00:01<00:00, 993.44it/s]
|
| 105 |
59%|█████▉ | 703/1188 [00:01<00:00, 633.82it/s]
|
|
|
|
| 106 |
65%|██████▍ | 768/1188 [00:01<00:00, 635.72it/s]
|
| 107 |
70%|███████ | 832/1188 [00:01<00:00, 636.49it/s]
|
| 108 |
75%|███████▌ | 896/1188 [00:01<00:00, 632.00it/s]
|
| 109 |
81%|████████ | 960/1188 [00:01<00:00, 628.76it/s]
|
| 110 |
86%|████████▌ | 1023/1188 [00:01<00:00, 624.90it/s]
|
| 111 |
91%|█████████▏| 1086/1188 [00:01<00:00, 625.37it/s]
|
| 112 |
97%|█████████▋| 1149/1188 [00:01<00:00, 625.85it/s]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The following values were not passed to `accelerate launch` and had defaults used instead:
|
| 2 |
+
`--num_processes` was set to a value of `2`
|
| 3 |
+
More than one GPU was found, enabling multi-GPU training.
|
| 4 |
+
If this was unintended please pass in `--num_processes=1`.
|
| 5 |
+
`--num_machines` was set to a value of `1`
|
| 6 |
+
`--mixed_precision` was set to a value of `'no'`
|
| 7 |
+
`--dynamo_backend` was set to a value of `'no'`
|
| 8 |
+
To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
|
| 9 |
+
2026-03-19:14:15:25 INFO [_cli.run:375] Including path: ./
|
| 10 |
+
2026-03-19:14:15:25 INFO [_cli.run:376] Selected Tasks: ['arc_easy_mi', 'arc_challenge_mi', 'hellaswag', 'piqa']
|
| 11 |
+
2026-03-19:14:15:25 INFO [evaluator:211] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
|
| 12 |
+
2026-03-19:14:15:25 INFO [evaluator:236] Initializing cloverlm model, with arguments: {'pretrained': 'daslab-testing/CloverLM', 'dtype': 'bfloat16', 'quartet_2_impl': 'quartet2', 'attn_backend': 'pytorch', 'trust_remote_code': True}
|
| 13 |
+
2026-03-19:14:15:25 INFO [_cli.run:375] Including path: ./
|
| 14 |
+
2026-03-19:14:15:25 INFO [_cli.run:376] Selected Tasks: ['arc_easy_mi', 'arc_challenge_mi', 'hellaswag', 'piqa']
|
| 15 |
+
2026-03-19:14:15:25 INFO [evaluator:211] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
|
| 16 |
+
2026-03-19:14:15:25 INFO [evaluator:236] Initializing cloverlm model, with arguments: {'pretrained': 'daslab-testing/CloverLM', 'dtype': 'bfloat16', 'quartet_2_impl': 'quartet2', 'attn_backend': 'pytorch', 'trust_remote_code': True}
|
| 17 |
+
2026-03-19:14:15:26 INFO [models.huggingface:178] Using `accelerate launch` or `parallelize=True`, device 'cuda:0' will be overridden when placing model.
|
| 18 |
+
2026-03-19:14:15:26 INFO [models.huggingface:178] Using `accelerate launch` or `parallelize=True`, device 'cuda:0' will be overridden when placing model.
|
| 19 |
+
2026-03-19:14:15:26 INFO [models.huggingface:548] Model type cannot be determined. Using default model type 'causal'
|
| 20 |
+
2026-03-19:14:15:26 INFO [models.huggingface:548] Model type cannot be determined. Using default model type 'causal'
|
| 21 |
+
2026-03-19:14:15:28 INFO [models.huggingface:423] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:1'}
|
| 22 |
+
2026-03-19:14:15:28 INFO [models.huggingface:423] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
The tied weights mapping and config for this model specifies to tie transformer.emb.weight to transformer.linear.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning
|
| 26 |
+
The tied weights mapping and config for this model specifies to tie transformer.emb.weight to transformer.linear.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning
|
| 27 |
+
2026-03-19:14:15:45 INFO [tasks:700] Selected tasks:
|
| 28 |
+
2026-03-19:14:15:45 INFO [tasks:691] Task: piqa (.venv/lib/python3.11/site-packages/lm_eval/tasks/piqa/piqa.yaml)
|
| 29 |
+
2026-03-19:14:15:45 INFO [tasks:691] Task: hellaswag (.venv/lib/python3.11/site-packages/lm_eval/tasks/hellaswag/hellaswag.yaml)
|
| 30 |
+
2026-03-19:14:15:45 INFO [tasks:691] Task: arc_challenge_mi (arc_challenge.yaml)
|
| 31 |
+
2026-03-19:14:15:45 INFO [tasks:691] Task: arc_easy_mi (arc_easy_mi.yaml)
|
| 32 |
+
2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of piqa from None to 0
|
| 33 |
+
2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of hellaswag from None to 0
|
| 34 |
+
2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of arc_challenge_mi from None to 0
|
| 35 |
+
2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of arc_easy_mi from None to 0
|
| 36 |
+
2026-03-19:14:15:45 INFO [api.task:311] Building contexts for piqa on rank 0...
|
| 37 |
+
|
| 38 |
0%| | 0/919 [00:00<?, ?it/s]
|
| 39 |
10%|▉ | 91/919 [00:00<00:00, 900.62it/s]2026-03-19:14:15:45 INFO [tasks:700] Selected tasks:
|
| 40 |
+
2026-03-19:14:15:45 INFO [tasks:691] Task: piqa (.venv/lib/python3.11/site-packages/lm_eval/tasks/piqa/piqa.yaml)
|
| 41 |
+
2026-03-19:14:15:45 INFO [tasks:691] Task: hellaswag (.venv/lib/python3.11/site-packages/lm_eval/tasks/hellaswag/hellaswag.yaml)
|
| 42 |
+
2026-03-19:14:15:45 INFO [tasks:691] Task: arc_challenge_mi (arc_challenge.yaml)
|
| 43 |
+
2026-03-19:14:15:45 INFO [tasks:691] Task: arc_easy_mi (arc_easy_mi.yaml)
|
| 44 |
+
2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of piqa from None to 0
|
| 45 |
+
2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of hellaswag from None to 0
|
| 46 |
+
2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of arc_challenge_mi from None to 0
|
| 47 |
+
2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of arc_easy_mi from None to 0
|
| 48 |
+
2026-03-19:14:15:45 INFO [api.task:311] Building contexts for piqa on rank 1...
|
| 49 |
+
|
| 50 |
20%|█▉ | 182/919 [00:00<00:00, 905.61it/s]
|
| 51 |
0%| | 0/919 [00:00<?, ?it/s]
|
| 52 |
30%|██▉ | 274/919 [00:00<00:00, 910.65it/s]
|
| 53 |
10%|▉ | 91/919 [00:00<00:00, 907.36it/s]
|
| 54 |
40%|███▉ | 366/919 [00:00<00:00, 911.36it/s]
|
| 55 |
20%|█▉ | 183/919 [00:00<00:00, 911.26it/s]
|
| 56 |
50%|████▉ | 458/919 [00:00<00:00, 911.45it/s]
|
| 57 |
30%|██▉ | 275/919 [00:00<00:00, 915.08it/s]
|
| 58 |
60%|█████▉ | 550/919 [00:00<00:00, 911.49it/s]
|
| 59 |
40%|███▉ | 367/919 [00:00<00:00, 915.11it/s]
|
| 60 |
70%|██████▉ | 642/919 [00:00<00:00, 912.08it/s]
|
| 61 |
50%|████▉ | 459/919 [00:00<00:00, 916.54it/s]
|
| 62 |
80%|███████▉ | 734/919 [00:00<00:00, 911.69it/s]
|
| 63 |
60%|█████▉ | 551/919 [00:00<00:00, 914.63it/s]
|
| 64 |
90%|████████▉ | 826/919 [00:00<00:00, 912.45it/s]
|
| 65 |
70%|██████▉ | 643/919 [00:00<00:00, 914.29it/s]
|
| 66 |
+
|
| 67 |
80%|███████▉ | 735/919 [00:00<00:00, 912.73it/s]
|
| 68 |
90%|████████▉ | 827/919 [00:00<00:00, 913.03it/s]
|
| 69 |
+
2026-03-19:14:15:48 INFO [api.task:311] Building contexts for hellaswag on rank 0...
|
| 70 |
+
2026-03-19:14:15:48 INFO [api.task:311] Building contexts for hellaswag on rank 1...
|
| 71 |
+
|
| 72 |
0%| | 0/5021 [00:00<?, ?it/s]
|
| 73 |
6%|▌ | 278/5021 [00:00<00:01, 2773.97it/s]
|
| 74 |
11%|█ | 564/5021 [00:00<00:01, 2824.23it/s]
|
| 75 |
0%| | 0/5021 [00:00<?, ?it/s]
|
| 76 |
17%|█▋ | 852/5021 [00:00<00:01, 2845.58it/s]
|
| 77 |
4%|▎ | 178/5021 [00:00<00:02, 1779.16it/s]
|
| 78 |
23%|██▎ | 1140/5021 [00:00<00:01, 2855.94it/s]
|
| 79 |
7%|▋ | 362/5021 [00:00<00:02, 1813.26it/s]
|
| 80 |
28%|██▊ | 1427/5021 [00:00<00:01, 2860.60it/s]
|
| 81 |
11%|█ | 547/5021 [00:00<00:02, 1826.42it/s]
|
| 82 |
34%|███▍ | 1714/5021 [00:00<00:01, 2863.10it/s]
|
| 83 |
15%|█▍ | 730/5021 [00:00<00:02, 1826.29it/s]
|
| 84 |
40%|███▉ | 2001/5021 [00:00<00:01, 2863.11it/s]
|
| 85 |
18%|█▊ | 913/5021 [00:00<00:02, 1824.48it/s]
|
| 86 |
46%|████▌ | 2289/5021 [00:00<00:00, 2868.32it/s]
|
| 87 |
22%|██▏ | 1096/5021 [00:00<00:02, 1819.99it/s]
|
| 88 |
51%|█████▏ | 2577/5021 [00:00<00:00, 2871.27it/s]
|
| 89 |
25%|██▌ | 1279/5021 [00:00<00:02, 1815.56it/s]
|
| 90 |
29%|██▉ | 1461/5021 [00:00<00:01, 1802.42it/s]
|
| 91 |
57%|█████▋ | 2865/5021 [00:01<00:01, 1693.34it/s]
|
| 92 |
63%|██████▎ | 3149/5021 [00:01<00:00, 1928.96it/s]
|
| 93 |
68%|██████▊ | 3433/5021 [00:01<00:00, 2135.30it/s]
|
| 94 |
33%|███▎ | 1642/5021 [00:01<00:03, 1001.41it/s]
|
| 95 |
74%|███████▍ | 3718/5021 [00:01<00:00, 2308.62it/s]
|
| 96 |
36%|███▋ | 1823/5021 [00:01<00:02, 1159.85it/s]
|
| 97 |
80%|███████▉ | 4004/5021 [00:01<00:00, 2449.72it/s]
|
| 98 |
40%|███▉ | 2002/5021 [00:01<00:02, 1297.94it/s]
|
| 99 |
85%|████████▌ | 4290/5021 [00:01<00:00, 2558.71it/s]
|
| 100 |
43%|████▎ | 2183/5021 [00:01<00:02, 1418.79it/s]
|
| 101 |
91%|█████████ | 4577/5021 [00:01<00:00, 2643.21it/s]
|
| 102 |
47%|████▋ | 2364/5021 [00:01<00:01, 1516.12it/s]
|
| 103 |
97%|█████████▋| 4864/5021 [00:01<00:00, 2705.33it/s]
|
| 104 |
51%|█████ | 2544/5021 [00:01<00:01, 1591.38it/s]
|
| 105 |
+
|
| 106 |
54%|█████▍ | 2725/5021 [00:01<00:01, 1650.39it/s]
|
| 107 |
58%|█████▊ | 2905/5021 [00:01<00:01, 1692.57it/s]
|
| 108 |
61%|██████▏ | 3085/5021 [00:01<00:01, 1722.37it/s]
|
| 109 |
65%|██████▌ | 3266/5021 [00:02<00:01, 1745.32it/s]
|
| 110 |
69%|██████▊ | 3447/5021 [00:02<00:00, 1762.66it/s]
|
| 111 |
72%|███████▏ | 3627/5021 [00:02<00:00, 1772.66it/s]
|
| 112 |
76%|███████▌ | 3807/5021 [00:02<00:00, 1779.65it/s]
|
| 113 |
79%|███████▉ | 3987/5021 [00:02<00:00, 1781.31it/s]
|
| 114 |
83%|████████▎ | 4168/5021 [00:02<00:00, 1787.31it/s]
|
| 115 |
87%|████████▋ | 4349/5021 [00:02<00:00, 1792.50it/s]
|
| 116 |
90%|█████████ | 4529/5021 [00:02<00:00, 1790.62it/s]
|
| 117 |
94%|█████████▍| 4709/5021 [00:02<00:00, 1790.61it/s]
|
| 118 |
97%|█████████▋| 4890/5021 [00:02<00:00, 1793.65it/s]
|
| 119 |
+
2026-03-19:14:15:52 INFO [api.task:311] Building contexts for arc_challenge_mi on rank 1...
|
| 120 |
+
2026-03-19:14:15:52 INFO [api.task:311] Building contexts for arc_challenge_mi on rank 0...
|
| 121 |
+
|
| 122 |
0%| | 0/586 [00:00<?, ?it/s]
|
| 123 |
0%| | 0/586 [00:00<?, ?it/s]
|
| 124 |
17%|█▋ | 98/586 [00:00<00:00, 971.82it/s]
|
| 125 |
11%|█ | 62/586 [00:00<00:00, 613.37it/s]
|
| 126 |
34%|███▎ | 197/586 [00:00<00:00, 976.00it/s]
|
| 127 |
21%|██▏ | 125/586 [00:00<00:00, 619.71it/s]
|
| 128 |
51%|█████ | 297/586 [00:00<00:00, 985.47it/s]
|
| 129 |
32%|███▏ | 189/586 [00:00<00:00, 624.63it/s]
|
| 130 |
68%|██████▊ | 396/586 [00:00<00:00, 986.66it/s]
|
| 131 |
43%|████▎ | 253/586 [00:00<00:00, 627.18it/s]
|
| 132 |
85%|████████▍ | 496/586 [00:00<00:00, 989.81it/s]
|
| 133 |
54%|█████▍ | 317/586 [00:00<00:00, 628.58it/s]
|
| 134 |
+
|
| 135 |
65%|██████▌ | 381/586 [00:00<00:00, 629.34it/s]
|
| 136 |
76%|███████▌ | 445/586 [00:00<00:00, 630.51it/s]
|
| 137 |
87%|████████▋ | 509/586 [00:00<00:00, 632.25it/s]
|
| 138 |
98%|█████████▊| 573/586 [00:00<00:00, 632.82it/s]
|
| 139 |
+
2026-03-19:14:15:53 INFO [api.task:311] Building contexts for arc_easy_mi on rank 0...
|
| 140 |
+
2026-03-19:14:15:53 INFO [api.task:311] Building contexts for arc_easy_mi on rank 1...
|
| 141 |
+
|
| 142 |
0%| | 0/1188 [00:00<?, ?it/s]
|
| 143 |
0%| | 0/1188 [00:00<?, ?it/s]
|
| 144 |
8%|▊ | 100/1188 [00:00<00:01, 993.05it/s]
|
| 145 |
5%|▌ | 63/1188 [00:00<00:01, 626.46it/s]
|
| 146 |
17%|█▋ | 200/1188 [00:00<00:00, 991.06it/s]
|
| 147 |
11%|█ | 127/1188 [00:00<00:01, 629.71it/s]
|
| 148 |
25%|██▌ | 300/1188 [00:00<00:00, 994.71it/s]
|
| 149 |
16%|█▌ | 191/1188 [00:00<00:01, 630.93it/s]
|
| 150 |
34%|███▎ | 400/1188 [00:00<00:00, 993.35it/s]
|
| 151 |
21%|██▏ | 255/1188 [00:00<00:01, 633.00it/s]
|
| 152 |
42%|████▏ | 500/1188 [00:00<00:00, 993.59it/s]
|
| 153 |
27%|██▋ | 319/1188 [00:00<00:01, 634.81it/s]
|
| 154 |
51%|█████ | 600/1188 [00:00<00:00, 993.86it/s]
|
| 155 |
32%|███▏ | 383/1188 [00:00<00:01, 636.25it/s]
|
| 156 |
59%|█████▉ | 700/1188 [00:00<00:00, 992.08it/s]
|
| 157 |
38%|███▊ | 447/1188 [00:00<00:01, 636.11it/s]
|
| 158 |
67%|██████▋ | 800/1188 [00:00<00:00, 988.12it/s]
|
| 159 |
43%|████▎ | 511/1188 [00:00<00:01, 634.75it/s]
|
| 160 |
76%|███████▌ | 899/1188 [00:00<00:00, 988.66it/s]
|
| 161 |
48%|████▊ | 575/1188 [00:00<00:00, 634.67it/s]
|
| 162 |
84%|████████▍ | 999/1188 [00:01<00:00, 991.15it/s]
|
| 163 |
54%|█████▍ | 639/1188 [00:01<00:00, 633.38it/s]
|
| 164 |
93%|█████████▎| 1099/1188 [00:01<00:00, 993.44it/s]
|
| 165 |
59%|█████▉ | 703/1188 [00:01<00:00, 633.82it/s]
|
| 166 |
+
|
| 167 |
65%|██████▍ | 768/1188 [00:01<00:00, 635.72it/s]
|
| 168 |
70%|███████ | 832/1188 [00:01<00:00, 636.49it/s]
|
| 169 |
75%|███████▌ | 896/1188 [00:01<00:00, 632.00it/s]
|
| 170 |
81%|████████ | 960/1188 [00:01<00:00, 628.76it/s]
|
| 171 |
86%|████████▌ | 1023/1188 [00:01<00:00, 624.90it/s]
|
| 172 |
91%|█████████▏| 1086/1188 [00:01<00:00, 625.37it/s]
|
| 173 |
97%|█████████▋| 1149/1188 [00:01<00:00, 625.85it/s]
|
| 174 |
+
2026-03-19:14:15:55 INFO [evaluator:584] Running loglikelihood requests
|
| 175 |
+
2026-03-19:14:15:55 INFO [evaluator:584] Running loglikelihood requests
|
| 176 |
+
|
| 177 |
+
Passed argument batch_size = auto:1. Detecting largest batch size
|
| 178 |
+
Determined largest batch size: 64
|
| 179 |
+
Determined largest batch size: 64
|
| 180 |
+
|
| 181 |
+
[rank1]:W0319 14:20:19.946000 1458598 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] function: 'abs_max' (/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/quartet2/linear.py:147)
|
| 182 |
+
[rank1]:W0319 14:20:19.946000 1458598 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] last reason: 0/5: tensor 'x' requires_grad mismatch. expected requires_grad=1
|
| 183 |
+
[rank1]:W0319 14:20:19.946000 1458598 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
|
| 184 |
+
[rank1]:W0319 14:20:19.946000 1458598 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/compile/programming_model.recompilation.html
|
| 185 |
+
|
| 186 |
+
[rank0]:W0319 14:20:22.844000 1458597 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] function: 'abs_max' (/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/quartet2/linear.py:147)
|
| 187 |
+
[rank0]:W0319 14:20:22.844000 1458597 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] last reason: 0/5: tensor 'x' requires_grad mismatch. expected requires_grad=1
|
| 188 |
+
[rank0]:W0319 14:20:22.844000 1458597 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
|
| 189 |
+
[rank0]:W0319 14:20:22.844000 1458597 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/compile/programming_model.recompilation.html
|
| 190 |
+
|
| 191 |
+
fatal: not a git repository (or any of the parent directories): .git
|
| 192 |
+
2026-03-19:14:20:29 INFO [loggers.evaluation_tracker:316] Output path not provided, skipping saving results aggregated
|
| 193 |
+
cloverlm ({'pretrained': 'daslab-testing/CloverLM', 'dtype': 'bfloat16', 'quartet_2_impl': 'quartet2', 'attn_backend': 'pytorch'}), gen_kwargs: ({}), limit: None, num_fewshot: 0, batch_size: auto (64)
|
| 194 |
+
| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|
| 195 |
+
|----------------|------:|------|-----:|---------------|---|-----:|---|-----:|
|
| 196 |
+
|arc_challenge_mi| 1|none | 0|acc |↑ |0.4642|± |0.0146|
|
| 197 |
+
| | |none | 0|acc_mutual_info|↑ |0.5017|± |0.0146|
|
| 198 |
+
| | |none | 0|acc_norm |↑ |0.4940|± |0.0146|
|
| 199 |
+
|arc_easy_mi | 1|none | 0|acc |↑ |0.8005|± |0.0082|
|
| 200 |
+
| | |none | 0|acc_mutual_info|↑ |0.7193|± |0.0092|
|
| 201 |
+
| | |none | 0|acc_norm |↑ |0.7740|± |0.0086|
|
| 202 |
+
|hellaswag | 1|none | 0|acc |↑ |0.5392|± |0.0050|
|
| 203 |
+
| | |none | 0|acc_norm |↑ |0.7169|± |0.0045|
|
| 204 |
+
|piqa | 1|none | 0|acc |↑ |0.7911|± |0.0095|
|
| 205 |
+
| | |none | 0|acc_norm |↑ |0.8090|± |0.0092|
|
| 206 |
+
|
| 207 |
+
[rank0]:[W319 14:20:30.213375773 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
lm_eval/test_eval2.log
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The following values were not passed to `accelerate launch` and had defaults used instead:
|
| 2 |
+
`--num_processes` was set to a value of `2`
|
| 3 |
+
More than one GPU was found, enabling multi-GPU training.
|
| 4 |
+
If this was unintended please pass in `--num_processes=1`.
|
| 5 |
+
`--num_machines` was set to a value of `1`
|
| 6 |
+
`--mixed_precision` was set to a value of `'no'`
|
| 7 |
+
`--dynamo_backend` was set to a value of `'no'`
|
| 8 |
+
To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
|
| 9 |
+
2026-03-19:16:52:56 INFO [_cli.run:375] Including path: ./
|
| 10 |
+
2026-03-19:16:52:56 INFO [_cli.run:376] Selected Tasks: ['arc_easy_mi', 'arc_challenge_mi', 'hellaswag', 'piqa']
|
| 11 |
+
2026-03-19:16:52:56 INFO [evaluator:211] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
|
| 12 |
+
2026-03-19:16:52:56 INFO [evaluator:236] Initializing cloverlm model, with arguments: {'pretrained': 'daslab-testing/CloverLM', 'dtype': 'bfloat16', 'quartet_2_impl': 'quartet2', 'attn_backend': 'pytorch', 'trust_remote_code': True}
|
| 13 |
+
2026-03-19:16:52:56 INFO [models.huggingface:178] Using `accelerate launch` or `parallelize=True`, device 'cuda:0' will be overridden when placing model.
|
| 14 |
+
2026-03-19:16:52:56 INFO [_cli.run:375] Including path: ./
|
| 15 |
+
2026-03-19:16:52:56 INFO [_cli.run:376] Selected Tasks: ['arc_easy_mi', 'arc_challenge_mi', 'hellaswag', 'piqa']
|
| 16 |
+
2026-03-19:16:52:56 INFO [evaluator:211] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
|
| 17 |
+
2026-03-19:16:52:56 INFO [evaluator:236] Initializing cloverlm model, with arguments: {'pretrained': 'daslab-testing/CloverLM', 'dtype': 'bfloat16', 'quartet_2_impl': 'quartet2', 'attn_backend': 'pytorch', 'trust_remote_code': True}
|
| 18 |
+
2026-03-19:16:52:57 INFO [models.huggingface:178] Using `accelerate launch` or `parallelize=True`, device 'cuda:0' will be overridden when placing model.
|
| 19 |
+
Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
|
| 20 |
+
2026-03-19:16:52:57 INFO [models.huggingface:548] Model type cannot be determined. Using default model type 'causal'
|
| 21 |
+
2026-03-19:16:52:57 INFO [models.huggingface:548] Model type cannot be determined. Using default model type 'causal'
|
| 22 |
+
Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
|
| 23 |
+
[rank1]: Traceback (most recent call last):
|
| 24 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/eval.py", line 65, in <module>
|
| 25 |
+
[rank1]: cli_evaluate()
|
| 26 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/__main__.py", line 10, in cli_evaluate
|
| 27 |
+
[rank1]: parser.execute(args)
|
| 28 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/_cli/harness.py", line 60, in execute
|
| 29 |
+
[rank1]: args.func(args)
|
| 30 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/_cli/run.py", line 379, in _execute
|
| 31 |
+
[rank1]: results = simple_evaluate(
|
| 32 |
+
[rank1]: ^^^^^^^^^^^^^^^^
|
| 33 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/utils.py", line 498, in _wrapper
|
| 34 |
+
[rank1]: return fn(*args, **kwargs)
|
| 35 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^^
|
| 36 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/evaluator.py", line 239, in simple_evaluate
|
| 37 |
+
[rank1]: lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
|
| 38 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 39 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/api/model.py", line 180, in create_from_arg_obj
|
| 40 |
+
[rank1]: return cls(**arg_dict, **additional_config)
|
| 41 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 42 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/eval.py", line 11, in __init__
|
| 43 |
+
[rank1]: super().__init__(**kwargs)
|
| 44 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/models/huggingface.py", line 204, in __init__
|
| 45 |
+
[rank1]: self._create_tokenizer(
|
| 46 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/models/huggingface.py", line 793, in _create_tokenizer
|
| 47 |
+
[rank1]: self.tokenizer = transformers.AutoTokenizer.from_pretrained(
|
| 48 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 49 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 732, in from_pretrained
|
| 50 |
+
[rank1]: tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
|
| 51 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 52 |
+
[rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/transformers/dynamic_module_utils.py", line 567, in get_class_from_dynamic_module
|
| 53 |
+
[rank1]: module_file, class_name = class_reference.split(".")
|
| 54 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^
|
| 55 |
+
[rank1]: ValueError: not enough values to unpack (expected 2, got 1)
|
| 56 |
+
[rank0]: Traceback (most recent call last):
|
| 57 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/eval.py", line 65, in <module>
|
| 58 |
+
[rank0]: cli_evaluate()
|
| 59 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/__main__.py", line 10, in cli_evaluate
|
| 60 |
+
[rank0]: parser.execute(args)
|
| 61 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/_cli/harness.py", line 60, in execute
|
| 62 |
+
[rank0]: args.func(args)
|
| 63 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/_cli/run.py", line 379, in _execute
|
| 64 |
+
[rank0]: results = simple_evaluate(
|
| 65 |
+
[rank0]: ^^^^^^^^^^^^^^^^
|
| 66 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/utils.py", line 498, in _wrapper
|
| 67 |
+
[rank0]: return fn(*args, **kwargs)
|
| 68 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^
|
| 69 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/evaluator.py", line 239, in simple_evaluate
|
| 70 |
+
[rank0]: lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
|
| 71 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 72 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/api/model.py", line 180, in create_from_arg_obj
|
| 73 |
+
[rank0]: return cls(**arg_dict, **additional_config)
|
| 74 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 75 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/eval.py", line 11, in __init__
|
| 76 |
+
[rank0]: super().__init__(**kwargs)
|
| 77 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/models/huggingface.py", line 204, in __init__
|
| 78 |
+
[rank0]: self._create_tokenizer(
|
| 79 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/models/huggingface.py", line 793, in _create_tokenizer
|
| 80 |
+
[rank0]: self.tokenizer = transformers.AutoTokenizer.from_pretrained(
|
| 81 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 82 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 732, in from_pretrained
|
| 83 |
+
[rank0]: tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
|
| 84 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 85 |
+
[rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/transformers/dynamic_module_utils.py", line 567, in get_class_from_dynamic_module
|
| 86 |
+
[rank0]: module_file, class_name = class_reference.split(".")
|
| 87 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^
|
| 88 |
+
[rank0]: ValueError: not enough values to unpack (expected 2, got 1)
|
| 89 |
+
[rank0]:[W319 16:52:58.069226968 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
| 90 |
+
W0319 16:52:59.444000 1490612 torch/distributed/elastic/multiprocessing/api.py:1010] Sending process 1490848 closing signal SIGTERM
|
| 91 |
+
E0319 16:52:59.508000 1490612 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 1490847) of binary: /home/matin/convert_dir/CloverLM/lm_eval/.venv/bin/python
|
| 92 |
+
Traceback (most recent call last):
|
| 93 |
+
File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/bin/accelerate", line 10, in <module>
|
| 94 |
+
sys.exit(main())
|
| 95 |
+
^^^^^^
|
| 96 |
+
File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main
|
| 97 |
+
args.func(args)
|
| 98 |
+
File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1396, in launch_command
|
| 99 |
+
multi_gpu_launcher(args)
|
| 100 |
+
File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1023, in multi_gpu_launcher
|
| 101 |
+
distrib_run.run(args)
|
| 102 |
+
File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/torch/distributed/run.py", line 982, in run
|
| 103 |
+
elastic_launch(
|
| 104 |
+
File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
|
| 105 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 106 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 107 |
+
File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
|
| 108 |
+
raise ChildFailedError(
|
| 109 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 110 |
+
============================================================
|
| 111 |
+
eval.py FAILED
|
| 112 |
+
------------------------------------------------------------
|
| 113 |
+
Failures:
|
| 114 |
+
[1]:
|
| 115 |
+
time : 2026-03-19_16:52:59
|
| 116 |
+
host : b300-eval.datacrunch.io
|
| 117 |
+
rank : 1 (local_rank: 1)
|
| 118 |
+
exitcode : 1 (pid: 1490848)
|
| 119 |
+
error_file: <N/A>
|
| 120 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 121 |
+
------------------------------------------------------------
|
| 122 |
+
Root Cause (first observed failure):
|
| 123 |
+
[0]:
|
| 124 |
+
time : 2026-03-19_16:52:59
|
| 125 |
+
host : b300-eval.datacrunch.io
|
| 126 |
+
rank : 0 (local_rank: 0)
|
| 127 |
+
exitcode : 1 (pid: 1490847)
|
| 128 |
+
error_file: <N/A>
|
| 129 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 130 |
+
============================================================
|
vllm_plugin/SERVING.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Serving CloverLM with vLLM (Quartet II NVFP4)
|
| 2 |
+
|
| 3 |
+
## Prerequisites
|
| 4 |
+
|
| 5 |
+
- NVIDIA Blackwell GPU (B300 / B200 / RTX 5090) for real Quartet II NVFP4 kernels
|
| 6 |
+
- CUDA 13.0+
|
| 7 |
+
- Python 3.11+
|
| 8 |
+
- The Quartet II kernels (`quartet2` package) installed
|
| 9 |
+
|
| 10 |
+
## 1. Environment Setup
|
| 11 |
+
|
| 12 |
+
```bash
|
| 13 |
+
# Activate the existing environment
|
| 14 |
+
source .venv/bin/activate
|
| 15 |
+
|
| 16 |
+
# Set CUDA paths
|
| 17 |
+
export CUDA_HOME=/usr/local/cuda-13.0/
|
| 18 |
+
export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
| 19 |
+
export PATH=/usr/local/cuda/bin:$PATH
|
| 20 |
+
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## 2. Install vLLM
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest \
|
| 27 |
+
| jq -r .tag_name | sed 's/^v//')
|
| 28 |
+
export CUDA_VERSION=130
|
| 29 |
+
export CPU_ARCH=$(uname -m)
|
| 30 |
+
|
| 31 |
+
uv pip install \
|
| 32 |
+
"https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_35_${CPU_ARCH}.whl" \
|
| 33 |
+
--extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
## 3. Serve the Model
|
| 37 |
+
|
| 38 |
+
### Offline inference (quick test)
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
cd /home/matin/convert_dir/CloverLM/vllm_plugin
|
| 42 |
+
python serve.py
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
### OpenAI-compatible API server
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
cd /home/matin/convert_dir/CloverLM/vllm_plugin
|
| 49 |
+
python serve.py --api --port 8000
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
Then query:
|
| 53 |
+
|
| 54 |
+
```bash
|
| 55 |
+
curl http://localhost:8000/v1/completions \
|
| 56 |
+
-H "Content-Type: application/json" \
|
| 57 |
+
-d '{
|
| 58 |
+
"model": "/home/matin/convert_dir/CloverLM",
|
| 59 |
+
"prompt": "The capital of France is",
|
| 60 |
+
"max_tokens": 64,
|
| 61 |
+
"temperature": 0.8
|
| 62 |
+
}'
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### Options
|
| 66 |
+
|
| 67 |
+
| Flag | Default | Description |
|
| 68 |
+
|------|---------|-------------|
|
| 69 |
+
| `--model` | `../` (CloverLM dir) | Path to CloverLM model directory |
|
| 70 |
+
| `--api` | off | Start OpenAI-compatible API server |
|
| 71 |
+
| `--port` | 8000 | API server port |
|
| 72 |
+
| `--host` | 0.0.0.0 | API server host |
|
| 73 |
+
| `--tp` | 1 | Tensor parallel size |
|
| 74 |
+
| `--max-model-len` | 1024 | Maximum context length |
|
| 75 |
+
| `--gpu-memory-utilization` | 0.9 | GPU memory fraction to use |
|
| 76 |
+
|
| 77 |
+
## Architecture
|
| 78 |
+
|
| 79 |
+
The vLLM integration consists of three components:
|
| 80 |
+
|
| 81 |
+
1. **`quartet2_quant.py`** -- Quartet II quantization plugin registered as `"quartet2"`.
|
| 82 |
+
Wraps the Quartet II on-the-fly FP4 quantization (`quant_fp4` + `flashinfer.mm_fp4`)
|
| 83 |
+
into vLLM's `LinearMethodBase` interface. Weights stay in bf16; quantization happens
|
| 84 |
+
at each forward pass.
|
| 85 |
+
|
| 86 |
+
2. **`cloverlm_vllm.py`** -- Full vLLM model implementation with paged KV cache.
|
| 87 |
+
Reimplements CloverLM's architecture using vLLM primitives:
|
| 88 |
+
- `ColumnParallelLinear` / `RowParallelLinear` for Q/K/V/O and MLP projections
|
| 89 |
+
- vLLM `Attention` for paged KV caching and efficient attention
|
| 90 |
+
- Custom RoPE (base 1024, repeat_interleave pattern)
|
| 91 |
+
- Sphere normalization on Q/K before attention
|
| 92 |
+
- Per-head learnable scale parameter
|
| 93 |
+
- Squared ReLU activation in MLP
|
| 94 |
+
- Post-sublayer RMSNorm (not pre-norm)
|
| 95 |
+
|
| 96 |
+
3. **`serve.py`** -- Entry point that registers both the quantization plugin and model,
|
| 97 |
+
then launches vLLM in offline or API mode.
|
| 98 |
+
|
| 99 |
+
## Known Limitations
|
| 100 |
+
|
| 101 |
+
- **CUDA graphs**: Currently `enforce_eager=True` is required because the Quartet II
|
| 102 |
+
on-the-fly quantization kernels (`quant_fp4` + `mm_fp4`) are not compatible with
|
| 103 |
+
CUDA graph capture. This means slightly higher per-token latency compared to
|
| 104 |
+
CUDA-graph-enabled models. A future update to the Quartet II kernels could remove
|
| 105 |
+
this limitation.
|
| 106 |
+
|
| 107 |
+
## Troubleshooting
|
| 108 |
+
|
| 109 |
+
**"No module named 'quartet2'"**: Ensure the Quartet II kernels are installed:
|
| 110 |
+
```bash
|
| 111 |
+
uv pip install "quartet2 @ git+https://github.com/IST-DASLab/Quartet-II.git#subdirectory=kernels"
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
**CUDA errors**: Make sure `CUDA_HOME` points to CUDA 13.0+ and `TRITON_PTXAS_PATH` is set.
|
| 115 |
+
|
| 116 |
+
**Out of memory**: Reduce `--gpu-memory-utilization` or use `--tp 2` for tensor parallelism.
|
vllm_plugin/__init__.py
ADDED
|
File without changes
|
vllm_plugin/cloverlm_vllm.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from typing import Iterable, Optional
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
|
| 10 |
+
from vllm.config import VllmConfig
|
| 11 |
+
from vllm.model_executor.layers.attention import Attention
|
| 12 |
+
from vllm.model_executor.layers.layernorm import RMSNorm
|
| 13 |
+
from vllm.model_executor.layers.linear import (
|
| 14 |
+
ColumnParallelLinear,
|
| 15 |
+
RowParallelLinear,
|
| 16 |
+
)
|
| 17 |
+
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
| 18 |
+
from vllm.model_executor.layers.quantization import QuantizationConfig
|
| 19 |
+
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
| 20 |
+
ParallelLMHead,
|
| 21 |
+
VocabParallelEmbedding,
|
| 22 |
+
)
|
| 23 |
+
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
| 24 |
+
from vllm.model_executor.models.utils import AutoWeightsLoader, WeightsMapper
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _build_rope_cos_sin(
|
| 28 |
+
positions: torch.Tensor,
|
| 29 |
+
d_head: int,
|
| 30 |
+
device: torch.device,
|
| 31 |
+
) -> tuple[torch.Tensor, torch.Tensor]:
|
| 32 |
+
js = torch.arange(d_head // 2, device=device, dtype=torch.float32)
|
| 33 |
+
theta = 1.0 / (1024.0 ** (2.0 * js / d_head))
|
| 34 |
+
phi = positions.float().unsqueeze(-1) * theta.unsqueeze(0)
|
| 35 |
+
cos = torch.cos(phi).repeat_interleave(2, dim=-1)
|
| 36 |
+
sin = torch.sin(phi).repeat_interleave(2, dim=-1)
|
| 37 |
+
return cos, sin
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _apply_rope(
|
| 41 |
+
x: torch.Tensor,
|
| 42 |
+
cos: torch.Tensor,
|
| 43 |
+
sin: torch.Tensor,
|
| 44 |
+
) -> torch.Tensor:
|
| 45 |
+
x_rot = torch.empty_like(x)
|
| 46 |
+
x_rot[..., 0::2] = -x[..., 1::2]
|
| 47 |
+
x_rot[..., 1::2] = x[..., 0::2]
|
| 48 |
+
return (x * cos + x_rot * sin).to(x.dtype)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class CloverLMAttention(nn.Module):
    """CloverLM self-attention built on vLLM primitives.

    Projects hidden states with separate Q/K/V column-parallel linears,
    applies interleaved RoPE (base 1024, via ``_build_rope_cos_sin`` /
    ``_apply_rope``), L2-normalizes Q and K per head ("sphere norm"),
    multiplies Q by a frozen per-head scale, then delegates attention —
    including paged KV caching — to vLLM's ``Attention`` layer.
    """

    def __init__(
        self,
        d: int,
        num_heads: int,
        num_kv_heads: int,
        head_dim: int,
        cache_config=None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = head_dim
        # Flat projection widths for the fused token-major layout.
        self.q_size = num_heads * head_dim
        self.kv_size = num_kv_heads * head_dim

        self.lq = ColumnParallelLinear(
            d, self.q_size, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.lq",
        )
        self.lk = ColumnParallelLinear(
            d, self.kv_size, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.lk",
        )
        self.lv = ColumnParallelLinear(
            d, self.kv_size, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.lv",
        )
        self.lo = RowParallelLinear(
            self.q_size, d, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.lo",
        )

        # Per-head learnable scale: stored as (1, heads, 1, 1) in the
        # checkpoint; squeezed to (1, heads, 1) in forward() so it
        # broadcasts over (tokens, heads, head_dim) after sphere norm.
        # Frozen (requires_grad=False) since this is inference-only.
        self.scale = nn.Parameter(
            torch.empty(1, num_heads, 1, 1),
            requires_grad=False,
        )

        # scale=1.0: softmax scaling is presumably carried entirely by the
        # learned per-head scale applied to Q above — confirm against the
        # reference HF implementation.
        self.attn = Attention(
            num_heads=num_heads,
            head_size=head_dim,
            scale=1.0,
            num_kv_heads=num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Run attention for a flat (num_tokens, d) batch of hidden states."""
        q, _ = self.lq(hidden_states)
        k, _ = self.lk(hidden_states)
        v, _ = self.lv(hidden_states)

        # RoPE tables for the per-token positions supplied by vLLM.
        cos, sin = _build_rope_cos_sin(
            positions, self.head_dim, hidden_states.device,
        )

        q = q.view(-1, self.num_heads, self.head_dim)
        k = k.view(-1, self.num_kv_heads, self.head_dim)

        # unsqueeze(1) broadcasts the (tokens, head_dim) tables over heads.
        q = _apply_rope(q, cos.unsqueeze(1), sin.unsqueeze(1))
        k = _apply_rope(k, cos.unsqueeze(1), sin.unsqueeze(1))

        # Sphere normalization: unit-norm Q/K per head, applied AFTER RoPE.
        q = F.normalize(q, dim=-1)
        k = F.normalize(k, dim=-1)

        # scale: (1, heads, 1, 1) → squeeze to (1, heads, 1), broadcast
        # over (tokens, heads, head_dim). Only Q is scaled.
        q = q * self.scale.squeeze(-1)

        # Back to the flat token-major layout expected by vLLM Attention.
        q = q.reshape(-1, self.q_size)
        k = k.reshape(-1, self.kv_size)

        attn_output = self.attn(q, k, v)
        output, _ = self.lo(attn_output)
        return output
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
class CloverLMMLP(nn.Module):
    """CloverLM feed-forward block: Linear → squared ReLU → Linear.

    Hidden width is fixed at ``4 * d``. The first projection uses prefix
    ``"{prefix}.l1.0"`` because in the HF checkpoint ``l1`` is a
    ``Sequential(Linear, ReLU²)`` and the linear weight sits at index 0
    (see the name remapping in ``load_weights`` below).
    """

    def __init__(
        self,
        d: int,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        # Fixed 4x expansion.
        d_hidden = 4 * d
        self.l1 = ColumnParallelLinear(
            d, d_hidden, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.l1.0",
        )
        self.l2 = RowParallelLinear(
            d_hidden, d, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.l2",
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, _ = self.l1(x)
        # Squared-ReLU activation: relu(x)**2.
        x = F.relu(x) ** 2
        x, _ = self.l2(x)
        return x
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
class CloverLMBlock(nn.Module):
    """One CloverLM transformer layer with *post*-sublayer RMSNorm.

    Residual pattern is ``x = x + norm(sublayer(x))`` — the norm is
    applied to the sublayer's output before the residual add, not to its
    input (i.e. not the usual pre-norm arrangement).
    """

    def __init__(
        self,
        d: int,
        num_heads: int,
        num_kv_heads: int,
        head_dim: int,
        cache_config=None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.mhsa = CloverLMAttention(
            d, num_heads, num_kv_heads, head_dim,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.mhsa",
        )
        self.out_att_norm = RMSNorm(d)
        self.mlp = CloverLMMLP(
            d,
            quant_config=quant_config,
            prefix=f"{prefix}.mlp",
        )
        self.out_mlp_norm = RMSNorm(d)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Apply attention and MLP sublayers with post-norm residuals."""
        # Post-norm attention residual
        attn_out = self.mhsa(positions, hidden_states)
        attn_out = self.out_att_norm(attn_out)
        hidden_states = hidden_states + attn_out

        # Post-norm MLP residual
        mlp_out = self.mlp(hidden_states)
        mlp_out = self.out_mlp_norm(mlp_out)
        hidden_states = hidden_states + mlp_out

        return hidden_states
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
class CloverLMModel(nn.Module):
    """CloverLM decoder stack: token embedding → N blocks → final RMSNorm.

    Model width is derived as ``d = config.heads * config.d_head``; each
    block gets ``config.heads // config.ratio`` KV heads (grouped-query
    attention).
    """

    def __init__(
        self,
        config,
        cache_config=None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config
        # Hidden width is implied by head count × head dim.
        d = config.heads * config.d_head

        self.emb = VocabParallelEmbedding(
            config.vocab_size, d,
            quant_config=quant_config,
            prefix=f"{prefix}.emb",
        )
        self.blocks = nn.ModuleList([
            CloverLMBlock(
                d, config.heads,
                config.heads // config.ratio,
                config.d_head,
                cache_config=cache_config,
                quant_config=quant_config,
                prefix=f"{prefix}.blocks.{i}",
            )
            for i in range(config.num_blocks)
        ])
        self.out_norm = RMSNorm(d)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Embed token ids without running the decoder stack."""
        return self.emb(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors=None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the full stack; returns normalized hidden states.

        ``intermediate_tensors`` is accepted but unused — presumably kept
        for vLLM's forward signature (pipeline parallelism); confirm
        before relying on PP support.
        """
        # Precomputed embeddings take priority over token ids.
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.emb(input_ids)

        for block in self.blocks:
            hidden_states = block(positions, hidden_states)

        hidden_states = self.out_norm(hidden_states)
        return hidden_states
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
# HF→vLLM checkpoint prefix mapper ("transformer.*" → "model.*").
# NOTE(review): load_weights below performs this same rename manually via
# str.replace, so this mapper appears unused in this file — presumably kept
# for AutoWeightsLoader compatibility; confirm before removing.
_HF_TO_VLLM = WeightsMapper(
    orig_to_new_prefix={"transformer.": "model."},
)
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
class CloverLMForCausalLM_vLLM(nn.Module):
    """vLLM entry point for CloverLM causal language modeling.

    Wraps :class:`CloverLMModel` with an (optionally tied) LM head and a
    vLLM ``LogitsProcessor``, and maps HF checkpoint names onto this
    module tree in :meth:`load_weights`.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        d = config.heads * config.d_head
        self.config = config

        self.model = CloverLMModel(
            config,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}model",
        )

        self.lm_head = ParallelLMHead(
            config.vocab_size, d, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}lm_head",
        )
        self.logits_processor = LogitsProcessor(config.vocab_size)

        # Weight tying (default on): the LM head shares the embedding
        # matrix; the checkpoint's separate head weight is skipped in
        # load_weights.
        if getattr(config, "weight_tying", True):
            self.lm_head.weight = self.model.emb.weight

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Embed token ids (delegates to the inner model)."""
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors=None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Return final hidden states; logits come from compute_logits."""
        hidden_states = self.model(
            input_ids, positions, intermediate_tensors, inputs_embeds,
        )
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
        """Project hidden states to vocabulary logits via the LM head."""
        return self.logits_processor(self.lm_head, hidden_states)

    def load_weights(
        self,
        weights: Iterable[tuple[str, torch.Tensor]],
    ) -> set[str]:
        """Load HF-format weights into this module tree.

        Renames ``transformer.`` → ``model.`` and flattens the HF MLP's
        Sequential index (``mlp.l1.0`` → ``mlp.l1``). When weight tying is
        on, the checkpoint's ``transformer.linear.weight`` head is skipped
        (lm_head already aliases emb.weight). Returns the set of vLLM
        parameter names that were loaded.
        """
        params_dict = dict(self.named_parameters(remove_duplicate=False))
        loaded: set[str] = set()

        # NOTE: despite the name, entries are matched EXACTLY against
        # hf_name below, not as prefixes.
        skip_prefixes = set()
        if getattr(self.config, "weight_tying", True):
            skip_prefixes.add("transformer.linear.weight")

        skipped = []
        unmapped = []
        for hf_name, loaded_weight in weights:
            if hf_name in skip_prefixes:
                skipped.append(hf_name)
                continue

            # Map HuggingFace names → vLLM names
            vllm_name = hf_name.replace("transformer.", "model.", 1)

            # In HuggingFace model, MLP l1 is Sequential(Linear, ReLU²),
            # so the linear weight is at "mlp.l1.0.weight". In our vLLM
            # model l1 is a flat ColumnParallelLinear → "mlp.l1.weight".
            vllm_name = vllm_name.replace(".mlp.l1.0.", ".mlp.l1.")

            if vllm_name not in params_dict:
                unmapped.append(f"{hf_name} -> {vllm_name}")
                continue

            param = params_dict[vllm_name]
            # Prefer the param's own sharded loader (tensor parallelism);
            # fall back to a plain copy.
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)
            loaded.add(vllm_name)

        not_loaded = set(params_dict.keys()) - loaded
        import logging
        logger = logging.getLogger(__name__)
        logger.info("Loaded %d/%d params, skipped %d, unmapped %d, "
                    "not_loaded %d",
                    len(loaded), len(params_dict), len(skipped),
                    len(unmapped), len(not_loaded))
        if unmapped:
            logger.warning("Unmapped HF keys: %s", unmapped)
        if not_loaded:
            logger.warning("Params not loaded: %s", sorted(not_loaded))

        return loaded
|
vllm_plugin/quartet2_quant.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from torch.nn import Parameter
|
| 5 |
+
|
| 6 |
+
from vllm.model_executor.layers.quantization import (
|
| 7 |
+
register_quantization_config,
|
| 8 |
+
)
|
| 9 |
+
from vllm.model_executor.layers.quantization.base_config import (
|
| 10 |
+
QuantizationConfig,
|
| 11 |
+
QuantizeMethodBase,
|
| 12 |
+
)
|
| 13 |
+
from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
|
| 14 |
+
from vllm.model_executor.parameter import ModelWeightParameter
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@register_quantization_config("quartet2")
class QuartetIIConfig(QuantizationConfig):
    """vLLM quantization config for Quartet II NVFP4 linear layers.

    Stateless: selecting ``quantization="quartet2"`` routes every linear
    layer through :class:`QuartetIILinearMethod`.
    """

    def get_name(self) -> str:
        # Identifier used with vLLM's quantization="quartet2" option.
        return "quartet2"

    def get_supported_act_dtypes(self) -> list:
        # Only bf16 activations are accepted by the quantized path.
        return [torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        return 100  # Blackwell (SM 10.0)

    @staticmethod
    def get_config_filenames() -> list[str]:
        # No on-disk quantization config; everything is defined in code.
        return []

    @classmethod
    def from_config(cls, config: dict) -> "QuartetIIConfig":
        # Stateless config — the parsed dict carries nothing we need.
        return cls()

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> QuantizeMethodBase | None:
        """Return the Quartet II method for linear layers, None otherwise."""
        if isinstance(layer, LinearBase):
            return QuartetIILinearMethod(self)
        return None
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class QuartetIILinearMethod(LinearMethodBase):
    """Linear-layer method that runs GEMMs in NVFP4 via Quartet II kernels.

    The master weight is stored in ``params_dtype`` (bf16) and both the
    activation and the weight are quantized to FP4 on every forward call.
    """

    def __init__(self, config: QuartetIIConfig):
        # Config is stateless; kept for interface parity with vLLM methods.
        self.config = config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Allocate the unquantized master weight for this layer.

        Quantization happens on the fly in :meth:`apply`, so the stored
        parameter is a plain [out, in] matrix in ``params_dtype``.
        """
        output_size_per_partition = sum(output_partition_sizes)
        weight = ModelWeightParameter(
            data=torch.empty(
                output_size_per_partition,
                input_size_per_partition,
                dtype=params_dtype,
            ),
            input_dim=1,
            output_dim=0,
            weight_loader=extra_weight_attrs.get("weight_loader"),
        )
        layer.register_parameter("weight", weight)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Attach per-layer helper tensors once weights are loaded.

        NOTE(review): neither ``layer.had`` nor ``layer.scratch_amax`` is
        referenced in :meth:`apply` below — presumably they are consumed
        by the quartet2 kernels internally; confirm they are still needed.
        """
        from scipy.linalg import hadamard as scipy_hadamard
        device = layer.weight.device
        # Orthonormal 128x128 Hadamard: scaled by 128^-0.5 so H @ H.T == I.
        had_np = scipy_hadamard(128) * 128 ** -0.5
        layer.had = torch.tensor(
            had_np, dtype=torch.bfloat16, device=device, requires_grad=False,
        )
        # Scalar uint32 scratch buffer (amax reduction workspace).
        layer.scratch_amax = torch.empty(
            (), dtype=torch.uint32, device=device,
        )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute ``x @ weight.T (+ bias)`` with both operands in NVFP4."""
        from quartet2.quant import quant_fp4, NVFP4QuantMode
        from quartet2.linear import abs_max, _fp4_mm

        weight = layer.weight
        orig_shape = x.shape
        # Collapse leading dims so the kernel sees a 2-D [tokens, hidden] input.
        flat_x = x.reshape(-1, x.shape[-1])

        # Quartet II requires rows to be multiples of 128; pad if needed.
        num_rows = flat_x.shape[0]
        remainder = num_rows % 128
        if remainder != 0:
            pad_rows = 128 - remainder
            flat_x = F.pad(flat_x, (0, 0, 0, pad_rows))
        else:
            pad_rows = 0

        # Tensor-wide absolute maxima drive the global FP4 scales.
        input_amax = abs_max(flat_x)
        weight_amax = abs_max(weight)

        mode = NVFP4QuantMode.FOUR_SIX
        scale_override = 1.0

        input_fp4 = quant_fp4(
            flat_x, amax=input_amax,
            scale_override=scale_override, mode=mode,
        )
        # NOTE(review): the weight is re-quantized on every forward call;
        # caching the quantized weight in process_weights_after_loading
        # would avoid this — confirm against quartet2's storage format.
        weight_fp4 = quant_fp4(
            weight, amax=weight_amax,
            scale_override=scale_override, mode=mode,
        )

        # Combined dequantization scale for the FP4 x FP4 matmul output.
        alpha = input_fp4.tensor_scale * weight_fp4.tensor_scale
        output = _fp4_mm(
            input_fp4.fp4, weight_fp4.fp4,
            input_fp4.micro_scales, weight_fp4.micro_scales,
            alpha,
        )

        # Drop padding rows, then restore the original leading dims.
        if pad_rows > 0:
            output = output[:num_rows]

        output = output.reshape(*orig_shape[:-1], output.shape[-1])
        if bias is not None:
            output = output + bias
        return output
|
vllm_plugin/serve.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
"""Launcher that registers the CloverLM vLLM plugin before serving."""
import argparse
import os
import sys

# Make the plugin modules importable regardless of the working directory;
# MODEL_DIR is the parent of this plugin dir — presumably the model
# checkpoint directory itself (confirm against the repo layout).
PLUGIN_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_DIR = os.path.dirname(PLUGIN_DIR)
sys.path.insert(0, PLUGIN_DIR)

# Register the Quartet II quantization plugin before any vLLM imports
import quartet2_quant  # noqa: F401 — triggers @register_quantization_config

from vllm import ModelRegistry
from cloverlm_vllm import CloverLMForCausalLM_vLLM

# Map the HF architecture name onto our vLLM implementation.
ModelRegistry.register_model(
    "CloverLMForCausalLM", CloverLMForCausalLM_vLLM,
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def main():
    """CLI entry point: offline demo generation by default, OpenAI API server with --api."""
    parser = argparse.ArgumentParser(description="Serve CloverLM with vLLM")
    parser.add_argument(
        "--model", default=MODEL_DIR,
        help="Path to CloverLM model directory",
    )
    parser.add_argument("--api", action="store_true", help="Start OpenAI API server")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
    parser.add_argument(
        "--max-model-len", type=int, default=1024,
        help="Maximum context length",
    )
    parser.add_argument(
        "--gpu-memory-utilization", type=float, default=0.9,
    )
    args = parser.parse_args()

    # Dispatch to the API server or the offline demo.
    runner = _serve_api if args.api else _offline_inference
    runner(args)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _offline_inference(args):
    """Run a handful of demo prompts through vLLM's offline engine."""
    from vllm import LLM, SamplingParams

    llm = LLM(
        model=args.model,
        quantization="quartet2",
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=args.max_model_len,
        tensor_parallel_size=args.tp,
        gpu_memory_utilization=args.gpu_memory_utilization,
        enforce_eager=True,
    )

    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=128,
    )

    prompts = [
        "The capital of France is",
        "Large language models are",
        "In the year 2030,",
    ]

    banner = "=" * 60
    print(banner)
    print(" CloverLM — vLLM Offline Inference (Quartet II NVFP4)")
    print(banner)
    # Generate for all prompts in one batch and echo each completion.
    for result in llm.generate(prompts, sampling_params):
        print(f"\nPrompt: {result.prompt}")
        print(f"Generated: {result.outputs[0].text}")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _serve_api(args):
    """Start vLLM's OpenAI-compatible API server with our CLI settings."""
    # vLLM's server CLI reads sys.argv, so synthesize the argument vector
    # (same flags, same order as before).
    argv = ["vllm"]
    argv += ["--model", args.model]
    argv += ["--quantization", "quartet2"]
    argv.append("--trust-remote-code")
    argv += ["--dtype", "bfloat16"]
    argv += ["--max-model-len", str(args.max_model_len)]
    argv += ["--tensor-parallel-size", str(args.tp)]
    argv += ["--gpu-memory-utilization", str(args.gpu_memory_utilization)]
    argv.append("--enforce-eager")
    argv += ["--host", args.host]
    argv += ["--port", str(args.port)]
    sys.argv = argv

    import asyncio

    from vllm.entrypoints.openai.api_server import run_server
    from vllm.entrypoints.openai.cli_args import make_arg_parser
    from vllm.utils.argparse_utils import FlexibleArgumentParser

    vllm_parser = make_arg_parser(FlexibleArgumentParser())
    asyncio.run(run_server(vllm_parser.parse_args()))
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# Allow direct invocation: ``python serve.py [--api] [...]``.
if __name__ == "__main__":
    main()
|