1kxia committed
Commit 6820eba · verified · 1 Parent(s): c1f423a

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,98 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: gemma
+ base_model: google/gemma-3-270m
+ library_name: transformers
+ tags:
+ - fp8
+ - quantized
+ - embedding
+ - nvidia-modelopt
+ - nanojet
+ pipeline_tag: feature-extraction
+ ---
+
+ # gemma-3-270m-modelopt-fp8
+
+ FP8 (E4M3) quantized version of [google/gemma-3-270m](https://huggingface.co/google/gemma-3-270m), produced with [NVIDIA ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) static FP8 quantization.
+
+ ## Model Details
+
+ | Property | Value |
+ |----------|-------|
+ | Base Model | [google/gemma-3-270m](https://huggingface.co/google/gemma-3-270m) |
+ | Architecture | Gemma3 (18 layers, 4 heads, 1 KV head) |
+ | Hidden Size | 640 |
+ | Intermediate Size | 2048 |
+ | Head Dim | 256 |
+ | Vocab Size | 262,144 |
+ | Max Position Embeddings | 32,768 |
+ | Attention | Sliding window (512) + full attention (every 6th layer) |
+ | Quantization | FP8 E4M3 (weights + input activations) |
+ | Quantization Method | NVIDIA ModelOpt (mtq.FP8_DEFAULT_CFG) |
+ | Model Size | 416 MB (safetensors) |
+
+ ## Quantization Details
+
+ ### Method
+
+ - **Tool**: NVIDIA ModelOpt static FP8 quantization
+ - **Format**: FP8 E4M3 (torch.float8_e4m3fn)
+ - **Scope**: All linear layers (QKV projections, output projections, MLP layers) are quantized to FP8. Embeddings and RMSNorms remain in BF16.
+ - **Scales**: Per-tensor weight scales and input activation scales are stored alongside the quantized weights.
+
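For intuition, the sketch below shows what per-tensor FP8 (E4M3) weight quantization and dequantization look like in plain PyTorch. The amax/448 scale convention is the usual choice for `torch.float8_e4m3fn` but is an assumption here, not a statement about ModelOpt's exact internals:

```python
import torch

E4M3_MAX = 448.0  # largest finite value representable in torch.float8_e4m3fn

def fp8_quantize(w: torch.Tensor):
    """Per-tensor FP8 quantization: one scale for the whole weight tensor."""
    scale = w.abs().max().float() / E4M3_MAX
    w_fp8 = (w.float() / scale).clamp(-E4M3_MAX, E4M3_MAX).to(torch.float8_e4m3fn)
    return w_fp8, scale

def fp8_dequantize(w_fp8: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """Recover a BF16 approximation of the original weight."""
    return (w_fp8.float() * scale).to(torch.bfloat16)

# Example with an MLP weight shape from this model (intermediate_size x hidden_size)
w = torch.randn(2048, 640, dtype=torch.bfloat16)
w_fp8, scale = fp8_quantize(w)
w_hat = fp8_dequantize(w_fp8, scale)
print((w.float() - w_hat.float()).abs().max())  # per-tensor FP8 rounding error
```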
+ ### Calibration
+
+ - **Dataset**: [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) (real text data)
+ - **Samples**: 64
+ - **Sequence Length**: 256
+ - **Batch Size**: 4
+ - **Activation Scales**: Collected at 4 points per layer (post-layernorm, attention output, MLP input, GELU output), saved in calib.json
+
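A minimal sketch of how this calibration and quantization step could be reproduced with ModelOpt's Python API, using the sample count, sequence length, and batch size listed above; the CNN/DailyMail preprocessing details are assumptions for illustration:

```python
import torch
import modelopt.torch.quantization as mtq
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-3-270m", torch_dtype=torch.bfloat16
).cuda()
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m")

# 64 calibration samples, sequence length 256, batch size 4 (as listed above)
texts = load_dataset("cnn_dailymail", "3.0.0", split="train[:64]")["article"]

def forward_loop(m):
    for i in range(0, len(texts), 4):
        batch = tokenizer(
            texts[i : i + 4],
            max_length=256,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        ).to(m.device)
        m(**batch)

# Static FP8 quantization of the linear layers with ModelOpt's default FP8 config
model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop)
```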
+ ## Precision Evaluation
+
+ Cosine similarity between this FP8 model and the original BF16 model, measured on CNN/DailyMail text inputs (threshold: 0.99):
+
+ | Batch | Seq Len | Cosine Similarity | Result |
+ |-------|---------|-------------------|--------|
+ | 1 | 128 | 0.9919 | PASS |
+ | 2 | 512 | 0.9937 | PASS |
+ | 4 | 1024 | 0.9935 | PASS |
+ | 8 | 2048 | 0.9937 | PASS |
+ | 8 | 100 | 0.9920 | PASS |
+ | 8 | 500 | 0.9933 | PASS |
+ | 8 | 4000 | 0.9937 | PASS |
+
+ All configurations achieve >0.99 cosine similarity with the BF16 baseline.
+
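The metric itself is simple to reproduce. A sketch of one such comparison, assuming the FP8 and BF16 models are already loaded and both expose hidden states through the standard `transformers` interface:

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def last_hidden_cosine(model_fp8, model_bf16, input_ids, attention_mask):
    """Mean per-token cosine similarity between the two models' last hidden states."""
    h_a = model_fp8(input_ids=input_ids, attention_mask=attention_mask,
                    output_hidden_states=True).hidden_states[-1]
    h_b = model_bf16(input_ids=input_ids, attention_mask=attention_mask,
                     output_hidden_states=True).hidden_states[-1]
    cos = F.cosine_similarity(h_a.float(), h_b.float(), dim=-1)  # shape: [batch, seq_len]
    return cos.mean().item()  # compared against the 0.99 threshold above
```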
+ ## File Structure
+
+ ```
+ .
+ ├── config.json              # Model config with quantization_config
+ ├── model.safetensors        # FP8 quantized weights + scales
+ ├── calib.json               # Activation scales per layer
+ ├── tokenizer.json           # Tokenizer
+ ├── tokenizer_config.json    # Tokenizer config
+ ├── special_tokens_map.json  # Special tokens
+ ├── added_tokens.json        # Added tokens
+ └── generation_config.json   # Generation config
+ ```
+
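A short sketch of inspecting these artifacts. The `weight_scale` / `input_scale` key suffixes follow ModelOpt's usual export naming but are an assumption and should be verified against the actual checkpoint:

```python
import json
from safetensors import safe_open

# Per-layer activation scales collected at the 4 calibration points
with open("calib.json") as f:
    calib = json.load(f)
print(len(calib["activation_scales"]), "layers,",
      len(calib["activation_scales"][0]), "scale values each")

# FP8 weights and their per-tensor scales
with safe_open("model.safetensors", framework="pt") as f:
    for name in f.keys():
        if name.endswith("weight_scale") or name.endswith("input_scale"):
            print(name, f.get_tensor(name).item())
```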
+ ## Usage
+
+ This model is designed to be used with the [NanoJet](https://github.com/ai-microsoft/NanoJet_Kernels) inference engine:
+
+ ```python
+ from test.infrastructure.model_utils import load_nanojet_model
+
+ model = load_nanojet_model(
+     "1kxia/gemma-3-270m-modelopt-fp8",
+     batch=8,
+     seq_len=4096,
+     quantization="fp8"
+ )
+ ```
+
+ ## Intended Use
+
+ This model is intended for efficient FP8 inference on NVIDIA GPUs with FP8 support (Hopper architecture and above).
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "<image_soft_token>": 262144
+ }
calib.json ADDED
@@ -0,0 +1,117 @@
+ {
+   "model_type": "gemma3",
+   "hidden_size": 640,
+   "num_hidden_layers": 18,
+   "calibration_samples": 64,
+   "calibration_seq_len": 256,
+   "activation_scales": [
+     [
+       0.7857142857142857,
+       0.8169642857142857,
+       0.36830357142857145,
+       0.029854910714285716
+     ],
+     [
+       0.5758928571428571,
+       0.6741071428571429,
+       2.2857142857142856,
+       0.018415178571428572
+     ],
+     [
+       0.125,
+       0.140625,
+       1.2232142857142858,
+       0.015206473214285714
+     ],
+     [
+       0.10435267857142858,
+       0.05161830357142857,
+       2.982142857142857,
+       0.024832589285714284
+     ],
+     [
+       0.11439732142857142,
+       0.013741629464285714,
+       2.5714285714285716,
+       0.025669642857142856
+     ],
+     [
+       0.12555803571428573,
+       0.020926339285714284,
+       7.285714285714286,
+       0.01416015625
+     ],
+     [
+       0.10379464285714286,
+       0.013532366071428572,
+       3.6607142857142856,
+       0.018136160714285716
+     ],
+     [
+       0.1328125,
+       0.07421875,
+       6.214285714285714,
+       0.020647321428571428
+     ],
+     [
+       0.1875,
+       0.03766741071428571,
+       4.785714285714286,
+       0.016671316964285716
+     ],
+     [
+       0.18638392857142858,
+       0.05022321428571429,
+       6.928571428571429,
+       0.020228794642857144
+     ],
+     [
+       0.22209821428571427,
+       0.037388392857142856,
+       8.392857142857142,
+       0.015276227678571428
+     ],
+     [
+       0.03208705357142857,
+       0.013602120535714286,
+       7.821428571428571,
+       0.016950334821428572
+     ],
+     [
+       0.13169642857142858,
+       0.07254464285714286,
+       8.428571428571429,
+       0.022321428571428572
+     ],
+     [
+       0.25558035714285715,
+       0.054966517857142856,
+       13.857142857142858,
+       0.024553571428571428
+     ],
+     [
+       0.15736607142857142,
+       0.13113839285714285,
+       10.214285714285714,
+       0.019810267857142856
+     ],
+     [
+       0.16964285714285715,
+       0.12276785714285714,
+       8.178571428571429,
+       0.022042410714285716
+     ],
+     [
+       0.19977678571428573,
+       0.13727678571428573,
+       11.785714285714286,
+       0.01318359375
+     ],
+     [
+       0.09542410714285714,
+       0.030412946428571428,
+       4.678571428571429,
+       0.009207589285714286
+     ]
+   ]
+ }
config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "_sliding_window_pattern": 6,
+   "architectures": [
+     "Gemma3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "attn_logit_softcapping": null,
+   "bos_token_id": 2,
+   "eos_token_id": 1,
+   "final_logit_softcapping": null,
+   "head_dim": 256,
+   "hidden_activation": "gelu_pytorch_tanh",
+   "hidden_size": 640,
+   "initializer_range": 0.02,
+   "intermediate_size": 2048,
+   "layer_types": [
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 32768,
+   "model_type": "gemma3_text",
+   "num_attention_heads": 4,
+   "num_hidden_layers": 18,
+   "num_key_value_heads": 1,
+   "pad_token_id": 0,
+   "query_pre_attn_scalar": 256,
+   "rms_norm_eps": 1e-06,
+   "rope_local_base_freq": 10000.0,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": 512,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.55.0.dev0",
+   "use_bidirectional_attention": false,
+   "use_cache": true,
+   "vocab_size": 262144,
+   "quantization_config": {
+     "quant_algo": "FP8",
+     "kv_cache_quant_algo": null
+   }
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cache_implementation": "hybrid",
+   "do_sample": true,
+   "top_k": 64,
+   "top_p": 0.95,
+   "transformers_version": "4.55.0.dev0"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b45d41e87257bf24b1f40fd37797a2764742152750e96e93f306dbefd024b95
+ size 435975888
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "boi_token": "<start_of_image>",
+   "bos_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eoi_token": "<end_of_image>",
+   "eos_token": {
+     "content": "<eos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "image_token": "<image_soft_token>",
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d4046bf0505a327dd5a0abbb427ecd4fc82f99c2ceaa170bc61ecde12809b0c
+ size 33384570
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff