Upload README.md with huggingface_hub
README.md
CHANGED
@@ -14,6 +14,13 @@ base_model: google/gemma-3n-E4B-it
 
 executorch .pte export of google/gemma-3n-E4B-it for on-device mobile inference
 
+## available models
+
+| variant | dtype | size | file |
+|---------|-------|------|------|
+| bf16 | bfloat16 | 13.1 gb | Gemma3n-E4B-IT-text-only.pte |
+| int8 | int8 weights | 9.6 gb | Gemma3n-E4B-text-only-int8.pte |
+
 ## model details
 
 | property | value |
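The table added above lists the two exported .pte files. A minimal sketch of fetching one with huggingface_hub follows; the repo id is a placeholder, since the diff does not name the hosting repository:

```python
from huggingface_hub import hf_hub_download

# placeholder repo id -- replace with the repository that hosts this export
REPO_ID = "your-namespace/gemma-3n-E4B-it-executorch"

# download the int8 variant from the table above (~9.6 gb)
pte_path = hf_hub_download(
    repo_id=REPO_ID,
    filename="Gemma3n-E4B-text-only-int8.pte",
)
print(pte_path)  # local cache path to the downloaded .pte file
```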
@@ -21,9 +28,7 @@ executorch .pte export of google/gemma-3n-E4B-it for on-device mobile inference
 | source model | google/gemma-3n-E4B-it |
 | text parameters | 7.40b |
 | transformer layers | 35 |
-| dtype | float16 |
 | format | executorch .pte |
-| output size | 13.1 gb |
 
 ## text-only export
 
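The model details table pins the format to executorch .pte. The diff does not include the export script itself, but as a rough orientation, the standard executorch.exir lowering flow that produces a .pte looks like the sketch below, shown on a toy module rather than the actual Gemma 3n text decoder:

```python
import torch
from executorch.exir import to_edge

# toy stand-in for the text decoder; the real export wraps the Gemma 3n
# text-only components and traces them with a fixed [1, 32] token input
class TinyDecoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(64, 64)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj(x)

example_input = (torch.randn(1, 32, 64),)
exported = torch.export.export(TinyDecoder().eval(), example_input)  # torch.export graph
edge = to_edge(exported)            # lower to the edge dialect
et_program = edge.to_executorch()   # lower to the executorch runtime format

with open("toy-decoder.pte", "wb") as f:
    f.write(et_program.buffer)      # serialized .pte file
```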
@@ -39,6 +44,13 @@ this export contains only the text decoder components extracted from the full mu
 
 use this export for text-only inference tasks. if you need multimodal capabilities use the original huggingface model
 
+## quantization
+
+- **bf16**: full bfloat16 precision weights
+- **int8**: int8 weight-only quantization via torchao - recommended for mobile deployment
+
+note: int4 quantization requires gpu for inference and is not suitable for cpu-only mobile deployment
+
 ## export configuration
 
 - fixed sequence length: 32 tokens
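The new quantization section attributes the int8 variant to torchao weight-only quantization. A minimal sketch of what that step might look like is below; the module is an illustrative stand-in (the real model's output head maps to a 262400-entry vocab), and the exact torchao entry point can vary between releases:

```python
import torch
from torchao.quantization import quantize_, int8_weight_only

# illustrative stand-in for the Gemma 3n text decoder
model = torch.nn.Sequential(
    torch.nn.Linear(256, 256),
    torch.nn.Linear(256, 256),
).to(torch.bfloat16)

# swap Linear weights to int8 in place (weight-only; activations stay bfloat16)
quantize_(model, int8_weight_only())

# the quantized module would then go through the usual torch.export -> .pte lowering
```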
@@ -51,12 +63,12 @@ use this export for text-only inference tasks. if you need multimodal capabiliti
 from executorch.runtime import Runtime
 
 runtime = Runtime.get()
-program = runtime.load_program("
+program = runtime.load_program("Gemma3n-E4B-text-only-int8.pte")
 method = program.load_method("forward")
 
 # input_ids shape: [1, 32] dtype: torch.long
 output = method.execute([input_ids])
-# output shape: [1, 32, 262400] dtype: torch.
+# output shape: [1, 32, 262400] dtype: torch.bfloat16
 ```
 
 ## required patches
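Combining the updated runtime snippet with the fixed 32-token sequence length from the export configuration, a rough end-to-end sketch follows. The tokenizer padding behaviour and the single greedy decode step are assumptions, not something the README specifies:

```python
import torch
from executorch.runtime import Runtime
from transformers import AutoTokenizer

# tokenizer comes from the original source model
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3n-E4B-it")

# the export uses a fixed sequence length of 32, so pad/truncate to exactly 32 tokens
enc = tokenizer(
    "Write a haiku about mobile inference.",
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=32,
)
input_ids = enc["input_ids"].to(torch.long)  # shape [1, 32]

runtime = Runtime.get()
program = runtime.load_program("Gemma3n-E4B-text-only-int8.pte")
method = program.load_method("forward")

# logits shape [1, 32, 262400], per the README
logits = method.execute([input_ids])[0]

# naive single-step greedy pick at the last non-padding position
last_pos = int(enc["attention_mask"][0].nonzero().max())
next_token_id = int(logits[0, last_pos].argmax())
print(tokenizer.decode([next_token_id]))
```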