multilingual upgrade upload of gemma-3n
- README.md +13 -12
- config.json +32 -47
- model-00001-of-00003.safetensors +3 -0
- model-00002-of-00003.safetensors +3 -0
- model-00003-of-00003.safetensors +3 -0
- model.safetensors.index.json +0 -0
README.md CHANGED
@@ -3,12 +3,11 @@ license: gemma
 library_name: transformers
 pipeline_tag: image-text-to-text
 extra_gated_heading: Access Gemma on Hugging Face
-extra_gated_prompt: >-
-  To access Gemma on Hugging Face, you’re required to review and agree to
-  Google’s usage license. To do this, please ensure you’re logged in to Hugging
+extra_gated_prompt: To access Gemma on Hugging Face, you’re required to review and
+  agree to Google’s usage license. To do this, please ensure you’re logged in to Hugging
   Face and click below. Requests are processed immediately.
 extra_gated_button_content: Acknowledge license
-base_model: google/gemma-3n-E4B
+base_model: google/gemma-3n-E4B-it
 tags:
 - automatic-speech-recognition
 - automatic-speech-translation
@@ -17,16 +16,18 @@ tags:
 ---
 
 > [!Note]
-> This repository corresponds to the launch version of Gemma 3n
+> This repository corresponds to the launch version of Gemma 3n E2B IT (Instruct), to be used with Hugging Face `transformers`,
 > supporting text, audio, and vision (image and video) inputs.
 >
 > Gemma 3n models have multiple architecture innovations:
-> * They are available in two sizes based on [effective parameters](https://ai.google.dev/gemma/docs/gemma-3n#parameters). While the raw parameter count of this model is
-> * They use a MatFormer architecture that allows nesting sub-models within the E4B model
+> * They are available in two sizes based on [effective parameters](https://ai.google.dev/gemma/docs/gemma-3n#parameters). While the raw parameter count of this model is 6B, the architecture design allows the model to be run with a memory footprint comparable to a traditional 2B model by offloading low-utilization matrices from the accelerator.
+> * They use a MatFormer architecture that allows nesting sub-models within the [E4B model](https://huggingface.co/google/gemma-3n-E4B-it). We provide one sub-model (this model repository), or you can access a spectrum of custom-sized models using the [Mix-and-Match method](https://goo.gle/gemma3n-matformer-lab).
 >
 > Learn more about these techniques in the [technical blog post](https://developers.googleblog.com/en/introducing-gemma-3n-developer-guide)
 > and the [Gemma documentation](https://ai.google.dev/gemma/docs/gemma-3n).
 
+
+
 # Gemma 3n model card
 
 **Model Page**: [Gemma 3n](https://ai.google.dev/gemma/docs/gemma-3n)
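The effective-versus-raw distinction in the note above can be checked directly. The sketch below is illustrative only, not part of this commit; it downloads and loads the full checkpoint referenced later in this diff, so it needs enough host memory for the raw ~6B parameters:

```python
from transformers import Gemma3nForConditionalGeneration
import torch

# Load the checkpoint touched by this diff and count its raw parameters.
model = Gemma3nForConditionalGeneration.from_pretrained(
    "google/gemma-3n-e2b-it", torch_dtype=torch.bfloat16
)
raw = sum(p.numel() for p in model.parameters())
print(f"raw parameters: {raw / 1e9:.2f}B")  # ~6B raw, per the model card note
```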
@@ -101,7 +102,7 @@ import torch
 
 pipe = pipeline(
     "image-text-to-text",
-    model="google/gemma-3n-
+    model="google/gemma-3n-e2b-it",
     device="cuda",
     torch_dtype=torch.bfloat16,
 )
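A usage sketch for the updated pipeline snippet (illustrative only: the image URL below is a placeholder, and `max_new_tokens` is an arbitrary choice):

```python
from transformers import pipeline
import torch

pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3n-e2b-it",
    device="cuda",
    torch_dtype=torch.bfloat16,
)

# Chat-style input: one user turn mixing an image and a text question.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://example.com/some-image.jpg"},  # placeholder URL
            {"type": "text", "text": "Describe this image in one sentence."},
        ],
    }
]

out = pipe(text=messages, max_new_tokens=32)
print(out[0]["generated_text"][-1]["content"])  # the generated assistant turn
```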
@@ -140,9 +141,9 @@ from PIL import Image
 import requests
 import torch
 
-model_id = "google/gemma-3n-
+model_id = "google/gemma-3n-e2b-it"
 
-model = Gemma3nForConditionalGeneration.from_pretrained(model_id,
+model = Gemma3nForConditionalGeneration.from_pretrained(model_id, device="cuda", torch_dtype=torch.bfloat16,).eval()
 
 processor = AutoProcessor.from_pretrained(model_id)
 
@@ -166,7 +167,7 @@ inputs = processor.apply_chat_template(
     tokenize=True,
     return_dict=True,
     return_tensors="pt",
-).to(model.device)
+).to(model.device, dtype=torch.bfloat16)
 
 input_len = inputs["input_ids"].shape[-1]
 
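Putting the updated `from_pretrained` and `apply_chat_template` lines together, a minimal text-only generation sketch. One assumption: `device_map="auto"` is used here for weight placement rather than the `device="cuda"` keyword shown in the hunk above, since `from_pretrained` places weights via `device_map`:

```python
from transformers import AutoProcessor, Gemma3nForConditionalGeneration
import torch

model_id = "google/gemma-3n-e2b-it"
model = Gemma3nForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
).eval()
processor = AutoProcessor.from_pretrained(model_id)

messages = [
    {"role": "user", "content": [{"type": "text", "text": "Write a haiku about the sea."}]}
]

# Mirrors the updated snippet: move inputs to the model device and cast any
# floating-point tensors to bfloat16 (integer ids are left untouched).
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)

input_len = inputs["input_ids"].shape[-1]
with torch.inference_mode():
    generation = model.generate(**inputs, max_new_tokens=64, do_sample=False)

# Decode only the newly generated tokens.
print(processor.decode(generation[0][input_len:], skip_special_tokens=True))
```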
@@ -526,4 +527,4 @@ development compared to similarly sized models.
 
 Using the benchmark evaluation metrics described in this document, these models
 have shown to provide superior performance to other, comparably-sized open model
-alternatives.
+alternatives.
config.json CHANGED
@@ -90,11 +90,6 @@
       0.0,
       0.0,
       0.0,
-      0.0,
-      0.0,
-      0.0,
-      0.0,
-      0.0,
       0.0
     ],
     "altup_active_idx": 0,
@@ -110,41 +105,36 @@
     "hidden_size_per_layer_input": 256,
     "initializer_range": 0.02,
     "intermediate_size": [
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384,
-      16384
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192,
+      8192
     ],
     "laurel_rank": 64,
     "layer_types": [
@@ -177,19 +167,14 @@
       "sliding_attention",
       "sliding_attention",
       "sliding_attention",
-      "full_attention",
-      "sliding_attention",
-      "sliding_attention",
-      "sliding_attention",
-      "sliding_attention",
       "full_attention"
     ],
     "max_position_embeddings": 32768,
     "model_type": "gemma3n_text",
     "num_attention_heads": 8,
-    "num_hidden_layers": 35,
+    "num_hidden_layers": 30,
     "num_key_value_heads": 2,
-    "num_kv_shared_layers": 15,
+    "num_kv_shared_layers": 10,
     "rms_norm_eps": 1e-06,
     "rope_local_base_freq": 10000.0,
     "rope_scaling": null,
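The new text-config values can be sanity-checked without downloading any weights; a small sketch, assuming the `google/gemma-3n-e2b-it` repository id used in the README diff above:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("google/gemma-3n-e2b-it")
text_cfg = cfg.text_config  # the "gemma3n_text" sub-config edited above

print(text_cfg.num_hidden_layers)                     # 30
print(text_cfg.num_kv_shared_layers)                  # 10
print(set(text_cfg.intermediate_size))                # {8192}
print(text_cfg.layer_types.count("full_attention"))   # full-attention layers remaining
```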
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1af26e1fd61af0dc067252c907bf52900c7cd5864893e29970e6ea87320322a6
+size 3077103824
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f53ec36e9b34a1dda547103f7371eaf4dcce40a9e85ef2a04dfa12f30e1146ff
+size 4981242176
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28e11f2029b8d13aa3fde2a948808baa01d8bee0fd184c4faedbd09065e7fd0b
+size 2820739840
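Each pointer above records a shard's SHA-256 digest and byte size, so downloaded shards can be verified locally; a sketch, assuming the shard files sit in the current directory:

```python
import hashlib
import os

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    """Stream a file through SHA-256 without loading it all into memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

path = "model-00001-of-00003.safetensors"
print(os.path.getsize(path))  # should match the pointer's size (3077103824)
print(sha256_of(path))        # should match the pointer's oid (1af26e1f...)
```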
model.safetensors.index.json CHANGED
The diff for this file is too large to render; see the raw diff.