Ashiedu committed on
Commit
5ecf91a
·
verified ·
1 Parent(s): 7af23a8

sync: push ML models as-is 2026-04-03

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50) hide show
  1. .gitattributes +351 -0
  2. .gitignore +5 -0
  3. 4bit/.gitattributes +36 -0
  4. 4bit/README.md +530 -0
  5. 4bit/chat_template.jinja +49 -0
  6. 4bit/config.json +208 -0
  7. 4bit/generation_config.json +13 -0
  8. 4bit/model-00001-of-00003.safetensors +3 -0
  9. 4bit/model-00002-of-00003.safetensors +3 -0
  10. 4bit/model-00003-of-00003.safetensors +3 -0
  11. 4bit/model.safetensors.index.json +0 -0
  12. 4bit/notebook.ipynb +0 -0
  13. 4bit/preprocessor_config.json +52 -0
  14. 4bit/processor_config.json +5 -0
  15. 4bit/special_tokens_map.json +36 -0
  16. 4bit/tokenizer.json +3 -0
  17. 4bit/tokenizer.model +3 -0
  18. 4bit/tokenizer_config.json +0 -0
  19. AGENTS.md +43 -0
  20. README.md +523 -14
  21. depthformer/depthformer_base_decoder_step_decoder_step_meta.json +66 -0
  22. depthformer/depthformer_base_encoder_encoder_meta.json +41 -0
  23. depthformer_base_decoder_step.onnx +0 -0
  24. depthformer_base_decoder_step.vmfb +0 -0
  25. depthformer_base_decoder_step_decoder_step_meta.json +66 -0
  26. depthformer_base_decoder_step_ple/ple_manifest.json +5 -0
  27. depthformer_base_encoder.onnx +0 -0
  28. depthformer_base_encoder.vmfb +0 -0
  29. depthformer_base_encoder_encoder_meta.json +41 -0
  30. depthformer_base_encoder_ple/ple_manifest.json +5 -0
  31. export_iree_metadata.json +10 -0
  32. export_log.json +52 -0
  33. export_metadata.json +229 -0
  34. export_metadata_iree.json +188 -0
  35. magenta_raw/checkpoints/llm_base_x4286_c1860k.tar +3 -0
  36. magenta_raw/checkpoints/llm_base_x4286_c1860k/checkpoint +3 -0
  37. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_mlp_layer_norm.scale.v/.zarray +1 -0
  38. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_mlp_layer_norm.scale.v/0 +0 -0
  39. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_self_attention_layer_norm.scale.v/.zarray +1 -0
  40. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_self_attention_layer_norm.scale.v/0 +0 -0
  41. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_1.pre_mlp_layer_norm.scale.v/.zarray +1 -0
  42. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_1.pre_mlp_layer_norm.scale.v/0 +0 -0
  43. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_1.pre_self_attention_layer_norm.scale.v/.zarray +1 -0
  44. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_1.pre_self_attention_layer_norm.scale.v/0 +0 -0
  45. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_mlp_layer_norm.scale.v/.zarray +1 -0
  46. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_mlp_layer_norm.scale.v/0 +0 -0
  47. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/.zarray +1 -0
  48. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/0 +0 -0
  49. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_3.pre_mlp_layer_norm.scale.v/.zarray +1 -0
  50. magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_3.pre_mlp_layer_norm.scale.v/0 +0 -0
.gitattributes CHANGED
@@ -34,3 +34,354 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ 4bit/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/checkpoint filter=lfs diff=lfs merge=lfs -text
39
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
40
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
41
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
42
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
43
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
44
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
45
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
46
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_1.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
47
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_1.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
48
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_1.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
49
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_1.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
50
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_1.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
51
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_1.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
52
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_1.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
53
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_2.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
54
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_2.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
55
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_2.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
56
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_2.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
57
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_2.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
58
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_2.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
59
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_2.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
60
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
61
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
62
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
63
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
64
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
65
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
66
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
67
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
68
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
69
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
70
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
71
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
72
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
73
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
74
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
75
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
76
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
77
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_0.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
78
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
79
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
80
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
81
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
82
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
83
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
84
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
85
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
86
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
87
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
88
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_1.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
89
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
90
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
91
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
92
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
93
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
94
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
95
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
96
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
97
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
98
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
99
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_10.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
100
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
101
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
102
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
103
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
104
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
105
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
106
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
107
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
108
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
109
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
110
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_11.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
111
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
112
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
113
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
114
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
115
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
116
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
117
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
118
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
119
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
120
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
121
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_12.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
122
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
123
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
124
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
125
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
126
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
127
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
128
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
129
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
130
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
131
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
132
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_13.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
133
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
134
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
135
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
136
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
137
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
138
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
139
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
140
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
141
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
142
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
143
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_14.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
144
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
145
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
146
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
147
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
148
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
149
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
150
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
151
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
152
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
153
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
154
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_15.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
155
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
156
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
157
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
158
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
159
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
160
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
161
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
162
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
163
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
164
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
165
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_16.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
166
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
167
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
168
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
169
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
170
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
171
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
172
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
173
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
174
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
175
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
176
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_17.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
177
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
178
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
179
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
180
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
181
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
182
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
183
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
184
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
185
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
186
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
187
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_18.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
188
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
189
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
190
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
191
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
192
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
193
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
194
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
195
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
196
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
197
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
198
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_19.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
199
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
200
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
201
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
202
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
203
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
204
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
205
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
206
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
207
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
208
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
209
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_2.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
210
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
211
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
212
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
213
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
214
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
215
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
216
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
217
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
218
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
219
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
220
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_3.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
221
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
222
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
223
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
224
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
225
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
226
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
227
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
228
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
229
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
230
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
231
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_4.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
232
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
233
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
234
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
235
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
236
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
237
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
238
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
239
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
240
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
241
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
242
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_5.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
243
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
244
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
245
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
246
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
247
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
248
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
249
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
250
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
251
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
252
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
253
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_6.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
254
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
255
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
256
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
257
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
258
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
259
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
260
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
261
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
262
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
263
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
264
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_7.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
265
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
266
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
267
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
268
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
269
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
270
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
271
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
272
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
273
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
274
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
275
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
276
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
277
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
278
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
279
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
280
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
281
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
282
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
283
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
284
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
285
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
286
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.decoder.temporal_decoder.layers_9.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
287
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.logits_dense.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
288
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.decoder.logits_dense.kernel/0.1 filter=lfs diff=lfs merge=lfs -text
289
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_0.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
290
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_0.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
291
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_0.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
292
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_0.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
293
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_0.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
294
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_0.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
295
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_0.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
296
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_1.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
297
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_1.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
298
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_1.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
299
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_1.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
300
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_1.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
301
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_1.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
302
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_1.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
303
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_10.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
304
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_10.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
305
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_10.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
306
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_10.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
307
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_10.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
308
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_10.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
309
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_10.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
310
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_11.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
311
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_11.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
312
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_11.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
313
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_11.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
314
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_11.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
315
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_11.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
316
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_11.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
317
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_2.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
318
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_2.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
319
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_2.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
320
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_2.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
321
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_2.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
322
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_2.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
323
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_2.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
324
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_3.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
325
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_3.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
326
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_3.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
327
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_3.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
328
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_3.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
329
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_3.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
330
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_3.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
331
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_4.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
332
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_4.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
333
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_4.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
334
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_4.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
335
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_4.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
336
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_4.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
337
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_4.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
338
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_5.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
339
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_5.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
340
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_5.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
341
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_5.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
342
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_5.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
343
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_5.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
344
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_5.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
345
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_6.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
346
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_6.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
347
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_6.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
348
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_6.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
349
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_6.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
350
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_6.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
351
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_6.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
352
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_7.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
353
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_7.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
354
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_7.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
355
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_7.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
356
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_7.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
357
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_7.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
358
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_7.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
359
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_8.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
360
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_8.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
361
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_8.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
362
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_8.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
363
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_8.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
364
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_8.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
365
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_8.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
366
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_9.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
367
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_9.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
368
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_9.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
369
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_9.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
370
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_9.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
371
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_9.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
372
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.encoder.layers_9.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text
373
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.token_embedder.embedding/0.0 filter=lfs diff=lfs merge=lfs -text
374
+ magenta_raw/checkpoints/llm_base_x4286_c1860k/target.token_embedder.embedding/1.0 filter=lfs diff=lfs merge=lfs -text
375
+ magenta_raw/savedmodels/musiccoca_mv212_quant/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
376
+ magenta_raw/savedmodels/musiccoca_mv212f_cpu_compat/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
377
+ magenta_raw/savedmodels/musiccoca_mv212f_cpu_novocab/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
378
+ magenta_raw/savedmodels/ssv2_48k_stereo/decoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
379
+ magenta_raw/savedmodels/ssv2_48k_stereo/encoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
380
+ magenta_raw/savedmodels/ssv2_48k_stereo/quantizer/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
381
+ magenta_rt.mlir filter=lfs diff=lfs merge=lfs -text
382
+ magenta_rt.vmfb filter=lfs diff=lfs merge=lfs -text
383
+ magenta_rt_fp16.mlir filter=lfs diff=lfs merge=lfs -text
384
+ magenta_rt_fp16_rocm.vmfb filter=lfs diff=lfs merge=lfs -text
385
+ magenta_rt_int8.mlir filter=lfs diff=lfs merge=lfs -text
386
+ magenta_rt_int8.vmfb filter=lfs diff=lfs merge=lfs -text
387
+ magenta_rt_rocm.vmfb filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Model files are large; commit them only if needed. The patterns below are commented out, so nothing is ignored until one is uncommented.
2
+ # *.onnx
3
+ # *.vmfb
4
+ # *.meta.json
5
+ # validation.json
4bit/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
4bit/README.md ADDED
@@ -0,0 +1,530 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: gemma
3
+ library_name: transformers
4
+ pipeline_tag: image-text-to-text
5
+ extra_gated_heading: Access Gemma on Hugging Face
6
+ extra_gated_prompt: To access Gemma on Hugging Face, you’re required to review and
7
+ agree to Google’s usage license. To do this, please ensure you’re logged in to Hugging
8
+ Face and click below. Requests are processed immediately.
9
+ extra_gated_button_content: Acknowledge license
10
+ base_model: google/gemma-3n-E4B-it
11
+ tags:
12
+ - automatic-speech-recognition
13
+ - automatic-speech-translation
14
+ - audio-text-to-text
15
+ - video-text-to-text
16
+ ---
17
+
18
+ > [!Note]
19
+ > This repository corresponds to the launch version of Gemma 3n E2B IT (Instruct), to be used with Hugging Face `transformers`,
20
+ > supporting text, audio, and vision (image and video) inputs.
21
+ >
22
+ > Gemma 3n models have multiple architecture innovations:
23
+ > * They are available in two sizes based on [effective parameters](https://ai.google.dev/gemma/docs/gemma-3n#parameters). While the raw parameter count of this model is 6B, the architecture design allows the model to be run with a memory footprint comparable to a traditional 2B model by offloading low-utilization matrices from the accelerator.
24
+ > * They use a MatFormer architecture that allows nesting sub-models within the [E4B model](https://huggingface.co/google/gemma-3n-E4B-it). We provide one sub-model (this model repository), or you can access a spectrum of custom-sized models using the [Mix-and-Match method](https://goo.gle/gemma3n-matformer-lab).
25
+ >
26
+ > Learn more about these techniques in the [technical blog post](https://developers.googleblog.com/en/introducing-gemma-3n-developer-guide)
27
+ > and the [Gemma documentation](https://ai.google.dev/gemma/docs/gemma-3n).
28
+
29
+
30
+
31
+ # Gemma 3n model card
32
+
33
+ **Model Page**: [Gemma 3n](https://ai.google.dev/gemma/docs/gemma-3n)
34
+
35
+ **Resources and Technical Documentation**:
36
+
37
+ - [Responsible Generative AI Toolkit](https://ai.google.dev/responsible)
38
+ - [Gemma on Kaggle](https://www.kaggle.com/models/google/gemma-3n)
39
+ - [Gemma on HuggingFace](https://huggingface.co/collections/google/gemma-3n-685065323f5984ef315c93f4)
40
+ - [Gemma on Vertex Model Garden](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemma3n)
41
+
42
+ **Terms of Use**: [Terms](https://ai.google.dev/gemma/terms)\
43
+ **Authors**: Google DeepMind
44
+
45
+ ## Model Information
46
+
47
+ Summary description and brief definition of inputs and outputs.
48
+
49
+ ### Description
50
+
51
+ Gemma is a family of lightweight, state-of-the-art open models from Google,
52
+ built from the same research and technology used to create the Gemini models.
53
+ Gemma 3n models are designed for efficient execution on low-resource devices.
54
+ They are capable of multimodal input, handling text, image, video, and audio
55
+ input, and generating text outputs, with open weights for pre-trained and
56
+ instruction-tuned variants. These models were trained with data in over 140
57
+ spoken languages.
58
+
59
+ Gemma 3n models use selective parameter activation technology to reduce resource
60
+ requirements. This technique allows the models to operate at an effective size
61
+ of 2B and 4B parameters, which is lower than the total number of parameters they
62
+ contain. For more information on Gemma 3n's efficient parameter management
63
+ technology, see the
64
+ [Gemma 3n](https://ai.google.dev/gemma/docs/gemma-3n#parameters)
65
+ page.
66
+
67
+ ### Inputs and outputs
68
+
69
+ - **Input:**
70
+ - Text string, such as a question, a prompt, or a document to be
71
+ summarized
72
+ - Images, normalized to 256x256, 512x512, or 768x768 resolution
73
+ and encoded to 256 tokens each
74
+ - Audio data encoded to 6.25 tokens per second from a single channel
75
+ - Total input context of 32K tokens
76
+ - **Output:**
77
+ - Generated text in response to the input, such as an answer to a
78
+ question, analysis of image content, or a summary of a document
79
+ - Total output length up to 32K tokens, subtracting the request
80
+ input tokens
81
+
82
+ ### Usage
83
+
84
+ Below, there are some code snippets on how to get quickly started with running
85
+ the model. First, install the Transformers library. Gemma 3n is supported
86
+ starting from transformers 4.53.0.
87
+
88
+ ```sh
89
+ $ pip install -U transformers
90
+ ```
91
+
92
+ Then, copy the snippet from the section that is relevant for your use case.
93
+
94
+ #### Running with the `pipeline` API
95
+
96
+ You can initialize the model and processor for inference with `pipeline` as
97
+ follows.
98
+
99
+ ```python
100
+ from transformers import pipeline
101
+ import torch
102
+
103
+ pipe = pipeline(
104
+ "image-text-to-text",
105
+ model="google/gemma-3n-e2b-it",
106
+ device="cuda",
107
+ torch_dtype=torch.bfloat16,
108
+ )
109
+ ```
110
+
111
+ With instruction-tuned models, you need to use chat templates to process your
112
+ inputs first. Then, you can pass them to the pipeline.
113
+
114
+ ```python
115
+ messages = [
116
+ {
117
+ "role": "system",
118
+ "content": [{"type": "text", "text": "You are a helpful assistant."}]
119
+ },
120
+ {
121
+ "role": "user",
122
+ "content": [
123
+ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
124
+ {"type": "text", "text": "What animal is on the candy?"}
125
+ ]
126
+ }
127
+ ]
128
+
129
+ output = pipe(text=messages, max_new_tokens=200)
130
+ print(output[0]["generated_text"][-1]["content"])
131
+ # Okay, let's take a look!
132
+ # Based on the image, the animal on the candy is a **turtle**.
133
+ # You can see the shell shape and the head and legs.
134
+ ```
135
+
136
+ #### Running the model on a single GPU
137
+
138
+ ```python
139
+ from transformers import AutoProcessor, Gemma3nForConditionalGeneration
140
+ from PIL import Image
141
+ import requests
142
+ import torch
143
+
144
+ model_id = "google/gemma-3n-e2b-it"
145
+
146
+ model = Gemma3nForConditionalGeneration.from_pretrained(model_id, device_map="cuda", torch_dtype=torch.bfloat16,).eval()
147
+
148
+ processor = AutoProcessor.from_pretrained(model_id)
149
+
150
+ messages = [
151
+ {
152
+ "role": "system",
153
+ "content": [{"type": "text", "text": "You are a helpful assistant."}]
154
+ },
155
+ {
156
+ "role": "user",
157
+ "content": [
158
+ {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
159
+ {"type": "text", "text": "Describe this image in detail."}
160
+ ]
161
+ }
162
+ ]
163
+
164
+ inputs = processor.apply_chat_template(
165
+ messages,
166
+ add_generation_prompt=True,
167
+ tokenize=True,
168
+ return_dict=True,
169
+ return_tensors="pt",
170
+ ).to(model.device, dtype=torch.bfloat16)
171
+
172
+ input_len = inputs["input_ids"].shape[-1]
173
+
174
+ with torch.inference_mode():
175
+ generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
176
+ generation = generation[0][input_len:]
177
+
178
+ decoded = processor.decode(generation, skip_special_tokens=True)
179
+ print(decoded)
180
+
181
+ # **Overall Impression:** The image is a close-up shot of a vibrant garden scene,
182
+ # focusing on a cluster of pink cosmos flowers and a busy bumblebee.
183
+ # It has a slightly soft, natural feel, likely captured in daylight.
184
+ ```
185
+
186
+ ### Citation
187
+
188
+ ```
189
+ @article{gemma_3n_2025,
190
+ title={Gemma 3n},
191
+ url={https://ai.google.dev/gemma/docs/gemma-3n},
192
+ publisher={Google DeepMind},
193
+ author={Gemma Team},
194
+ year={2025}
195
+ }
196
+ ```
197
+
198
+ ## Model Data
199
+
200
+ Data used for model training and how the data was processed.
201
+
202
+ ### Training Dataset
203
+
204
+ These models were trained on a dataset that includes a wide variety of sources
205
+ totalling approximately 11 trillion tokens. The knowledge cutoff date for the
206
+ training data was June 2024. Here are the key components:
207
+
208
+ - **Web Documents**: A diverse collection of web text ensures the model
209
+ is exposed to a broad range of linguistic styles, topics, and vocabulary.
210
+ The training dataset includes content in over 140 languages.
211
+ - **Code**: Exposing the model to code helps it to learn the syntax and
212
+ patterns of programming languages, which improves its ability to generate
213
+ code and understand code-related questions.
214
+ - **Mathematics**: Training on mathematical text helps the model learn
215
+ logical reasoning, symbolic representation, and to address mathematical queries.
216
+ - **Images**: A wide range of images enables the model to perform image
217
+ analysis and visual data extraction tasks.
218
+ - **Audio**: A diverse set of sound samples enables the model to recognize
219
+ speech, transcribe text from recordings, and identify information in audio data.
220
+
221
+ The combination of these diverse data sources is crucial for training a
222
+ powerful multimodal model that can handle a wide variety of different tasks and
223
+ data formats.
224
+
225
+ ### Data Preprocessing
226
+
227
+ Here are the key data cleaning and filtering methods applied to the training
228
+ data:
229
+
230
+ - **CSAM Filtering**: Rigorous CSAM (Child Sexual Abuse Material)
231
+ filtering was applied at multiple stages in the data preparation process to
232
+ ensure the exclusion of harmful and illegal content.
233
+ - **Sensitive Data Filtering**: As part of making Gemma pre-trained models
234
+ safe and reliable, automated techniques were used to filter out certain
235
+ personal information and other sensitive data from training sets.
236
+ - **Additional methods**: Filtering based on content quality and safety in
237
+ line with
238
+ [our policies](https://ai.google/static/documents/ai-responsibility-update-published-february-2025.pdf).
239
+
240
+ ## Implementation Information
241
+
242
+ Details about the model internals.
243
+
244
+ ### Hardware
245
+
246
+ Gemma was trained using [Tensor Processing Unit
247
+ (TPU)](https://cloud.google.com/tpu/docs/intro-to-tpu) hardware (TPUv4p, TPUv5p
248
+ and TPUv5e). Training generative models requires significant computational
249
+ power. TPUs, designed specifically for matrix operations common in machine
250
+ learning, offer several advantages in this domain:
251
+
252
+ - **Performance**: TPUs are specifically designed to handle the massive
253
+ computations involved in training generative models. They can speed up
254
+ training considerably compared to CPUs.
255
+ - **Memory**: TPUs often come with large amounts of high-bandwidth memory,
256
+ allowing for the handling of large models and batch sizes during training.
257
+ This can lead to better model quality.
258
+ - **Scalability**: TPU Pods (large clusters of TPUs) provide a scalable
259
+ solution for handling the growing complexity of large foundation models.
260
+ You can distribute training across multiple TPU devices for faster and more
261
+ efficient processing.
262
+ - **Cost-effectiveness**: In many scenarios, TPUs can provide a more
263
+ cost-effective solution for training large models compared to CPU-based
264
+ infrastructure, especially when considering the time and resources saved
265
+ due to faster training.
266
+
267
+ These advantages are aligned with
268
+ [Google's commitments to operate sustainably](https://sustainability.google/operating-sustainably/).
269
+
270
+ ### Software
271
+
272
+ Training was done using [JAX](https://github.com/jax-ml/jax) and
273
+ [ML Pathways](https://blog.google/technology/ai/introducing-pathways-next-generation-ai-architecture/).
274
+ JAX allows researchers to take advantage of the latest generation of hardware,
275
+ including TPUs, for faster and more efficient training of large models. ML
276
+ Pathways is Google's latest effort to build artificially intelligent systems
277
+ capable of generalizing across multiple tasks. This is especially suitable for
278
+ foundation models, including large language models like these ones.
279
+
280
+ Together, JAX and ML Pathways are used as described in the
281
+ [paper about the Gemini family of models](https://goo.gle/gemma2report):
282
+ *"the 'single controller' programming model of Jax and Pathways allows a single
283
+ Python process to orchestrate the entire training run, dramatically simplifying
284
+ the development workflow."*
285
+
286
+ ## Evaluation
287
+
288
+ Model evaluation metrics and results.
289
+
290
+ ### Benchmark Results
291
+
292
+ These models were evaluated at full precision (float32) against a large
293
+ collection of different datasets and metrics to cover different aspects of
294
+ content generation. Evaluation results marked with **IT** are for
295
+ instruction-tuned models. Evaluation results marked with **PT** are for
296
+ pre-trained models.
297
+
298
+ #### Reasoning and factuality
299
+
300
+ | Benchmark | Metric | n-shot | E2B PT | E4B PT |
301
+ | ------------------------------ |----------------|----------|:--------:|:--------:|
302
+ | [HellaSwag][hellaswag] | Accuracy | 10-shot | 72.2 | 78.6 |
303
+ | [BoolQ][boolq] | Accuracy | 0-shot | 76.4 | 81.6 |
304
+ | [PIQA][piqa] | Accuracy | 0-shot | 78.9 | 81.0 |
305
+ | [SocialIQA][socialiqa] | Accuracy | 0-shot | 48.8 | 50.0 |
306
+ | [TriviaQA][triviaqa] | Accuracy | 5-shot | 60.8 | 70.2 |
307
+ | [Natural Questions][naturalq] | Accuracy | 5-shot | 15.5 | 20.9 |
308
+ | [ARC-c][arc] | Accuracy | 25-shot | 51.7 | 61.6 |
309
+ | [ARC-e][arc] | Accuracy | 0-shot | 75.8 | 81.6 |
310
+ | [WinoGrande][winogrande] | Accuracy | 5-shot | 66.8 | 71.7 |
311
+ | [BIG-Bench Hard][bbh] | Accuracy | few-shot | 44.3 | 52.9 |
312
+ | [DROP][drop] | Token F1 score | 1-shot | 53.9 | 60.8 |
313
+
314
+ [hellaswag]: https://arxiv.org/abs/1905.07830
315
+ [boolq]: https://arxiv.org/abs/1905.10044
316
+ [piqa]: https://arxiv.org/abs/1911.11641
317
+ [socialiqa]: https://arxiv.org/abs/1904.09728
318
+ [triviaqa]: https://arxiv.org/abs/1705.03551
319
+ [naturalq]: https://github.com/google-research-datasets/natural-questions
320
+ [arc]: https://arxiv.org/abs/1911.01547
321
+ [winogrande]: https://arxiv.org/abs/1907.10641
322
+ [bbh]: https://paperswithcode.com/dataset/bbh
323
+ [drop]: https://arxiv.org/abs/1903.00161
324
+
325
+ #### Multilingual
326
+
327
+ | Benchmark | Metric | n-shot | E2B IT | E4B IT |
328
+ | ------------------------------------|-------------------------|----------|:--------:|:--------:|
329
+ | [MGSM][mgsm] | Accuracy | 0-shot | 53.1 | 60.7 |
330
+ | [WMT24++][wmt24pp] (ChrF) | Character-level F-score | 0-shot | 42.7 | 50.1 |
331
+ | [Include][include] | Accuracy | 0-shot | 38.6 | 57.2 |
332
+ | [MMLU][mmlu] (ProX) | Accuracy | 0-shot | 8.1 | 19.9 |
333
+ | [OpenAI MMLU][openai-mmlu] | Accuracy | 0-shot | 22.3 | 35.6 |
334
+ | [Global-MMLU][global-mmlu] | Accuracy | 0-shot | 55.1 | 60.3 |
335
+ | [ECLeKTic][eclektic] | ECLeKTic score | 0-shot | 2.5 | 1.9 |
336
+
337
+ [mgsm]: https://arxiv.org/abs/2210.03057
338
+ [wmt24pp]: https://arxiv.org/abs/2502.12404v1
339
+ [include]: https://arxiv.org/abs/2411.19799
340
+ [mmlu]: https://arxiv.org/abs/2009.03300
341
+ [openai-mmlu]: https://huggingface.co/datasets/openai/MMMLU
342
+ [global-mmlu]: https://huggingface.co/datasets/CohereLabs/Global-MMLU
343
+ [eclektic]: https://arxiv.org/abs/2502.21228
344
+
345
+ #### STEM and code
346
+
347
+ | Benchmark | Metric | n-shot | E2B IT | E4B IT |
348
+ | ------------------------------------|--------------------------|----------|:--------:|:--------:|
349
+ | [GPQA][gpqa] Diamond | RelaxedAccuracy/accuracy | 0-shot | 24.8 | 23.7 |
350
+ | [LiveCodeBench][lcb] v5 | pass@1 | 0-shot | 18.6 | 25.7 |
351
+ | Codegolf v2.2 | pass@1 | 0-shot | 11.0 | 16.8 |
352
+ | [AIME 2025][aime-2025] | Accuracy | 0-shot | 6.7 | 11.6 |
353
+
354
+ [gpqa]: https://arxiv.org/abs/2311.12022
355
+ [lcb]: https://arxiv.org/abs/2403.07974
356
+ [aime-2025]: https://www.vals.ai/benchmarks/aime-2025-05-09
357
+
358
+ #### Additional benchmarks
359
+
360
+ | Benchmark | Metric | n-shot | E2B IT | E4B IT |
361
+ | ------------------------------------ |------------|----------|:--------:|:--------:|
362
+ | [MMLU][mmlu] | Accuracy | 0-shot | 60.1 | 64.9 |
363
+ | [MBPP][mbpp] | pass@1 | 3-shot | 56.6 | 63.6 |
364
+ | [HumanEval][humaneval] | pass@1 | 0-shot | 66.5 | 75.0 |
365
+ | [LiveCodeBench][lcb] | pass@1 | 0-shot | 13.2 | 13.2 |
366
+ | HiddenMath | Accuracy | 0-shot | 27.7 | 37.7 |
367
+ | [Global-MMLU-Lite][global-mmlu-lite] | Accuracy | 0-shot | 59.0 | 64.5 |
368
+ | [MMLU][mmlu] (Pro) | Accuracy | 0-shot | 40.5 | 50.6 |
369
+
370
+ [gpqa]: https://arxiv.org/abs/2311.12022
371
+ [mbpp]: https://arxiv.org/abs/2108.07732
372
+ [humaneval]: https://arxiv.org/abs/2107.03374
373
+ [lcb]: https://arxiv.org/abs/2403.07974
374
+ [global-mmlu-lite]: https://huggingface.co/datasets/CohereForAI/Global-MMLU-Lite
375
+
376
+ ## Ethics and Safety
377
+
378
+ Ethics and safety evaluation approach and results.
379
+
380
+ ### Evaluation Approach
381
+
382
+ Our evaluation methods include structured evaluations and internal red-teaming
383
+ testing of relevant content policies. Red-teaming was conducted by a number of
384
+ different teams, each with different goals and human evaluation metrics. These
385
+ models were evaluated against a number of different categories relevant to
386
+ ethics and safety, including:
387
+
388
+ - **Child Safety**: Evaluation of text-to-text and image-to-text prompts
389
+ covering child safety policies, including child sexual abuse and
390
+ exploitation.
391
+ - **Content Safety**: Evaluation of text-to-text and image-to-text prompts
392
+ covering safety policies, including harassment, violence and gore, and hate
393
+ speech.
394
+ - **Representational Harms**: Evaluation of text-to-text and image-to-text
395
+ prompts covering safety policies including bias, stereotyping, and harmful
396
+ associations or inaccuracies.
397
+
398
+ In addition to development level evaluations, we conduct "assurance
399
+ evaluations" which are our 'arms-length' internal evaluations for responsibility
400
+ governance decision making. They are conducted separately from the model
401
+ development team, to inform decision making about release. High level findings
402
+ are fed back to the model team, but prompt sets are held-out to prevent
403
+ overfitting and preserve the results' ability to inform decision making. Notable
404
+ assurance evaluation results are reported to our Responsibility & Safety Council
405
+ as part of release review.
406
+
407
+ ### Evaluation Results
408
+
409
+ For all areas of safety testing, we saw safe levels of performance across the
410
+ categories of child safety, content safety, and representational harms relative
411
+ to previous Gemma models. All testing was conducted without safety filters to
412
+ evaluate the model capabilities and behaviors. For text-to-text, image-to-text,
413
+ and audio-to-text, and across all model sizes, the model produced minimal policy
414
+ violations, and showed significant improvements over previous Gemma models'
415
+ performance with respect to high severity violations. A limitation of our
416
+ evaluations was that they included primarily English-language prompts.
417
+
418
+ ## Usage and Limitations
419
+
420
+ These models have certain limitations that users should be aware of.
421
+
422
+ ### Intended Usage
423
+
424
+ Open generative models have a wide range of applications across various
425
+ industries and domains. The following list of potential uses is not
426
+ comprehensive. The purpose of this list is to provide contextual information
427
+ about the possible use-cases that the model creators considered as part of model
428
+ training and development.
429
+
430
+ - Content Creation and Communication
431
+ - **Text Generation**: Generate creative text formats such as
432
+ poems, scripts, code, marketing copy, and email drafts.
433
+ - **Chatbots and Conversational AI**: Power conversational
434
+ interfaces for customer service, virtual assistants, or interactive
435
+ applications.
436
+ - **Text Summarization**: Generate concise summaries of a text
437
+ corpus, research papers, or reports.
438
+ - **Image Data Extraction**: Extract, interpret, and summarize
439
+ visual data for text communications.
440
+ - **Audio Data Extraction**: Transcribe spoken language, translate speech
441
+ to text in other languages, and analyze sound-based data.
442
+ - Research and Education
443
+ - **Natural Language Processing (NLP) and generative model
444
+ Research**: These models can serve as a foundation for researchers to
445
+ experiment with generative models and NLP techniques, develop
446
+ algorithms, and contribute to the advancement of the field.
447
+ - **Language Learning Tools**: Support interactive language
448
+ learning experiences, aiding in grammar correction or providing writing
449
+ practice.
450
+ - **Knowledge Exploration**: Assist researchers in exploring large
451
+ bodies of data by generating summaries or answering questions about
452
+ specific topics.
453
+
454
+ ### Limitations
455
+
456
+ - Training Data
457
+ - The quality and diversity of the training data significantly
458
+ influence the model's capabilities. Biases or gaps in the training data
459
+ can lead to limitations in the model's responses.
460
+ - The scope of the training dataset determines the subject areas
461
+ the model can handle effectively.
462
+ - Context and Task Complexity
463
+ - Models are better at tasks that can be framed with clear
464
+ prompts and instructions. Open-ended or highly complex tasks might be
465
+ challenging.
466
+ - A model's performance can be influenced by the amount of context
467
+ provided (longer context generally leads to better outputs, up to a
468
+ certain point).
469
+ - Language Ambiguity and Nuance
470
+ - Natural language is inherently complex. Models might struggle
471
+ to grasp subtle nuances, sarcasm, or figurative language.
472
+ - Factual Accuracy
473
+ - Models generate responses based on information they learned
474
+ from their training datasets, but they are not knowledge bases. They
475
+ may generate incorrect or outdated factual statements.
476
+ - Common Sense
477
+ - Models rely on statistical patterns in language. They might
478
+ lack the ability to apply common sense reasoning in certain situations.
479
+
480
+ ### Ethical Considerations and Risks
481
+
482
+ The development of generative models raises several ethical concerns. In
483
+ creating an open model, we have carefully considered the following:
484
+
485
+ - Bias and Fairness
486
+ - Generative models trained on large-scale, real-world text and image data
487
+ can reflect socio-cultural biases embedded in the training material.
488
+ These models underwent careful scrutiny, with input data pre-processing
489
+ described and posterior evaluations reported in this card.
490
+ - Misinformation and Misuse
491
+ - Generative models can be misused to generate text that is
492
+ false, misleading, or harmful.
493
+ - Guidelines are provided for responsible use with the model, see the
494
+ [Responsible Generative AI Toolkit](https://ai.google.dev/responsible).
495
+ - Transparency and Accountability
496
+ - This model card summarizes details on the models' architecture,
497
+ capabilities, limitations, and evaluation processes.
498
+ - A responsibly developed open model offers the opportunity to
499
+ share innovation by making generative model technology accessible to
500
+ developers and researchers across the AI ecosystem.
501
+
502
+ Risks identified and mitigations:
503
+
504
+ - **Perpetuation of biases**: It's encouraged to perform continuous monitoring
505
+ (using evaluation metrics, human review) and the exploration of de-biasing
506
+ techniques during model training, fine-tuning, and other use cases.
507
+ - **Generation of harmful content**: Mechanisms and guidelines for content
508
+ safety are essential. Developers are encouraged to exercise caution and
509
+ implement appropriate content safety safeguards based on their specific
510
+ product policies and application use cases.
511
+ - **Misuse for malicious purposes**: Technical limitations and developer
512
+ and end-user education can help mitigate against malicious applications of
513
+ generative models. Educational resources and reporting mechanisms for users
514
+ to flag misuse are provided. Prohibited uses of Gemma models are outlined
515
+ in the
516
+ [Gemma Prohibited Use Policy](https://ai.google.dev/gemma/prohibited_use_policy).
517
+ - **Privacy violations**: Models were trained on data filtered for removal of
518
+ certain personal information and other sensitive data. Developers are
519
+ encouraged to adhere to privacy regulations with privacy-preserving
520
+ techniques.
521
+
522
+ ### Benefits
523
+
524
+ At the time of release, this family of models provides high-performance open
525
+ generative model implementations designed from the ground up for responsible AI
526
+ development compared to similarly sized models.
527
+
528
+ Using the benchmark evaluation metrics described in this document, these models
529
+ have been shown to provide superior performance to other, comparably-sized open model
530
+ alternatives.
4bit/chat_template.jinja ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'audio' -%}
33
+ {{ '<audio_soft_token>' }}
34
+ {%- elif item['type'] == 'image' -%}
35
+ {{ '<image_soft_token>' }}
36
+ {%- elif item['type'] == 'text' -%}
37
+ {{ item['text'] | trim }}
38
+ {%- endif -%}
39
+ {%- endfor -%}
40
+ {%- else -%}
41
+ {{ raise_exception("Invalid content type") }}
42
+ {%- endif -%}
43
+ {{ '<end_of_turn>
44
+ ' }}
45
+ {%- endfor -%}
46
+ {%- if add_generation_prompt -%}
47
+ {{'<start_of_turn>model
48
+ '}}
49
+ {%- endif -%}
4bit/config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma3nForConditionalGeneration"
4
+ ],
5
+ "audio_config": {
6
+ "conf_attention_chunk_size": 12,
7
+ "conf_attention_context_left": 13,
8
+ "conf_attention_context_right": 0,
9
+ "conf_attention_logit_cap": 50.0,
10
+ "conf_conv_kernel_size": 5,
11
+ "conf_num_attention_heads": 8,
12
+ "conf_num_hidden_layers": 12,
13
+ "conf_reduction_factor": 4,
14
+ "conf_residual_weight": 0.5,
15
+ "gradient_clipping": 10000000000.0,
16
+ "hidden_size": 1536,
17
+ "input_feat_size": 128,
18
+ "model_type": "gemma3n_audio",
19
+ "rms_norm_eps": 1e-06,
20
+ "sscp_conv_channel_size": [
21
+ 128,
22
+ 32
23
+ ],
24
+ "sscp_conv_group_norm_eps": 0.001,
25
+ "sscp_conv_kernel_size": [
26
+ [
27
+ 3,
28
+ 3
29
+ ],
30
+ [
31
+ 3,
32
+ 3
33
+ ]
34
+ ],
35
+ "sscp_conv_stride_size": [
36
+ [
37
+ 2,
38
+ 2
39
+ ],
40
+ [
41
+ 2,
42
+ 2
43
+ ]
44
+ ],
45
+ "torch_dtype": "bfloat16",
46
+ "vocab_offset": 262272,
47
+ "vocab_size": 128
48
+ },
49
+ "audio_soft_tokens_per_image": 188,
50
+ "audio_token_id": 262273,
51
+ "boa_token_id": 256000,
52
+ "boi_token_id": 255999,
53
+ "eoa_token_id": 262272,
54
+ "eoi_token_id": 262144,
55
+ "eos_token_id": [
56
+ 1,
57
+ 106
58
+ ],
59
+ "image_token_id": 262145,
60
+ "initializer_range": 0.02,
61
+ "model_type": "gemma3n",
62
+ "text_config": {
63
+ "activation_sparsity_pattern": [
64
+ 0.95,
65
+ 0.95,
66
+ 0.95,
67
+ 0.95,
68
+ 0.95,
69
+ 0.95,
70
+ 0.95,
71
+ 0.95,
72
+ 0.95,
73
+ 0.95,
74
+ 0.0,
75
+ 0.0,
76
+ 0.0,
77
+ 0.0,
78
+ 0.0,
79
+ 0.0,
80
+ 0.0,
81
+ 0.0,
82
+ 0.0,
83
+ 0.0,
84
+ 0.0,
85
+ 0.0,
86
+ 0.0,
87
+ 0.0,
88
+ 0.0,
89
+ 0.0,
90
+ 0.0,
91
+ 0.0,
92
+ 0.0,
93
+ 0.0
94
+ ],
95
+ "altup_active_idx": 0,
96
+ "altup_coef_clip": 120.0,
97
+ "altup_correct_scale": true,
98
+ "altup_num_inputs": 4,
99
+ "attention_bias": false,
100
+ "attention_dropout": 0.0,
101
+ "final_logit_softcapping": 30.0,
102
+ "head_dim": 256,
103
+ "hidden_activation": "gelu_pytorch_tanh",
104
+ "hidden_size": 2048,
105
+ "hidden_size_per_layer_input": 256,
106
+ "initializer_range": 0.02,
107
+ "intermediate_size": [
108
+ 8192,
109
+ 8192,
110
+ 8192,
111
+ 8192,
112
+ 8192,
113
+ 8192,
114
+ 8192,
115
+ 8192,
116
+ 8192,
117
+ 8192,
118
+ 8192,
119
+ 8192,
120
+ 8192,
121
+ 8192,
122
+ 8192,
123
+ 8192,
124
+ 8192,
125
+ 8192,
126
+ 8192,
127
+ 8192,
128
+ 8192,
129
+ 8192,
130
+ 8192,
131
+ 8192,
132
+ 8192,
133
+ 8192,
134
+ 8192,
135
+ 8192,
136
+ 8192,
137
+ 8192
138
+ ],
139
+ "laurel_rank": 64,
140
+ "layer_types": [
141
+ "sliding_attention",
142
+ "sliding_attention",
143
+ "sliding_attention",
144
+ "sliding_attention",
145
+ "full_attention",
146
+ "sliding_attention",
147
+ "sliding_attention",
148
+ "sliding_attention",
149
+ "sliding_attention",
150
+ "full_attention",
151
+ "sliding_attention",
152
+ "sliding_attention",
153
+ "sliding_attention",
154
+ "sliding_attention",
155
+ "full_attention",
156
+ "sliding_attention",
157
+ "sliding_attention",
158
+ "sliding_attention",
159
+ "sliding_attention",
160
+ "full_attention",
161
+ "sliding_attention",
162
+ "sliding_attention",
163
+ "sliding_attention",
164
+ "sliding_attention",
165
+ "full_attention",
166
+ "sliding_attention",
167
+ "sliding_attention",
168
+ "sliding_attention",
169
+ "sliding_attention",
170
+ "full_attention"
171
+ ],
172
+ "max_position_embeddings": 32768,
173
+ "model_type": "gemma3n_text",
174
+ "num_attention_heads": 8,
175
+ "num_hidden_layers": 30,
176
+ "num_key_value_heads": 2,
177
+ "num_kv_shared_layers": 10,
178
+ "rms_norm_eps": 1e-06,
179
+ "rope_local_base_freq": 10000.0,
180
+ "rope_scaling": null,
181
+ "rope_theta": 1000000.0,
182
+ "sliding_window": 512,
183
+ "torch_dtype": "bfloat16",
184
+ "use_cache": true,
185
+ "vocab_size": 262400,
186
+ "vocab_size_per_layer_input": 262144
187
+ },
188
+ "torch_dtype": "bfloat16",
189
+ "transformers_version": "4.53.0.dev0",
190
+ "vision_config": {
191
+ "architecture": "mobilenetv5_300m_enc",
192
+ "do_pooling": false,
193
+ "hidden_size": 2048,
194
+ "initializer_range": 0.02,
195
+ "label_names": [
196
+ "LABEL_0",
197
+ "LABEL_1"
198
+ ],
199
+ "model_args": null,
200
+ "model_type": "gemma3n_vision",
201
+ "num_classes": 2,
202
+ "rms_norm_eps": 1e-06,
203
+ "torch_dtype": "bfloat16",
204
+ "vocab_offset": 262144,
205
+ "vocab_size": 128
206
+ },
207
+ "vision_soft_tokens_per_image": 256
208
+ }
4bit/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "cache_implementation": "hybrid",
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 1,
7
+ 106
8
+ ],
9
+ "pad_token_id": 0,
10
+ "top_k": 64,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.54.0.dev0"
13
+ }
4bit/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1af26e1fd61af0dc067252c907bf52900c7cd5864893e29970e6ea87320322a6
3
+ size 3077103824
4bit/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f53ec36e9b34a1dda547103f7371eaf4dcce40a9e85ef2a04dfa12f30e1146ff
3
+ size 4981242176
4bit/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28e11f2029b8d13aa3fde2a948808baa01d8bee0fd184c4faedbd09065e7fd0b
3
+ size 2820739840
4bit/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
4bit/notebook.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
4bit/preprocessor_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": false,
5
+ "device": null,
6
+ "disable_grouping": null,
7
+ "dither": 0.0,
8
+ "do_center_crop": null,
9
+ "do_convert_rgb": null,
10
+ "do_normalize": false,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "feature_extractor_type": "Gemma3nAudioFeatureExtractor",
14
+ "feature_size": 128,
15
+ "fft_length": 1024,
16
+ "fft_overdrive": true,
17
+ "frame_length": 512,
18
+ "hop_length": 160,
19
+ "image_mean": [
20
+ 0.5,
21
+ 0.5,
22
+ 0.5
23
+ ],
24
+ "image_processor_type": "SiglipImageProcessorFast",
25
+ "image_seq_length": 256,
26
+ "image_std": [
27
+ 0.5,
28
+ 0.5,
29
+ 0.5
30
+ ],
31
+ "input_data_format": null,
32
+ "input_scale_factor": 1.0,
33
+ "max_frequency": 7600.0,
34
+ "mel_floor": 1e-05,
35
+ "min_frequency": 125.0,
36
+ "padding_side": "right",
37
+ "padding_value": 0.0,
38
+ "per_bin_mean": null,
39
+ "per_bin_stddev": null,
40
+ "preemphasis": 0.97,
41
+ "preemphasis_htk_flavor": true,
42
+ "processor_class": "Gemma3nProcessor",
43
+ "resample": 2,
44
+ "rescale_factor": 0.00392156862745098,
45
+ "return_attention_mask": true,
46
+ "return_tensors": null,
47
+ "sampling_rate": 16000,
48
+ "size": {
49
+ "height": 768,
50
+ "width": 768
51
+ }
52
+ }
4bit/processor_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "audio_seq_length": 188,
3
+ "image_seq_length": 256,
4
+ "processor_class": "Gemma3nProcessor"
5
+ }
4bit/special_tokens_map.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<audio_soft_token>",
3
+ "boa_token": "<start_of_audio>",
4
+ "boi_token": "<start_of_image>",
5
+ "bos_token": {
6
+ "content": "<bos>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "eoa_token": "<end_of_audio>",
13
+ "eoi_token": "<end_of_image>",
14
+ "eos_token": {
15
+ "content": "<eos>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "image_token": "<image_soft_token>",
22
+ "pad_token": {
23
+ "content": "<pad>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false
28
+ },
29
+ "unk_token": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false
35
+ }
36
+ }
4bit/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4c19736bf24d1c6805cf49340e31bd02c70fb7857a2cb31065c90c2b5719c4e
3
+ size 33442559
4bit/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea5f0cc48abfbfc04d14562270a32e02149a3e7035f368cc5a462786f4a59961
3
+ size 4696020
4bit/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
AGENTS.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🤖 Agents in `Content/MLModels/`
2
+
3
+ This directory handles model binaries, model cards, quantization logs, and dataset provenance.
4
+
5
+ ## 🏛️ **Relevant Agents**
6
+
7
+ ### ✍️ **Scribe**
8
+
9
+ - **Domain**: Documentation & Knowledge Integrity
10
+ - **Responsibilities**:
11
+ - Maintaining `MODEL_CARD.md` files for all models.
12
+ - Tracking quantization details and dataset provenance.
13
+ - Ensuring model versions and benchmarks are documented.
14
+
15
+ ### 🎷 **Jazz**
16
+
17
+ - **Domain**: Music Intelligence
18
+ - **Responsibilities**:
19
+ - Music model selection and training evaluation (Magenta RT, Performance RNN, CLAP).
20
+ - Fine-tuning and quantization targets.
21
+
22
+ ### 🔊 **Resonance**
23
+
24
+ - **Domain**: Audio Engine
25
+ - **Responsibilities**:
26
+ - ONNX Runtime and WinML/DirectML execution targets.
27
+ - VRAM envelope and inference performance benchmarks.
28
+
29
+ ### ⚡ **Bolt**
30
+
31
+ - **Domain**: Performance & Optimization
32
+ - **Responsibilities**:
33
+ - Quantization and pruning for hot-path performance.
34
+
35
+ ---
36
+
37
+ ## 🧭 **Boundaries**
38
+
39
+ - No code logic belongs here. Only model artifacts, datasets, and their metadata.
40
+
41
+ ---
42
+
43
+ [Global AGENTS.md](../AGENTS.md)
README.md CHANGED
@@ -1,21 +1,530 @@
1
  ---
2
- base_model: unsloth/gemma-3n-e4b-unsloth-bnb-4bit
 
 
 
 
 
 
 
 
3
  tags:
4
- - text-generation-inference
5
- - transformers
6
- - unsloth
7
- - gemma3n
8
- license: apache-2.0
9
- language:
10
- - en
11
  ---
12
 
13
- # Uploaded finetuned model
 
 
 
 
 
 
 
 
 
14
 
15
- - **Developed by:** Ashiedu
16
- - **License:** apache-2.0
17
- - **Finetuned from model :** unsloth/gemma-3n-e4b-unsloth-bnb-4bit
18
 
19
- This gemma3n model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
20
 
21
- [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ license: gemma
3
+ library_name: transformers
4
+ pipeline_tag: image-text-to-text
5
+ extra_gated_heading: Access Gemma on Hugging Face
6
+ extra_gated_prompt: To access Gemma on Hugging Face, you’re required to review and
7
+ agree to Google’s usage license. To do this, please ensure you’re logged in to Hugging
8
+ Face and click below. Requests are processed immediately.
9
+ extra_gated_button_content: Acknowledge license
10
+ base_model: google/gemma-3n-E4B-it
11
  tags:
12
+ - automatic-speech-recognition
13
+ - automatic-speech-translation
14
+ - audio-text-to-text
15
+ - video-text-to-text
 
 
 
16
  ---
17
 
18
+ > [!Note]
19
+ > This repository corresponds to the launch version of Gemma 3n E2B IT (Instruct), to be used with Hugging Face `transformers`,
20
+ > supporting text, audio, and vision (image and video) inputs.
21
+ >
22
+ > Gemma 3n models have multiple architecture innovations:
23
+ > * They are available in two sizes based on [effective parameters](https://ai.google.dev/gemma/docs/gemma-3n#parameters). While the raw parameter count of this model is 6B, the architecture design allows the model to be run with a memory footprint comparable to a traditional 2B model by offloading low-utilization matrices from the accelerator.
24
+ > * They use a MatFormer architecture that allows nesting sub-models within the [E4B model](https://huggingface.co/google/gemma-3n-E4B-it). We provide one sub-model (this model repository), or you can access a spectrum of custom-sized models using the [Mix-and-Match method](https://goo.gle/gemma3n-matformer-lab).
25
+ >
26
+ > Learn more about these techniques in the [technical blog post](https://developers.googleblog.com/en/introducing-gemma-3n-developer-guide)
27
+ > and the [Gemma documentation](https://ai.google.dev/gemma/docs/gemma-3n).
28
 
 
 
 
29
 
 
30
 
31
+ # Gemma 3n model card
32
+
33
+ **Model Page**: [Gemma 3n](https://ai.google.dev/gemma/docs/gemma-3n)
34
+
35
+ **Resources and Technical Documentation**:
36
+
37
+ - [Responsible Generative AI Toolkit](https://ai.google.dev/responsible)
38
+ - [Gemma on Kaggle](https://www.kaggle.com/models/google/gemma-3n)
39
+ - [Gemma on HuggingFace](https://huggingface.co/collections/google/gemma-3n-685065323f5984ef315c93f4)
40
+ - [Gemma on Vertex Model Garden](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemma3n)
41
+
42
+ **Terms of Use**: [Terms](https://ai.google.dev/gemma/terms)\
43
+ **Authors**: Google DeepMind
44
+
45
+ ## Model Information
46
+
47
+ Summary description and brief definition of inputs and outputs.
48
+
49
+ ### Description
50
+
51
+ Gemma is a family of lightweight, state-of-the-art open models from Google,
52
+ built from the same research and technology used to create the Gemini models.
53
+ Gemma 3n models are designed for efficient execution on low-resource devices.
54
+ They are capable of multimodal input, handling text, image, video, and audio
55
+ input, and generating text outputs, with open weights for pre-trained and
56
+ instruction-tuned variants. These models were trained with data in over 140
57
+ spoken languages.
58
+
59
+ Gemma 3n models use selective parameter activation technology to reduce resource
60
+ requirements. This technique allows the models to operate at an effective size
61
+ of 2B and 4B parameters, which is lower than the total number of parameters they
62
+ contain. For more information on Gemma 3n's efficient parameter management
63
+ technology, see the
64
+ [Gemma 3n](https://ai.google.dev/gemma/docs/gemma-3n#parameters)
65
+ page.
66
+
67
+ ### Inputs and outputs
68
+
69
+ - **Input:**
70
+ - Text string, such as a question, a prompt, or a document to be
71
+ summarized
72
+ - Images, normalized to 256x256, 512x512, or 768x768 resolution
73
+ and encoded to 256 tokens each
74
+ - Audio data encoded to 6.25 tokens per second from a single channel
75
+ - Total input context of 32K tokens
76
+ - **Output:**
77
+ - Generated text in response to the input, such as an answer to a
78
+ question, analysis of image content, or a summary of a document
79
+ - Total output length up to 32K tokens, subtracting the request
80
+ input tokens
81
+
82
+ ### Usage
83
+
84
+ Below, there are some code snippets on how to get quickly started with running
85
+ the model. First, install the Transformers library. Gemma 3n is supported
86
+ starting from transformers 4.53.0.
87
+
88
+ ```sh
89
+ $ pip install -U transformers
90
+ ```
91
+
92
+ Then, copy the snippet from the section that is relevant for your use case.
93
+
94
+ #### Running with the `pipeline` API
95
+
96
+ You can initialize the model and processor for inference with `pipeline` as
97
+ follows.
98
+
99
+ ```python
100
+ from transformers import pipeline
101
+ import torch
102
+
103
+ pipe = pipeline(
104
+ "image-text-to-text",
105
+ model="google/gemma-3n-e2b-it",
106
+ device="cuda",
107
+ torch_dtype=torch.bfloat16,
108
+ )
109
+ ```
110
+
111
+ With instruction-tuned models, you need to use chat templates to process your
112
+ inputs first. Then, you can pass it to the pipeline.
113
+
114
+ ```python
115
+ messages = [
116
+ {
117
+ "role": "system",
118
+ "content": [{"type": "text", "text": "You are a helpful assistant."}]
119
+ },
120
+ {
121
+ "role": "user",
122
+ "content": [
123
+ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
124
+ {"type": "text", "text": "What animal is on the candy?"}
125
+ ]
126
+ }
127
+ ]
128
+
129
+ output = pipe(text=messages, max_new_tokens=200)
130
+ print(output[0]["generated_text"][-1]["content"])
131
+ # Okay, let's take a look!
132
+ # Based on the image, the animal on the candy is a **turtle**.
133
+ # You can see the shell shape and the head and legs.
134
+ ```
135
+
136
+ #### Running the model on a single GPU
137
+
138
+ ```python
139
+ from transformers import AutoProcessor, Gemma3nForConditionalGeneration
140
+ from PIL import Image
141
+ import requests
142
+ import torch
143
+
144
+ model_id = "google/gemma-3n-e2b-it"
145
+
146
+ model = Gemma3nForConditionalGeneration.from_pretrained(model_id, device="cuda", torch_dtype=torch.bfloat16,).eval()
147
+
148
+ processor = AutoProcessor.from_pretrained(model_id)
149
+
150
+ messages = [
151
+ {
152
+ "role": "system",
153
+ "content": [{"type": "text", "text": "You are a helpful assistant."}]
154
+ },
155
+ {
156
+ "role": "user",
157
+ "content": [
158
+ {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
159
+ {"type": "text", "text": "Describe this image in detail."}
160
+ ]
161
+ }
162
+ ]
163
+
164
+ inputs = processor.apply_chat_template(
165
+ messages,
166
+ add_generation_prompt=True,
167
+ tokenize=True,
168
+ return_dict=True,
169
+ return_tensors="pt",
170
+ ).to(model.device, dtype=torch.bfloat16)
171
+
172
+ input_len = inputs["input_ids"].shape[-1]
173
+
174
+ with torch.inference_mode():
175
+ generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
176
+ generation = generation[0][input_len:]
177
+
178
+ decoded = processor.decode(generation, skip_special_tokens=True)
179
+ print(decoded)
180
+
181
+ # **Overall Impression:** The image is a close-up shot of a vibrant garden scene,
182
+ # focusing on a cluster of pink cosmos flowers and a busy bumblebee.
183
+ # It has a slightly soft, natural feel, likely captured in daylight.
184
+ ```
185
+
186
+ ### Citation
187
+
188
+ ```
189
+ @article{gemma_3n_2025,
190
+ title={Gemma 3n},
191
+ url={https://ai.google.dev/gemma/docs/gemma-3n},
192
+ publisher={Google DeepMind},
193
+ author={Gemma Team},
194
+ year={2025}
195
+ }
196
+ ```
197
+
198
+ ## Model Data
199
+
200
+ Data used for model training and how the data was processed.
201
+
202
+ ### Training Dataset
203
+
204
+ These models were trained on a dataset that includes a wide variety of sources
205
+ totalling approximately 11 trillion tokens. The knowledge cutoff date for the
206
+ training data was June 2024. Here are the key components:
207
+
208
+ - **Web Documents**: A diverse collection of web text ensures the model
209
+ is exposed to a broad range of linguistic styles, topics, and vocabulary.
210
+ The training dataset includes content in over 140 languages.
211
+ - **Code**: Exposing the model to code helps it to learn the syntax and
212
+ patterns of programming languages, which improves its ability to generate
213
+ code and understand code-related questions.
214
+ - **Mathematics**: Training on mathematical text helps the model learn
215
+ logical reasoning, symbolic representation, and to address mathematical queries.
216
+ - **Images**: A wide range of images enables the model to perform image
217
+ analysis and visual data extraction tasks.
218
+ - **Audio**: A diverse set of sound samples enables the model to recognize
219
+ speech, transcribe text from recordings, and identify information in audio data.
220
+
221
+ The combination of these diverse data sources is crucial for training a
222
+ powerful multimodal model that can handle a wide variety of different tasks and
223
+ data formats.
224
+
225
+ ### Data Preprocessing
226
+
227
+ Here are the key data cleaning and filtering methods applied to the training
228
+ data:
229
+
230
+ - **CSAM Filtering**: Rigorous CSAM (Child Sexual Abuse Material)
231
+ filtering was applied at multiple stages in the data preparation process to
232
+ ensure the exclusion of harmful and illegal content.
233
+ - **Sensitive Data Filtering**: As part of making Gemma pre-trained models
234
+ safe and reliable, automated techniques were used to filter out certain
235
+ personal information and other sensitive data from training sets.
236
+ - **Additional methods**: Filtering based on content quality and safety in
237
+ line with
238
+ [our policies](https://ai.google/static/documents/ai-responsibility-update-published-february-2025.pdf).
239
+
240
+ ## Implementation Information
241
+
242
+ Details about the model internals.
243
+
244
+ ### Hardware
245
+
246
+ Gemma was trained using [Tensor Processing Unit
247
+ (TPU)](https://cloud.google.com/tpu/docs/intro-to-tpu) hardware (TPUv4p, TPUv5p
248
+ and TPUv5e). Training generative models requires significant computational
249
+ power. TPUs, designed specifically for matrix operations common in machine
250
+ learning, offer several advantages in this domain:
251
+
252
+ - **Performance**: TPUs are specifically designed to handle the massive
253
+ computations involved in training generative models. They can speed up
254
+ training considerably compared to CPUs.
255
+ - **Memory**: TPUs often come with large amounts of high-bandwidth memory,
256
+ allowing for the handling of large models and batch sizes during training.
257
+ This can lead to better model quality.
258
+ - **Scalability**: TPU Pods (large clusters of TPUs) provide a scalable
259
+ solution for handling the growing complexity of large foundation models.
260
+ You can distribute training across multiple TPU devices for faster and more
261
+ efficient processing.
262
+ - **Cost-effectiveness**: In many scenarios, TPUs can provide a more
263
+ cost-effective solution for training large models compared to CPU-based
264
+ infrastructure, especially when considering the time and resources saved
265
+ due to faster training.
266
+
267
+ These advantages are aligned with
268
+ [Google's commitments to operate sustainably](https://sustainability.google/operating-sustainably/).
269
+
270
+ ### Software
271
+
272
+ Training was done using [JAX](https://github.com/jax-ml/jax) and
273
+ [ML Pathways](https://blog.google/technology/ai/introducing-pathways-next-generation-ai-architecture/).
274
+ JAX allows researchers to take advantage of the latest generation of hardware,
275
+ including TPUs, for faster and more efficient training of large models. ML
276
+ Pathways is Google's latest effort to build artificially intelligent systems
277
+ capable of generalizing across multiple tasks. This is especially suitable for
278
+ foundation models, including large language models like these ones.
279
+
280
+ Together, JAX and ML Pathways are used as described in the
281
+ [paper about the Gemini family of models](https://goo.gle/gemma2report):
282
+ *"the 'single controller' programming model of Jax and Pathways allows a single
283
+ Python process to orchestrate the entire training run, dramatically simplifying
284
+ the development workflow."*
285
+
286
+ ## Evaluation
287
+
288
+ Model evaluation metrics and results.
289
+
290
+ ### Benchmark Results
291
+
292
+ These models were evaluated at full precision (float32) against a large
293
+ collection of different datasets and metrics to cover different aspects of
294
+ content generation. Evaluation results marked with **IT** are for
295
+ instruction-tuned models. Evaluation results marked with **PT** are for
296
+ pre-trained models.
297
+
298
+ #### Reasoning and factuality
299
+
300
+ | Benchmark | Metric | n-shot | E2B PT | E4B PT |
301
+ | ------------------------------ |----------------|----------|:--------:|:--------:|
302
+ | [HellaSwag][hellaswag] | Accuracy | 10-shot | 72.2 | 78.6 |
303
+ | [BoolQ][boolq] | Accuracy | 0-shot | 76.4 | 81.6 |
304
+ | [PIQA][piqa] | Accuracy | 0-shot | 78.9 | 81.0 |
305
+ | [SocialIQA][socialiqa] | Accuracy | 0-shot | 48.8 | 50.0 |
306
+ | [TriviaQA][triviaqa] | Accuracy | 5-shot | 60.8 | 70.2 |
307
+ | [Natural Questions][naturalq] | Accuracy | 5-shot | 15.5 | 20.9 |
308
+ | [ARC-c][arc] | Accuracy | 25-shot | 51.7 | 61.6 |
309
+ | [ARC-e][arc] | Accuracy | 0-shot | 75.8 | 81.6 |
310
+ | [WinoGrande][winogrande] | Accuracy | 5-shot | 66.8 | 71.7 |
311
+ | [BIG-Bench Hard][bbh] | Accuracy | few-shot | 44.3 | 52.9 |
312
+ | [DROP][drop] | Token F1 score | 1-shot | 53.9 | 60.8 |
313
+
314
+ [hellaswag]: https://arxiv.org/abs/1905.07830
315
+ [boolq]: https://arxiv.org/abs/1905.10044
316
+ [piqa]: https://arxiv.org/abs/1911.11641
317
+ [socialiqa]: https://arxiv.org/abs/1904.09728
318
+ [triviaqa]: https://arxiv.org/abs/1705.03551
319
+ [naturalq]: https://github.com/google-research-datasets/natural-questions
320
+ [arc]: https://arxiv.org/abs/1911.01547
321
+ [winogrande]: https://arxiv.org/abs/1907.10641
322
+ [bbh]: https://paperswithcode.com/dataset/bbh
323
+ [drop]: https://arxiv.org/abs/1903.00161
324
+
325
+ #### Multilingual
326
+
327
+ | Benchmark | Metric | n-shot | E2B IT | E4B IT |
328
+ | ------------------------------------|-------------------------|----------|:--------:|:--------:|
329
+ | [MGSM][mgsm] | Accuracy | 0-shot | 53.1 | 60.7 |
330
+ | [WMT24++][wmt24pp] (ChrF) | Character-level F-score | 0-shot | 42.7 | 50.1 |
331
+ | [Include][include] | Accuracy | 0-shot | 38.6 | 57.2 |
332
+ | [MMLU][mmlu] (ProX) | Accuracy | 0-shot | 8.1 | 19.9 |
333
+ | [OpenAI MMLU][openai-mmlu] | Accuracy | 0-shot | 22.3 | 35.6 |
334
+ | [Global-MMLU][global-mmlu] | Accuracy | 0-shot | 55.1 | 60.3 |
335
+ | [ECLeKTic][eclektic] | ECLeKTic score | 0-shot | 2.5 | 1.9 |
336
+
337
+ [mgsm]: https://arxiv.org/abs/2210.03057
338
+ [wmt24pp]: https://arxiv.org/abs/2502.12404v1
339
+ [include]: https://arxiv.org/abs/2411.19799
340
+ [mmlu]: https://arxiv.org/abs/2009.03300
341
+ [openai-mmlu]: https://huggingface.co/datasets/openai/MMMLU
342
+ [global-mmlu]: https://huggingface.co/datasets/CohereLabs/Global-MMLU
343
+ [eclektic]: https://arxiv.org/abs/2502.21228
344
+
345
+ #### STEM and code
346
+
347
+ | Benchmark | Metric | n-shot | E2B IT | E4B IT |
348
+ | ------------------------------------|--------------------------|----------|:--------:|:--------:|
349
+ | [GPQA][gpqa] Diamond | RelaxedAccuracy/accuracy | 0-shot | 24.8 | 23.7 |
350
+ | [LiveCodeBench][lcb] v5 | pass@1 | 0-shot | 18.6 | 25.7 |
351
+ | Codegolf v2.2 | pass@1 | 0-shot | 11.0 | 16.8 |
352
+ | [AIME 2025][aime-2025] | Accuracy | 0-shot | 6.7 | 11.6 |
353
+
354
+ [gpqa]: https://arxiv.org/abs/2311.12022
355
+ [lcb]: https://arxiv.org/abs/2403.07974
356
+ [aime-2025]: https://www.vals.ai/benchmarks/aime-2025-05-09
357
+
358
+ #### Additional benchmarks
359
+
360
+ | Benchmark | Metric | n-shot | E2B IT | E4B IT |
361
+ | ------------------------------------ |------------|----------|:--------:|:--------:|
362
+ | [MMLU][mmlu] | Accuracy | 0-shot | 60.1 | 64.9 |
363
+ | [MBPP][mbpp] | pass@1 | 3-shot | 56.6 | 63.6 |
364
+ | [HumanEval][humaneval] | pass@1 | 0-shot | 66.5 | 75.0 |
365
+ | [LiveCodeBench][lcb] | pass@1 | 0-shot | 13.2 | 13.2 |
366
+ | HiddenMath | Accuracy | 0-shot | 27.7 | 37.7 |
367
+ | [Global-MMLU-Lite][global-mmlu-lite] | Accuracy | 0-shot | 59.0 | 64.5 |
368
+ | [MMLU][mmlu] (Pro) | Accuracy | 0-shot | 40.5 | 50.6 |
369
+
370
+ [gpqa]: https://arxiv.org/abs/2311.12022
371
+ [mbpp]: https://arxiv.org/abs/2108.07732
372
+ [humaneval]: https://arxiv.org/abs/2107.03374
373
+ [lcb]: https://arxiv.org/abs/2403.07974
374
+ [global-mmlu-lite]: https://huggingface.co/datasets/CohereForAI/Global-MMLU-Lite
375
+
376
+ ## Ethics and Safety
377
+
378
+ Ethics and safety evaluation approach and results.
379
+
380
+ ### Evaluation Approach
381
+
382
+ Our evaluation methods include structured evaluations and internal red-teaming
383
+ testing of relevant content policies. Red-teaming was conducted by a number of
384
+ different teams, each with different goals and human evaluation metrics. These
385
+ models were evaluated against a number of different categories relevant to
386
+ ethics and safety, including:
387
+
388
+ - **Child Safety**: Evaluation of text-to-text and image-to-text prompts
389
+ covering child safety policies, including child sexual abuse and
390
+ exploitation.
391
+ - **Content Safety**: Evaluation of text-to-text and image-to-text prompts
392
+ covering safety policies, including harassment, violence and gore, and hate
393
+ speech.
394
+ - **Representational Harms**: Evaluation of text-to-text and image-to-text
395
+ prompts covering safety policies including bias, stereotyping, and harmful
396
+ associations or inaccuracies.
397
+
398
+ In addition to development level evaluations, we conduct "assurance
399
+ evaluations" which are our 'arms-length' internal evaluations for responsibility
400
+ governance decision making. They are conducted separately from the model
401
+ development team, to inform decision making about release. High level findings
402
+ are fed back to the model team, but prompt sets are held-out to prevent
403
+ overfitting and preserve the results' ability to inform decision making. Notable
404
+ assurance evaluation results are reported to our Responsibility & Safety Council
405
+ as part of release review.
406
+
407
+ ### Evaluation Results
408
+
409
+ For all areas of safety testing, we saw safe levels of performance across the
410
+ categories of child safety, content safety, and representational harms relative
411
+ to previous Gemma models. All testing was conducted without safety filters to
412
+ evaluate the model capabilities and behaviors. For text-to-text, image-to-text,
413
+ and audio-to-text, and across all model sizes, the model produced minimal policy
414
+ violations, and showed significant improvements over previous Gemma models'
415
+ performance with respect to high severity violations. A limitation of our
416
+ evaluations was that they included primarily English-language prompts.
417
+
418
+ ## Usage and Limitations
419
+
420
+ These models have certain limitations that users should be aware of.
421
+
422
+ ### Intended Usage
423
+
424
+ Open generative models have a wide range of applications across various
425
+ industries and domains. The following list of potential uses is not
426
+ comprehensive. The purpose of this list is to provide contextual information
427
+ about the possible use-cases that the model creators considered as part of model
428
+ training and development.
429
+
430
+ - Content Creation and Communication
431
+ - **Text Generation**: Generate creative text formats such as
432
+ poems, scripts, code, marketing copy, and email drafts.
433
+ - **Chatbots and Conversational AI**: Power conversational
434
+ interfaces for customer service, virtual assistants, or interactive
435
+ applications.
436
+ - **Text Summarization**: Generate concise summaries of a text
437
+ corpus, research papers, or reports.
438
+ - **Image Data Extraction**: Extract, interpret, and summarize
439
+ visual data for text communications.
440
+ - **Audio Data Extraction**: Transcribe spoken language, translate speech
441
+ to text in other languages, and analyze sound-based data.
442
+ - Research and Education
443
+ - **Natural Language Processing (NLP) and generative model
444
+ Research**: These models can serve as a foundation for researchers to
445
+ experiment with generative models and NLP techniques, develop
446
+ algorithms, and contribute to the advancement of the field.
447
+ - **Language Learning Tools**: Support interactive language
448
+ learning experiences, aiding in grammar correction or providing writing
449
+ practice.
450
+ - **Knowledge Exploration**: Assist researchers in exploring large
451
+ bodies of data by generating summaries or answering questions about
452
+ specific topics.
453
+
454
+ ### Limitations
455
+
456
+ - Training Data
457
+ - The quality and diversity of the training data significantly
458
+ influence the model's capabilities. Biases or gaps in the training data
459
+ can lead to limitations in the model's responses.
460
+ - The scope of the training dataset determines the subject areas
461
+ the model can handle effectively.
462
+ - Context and Task Complexity
463
+ - Models are better at tasks that can be framed with clear
464
+ prompts and instructions. Open-ended or highly complex tasks might be
465
+ challenging.
466
+ - A model's performance can be influenced by the amount of context
467
+ provided (longer context generally leads to better outputs, up to a
468
+ certain point).
469
+ - Language Ambiguity and Nuance
470
+ - Natural language is inherently complex. Models might struggle
471
+ to grasp subtle nuances, sarcasm, or figurative language.
472
+ - Factual Accuracy
473
+ - Models generate responses based on information they learned
474
+ from their training datasets, but they are not knowledge bases. They
475
+ may generate incorrect or outdated factual statements.
476
+ - Common Sense
477
+ - Models rely on statistical patterns in language. They might
478
+ lack the ability to apply common sense reasoning in certain situations.
479
+
480
+ ### Ethical Considerations and Risks
481
+
482
+ The development of generative models raises several ethical concerns. In
483
+ creating an open model, we have carefully considered the following:
484
+
485
+ - Bias and Fairness
486
+ - Generative models trained on large-scale, real-world text and image data
487
+ can reflect socio-cultural biases embedded in the training material.
488
+ These models underwent careful scrutiny, with input data pre-processing
489
+ described and posterior evaluations reported in this card.
490
+ - Misinformation and Misuse
491
+ - Generative models can be misused to generate text that is
492
+ false, misleading, or harmful.
493
+ - Guidelines are provided for responsible use with the model, see the
494
+ [Responsible Generative AI Toolkit](https://ai.google.dev/responsible).
495
+ - Transparency and Accountability
496
+ - This model card summarizes details on the models' architecture,
497
+ capabilities, limitations, and evaluation processes.
498
+ - A responsibly developed open model offers the opportunity to
499
+ share innovation by making generative model technology accessible to
500
+ developers and researchers across the AI ecosystem.
501
+
502
+ Risks identified and mitigations:
503
+
504
+ - **Perpetuation of biases**: It's encouraged to perform continuous monitoring
505
+ (using evaluation metrics, human review) and the exploration of de-biasing
506
+ techniques during model training, fine-tuning, and other use cases.
507
+ - **Generation of harmful content**: Mechanisms and guidelines for content
508
+ safety are essential. Developers are encouraged to exercise caution and
509
+ implement appropriate content safety safeguards based on their specific
510
+ product policies and application use cases.
511
+ - **Misuse for malicious purposes**: Technical limitations and developer
512
+ and end-user education can help mitigate against malicious applications of
513
+ generative models. Educational resources and reporting mechanisms for users
514
+ to flag misuse are provided. Prohibited uses of Gemma models are outlined
515
+ in the
516
+ [Gemma Prohibited Use Policy](https://ai.google.dev/gemma/prohibited_use_policy).
517
+ - **Privacy violations**: Models were trained on data filtered for removal of
518
+ certain personal information and other sensitive data. Developers are
519
+ encouraged to adhere to privacy regulations with privacy-preserving
520
+ techniques.
521
+
522
+ ### Benefits
523
+
524
+ At the time of release, this family of models provides high-performance open
525
+ generative model implementations designed from the ground up for responsible AI
526
+ development compared to similarly sized models.
527
+
528
+ Using the benchmark evaluation metrics described in this document, these models
529
+ have been shown to provide superior performance to other, comparably-sized open model
530
+ alternatives.
depthformer/depthformer_base_decoder_step_decoder_step_meta.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "depthformer_base_decoder_step",
3
+ "inputs": [
4
+ {
5
+ "name": "target_token",
6
+ "shape": [
7
+ 1,
8
+ 1
9
+ ],
10
+ "dtype": "int32"
11
+ },
12
+ {
13
+ "name": "encoder_hidden_states",
14
+ "shape": [
15
+ 1,
16
+ 1006,
17
+ 768
18
+ ],
19
+ "dtype": "float32"
20
+ },
21
+ {
22
+ "name": "kv_cache_keys",
23
+ "shape": "dynamic",
24
+ "dtype": "float32"
25
+ },
26
+ {
27
+ "name": "kv_cache_values",
28
+ "shape": "dynamic",
29
+ "dtype": "float32"
30
+ }
31
+ ],
32
+ "outputs": [
33
+ {
34
+ "name": "logits",
35
+ "shape": [
36
+ 1,
37
+ 16384
38
+ ],
39
+ "dtype": "float32"
40
+ },
41
+ {
42
+ "name": "new_kv_cache_keys",
43
+ "shape": "dynamic",
44
+ "dtype": "float32"
45
+ },
46
+ {
47
+ "name": "new_kv_cache_values",
48
+ "shape": "dynamic",
49
+ "dtype": "float32"
50
+ }
51
+ ],
52
+ "model_config": {
53
+ "embed_dim": 768,
54
+ "num_heads": 12,
55
+ "num_decoder_layers": 12,
56
+ "mlp_dim": 2048
57
+ },
58
+ "kv_cache": {
59
+ "max_length": 1806,
60
+ "num_heads": 12,
61
+ "head_dim": 64
62
+ },
63
+ "opset_version": 18,
64
+ "ir_version": 8,
65
+ "precision": "fp16"
66
+ }
depthformer/depthformer_base_encoder_encoder_meta.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "depthformer_base_encoder",
3
+ "inputs": [
4
+ {
5
+ "name": "context_tokens",
6
+ "shape": [
7
+ 1,
8
+ 1006
9
+ ],
10
+ "dtype": "int32"
11
+ },
12
+ {
13
+ "name": "style_tokens",
14
+ "shape": [
15
+ 1,
16
+ 6
17
+ ],
18
+ "dtype": "int32"
19
+ }
20
+ ],
21
+ "output_name": "encoder_hidden_states",
22
+ "output_shape": [
23
+ 1,
24
+ 1006,
25
+ 768
26
+ ],
27
+ "model_config": {
28
+ "embed_dim": 768,
29
+ "num_heads": 12,
30
+ "num_encoder_layers": 12,
31
+ "mlp_dim": 2048
32
+ },
33
+ "token_config": {
34
+ "context_length": 1006,
35
+ "style_rvq_depth": 6,
36
+ "rvq_codebook_size": 1024
37
+ },
38
+ "opset_version": 18,
39
+ "ir_version": 8,
40
+ "precision": "fp16"
41
+ }
depthformer_base_decoder_step.onnx ADDED
File without changes
depthformer_base_decoder_step.vmfb ADDED
File without changes
depthformer_base_decoder_step_decoder_step_meta.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "depthformer_base_decoder_step",
3
+ "inputs": [
4
+ {
5
+ "name": "target_token",
6
+ "shape": [
7
+ 1,
8
+ 1
9
+ ],
10
+ "dtype": "int32"
11
+ },
12
+ {
13
+ "name": "encoder_hidden_states",
14
+ "shape": [
15
+ 1,
16
+ 1006,
17
+ 768
18
+ ],
19
+ "dtype": "float32"
20
+ },
21
+ {
22
+ "name": "kv_cache_keys",
23
+ "shape": "dynamic",
24
+ "dtype": "float32"
25
+ },
26
+ {
27
+ "name": "kv_cache_values",
28
+ "shape": "dynamic",
29
+ "dtype": "float32"
30
+ }
31
+ ],
32
+ "outputs": [
33
+ {
34
+ "name": "logits",
35
+ "shape": [
36
+ 1,
37
+ 16384
38
+ ],
39
+ "dtype": "float32"
40
+ },
41
+ {
42
+ "name": "new_kv_cache_keys",
43
+ "shape": "dynamic",
44
+ "dtype": "float32"
45
+ },
46
+ {
47
+ "name": "new_kv_cache_values",
48
+ "shape": "dynamic",
49
+ "dtype": "float32"
50
+ }
51
+ ],
52
+ "model_config": {
53
+ "embed_dim": 768,
54
+ "num_heads": 12,
55
+ "num_decoder_layers": 12,
56
+ "mlp_dim": 2048
57
+ },
58
+ "kv_cache": {
59
+ "max_length": 1806,
60
+ "num_heads": 12,
61
+ "head_dim": 64
62
+ },
63
+ "opset_version": 18,
64
+ "ir_version": 8,
65
+ "precision": "fp16"
66
+ }
depthformer_base_decoder_step_ple/ple_manifest.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model_name": "depthformer_base_decoder_step",
3
+ "layers": [],
4
+ "total_size_bytes": 0
5
+ }
depthformer_base_encoder.onnx ADDED
File without changes
depthformer_base_encoder.vmfb ADDED
File without changes
depthformer_base_encoder_encoder_meta.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "depthformer_base_encoder",
3
+ "inputs": [
4
+ {
5
+ "name": "context_tokens",
6
+ "shape": [
7
+ 1,
8
+ 1006
9
+ ],
10
+ "dtype": "int32"
11
+ },
12
+ {
13
+ "name": "style_tokens",
14
+ "shape": [
15
+ 1,
16
+ 6
17
+ ],
18
+ "dtype": "int32"
19
+ }
20
+ ],
21
+ "output_name": "encoder_hidden_states",
22
+ "output_shape": [
23
+ 1,
24
+ 1006,
25
+ 768
26
+ ],
27
+ "model_config": {
28
+ "embed_dim": 768,
29
+ "num_heads": 12,
30
+ "num_encoder_layers": 12,
31
+ "mlp_dim": 2048
32
+ },
33
+ "token_config": {
34
+ "context_length": 1006,
35
+ "style_rvq_depth": 6,
36
+ "rvq_codebook_size": 1024
37
+ },
38
+ "opset_version": 18,
39
+ "ir_version": 8,
40
+ "precision": "fp16"
41
+ }
depthformer_base_encoder_ple/ple_manifest.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model_name": "depthformer_base_encoder",
3
+ "layers": [],
4
+ "total_size_bytes": 0
5
+ }
export_iree_metadata.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": [
3
+ {
4
+ "component": "musiccoca",
5
+ "status": "FAILED",
6
+ "step": "tf_to_onnx"
7
+ }
8
+ ],
9
+ "target_chip": "gfx1030"
10
+ }
export_log.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-03-24 19:09:44",
3
+ "config": {
4
+ "quantization": "4bit",
5
+ "fallback_chain": true,
6
+ "model_size": "base"
7
+ },
8
+ "results": {
9
+ "musiccoca_text_encoder": {
10
+ "status": "success",
11
+ "quantization": "4bit",
12
+ "output_path": "Content/MLModels/quant_4bit_musiccoca_text_encoder.onnx",
13
+ "file_size_mb": 0.0016775131225585938
14
+ },
15
+ "musiccoca_audio_encoder": {
16
+ "status": "success",
17
+ "quantization": "4bit",
18
+ "output_path": "Content/MLModels/quant_4bit_musiccoca_audio_encoder.onnx",
19
+ "file_size_mb": 0.0016498565673828125
20
+ },
21
+ "depthformer_base_encoder": {
22
+ "status": "success",
23
+ "quantization": "4bit",
24
+ "output_path": "Content/MLModels/quant_4bit_depthformer_base_encoder.onnx",
25
+ "file_size_mb": 1.4738502502441406
26
+ },
27
+ "depthformer_base_decoder_step": {
28
+ "status": "success",
29
+ "quantization": "4bit",
30
+ "output_path": "Content/MLModels/quant_4bit_depthformer_base_decoder_step.onnx",
31
+ "file_size_mb": 0.03146553039550781
32
+ },
33
+ "spectrostream_encoder": {
34
+ "status": "success",
35
+ "quantization": "4bit",
36
+ "output_path": "Content/MLModels/quant_4bit_spectrostream_encoder.onnx",
37
+ "file_size_mb": 0.02460479736328125
38
+ },
39
+ "spectrostream_decoder": {
40
+ "status": "success",
41
+ "quantization": "4bit",
42
+ "output_path": "Content/MLModels/quant_4bit_spectrostream_decoder.onnx",
43
+ "file_size_mb": 0.36640357971191406
44
+ }
45
+ },
46
+ "summary": {
47
+ "total": 6,
48
+ "success": 6,
49
+ "failed": 0,
50
+ "skipped": 0
51
+ }
52
+ }
export_metadata.json ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "export_timestamp": "[0.]",
3
+ "models": [
4
+ {
5
+ "model_type": "spectrostream_encoder",
6
+ "input_name": "audio",
7
+ "input_shape": [
8
+ 1,
9
+ 96000,
10
+ 2
11
+ ],
12
+ "output_name": "embeddings",
13
+ "output_shape": [
14
+ 1,
15
+ 50,
16
+ 256
17
+ ],
18
+ "sample_rate": 48000,
19
+ "frame_rate": 25.0,
20
+ "embedding_dim": 256,
21
+ "opset_version": 18,
22
+ "ir_version": 8,
23
+ "precision": "fp16"
24
+ },
25
+ {
26
+ "model_type": "spectrostream_decoder",
27
+ "input_name": "tokens",
28
+ "input_shape": [
29
+ 1,
30
+ 50,
31
+ 64
32
+ ],
33
+ "output_name": "audio",
34
+ "output_shape": [
35
+ 1,
36
+ 96000,
37
+ 2
38
+ ],
39
+ "sample_rate": 48000,
40
+ "frame_rate": 25.0,
41
+ "rvq_depth": 64,
42
+ "rvq_codebook_size": 1024,
43
+ "opset_version": 18,
44
+ "ir_version": 8,
45
+ "precision": "fp16"
46
+ },
47
+ {
48
+ "model_type": "musiccoca_text_encoder",
49
+ "inputs": [
50
+ {
51
+ "name": "text_ids",
52
+ "shape": [
53
+ 1,
54
+ 128
55
+ ],
56
+ "dtype": "int32"
57
+ },
58
+ {
59
+ "name": "padding",
60
+ "shape": [
61
+ 1,
62
+ 128
63
+ ],
64
+ "dtype": "float32"
65
+ }
66
+ ],
67
+ "output_name": "embedding",
68
+ "output_shape": [
69
+ 1,
70
+ 768
71
+ ],
72
+ "max_text_length": 128,
73
+ "embedding_dim": 768,
74
+ "opset_version": 18,
75
+ "ir_version": 8,
76
+ "precision": "fp16"
77
+ },
78
+ {
79
+ "model_type": "musiccoca_audio_encoder",
80
+ "input_name": "audio",
81
+ "input_shape": [
82
+ 1,
83
+ 160000
84
+ ],
85
+ "output_name": "embedding",
86
+ "output_shape": [
87
+ 1,
88
+ 768
89
+ ],
90
+ "sample_rate": 16000,
91
+ "clip_length": 10.0,
92
+ "embedding_dim": 768,
93
+ "opset_version": 18,
94
+ "ir_version": 8,
95
+ "precision": "fp16"
96
+ },
97
+ {
98
+ "model_type": "musiccoca_rvq_quantizer",
99
+ "input_name": "embedding",
100
+ "input_shape": [
101
+ 1,
102
+ 768
103
+ ],
104
+ "output_name": "tokens",
105
+ "output_shape": [
106
+ 1,
107
+ 12
108
+ ],
109
+ "embedding_dim": 768,
110
+ "rvq_depth": 12,
111
+ "rvq_codebook_size": 1024,
112
+ "opset_version": 18,
113
+ "ir_version": 8,
114
+ "precision": "fp16"
115
+ },
116
+ {
117
+ "model_type": "depthformer_base_encoder",
118
+ "inputs": [
119
+ {
120
+ "name": "context_tokens",
121
+ "shape": [
122
+ 1,
123
+ 1006
124
+ ],
125
+ "dtype": "int32"
126
+ },
127
+ {
128
+ "name": "style_tokens",
129
+ "shape": [
130
+ 1,
131
+ 6
132
+ ],
133
+ "dtype": "int32"
134
+ }
135
+ ],
136
+ "output_name": "encoder_hidden_states",
137
+ "output_shape": [
138
+ 1,
139
+ 1006,
140
+ 768
141
+ ],
142
+ "model_config": {
143
+ "embed_dim": 768,
144
+ "num_heads": 12,
145
+ "num_encoder_layers": 12,
146
+ "mlp_dim": 2048
147
+ },
148
+ "token_config": {
149
+ "context_length": 1006,
150
+ "style_rvq_depth": 6,
151
+ "rvq_codebook_size": 1024
152
+ },
153
+ "opset_version": 18,
154
+ "ir_version": 8,
155
+ "precision": "fp16"
156
+ },
157
+ {
158
+ "model_type": "depthformer_base_decoder_step",
159
+ "inputs": [
160
+ {
161
+ "name": "target_token",
162
+ "shape": [
163
+ 1,
164
+ 1
165
+ ],
166
+ "dtype": "int32"
167
+ },
168
+ {
169
+ "name": "encoder_hidden_states",
170
+ "shape": [
171
+ 1,
172
+ 1006,
173
+ 768
174
+ ],
175
+ "dtype": "float32"
176
+ },
177
+ {
178
+ "name": "kv_cache_keys",
179
+ "shape": "dynamic",
180
+ "dtype": "float32"
181
+ },
182
+ {
183
+ "name": "kv_cache_values",
184
+ "shape": "dynamic",
185
+ "dtype": "float32"
186
+ }
187
+ ],
188
+ "outputs": [
189
+ {
190
+ "name": "logits",
191
+ "shape": [
192
+ 1,
193
+ 16384
194
+ ],
195
+ "dtype": "float32"
196
+ },
197
+ {
198
+ "name": "new_kv_cache_keys",
199
+ "shape": "dynamic",
200
+ "dtype": "float32"
201
+ },
202
+ {
203
+ "name": "new_kv_cache_values",
204
+ "shape": "dynamic",
205
+ "dtype": "float32"
206
+ }
207
+ ],
208
+ "model_config": {
209
+ "embed_dim": 768,
210
+ "num_heads": 12,
211
+ "num_decoder_layers": 12,
212
+ "mlp_dim": 2048
213
+ },
214
+ "kv_cache": {
215
+ "max_length": 1806,
216
+ "num_heads": 12,
217
+ "head_dim": 64
218
+ },
219
+ "opset_version": 18,
220
+ "ir_version": 8,
221
+ "precision": "fp16"
222
+ }
223
+ ],
224
+ "config": {
225
+ "opset_version": 18,
226
+ "ir_version": 8,
227
+ "precision": "fp16"
228
+ }
229
+ }
export_metadata_iree.json ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "export_timestamp": "[0.]",
3
+ "format": "iree_vmfb",
4
+ "models": [
5
+ {
6
+ "model_type": "spectrostream_encoder_iree",
7
+ "format": "vmfb",
8
+ "input": {
9
+ "name": "audio",
10
+ "shape": [
11
+ 1,
12
+ 96000,
13
+ 2
14
+ ],
15
+ "dtype": "float32"
16
+ },
17
+ "output": {
18
+ "name": "embeddings",
19
+ "shape": [
20
+ 1,
21
+ 50,
22
+ 256
23
+ ],
24
+ "dtype": "float32"
25
+ },
26
+ "target_backend": "rocm",
27
+ "target_chip": "gfx1030",
28
+ "opt_level": 3,
29
+ "file_size_bytes": 0
30
+ },
31
+ {
32
+ "model_type": "spectrostream_decoder_iree",
33
+ "format": "vmfb",
34
+ "input": {
35
+ "name": "embeddings",
36
+ "shape": [
37
+ 1,
38
+ 50,
39
+ 256
40
+ ],
41
+ "dtype": "float32"
42
+ },
43
+ "output": {
44
+ "name": "audio",
45
+ "shape": [
46
+ 1,
47
+ 96000,
48
+ 2
49
+ ],
50
+ "dtype": "float32"
51
+ },
52
+ "target_backend": "rocm",
53
+ "target_chip": "gfx1030",
54
+ "opt_level": 3,
55
+ "file_size_bytes": 0
56
+ },
57
+ {
58
+ "model_type": "musiccoca_text_encoder_iree",
59
+ "format": "vmfb",
60
+ "inputs": [
61
+ {
62
+ "name": "text_ids",
63
+ "shape": [
64
+ 1,
65
+ 128
66
+ ],
67
+ "dtype": "int32"
68
+ },
69
+ {
70
+ "name": "padding",
71
+ "shape": [
72
+ 1,
73
+ 128
74
+ ],
75
+ "dtype": "float32"
76
+ }
77
+ ],
78
+ "output": {
79
+ "name": "embedding",
80
+ "shape": [
81
+ 1,
82
+ 768
83
+ ],
84
+ "dtype": "float32"
85
+ },
86
+ "target_backend": "rocm",
87
+ "target_chip": "gfx1030",
88
+ "file_size_bytes": 0
89
+ },
90
+ {
91
+ "model_type": "musiccoca_audio_encoder_iree",
92
+ "format": "vmfb",
93
+ "input": {
94
+ "name": "audio",
95
+ "shape": [
96
+ 1,
97
+ 160000
98
+ ],
99
+ "dtype": "float32"
100
+ },
101
+ "output": {
102
+ "name": "embedding",
103
+ "shape": [
104
+ 1,
105
+ 768
106
+ ],
107
+ "dtype": "float32"
108
+ },
109
+ "target_backend": "rocm",
110
+ "target_chip": "gfx1030",
111
+ "file_size_bytes": 0
112
+ },
113
+ {
114
+ "model_type": "depthformer_base_encoder_iree",
115
+ "format": "vmfb",
116
+ "inputs": [
117
+ {
118
+ "name": "context_tokens",
119
+ "shape": [
120
+ 1,
121
+ 1006
122
+ ],
123
+ "dtype": "int32"
124
+ },
125
+ {
126
+ "name": "style_tokens",
127
+ "shape": [
128
+ 1,
129
+ 6
130
+ ],
131
+ "dtype": "int32"
132
+ }
133
+ ],
134
+ "output": {
135
+ "name": "encoder_hidden_states",
136
+ "shape": [
137
+ 1,
138
+ 1006,
139
+ 768
140
+ ],
141
+ "dtype": "float32"
142
+ },
143
+ "target_backend": "rocm",
144
+ "target_chip": "gfx1030",
145
+ "file_size_bytes": 0
146
+ },
147
+ {
148
+ "model_type": "depthformer_base_decoder_step_iree",
149
+ "format": "vmfb",
150
+ "inputs": [
151
+ {
152
+ "name": "target_token",
153
+ "shape": [
154
+ 1,
155
+ 1
156
+ ],
157
+ "dtype": "int32"
158
+ },
159
+ {
160
+ "name": "encoder_hidden_states",
161
+ "shape": [
162
+ 1,
163
+ 1006,
164
+ 768
165
+ ],
166
+ "dtype": "float32"
167
+ }
168
+ ],
169
+ "output": {
170
+ "name": "logits",
171
+ "shape": [
172
+ 1,
173
+ 16384
174
+ ],
175
+ "dtype": "float32"
176
+ },
177
+ "target_backend": "rocm",
178
+ "target_chip": "gfx1030",
179
+ "file_size_bytes": 0
180
+ }
181
+ ],
182
+ "config": {
183
+ "target_backend": "rocm",
184
+ "target_chip": "gfx1030",
185
+ "opt_level": 3
186
+ },
187
+ "ue_path": "Content/MLModels/"
188
+ }
magenta_raw/checkpoints/llm_base_x4286_c1860k.tar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c91ffb542f8fa5d5285325b284d498ee1e9701012bbe9ead3408d327930cbee
3
+ size 1297356800
magenta_raw/checkpoints/llm_base_x4286_c1860k/checkpoint ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a772893769281208fbba5e6c75279ebe1afbed20f86620927e9bd04713d67fe5
3
+ size 2995665
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_mlp_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[768],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[768],"zarr_format":2}
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_mlp_layer_norm.scale.v/0 ADDED
Binary file (2.76 kB). View file
 
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_self_attention_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[768],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[768],"zarr_format":2}
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_self_attention_layer_norm.scale.v/0 ADDED
Binary file (2.8 kB). View file
 
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_1.pre_mlp_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[768],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[768],"zarr_format":2}
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_1.pre_mlp_layer_norm.scale.v/0 ADDED
Binary file (2.78 kB). View file
 
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_1.pre_self_attention_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[768],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[768],"zarr_format":2}
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_1.pre_self_attention_layer_norm.scale.v/0 ADDED
Binary file (2.75 kB). View file
 
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_mlp_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[768],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[768],"zarr_format":2}
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_mlp_layer_norm.scale.v/0 ADDED
Binary file (2.71 kB). View file
 
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[768],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[768],"zarr_format":2}
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/0 ADDED
Binary file (2.73 kB). View file
 
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_3.pre_mlp_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[768],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[768],"zarr_format":2}
magenta_raw/checkpoints/llm_base_x4286_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_3.pre_mlp_layer_norm.scale.v/0 ADDED
Binary file (2.78 kB). View file