AINovice2005 committed on
Commit
7b96d73
·
verified ·
1 Parent(s): 44e3dc3

Add files using upload-large-folder tool

Browse files
Files changed (5) hide show
  1. README.md +105 -0
  2. artifact_bytes.bin +3 -0
  3. config.json +108 -0
  4. qmodel.pt +3 -0
  5. smash_config.json +43 -0
README.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model:
3
+ - HuggingFaceTB/SmolLM3-3B-Base
4
+ language:
5
+ - en
6
+ - fr
7
+ - es
8
+ - it
9
+ - pt
10
+ - zh
11
+ - ar
12
+ - ru
13
+ library_name: transformers
14
+ license: apache-2.0
15
+ tags:
16
+ - pruna-ai
17
+ - safetensors
18
+ ---
19
+
20
+ # Model Card for AINovice2005/SmolLM3-3B-smashed
21
+
22
+ This model was created using the [pruna](https://github.com/PrunaAI/pruna) library. Pruna is a model optimization framework built for developers, enabling you to deliver more efficient models with minimal implementation overhead.
23
+
24
+ ## Usage
25
+
26
+ First things first, you need to install the pruna library:
27
+
28
+ ```bash
29
+ pip install pruna
30
+ ```
31
+
32
+ You can [use the transformers library to load the model](https://huggingface.co/AINovice2005/SmolLM3-3B-smashed?library=transformers), but this might not include all optimizations by default.
33
+
34
+ To ensure that all optimizations are applied, load the model with the pruna library using the following code:
35
+
36
+ ```python
37
+ from pruna import PrunaModel
38
+
39
+ loaded_model = PrunaModel.from_pretrained(
40
+ "AINovice2005/SmolLM3-3B-smashed"
41
+ )
42
+ # we can then run inference using the methods supported by the base model
43
+ ```
44
+
45
+
46
+ For inference, you can use the inference methods of the original model, as shown in [the original model card](https://huggingface.co/HuggingFaceTB/SmolLM3-3B?library=transformers).
47
+ Alternatively, you can visit [the Pruna documentation](https://docs.pruna.ai/en/stable/) for more information.
48
+
49
+ ## Smash Configuration
50
+
51
+ The compression configuration of the model is stored in the `smash_config.json` file, which describes the optimization methods that were applied to the model.
52
+
53
+ ```json
54
+ {
55
+ "batcher": null,
56
+ "cacher": null,
57
+ "compiler": "torch_compile",
58
+ "factorizer": null,
59
+ "kernel": null,
60
+ "pruner": null,
61
+ "quantizer": "hqq",
62
+ "hqq_backend": "torchao_int4",
63
+ "hqq_compute_dtype": "torch.bfloat16",
64
+ "hqq_force_hf_implementation": false,
65
+ "hqq_group_size": 128,
66
+ "hqq_use_torchao_kernels": true,
67
+ "hqq_weight_bits": 4,
68
+ "torch_compile_backend": "cudagraphs",
69
+ "torch_compile_dynamic": true,
70
+ "torch_compile_fullgraph": false,
71
+ "torch_compile_make_portable": true,
72
+ "torch_compile_max_kv_cache_size": 1600,
73
+ "torch_compile_mode": "reduce-overhead",
74
+ "torch_compile_seqlen_manual_cuda_graph": 800,
75
+ "torch_compile_target": "model",
76
+ "batch_size": 1,
77
+ "device": "cuda:0",
78
+ "device_map": null,
79
+ "save_fns": [
80
+ "hqq",
81
+ "save_before_apply"
82
+ ],
83
+ "load_fns": [
84
+ "torch_artifacts",
85
+ "hqq"
86
+ ],
87
+ "reapply_after_load": {
88
+ "factorizer": null,
89
+ "pruner": null,
90
+ "quantizer": null,
91
+ "kernel": null,
92
+ "cacher": null,
93
+ "compiler": "torch_compile",
94
+ "batcher": null
95
+ }
96
+ }
97
+ ```
98
+
99
+ ## 🌍 Join the Pruna AI community!
100
+
101
+ [![Twitter](https://img.shields.io/twitter/follow/PrunaAI?style=social)](https://twitter.com/PrunaAI)
102
+ [![GitHub](https://img.shields.io/github/followers/PrunaAI?label=Follow%20%40PrunaAI&style=social)](https://github.com/PrunaAI)
103
+ [![LinkedIn](https://img.shields.io/badge/LinkedIn-Connect-blue)](https://www.linkedin.com/company/93832878/)
104
+ [![Discord](https://img.shields.io/badge/Discord-Join%20Us-blue?style=social&logo=discord)](https://discord.gg/JFQmtFKCjd)
105
+ [![Reddit](https://img.shields.io/reddit/subreddit-subscribers/PrunaAI?style=social)](https://www.reddit.com/r/PrunaAI/)
artifact_bytes.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd2662154e6d76b2b2b92e70c0cac3ccf534f9b74eb5b89819ec509083d00a50
3
+ size 8
config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SmolLM3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 128000,
8
+ "dtype": "float32",
9
+ "eos_token_id": 128012,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 11008,
14
+ "layer_types": [
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention"
51
+ ],
52
+ "max_position_embeddings": 65536,
53
+ "max_window_layers": 28,
54
+ "mlp_bias": false,
55
+ "model_type": "smollm3",
56
+ "no_rope_layer_interval": 4,
57
+ "no_rope_layers": [
58
+ 1,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 1,
63
+ 1,
64
+ 1,
65
+ 0,
66
+ 1,
67
+ 1,
68
+ 1,
69
+ 0,
70
+ 1,
71
+ 1,
72
+ 1,
73
+ 0,
74
+ 1,
75
+ 1,
76
+ 1,
77
+ 0,
78
+ 1,
79
+ 1,
80
+ 1,
81
+ 0,
82
+ 1,
83
+ 1,
84
+ 1,
85
+ 0,
86
+ 1,
87
+ 1,
88
+ 1,
89
+ 0,
90
+ 1,
91
+ 1,
92
+ 1,
93
+ 0
94
+ ],
95
+ "num_attention_heads": 16,
96
+ "num_hidden_layers": 36,
97
+ "num_key_value_heads": 4,
98
+ "pad_token_id": 128004,
99
+ "pretraining_tp": 2,
100
+ "rms_norm_eps": 1e-06,
101
+ "rope_scaling": null,
102
+ "rope_theta": 5000000.0,
103
+ "sliding_window": null,
104
+ "transformers_version": "4.57.0",
105
+ "use_cache": false,
106
+ "use_sliding_window": false,
107
+ "vocab_size": 128256
108
+ }
qmodel.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bba79a324d0c4c051f2b19423c4d4f4cd93b305fd405627787ff3a54faf9a9de
3
+ size 525655861
smash_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "batcher": null,
3
+ "cacher": null,
4
+ "compiler": "torch_compile",
5
+ "factorizer": null,
6
+ "kernel": null,
7
+ "pruner": null,
8
+ "quantizer": "hqq",
9
+ "hqq_backend": "torchao_int4",
10
+ "hqq_compute_dtype": "torch.bfloat16",
11
+ "hqq_force_hf_implementation": false,
12
+ "hqq_group_size": 128,
13
+ "hqq_use_torchao_kernels": true,
14
+ "hqq_weight_bits": 4,
15
+ "torch_compile_backend": "cudagraphs",
16
+ "torch_compile_dynamic": true,
17
+ "torch_compile_fullgraph": false,
18
+ "torch_compile_make_portable": true,
19
+ "torch_compile_max_kv_cache_size": 1600,
20
+ "torch_compile_mode": "reduce-overhead",
21
+ "torch_compile_seqlen_manual_cuda_graph": 800,
22
+ "torch_compile_target": "model",
23
+ "batch_size": 1,
24
+ "device": "cuda:0",
25
+ "device_map": null,
26
+ "save_fns": [
27
+ "hqq",
28
+ "save_before_apply"
29
+ ],
30
+ "load_fns": [
31
+ "torch_artifacts",
32
+ "hqq"
33
+ ],
34
+ "reapply_after_load": {
35
+ "factorizer": null,
36
+ "pruner": null,
37
+ "quantizer": null,
38
+ "kernel": null,
39
+ "cacher": null,
40
+ "compiler": "torch_compile",
41
+ "batcher": null
42
+ }
43
+ }