fs90 committed on
Commit
41202ce
·
verified ·
1 Parent(s): 69afb14

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +55 -0
  2. config.json +26 -0
  3. model.safetensors +3 -0
  4. training_config.json +30 -0
README.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: oxidizr
4
+ tags:
5
+ - oxidizr
6
+ - llm
7
+ - mamba
8
+ pipeline_tag: text-generation
9
+ ---
10
+
11
+ # nano-start_64_26m_f32
12
+
13
+ Trained with [oxidizr](https://github.com/farhan-syah/oxidizr).
14
+
15
+ ## Model Details
16
+
17
+ | Property | Value |
18
+ |----------|-------|
19
+ | Parameters | 26.73M |
20
+ | Architecture | 3 Mamba2 + 1 MLA + MoE (2 experts, top-1) |
21
+ | Vocab Size | 100315 |
22
+ | Max Seq Length | 64 |
23
+ | Hidden Size | 128 |
24
+ | Layers | 4 |
25
+
26
+ ## Training Details
27
+
28
+ | Property | Value |
29
+ |----------|-------|
30
+ | Checkpoint | final |
31
+ | Final Loss | 0.0738 |
32
+ | Total Steps | 241 |
33
+ | Learning Rate | 2.00e-3 |
34
+
35
+ ## Usage
36
+
37
+ ### With blazr (recommended)
38
+
39
+ ```bash
40
+ # Generate text
41
+ blazr generate --model fs90/nano-start_64_26m_f32 --prompt "Hello, world!"
42
+
43
+ # Start inference server
44
+ blazr serve --model fs90/nano-start_64_26m_f32 --port 8080
45
+ ```
46
+
47
+ ### Download
48
+
49
+ ```bash
50
+ # Clone the model
51
+ git clone https://huggingface.co/fs90/nano-start_64_26m_f32
52
+
53
+ # Or use huggingface-cli
54
+ huggingface-cli download fs90/nano-start_64_26m_f32 --local-dir ./model
55
+ ```
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "hidden_size": 128,
3
+ "num_layers": 4,
4
+ "vocab_size": 100315,
5
+ "mamba2_num_heads": 16,
6
+ "mamba2_head_dim": 16,
7
+ "mamba2_state_size": 32,
8
+ "mamba2_chunk_size": 32,
9
+ "mamba2_expand": 2,
10
+ "mamba2_conv_kernel": 4,
11
+ "num_attention_heads": 4,
12
+ "kv_latent_dim": 64,
13
+ "q_latent_dim": 64,
14
+ "d_rope": 8,
15
+ "num_experts": 2,
16
+ "experts_per_tok": 1,
17
+ "shared_expert_enabled": true,
18
+ "intermediate_size": 512,
19
+ "mamba_layers": [
20
+ 0,
21
+ 1,
22
+ 2
23
+ ],
24
+ "rms_norm_eps": 0.00001,
25
+ "max_seq_len": 64
26
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f21d406137fc3d107bb53d6167d7b84a1414f97607ddc5f741d75c5cccd8da4f
3
+ size 106677352
training_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model": {
4
+ "dtype": "f32",
5
+ "experts_per_tok": 1,
6
+ "hidden_size": 128,
7
+ "max_seq_len": 64,
8
+ "name": "nano-start",
9
+ "num_experts": 2,
10
+ "num_heads": 4,
11
+ "num_layers": 4,
12
+ "vocab_size": 100315
13
+ },
14
+ "trainer": {
15
+ "batch_size": 4,
16
+ "effective_batch_size": 8,
17
+ "gradient_accumulation": 2,
18
+ "learning_rate": 0.002,
19
+ "max_steps": 0,
20
+ "num_epochs": 20,
21
+ "seq_len": 64,
22
+ "total_steps": 260
23
+ }
24
+ },
25
+ "dataset_size": 6379,
26
+ "device": "Cuda(CudaDevice(DeviceId(1)))",
27
+ "error": null,
28
+ "run_dir": "./runs/20251205_144741",
29
+ "status": "completed"
30
+ }