teamzero committed on
Commit
09b5abf
·
verified ·
1 Parent(s): 661b5bb

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +219 -177
README.md CHANGED
@@ -1,220 +1,262 @@
1
 
2
- <style>
3
- * { box-sizing: border-box; margin: 0; padding: 0; }
4
- body { font-family: var(--font-sans); color: var(--color-text-primary); }
5
- .page { max-width: 860px; margin: 0 auto; padding: 2rem 1rem 3rem; }
6
- .hero { text-align: center; padding: 3rem 1rem 2.5rem; }
7
- .hero img { height: 70px; object-fit: contain; margin-bottom: 1.75rem; display: block; margin-left: auto; margin-right: auto; }
8
- .hero h1 { font-size: 32px; font-weight: 500; letter-spacing: -0.5px; margin-bottom: 0.5rem; }
9
- .hero p { font-size: 15px; color: var(--color-text-secondary); max-width: 560px; margin: 0 auto 1.75rem; line-height: 1.7; }
10
- .badges { display: flex; gap: 8px; justify-content: center; flex-wrap: wrap; margin-bottom: 1.75rem; }
11
- .badge { font-size: 11px; padding: 4px 11px; border-radius: 99px; border: 0.5px solid var(--color-border-secondary); color: var(--color-text-secondary); }
12
- .badge.purple { background: #EEEDFE; color: #3C3489; border-color: #AFA9EC; }
13
- @media (prefers-color-scheme: dark) { .badge.purple { background: #3C3489; color: #CECBF6; border-color: #534AB7; } }
14
- .hugging { display: inline-flex; align-items: center; gap: 8px; font-size: 13px; padding: 9px 20px; border-radius: var(--border-radius-md); border: 0.5px solid var(--color-border-secondary); color: var(--color-text-secondary); background: var(--color-background-secondary); }
15
- .hugging code { font-family: var(--font-mono); font-size: 12px; color: var(--color-text-secondary); }
16
- .divider { border: none; border-top: 0.5px solid var(--color-border-tertiary); margin: 2.5rem 0; }
17
- .section { margin-bottom: 2.5rem; }
18
- .section-label { font-size: 11px; font-weight: 500; color: var(--color-text-secondary); text-transform: uppercase; letter-spacing: 0.09em; margin-bottom: 1rem; }
19
- .stat-row { display: grid; grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 12px; margin-bottom: 2.5rem; }
20
- .stat { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem; text-align: center; }
21
- .stat .val { font-size: 22px; font-weight: 500; margin-bottom: 3px; }
22
- .stat .lbl { font-size: 12px; color: var(--color-text-secondary); }
23
- .model-card { background: var(--color-background-primary); border: 0.5px solid var(--color-border-secondary); border-radius: var(--border-radius-lg); padding: 1.5rem; }
24
- .model-name { font-size: 22px; font-weight: 500; margin-bottom: 4px; }
25
- .model-sub { font-size: 14px; color: var(--color-text-secondary); margin-bottom: 1rem; line-height: 1.6; }
26
- .pill-row { display: flex; gap: 6px; flex-wrap: wrap; margin-bottom: 1.25rem; }
27
- .pill { font-size: 11px; padding: 3px 10px; border-radius: 99px; }
28
- .pill.blue { background: #E6F1FB; color: #0C447C; }
29
- .pill.teal { background: #E1F5EE; color: #085041; }
30
- .pill.amber { background: #FAEEDA; color: #633806; }
31
- .pill.purple { background: #EEEDFE; color: #3C3489; }
32
- @media (prefers-color-scheme: dark) {
33
- .pill.blue { background: #0C447C; color: #B5D4F4; }
34
- .pill.teal { background: #085041; color: #9FE1CB; }
35
- .pill.amber { background: #633806; color: #FAC775; }
36
- .pill.purple { background: #3C3489; color: #CECBF6; }
37
- }
38
- .model-meta { border-top: 0.5px solid var(--color-border-tertiary); padding-top: 1rem; display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); gap: 12px; }
39
- .meta-item .mk { font-size: 11px; color: var(--color-text-secondary); margin-bottom: 3px; }
40
- .meta-item .mv { font-size: 13px; font-weight: 500; }
41
- .feature-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(190px, 1fr)); gap: 12px; }
42
- .feature-card { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem 1.1rem; }
43
- .feature-card .ft { font-size: 14px; font-weight: 500; margin-bottom: 5px; }
44
- .feature-card .fd { font-size: 13px; color: var(--color-text-secondary); line-height: 1.55; }
45
- .bench-section { margin-bottom: 1.5rem; }
46
- .bench-title { font-size: 13px; font-weight: 500; margin-bottom: 10px; padding-bottom: 6px; border-bottom: 0.5px solid var(--color-border-tertiary); }
47
- table { width: 100%; border-collapse: collapse; font-size: 13px; table-layout: fixed; }
48
- th { text-align: left; padding: 7px 10px; color: var(--color-text-secondary); font-weight: 500; border-bottom: 0.5px solid var(--color-border-tertiary); }
49
- td { padding: 7px 10px; border-bottom: 0.5px solid var(--color-border-tertiary); }
50
- tr:last-child td { border-bottom: none; }
51
- td:not(:first-child), th:not(:first-child) { text-align: right; }
52
- .best { font-weight: 500; color: #1D9E75; }
53
- .framework-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(140px, 1fr)); gap: 8px; }
54
- .fw { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 10px 14px; font-size: 13px; }
55
- .fw .fwn { font-weight: 500; margin-bottom: 2px; }
56
- .fw .fwd { font-size: 11px; color: var(--color-text-secondary); }
57
- .arch-row { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; }
58
- .arch-card { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem 1.1rem; }
59
- .arch-card .ak { font-size: 11px; color: var(--color-text-secondary); margin-bottom: 3px; }
60
- .arch-card .av { font-size: 14px; font-weight: 500; }
61
- .arch-card .ad { font-size: 12px; color: var(--color-text-secondary); margin-top: 4px; line-height: 1.5; }
62
- .footer { text-align: center; padding-top: 2rem; font-size: 12px; color: var(--color-text-secondary); line-height: 2; }
63
- </style>
64
 
65
- <div class="page">
66
-
67
- <div class="hero">
68
- <img src="https://i.ibb.co/rGS6dBcf/logo-Astro-X.png" alt="AstroX AI">
69
- <p>A frontier-class Mixture-of-Experts language model — competitive with leading closed-source models at a fraction of the training cost. Fully open-source and commercially licensed.</p>
70
- <div class="badges">
71
- <span class="badge purple">Mixture-of-Experts</span>
72
- <span class="badge">671B total params</span>
73
- <span class="badge">37B activated per token</span>
74
- <span class="badge">128K context</span>
75
- <span class="badge">FP8 training</span>
76
- <span class="badge">MIT License</span>
77
  </div>
78
- <div class="hugging">
79
- <span style="font-size:13px; color: var(--color-text-secondary);">huggingface.co/</span><code>teamzero/astrox</code>
80
  </div>
81
  </div>
82
 
83
- <div class="stat-row">
84
- <div class="stat"><div class="val">671B</div><div class="lbl">Total params</div></div>
85
- <div class="stat"><div class="val">37B</div><div class="lbl">Active per token</div></div>
86
- <div class="stat"><div class="val">128K</div><div class="lbl">Context window</div></div>
87
- <div class="stat"><div class="val">2.79M</div><div class="lbl">H800 GPU hours</div></div>
 
 
 
 
 
 
 
 
 
 
 
 
88
  </div>
89
 
90
- <hr class="divider">
91
 
92
- <div class="section">
93
- <div class="section-label">Model</div>
94
- <div class="model-card">
95
- <div class="model-name">AstroX</div>
96
- <div class="model-sub">Instruction-tuned chat model with reinforcement learning and R1 long-chain-of-thought reasoning distillation. The only available model in the AstroX family.</div>
97
- <div class="pill-row">
98
- <span class="pill purple">MoE</span>
99
- <span class="pill blue">671B / 37B active</span>
100
- <span class="pill teal">128K context</span>
101
- <span class="pill amber">FP8 weights</span>
102
- </div>
103
- <div class="model-meta">
104
- <div class="meta-item"><div class="mk">Architecture</div><div class="mv">DeepSeekMoE + MLA</div></div>
105
- <div class="meta-item"><div class="mk">Experts</div><div class="mv">256 total · 8 active</div></div>
106
- <div class="meta-item"><div class="mk">Pre-training data</div><div class="mv">14.8T tokens</div></div>
107
- <div class="meta-item"><div class="mk">License</div><div class="mv">MIT + Model Agreement</div></div>
108
  </div>
109
  </div>
110
  </div>
111
 
112
- <div class="section">
113
- <div class="section-label">Architecture highlights</div>
114
- <div class="arch-row">
115
- <div class="arch-card">
116
- <div class="ak">Attention</div>
117
- <div class="av">Multi-head Latent Attention (MLA)</div>
118
- <div class="ad">Reduces KV cache memory footprint significantly vs. standard MHA, enabling practical long-context inference.</div>
119
- </div>
120
- <div class="arch-card">
121
- <div class="ak">Load balancing</div>
122
- <div class="av">Auxiliary-loss-free strategy</div>
123
- <div class="ad">Balances expert load without the performance penalty of traditional auxiliary loss terms.</div>
124
- </div>
125
- <div class="arch-card">
126
- <div class="ak">Training objective</div>
127
- <div class="av">Multi-Token Prediction (MTP)</div>
128
- <div class="ad">Predicts multiple future tokens simultaneously, improving performance and enabling speculative decoding.</div>
129
- </div>
130
- <div class="arch-card">
131
- <div class="ak">Post-training</div>
132
- <div class="av">R1 reasoning distillation</div>
133
- <div class="ad">Verification and reflection patterns from DeepSeek-R1 are distilled into the model while keeping output style controlled.</div>
134
  </div>
135
  </div>
136
  </div>
137
 
138
- <div class="section">
139
- <div class="section-label">Key innovations</div>
140
- <div class="feature-grid">
141
- <div class="feature-card">
142
- <div class="ft">FP8 mixed precision</div>
143
- <div class="fd">First large-scale FP8 training validated on a 671B model. Cuts compute cost without quality loss.</div>
144
  </div>
145
- <div class="feature-card">
146
- <div class="ft">Zero training instability</div>
147
- <div class="fd">No irrecoverable loss spikes and no rollbacks throughout the entire pre-training run.</div>
148
  </div>
149
- <div class="feature-card">
150
- <div class="ft">Full comm/compute overlap</div>
151
- <div class="fd">Co-designed algorithms and hardware nearly eliminate the communication bottleneck in cross-node MoE training.</div>
152
  </div>
153
- <div class="feature-card">
154
- <div class="ft">Speculative decoding ready</div>
155
- <div class="fd">The MTP module can be repurposed as a draft head for inference acceleration out of the box.</div>
156
  </div>
157
  </div>
158
  </div>
159
 
160
- <div class="section">
161
- <div class="section-label">Benchmark performance — math &amp; reasoning</div>
162
- <table>
163
- <tr><th style="width:44%">Benchmark</th><th>GPT-4o</th><th>Claude 3.5 Sonnet</th><th>AstroX</th></tr>
164
- <tr><td>AIME 2024 (Pass@1)</td><td>9.3</td><td>16.0</td><td class="best">39.2</td></tr>
165
- <tr><td>MATH-500 (EM)</td><td>74.6</td><td>78.3</td><td class="best">90.2</td></tr>
166
- <tr><td>CNMO 2024 (Pass@1)</td><td>10.8</td><td>13.1</td><td class="best">43.2</td></tr>
167
- <tr><td>GSM8K (EM)</td><td>—</td><td>—</td><td class="best">89.3</td></tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  </table>
169
  </div>
170
 
171
- <div class="section">
172
- <div class="section-label">Benchmark performance — code</div>
173
- <table>
174
- <tr><th style="width:44%">Benchmark</th><th>GPT-4o</th><th>Claude 3.5 Sonnet</th><th>AstroX</th></tr>
175
- <tr><td>LiveCodeBench (Pass@1)</td><td>34.2</td><td>32.8</td><td class="best">37.6</td></tr>
176
- <tr><td>Codeforces (Percentile)</td><td>23.6</td><td>20.3</td><td class="best">51.6</td></tr>
177
- <tr><td>Aider-Polyglot (Acc.)</td><td>16.0</td><td>45.3</td><td class="best">49.6</td></tr>
178
- <tr><td>HumanEval-Mul (Pass@1)</td><td>80.5</td><td>81.7</td><td class="best">82.6</td></tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  </table>
180
  </div>
181
 
182
- <div class="section">
183
- <div class="section-label">Benchmark performance — general</div>
184
- <table>
185
- <tr><th style="width:44%">Benchmark</th><th>GPT-4o</th><th>Claude 3.5 Sonnet</th><th>AstroX</th></tr>
186
- <tr><td>MMLU (EM)</td><td>87.2</td><td>88.3</td><td class="best">88.5</td></tr>
187
- <tr><td>Arena-Hard</td><td>80.4</td><td>85.2</td><td class="best">85.5</td></tr>
188
- <tr><td>AlpacaEval 2.0</td><td>51.1</td><td>52.0</td><td class="best">70.0</td></tr>
189
- <tr><td>DROP (3-shot F1)</td><td>83.7</td><td>88.3</td><td class="best">91.6</td></tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  </table>
191
  </div>
192
 
193
- <div class="section">
194
- <div class="section-label">Supported inference frameworks</div>
195
- <div class="framework-grid">
196
- <div class="fw"><div class="fwn">SGLang</div><div class="fwd">Recommended · FP8 + BF16 · NVIDIA + AMD</div></div>
197
- <div class="fw"><div class="fwn">vLLM</div><div class="fwd">FP8 + BF16 · pipeline parallelism</div></div>
198
- <div class="fw"><div class="fwn">LMDeploy</div><div class="fwd">Offline + online · PyTorch-native</div></div>
199
- <div class="fw"><div class="fwn">TensorRT-LLM</div><div class="fwd">BF16 · INT4/INT8 quant</div></div>
200
- <div class="fw"><div class="fwn">AMD GPU</div><div class="fwd">via SGLang · FP8 + BF16</div></div>
201
- <div class="fw"><div class="fwn">Huawei Ascend</div><div class="fwd">via MindIE · BF16</div></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  </div>
203
  </div>
204
 
205
- <div class="section">
206
- <div class="section-label">Quick start</div>
207
- <div style="background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem 1.25rem;">
208
- <div style="font-size: 12px; color: var(--color-text-secondary); margin-bottom: 8px;">Convert FP8 weights to BF16</div>
209
- <code style="font-family: var(--font-mono); font-size: 12px; color: var(--color-text-primary); display: block; line-height: 1.8;">python fp8_cast_bf16.py \<br>&nbsp;&nbsp;--input-fp8-hf-path /path/to/fp8_weights \<br>&nbsp;&nbsp;--output-bf16-hf-path /path/to/bf16_weights</code>
210
  </div>
211
- <div style="background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem 1.25rem; margin-top: 8px;">
212
- <div style="font-size: 12px; color: var(--color-text-secondary); margin-bottom: 8px;">Run interactive inference (2 nodes · 8 GPUs each)</div>
213
- <code style="font-family: var(--font-mono); font-size: 12px; color: var(--color-text-primary); display: block; line-height: 1.8;">torchrun --nnodes 2 --nproc-per-node 8 generate.py \<br>&nbsp;&nbsp;--node-rank $RANK --master-addr $ADDR \<br>&nbsp;&nbsp;--ckpt-path /path/to/AstroX \<br>&nbsp;&nbsp;--config configs/config_671B.json \<br>&nbsp;&nbsp;--interactive --temperature 0.7 --max-new-tokens 200</code>
214
  </div>
215
  </div>
216
 
217
- <div class="footer">
218
  Code license: MIT &nbsp;·&nbsp; Model license: Model Agreement &nbsp;·&nbsp; Commercial use supported<br>
219
  huggingface.co/teamzero/astrox
220
  </div>
 
1
 
2
+ <div style="max-width:860px;margin:0 auto;padding:2rem 1rem 3rem;font-family:var(--font-sans);color:var(--color-text-primary);">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ <div style="text-align:center;padding:3rem 1rem 2.5rem;">
5
+ <img src="https://i.ibb.co/rGS6dBcf/logo-Astro-X.png" alt="AstroX AI" style="height:70px;object-fit:contain;display:block;margin:0 auto 1.75rem;">
6
+ <p style="font-size:15px;color:var(--color-text-secondary);max-width:560px;margin:0 auto 1.75rem;line-height:1.7;">A frontier-class Mixture-of-Experts language model — competitive with leading closed-source models at a fraction of the training cost. Fully open-source and commercially licensed.</p>
7
+ <div style="display:flex;gap:8px;justify-content:center;flex-wrap:wrap;margin-bottom:1.75rem;">
8
+ <span style="font-size:11px;padding:4px 11px;border-radius:99px;background:#EEEDFE;color:#3C3489;border:0.5px solid #AFA9EC;">Mixture-of-Experts</span>
9
+ <span style="font-size:11px;padding:4px 11px;border-radius:99px;border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);">671B total params</span>
10
+ <span style="font-size:11px;padding:4px 11px;border-radius:99px;border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);">37B activated per token</span>
11
+ <span style="font-size:11px;padding:4px 11px;border-radius:99px;border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);">128K context</span>
12
+ <span style="font-size:11px;padding:4px 11px;border-radius:99px;border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);">FP8 training</span>
13
+ <span style="font-size:11px;padding:4px 11px;border-radius:99px;border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);">MIT License</span>
 
 
14
  </div>
15
+ <div style="display:inline-flex;align-items:center;gap:8px;font-size:13px;padding:9px 20px;border-radius:var(--border-radius-md);border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);background:var(--color-background-secondary);">
16
+ <span>huggingface.co/</span><code style="font-family:var(--font-mono);font-size:12px;">teamzero/astrox</code>
17
  </div>
18
  </div>
19
 
20
+ <div style="display:grid;grid-template-columns:repeat(4,minmax(0,1fr));gap:12px;margin-bottom:2.5rem;">
21
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem;text-align:center;">
22
+ <div style="font-size:22px;font-weight:500;margin-bottom:3px;">671B</div>
23
+ <div style="font-size:12px;color:var(--color-text-secondary);">Total params</div>
24
+ </div>
25
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem;text-align:center;">
26
+ <div style="font-size:22px;font-weight:500;margin-bottom:3px;">37B</div>
27
+ <div style="font-size:12px;color:var(--color-text-secondary);">Active per token</div>
28
+ </div>
29
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem;text-align:center;">
30
+ <div style="font-size:22px;font-weight:500;margin-bottom:3px;">128K</div>
31
+ <div style="font-size:12px;color:var(--color-text-secondary);">Context window</div>
32
+ </div>
33
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem;text-align:center;">
34
+ <div style="font-size:22px;font-weight:500;margin-bottom:3px;">2.79M</div>
35
+ <div style="font-size:12px;color:var(--color-text-secondary);">H800 GPU hours</div>
36
+ </div>
37
  </div>
38
 
39
+ <hr style="border:none;border-top:0.5px solid var(--color-border-tertiary);margin:2.5rem 0;">
40
 
41
+ <div style="margin-bottom:2.5rem;">
42
+ <div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Model</div>
43
+ <div style="background:var(--color-background-primary);border:0.5px solid var(--color-border-secondary);border-radius:var(--border-radius-lg);padding:1.5rem;">
44
+ <div style="font-size:22px;font-weight:500;margin-bottom:4px;">AstroX</div>
45
+ <div style="font-size:14px;color:var(--color-text-secondary);margin-bottom:1rem;line-height:1.6;">Instruction-tuned chat model with reinforcement learning and advanced long-chain-of-thought reasoning distillation. The only available model in the AstroX family.</div>
46
+ <div style="display:flex;gap:6px;flex-wrap:wrap;margin-bottom:1.25rem;">
47
+ <span style="font-size:11px;padding:3px 10px;border-radius:99px;background:#EEEDFE;color:#3C3489;">MoE</span>
48
+ <span style="font-size:11px;padding:3px 10px;border-radius:99px;background:#E6F1FB;color:#0C447C;">671B / 37B active</span>
49
+ <span style="font-size:11px;padding:3px 10px;border-radius:99px;background:#E1F5EE;color:#085041;">128K context</span>
50
+ <span style="font-size:11px;padding:3px 10px;border-radius:99px;background:#FAEEDA;color:#633806;">FP8 weights</span>
51
+ </div>
52
+ <div style="border-top:0.5px solid var(--color-border-tertiary);padding-top:1rem;display:grid;grid-template-columns:repeat(auto-fit,minmax(160px,1fr));gap:12px;">
53
+ <div><div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Architecture</div><div style="font-size:13px;font-weight:500;">MoE + Multi-head Latent Attention</div></div>
54
+ <div><div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Experts</div><div style="font-size:13px;font-weight:500;">256 total · 8 active</div></div>
55
+ <div><div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Pre-training data</div><div style="font-size:13px;font-weight:500;">14.8T tokens</div></div>
56
+ <div><div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">License</div><div style="font-size:13px;font-weight:500;">MIT + Model Agreement</div></div>
57
  </div>
58
  </div>
59
  </div>
60
 
61
+ <div style="margin-bottom:2.5rem;">
62
+ <div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Architecture highlights</div>
63
+ <div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;">
64
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
65
+ <div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Attention</div>
66
+ <div style="font-size:14px;font-weight:500;margin-bottom:4px;">Multi-head Latent Attention (MLA)</div>
67
+ <div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">Reduces KV cache memory footprint significantly vs. standard MHA, enabling practical long-context inference.</div>
68
+ </div>
69
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
70
+ <div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Load balancing</div>
71
+ <div style="font-size:14px;font-weight:500;margin-bottom:4px;">Auxiliary-loss-free strategy</div>
72
+ <div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">Balances expert load without the performance penalty of traditional auxiliary loss terms.</div>
73
+ </div>
74
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
75
+ <div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Training objective</div>
76
+ <div style="font-size:14px;font-weight:500;margin-bottom:4px;">Multi-Token Prediction (MTP)</div>
77
+ <div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">Predicts multiple future tokens simultaneously, boosting performance and enabling speculative decoding.</div>
78
+ </div>
79
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
80
+ <div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Post-training</div>
81
+ <div style="font-size:14px;font-weight:500;margin-bottom:4px;">Reasoning distillation</div>
82
+ <div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">Verification and reflection patterns distilled from a long-CoT model, keeping output style and length controlled.</div>
83
  </div>
84
  </div>
85
  </div>
86
 
87
+ <div style="margin-bottom:2.5rem;">
88
+ <div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Key innovations</div>
89
+ <div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(190px,1fr));gap:12px;">
90
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
91
+ <div style="font-size:14px;font-weight:500;margin-bottom:5px;">FP8 mixed precision</div>
92
+ <div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">First validated large-scale FP8 training. Cuts compute cost without quality loss.</div>
93
  </div>
94
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
95
+ <div style="font-size:14px;font-weight:500;margin-bottom:5px;">Zero training instability</div>
96
+ <div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">No irrecoverable loss spikes and no rollbacks throughout the entire pre-training run.</div>
97
  </div>
98
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
99
+ <div style="font-size:14px;font-weight:500;margin-bottom:5px;">Full comm/compute overlap</div>
100
+ <div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">Co-designed algorithms and hardware nearly eliminate cross-node MoE communication bottlenecks.</div>
101
  </div>
102
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
103
+ <div style="font-size:14px;font-weight:500;margin-bottom:5px;">Speculative decoding ready</div>
104
+ <div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">The MTP module doubles as a draft head for inference acceleration out of the box.</div>
105
  </div>
106
  </div>
107
  </div>
108
 
109
+ <div style="margin-bottom:2.5rem;">
110
+ <div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Benchmark performance — math &amp; reasoning</div>
111
+ <table style="width:100%;border-collapse:collapse;font-size:13px;table-layout:fixed;">
112
+ <tr>
113
+ <th style="text-align:left;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);width:44%;">Benchmark</th>
114
+ <th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">GPT-4o</th>
115
+ <th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">Claude 3.5 Sonnet</th>
116
+ <th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">AstroX</th>
117
+ </tr>
118
+ <tr>
119
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">AIME 2024 (Pass@1)</td>
120
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">9.3</td>
121
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">16.0</td>
122
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">39.2</td>
123
+ </tr>
124
+ <tr>
125
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">MATH-500 (EM)</td>
126
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">74.6</td>
127
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">78.3</td>
128
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">90.2</td>
129
+ </tr>
130
+ <tr>
131
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">CNMO 2024 (Pass@1)</td>
132
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">10.8</td>
133
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">13.1</td>
134
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">43.2</td>
135
+ </tr>
136
+ <tr>
137
+ <td style="padding:7px 10px;">GSM8K (EM)</td>
138
+ <td style="padding:7px 10px;text-align:right;">—</td>
139
+ <td style="padding:7px 10px;text-align:right;">—</td>
140
+ <td style="padding:7px 10px;text-align:right;font-weight:500;color:#1D9E75;">89.3</td>
141
+ </tr>
142
  </table>
143
  </div>
144
 
145
+ <div style="margin-bottom:2.5rem;">
146
+ <div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Benchmark performance — code</div>
147
+ <table style="width:100%;border-collapse:collapse;font-size:13px;table-layout:fixed;">
148
+ <tr>
149
+ <th style="text-align:left;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);width:44%;">Benchmark</th>
150
+ <th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">GPT-4o</th>
151
+ <th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">Claude 3.5 Sonnet</th>
152
+ <th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">AstroX</th>
153
+ </tr>
154
+ <tr>
155
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">LiveCodeBench (Pass@1)</td>
156
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">34.2</td>
157
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">32.8</td>
158
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">37.6</td>
159
+ </tr>
160
+ <tr>
161
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">Codeforces (Percentile)</td>
162
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">23.6</td>
163
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">20.3</td>
164
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">51.6</td>
165
+ </tr>
166
+ <tr>
167
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">Aider-Polyglot (Acc.)</td>
168
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">16.0</td>
169
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">45.3</td>
170
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">49.6</td>
171
+ </tr>
172
+ <tr>
173
+ <td style="padding:7px 10px;">HumanEval-Mul (Pass@1)</td>
174
+ <td style="padding:7px 10px;text-align:right;">80.5</td>
175
+ <td style="padding:7px 10px;text-align:right;">81.7</td>
176
+ <td style="padding:7px 10px;text-align:right;font-weight:500;color:#1D9E75;">82.6</td>
177
+ </tr>
178
  </table>
179
  </div>
180
 
181
+ <div style="margin-bottom:2.5rem;"><!-- General benchmark comparison. Column headers use scope="col" and each benchmark name is a row header (scope="row") so assistive technology can announce which benchmark and model a score belongs to. Row headers reset text-align and font-weight to keep the original appearance. -->
181
+ <div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Benchmark performance — general</div>
182
+ <table style="width:100%;border-collapse:collapse;font-size:13px;table-layout:fixed;">
183
+ <tr>
184
+ <th scope="col" style="text-align:left;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);width:44%;">Benchmark</th>
185
+ <th scope="col" style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">GPT-4o</th>
186
+ <th scope="col" style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">Claude 3.5 Sonnet</th>
187
+ <th scope="col" style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">AstroX</th>
188
+ </tr>
189
+ <tr>
190
+ <th scope="row" style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:left;font-weight:400;">MMLU (EM)</th>
191
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">87.2</td>
192
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">88.3</td>
193
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">88.5</td>
194
+ </tr>
195
+ <tr>
196
+ <th scope="row" style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:left;font-weight:400;">Arena-Hard</th>
197
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">80.4</td>
198
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">85.2</td>
199
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">85.5</td>
200
+ </tr>
201
+ <tr>
202
+ <th scope="row" style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:left;font-weight:400;">AlpacaEval 2.0</th>
203
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">51.1</td>
204
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">52.0</td>
205
+ <td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">70.0</td>
206
+ </tr>
207
+ <tr>
208
+ <th scope="row" style="padding:7px 10px;text-align:left;font-weight:400;">DROP (3-shot F1)</th>
209
+ <td style="padding:7px 10px;text-align:right;">83.7</td>
210
+ <td style="padding:7px 10px;text-align:right;">88.3</td>
211
+ <td style="padding:7px 10px;text-align:right;font-weight:500;color:#1D9E75;">91.6</td>
212
+ </tr>
213
  </table>
214
  </div>
216
 
217
+ <div style="margin-bottom:2.5rem;"><!-- Responsive card grid: one card per supported runtime or hardware target, each with a title line and a short capability note. -->
217
+ <div style="margin-bottom:1rem;letter-spacing:0.09em;text-transform:uppercase;color:var(--color-text-secondary);font-weight:500;font-size:11px;">Supported inference frameworks</div>
218
+ <div style="gap:8px;grid-template-columns:repeat(auto-fill,minmax(140px,1fr));display:grid;">
219
+ <div style="padding:10px 14px;border-radius:var(--border-radius-md);background:var(--color-background-secondary);">
220
+ <div style="margin-bottom:2px;font-weight:500;font-size:13px;">SGLang</div>
221
+ <div style="color:var(--color-text-secondary);font-size:11px;">Recommended · FP8 + BF16 · NVIDIA + AMD</div>
222
+ </div>
223
+ <div style="padding:10px 14px;border-radius:var(--border-radius-md);background:var(--color-background-secondary);">
224
+ <div style="margin-bottom:2px;font-weight:500;font-size:13px;">vLLM</div>
225
+ <div style="color:var(--color-text-secondary);font-size:11px;">FP8 + BF16 · pipeline parallelism</div>
226
+ </div>
227
+ <div style="padding:10px 14px;border-radius:var(--border-radius-md);background:var(--color-background-secondary);">
228
+ <div style="margin-bottom:2px;font-weight:500;font-size:13px;">LMDeploy</div>
229
+ <div style="color:var(--color-text-secondary);font-size:11px;">Offline + online · PyTorch-native</div>
230
+ </div>
231
+ <div style="padding:10px 14px;border-radius:var(--border-radius-md);background:var(--color-background-secondary);">
232
+ <div style="margin-bottom:2px;font-weight:500;font-size:13px;">TensorRT-LLM</div>
233
+ <div style="color:var(--color-text-secondary);font-size:11px;">BF16 · INT4/INT8 quant</div>
234
+ </div>
235
+ <div style="padding:10px 14px;border-radius:var(--border-radius-md);background:var(--color-background-secondary);">
236
+ <div style="margin-bottom:2px;font-weight:500;font-size:13px;">AMD GPU</div>
237
+ <div style="color:var(--color-text-secondary);font-size:11px;">via SGLang · FP8 + BF16</div>
238
+ </div>
239
+ <div style="padding:10px 14px;border-radius:var(--border-radius-md);background:var(--color-background-secondary);">
240
+ <div style="margin-bottom:2px;font-weight:500;font-size:13px;">Huawei Ascend</div>
241
+ <div style="color:var(--color-text-secondary);font-size:11px;">via MindIE · BF16</div>
242
+ </div>
243
  </div>
244
  </div>
246
 
247
+ <div style="margin-bottom:2.5rem;"><!-- Quick-start shell commands. Continuation lines are indented with ordinary spaces kept visible by white-space:pre-wrap, not with non-breaking-space entities, which some browsers copy as U+00A0 and which shells do not treat as argument separators. -->
248
+ <div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Quick start</div>
249
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.25rem;margin-bottom:8px;">
250
+ <div style="font-size:12px;color:var(--color-text-secondary);margin-bottom:8px;">Convert FP8 weights to BF16</div>
251
+ <code style="font-family:var(--font-mono);font-size:12px;color:var(--color-text-primary);display:block;line-height:1.8;white-space:pre-wrap;">python fp8_cast_bf16.py \<br>  --input-fp8-hf-path /path/to/fp8_weights \<br>  --output-bf16-hf-path /path/to/bf16_weights</code>
252
  </div>
253
+ <div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.25rem;">
254
+ <div style="font-size:12px;color:var(--color-text-secondary);margin-bottom:8px;">Run interactive inference (2 nodes · 8 GPUs each)</div>
255
+ <code style="font-family:var(--font-mono);font-size:12px;color:var(--color-text-primary);display:block;line-height:1.8;white-space:pre-wrap;">torchrun --nnodes 2 --nproc-per-node 8 generate.py \<br>  --node-rank $RANK --master-addr $ADDR \<br>  --ckpt-path /path/to/AstroX \<br>  --config configs/config_671B.json \<br>  --interactive --temperature 0.7 --max-new-tokens 200</code>
256
  </div>
257
  </div>
258
 
259
+ <div style="text-align:center;padding-top:2rem;font-size:12px;color:var(--color-text-secondary);line-height:2;"><!-- Footer: licensing summary and the repository address. The address is a real link so it is navigable; color:inherit keeps the muted footer colour. NOTE(review): confirm the repository path and its casing before publishing. -->
260
  Code license: MIT &nbsp;·&nbsp; Model license: Model Agreement &nbsp;·&nbsp; Commercial use supported<br>
261
  <a href="https://huggingface.co/teamzero/astrox" style="color:inherit;">huggingface.co/teamzero/astrox</a>
262
  </div>