Update README.md
Browse files
README.md
CHANGED
|
@@ -1,220 +1,262 @@
|
|
| 1 |
|
| 2 |
-
<style>
|
| 3 |
-
* { box-sizing: border-box; margin: 0; padding: 0; }
|
| 4 |
-
body { font-family: var(--font-sans); color: var(--color-text-primary); }
|
| 5 |
-
.page { max-width: 860px; margin: 0 auto; padding: 2rem 1rem 3rem; }
|
| 6 |
-
.hero { text-align: center; padding: 3rem 1rem 2.5rem; }
|
| 7 |
-
.hero img { height: 70px; object-fit: contain; margin-bottom: 1.75rem; display: block; margin-left: auto; margin-right: auto; }
|
| 8 |
-
.hero h1 { font-size: 32px; font-weight: 500; letter-spacing: -0.5px; margin-bottom: 0.5rem; }
|
| 9 |
-
.hero p { font-size: 15px; color: var(--color-text-secondary); max-width: 560px; margin: 0 auto 1.75rem; line-height: 1.7; }
|
| 10 |
-
.badges { display: flex; gap: 8px; justify-content: center; flex-wrap: wrap; margin-bottom: 1.75rem; }
|
| 11 |
-
.badge { font-size: 11px; padding: 4px 11px; border-radius: 99px; border: 0.5px solid var(--color-border-secondary); color: var(--color-text-secondary); }
|
| 12 |
-
.badge.purple { background: #EEEDFE; color: #3C3489; border-color: #AFA9EC; }
|
| 13 |
-
@media (prefers-color-scheme: dark) { .badge.purple { background: #3C3489; color: #CECBF6; border-color: #534AB7; } }
|
| 14 |
-
.hugging { display: inline-flex; align-items: center; gap: 8px; font-size: 13px; padding: 9px 20px; border-radius: var(--border-radius-md); border: 0.5px solid var(--color-border-secondary); color: var(--color-text-secondary); background: var(--color-background-secondary); }
|
| 15 |
-
.hugging code { font-family: var(--font-mono); font-size: 12px; color: var(--color-text-secondary); }
|
| 16 |
-
.divider { border: none; border-top: 0.5px solid var(--color-border-tertiary); margin: 2.5rem 0; }
|
| 17 |
-
.section { margin-bottom: 2.5rem; }
|
| 18 |
-
.section-label { font-size: 11px; font-weight: 500; color: var(--color-text-secondary); text-transform: uppercase; letter-spacing: 0.09em; margin-bottom: 1rem; }
|
| 19 |
-
.stat-row { display: grid; grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 12px; margin-bottom: 2.5rem; }
|
| 20 |
-
.stat { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem; text-align: center; }
|
| 21 |
-
.stat .val { font-size: 22px; font-weight: 500; margin-bottom: 3px; }
|
| 22 |
-
.stat .lbl { font-size: 12px; color: var(--color-text-secondary); }
|
| 23 |
-
.model-card { background: var(--color-background-primary); border: 0.5px solid var(--color-border-secondary); border-radius: var(--border-radius-lg); padding: 1.5rem; }
|
| 24 |
-
.model-name { font-size: 22px; font-weight: 500; margin-bottom: 4px; }
|
| 25 |
-
.model-sub { font-size: 14px; color: var(--color-text-secondary); margin-bottom: 1rem; line-height: 1.6; }
|
| 26 |
-
.pill-row { display: flex; gap: 6px; flex-wrap: wrap; margin-bottom: 1.25rem; }
|
| 27 |
-
.pill { font-size: 11px; padding: 3px 10px; border-radius: 99px; }
|
| 28 |
-
.pill.blue { background: #E6F1FB; color: #0C447C; }
|
| 29 |
-
.pill.teal { background: #E1F5EE; color: #085041; }
|
| 30 |
-
.pill.amber { background: #FAEEDA; color: #633806; }
|
| 31 |
-
.pill.purple { background: #EEEDFE; color: #3C3489; }
|
| 32 |
-
@media (prefers-color-scheme: dark) {
|
| 33 |
-
.pill.blue { background: #0C447C; color: #B5D4F4; }
|
| 34 |
-
.pill.teal { background: #085041; color: #9FE1CB; }
|
| 35 |
-
.pill.amber { background: #633806; color: #FAC775; }
|
| 36 |
-
.pill.purple { background: #3C3489; color: #CECBF6; }
|
| 37 |
-
}
|
| 38 |
-
.model-meta { border-top: 0.5px solid var(--color-border-tertiary); padding-top: 1rem; display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); gap: 12px; }
|
| 39 |
-
.meta-item .mk { font-size: 11px; color: var(--color-text-secondary); margin-bottom: 3px; }
|
| 40 |
-
.meta-item .mv { font-size: 13px; font-weight: 500; }
|
| 41 |
-
.feature-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(190px, 1fr)); gap: 12px; }
|
| 42 |
-
.feature-card { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem 1.1rem; }
|
| 43 |
-
.feature-card .ft { font-size: 14px; font-weight: 500; margin-bottom: 5px; }
|
| 44 |
-
.feature-card .fd { font-size: 13px; color: var(--color-text-secondary); line-height: 1.55; }
|
| 45 |
-
.bench-section { margin-bottom: 1.5rem; }
|
| 46 |
-
.bench-title { font-size: 13px; font-weight: 500; margin-bottom: 10px; padding-bottom: 6px; border-bottom: 0.5px solid var(--color-border-tertiary); }
|
| 47 |
-
table { width: 100%; border-collapse: collapse; font-size: 13px; table-layout: fixed; }
|
| 48 |
-
th { text-align: left; padding: 7px 10px; color: var(--color-text-secondary); font-weight: 500; border-bottom: 0.5px solid var(--color-border-tertiary); }
|
| 49 |
-
td { padding: 7px 10px; border-bottom: 0.5px solid var(--color-border-tertiary); }
|
| 50 |
-
tr:last-child td { border-bottom: none; }
|
| 51 |
-
td:not(:first-child), th:not(:first-child) { text-align: right; }
|
| 52 |
-
.best { font-weight: 500; color: #1D9E75; }
|
| 53 |
-
.framework-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(140px, 1fr)); gap: 8px; }
|
| 54 |
-
.fw { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 10px 14px; font-size: 13px; }
|
| 55 |
-
.fw .fwn { font-weight: 500; margin-bottom: 2px; }
|
| 56 |
-
.fw .fwd { font-size: 11px; color: var(--color-text-secondary); }
|
| 57 |
-
.arch-row { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; }
|
| 58 |
-
.arch-card { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem 1.1rem; }
|
| 59 |
-
.arch-card .ak { font-size: 11px; color: var(--color-text-secondary); margin-bottom: 3px; }
|
| 60 |
-
.arch-card .av { font-size: 14px; font-weight: 500; }
|
| 61 |
-
.arch-card .ad { font-size: 12px; color: var(--color-text-secondary); margin-top: 4px; line-height: 1.5; }
|
| 62 |
-
.footer { text-align: center; padding-top: 2rem; font-size: 12px; color: var(--color-text-secondary); line-height: 2; }
|
| 63 |
-
</style>
|
| 64 |
|
| 65 |
-
<div
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
<
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
<span
|
| 72 |
-
<span
|
| 73 |
-
<span
|
| 74 |
-
<span
|
| 75 |
-
<span class="badge">FP8 training</span>
|
| 76 |
-
<span class="badge">MIT License</span>
|
| 77 |
</div>
|
| 78 |
-
<div
|
| 79 |
-
<span style="font-
|
| 80 |
</div>
|
| 81 |
</div>
|
| 82 |
|
| 83 |
-
<div
|
| 84 |
-
<div
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
</div>
|
| 89 |
|
| 90 |
-
<hr
|
| 91 |
|
| 92 |
-
<div
|
| 93 |
-
<div
|
| 94 |
-
<div
|
| 95 |
-
<div
|
| 96 |
-
<div
|
| 97 |
-
<div
|
| 98 |
-
<span
|
| 99 |
-
<span
|
| 100 |
-
<span
|
| 101 |
-
<span
|
| 102 |
-
</div>
|
| 103 |
-
<div
|
| 104 |
-
<div
|
| 105 |
-
<div
|
| 106 |
-
<div
|
| 107 |
-
<div
|
| 108 |
</div>
|
| 109 |
</div>
|
| 110 |
</div>
|
| 111 |
|
| 112 |
-
<div
|
| 113 |
-
<div
|
| 114 |
-
<div
|
| 115 |
-
<div
|
| 116 |
-
<div
|
| 117 |
-
<div
|
| 118 |
-
<div
|
| 119 |
-
</div>
|
| 120 |
-
<div
|
| 121 |
-
<div
|
| 122 |
-
<div
|
| 123 |
-
<div
|
| 124 |
-
</div>
|
| 125 |
-
<div
|
| 126 |
-
<div
|
| 127 |
-
<div
|
| 128 |
-
<div
|
| 129 |
-
</div>
|
| 130 |
-
<div
|
| 131 |
-
<div
|
| 132 |
-
<div
|
| 133 |
-
<div
|
| 134 |
</div>
|
| 135 |
</div>
|
| 136 |
</div>
|
| 137 |
|
| 138 |
-
<div
|
| 139 |
-
<div
|
| 140 |
-
<div
|
| 141 |
-
<div
|
| 142 |
-
<div
|
| 143 |
-
<div
|
| 144 |
</div>
|
| 145 |
-
<div
|
| 146 |
-
<div
|
| 147 |
-
<div
|
| 148 |
</div>
|
| 149 |
-
<div
|
| 150 |
-
<div
|
| 151 |
-
<div
|
| 152 |
</div>
|
| 153 |
-
<div
|
| 154 |
-
<div
|
| 155 |
-
<div
|
| 156 |
</div>
|
| 157 |
</div>
|
| 158 |
</div>
|
| 159 |
|
| 160 |
-
<div
|
| 161 |
-
<div
|
| 162 |
-
<table>
|
| 163 |
-
<tr>
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
</table>
|
| 169 |
</div>
|
| 170 |
|
| 171 |
-
<div
|
| 172 |
-
<div
|
| 173 |
-
<table>
|
| 174 |
-
<tr>
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
</table>
|
| 180 |
</div>
|
| 181 |
|
| 182 |
-
<div
|
| 183 |
-
<div
|
| 184 |
-
<table>
|
| 185 |
-
<tr>
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
</table>
|
| 191 |
</div>
|
| 192 |
|
| 193 |
-
<div
|
| 194 |
-
<div
|
| 195 |
-
<div
|
| 196 |
-
<div
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
<
|
| 200 |
-
<div
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
</div>
|
| 203 |
</div>
|
| 204 |
|
| 205 |
-
<div
|
| 206 |
-
<div
|
| 207 |
-
<div style="background:
|
| 208 |
-
<div style="font-size:
|
| 209 |
-
<code style="font-family:
|
| 210 |
</div>
|
| 211 |
-
<div style="background:
|
| 212 |
-
<div style="font-size:
|
| 213 |
-
<code style="font-family:
|
| 214 |
</div>
|
| 215 |
</div>
|
| 216 |
|
| 217 |
-
<div
|
| 218 |
Code license: MIT · Model license: Model Agreement · Commercial use supported<br>
|
| 219 |
huggingface.co/teamzero/astrox
|
| 220 |
</div>
|
|
|
|
| 1 |
|
| 2 |
+
<div style="max-width:860px;margin:0 auto;padding:2rem 1rem 3rem;font-family:var(--font-sans);color:var(--color-text-primary);">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
+
<div style="text-align:center;padding:3rem 1rem 2.5rem;">
|
| 5 |
+
<img src="https://i.ibb.co/rGS6dBcf/logo-Astro-X.png" alt="AstroX AI" style="height:70px;object-fit:contain;display:block;margin:0 auto 1.75rem;">
|
| 6 |
+
<p style="font-size:15px;color:var(--color-text-secondary);max-width:560px;margin:0 auto 1.75rem;line-height:1.7;">A frontier-class Mixture-of-Experts language model — competitive with leading closed-source models at a fraction of the training cost. Fully open-source and commercially licensed.</p>
|
| 7 |
+
<div style="display:flex;gap:8px;justify-content:center;flex-wrap:wrap;margin-bottom:1.75rem;">
|
| 8 |
+
<span style="font-size:11px;padding:4px 11px;border-radius:99px;background:#EEEDFE;color:#3C3489;border:0.5px solid #AFA9EC;">Mixture-of-Experts</span>
|
| 9 |
+
<span style="font-size:11px;padding:4px 11px;border-radius:99px;border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);">671B total params</span>
|
| 10 |
+
<span style="font-size:11px;padding:4px 11px;border-radius:99px;border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);">37B activated per token</span>
|
| 11 |
+
<span style="font-size:11px;padding:4px 11px;border-radius:99px;border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);">128K context</span>
|
| 12 |
+
<span style="font-size:11px;padding:4px 11px;border-radius:99px;border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);">FP8 training</span>
|
| 13 |
+
<span style="font-size:11px;padding:4px 11px;border-radius:99px;border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);">MIT code license</span>
|
|
|
|
|
|
|
| 14 |
</div>
|
| 15 |
+
<div style="display:inline-flex;align-items:center;gap:8px;font-size:13px;padding:9px 20px;border-radius:var(--border-radius-md);border:0.5px solid var(--color-border-secondary);color:var(--color-text-secondary);background:var(--color-background-secondary);">
|
| 16 |
+
<span>huggingface.co/</span><code style="font-family:var(--font-mono);font-size:12px;">teamzero/astrox</code>
|
| 17 |
</div>
|
| 18 |
</div>
|
| 19 |
|
| 20 |
+
<div style="display:grid;grid-template-columns:repeat(4,minmax(0,1fr));gap:12px;margin-bottom:2.5rem;">
|
| 21 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem;text-align:center;">
|
| 22 |
+
<div style="font-size:22px;font-weight:500;margin-bottom:3px;">671B</div>
|
| 23 |
+
<div style="font-size:12px;color:var(--color-text-secondary);">Total params</div>
|
| 24 |
+
</div>
|
| 25 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem;text-align:center;">
|
| 26 |
+
<div style="font-size:22px;font-weight:500;margin-bottom:3px;">37B</div>
|
| 27 |
+
<div style="font-size:12px;color:var(--color-text-secondary);">Active per token</div>
|
| 28 |
+
</div>
|
| 29 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem;text-align:center;">
|
| 30 |
+
<div style="font-size:22px;font-weight:500;margin-bottom:3px;">128K</div>
|
| 31 |
+
<div style="font-size:12px;color:var(--color-text-secondary);">Context window</div>
|
| 32 |
+
</div>
|
| 33 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem;text-align:center;">
|
| 34 |
+
<div style="font-size:22px;font-weight:500;margin-bottom:3px;">2.79M</div>
|
| 35 |
+
<div style="font-size:12px;color:var(--color-text-secondary);">H800 GPU hours</div>
|
| 36 |
+
</div>
|
| 37 |
</div>
|
| 38 |
|
| 39 |
+
<hr style="border:none;border-top:0.5px solid var(--color-border-tertiary);margin:2.5rem 0;">
|
| 40 |
|
| 41 |
+
<div style="margin-bottom:2.5rem;">
|
| 42 |
+
<div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Model</div>
|
| 43 |
+
<div style="background:var(--color-background-primary);border:0.5px solid var(--color-border-secondary);border-radius:var(--border-radius-lg);padding:1.5rem;">
|
| 44 |
+
<div style="font-size:22px;font-weight:500;margin-bottom:4px;">AstroX</div>
|
| 45 |
+
<div style="font-size:14px;color:var(--color-text-secondary);margin-bottom:1rem;line-height:1.6;">Instruction-tuned chat model with reinforcement learning and advanced long-chain-of-thought reasoning distillation. The only available model in the AstroX family.</div>
|
| 46 |
+
<div style="display:flex;gap:6px;flex-wrap:wrap;margin-bottom:1.25rem;">
|
| 47 |
+
<span style="font-size:11px;padding:3px 10px;border-radius:99px;background:#EEEDFE;color:#3C3489;">MoE</span>
|
| 48 |
+
<span style="font-size:11px;padding:3px 10px;border-radius:99px;background:#E6F1FB;color:#0C447C;">671B / 37B active</span>
|
| 49 |
+
<span style="font-size:11px;padding:3px 10px;border-radius:99px;background:#E1F5EE;color:#085041;">128K context</span>
|
| 50 |
+
<span style="font-size:11px;padding:3px 10px;border-radius:99px;background:#FAEEDA;color:#633806;">FP8 weights</span>
|
| 51 |
+
</div>
|
| 52 |
+
<div style="border-top:0.5px solid var(--color-border-tertiary);padding-top:1rem;display:grid;grid-template-columns:repeat(auto-fit,minmax(160px,1fr));gap:12px;">
|
| 53 |
+
<div><div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Architecture</div><div style="font-size:13px;font-weight:500;">MoE + Multi-head Latent Attention</div></div>
|
| 54 |
+
<div><div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Experts</div><div style="font-size:13px;font-weight:500;">256 total · 8 active</div></div>
|
| 55 |
+
<div><div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Pre-training data</div><div style="font-size:13px;font-weight:500;">14.8T tokens</div></div>
|
| 56 |
+
<div><div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">License</div><div style="font-size:13px;font-weight:500;">MIT + Model Agreement</div></div>
|
| 57 |
</div>
|
| 58 |
</div>
|
| 59 |
</div>
|
| 60 |
|
| 61 |
+
<div style="margin-bottom:2.5rem;">
|
| 62 |
+
<div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Architecture highlights</div>
|
| 63 |
+
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;">
|
| 64 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
|
| 65 |
+
<div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Attention</div>
|
| 66 |
+
<div style="font-size:14px;font-weight:500;margin-bottom:4px;">Multi-head Latent Attention (MLA)</div>
|
| 67 |
+
<div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">Reduces KV cache memory footprint significantly vs. standard MHA, enabling practical long-context inference.</div>
|
| 68 |
+
</div>
|
| 69 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
|
| 70 |
+
<div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Load balancing</div>
|
| 71 |
+
<div style="font-size:14px;font-weight:500;margin-bottom:4px;">Auxiliary-loss-free strategy</div>
|
| 72 |
+
<div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">Balances expert load without the performance penalty of traditional auxiliary loss terms.</div>
|
| 73 |
+
</div>
|
| 74 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
|
| 75 |
+
<div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Training objective</div>
|
| 76 |
+
<div style="font-size:14px;font-weight:500;margin-bottom:4px;">Multi-Token Prediction (MTP)</div>
|
| 77 |
+
<div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">Predicts multiple future tokens simultaneously, boosting performance and enabling speculative decoding.</div>
|
| 78 |
+
</div>
|
| 79 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
|
| 80 |
+
<div style="font-size:11px;color:var(--color-text-secondary);margin-bottom:3px;">Post-training</div>
|
| 81 |
+
<div style="font-size:14px;font-weight:500;margin-bottom:4px;">Reasoning distillation</div>
|
| 82 |
+
<div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">Verification and reflection patterns distilled from a long-CoT model, keeping output style and length controlled.</div>
|
| 83 |
</div>
|
| 84 |
</div>
|
| 85 |
</div>
|
| 86 |
|
| 87 |
+
<div style="margin-bottom:2.5rem;">
|
| 88 |
+
<div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Key innovations</div>
|
| 89 |
+
<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(190px,1fr));gap:12px;">
|
| 90 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
|
| 91 |
+
<div style="font-size:14px;font-weight:500;margin-bottom:5px;">FP8 mixed precision</div>
|
| 92 |
+
<div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">First validated large-scale FP8 training. Cuts compute cost without quality loss.</div>
|
| 93 |
</div>
|
| 94 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
|
| 95 |
+
<div style="font-size:14px;font-weight:500;margin-bottom:5px;">Zero training instability</div>
|
| 96 |
+
<div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">No irrecoverable loss spikes and no rollbacks throughout the entire pre-training run.</div>
|
| 97 |
</div>
|
| 98 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
|
| 99 |
+
<div style="font-size:14px;font-weight:500;margin-bottom:5px;">Full comm/compute overlap</div>
|
| 100 |
+
<div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">Co-designed algorithms and hardware nearly eliminate cross-node MoE communication bottlenecks.</div>
|
| 101 |
</div>
|
| 102 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.1rem;">
|
| 103 |
+
<div style="font-size:14px;font-weight:500;margin-bottom:5px;">Speculative decoding ready</div>
|
| 104 |
+
<div style="font-size:13px;color:var(--color-text-secondary);line-height:1.55;">The MTP module doubles as a draft head for inference acceleration out of the box.</div>
|
| 105 |
</div>
|
| 106 |
</div>
|
| 107 |
</div>
|
| 108 |
|
| 109 |
+
<div style="margin-bottom:2.5rem;">
|
| 110 |
+
<div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Benchmark performance — math &amp; reasoning</div>
|
| 111 |
+
<table style="width:100%;border-collapse:collapse;font-size:13px;table-layout:fixed;">
|
| 112 |
+
<tr>
|
| 113 |
+
<th style="text-align:left;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);width:44%;">Benchmark</th>
|
| 114 |
+
<th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">GPT-4o</th>
|
| 115 |
+
<th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">Claude 3.5 Sonnet</th>
|
| 116 |
+
<th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">AstroX</th>
|
| 117 |
+
</tr>
|
| 118 |
+
<tr>
|
| 119 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">AIME 2024 (Pass@1)</td>
|
| 120 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">9.3</td>
|
| 121 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">16.0</td>
|
| 122 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">39.2</td>
|
| 123 |
+
</tr>
|
| 124 |
+
<tr>
|
| 125 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">MATH-500 (EM)</td>
|
| 126 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">74.6</td>
|
| 127 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">78.3</td>
|
| 128 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">90.2</td>
|
| 129 |
+
</tr>
|
| 130 |
+
<tr>
|
| 131 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">CNMO 2024 (Pass@1)</td>
|
| 132 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">10.8</td>
|
| 133 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">13.1</td>
|
| 134 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">43.2</td>
|
| 135 |
+
</tr>
|
| 136 |
+
<tr>
|
| 137 |
+
<td style="padding:7px 10px;">GSM8K (EM)</td>
|
| 138 |
+
<td style="padding:7px 10px;text-align:right;">—</td>
|
| 139 |
+
<td style="padding:7px 10px;text-align:right;">—</td>
|
| 140 |
+
<td style="padding:7px 10px;text-align:right;font-weight:500;color:#1D9E75;">89.3</td>
|
| 141 |
+
</tr>
|
| 142 |
</table>
|
| 143 |
</div>
|
| 144 |
|
| 145 |
+
<div style="margin-bottom:2.5rem;">
|
| 146 |
+
<div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Benchmark performance — code</div>
|
| 147 |
+
<table style="width:100%;border-collapse:collapse;font-size:13px;table-layout:fixed;">
|
| 148 |
+
<tr>
|
| 149 |
+
<th style="text-align:left;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);width:44%;">Benchmark</th>
|
| 150 |
+
<th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">GPT-4o</th>
|
| 151 |
+
<th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">Claude 3.5 Sonnet</th>
|
| 152 |
+
<th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">AstroX</th>
|
| 153 |
+
</tr>
|
| 154 |
+
<tr>
|
| 155 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">LiveCodeBench (Pass@1)</td>
|
| 156 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">34.2</td>
|
| 157 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">32.8</td>
|
| 158 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">37.6</td>
|
| 159 |
+
</tr>
|
| 160 |
+
<tr>
|
| 161 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">Codeforces (Percentile)</td>
|
| 162 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">23.6</td>
|
| 163 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">20.3</td>
|
| 164 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">51.6</td>
|
| 165 |
+
</tr>
|
| 166 |
+
<tr>
|
| 167 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">Aider-Polyglot (Acc.)</td>
|
| 168 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">16.0</td>
|
| 169 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">45.3</td>
|
| 170 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">49.6</td>
|
| 171 |
+
</tr>
|
| 172 |
+
<tr>
|
| 173 |
+
<td style="padding:7px 10px;">HumanEval-Mul (Pass@1)</td>
|
| 174 |
+
<td style="padding:7px 10px;text-align:right;">80.5</td>
|
| 175 |
+
<td style="padding:7px 10px;text-align:right;">81.7</td>
|
| 176 |
+
<td style="padding:7px 10px;text-align:right;font-weight:500;color:#1D9E75;">82.6</td>
|
| 177 |
+
</tr>
|
| 178 |
</table>
|
| 179 |
</div>
|
| 180 |
|
| 181 |
+
<div style="margin-bottom:2.5rem;">
|
| 182 |
+
<div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Benchmark performance — general</div>
|
| 183 |
+
<table style="width:100%;border-collapse:collapse;font-size:13px;table-layout:fixed;">
|
| 184 |
+
<tr>
|
| 185 |
+
<th style="text-align:left;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);width:44%;">Benchmark</th>
|
| 186 |
+
<th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">GPT-4o</th>
|
| 187 |
+
<th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">Claude 3.5 Sonnet</th>
|
| 188 |
+
<th style="text-align:right;padding:7px 10px;color:var(--color-text-secondary);font-weight:500;border-bottom:0.5px solid var(--color-border-tertiary);">AstroX</th>
|
| 189 |
+
</tr>
|
| 190 |
+
<tr>
|
| 191 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">MMLU (EM)</td>
|
| 192 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">87.2</td>
|
| 193 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">88.3</td>
|
| 194 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">88.5</td>
|
| 195 |
+
</tr>
|
| 196 |
+
<tr>
|
| 197 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">Arena-Hard</td>
|
| 198 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">80.4</td>
|
| 199 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">85.2</td>
|
| 200 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">85.5</td>
|
| 201 |
+
</tr>
|
| 202 |
+
<tr>
|
| 203 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);">AlpacaEval 2.0</td>
|
| 204 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">51.1</td>
|
| 205 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;">52.0</td>
|
| 206 |
+
<td style="padding:7px 10px;border-bottom:0.5px solid var(--color-border-tertiary);text-align:right;font-weight:500;color:#1D9E75;">70.0</td>
|
| 207 |
+
</tr>
|
| 208 |
+
<tr>
|
| 209 |
+
<td style="padding:7px 10px;">DROP (3-shot F1)</td>
|
| 210 |
+
<td style="padding:7px 10px;text-align:right;">83.7</td>
|
| 211 |
+
<td style="padding:7px 10px;text-align:right;">88.3</td>
|
| 212 |
+
<td style="padding:7px 10px;text-align:right;font-weight:500;color:#1D9E75;">91.6</td>
|
| 213 |
+
</tr>
|
| 214 |
</table>
|
| 215 |
</div>
|
| 216 |
|
| 217 |
+
<div style="margin-bottom:2.5rem;">
|
| 218 |
+
<div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Supported inference frameworks</div>
|
| 219 |
+
<div style="display:grid;grid-template-columns:repeat(auto-fill,minmax(140px,1fr));gap:8px;">
|
| 220 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:10px 14px;">
|
| 221 |
+
<div style="font-size:13px;font-weight:500;margin-bottom:2px;">SGLang</div>
|
| 222 |
+
<div style="font-size:11px;color:var(--color-text-secondary);">Recommended · FP8 + BF16 · NVIDIA + AMD</div>
|
| 223 |
+
</div>
|
| 224 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:10px 14px;">
|
| 225 |
+
<div style="font-size:13px;font-weight:500;margin-bottom:2px;">vLLM</div>
|
| 226 |
+
<div style="font-size:11px;color:var(--color-text-secondary);">FP8 + BF16 · pipeline parallelism</div>
|
| 227 |
+
</div>
|
| 228 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:10px 14px;">
|
| 229 |
+
<div style="font-size:13px;font-weight:500;margin-bottom:2px;">LMDeploy</div>
|
| 230 |
+
<div style="font-size:11px;color:var(--color-text-secondary);">Offline + online · PyTorch-native</div>
|
| 231 |
+
</div>
|
| 232 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:10px 14px;">
|
| 233 |
+
<div style="font-size:13px;font-weight:500;margin-bottom:2px;">TensorRT-LLM</div>
|
| 234 |
+
<div style="font-size:11px;color:var(--color-text-secondary);">BF16 · INT4/INT8 quant</div>
|
| 235 |
+
</div>
|
| 236 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:10px 14px;">
|
| 237 |
+
<div style="font-size:13px;font-weight:500;margin-bottom:2px;">AMD GPU</div>
|
| 238 |
+
<div style="font-size:11px;color:var(--color-text-secondary);">via SGLang · FP8 + BF16</div>
|
| 239 |
+
</div>
|
| 240 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:10px 14px;">
|
| 241 |
+
<div style="font-size:13px;font-weight:500;margin-bottom:2px;">Huawei Ascend</div>
|
| 242 |
+
<div style="font-size:11px;color:var(--color-text-secondary);">via MindIE · BF16</div>
|
| 243 |
+
</div>
|
| 244 |
</div>
|
| 245 |
</div>
|
| 246 |
|
| 247 |
+
<div style="margin-bottom:2.5rem;">
|
| 248 |
+
<div style="font-size:11px;font-weight:500;color:var(--color-text-secondary);text-transform:uppercase;letter-spacing:0.09em;margin-bottom:1rem;">Quick start</div>
|
| 249 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.25rem;margin-bottom:8px;">
|
| 250 |
+
<div style="font-size:12px;color:var(--color-text-secondary);margin-bottom:8px;">Convert FP8 weights to BF16</div>
|
| 251 |
+
<code style="font-family:var(--font-mono);font-size:12px;color:var(--color-text-primary);display:block;line-height:1.8;">python fp8_cast_bf16.py \<br> --input-fp8-hf-path /path/to/fp8_weights \<br> --output-bf16-hf-path /path/to/bf16_weights</code>
|
| 252 |
</div>
|
| 253 |
+
<div style="background:var(--color-background-secondary);border-radius:var(--border-radius-md);padding:1rem 1.25rem;">
|
| 254 |
+
<div style="font-size:12px;color:var(--color-text-secondary);margin-bottom:8px;">Run interactive inference (2 nodes · 8 GPUs each)</div>
|
| 255 |
+
<code style="font-family:var(--font-mono);font-size:12px;color:var(--color-text-primary);display:block;line-height:1.8;">torchrun --nnodes 2 --nproc-per-node 8 generate.py \<br> --node-rank $RANK --master-addr $ADDR \<br> --ckpt-path /path/to/AstroX \<br> --config configs/config_671B.json \<br> --interactive --temperature 0.7 --max-new-tokens 200</code>
|
| 256 |
</div>
|
| 257 |
</div>
|
| 258 |
|
| 259 |
+
<div style="text-align:center;padding-top:2rem;font-size:12px;color:var(--color-text-secondary);line-height:2;">
|
| 260 |
Code license: MIT · Model license: Model Agreement · Commercial use supported<br>
|
| 261 |
huggingface.co/teamzero/astrox
|
| 262 |
</div>
|