Update README.md
Browse files
README.md
CHANGED
|
@@ -1,75 +1,82 @@
|
|
| 1 |
|
| 2 |
<style>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
</style>
|
| 55 |
|
| 56 |
<div class="page">
|
| 57 |
|
| 58 |
<div class="hero">
|
| 59 |
<img src="https://i.ibb.co/rGS6dBcf/logo-Astro-X.png" alt="AstroX AI">
|
|
|
|
| 60 |
<div class="badges">
|
| 61 |
-
<span class="badge
|
| 62 |
<span class="badge">671B total params</span>
|
| 63 |
<span class="badge">37B activated per token</span>
|
| 64 |
<span class="badge">128K context</span>
|
|
|
|
| 65 |
<span class="badge">MIT License</span>
|
| 66 |
</div>
|
| 67 |
-
<
|
| 68 |
-
|
| 69 |
-
<a class="link-btn primary" href="https://huggingface.co/teamzero/astrox">HuggingFace</a>
|
| 70 |
-
<a class="link-btn" href="https://chat.deepseek.com/">Chat Demo</a>
|
| 71 |
-
<a class="link-btn" href="https://platform.deepseek.com/">API Platform</a>
|
| 72 |
-
<a class="link-btn" href="https://arxiv.org/abs/2412.19437">Paper</a>
|
| 73 |
</div>
|
| 74 |
</div>
|
| 75 |
|
|
@@ -77,94 +84,139 @@
|
|
| 77 |
<div class="stat"><div class="val">671B</div><div class="lbl">Total params</div></div>
|
| 78 |
<div class="stat"><div class="val">37B</div><div class="lbl">Active per token</div></div>
|
| 79 |
<div class="stat"><div class="val">128K</div><div class="lbl">Context window</div></div>
|
| 80 |
-
<div class="stat"><div class="val">2.79M</div><div class="lbl">GPU hours</div></div>
|
| 81 |
</div>
|
| 82 |
|
| 83 |
<hr class="divider">
|
| 84 |
|
| 85 |
-
<div
|
| 86 |
-
<div class="section-
|
| 87 |
<div class="model-card">
|
| 88 |
-
<div class="model-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
<
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
</div>
|
| 98 |
-
<a class="dl-btn" href="https://huggingface.co/teamzero/astrox">Download on HuggingFace</a>
|
| 99 |
</div>
|
| 100 |
</div>
|
| 101 |
|
| 102 |
-
<div
|
| 103 |
-
<div class="section-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
<div class="feature-grid">
|
| 105 |
<div class="feature-card">
|
| 106 |
-
<div class="ft">
|
| 107 |
-
<div class="fd">
|
| 108 |
</div>
|
| 109 |
<div class="feature-card">
|
| 110 |
-
<div class="ft">
|
| 111 |
-
<div class="fd">
|
| 112 |
</div>
|
| 113 |
<div class="feature-card">
|
| 114 |
-
<div class="ft">
|
| 115 |
-
<div class="fd">
|
| 116 |
</div>
|
| 117 |
<div class="feature-card">
|
| 118 |
-
<div class="ft">
|
| 119 |
-
<div class="fd">
|
| 120 |
</div>
|
| 121 |
</div>
|
| 122 |
</div>
|
| 123 |
|
| 124 |
-
<div
|
| 125 |
-
<div class="section-
|
| 126 |
-
<
|
| 127 |
-
<
|
| 128 |
-
<
|
| 129 |
-
<
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
</div>
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
</
|
| 146 |
</div>
|
| 147 |
-
<div
|
| 148 |
-
<
|
| 149 |
-
|
| 150 |
-
<tr><td>MMLU (EM)</td><td>87.2</td><td>88.3</td><td class="best">88.5</td></tr>
|
| 151 |
-
<tr><td>Arena-Hard</td><td>80.4</td><td>85.2</td><td class="best">85.5</td></tr>
|
| 152 |
-
<tr><td>AlpacaEval 2.0</td><td>51.1</td><td>52.0</td><td class="best">70.0</td></tr>
|
| 153 |
-
</table>
|
| 154 |
</div>
|
| 155 |
</div>
|
| 156 |
|
| 157 |
<div class="footer">
|
| 158 |
-
Code: MIT
|
|
|
|
| 159 |
</div>
|
| 160 |
|
| 161 |
</div>
|
| 162 |
-
|
| 163 |
-
<script>
|
| 164 |
-
function switchTab(el, id) {
|
| 165 |
-
document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
|
| 166 |
-
document.querySelectorAll('.tab-content').forEach(t => t.classList.remove('active'));
|
| 167 |
-
el.classList.add('active');
|
| 168 |
-
document.getElementById(id).classList.add('active');
|
| 169 |
-
}
|
| 170 |
-
</script>
|
|
|
|
| 1 |
|
| 2 |
<style>
|
| 3 |
+
* { box-sizing: border-box; margin: 0; padding: 0; }
|
| 4 |
+
/* Page-wide typography. The fallback stack is used only when the host page
   does not define --font-sans; without it the declaration is dropped at
   computed-value time and the inherited font is used instead. */
body { font-family: var(--font-sans, system-ui, sans-serif); color: var(--color-text-primary); }
|
| 5 |
+
.page { max-width: 860px; margin: 0 auto; padding: 2rem 1rem 3rem; }
|
| 6 |
+
.hero { text-align: center; padding: 3rem 1rem 2.5rem; }
|
| 7 |
+
.hero img { height: 70px; object-fit: contain; margin-bottom: 1.75rem; display: block; margin-left: auto; margin-right: auto; }
|
| 8 |
+
.hero h1 { font-size: 32px; font-weight: 500; letter-spacing: -0.5px; margin-bottom: 0.5rem; }
|
| 9 |
+
.hero p { font-size: 15px; color: var(--color-text-secondary); max-width: 560px; margin: 0 auto 1.75rem; line-height: 1.7; }
|
| 10 |
+
.badges { display: flex; gap: 8px; justify-content: center; flex-wrap: wrap; margin-bottom: 1.75rem; }
|
| 11 |
+
.badge { font-size: 11px; padding: 4px 11px; border-radius: 99px; border: 0.5px solid var(--color-border-secondary); color: var(--color-text-secondary); }
|
| 12 |
+
.badge.purple { background: #EEEDFE; color: #3C3489; border-color: #AFA9EC; }
|
| 13 |
+
@media (prefers-color-scheme: dark) { .badge.purple { background: #3C3489; color: #CECBF6; border-color: #534AB7; } }
|
| 14 |
+
.hugging { display: inline-flex; align-items: center; gap: 8px; font-size: 13px; padding: 9px 20px; border-radius: var(--border-radius-md); border: 0.5px solid var(--color-border-secondary); color: var(--color-text-secondary); background: var(--color-background-secondary); }
|
| 15 |
+
/* Repository id inside the hub chip. The monospace fallback keeps the id in a
   fixed-width face when --font-mono is not defined by the host page. */
.hugging code { font-family: var(--font-mono, monospace); font-size: 12px; color: var(--color-text-secondary); }
|
| 16 |
+
.divider { border: none; border-top: 0.5px solid var(--color-border-tertiary); margin: 2.5rem 0; }
|
| 17 |
+
.section { margin-bottom: 2.5rem; }
|
| 18 |
+
.section-label { font-size: 11px; font-weight: 500; color: var(--color-text-secondary); text-transform: uppercase; letter-spacing: 0.09em; margin-bottom: 1rem; }
|
| 19 |
+
.stat-row { display: grid; grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 12px; margin-bottom: 2.5rem; }
|
| 20 |
+
.stat { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem; text-align: center; }
|
| 21 |
+
.stat .val { font-size: 22px; font-weight: 500; margin-bottom: 3px; }
|
| 22 |
+
.stat .lbl { font-size: 12px; color: var(--color-text-secondary); }
|
| 23 |
+
.model-card { background: var(--color-background-primary); border: 0.5px solid var(--color-border-secondary); border-radius: var(--border-radius-lg); padding: 1.5rem; }
|
| 24 |
+
.model-name { font-size: 22px; font-weight: 500; margin-bottom: 4px; }
|
| 25 |
+
.model-sub { font-size: 14px; color: var(--color-text-secondary); margin-bottom: 1rem; line-height: 1.6; }
|
| 26 |
+
.pill-row { display: flex; gap: 6px; flex-wrap: wrap; margin-bottom: 1.25rem; }
|
| 27 |
+
.pill { font-size: 11px; padding: 3px 10px; border-radius: 99px; }
|
| 28 |
+
.pill.blue { background: #E6F1FB; color: #0C447C; }
|
| 29 |
+
.pill.teal { background: #E1F5EE; color: #085041; }
|
| 30 |
+
.pill.amber { background: #FAEEDA; color: #633806; }
|
| 31 |
+
.pill.purple { background: #EEEDFE; color: #3C3489; }
|
| 32 |
+
@media (prefers-color-scheme: dark) {
|
| 33 |
+
.pill.blue { background: #0C447C; color: #B5D4F4; }
|
| 34 |
+
.pill.teal { background: #085041; color: #9FE1CB; }
|
| 35 |
+
.pill.amber { background: #633806; color: #FAC775; }
|
| 36 |
+
.pill.purple { background: #3C3489; color: #CECBF6; }
|
| 37 |
+
}
|
| 38 |
+
.model-meta { border-top: 0.5px solid var(--color-border-tertiary); padding-top: 1rem; display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); gap: 12px; }
|
| 39 |
+
.meta-item .mk { font-size: 11px; color: var(--color-text-secondary); margin-bottom: 3px; }
|
| 40 |
+
.meta-item .mv { font-size: 13px; font-weight: 500; }
|
| 41 |
+
.feature-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(190px, 1fr)); gap: 12px; }
|
| 42 |
+
.feature-card { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem 1.1rem; }
|
| 43 |
+
.feature-card .ft { font-size: 14px; font-weight: 500; margin-bottom: 5px; }
|
| 44 |
+
.feature-card .fd { font-size: 13px; color: var(--color-text-secondary); line-height: 1.55; }
|
| 45 |
+
.bench-section { margin-bottom: 1.5rem; }
|
| 46 |
+
.bench-title { font-size: 13px; font-weight: 500; margin-bottom: 10px; padding-bottom: 6px; border-bottom: 0.5px solid var(--color-border-tertiary); }
|
| 47 |
+
table { width: 100%; border-collapse: collapse; font-size: 13px; table-layout: fixed; }
|
| 48 |
+
th { text-align: left; padding: 7px 10px; color: var(--color-text-secondary); font-weight: 500; border-bottom: 0.5px solid var(--color-border-tertiary); }
|
| 49 |
+
td { padding: 7px 10px; border-bottom: 0.5px solid var(--color-border-tertiary); }
|
| 50 |
+
tr:last-child td { border-bottom: none; }
|
| 51 |
+
td:not(:first-child), th:not(:first-child) { text-align: right; }
|
| 52 |
+
.best { font-weight: 500; color: #1D9E75; }
|
| 53 |
+
.framework-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(140px, 1fr)); gap: 8px; }
|
| 54 |
+
.fw { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 10px 14px; font-size: 13px; }
|
| 55 |
+
.fw .fwn { font-weight: 500; margin-bottom: 2px; }
|
| 56 |
+
.fw .fwd { font-size: 11px; color: var(--color-text-secondary); }
|
| 57 |
+
.arch-row { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; }
|
| 58 |
+
.arch-card { background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem 1.1rem; }
|
| 59 |
+
.arch-card .ak { font-size: 11px; color: var(--color-text-secondary); margin-bottom: 3px; }
|
| 60 |
+
.arch-card .av { font-size: 14px; font-weight: 500; }
|
| 61 |
+
.arch-card .ad { font-size: 12px; color: var(--color-text-secondary); margin-top: 4px; line-height: 1.5; }
|
| 62 |
+
.footer { text-align: center; padding-top: 2rem; font-size: 12px; color: var(--color-text-secondary); line-height: 2; }
|
| 63 |
</style>
|
| 64 |
|
| 65 |
<div class="page">
|
| 66 |
|
| 67 |
<div class="hero">
|
| 68 |
<img src="https://i.ibb.co/rGS6dBcf/logo-Astro-X.png" alt="AstroX AI">
|
| 69 |
+
<p>A frontier-class Mixture-of-Experts language model — competitive with leading closed-source models at a fraction of the training cost. Fully open-source and commercially licensed.</p>
|
| 70 |
<div class="badges">
|
| 71 |
+
<span class="badge purple">Mixture-of-Experts</span>
|
| 72 |
<span class="badge">671B total params</span>
|
| 73 |
<span class="badge">37B activated per token</span>
|
| 74 |
<span class="badge">128K context</span>
|
| 75 |
+
<span class="badge">FP8 training</span>
|
| 76 |
<span class="badge">MIT License</span>
|
| 77 |
</div>
|
| 78 |
+
<div class="hugging">
|
| 79 |
+
<span style="font-size:13px; color: var(--color-text-secondary);">huggingface.co/</span><code>teamzero/astrox</code>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
</div>
|
| 81 |
</div>
|
| 82 |
|
|
|
|
| 84 |
<div class="stat"><div class="val">671B</div><div class="lbl">Total params</div></div>
|
| 85 |
<div class="stat"><div class="val">37B</div><div class="lbl">Active per token</div></div>
|
| 86 |
<div class="stat"><div class="val">128K</div><div class="lbl">Context window</div></div>
|
| 87 |
+
<div class="stat"><div class="val">2.79M</div><div class="lbl">H800 GPU hours</div></div>
|
| 88 |
</div>
|
| 89 |
|
| 90 |
<hr class="divider">
|
| 91 |
|
| 92 |
+
<div class="section">
|
| 93 |
+
<div class="section-label">Model</div>
|
| 94 |
<div class="model-card">
|
| 95 |
+
<div class="model-name">AstroX</div>
|
| 96 |
+
<div class="model-sub">Instruction-tuned chat model with reinforcement learning and R1 long-chain-of-thought reasoning distillation. The only available model in the AstroX family.</div>
|
| 97 |
+
<div class="pill-row">
|
| 98 |
+
<span class="pill purple">MoE</span>
|
| 99 |
+
<span class="pill blue">671B / 37B active</span>
|
| 100 |
+
<span class="pill teal">128K context</span>
|
| 101 |
+
<span class="pill amber">FP8 weights</span>
|
| 102 |
+
</div>
|
| 103 |
+
<div class="model-meta">
|
| 104 |
+
<div class="meta-item"><div class="mk">Architecture</div><div class="mv">DeepSeekMoE + MLA</div></div>
|
| 105 |
+
<div class="meta-item"><div class="mk">Experts</div><div class="mv">256 total · 8 active</div></div>
|
| 106 |
+
<div class="meta-item"><div class="mk">Pre-training data</div><div class="mv">14.8T tokens</div></div>
|
| 107 |
+
<div class="meta-item"><div class="mk">License</div><div class="mv">MIT + Model Agreement</div></div>
|
| 108 |
</div>
|
|
|
|
| 109 |
</div>
|
| 110 |
</div>
|
| 111 |
|
| 112 |
+
<div class="section">
|
| 113 |
+
<div class="section-label">Architecture highlights</div>
|
| 114 |
+
<div class="arch-row">
|
| 115 |
+
<div class="arch-card">
|
| 116 |
+
<div class="ak">Attention</div>
|
| 117 |
+
<div class="av">Multi-head Latent Attention (MLA)</div>
|
| 118 |
+
<div class="ad">Reduces KV cache memory footprint significantly vs. standard MHA, enabling practical long-context inference.</div>
|
| 119 |
+
</div>
|
| 120 |
+
<div class="arch-card">
|
| 121 |
+
<div class="ak">Load balancing</div>
|
| 122 |
+
<div class="av">Auxiliary-loss-free strategy</div>
|
| 123 |
+
<div class="ad">Balances expert load without the performance penalty of traditional auxiliary loss terms.</div>
|
| 124 |
+
</div>
|
| 125 |
+
<div class="arch-card">
|
| 126 |
+
<div class="ak">Training objective</div>
|
| 127 |
+
<div class="av">Multi-Token Prediction (MTP)</div>
|
| 128 |
+
<div class="ad">Predicts multiple future tokens simultaneously, improving performance and enabling speculative decoding.</div>
|
| 129 |
+
</div>
|
| 130 |
+
<div class="arch-card">
|
| 131 |
+
<div class="ak">Post-training</div>
|
| 132 |
+
<div class="av">R1 reasoning distillation</div>
|
| 133 |
+
<div class="ad">Verification and reflection patterns from DeepSeek-R1 are distilled into the model while keeping output style controlled.</div>
|
| 134 |
+
</div>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
|
| 138 |
+
<div class="section">
|
| 139 |
+
<div class="section-label">Key innovations</div>
|
| 140 |
<div class="feature-grid">
|
| 141 |
<div class="feature-card">
|
| 142 |
+
<div class="ft">FP8 mixed precision</div>
|
| 143 |
+
<div class="fd">First large-scale FP8 training validated on a 671B model. Cuts compute cost without quality loss.</div>
|
| 144 |
</div>
|
| 145 |
<div class="feature-card">
|
| 146 |
+
<div class="ft">Zero training instability</div>
|
| 147 |
+
<div class="fd">No irrecoverable loss spikes and no rollbacks throughout the entire pre-training run.</div>
|
| 148 |
</div>
|
| 149 |
<div class="feature-card">
|
| 150 |
+
<div class="ft">Full comm/compute overlap</div>
|
| 151 |
+
<div class="fd">Co-designed algorithms and hardware nearly eliminate the communication bottleneck in cross-node MoE training.</div>
|
| 152 |
</div>
|
| 153 |
<div class="feature-card">
|
| 154 |
+
<div class="ft">Speculative decoding ready</div>
|
| 155 |
+
<div class="fd">The MTP module can be repurposed as a draft head for inference acceleration out of the box.</div>
|
| 156 |
</div>
|
| 157 |
</div>
|
| 158 |
</div>
|
| 159 |
|
| 160 |
+
<div class="section">
|
| 161 |
+
<div class="section-label">Benchmark performance — math &amp; reasoning</div>
|
| 162 |
+
<table>
|
| 163 |
+
<!-- Column headers: scope="col" associates each header with the cells below it. -->
<tr><th scope="col" style="width:44%">Benchmark</th><th scope="col">GPT-4o</th><th scope="col">Claude 3.5 Sonnet</th><th scope="col">AstroX</th></tr>
|
| 164 |
+
<tr><td>AIME 2024 (Pass@1)</td><td>9.3</td><td>16.0</td><td class="best">39.2</td></tr>
|
| 165 |
+
<tr><td>MATH-500 (EM)</td><td>74.6</td><td>78.3</td><td class="best">90.2</td></tr>
|
| 166 |
+
<tr><td>CNMO 2024 (Pass@1)</td><td>10.8</td><td>13.1</td><td class="best">43.2</td></tr>
|
| 167 |
+
<tr><td>GSM8K (EM)</td><td>—</td><td>—</td><td class="best">89.3</td></tr>
|
| 168 |
+
</table>
|
| 169 |
+
</div>
|
| 170 |
+
|
| 171 |
+
<div class="section">
|
| 172 |
+
<div class="section-label">Benchmark performance — code</div>
|
| 173 |
+
<table>
|
| 174 |
+
<!-- Column headers: scope="col" associates each header with the cells below it. -->
<tr><th scope="col" style="width:44%">Benchmark</th><th scope="col">GPT-4o</th><th scope="col">Claude 3.5 Sonnet</th><th scope="col">AstroX</th></tr>
|
| 175 |
+
<tr><td>LiveCodeBench (Pass@1)</td><td>34.2</td><td>32.8</td><td class="best">37.6</td></tr>
|
| 176 |
+
<tr><td>Codeforces (Percentile)</td><td>23.6</td><td>20.3</td><td class="best">51.6</td></tr>
|
| 177 |
+
<tr><td>Aider-Polyglot (Acc.)</td><td>16.0</td><td>45.3</td><td class="best">49.6</td></tr>
|
| 178 |
+
<tr><td>HumanEval-Mul (Pass@1)</td><td>80.5</td><td>81.7</td><td class="best">82.6</td></tr>
|
| 179 |
+
</table>
|
| 180 |
+
</div>
|
| 181 |
+
|
| 182 |
+
<div class="section">
|
| 183 |
+
<div class="section-label">Benchmark performance — general</div>
|
| 184 |
+
<table>
|
| 185 |
+
<!-- Column headers: scope="col" associates each header with the cells below it. -->
<tr><th scope="col" style="width:44%">Benchmark</th><th scope="col">GPT-4o</th><th scope="col">Claude 3.5 Sonnet</th><th scope="col">AstroX</th></tr>
|
| 186 |
+
<tr><td>MMLU (EM)</td><td>87.2</td><td>88.3</td><td class="best">88.5</td></tr>
|
| 187 |
+
<tr><td>Arena-Hard</td><td>80.4</td><td>85.2</td><td class="best">85.5</td></tr>
|
| 188 |
+
<tr><td>AlpacaEval 2.0</td><td>51.1</td><td>52.0</td><td class="best">70.0</td></tr>
|
| 189 |
+
<tr><td>DROP (3-shot F1)</td><td>83.7</td><td>88.3</td><td class="best">91.6</td></tr>
|
| 190 |
+
</table>
|
| 191 |
+
</div>
|
| 192 |
+
|
| 193 |
+
<div class="section">
|
| 194 |
+
<div class="section-label">Supported inference frameworks</div>
|
| 195 |
+
<div class="framework-grid">
|
| 196 |
+
<div class="fw"><div class="fwn">SGLang</div><div class="fwd">Recommended · FP8 + BF16 · NVIDIA + AMD</div></div>
|
| 197 |
+
<div class="fw"><div class="fwn">vLLM</div><div class="fwd">FP8 + BF16 · pipeline parallelism</div></div>
|
| 198 |
+
<div class="fw"><div class="fwn">LMDeploy</div><div class="fwd">Offline + online · PyTorch-native</div></div>
|
| 199 |
+
<div class="fw"><div class="fwn">TensorRT-LLM</div><div class="fwd">BF16 · INT4/INT8 quant</div></div>
|
| 200 |
+
<div class="fw"><div class="fwn">AMD GPU</div><div class="fwd">via SGLang · FP8 + BF16</div></div>
|
| 201 |
+
<div class="fw"><div class="fwn">Huawei Ascend</div><div class="fwd">via MindIE · BF16</div></div>
|
| 202 |
</div>
|
| 203 |
+
</div>
|
| 204 |
+
|
| 205 |
+
<div class="section">
|
| 206 |
+
<div class="section-label">Quick start</div>
|
| 207 |
+
<div style="background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem 1.25rem;">
|
| 208 |
+
<div style="font-size: 12px; color: var(--color-text-secondary); margin-bottom: 8px;">Convert FP8 weights to BF16</div>
|
| 209 |
+
<!-- Weight-conversion command. The monospace fallback applies when --font-mono is undefined. -->
<code style="font-family: var(--font-mono, monospace); font-size: 12px; color: var(--color-text-primary); display: block; line-height: 1.8;">python fp8_cast_bf16.py \<br> --input-fp8-hf-path /path/to/fp8_weights \<br> --output-bf16-hf-path /path/to/bf16_weights</code>
|
| 210 |
</div>
|
| 211 |
+
<div style="background: var(--color-background-secondary); border-radius: var(--border-radius-md); padding: 1rem 1.25rem; margin-top: 8px;">
|
| 212 |
+
<div style="font-size: 12px; color: var(--color-text-secondary); margin-bottom: 8px;">Run interactive inference (2 nodes · 8 GPUs each)</div>
|
| 213 |
+
<!-- Multi-node inference command. The monospace fallback applies when --font-mono is undefined. -->
<code style="font-family: var(--font-mono, monospace); font-size: 12px; color: var(--color-text-primary); display: block; line-height: 1.8;">torchrun --nnodes 2 --nproc-per-node 8 generate.py \<br> --node-rank $RANK --master-addr $ADDR \<br> --ckpt-path /path/to/AstroX \<br> --config configs/config_671B.json \<br> --interactive --temperature 0.7 --max-new-tokens 200</code>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
</div>
|
| 215 |
</div>
|
| 216 |
|
| 217 |
<div class="footer">
|
| 218 |
+
Code license: MIT · Model license: Model Agreement · Commercial use supported<br>
|
| 219 |
+
huggingface.co/teamzero/astrox
|
| 220 |
</div>
|
| 221 |
|
| 222 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|