Upload 3 files
Browse files
- config.json +15 -0
- export_onnx.py +52 -0
- quantize_onnx.py +16 -0
config.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"n_layer": 24,
|
| 3 |
+
"n_head": 16,
|
| 4 |
+
"n_embd": 1024,
|
| 5 |
+
"block_size": 1024,
|
| 6 |
+
"bias": false,
|
| 7 |
+
"vocab_size": 50304,
|
| 8 |
+
"dropout": 0.0,
|
| 9 |
+
"model_type": "gpt2",
|
| 10 |
+
"architectures": [
|
| 11 |
+
"GPT"
|
| 12 |
+
],
|
| 13 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 14 |
+
"model_name": "SmaLLMPro-350M"
|
| 15 |
+
}
|
export_onnx.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Export a GPT checkpoint to a single, self-contained ONNX file.

Loads a nanoGPT-style checkpoint, strips the ``torch.compile`` key prefix,
exports the model at opset 18, then re-saves the proto so that all weights
live inside one ``.onnx`` file instead of external data files.
"""
import torch
import os
import onnx  # needed for the single-file re-save step below
from model import GPTConfig, GPT

# Paths
ckpt_path = '/media/leo/Data/checkpoints/350m_SmaLLMPro_Final/SmaLLMPro_Final.pt'
out_path_full = 'SmaLLMPro_350M.onnx'
device = 'cpu'

# 1. Load the checkpoint
print(f"Lade Checkpoint: {ckpt_path}")
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted checkpoints.
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])
model = GPT(gptconf)

# Strip the '_orig_mod.' prefix that torch.compile adds to state-dict keys.
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k in list(state_dict):  # copy of the keys: we mutate the dict while iterating
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

model.load_state_dict(state_dict)
model.eval()

# 2. Dummy input: one full context window of random token ids
x = torch.randint(0, gptconf.vocab_size, (1, gptconf.block_size), dtype=torch.long)

# 3. ONNX export
print("Exportiere sauberes ONNX Modell (Opset 18)...")
torch.onnx.export(
    model,
    (x,),
    out_path_full,
    export_params=True,
    opset_version=18,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['logits'],
    # dynamic_axes deliberately omitted: forcing dynamic shapes makes the
    # exporter build a needlessly complex graph, which often causes the
    # weights to be split into external data files.
)

# 4. The single-file fix: reload and re-save so everything ends up in one file
print("Erzwinge Speicherung in einer einzelnen Datei...")
try:
    model_proto = onnx.load(out_path_full)
    # onnx.save without a 'location' parameter tries to write everything into a single .onnx file
    onnx.save(model_proto, "SmaLLMPro_350M_Final.onnx")
    print("✅ Full Precision Modell erfolgreich als Einzeldokument gespeichert: SmaLLMPro_350M_Final.onnx")
except Exception as e:
    print(f"⚠️ Hinweis: Single-File Save fehlgeschlagen (evtl. doch über 2GB?). Fehler: {e}")
|
quantize_onnx.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Dynamically quantize the exported ONNX model to INT8 weights."""
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

# Input (fp32) and output (int8) model files
model_fp32 = "SmaLLMPro_350M.onnx"
model_int8 = "SmaLLMPro_350M_int8.onnx"

print(f"📦 Quantisiere {model_fp32} zu INT8...")

# Dynamic quantization: weights are stored as INT8 in the output file,
# activations are quantized on the fly at inference time.
quantize_dynamic(
    model_input=model_fp32,
    model_output=model_int8,
    # QInt8 is the recommended trade-off between speed and precision
    weight_type=QuantType.QInt8,
)

print(f"✅ Fertig! Quantisiertes Modell: {model_int8}")
|