samcheng0 committed on
Commit
121f32b
·
verified ·
1 Parent(s): f36316e

Delete deeplm/scripts/upload_to_hf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. deeplm/scripts/upload_to_hf.py +0 -275
deeplm/scripts/upload_to_hf.py DELETED
@@ -1,275 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Upload Deeplm model to HuggingFace Hub.
4
-
5
- Uploads:
6
- - Model weights (from /vol/checkpoints/final.pt or best.pt)
7
- - All model code (deeplm/)
8
- - Config (config.yaml)
9
- - Tokenizer (if available)
10
- - Model card (README.md)
11
-
12
- Usage:
13
- python scripts/upload_to_hf.py --repo-id samcheng0/deeplm-98m --token hf_xxx
14
- """
15
- import argparse
16
- import json
17
- import os
18
- from pathlib import Path
19
-
20
- from huggingface_hub import HfApi, login, create_repo
21
-
22
-
23
def create_model_card(repo_id: str, params: int, vocab_size: int, **kwargs) -> str:
    """Render the README.md model card as a single Markdown string.

    Only ``params`` and ``vocab_size`` are interpolated into the text (both
    with thousands separators). ``repo_id`` and any extra keyword arguments
    are accepted but do not influence the output.

    Args:
        repo_id: Target repository id (currently unused by the template).
        params: Total parameter count shown in the architecture table.
        vocab_size: Vocabulary size shown in the architecture table.
        **kwargs: Ignored.

    Returns:
        The complete model card, starting with the YAML front matter.
    """
    # Plain (non-f) template: the two placeholders are filled in by
    # ``str.format`` below, using the same ``:,`` format spec.
    card_template = """---
language: id
license: apache-2.0
library_name: transformers
tags:
- indonesian
- language-model
- moe
- mla
- hyper-connections
- lightning-attention
- multi-token-prediction
base_model: "none"
---

# Deeplm-98M

Indonesian language model with novel architecture combining MLA, MoE, Hyper-Connections, Hybrid Attention, and Multi-Token Prediction.

## Architecture

| Component | Detail |
|-----------|--------|
| **Total Parameters** | {params:,} |
| **Vocabulary** | {vocab_size:,} (BBPE) |
| **Layers** | 8 Transformer blocks |
| **Hidden Size** | 384 |
| **Attention** | MLA (Multi-head Latent Attention) |
| **FFN** | MoE (4 routed + 1 shared experts, top-k=2) |
| **Residual** | Hyper-Connections with Sinkhorn routing |
| **Linear Attention** | LightningAttentionV2 (5/8 layers) |
| **Prediction** | MTP (Multi-Token Prediction, depth=2) |
| **Embeddings** | Tied (shared between input/output) |

## Key Innovations

### 1. Multi-head Latent Attention (MLA)
Compressed KV cache via low-rank latent space (~10x memory savings vs MHA).
- Q: hidden → q_lora_rank(192) → num_heads × head_dim
- KV: hidden → kv_lora_rank(128) + rope_dim(32) → decompressed to full heads
- Decoupled RoPE on small portion of Q/K

### 2. Mixture of Experts (MoE)
- 4 routed experts + 1 shared expert (always active)
- Top-k=2 routing with sqrt(softplus) scoring
- Bias-based load balancing (no auxiliary loss)
- Grouped dispatch via sorting for efficient processing

### 3. Hyper-Connections
Replaces standard residuals with learned routing over 4 connection types:
- Identity, Transform, Gate, Skip
- Sinkhorn-Knopp normalization for doubly-stochastic weights
- Input-dependent routing with type biases

### 4. Hybrid Attention
- 3 softmax layers (0, 4, 7): Standard MLA
- 5 linear layers (1, 2, 3, 5, 6): MLA + LightningAttentionV2 blend
- LightningAttentionV2: O(n) complexity with incremental KV state

### 5. Multi-Token Prediction (MTP)
- 2 prediction depths per MTP layer
- RoPE positional encoding (reduced dim for efficiency)
- Skip connections in projections
- Tied LM head for parameter sharing

## Training

- **Dataset**: GSM8K (7.5K) + CEFR CEP (3.48M) interleaved
- **Tokenizer**: 128K BBPE (KBBI + Corpus-Indonesia + WordNet)
- **Optimizer**: AdamW with cosine LR schedule
- **Batch Size**: 2 × 16 grad_accum = 32 effective
- **Sequence Length**: 2048
- **Learning Rate**: 8e-4 with 3% warmup

## Usage

```python
import torch
from deeplm.config import DeeplmConfig
from deeplm.model.deeplm import DeeplmModel

# Load model
config = DeeplmConfig()
model = DeeplmModel(config)
model.load_state_dict(torch.load("model.pt", map_location="cpu"), strict=False)
model.eval()

# Generate
input_ids = torch.tensor([[1, 2, 3]]) # tokenized input
output = model.generate(input_ids, max_new_tokens=128, do_sample=True, temperature=0.7)
```

## Files

- `model.pt` — Model weights
- `config.yaml` — Model configuration
- `tokenizer.json` — Tokenizer (128K vocab)
- `deeplm/` — Full model code
"""
    return card_template.format(params=params, vocab_size=vocab_size)
124
-
125
-
126
def _find_optimizer_state(model_path):
    """Return the first existing optimizer-state file for ``model_path``, or None.

    The file derived from the checkpoint name is preferred; the generic
    ``opt.pt`` locations are only consulted as fallbacks, so a stale generic
    file can no longer shadow the one that matches the checkpoint.
    """
    candidates = []
    if model_path:
        checkpoint = Path(model_path)
        # "step-1000.pt" -> "opt-1000.pt"; "final.pt" -> "opt-final.pt".
        tag = checkpoint.stem.split("-")[-1].replace(".pt", "")
        candidates.append(checkpoint.parent / f"opt-{tag}.pt")
    candidates.extend(Path(p) for p in ("/root/deeplm/opt.pt", "opt.pt"))
    return next((c for c in candidates if c.exists()), None)


def upload_model(repo_id: str, token: str = None, model_path: str = None,
                 tokenizer_path: str = None, config_yaml: str = None):
    """Upload model weights, optimizer state, tokenizer, config, code and README.

    Each artifact is uploaded only if its file exists locally; missing
    artifacts are reported and skipped. Paths under ``deeplm/`` are resolved
    relative to the current working directory.

    Args:
        repo_id: Target model repository, e.g. ``"user/name"``.
        token: Access token; when truthy it is passed to ``login`` first.
        model_path: Local checkpoint uploaded as ``model.pt``.
        tokenizer_path: Local tokenizer file uploaded as ``tokenizer.json``.
            When absent or missing, two default locations are tried.
        config_yaml: Local YAML config uploaded as ``config.yaml``.

    Returns:
        None. Returns early (after printing the error) if the repository
        cannot be created.
    """
    if token:
        login(token)

    api = HfApi()

    def push(local_path, path_in_repo):
        # Single place for the upload call so every artifact uses the same kwargs.
        api.upload_file(
            path_or_fileobj=str(local_path),
            path_in_repo=path_in_repo,
            repo_id=repo_id,
        )

    # Create repo if not exists
    try:
        create_repo(repo_id, repo_type="model", exist_ok=True)
        print(f"✓ Repo {repo_id} exists/created")
    except Exception as e:
        # Nothing below can succeed without the repo, so report and stop.
        print(f"✗ Failed to create repo: {e}")
        return

    # Upload model weights
    if model_path and Path(model_path).exists():
        print(f"Uploading model weights: {model_path}")
        push(model_path, "model.pt")
        print(" ✓ model.pt")
    else:
        print(" ⚠ No model weights found (skipping)")

    # Upload optimizer state (for resume training)
    opt_path = _find_optimizer_state(model_path)
    if opt_path is not None:
        print(f"Uploading optimizer state: {opt_path}")
        push(opt_path, "optimizer.pt")
        print(" ✓ optimizer.pt")
    else:
        print(" ⚠ No optimizer state found (skipping)")

    # Upload tokenizer: explicit path first, then the default locations.
    if tokenizer_path and Path(tokenizer_path).exists():
        print(f"Uploading tokenizer: {tokenizer_path}")
        push(tokenizer_path, "tokenizer.json")
        print(" ✓ tokenizer.json")
    else:
        for p in ["/vol/tokenizer/tokenizer.json", "tokenizer/tokenizer.json"]:
            if Path(p).exists():
                push(p, "tokenizer.json")
                print(f" ✓ tokenizer.json (from {p})")
                break
        else:
            # Previously a silent skip; warn like the other artifacts do.
            print(" ⚠ No tokenizer found (skipping)")

    # Upload config YAML
    if config_yaml and Path(config_yaml).exists():
        push(config_yaml, "config.yaml")
        print(" ✓ config.yaml")
    else:
        print(" ⚠ No config YAML found (skipping)")

    # Upload model code
    print("Uploading model code...")
    code_dirs = [
        "deeplm/model",
        "deeplm/training",
        "deeplm/data",
        "deeplm/inference",
        "deeplm/self_evolution",
    ]
    for d in code_dirs:
        if not Path(d).exists():
            continue
        # sorted() makes the upload order deterministic across runs.
        for f in sorted(Path(d).rglob("*.py")):
            if "__pycache__" in str(f):
                continue
            # Repo paths must use forward slashes, also on Windows.
            repo_path = f.as_posix()
            push(f, repo_path)
            print(f" ✓ {repo_path}")

    # Upload the package-level modules that live directly under deeplm/.
    for module in ("deeplm/config.py", "deeplm/__init__.py"):
        if Path(module).exists():
            push(module, module)
            print(f" ✓ {module}")

    # Create and upload model card
    card = create_model_card(repo_id, params=98_237_216, vocab_size=128_000)
    api.upload_file(
        path_or_fileobj=card.encode(),
        path_in_repo="README.md",
        repo_id=repo_id,
    )
    print(" ✓ README.md")

    print(f"\n✓ Upload complete! View at: https://huggingface.co/{repo_id}")
251
-
252
-
253
if __name__ == "__main__":
    # Command-line entry point: parse options, pick a checkpoint if none was
    # given, then hand everything to upload_model().
    cli = argparse.ArgumentParser()
    cli.add_argument("--repo-id", default="samcheng0/deeplm-98m")
    cli.add_argument("--token", default=os.environ.get("HF_TOKEN"))
    cli.add_argument(
        "--model-path",
        default=None,
        help="Path to model.pt (default: check /vol/checkpoints/)",
    )
    cli.add_argument(
        "--tokenizer-path",
        default=None,
        help="Path to tokenizer.json",
    )
    cli.add_argument("--config-yaml", default="configs/train_kbi.yaml")
    options = cli.parse_args()

    # Auto-detect model path: first candidate that exists wins; when none
    # exists the value is left exactly as parsed.
    if not options.model_path:
        checkpoint_candidates = (
            "/vol/checkpoints/final.pt",
            "/vol/checkpoints/best.pt",
            "model.pt",
        )
        options.model_path = next(
            (c for c in checkpoint_candidates if Path(c).exists()),
            options.model_path,
        )

    upload_model(
        repo_id=options.repo_id,
        token=options.token,
        model_path=options.model_path,
        tokenizer_path=options.tokenizer_path,
        config_yaml=options.config_yaml,
    )