drbh (HF Staff) committed (verified)
Commit 218c358 · Parent(s): 55589b1

Upload folder using huggingface_hub

Files changed (1):
1. note_test_override.md (+261 −0)

note_test_override.md ADDED
@@ -0,0 +1,261 @@

---
title: "uvnote Integration Test Report"
author: "uvnote"
theme: "light"
syntax_theme: "monokai"
show_line_numbers: true
collapse_code: false
custom_css: |
  #output-setup {
    overflow-x: auto;
  }
  .cell-stdout {
    width: 100%;
  }
  .cell-stderr {
    width: max-content;
    max-height: 300px;
    overflow: auto;
  }
---

```python id=setup
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "accelerate>=1.10.1",
#   "torch>=2.7.0",
#   "kernels==0.10.0",
#   "transformers@https://github.com/huggingface/transformers.git",
#   "ipdb>=0.13.13",
#   "matplotlib>=3.7.2",
#   "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc
import logging

# enable INFO-level logging
logging.basicConfig(level=logging.INFO)

def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)


from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode

from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm

replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")   # direct, type-safe
replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
custom_mapping = {
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
}
register_kernel_mapping(custom_mapping)


model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 512

with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
```
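
The cell above defines `reset_peak_memory_stats()` and `get_memory_stats()` but never calls them. A minimal sketch of how they could be wrapped around the generation step, reusing the objects defined in that cell (illustrative only, not one of the report's executed cells):

```python
# Illustrative sketch: reuse the memory helpers defined in the setup cell to
# report the footprint of the Yamoe-kernel generation pass.
reset_peak_memory_stats()  # empty the CUDA cache and reset the peak counter

with torch.inference_mode():
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )

stats = get_memory_stats()  # values are in GB (decimal, divided by 1e9)
print(
    f"allocated={stats['allocated_gb']:.2f} GB, "
    f"peak={stats['peak_gb']:.2f} GB, "
    f"reserved={stats['reserved_gb']:.2f} GB"
)
```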

# Reference kernel

```python id=setup2
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "accelerate>=1.10.1",
#   "torch>=2.7.0",
#   "kernels==0.10.0",
#   "transformers@https://github.com/huggingface/transformers.git",
#   "ipdb>=0.13.13",
#   "matplotlib>=3.7.2",
#   "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc
import logging

# enable INFO-level logging
logging.basicConfig(level=logging.INFO)

def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)


from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode

from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm

replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
custom_mapping = {
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
}
register_kernel_mapping(custom_mapping)


model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 512

with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
```
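
Both cells also define `override_kernel_layer_name()` without ever invoking it. As a hedged sketch, it could be used to point `GptOssMLP` at the `"Yamoe"` layer by patching the class attribute instead of calling `replace_kernel_forward_from_hub`, assuming the kernels integration reads `kernel_layer_name` when the model is loaded with `use_kernels=True`:

```python
# Illustrative sketch (not executed in this report): patch the kernel_layer_name
# class attribute via the helper defined above. Both the class name "GptOssMLP"
# and the layer name "Yamoe" come from the cells above; this would have to run
# before GptOssForCausalLM.from_pretrained(..., use_kernels=True).
if override_kernel_layer_name("GptOssMLP", "Yamoe"):
    print("kernel_layer_name patched on GptOssMLP")
else:
    print("GptOssMLP is not defined in any imported module")
```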