Commit 73f8595 (verified), committed by drbh (HF Staff)
Parent(s): 93c5002

Upload folder using huggingface_hub
index.html CHANGED
@@ -17,8 +17,8 @@
 <body>
 <h1>Index of /</h1>
 <ul>
-<li><a href='.venv/index.html' class='dir'>.venv/</a></li>
-<li><a href='moe_benchmarks/index.html' class='dir'>moe_benchmarks/</a></li>
+<li><a href='megablocks/index.html' class='dir'>megablocks/</a></li>
+<li><a href='megablocks_yamoe/index.html' class='dir'>megablocks_yamoe/</a></li>
 </ul>
 </body>
 </html>
megablocks/cells/forward_and_backward.py ADDED
@@ -0,0 +1,196 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "accelerate>=1.10.1",
+#     "torch>=2.7.0",
+#     "kernels==0.10.0",
+#     "transformers@https://github.com/huggingface/transformers.git",
+#     "ipdb>=0.13.13",
+#     "matplotlib>=3.7.2",
+#     "numpy>=1.24.3",
+# ]
+# ///
+
+import torch
+from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+import time
+import torch.nn as nn
+from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
+import sys
+import torch.profiler
+import gc
+import logging
+from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
+
+# remove liger kernel for testing
+replace_kernel_forward_from_hub(GptOssRMSNorm, None)
+
+# set logging level to INFO
+logging.basicConfig(level=logging.INFO)
+
+def reset_peak_memory_stats():
+    """Clear CUDA cache and reset memory allocation counters."""
+    torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+    gc.collect()
+
+def get_memory_stats():
+    """Get current and peak CUDA memory usage."""
+    if not torch.cuda.is_available():
+        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+    return {
+        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+    }
+
+def override_kernel_layer_name(cls_name: str, value) -> bool:
+    """Helper to dynamically override the kernel_layer_name in a model class."""
+    for mod in sys.modules.values():
+        if mod is None:
+            continue
+        obj = getattr(mod, cls_name, None)
+        if isinstance(obj, type) and issubclass(obj, nn.Module):
+            setattr(obj, "kernel_layer_name", value)
+            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+            return True
+    return False
+
+
+# Init the model the normal way
+model_id = "openai/gpt-oss-20b"
+tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+quantization_config = Mxfp4Config(dequantize=True)
+
+model = GptOssForCausalLM.from_pretrained(
+    model_id,
+    dtype="bfloat16",
+    device_map="auto",
+    use_kernels=True,
+    quantization_config=quantization_config,
+).eval()
+
+messages = [
+    {"role": "system", "content": "What is Tensor Parallelism?"},
+]
+
+inputs = tokenizer.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    return_tensors="pt",
+    return_dict=True,
+    reasoning_effort="low",
+).to("cuda")
+
+max_tokens = 128  # Reduced to help with memory usage
+
+# Clear memory before generation
+reset_peak_memory_stats()
+print(f"Pre-generation memory: {get_memory_stats()}")
+
+# forward and backward pass
+with torch.autograd.set_grad_enabled(True):
+    start_time = time.perf_counter()
+    generated = model.generate(
+        **inputs,
+        max_new_tokens=max_tokens,
+        do_sample=False,
+        temperature=None,
+    )
+    end_time = time.perf_counter()
+    print(tokenizer.decode(generated[0], skip_special_tokens=False))
+    print(f"Generation took {end_time - start_time:.2f} seconds")
+    print(f"Post-generation memory: {get_memory_stats()}")
+
+# Use gradient checkpointing to reduce memory usage
+if hasattr(model, 'gradient_checkpointing_enable'):
+    model.gradient_checkpointing_enable()
+    print("Enabled gradient checkpointing")
+
+# Reduce sequence length if needed for memory
+max_seq_len = 512  # Limit sequence length for backward pass
+if generated.size(1) > max_seq_len:
+    print(f"Truncating sequence from {generated.size(1)} to {max_seq_len} tokens")
+    full_sequence = generated[:, -max_seq_len:]
+else:
+    full_sequence = generated
+
+# Get model outputs for the full sequence
+model.train()  # Enable dropout and other training behaviors
+
+try:
+    outputs = model(
+        input_ids=full_sequence,
+        labels=full_sequence,  # This will compute loss internally
+        return_dict=True
+    )
+    print(f"Post-forward memory: {get_memory_stats()}")
+
+    # If model doesn't compute loss, compute it manually
+    if outputs.loss is None:
+        shift_logits = outputs.logits[..., :-1, :].contiguous()
+        shift_labels = full_sequence[..., 1:].contiguous()
+
+        # Use CrossEntropyLoss with ignore_index for padding tokens
+        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -100)
+        loss = loss_fct(
+            shift_logits.view(-1, shift_logits.size(-1)),
+            shift_labels.view(-1)
+        )
+    else:
+        loss = outputs.loss
+
+    print(f"Loss: {loss.item():.4f}")
+
+    # Clear intermediate tensors to save memory
+    del outputs
+    torch.cuda.empty_cache()
+
+    # Perform backward pass with memory management
+    print("Running backward pass...")
+    print(f"Pre-backward memory: {get_memory_stats()}")
+
+    loss.backward()
+    print(f"Post-backward memory: {get_memory_stats()}")
+
+except torch.cuda.OutOfMemoryError as e:
+    print(f"OOM during forward/backward pass: {e}")
+    print("Try reducing max_tokens or max_seq_len")
+    raise
+
+# Calculate gradient statistics and print sample gradients
+total_norm = 0.0
+param_count = 0
+grad_samples = {}
+
+for name, p in model.named_parameters():
+    if p.grad is not None:
+        param_count += 1
+        grad_norm = p.grad.data.norm(2).item()
+        total_norm += grad_norm ** 2
+
+        # Collect gradient statistics for key layers
+        if any(key in name for key in ['embed', 'lm_head', 'mlp.up', 'mlp.down', 'self_attn.q_proj', 'norm']):
+            grad_samples[name] = {
+                'norm': grad_norm,
+                'mean': p.grad.data.mean().item(),
+                'std': p.grad.data.std().item(),
+                'max': p.grad.data.max().item(),
+                'min': p.grad.data.min().item(),
+            }
+
+total_norm = total_norm ** 0.5
+
+print(f"\nGradient norm: {total_norm:.4f}")
+print(f"Parameters with gradients: {param_count}")
+
+# Print sample gradients from important layers
+print("\nSample gradient statistics:")
+for i, (name, stats) in enumerate(list(grad_samples.items())[:10]):
+    print(f"  {name[:60]:<60} | norm: {stats['norm']:.4e} | mean: {stats['mean']:.4e} | std: {stats['std']:.4e}")
+
+# Optional: zero gradients for next iteration
+model.zero_grad()
+model.eval()  # Switch back to eval mode
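
Illustrative sketch (not part of the commit) of the manual-loss fallback above: the logits are shifted one position against the labels so position t predicts token t+1. Toy shapes, standalone:

    import torch

    vocab, seq = 8, 5
    logits = torch.randn(1, seq, vocab)         # model outputs per position
    labels = torch.randint(0, vocab, (1, seq))  # the sequence itself, as in the script

    # drop the last logit and the first label to align predictions with targets
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    loss = torch.nn.CrossEntropyLoss()(
        shift_logits.view(-1, vocab),
        shift_labels.view(-1),
    )
    print(loss.item())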
megablocks/cells/forward_and_backward_no_kernel.py ADDED
@@ -0,0 +1,196 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "accelerate>=1.10.1",
+#     "torch>=2.7.0",
+#     "kernels==0.10.0",
+#     "transformers@https://github.com/huggingface/transformers.git",
+#     "ipdb>=0.13.13",
+#     "matplotlib>=3.7.2",
+#     "numpy>=1.24.3",
+# ]
+# ///
+
+import torch
+from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+import time
+import torch.nn as nn
+from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
+import sys
+import torch.profiler
+import gc
+import logging
+from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
+
+# remove liger kernel for testing
+replace_kernel_forward_from_hub(GptOssRMSNorm, None)
+
+# set logging level to INFO
+logging.basicConfig(level=logging.INFO)
+
+def reset_peak_memory_stats():
+    """Clear CUDA cache and reset memory allocation counters."""
+    torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+    gc.collect()
+
+def get_memory_stats():
+    """Get current and peak CUDA memory usage."""
+    if not torch.cuda.is_available():
+        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+    return {
+        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+    }
+
+def override_kernel_layer_name(cls_name: str, value) -> bool:
+    """Helper to dynamically override the kernel_layer_name in a model class."""
+    for mod in sys.modules.values():
+        if mod is None:
+            continue
+        obj = getattr(mod, cls_name, None)
+        if isinstance(obj, type) and issubclass(obj, nn.Module):
+            setattr(obj, "kernel_layer_name", value)
+            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+            return True
+    return False
+
+
+# Init the model the normal way
+model_id = "openai/gpt-oss-20b"
+tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+quantization_config = Mxfp4Config(dequantize=True)
+
+model = GptOssForCausalLM.from_pretrained(
+    model_id,
+    dtype="bfloat16",
+    device_map="auto",
+    use_kernels=False,
+    quantization_config=quantization_config,
+).eval()
+
+messages = [
+    {"role": "system", "content": "What is Tensor Parallelism?"},
+]
+
+inputs = tokenizer.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    return_tensors="pt",
+    return_dict=True,
+    reasoning_effort="low",
+).to("cuda")
+
+max_tokens = 128  # Reduced to help with memory usage
+
+# Clear memory before generation
+reset_peak_memory_stats()
+print(f"Pre-generation memory: {get_memory_stats()}")
+
+# forward and backward pass
+with torch.autograd.set_grad_enabled(True):
+    start_time = time.perf_counter()
+    generated = model.generate(
+        **inputs,
+        max_new_tokens=max_tokens,
+        do_sample=False,
+        temperature=None,
+    )
+    end_time = time.perf_counter()
+    print(tokenizer.decode(generated[0], skip_special_tokens=False))
+    print(f"Generation took {end_time - start_time:.2f} seconds")
+    print(f"Post-generation memory: {get_memory_stats()}")
+
+# Use gradient checkpointing to reduce memory usage
+if hasattr(model, 'gradient_checkpointing_enable'):
+    model.gradient_checkpointing_enable()
+    print("Enabled gradient checkpointing")
+
+# Reduce sequence length if needed for memory
+max_seq_len = 512  # Limit sequence length for backward pass
+if generated.size(1) > max_seq_len:
+    print(f"Truncating sequence from {generated.size(1)} to {max_seq_len} tokens")
+    full_sequence = generated[:, -max_seq_len:]
+else:
+    full_sequence = generated
+
+# Get model outputs for the full sequence
+model.train()  # Enable dropout and other training behaviors
+
+try:
+    outputs = model(
+        input_ids=full_sequence,
+        labels=full_sequence,  # This will compute loss internally
+        return_dict=True
+    )
+    print(f"Post-forward memory: {get_memory_stats()}")
+
+    # If model doesn't compute loss, compute it manually
+    if outputs.loss is None:
+        shift_logits = outputs.logits[..., :-1, :].contiguous()
+        shift_labels = full_sequence[..., 1:].contiguous()
+
+        # Use CrossEntropyLoss with ignore_index for padding tokens
+        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -100)
+        loss = loss_fct(
+            shift_logits.view(-1, shift_logits.size(-1)),
+            shift_labels.view(-1)
+        )
+    else:
+        loss = outputs.loss
+
+    print(f"Loss: {loss.item():.4f}")
+
+    # Clear intermediate tensors to save memory
+    del outputs
+    torch.cuda.empty_cache()
+
+    # Perform backward pass with memory management
+    print("Running backward pass...")
+    print(f"Pre-backward memory: {get_memory_stats()}")
+
+    loss.backward()
+    print(f"Post-backward memory: {get_memory_stats()}")
+
+except torch.cuda.OutOfMemoryError as e:
+    print(f"OOM during forward/backward pass: {e}")
+    print("Try reducing max_tokens or max_seq_len")
+    raise
+
+# Calculate gradient statistics and print sample gradients
+total_norm = 0.0
+param_count = 0
+grad_samples = {}
+
+for name, p in model.named_parameters():
+    if p.grad is not None:
+        param_count += 1
+        grad_norm = p.grad.data.norm(2).item()
+        total_norm += grad_norm ** 2
+
+        # Collect gradient statistics for key layers
+        if any(key in name for key in ['embed', 'lm_head', 'mlp.up', 'mlp.down', 'self_attn.q_proj', 'norm']):
+            grad_samples[name] = {
+                'norm': grad_norm,
+                'mean': p.grad.data.mean().item(),
+                'std': p.grad.data.std().item(),
+                'max': p.grad.data.max().item(),
+                'min': p.grad.data.min().item(),
+            }
+
+total_norm = total_norm ** 0.5
+
+print(f"\nGradient norm: {total_norm:.4f}")
+print(f"Parameters with gradients: {param_count}")
+
+# Print sample gradients from important layers
+print("\nSample gradient statistics:")
+for i, (name, stats) in enumerate(list(grad_samples.items())[:10]):
+    print(f"  {name[:60]:<60} | norm: {stats['norm']:.4e} | mean: {stats['mean']:.4e} | std: {stats['std']:.4e}")
+
+# Optional: zero gradients for next iteration
+model.zero_grad()
+model.eval()  # Switch back to eval mode
megablocks/cells/no_kernels.py ADDED
@@ -0,0 +1,98 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "accelerate>=1.10.1",
+#     "torch>=2.7.0",
+#     "kernels==0.10.0",
+#     "transformers@https://github.com/huggingface/transformers.git",
+#     "ipdb>=0.13.13",
+#     "matplotlib>=3.7.2",
+#     "numpy>=1.24.3",
+# ]
+# ///
+
+import torch
+from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+import time
+import torch.nn as nn
+from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
+import sys
+import torch.profiler
+import gc
+import logging
+from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
+
+# set logging level to INFO
+logging.basicConfig(level=logging.INFO)
+
+def reset_peak_memory_stats():
+    """Clear CUDA cache and reset memory allocation counters."""
+    torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+    gc.collect()
+
+def get_memory_stats():
+    """Get current and peak CUDA memory usage."""
+    if not torch.cuda.is_available():
+        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+    return {
+        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+    }
+
+def override_kernel_layer_name(cls_name: str, value) -> bool:
+    """Helper to dynamically override the kernel_layer_name in a model class."""
+    for mod in sys.modules.values():
+        if mod is None:
+            continue
+        obj = getattr(mod, cls_name, None)
+        if isinstance(obj, type) and issubclass(obj, nn.Module):
+            setattr(obj, "kernel_layer_name", value)
+            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+            return True
+    return False
+
+
+# Init the model the normal way
+model_id = "openai/gpt-oss-20b"
+tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+quantization_config = Mxfp4Config(dequantize=True)
+
+
+
+model = GptOssForCausalLM.from_pretrained(
+    model_id,
+    dtype="bfloat16",
+    device_map="auto",
+    use_kernels=False,
+    quantization_config=quantization_config,
+).eval()
+
+messages = [
+    {"role": "system", "content": "What is Tensor Parallelism?"},
+]
+
+inputs = tokenizer.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    return_tensors="pt",
+    return_dict=True,
+    reasoning_effort="low",
+).to("cuda")
+
+max_tokens = 256
+
+with torch.inference_mode():
+    start_time = time.perf_counter()
+    generated = model.generate(
+        **inputs,
+        max_new_tokens=max_tokens,
+        do_sample=False,
+        temperature=None,
+    )
+    end_time = time.perf_counter()
+
+print(tokenizer.decode(generated[0], skip_special_tokens=False))
+print(f"Generation took {end_time - start_time:.2f} seconds")
megablocks/index.html ADDED
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset='UTF-8'>
+<title>Directory Index</title>
+<style>
+body { font-family: monospace; margin: 20px; }
+h1 { font-size: 1.5em; }
+ul { list-style-type: none; padding-left: 20px; }
+li { margin: 5px 0; }
+.dir { font-weight: bold; }
+.file { color: #0066cc; }
+a { text-decoration: none; }
+a:hover { text-decoration: underline; }
+</style>
+</head>
+<body>
+<h1>Index of /megablocks</h1>
+<ul>
+<li><a href='../index.html' class='dir'>../</a></li>
+<li><a href='megablocks_only.html' class='file'>megablocks_only.html</a></li>
+</ul>
+</body>
+</html>
megablocks/megablocks_only.html ADDED
The diff for this file is too large to render.
 
megablocks_yamoe/artifacts/binned_run/binned_results.json ADDED
@@ -0,0 +1,24 @@
+{
+  "implementation": "binned_results",
+  "config": {
+    "warmup": 10,
+    "iters": 50,
+    "device": "cuda",
+    "dtype": "torch.float32",
+    "tokens": 100,
+    "vary_inputs": true
+  },
+  "stats": {
+    "avg_ms": 36.478538259971174,
+    "min_ms": 33.54985800024224,
+    "max_ms": 39.617000999896845,
+    "std_ms": 1.5870638955537886,
+    "p50_ms": 36.43554149994088,
+    "p95_ms": 39.16828469987195,
+    "p99_ms": 39.47986176004633,
+    "num_iters": 50,
+    "tokens_per_s": 2741.3379145658514,
+    "throughput_variance": 118.95302366172646
+  },
+  "output_sum": 3.97190523147583
+}
megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json ADDED
@@ -0,0 +1,24 @@
+{
+  "implementation": "gptoss_results",
+  "config": {
+    "warmup": 10,
+    "iters": 50,
+    "device": "cuda",
+    "dtype": "torch.float32",
+    "tokens": 100,
+    "vary_inputs": true
+  },
+  "stats": {
+    "avg_ms": 45.01105025997276,
+    "min_ms": 39.02894699967874,
+    "max_ms": 49.29527800004507,
+    "std_ms": 2.979711623110132,
+    "p50_ms": 45.6719464998514,
+    "p95_ms": 48.48902935004844,
+    "p99_ms": 49.0557057300839,
+    "num_iters": 50,
+    "tokens_per_s": 2221.6766643396363,
+    "throughput_variance": 151.30753386326467
+  },
+  "output_sum": 11.53223705291748
+}
megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json ADDED
@@ -0,0 +1,24 @@
+{
+  "implementation": "gptoss_training_results",
+  "config": {
+    "warmup": 10,
+    "iters": 50,
+    "device": "cuda",
+    "dtype": "torch.float32",
+    "tokens": 100,
+    "vary_inputs": true
+  },
+  "stats": {
+    "avg_ms": 44.678819580012714,
+    "min_ms": 38.108840999939275,
+    "max_ms": 49.00846700002148,
+    "std_ms": 2.8989978625015023,
+    "p50_ms": 45.39998149971325,
+    "p95_ms": 48.408032500015,
+    "p99_ms": 48.790303320047315,
+    "num_iters": 50,
+    "tokens_per_s": 2238.197001174478,
+    "throughput_variance": 150.30214966250284
+  },
+  "output_sum": 11.53223705291748
+}
megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json ADDED
@@ -0,0 +1,24 @@
+{
+  "implementation": "yamoe_results",
+  "config": {
+    "warmup": 10,
+    "iters": 50,
+    "device": "cuda",
+    "dtype": "torch.float32",
+    "tokens": 100,
+    "vary_inputs": true
+  },
+  "stats": {
+    "avg_ms": 4.249353080003857,
+    "min_ms": 4.130692999751773,
+    "max_ms": 4.30493799967735,
+    "std_ms": 0.027547357116313485,
+    "p50_ms": 4.2500710001149855,
+    "p95_ms": 4.289417000063622,
+    "p99_ms": 4.299768499754464,
+    "num_iters": 50,
+    "tokens_per_s": 23532.993873954394,
+    "throughput_variance": 154.3815152476545
+  },
+  "output_sum": 3.971905469894409
+}
megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc ADDED
Binary file (16.1 kB).
 
megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc ADDED
Binary file (680 Bytes).
 
megablocks_yamoe/cells/bench_utils.py ADDED
@@ -0,0 +1,241 @@
+# /// script
+# dependencies = [
+#     "torch",
+#     "numpy",
+# ]
+# ///
+
+"""Reusable benchmarking utilities for performance testing."""
+import time
+import numpy as np
+from contextlib import contextmanager
+from typing import Callable, Dict, Tuple, Any, Optional
+import torch
+
+def to_dtype(dtype_str: str):
+    """Convert string to torch dtype."""
+    if dtype_str == "float16":
+        return torch.float16
+    if dtype_str == "bfloat16":
+        return torch.bfloat16
+    return torch.float32
+
+def _sync(device: str):
+    """Synchronize device if CUDA."""
+    if device == "cuda":
+        torch.cuda.synchronize()
+
+def _compute_stats(times_s, tokens: Optional[int] = None) -> Dict[str, float]:
+    """Compute comprehensive latency and throughput statistics."""
+    lat_ms = np.array([t * 1000.0 for t in times_s])
+    lat_ms_sorted = np.sort(lat_ms)
+    n = len(lat_ms)
+
+    stats = {
+        "avg_ms": np.mean(lat_ms),
+        "min_ms": np.min(lat_ms),
+        "max_ms": np.max(lat_ms),
+        "std_ms": np.std(lat_ms),
+        "p50_ms": np.percentile(lat_ms, 50),
+        "p95_ms": np.percentile(lat_ms, 95),
+        "p99_ms": np.percentile(lat_ms, 99),
+        "num_iters": n
+    }
+
+    if tokens is not None and n > 0:
+        avg_s = np.mean(times_s)
+        stats["tokens_per_s"] = tokens / avg_s if avg_s > 0 else float("inf")
+        stats["throughput_variance"] = np.std([tokens / t for t in times_s if t > 0])
+
+    return stats
+
+def _format_timing_stats(stats: Dict[str, float], tokens: Optional[int] = None) -> str:
+    """Format timing statistics for display."""
+    lines = [
+        "\n━━━━━━━━━━━━━━━━━━━━ Benchmark Results ━━━━━━━━━━━━━━━━━━━━",
+        f"Iterations: {stats.get('num_iters', 0)}",
+        "\nLatency Statistics:",
+        f"  Average: {stats['avg_ms']:.3f} ms",
+        f"  Min:     {stats['min_ms']:.3f} ms",
+        f"  Max:     {stats['max_ms']:.3f} ms",
+        f"  Std Dev: {stats['std_ms']:.3f} ms",
+        "\nPercentiles:",
+        f"  P50 (median): {stats['p50_ms']:.3f} ms",
+        f"  P95:          {stats['p95_ms']:.3f} ms",
+        f"  P99:          {stats['p99_ms']:.3f} ms",
+    ]
+
+    if tokens is not None and 'tokens_per_s' in stats:
+        lines.extend([
+            "\nThroughput:",
+            f"  Tokens/sec: {stats['tokens_per_s']:.1f}",
+            f"  Std Dev:    {stats.get('throughput_variance', 0):.1f}",
+        ])
+
+    lines.append("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
+    return "\n".join(lines)
+
+def _bench_engine(
+    call: Callable[[], Any], *, warmup: int, iters: int, device: str, dtype, input_gen: Callable[[], Any] = None
+) -> Tuple[Any, list]:
+    """Core benchmarking engine with warmup and timing."""
+    use_autocast = device == "cuda" and dtype in (torch.float16, torch.bfloat16)
+
+    # Warmup phase
+    print(f"\nWarming up ({warmup} iterations)...")
+    with torch.inference_mode():
+        for _ in range(max(0, warmup)):
+            if use_autocast:
+                with torch.autocast(device_type="cuda", dtype=dtype):
+                    if input_gen is not None:
+                        _ = call(input_gen())
+                    else:
+                        _ = call()
+            else:
+                if input_gen is not None:
+                    _ = call(input_gen())
+                else:
+                    _ = call()
+    _sync(device)
+
+    # Measurement phase
+    print(f"Benchmarking ({iters} iterations)...")
+    times_s = []
+    last = None
+    with torch.inference_mode():
+        for i in range(max(1, iters)):
+            start = time.perf_counter()
+            if use_autocast:
+                with torch.autocast(device_type="cuda", dtype=dtype):
+                    if input_gen is not None:
+                        last = call(input_gen())
+                    else:
+                        last = call()
+            else:
+                if input_gen is not None:
+                    last = call(input_gen())
+                else:
+                    last = call()
+            _sync(device)
+            end = time.perf_counter()
+            times_s.append(end - start)
+
+            # Progress indicator every 20% of iterations
+            if i > 0 and i % max(1, iters // 5) == 0:
+                pct = (i / iters) * 100
+                avg_so_far = np.mean(times_s[:i]) * 1000
+                print(f"  Progress: {pct:.0f}% complete (avg: {avg_so_far:.3f} ms)")
+
+    return last, times_s
+
+def tensor_stats(t: torch.Tensor) -> str:
+    """Generate comprehensive stats string for a tensor."""
+    return (f"shape={tuple(t.shape)}, "
+            f"dtype={t.dtype}, "
+            f"device={t.device}, "
+            f"range=[{t.min().item():.6f}, {t.max().item():.6f}], "
+            f"mean={t.mean().item():.6f}, "
+            f"std={t.std().item():.6f}, "
+            f"norm={t.norm().item():.6f}")
+
+@contextmanager
+def bench_context(
+    *, warmup: int = 25, iters: int = 100, device: str = "cuda", dtype=torch.float32, tokens: Optional[int] = None, verbose: bool = True, save_json: Optional[str] = None, vary_inputs: bool = True
+):
+    """Context that yields a runner: runner(fn, *args, **kwargs) -> (result, stats).
+
+    If vary_inputs=True, the first argument should be a base tensor that will be varied each iteration
+    by adding a small deterministic increment to prevent caching artifacts.
+    """
+
+    def runner(fn: Callable[..., Any], *args, **kwargs) -> Tuple[Any, Dict[str, float]]:
+        # Log configuration
+        if verbose:
+            print(f"\n┌─ Benchmark Configuration ─────────────────────────────┐")
+            # print(f"│ Device: {device:<15} Dtype: {dtype} │")
+            print(f"│ Warmup: {warmup:<15} Iters: {iters} │")
+            if tokens:
+                print(f"│ Tokens: {tokens} │")
+            if vary_inputs:
+                print(f"│ Input Variation: Enabled (prevents caching artifacts) │")
+            print(f"└────────────────────────────────────────────────────────┘")
+
+        # Set up input generation
+        input_gen = None
+        if vary_inputs and args and isinstance(args[0], torch.Tensor):
+            base_input = args[0].clone()
+            iteration_counter = [0]  # Use list for mutable closure
+
+            def generate_varied_input():
+                """Generate input tensor varied by iteration to prevent caching."""
+                # Add small deterministic increment: 0.001 * iteration_number
+                varied_input = base_input + (iteration_counter[0] * 0.001)
+                iteration_counter[0] += 1
+                return varied_input
+
+            input_gen = generate_varied_input
+            call = lambda x: fn(x, *args[1:], **kwargs)
+
+            # Log base input stats
+            if verbose:
+                print(f"\nBase Input: {tensor_stats(base_input)}")
+                print(f"Input Variation: +{0.001:.3f} * iteration (deterministic)")
+        else:
+            # Legacy mode - static inputs
+            call = lambda: fn(*args, **kwargs)
+            if verbose and args and isinstance(args[0], torch.Tensor):
+                print(f"\nInput: {tensor_stats(args[0])}")
+
+        result, times_s = _bench_engine(call, warmup=warmup, iters=iters, device=device, dtype=dtype, input_gen=input_gen)
+
+        # Log output if it's a tensor or tuple with tensors
+        if verbose:
+            print("\nOutput tensors:")
+            if isinstance(result, torch.Tensor):
+                print(f"  Primary: {tensor_stats(result)}")
+            elif isinstance(result, tuple) and len(result) > 0 and isinstance(result[0], torch.Tensor):
+                print(f"  Primary: {tensor_stats(result[0])}")
+                if len(result) > 1:
+                    if isinstance(result[1], torch.Tensor):
+                        print(f"  Auxiliary: {tensor_stats(result[1])}")
+                    else:
+                        print(f"  Auxiliary: {type(result[1]).__name__}")
+
+        # Compute and display statistics
+        stats = _compute_stats(times_s, tokens=tokens)
+        if verbose:
+            print(_format_timing_stats(stats, tokens))
+
+        # Save to JSON if requested
+        if save_json:
+            import json
+            json_data = {
+                "implementation": save_json.replace(".json", ""),
+                "config": {
+                    "warmup": warmup,
+                    "iters": iters,
+                    "device": str(device),  # Convert device to string
+                    "dtype": str(dtype),
+                    "tokens": tokens,
+                    "vary_inputs": vary_inputs
+                },
+                "stats": stats,
+                "output_sum": float(result[0].sum().item()) if isinstance(result, tuple) and len(result) > 0 else float(result.sum().item()) if isinstance(result, torch.Tensor) else None
+            }
+            with open(save_json, 'w') as f:
+                json.dump(json_data, f, indent=2)
+            if verbose:
+                print(f"\nSaved benchmark results to {save_json}")
+
+        return result, stats
+
+    yield runner
+
+def set_seed(seed: int):
+    """Set seeds for reproducibility."""
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
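
Illustrative usage sketch for this cell (not part of the commit; assumes a CUDA device and any nn.Module that takes a single tensor):

    import torch
    from bench_utils import bench_context, set_seed

    set_seed(0)
    model = torch.nn.Linear(1152, 1152).cuda()
    x = torch.randn(100, 1152, device="cuda")

    # the yielded runner returns (result, stats); with the default vary_inputs=True
    # the first positional tensor is perturbed each iteration to defeat caching
    with bench_context(warmup=5, iters=20, device="cuda", tokens=100) as bench:
        out, stats = bench(model, x)
    print(stats["avg_ms"], stats["tokens_per_s"])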
megablocks_yamoe/cells/binned_run.py ADDED
@@ -0,0 +1,195 @@
+# /// script
+# dependencies = [
+#     "torch",
+#     "numpy",
+# ]
+# ///
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+from config import (
+    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+)
+from pathlib import Path
+import os
+
+# Discover the upstream artifact directory from env
+data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+print("Loaded shared weights from artifacts")
+print(f"Router weight sum: {router_weight.sum().item():.6f}")
+print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+print(f"Down sum: {down_proj.sum().item():.6f}")
+
+def binned_gather(x, indices, bins, expert_capacity, top_k):
+    E, H = bins.shape[0], x.shape[1]
+    out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype)
+    for e in range(E):
+        start = 0 if e == 0 else bins[e - 1]
+        end = bins[e]
+        n = min(end - start, expert_capacity)
+        for i in range(n):
+            flat_pos = indices[start + i]
+            tok = flat_pos // top_k
+            out[e, i] = x[tok]
+    return out
+
+def binned_scatter(x, indices, weights, bins, expert_capacity, top_k):
+    E, C, H = x.shape
+    N = indices.shape[0] // top_k
+    out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device)
+    for e in range(E):
+        start = 0 if e == 0 else bins[e - 1]
+        end = bins[e]
+        n = end - start
+        if n == 0:
+            continue
+        take = min(n, expert_capacity)
+        for i in range(take):
+            flat_pos = indices[start + i]
+            tok = flat_pos // top_k
+            slot = flat_pos % top_k
+            scale = weights[flat_pos] if weights is not None else 1.0
+            out[tok, slot] = x[e, i] * scale
+    return out.sum(dim=1)
+
+def sort_tokens_by_expert(router_indices, num_experts):
+    flat_indices = router_indices.flatten()
+    sorted_values, sorted_indices = torch.sort(flat_indices)
+    tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts)
+    bins = torch.cumsum(tokens_per_expert, dim=0)
+    return sorted_indices, sorted_values, bins, tokens_per_expert
+
+def binned_experts_ref(
+    hidden_states,
+    router_indices,
+    routing_weights,
+    gate_up_proj,
+    gate_up_proj_bias,
+    down_proj,
+    down_proj_bias,
+    expert_capacity,
+):
+    B, S, H = hidden_states.shape
+    E, K = routing_weights.shape[1], router_indices.shape[1]
+
+    indices, _, bins, _ = sort_tokens_by_expert(router_indices, E)
+    x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K)
+
+    gate_up = torch.bmm(x, gate_up_proj)
+    gate_up += gate_up_proj_bias[..., None, :]
+
+    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+
+    # clamp to limit
+    limit = 7.0
+    gate = gate.clamp(min=None, max=limit)
+    up = up.clamp(min=-limit, max=limit)
+
+    glu = gate * torch.sigmoid(gate * 1.702)
+    x = (up + 1) * glu
+    x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :]
+
+    # build routing weights aligned to (token, slot)
+    flat_dense = routing_weights.view(-1, E)
+    flat_router = router_indices.view(-1, K)
+    selected = torch.gather(flat_dense, 1, flat_router).reshape(-1)
+
+    # scatter back
+    y = binned_scatter(x, indices, selected, bins, expert_capacity, K)
+
+    return y.view(B, S, H)
+
+class BinnedRouter(nn.Module):
+    def __init__(self, router_weight, router_bias):
+        super().__init__()
+        self.top_k = TOP_K
+        self.num_experts = NUM_EXPERTS
+        self.hidden_dim = HIDDEN_SIZE
+        self.weight = nn.Parameter(router_weight.clone())
+        self.bias = nn.Parameter(router_bias.clone())
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+        return router_scores, router_indices
+
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+class BinnedMoEMLP(nn.Module):
+    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.router = BinnedRouter(router_weight, router_bias)
+        self.num_experts = NUM_EXPERTS
+        self.hidden_size = HIDDEN_SIZE
+        self.top_k = TOP_K
+
+        # Expert weights - use the loaded weights
+        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+        self.down_proj = nn.Parameter(down_proj.clone())
+        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+
+    def forward(self, hidden_states):
+        router_scores, router_indices = self.router(hidden_states)
+        batch_size = hidden_states.shape[0]
+        expert_capacity = ceil_div(batch_size * self.top_k, self.num_experts)
+
+        output = binned_experts_ref(
+            hidden_states,
+            router_indices,
+            router_scores,
+            self.gate_up_proj,
+            self.gate_up_proj_bias,
+            self.down_proj,
+            self.down_proj_bias,
+            expert_capacity,
+        )
+
+        return output, router_scores
+
+# Run the model
+set_seed(GENERAL_SEED)
+
+device = torch.device(DEVICE)
+dtype = to_dtype(DTYPE)
+
+print("\n=== Binned Implementation ===")
+# Initialize model with loaded weights
+model = BinnedMoEMLP(
+    router_weight.to(device),
+    router_bias.to(device),
+    gate_up_proj.to(device),
+    gate_up_proj_bias.to(device),
+    down_proj.to(device),
+    down_proj_bias.to(device)
+).to(device=device)
+
+print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+print(f"Gate/up proj sum: {model.gate_up_proj.sum().item():.6f}")
+print(f"Down proj sum: {model.down_proj.sum().item():.6f}")
+
+# Generate the same input as Yamoe
+set_seed(INPUT_SEED)
+x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+# Benchmark the model with varied inputs to prevent caching artifacts
+tokens = BATCH_SIZE * SEQ_LEN
+with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="binned_results.json", vary_inputs=True) as bench:
+    output, stats = bench(model, x)
+    print(f"\nOutput sum: {output[0].sum().item():.6f}")
megablocks_yamoe/cells/config.py ADDED
@@ -0,0 +1,27 @@
+# /// script
+# dependencies = [
+#     "torch",
+#     "numpy",
+# ]
+# ///
+
+"""Shared configuration for both implementations."""
+import torch
+
+# Model configuration
+NUM_EXPERTS = 128
+HIDDEN_SIZE = 1152
+INTERMEDIATE_SIZE = 3072
+TOP_K = 4
+
+# Input configuration
+BATCH_SIZE = 1
+SEQ_LEN = 100
+DTYPE = "float32"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Seeds for reproducibility
+WEIGHT_SEED = 999
+EXPERT_SEED = 777
+INPUT_SEED = 123
+GENERAL_SEED = 42
megablocks_yamoe/cells/gptoss_run.py ADDED
@@ -0,0 +1,147 @@
+# /// script
+# dependencies = [
+#     "torch",
+#     "numpy",
+# ]
+# ///
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+from config import (
+    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+)
+from pathlib import Path
+import os
+
+# Discover the upstream artifact directory from env
+data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+print("Loaded shared weights from artifacts")
+print(f"Router weight sum: {router_weight.sum().item():.6f}")
+print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+print(f"Down sum: {down_proj.sum().item():.6f}")
+
+class GptOssRouter(nn.Module):
+    def __init__(self, router_weight, router_bias):
+        super().__init__()
+        self.top_k = TOP_K
+        self.num_experts = NUM_EXPERTS
+        self.hidden_dim = HIDDEN_SIZE
+        self.weight = nn.Parameter(router_weight.clone())
+        self.bias = nn.Parameter(router_bias.clone())
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+        return router_scores, router_indices
+
+class GptOssExperts(nn.Module):
+    def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.num_experts = NUM_EXPERTS
+        self.hidden_size = HIDDEN_SIZE
+        self.expert_dim = self.hidden_size
+        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+        self.down_proj = nn.Parameter(down_proj.clone())
+        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+        self.alpha = 1.702
+        self.limit = 7.0
+
+    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)
+        num_experts = routing_weights.shape[1]
+
+        if hidden_states.device.type == "cpu" or self.training:
+            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+            with torch.no_grad():
+                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
+                expert_mask = expert_mask.permute(2, 1, 0)
+                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+
+            for expert_idx in expert_hit[:]:
+                expert_idx = expert_idx[0]
+                with torch.no_grad():
+                    _, token_idx = torch.where(expert_mask[expert_idx])
+                current_state = hidden_states[token_idx]
+                gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
+                gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+                gate = gate.clamp(min=None, max=self.limit)
+                up = up.clamp(min=-self.limit, max=self.limit)
+                glu = gate * torch.sigmoid(gate * self.alpha)
+                gated_output = (up + 1) * glu
+                out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
+                weighted_output = out * routing_weights[token_idx, expert_idx, None]
+                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+            next_states = next_states.view(batch_size, -1, self.hidden_size)
+        else:
+            hidden_states = hidden_states.repeat(num_experts, 1)
+            hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
+            gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
+            gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+            gate = gate.clamp(min=None, max=self.limit)
+            up = up.clamp(min=-self.limit, max=self.limit)
+            glu = gate * torch.sigmoid(gate * self.alpha)
+            next_states = torch.bmm(((up + 1) * glu), self.down_proj)
+            next_states = next_states + self.down_proj_bias[..., None, :]
+            next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
+            next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
+            next_states = next_states.sum(dim=0)
+        return next_states
+
+class GptOssMoEMLP(nn.Module):
+    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.router = GptOssRouter(router_weight, router_bias)
+        self.experts = GptOssExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)
+
+    def forward(self, hidden_states):
+        router_scores, router_indices = self.router(hidden_states)
+        routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
+        return routed_out, router_scores
+
+# Run the model
+set_seed(GENERAL_SEED)
+
+device = torch.device(DEVICE)
+dtype = to_dtype(DTYPE)
+
+print("\n=== GPT-OSS Implementation ===")
+# Initialize model with loaded weights
+model = GptOssMoEMLP(
+    router_weight.to(device),
+    router_bias.to(device),
+    gate_up_proj.to(device),
+    gate_up_proj_bias.to(device),
+    down_proj.to(device),
+    down_proj_bias.to(device)
+).to(device=device)
+
+print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
+print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")
+
+# Generate the same input as other implementations
+set_seed(INPUT_SEED)
+x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+# Benchmark the model with varied inputs to prevent caching artifacts
+tokens = BATCH_SIZE * SEQ_LEN
+with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="gptoss_results.json", vary_inputs=True) as bench:
+    output, stats = bench(model, x)
+    print(f"\nOutput sum: {output[0].sum().item():.6f}")
megablocks_yamoe/cells/gptoss_training_run.py ADDED
@@ -0,0 +1,138 @@
+# /// script
+# dependencies = [
+#     "torch",
+#     "numpy",
+# ]
+# ///
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+from config import (
+    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+)
+from pathlib import Path
+import os
+
+# Discover the upstream artifact directory from env
+data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+print("Loaded shared weights from artifacts")
+print(f"Router weight sum: {router_weight.sum().item():.6f}")
+print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+print(f"Down sum: {down_proj.sum().item():.6f}")
+
+class GptOssTrainingRouter(nn.Module):
+    def __init__(self, router_weight, router_bias):
+        super().__init__()
+        self.top_k = TOP_K
+        self.num_experts = NUM_EXPERTS
+        self.hidden_dim = HIDDEN_SIZE
+        self.weight = nn.Parameter(router_weight.clone())
+        self.bias = nn.Parameter(router_bias.clone())
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+        return router_scores, router_indices
+
+class GptOssTrainingExperts(nn.Module):
+    def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.num_experts = NUM_EXPERTS
+        self.hidden_size = HIDDEN_SIZE
+        self.expert_dim = self.hidden_size
+        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+        self.down_proj = nn.Parameter(down_proj.clone())
+        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+        self.alpha = 1.702
+        self.limit = 7.0
+
+    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)
+        num_experts = routing_weights.shape[1]
+
+        # Force training mode path (expert loop instead of batched)
+        next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+        with torch.no_grad():
+            expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
+            expert_mask = expert_mask.permute(2, 1, 0)
+            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+
+        for expert_idx in expert_hit[:]:
+            expert_idx = expert_idx[0]
+            with torch.no_grad():
+                _, token_idx = torch.where(expert_mask[expert_idx])
+            current_state = hidden_states[token_idx]
+            gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
+            gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+            gate = gate.clamp(min=None, max=self.limit)
+            up = up.clamp(min=-self.limit, max=self.limit)
+            glu = gate * torch.sigmoid(gate * self.alpha)
+            gated_output = (up + 1) * glu
+            out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
+            weighted_output = out * routing_weights[token_idx, expert_idx, None]
+            next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+        next_states = next_states.view(batch_size, -1, self.hidden_size)
+        return next_states
+
+class GptOssTrainingMoEMLP(nn.Module):
+    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.router = GptOssTrainingRouter(router_weight, router_bias)
+        self.experts = GptOssTrainingExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)
+
+    def forward(self, hidden_states):
+        router_scores, router_indices = self.router(hidden_states)
+        routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
+        return routed_out, router_scores
+
+# Run the model
+set_seed(GENERAL_SEED)
+
+device = torch.device(DEVICE)
+dtype = to_dtype(DTYPE)
+
+print("\n=== GPT-OSS Implementation (Training Mode - Expert Loop) ===")
+# Initialize model with loaded weights and force training mode
+model = GptOssTrainingMoEMLP(
+    router_weight.to(device),
+    router_bias.to(device),
+    gate_up_proj.to(device),
+    gate_up_proj_bias.to(device),
+    down_proj.to(device),
+    down_proj_bias.to(device)
+).to(device=device)
+
+# Set to training mode to force expert loop path
+model.train()
+
+print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
+print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")
+print(f"Model training mode: {model.training}")
+
+# Generate the same input as other implementations
+set_seed(INPUT_SEED)
+x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+# Benchmark the model with varied inputs to prevent caching artifacts
+tokens = BATCH_SIZE * SEQ_LEN
+with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="gptoss_training_results.json", vary_inputs=True) as bench:
+    output, stats = bench(model, x)
+    print(f"\nOutput sum: {output[0].sum().item():.6f}")
megablocks_yamoe/cells/megablocks_run.py ADDED
@@ -0,0 +1,103 @@
+ # /// script
+ # dependencies = [
+ #     "torch",
+ #     "numpy",
+ #     "kernels",
+ # ]
+ # ///
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from kernels import get_kernel, get_local_kernel
+ from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+ from config import (
+     NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+     BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+     WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+ )
+ from pathlib import Path
+ from collections import namedtuple
+ import os
+
+ # Discover the upstream artifact directory from env
+ data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+ print(f"Loading weights from: {data_dir}")
+
+ router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+ router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+ gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+ gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+ down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+ down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+ print("Loaded shared weights from artifacts")
+ print(f"Router weight sum: {router_weight.sum().item():.6f}")
+ print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+ print(f"Down sum: {down_proj.sum().item():.6f}")
+
+ def build_megablocks_model(device: torch.device):
+     # Download optimized kernels from the Hugging Face hub
+     megablocks = get_kernel("kernels-community/megablocks", revision="v0.0.2")
+     model = megablocks.layers.MegaBlocksMoeMLP()
+
+     # Attribute container for expert weights; note the namedtuple *class*
+     # itself is used here as a plain attribute holder
+     model.experts = namedtuple(
+         "Experts", ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"]
+     )
+
+     # Use the loaded router weights for consistency across implementations
+     model.router = torch.nn.Linear(HIDDEN_SIZE, NUM_EXPERTS, device=device)
+     with torch.no_grad():
+         model.router.weight.copy_(router_weight)
+         model.router.bias.copy_(router_bias)
+
+     # Attach loaded expert weights to the experts container
+     e = model.experts
+     e.alpha = 1.702
+     e.capacity_factor = 128
+     e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
+     e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
+     e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
+     e.down_proj_bias = torch.nn.Parameter(down_proj_bias.clone().to(device))
+     e.hidden_size = HIDDEN_SIZE
+
+     # Log weight statistics for comparison
+     print(f"[MegaBlocks] Router weight sum: {model.router.weight.sum().item():.6f}")
+     print(f"[MegaBlocks] Gate/up projection shape: {tuple(e.gate_up_proj.shape)}, sum: {e.gate_up_proj.sum().item():.6f}")
+     print(f"[MegaBlocks] Down projection shape: {tuple(e.down_proj.shape)}, sum: {e.down_proj.sum().item():.6f}")
+
+     return model
+
+ # Wrapper to match the interface of the other implementations
+ class MegaBlocksMoEWrapper(nn.Module):
+     def __init__(self, megablocks_model):
+         super().__init__()
+         self.model = megablocks_model
+
+     def forward(self, hidden_states):
+         # MegaBlocks expects input of shape (batch, seq_len, hidden_dim)
+         output, dummy_routing_weights = self.model(hidden_states)
+         return output, dummy_routing_weights
+
+ # Run the model
+ set_seed(GENERAL_SEED)
+
+ device = torch.device(DEVICE)
+ dtype = to_dtype(DTYPE)
+
+ print("\n=== MegaBlocks Implementation ===")
+ # Build the MegaBlocks model with the loaded weights
+ megablocks_model = build_megablocks_model(device)
+ model = MegaBlocksMoEWrapper(megablocks_model).to(device=device)
+
+ # Generate the same input as the other implementations
+ set_seed(INPUT_SEED)
+ x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+ # Benchmark the model with varied inputs to prevent caching artifacts
+ tokens = BATCH_SIZE * SEQ_LEN
+ with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="megablocks_results.json", vary_inputs=True) as bench:
+     output, stats = bench(model, x)
+     print(f"\nOutput sum: {output[0].sum().item():.6f}")
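One note on the construction above: `model.experts` is assigned the namedtuple class itself, which is then used as a mutable attribute container (extra fields such as `alpha` and `capacity_factor` are attached afterwards). An equivalent and arguably clearer container, were one writing this from scratch, is `types.SimpleNamespace`:

from types import SimpleNamespace

# Hypothetical alternative container; values mirror the script above,
# and HIDDEN_SIZE is assumed to come from the same config module.
experts = SimpleNamespace(
    alpha=1.702,
    capacity_factor=128,
    hidden_size=HIDDEN_SIZE,
)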
megablocks_yamoe/cells/nv.py ADDED
@@ -0,0 +1,3 @@
+ import subprocess
+
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
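Where `nvidia-smi` is not on the PATH, roughly the same device summary can be pulled from torch directly (a sketch, not part of the committed cell):

import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"GPU {i}: {props.name}, {props.total_memory / 1e9:.1f} GB")
else:
    print("CUDA not available")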
megablocks_yamoe/cells/save_data.py ADDED
@@ -0,0 +1,42 @@
+ # /// script
+ # dependencies = [
+ #     "torch",
+ #     "numpy",
+ # ]
+ # ///
+
+ """
+ Generate deterministic shared weights once and save them as artifacts so
+ that both implementations load identical parameters.
+ """
+ import torch
+ from config import NUM_EXPERTS, HIDDEN_SIZE, WEIGHT_SEED, EXPERT_SEED
+
+ def save_shared_weights():
+     # Router: Kaiming-uniform weight, as used by both implementations; zero bias
+     torch.manual_seed(WEIGHT_SEED)
+     router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE)
+     torch.nn.init.kaiming_uniform_(router_weight)
+     router_bias = torch.zeros(NUM_EXPERTS)
+
+     # Experts: normal(0, 0.02) weights; zero biases
+     torch.manual_seed(EXPERT_SEED)
+     gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+     gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE)
+     down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+     down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE)
+
+     # Save artifacts
+     torch.save(router_weight, 'router_weight.pt')
+     torch.save(router_bias, 'router_bias.pt')
+     torch.save(gate_up_proj, 'gate_up_proj.pt')
+     torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt')
+     torch.save(down_proj, 'down_proj.pt')
+     torch.save(down_proj_bias, 'down_proj_bias.pt')
+
+     print("Saved shared weights to artifacts")
+     print(f"Router weight sum: {router_weight.sum().item():.6f}")
+     print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+     print(f"Down sum: {down_proj.sum().item():.6f}")
+
+ save_shared_weights()
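The `config` module imported throughout these cells is not shown in this commit view. A plausible sketch of its shape, with illustrative values only (the committed config cell defines the actual ones), is:

# config.py (illustrative values — not the committed ones)
NUM_EXPERTS = 128
HIDDEN_SIZE = 1152
TOP_K = 4
BATCH_SIZE = 1
SEQ_LEN = 100
DTYPE = "float32"
DEVICE = "cuda"
WEIGHT_SEED = 999
EXPERT_SEED = 777
INPUT_SEED = 123
GENERAL_SEED = 42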
megablocks_yamoe/cells/setup.py ADDED
@@ -0,0 +1,116 @@
+ # /// script
+ # requires-python = ">=3.12"
+ # dependencies = [
+ #     "accelerate>=1.10.1",
+ #     "torch>=2.7.0",
+ #     "kernels==0.10.0",
+ #     "transformers@https://github.com/huggingface/transformers.git",
+ #     "ipdb>=0.13.13",
+ #     "matplotlib>=3.7.2",
+ #     "numpy>=1.24.3",
+ # ]
+ # ///
+
+ import torch
+ from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+ import time
+ import torch.nn as nn
+ from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
+ import sys
+ import torch.profiler
+ import gc
+ import logging
+
+ # Set INFO-level logging
+ logging.basicConfig(level=logging.INFO)
+
+ def reset_peak_memory_stats():
+     """Clear CUDA cache and reset memory allocation counters."""
+     torch.cuda.empty_cache()
+     if torch.cuda.is_available():
+         torch.cuda.reset_peak_memory_stats()
+     gc.collect()
+
+ def get_memory_stats():
+     """Get current and peak CUDA memory usage."""
+     if not torch.cuda.is_available():
+         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+     return {
+         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+     }
+
+ def override_kernel_layer_name(cls_name: str, value) -> bool:
+     """Dynamically override kernel_layer_name on a model class, if it can be found."""
+     for mod in sys.modules.values():
+         if mod is None:
+             continue
+         obj = getattr(mod, cls_name, None)
+         if isinstance(obj, type) and issubclass(obj, nn.Module):
+             setattr(obj, "kernel_layer_name", value)
+             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+             return True
+     return False
+
+
+ # Initialize the model the normal way
+ model_id = "openai/gpt-oss-20b"
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+ quantization_config = Mxfp4Config(dequantize=True)
+
+ from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
+
+ # Route the MoE MLP through the "Yamoe" kernel mapping; keep RMSNorm on the stock forward
+ replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")
+ replace_kernel_forward_from_hub(GptOssRMSNorm, None)
+ custom_mapping = {
+     "Yamoe": {
+         "cuda": {
+             Mode.INFERENCE: LayerRepository(
+                 repo_id="drbh/yamoe",
+                 layer_name="Yamoe",
+                 revision="v0.3.0",
+             )
+         }
+     }
+ }
+ register_kernel_mapping(custom_mapping)
+
+ model = GptOssForCausalLM.from_pretrained(
+     model_id,
+     dtype="bfloat16",
+     device_map="auto",
+     use_kernels=True,
+     quantization_config=quantization_config,
+ ).eval()
+
+ messages = [
+     {"role": "system", "content": "What is Tensor Parallelism?"},
+ ]
+
+ inputs = tokenizer.apply_chat_template(
+     messages,
+     add_generation_prompt=True,
+     return_tensors="pt",
+     return_dict=True,
+     reasoning_effort="low",
+ ).to("cuda")
+
+ max_tokens = 256
+
+ with torch.inference_mode():
+     start_time = time.perf_counter()
+     generated = model.generate(
+         **inputs,
+         max_new_tokens=max_tokens,
+         do_sample=False,
+         temperature=None,
+     )
+     end_time = time.perf_counter()
+
+ print(tokenizer.decode(generated[0], skip_special_tokens=False))
+ print(f"Generation took {end_time - start_time:.2f} seconds")
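The wall-clock figure printed above can be turned into a throughput number with the tensors already in scope (a small follow-on sketch):

# Count only the newly generated tokens, then derive tokens/second.
new_tokens = generated.shape[1] - inputs["input_ids"].shape[1]
elapsed = end_time - start_time
print(f"{new_tokens} new tokens in {elapsed:.2f}s -> {new_tokens / elapsed:.1f} tok/s")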
megablocks_yamoe/cells/setup2.py ADDED
@@ -0,0 +1,115 @@
+ # /// script
+ # requires-python = ">=3.12"
+ # dependencies = [
+ #     "accelerate>=1.10.1",
+ #     "torch>=2.7.0",
+ #     "kernels==0.10.0",
+ #     "transformers@https://github.com/huggingface/transformers.git",
+ #     "ipdb>=0.13.13",
+ #     "matplotlib>=3.7.2",
+ #     "numpy>=1.24.3",
+ # ]
+ # ///
+
+ import torch
+ from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+ import time
+ import torch.nn as nn
+ from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
+ import sys
+ import torch.profiler
+ import gc
+ import logging
+
+ # Set INFO-level logging
+ logging.basicConfig(level=logging.INFO)
+
+ def reset_peak_memory_stats():
+     """Clear CUDA cache and reset memory allocation counters."""
+     torch.cuda.empty_cache()
+     if torch.cuda.is_available():
+         torch.cuda.reset_peak_memory_stats()
+     gc.collect()
+
+ def get_memory_stats():
+     """Get current and peak CUDA memory usage."""
+     if not torch.cuda.is_available():
+         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+     return {
+         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+     }
+
+ def override_kernel_layer_name(cls_name: str, value) -> bool:
+     """Dynamically override kernel_layer_name on a model class, if it can be found."""
+     for mod in sys.modules.values():
+         if mod is None:
+             continue
+         obj = getattr(mod, cls_name, None)
+         if isinstance(obj, type) and issubclass(obj, nn.Module):
+             setattr(obj, "kernel_layer_name", value)
+             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+             return True
+     return False
+
+
+ # Initialize the model the normal way
+ model_id = "openai/gpt-oss-20b"
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+ quantization_config = Mxfp4Config(dequantize=True)
+
+ from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
+
+ # Unlike the previous cell, GptOssMLP keeps its default hub kernel here;
+ # only the RMSNorm hub kernel is disabled (direct, type-safe)
+ replace_kernel_forward_from_hub(GptOssRMSNorm, None)
+ custom_mapping = {
+     "Yamoe": {
+         "cuda": {
+             Mode.INFERENCE: LayerRepository(
+                 repo_id="drbh/yamoe",
+                 layer_name="Yamoe",
+                 revision="v0.3.0",
+             )
+         }
+     }
+ }
+ register_kernel_mapping(custom_mapping)
+
+ model = GptOssForCausalLM.from_pretrained(
+     model_id,
+     dtype="bfloat16",
+     device_map="auto",
+     use_kernels=True,
+     quantization_config=quantization_config,
+ ).eval()
+
+ messages = [
+     {"role": "system", "content": "What is Tensor Parallelism?"},
+ ]
+
+ inputs = tokenizer.apply_chat_template(
+     messages,
+     add_generation_prompt=True,
+     return_tensors="pt",
+     return_dict=True,
+     reasoning_effort="low",
+ ).to("cuda")
+
+ max_tokens = 256
+
+ with torch.inference_mode():
+     start_time = time.perf_counter()
+     generated = model.generate(
+         **inputs,
+         max_new_tokens=max_tokens,
+         do_sample=False,
+         temperature=None,
+     )
+     end_time = time.perf_counter()
+
+ print(tokenizer.decode(generated[0], skip_special_tokens=False))
+ print(f"Generation took {end_time - start_time:.2f} seconds")
megablocks_yamoe/cells/utils.py ADDED
@@ -0,0 +1,34 @@
+ # /// script
+ # dependencies = [
+ #     "torch",
+ #     "numpy",
+ # ]
+ # ///
+
+ """Simple utilities for running the models."""
+ import torch
+
+ def to_dtype(dtype_str: str):
+     """Convert a string name to a torch dtype."""
+     if dtype_str == "float16":
+         return torch.float16
+     if dtype_str == "bfloat16":
+         return torch.bfloat16
+     return torch.float32
+
+ def tensor_stats(t: torch.Tensor) -> str:
+     """Generate a summary-statistics string for a tensor."""
+     return (f"shape={tuple(t.shape)}, "
+             f"dtype={t.dtype}, "
+             f"device={t.device}, "
+             f"mean={t.mean().item():.6f}, "
+             f"std={t.std().item():.6f}")
+
+ def set_seed(seed: int):
+     """Set seeds for reproducibility."""
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
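Example usage of these helpers (illustrative; assumes the cell is importable as `utils`):

import torch
from utils import to_dtype, tensor_stats, set_seed

set_seed(0)
t = torch.randn(4, 8, dtype=to_dtype("float32"))
print(tensor_stats(t))  # e.g. shape=(4, 8), dtype=torch.float32, device=cpu, mean=..., std=...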
megablocks_yamoe/cells/yamoe_run.py ADDED
@@ -0,0 +1,135 @@
+ # /// script
+ # dependencies = [
+ #     "torch",
+ #     "kernels",
+ #     "numpy",
+ # ]
+ # ///
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from kernels import get_kernel, get_local_kernel
+ from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+ from config import (
+     NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+     BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+     WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+ )
+ from pathlib import Path
+ import os
+
+ # Discover the upstream artifact directory from env
+ data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+ print(f"Loading weights from: {data_dir}")
+
+ router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+ router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+ gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+ gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+ down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+ down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+ print("Loaded shared weights from artifacts")
+ print(f"Router weight sum: {router_weight.sum().item():.6f}")
+ print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+ print(f"Down sum: {down_proj.sum().item():.6f}")
+
+ class YamoeRouter(nn.Module):
+     def __init__(self, router_weight, router_bias):
+         super().__init__()
+         self.top_k = TOP_K
+         self.num_experts = NUM_EXPERTS
+         self.hidden_dim = HIDDEN_SIZE
+         self.weight = nn.Parameter(router_weight.clone())
+         self.bias = nn.Parameter(router_bias.clone())
+
+     def forward(self, hidden_states):
+         hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+         router_logits = F.linear(hidden_states, self.weight, self.bias)
+         # Top-k selection, softmax over the selected logits, then scatter
+         # back into a dense (tokens, num_experts) score matrix
+         router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+         router_top_value = F.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+         router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+         return router_scores, router_indices
+
+ def ceil_div(a, b):
+     return (a + b - 1) // b
+
+ class YamoeMoEMLP(nn.Module):
+     def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+         super().__init__()
+         self.router = YamoeRouter(router_weight, router_bias)
+         self.num_experts = NUM_EXPERTS
+         self.hidden_size = HIDDEN_SIZE
+         self.top_k = TOP_K
+
+         # Load the Yamoe kernel from the hub
+         # self.yamoe = get_local_kernel(Path("/home/ubuntu/Projects/yamoe/result"), "yamoe")
+         self.yamoe = get_kernel("drbh/yamoe", revision="v0.2.0")
+
+         # Expert weights - use the loaded shared weights
+         self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+         self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+         self.down_proj = nn.Parameter(down_proj.clone())
+         self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+
+     def forward(self, hidden_states):
+         batch_size, seq_len, hidden_dim = hidden_states.shape
+
+         # Get routing decisions
+         routing_weights, router_indices = self.router(hidden_states)
+
+         # Reshape for the Yamoe kernel
+         hidden_states_flat = hidden_states.view(-1, hidden_dim)
+         routing_weights_flat = routing_weights.view(-1, self.num_experts)
+         # Capacity per expert, scaled by batch size (not total token count)
+         expert_capacity = ceil_div(batch_size * self.top_k, self.num_experts)
+
+         # Call the optimized Yamoe experts kernel
+         output = self.yamoe.experts(
+             hidden_states_flat,
+             router_indices,
+             routing_weights_flat,
+             self.gate_up_proj,
+             self.gate_up_proj_bias,
+             self.down_proj,
+             self.down_proj_bias,
+             expert_capacity,
+             self.num_experts,
+             self.top_k,
+         )
+
+         # Reshape the output back to (batch, seq_len, hidden_dim)
+         output = output.view(batch_size, seq_len, hidden_dim)
+
+         return output, routing_weights
+
+ # Run the model
+ set_seed(GENERAL_SEED)
+
+ device = torch.device(DEVICE)
+ dtype = to_dtype(DTYPE)
+
+ print("\n=== Yamoe Implementation ===")
+ # Initialize the model with the loaded weights
+ model = YamoeMoEMLP(
+     router_weight.to(device),
+     router_bias.to(device),
+     gate_up_proj.to(device),
+     gate_up_proj_bias.to(device),
+     down_proj.to(device),
+     down_proj_bias.to(device)
+ ).to(device=device)
+
+ print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+ print(f"Gate/up proj sum: {model.gate_up_proj.sum().item():.6f}")
+ print(f"Down proj sum: {model.down_proj.sum().item():.6f}")
+
+ # Generate input
+ set_seed(INPUT_SEED)
+ x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+ # Benchmark the model with varied inputs to prevent caching artifacts
+ tokens = BATCH_SIZE * SEQ_LEN
+ with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="yamoe_results.json", vary_inputs=True) as bench:
+     output, stats = bench(model, x)
+     print(f"\nOutput sum: {output[0].sum().item():.6f}")
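A quick worked example of the capacity computation in `YamoeMoEMLP.forward` (illustrative numbers; note that `expert_capacity` scales with `batch_size`, not with the total token count `batch_size * seq_len`):

def ceil_div(a, b):
    return (a + b - 1) // b

# e.g. with top_k=4 routing over 128 experts:
assert ceil_div(1 * 4, 128) == 1    # batch_size=1  -> 1 slot per expert
assert ceil_div(64 * 4, 128) == 2   # batch_size=64 -> 2 slots per expert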
megablocks_yamoe/index.html ADDED
@@ -0,0 +1,25 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <meta charset='UTF-8'>
+     <title>Directory Index</title>
+     <style>
+         body { font-family: monospace; margin: 20px; }
+         h1 { font-size: 1.5em; }
+         ul { list-style-type: none; padding-left: 20px; }
+         li { margin: 5px 0; }
+         .dir { font-weight: bold; }
+         .file { color: #0066cc; }
+         a { text-decoration: none; }
+         a:hover { text-decoration: underline; }
+     </style>
+ </head>
+ <body>
+     <h1>Index of /megablocks_yamoe</h1>
+     <ul>
+         <li><a href='../index.html' class='dir'>../</a></li>
+         <li><a href='megablocks_yamoe.html' class='file'>megablocks_yamoe.html</a></li>
+         <li><a href='torch_profile.html' class='file'>torch_profile.html</a></li>
+     </ul>
+ </body>
+ </html>
megablocks_yamoe/megablocks_yamoe.html ADDED
The diff for this file is too large to render. See raw diff
 
megablocks_yamoe/torch_profile.html ADDED
The diff for this file is too large to render. See raw diff