Commit
·
d9908db
1
Parent(s):
c166ffe
Rotate examples and reduce overfit
Browse files
app.py
CHANGED
|
@@ -83,6 +83,8 @@ SCORE_SCALE = 3000.0
|
|
| 83 |
PARSE_REWARD = 0.02
|
| 84 |
API_REWARD = 0.05
|
| 85 |
EXEC_REWARD = 0.10
|
|
|
|
|
|
|
| 86 |
PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
|
| 87 |
ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
|
| 88 |
ADAPTER_DATASET_REPO = os.environ.get("ADAPTER_DATASET_REPO", "CreativeEngineer/vliw-optimizer-adapters")
|
|
@@ -465,7 +467,8 @@ def perf_takehome_reward_fn(completions, prompts=None, **kwargs):
|
|
| 465 |
text = str(completion)
|
| 466 |
|
| 467 |
code = extract_code_block(text)
|
| 468 |
-
|
|
|
|
| 469 |
|
| 470 |
reward = 0.0
|
| 471 |
if result.get("correctness", 0.0) > 0:
|
|
@@ -477,6 +480,8 @@ def perf_takehome_reward_fn(completions, prompts=None, **kwargs):
|
|
| 477 |
reward += API_REWARD
|
| 478 |
if result.get("exec_ok"):
|
| 479 |
reward += EXEC_REWARD
|
|
|
|
|
|
|
| 480 |
cycles = result.get("cycles")
|
| 481 |
with state_lock:
|
| 482 |
if isinstance(cycles, int) and cycles < training_state["best_cycles"]:
|
|
@@ -487,7 +492,8 @@ def perf_takehome_reward_fn(completions, prompts=None, **kwargs):
|
|
| 487 |
|
| 488 |
|
| 489 |
# Prompt template for VLIW optimization
|
| 490 |
-
|
|
|
|
| 491 |
```python
|
| 492 |
class OptimizedKernelBuilder(KernelBuilder):
|
| 493 |
def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
|
|
@@ -496,8 +502,8 @@ class OptimizedKernelBuilder(KernelBuilder):
|
|
| 496 |
def run():
|
| 497 |
return (0,)
|
| 498 |
```
|
| 499 |
-
|
| 500 |
-
Example with scratch +
|
| 501 |
```python
|
| 502 |
class OptimizedKernelBuilder(KernelBuilder):
|
| 503 |
def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
|
|
@@ -508,9 +514,49 @@ class OptimizedKernelBuilder(KernelBuilder):
|
|
| 508 |
def run():
|
| 509 |
return (0,)
|
| 510 |
```
|
| 511 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
|
| 513 |
-
|
|
|
|
|
|
|
| 514 |
|
| 515 |
ARCHITECTURE: 12 ALU + 6 VALU (VLEN=8) + 2 load + 2 store + 1 flow slots per cycle. 1536-word scratch.
|
| 516 |
|
|
@@ -542,10 +588,11 @@ RULES:
|
|
| 542 |
- class OptimizedKernelBuilder(KernelBuilder): override build_kernel() and emit instructions using add()/build()
|
| 543 |
- def run(): return any tuple (ignored), but must exist
|
| 544 |
- No imports.
|
|
|
|
| 545 |
|
| 546 |
Baseline: {BASELINE_CYCLES:,} cycles. Target: <{TARGET_CYCLES:,} cycles.
|
| 547 |
|
| 548 |
-
{
|
| 549 |
"""
|
| 550 |
|
| 551 |
|
|
@@ -620,7 +667,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
|
|
| 620 |
|
| 621 |
# Create dataset with prompts
|
| 622 |
add_log("Creating VLIW optimization dataset...")
|
| 623 |
-
prompts = [
|
| 624 |
dataset = Dataset.from_dict({"prompt": prompts})
|
| 625 |
add_log(f"[OK] Dataset ready: {len(prompts)} prompts")
|
| 626 |
|
|
@@ -734,7 +781,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
|
|
| 734 |
|
| 735 |
# Test generation
|
| 736 |
add_log("Testing trained model...")
|
| 737 |
-
inputs = tokenizer(
|
| 738 |
with torch.no_grad():
|
| 739 |
outputs = model.generate(
|
| 740 |
**inputs,
|
|
|
|
| 83 |
PARSE_REWARD = 0.02
|
| 84 |
API_REWARD = 0.05
|
| 85 |
EXEC_REWARD = 0.10
|
| 86 |
+
COPY_PENALTY = 0.05
|
| 87 |
+
SEED_POOL = [0, 1, 2, 3]
|
| 88 |
PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
|
| 89 |
ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
|
| 90 |
ADAPTER_DATASET_REPO = os.environ.get("ADAPTER_DATASET_REPO", "CreativeEngineer/vliw-optimizer-adapters")
|
|
|
|
| 467 |
text = str(completion)
|
| 468 |
|
| 469 |
code = extract_code_block(text)
|
| 470 |
+
seed = random.choice(SEED_POOL)
|
| 471 |
+
result = verify_perf_takehome_code(code, seed=seed)
|
| 472 |
|
| 473 |
reward = 0.0
|
| 474 |
if result.get("correctness", 0.0) > 0:
|
|
|
|
| 480 |
reward += API_REWARD
|
| 481 |
if result.get("exec_ok"):
|
| 482 |
reward += EXEC_REWARD
|
| 483 |
+
if code.strip() in EXAMPLE_CODE_SET:
|
| 484 |
+
reward = max(0.0, reward - COPY_PENALTY)
|
| 485 |
cycles = result.get("cycles")
|
| 486 |
with state_lock:
|
| 487 |
if isinstance(cycles, int) and cycles < training_state["best_cycles"]:
|
|
|
|
| 492 |
|
| 493 |
|
| 494 |
# Prompt template for VLIW optimization
|
| 495 |
+
EXAMPLE_POOL = [
|
| 496 |
+
"""Example format (not optimized):
|
| 497 |
```python
|
| 498 |
class OptimizedKernelBuilder(KernelBuilder):
|
| 499 |
def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
|
|
|
|
| 502 |
def run():
|
| 503 |
return (0,)
|
| 504 |
```
|
| 505 |
+
""",
|
| 506 |
+
"""Example with scratch + const:
|
| 507 |
```python
|
| 508 |
class OptimizedKernelBuilder(KernelBuilder):
|
| 509 |
def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
|
|
|
|
| 514 |
def run():
|
| 515 |
return (0,)
|
| 516 |
```
|
| 517 |
+
""",
|
| 518 |
+
"""Example with load/store:
|
| 519 |
+
```python
|
| 520 |
+
class OptimizedKernelBuilder(KernelBuilder):
|
| 521 |
+
def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
|
| 522 |
+
addr = self.alloc_scratch("addr")
|
| 523 |
+
val = self.alloc_scratch("val")
|
| 524 |
+
self.add("load", ("const", addr, 4))
|
| 525 |
+
self.add("load", ("load", val, addr))
|
| 526 |
+
self.add("store", ("store", addr, val))
|
| 527 |
+
self.add("flow", ("halt",))
|
| 528 |
+
|
| 529 |
+
def run():
|
| 530 |
+
return (0,)
|
| 531 |
+
```
|
| 532 |
+
""",
|
| 533 |
+
"""Example with tiny loop:
|
| 534 |
+
```python
|
| 535 |
+
class OptimizedKernelBuilder(KernelBuilder):
|
| 536 |
+
def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
|
| 537 |
+
tmp = self.alloc_scratch("tmp")
|
| 538 |
+
for _ in range(2):
|
| 539 |
+
self.add("load", ("const", tmp, 1))
|
| 540 |
+
self.add("flow", ("halt",))
|
| 541 |
+
|
| 542 |
+
def run():
|
| 543 |
+
return (0,)
|
| 544 |
+
```
|
| 545 |
+
""",
|
| 546 |
+
]
|
| 547 |
+
|
| 548 |
+
# Code bodies of the prompt examples, used by the reward function to detect
# completions that copy an example verbatim. The reward function looks up
# `code.strip()`, so entries are stripped the same way here; otherwise any
# leading/trailing whitespace kept by extract_code_block would prevent a match.
# NOTE(review): assumes extract_code_block returns a str for every example in
# EXAMPLE_POOL (each one contains a fenced python block) — confirm.
EXAMPLE_CODE_SET = {
    extract_code_block(example).strip() for example in EXAMPLE_POOL
}
|
| 551 |
+
|
| 552 |
+
def _select_examples() -> str:
|
| 553 |
+
k = 2 if len(EXAMPLE_POOL) >= 2 else 1
|
| 554 |
+
picks = random.sample(EXAMPLE_POOL, k)
|
| 555 |
+
return "\n".join(picks)
|
| 556 |
|
| 557 |
+
def build_prompt() -> str:
|
| 558 |
+
examples = _select_examples()
|
| 559 |
+
return f"""Write an optimized VLIW/SIMD kernel. OUTPUT ONLY ONE ```python CODE BLOCK.
|
| 560 |
|
| 561 |
ARCHITECTURE: 12 ALU + 6 VALU (VLEN=8) + 2 load + 2 store + 1 flow slots per cycle. 1536-word scratch.
|
| 562 |
|
|
|
|
| 588 |
- class OptimizedKernelBuilder(KernelBuilder): override build_kernel() and emit instructions using add()/build()
|
| 589 |
- def run(): return any tuple (ignored), but must exist
|
| 590 |
- No imports.
|
| 591 |
+
- Examples are format-only. Do NOT copy them verbatim.
|
| 592 |
|
| 593 |
Baseline: {BASELINE_CYCLES:,} cycles. Target: <{TARGET_CYCLES:,} cycles.
|
| 594 |
|
| 595 |
+
{examples}
|
| 596 |
"""
|
| 597 |
|
| 598 |
|
|
|
|
| 667 |
|
| 668 |
# Create dataset with prompts
|
| 669 |
add_log("Creating VLIW optimization dataset...")
|
| 670 |
+
prompts = [build_prompt() for _ in range(16)]
|
| 671 |
dataset = Dataset.from_dict({"prompt": prompts})
|
| 672 |
add_log(f"[OK] Dataset ready: {len(prompts)} prompts")
|
| 673 |
|
|
|
|
| 781 |
|
| 782 |
# Test generation
|
| 783 |
add_log("Testing trained model...")
|
| 784 |
+
inputs = tokenizer(build_prompt(), return_tensors="pt").to(model.device)
|
| 785 |
with torch.no_grad():
|
| 786 |
outputs = model.generate(
|
| 787 |
**inputs,
|