CreativeEngineer committed on
Commit
d9908db
·
1 Parent(s): c166ffe

Rotate examples and reduce overfit

Browse files
Files changed (1) hide show
  1. app.py +56 -9
app.py CHANGED
@@ -83,6 +83,8 @@ SCORE_SCALE = 3000.0
83
  PARSE_REWARD = 0.02
84
  API_REWARD = 0.05
85
  EXEC_REWARD = 0.10
 
 
86
  PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
87
  ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
88
  ADAPTER_DATASET_REPO = os.environ.get("ADAPTER_DATASET_REPO", "CreativeEngineer/vliw-optimizer-adapters")
@@ -465,7 +467,8 @@ def perf_takehome_reward_fn(completions, prompts=None, **kwargs):
465
  text = str(completion)
466
 
467
  code = extract_code_block(text)
468
- result = verify_perf_takehome_code(code)
 
469
 
470
  reward = 0.0
471
  if result.get("correctness", 0.0) > 0:
@@ -477,6 +480,8 @@ def perf_takehome_reward_fn(completions, prompts=None, **kwargs):
477
  reward += API_REWARD
478
  if result.get("exec_ok"):
479
  reward += EXEC_REWARD
 
 
480
  cycles = result.get("cycles")
481
  with state_lock:
482
  if isinstance(cycles, int) and cycles < training_state["best_cycles"]:
@@ -487,7 +492,8 @@ def perf_takehome_reward_fn(completions, prompts=None, **kwargs):
487
 
488
 
489
  # Prompt template for VLIW optimization
490
- FEWSHOT_EXAMPLES = """Example format (not optimized):
 
491
  ```python
492
  class OptimizedKernelBuilder(KernelBuilder):
493
  def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
@@ -496,8 +502,8 @@ class OptimizedKernelBuilder(KernelBuilder):
496
  def run():
497
  return (0,)
498
  ```
499
-
500
- Example with scratch + load:
501
  ```python
502
  class OptimizedKernelBuilder(KernelBuilder):
503
  def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
@@ -508,9 +514,49 @@ class OptimizedKernelBuilder(KernelBuilder):
508
  def run():
509
  return (0,)
510
  ```
511
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
512
 
513
- PERF_TAKEHOME_PROMPT = f"""Write an optimized VLIW/SIMD kernel. OUTPUT ONLY ONE ```python CODE BLOCK.
 
 
514
 
515
  ARCHITECTURE: 12 ALU + 6 VALU (VLEN=8) + 2 load + 2 store + 1 flow slots per cycle. 1536-word scratch.
516
 
@@ -542,10 +588,11 @@ RULES:
542
  - class OptimizedKernelBuilder(KernelBuilder): override build_kernel() and emit instructions using add()/build()
543
  - def run(): return any tuple (ignored), but must exist
544
  - No imports.
 
545
 
546
  Baseline: {BASELINE_CYCLES:,} cycles. Target: <{TARGET_CYCLES:,} cycles.
547
 
548
- {FEWSHOT_EXAMPLES}
549
  """
550
 
551
 
@@ -620,7 +667,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
620
 
621
  # Create dataset with prompts
622
  add_log("Creating VLIW optimization dataset...")
623
- prompts = [PERF_TAKEHOME_PROMPT] * 16
624
  dataset = Dataset.from_dict({"prompt": prompts})
625
  add_log(f"[OK] Dataset ready: {len(prompts)} prompts")
626
 
@@ -734,7 +781,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
734
 
735
  # Test generation
736
  add_log("Testing trained model...")
737
- inputs = tokenizer(PERF_TAKEHOME_PROMPT, return_tensors="pt").to(model.device)
738
  with torch.no_grad():
739
  outputs = model.generate(
740
  **inputs,
 
83
  PARSE_REWARD = 0.02
84
  API_REWARD = 0.05
85
  EXEC_REWARD = 0.10
86
+ COPY_PENALTY = 0.05
87
+ SEED_POOL = [0, 1, 2, 3]
88
  PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
89
  ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
90
  ADAPTER_DATASET_REPO = os.environ.get("ADAPTER_DATASET_REPO", "CreativeEngineer/vliw-optimizer-adapters")
 
467
  text = str(completion)
468
 
469
  code = extract_code_block(text)
470
+ seed = random.choice(SEED_POOL)
471
+ result = verify_perf_takehome_code(code, seed=seed)
472
 
473
  reward = 0.0
474
  if result.get("correctness", 0.0) > 0:
 
480
  reward += API_REWARD
481
  if result.get("exec_ok"):
482
  reward += EXEC_REWARD
483
+ if code.strip() in EXAMPLE_CODE_SET:
484
+ reward = max(0.0, reward - COPY_PENALTY)
485
  cycles = result.get("cycles")
486
  with state_lock:
487
  if isinstance(cycles, int) and cycles < training_state["best_cycles"]:
 
492
 
493
 
494
  # Prompt template for VLIW optimization
495
+ EXAMPLE_POOL = [
496
+ """Example format (not optimized):
497
  ```python
498
  class OptimizedKernelBuilder(KernelBuilder):
499
  def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
 
502
  def run():
503
  return (0,)
504
  ```
505
+ """,
506
+ """Example with scratch + const:
507
  ```python
508
  class OptimizedKernelBuilder(KernelBuilder):
509
  def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
 
514
  def run():
515
  return (0,)
516
  ```
517
+ """,
518
+ """Example with load/store:
519
+ ```python
520
+ class OptimizedKernelBuilder(KernelBuilder):
521
+ def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
522
+ addr = self.alloc_scratch("addr")
523
+ val = self.alloc_scratch("val")
524
+ self.add("load", ("const", addr, 4))
525
+ self.add("load", ("load", val, addr))
526
+ self.add("store", ("store", addr, val))
527
+ self.add("flow", ("halt",))
528
+
529
+ def run():
530
+ return (0,)
531
+ ```
532
+ """,
533
+ """Example with tiny loop:
534
+ ```python
535
+ class OptimizedKernelBuilder(KernelBuilder):
536
+ def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
537
+ tmp = self.alloc_scratch("tmp")
538
+ for _ in range(2):
539
+ self.add("load", ("const", tmp, 1))
540
+ self.add("flow", ("halt",))
541
+
542
+ def run():
543
+ return (0,)
544
+ ```
545
+ """,
546
+ ]
547
+
548
+ EXAMPLE_CODE_SET = {
549
+ extract_code_block(example) for example in EXAMPLE_POOL
550
+ }
551
+
552
+ def _select_examples() -> str:
553
+ k = 2 if len(EXAMPLE_POOL) >= 2 else 1
554
+ picks = random.sample(EXAMPLE_POOL, k)
555
+ return "\n".join(picks)
556
 
557
+ def build_prompt() -> str:
558
+ examples = _select_examples()
559
+ return f"""Write an optimized VLIW/SIMD kernel. OUTPUT ONLY ONE ```python CODE BLOCK.
560
 
561
  ARCHITECTURE: 12 ALU + 6 VALU (VLEN=8) + 2 load + 2 store + 1 flow slots per cycle. 1536-word scratch.
562
 
 
588
  - class OptimizedKernelBuilder(KernelBuilder): override build_kernel() and emit instructions using add()/build()
589
  - def run(): return any tuple (ignored), but must exist
590
  - No imports.
591
+ - Examples are format-only. Do NOT copy them verbatim.
592
 
593
  Baseline: {BASELINE_CYCLES:,} cycles. Target: <{TARGET_CYCLES:,} cycles.
594
 
595
+ {examples}
596
  """
597
 
598
 
 
667
 
668
  # Create dataset with prompts
669
  add_log("Creating VLIW optimization dataset...")
670
+ prompts = [build_prompt() for _ in range(16)]
671
  dataset = Dataset.from_dict({"prompt": prompts})
672
  add_log(f"[OK] Dataset ready: {len(prompts)} prompts")
673
 
 
781
 
782
  # Test generation
783
  add_log("Testing trained model...")
784
+ inputs = tokenizer(build_prompt(), return_tensors="pt").to(model.device)
785
  with torch.no_grad():
786
  outputs = model.generate(
787
  **inputs,