CreativeEngineer committed on
Commit
648e193
·
1 Parent(s): b3b926b

Switch to correctness-gated GRPO LoRA with persistence

Browse files
Files changed (1) hide show
  1. app.py +403 -197
app.py CHANGED
@@ -1,7 +1,9 @@
1
  """
2
  HF Spaces app for VLIW kernel optimization via RL.
3
- Uses actual simulator for cycle-count based rewards.
4
  """
 
 
5
  import gradio as gr
6
  import threading
7
  import time
@@ -42,13 +44,21 @@ try:
42
  except Exception as e:
43
  startup_log.append(f"✗ CUDA check: {e}")
44
 
 
 
 
 
 
 
 
45
  # Import simulator components
46
  try:
47
  from problem import (
48
  Machine, Tree, Input, DebugInfo,
49
  build_mem_image, reference_kernel2,
50
- SLOT_LIMITS, VLEN, N_CORES, SCRATCH_SIZE, HASH_STAGES
51
  )
 
52
  startup_log.append("✓ VLIW Simulator: OK")
53
  SIMULATOR_AVAILABLE = True
54
  except Exception as e:
@@ -58,6 +68,9 @@ except Exception as e:
58
  # Constants
59
  BASELINE_CYCLES = 147734
60
  TARGET_CYCLES = 1363
 
 
 
61
 
62
  # Training state
63
  training_state = {
@@ -65,184 +78,291 @@ training_state = {
65
  "should_stop": False,
66
  "log": [],
67
  "best_cycles": BASELINE_CYCLES,
 
68
  "step": 0,
69
  }
70
  state_lock = threading.Lock()
71
 
 
 
72
 
73
  def get_status():
74
  return "\n".join(startup_log)
75
 
76
 
77
- def parse_kernel_code(code_text):
78
- """
79
- Parse LLM-generated kernel code into simulator instructions.
80
- Returns list of instruction dicts or None if parsing fails.
81
- """
82
- instructions = []
83
-
84
- # Try to find instruction patterns in the code
85
- # Format: {"engine": [("op", arg1, arg2, ...)]}
86
-
87
- # Look for dict-like instruction patterns
88
- pattern = r'\{[^}]+\}'
89
- matches = re.findall(pattern, code_text)
90
-
91
- for match in matches:
92
- try:
93
- # Try to eval as Python dict (safely)
94
- instr = eval(match, {"__builtins__": {}})
95
- if isinstance(instr, dict):
96
- # Validate it's a valid instruction
97
- valid_engines = {"alu", "valu", "load", "store", "flow", "debug"}
98
- if any(k in valid_engines for k in instr.keys()):
99
- instructions.append(instr)
100
- except:
101
- continue
102
-
103
- return instructions if instructions else None
104
-
105
-
106
- def build_simple_kernel(batch_size, rounds):
107
- """
108
- Build a simple baseline kernel for comparison.
109
- This is a simplified version that the model should try to beat.
110
- """
111
- instructions = []
112
-
113
- # Initialize scratch space addresses
114
- for i in range(7):
115
- instructions.append({"load": [("const", i, i)]})
116
- instructions.append({"load": [("load", i, i)]})
117
-
118
- instructions.append({"flow": [("pause",)]})
119
-
120
- # Main loop body (simplified)
121
- for r in range(min(rounds, 2)): # Limit for testing
122
- for i in range(min(batch_size, 4)): # Limit for testing
123
- # Load index and value
124
- instructions.append({"alu": [("+", 10, 5, 0)]}) # addr = inp_indices_p + 0
125
- instructions.append({"load": [("load", 11, 10)]}) # idx = mem[addr]
126
- instructions.append({"alu": [("+", 12, 6, 0)]}) # addr = inp_values_p + 0
127
- instructions.append({"load": [("load", 13, 12)]}) # val = mem[addr]
128
-
129
- instructions.append({"flow": [("pause",)]})
130
-
131
- return instructions
 
 
 
 
 
 
 
 
 
132
 
133
 
134
- def evaluate_kernel(instructions, seed=42):
135
- """
136
- Run kernel through simulator and return cycle count.
137
- Lower is better.
138
- """
139
  if not SIMULATOR_AVAILABLE:
140
- return BASELINE_CYCLES
 
 
 
 
 
141
 
142
  try:
143
- random.seed(seed)
144
- forest = Tree.generate(10)
145
- inp = Input.generate(forest, 256, 16)
146
- mem = build_mem_image(forest, inp)
147
-
148
- debug_info = DebugInfo(scratch_map={})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  machine = Machine(
151
- mem,
152
- instructions,
153
- debug_info,
154
  n_cores=N_CORES,
155
  trace=False,
156
  )
157
  machine.enable_pause = False
158
  machine.enable_debug = False
159
 
160
- # Run the machine
161
- machine.run()
162
-
163
- return machine.cycle
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  except Exception as e:
165
- # Return high cycle count for invalid code
166
- return BASELINE_CYCLES * 2
 
 
 
 
167
 
168
 
169
- def vliw_reward_fn(completions, prompts=None, **kwargs):
170
- """
171
- Reward function based on VLIW simulator cycle count.
172
- Higher reward for lower cycle count.
173
- """
174
  rewards = []
175
-
176
  for completion in completions:
177
- # Extract text from completion
178
  if isinstance(completion, list):
179
  text = completion[0].get("content", "") if completion else ""
180
  else:
181
  text = str(completion)
182
 
183
- # Try to parse as kernel instructions
184
- instructions = parse_kernel_code(text)
185
-
186
- if instructions and len(instructions) > 5:
187
- # Evaluate with simulator
188
- cycles = evaluate_kernel(instructions)
189
-
190
- # Reward: normalized improvement over baseline
191
- # Max reward when cycles <= TARGET_CYCLES
192
- if cycles <= TARGET_CYCLES:
193
- reward = 2.0 # Maximum reward
194
- elif cycles < BASELINE_CYCLES:
195
- # Linear scale between baseline and target
196
- improvement = (BASELINE_CYCLES - cycles) / (BASELINE_CYCLES - TARGET_CYCLES)
197
- reward = 0.5 + 1.5 * improvement
198
- else:
199
- # Below baseline performance
200
- reward = 0.5 * (BASELINE_CYCLES / max(cycles, 1))
201
- else:
202
- # Could not parse - give small reward for code-like output
203
- reward = 0.1
204
- if "def " in text or "for " in text:
205
- reward = 0.2
206
- if any(kw in text for kw in ["alu", "load", "store", "valu"]):
207
- reward = 0.3
208
-
209
- rewards.append(reward)
210
-
211
  return rewards
212
 
213
 
214
  # Prompt template for VLIW optimization
215
- VLIW_PROMPT = """You are an expert in VLIW (Very Long Instruction Word) architecture optimization.
216
-
217
- Generate optimized VLIW assembly code for a parallel tree traversal kernel.
218
-
219
- The architecture has these engines that execute in parallel each cycle:
220
- - alu: up to 12 scalar ALU operations per cycle
221
- - valu: up to 6 vector ALU operations (VLEN=8 elements)
222
- - load: up to 2 load operations per cycle
223
- - store: up to 2 store operations per cycle
224
- - flow: 1 control flow operation per cycle
225
-
226
- Instructions are in Python dict format:
227
- {"alu": [("+", dest, src1, src2), ("*", dest, src1, src2)], "load": [("load", dest, addr)]}
228
-
229
- The kernel should:
230
- 1. Load indices and values from memory
231
- 2. Perform hash computation (6 stages using +, ^, <<, >>)
232
- 3. Update tree traversal index based on hash result
233
- 4. Store results back to memory
234
-
235
- Optimize for minimum cycle count. Current baseline: 147,734 cycles. Target: <1,363 cycles.
236
-
237
- Generate the optimized kernel code:"""
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
 
240
- def run_training(model_name, num_steps):
241
- """Run RL training with VLIW simulator rewards."""
242
  import torch
243
  from datasets import Dataset
244
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
245
  from peft import LoraConfig
 
246
  from trl import GRPOConfig, GRPOTrainer
247
  from transformers import TrainerCallback
248
 
@@ -258,12 +378,16 @@ def run_training(model_name, num_steps):
258
  training_state["should_stop"] = False
259
  training_state["log"] = []
260
  training_state["best_cycles"] = BASELINE_CYCLES
 
261
  training_state["step"] = 0
262
 
263
  try:
264
  add_log(f"Starting VLIW optimization training")
265
- add_log(f"Model: {model_name}, Steps: {num_steps}")
 
 
266
  add_log(f"Baseline: {BASELINE_CYCLES:,} cycles, Target: {TARGET_CYCLES:,} cycles")
 
267
 
268
  # Load tokenizer
269
  add_log("Loading tokenizer...")
@@ -279,17 +403,25 @@ def run_training(model_name, num_steps):
279
  bnb_4bit_quant_type="nf4",
280
  bnb_4bit_compute_dtype=torch.bfloat16,
281
  )
282
- model = AutoModelForCausalLM.from_pretrained(
283
  model_name,
284
  quantization_config=bnb_config,
285
  device_map="auto",
286
  trust_remote_code=True,
287
  )
288
- add_log(f"✓ Model loaded on {next(model.parameters()).device}")
289
 
290
- # Create dataset with VLIW prompts
 
 
 
 
 
 
 
 
291
  add_log("Creating VLIW optimization dataset...")
292
- prompts = [VLIW_PROMPT] * 16
293
  dataset = Dataset.from_dict({"prompt": prompts})
294
  add_log(f"✓ Dataset ready: {len(prompts)} prompts")
295
 
@@ -304,62 +436,102 @@ def run_training(model_name, num_steps):
304
  task_type="CAUSAL_LM",
305
  )
306
 
307
- # Custom callback for logging
 
 
 
 
 
308
  class VLIWCallback(TrainerCallback):
309
  def on_step_end(self, args, state, control, **kwargs):
310
  with state_lock:
311
- training_state["step"] = state.global_step
 
312
  if training_state["should_stop"]:
313
  control.should_training_stop = True
 
 
314
  return control
315
 
316
  def on_log(self, args, state, control, logs=None, **kwargs):
317
  if logs:
318
  loss = logs.get("loss", "N/A")
319
  reward = logs.get("reward", logs.get("mean_reward", "N/A"))
320
- step = state.global_step
321
  add_log(f"Step {step}: loss={loss:.4f}, reward={reward:.4f}" if isinstance(loss, float) else f"Step {step}: {logs}")
322
 
323
- # GRPO config
324
- add_log("Creating GRPO trainer with VLIW rewards...")
325
- config = GRPOConfig(
326
- output_dir="./grpo_vliw_output",
327
- num_train_epochs=1,
328
- max_steps=num_steps,
329
- per_device_train_batch_size=1,
330
- gradient_accumulation_steps=4,
331
- learning_rate=1e-5,
332
- logging_steps=1,
333
- save_steps=999999,
334
- report_to="none",
335
- remove_unused_columns=False,
336
- max_completion_length=512,
337
- num_generations=4,
338
- )
339
-
340
- trainer = GRPOTrainer(
341
- model=model,
342
- args=config,
343
- train_dataset=dataset,
344
- reward_funcs=vliw_reward_fn,
345
- peft_config=lora_config,
346
- processing_class=tokenizer,
347
- callbacks=[VLIWCallback()],
348
- )
349
- add_log("✓ Trainer ready")
350
 
351
- # Train
352
  add_log("Starting training loop...")
353
- add_log("(Model will learn to generate VLIW code with lower cycle counts)")
354
- train_result = trainer.train()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
- metrics = train_result.metrics
357
- add_log(f"✓ Training complete!")
358
- add_log(f" Total steps: {metrics.get('train_steps', num_steps)}")
 
 
 
 
 
 
 
359
 
360
  # Test generation
361
  add_log("Testing trained model...")
362
- inputs = tokenizer(VLIW_PROMPT[:200], return_tensors="pt").to(model.device)
363
  with torch.no_grad():
364
  outputs = model.generate(
365
  **inputs,
@@ -370,15 +542,15 @@ def run_training(model_name, num_steps):
370
  )
371
  result = tokenizer.decode(outputs[0], skip_special_tokens=True)
372
 
373
- # Try to evaluate the generated code
374
- instructions = parse_kernel_code(result)
375
- if instructions:
376
- cycles = evaluate_kernel(instructions)
377
- add_log(f"Generated kernel: {len(instructions)} instructions, {cycles:,} cycles")
378
- speedup = BASELINE_CYCLES / max(cycles, 1)
379
  add_log(f"Speedup: {speedup:.2f}x over baseline")
380
  else:
381
- add_log(f"Sample output (first 200 chars): {result[len(VLIW_PROMPT[:200]):len(VLIW_PROMPT[:200])+200]}...")
382
 
383
  add_log("\n✓ All done!")
384
 
@@ -398,13 +570,25 @@ def run_training(model_name, num_steps):
398
  return "\n".join(log)
399
 
400
 
401
- def start_training(model_name, num_steps):
402
  """Start training."""
403
  with state_lock:
404
  if training_state["is_training"]:
405
- return "Training already in progress. Please wait."
406
 
407
- return run_training(model_name, int(num_steps))
 
 
 
 
 
 
 
 
 
 
 
 
408
 
409
 
410
  def stop_training():
@@ -420,14 +604,14 @@ def stop_training():
420
  with gr.Blocks(title="VLIW Optimizer") as demo:
421
  gr.Markdown("# VLIW Kernel Optimizer - RL Training")
422
  gr.Markdown(f"""
423
- Train a language model with reinforcement learning to generate optimized VLIW/SIMD kernels.
424
 
425
  **Goal:** Reduce cycle count from **{BASELINE_CYCLES:,}** (baseline) to **<{TARGET_CYCLES:,}** (108x speedup)
426
 
427
  **How it works:**
428
- 1. Model generates VLIW assembly code
429
- 2. Simulator evaluates cycle count
430
- 3. RL training improves model based on cycle-count rewards
431
  """)
432
 
433
  with gr.Row():
@@ -448,12 +632,28 @@ with gr.Blocks(title="VLIW Optimizer") as demo:
448
  value="Qwen/Qwen2.5-Coder-1.5B-Instruct",
449
  label="Model",
450
  )
451
- steps_slider = gr.Slider(
452
  minimum=5,
453
  maximum=100,
454
  value=20,
455
  step=5,
456
- label="Training Steps",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
  )
458
 
459
  with gr.Row():
@@ -467,12 +667,18 @@ with gr.Blocks(title="VLIW Optimizer") as demo:
467
  value="Click 'Start Training' to begin VLIW optimization.",
468
  )
469
 
 
 
 
 
470
  start_btn.click(
471
  start_training,
472
- [model_dropdown, steps_slider],
473
  [output_box],
474
  )
475
  stop_btn.click(stop_training, [], [output_box])
476
 
 
 
477
  if __name__ == "__main__":
478
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  """
2
  HF Spaces app for VLIW kernel optimization via RL.
3
+ Uses actual simulator for correctness-gated cycle-count rewards.
4
  """
5
+ import os
6
+ import sys
7
  import gradio as gr
8
  import threading
9
  import time
 
44
  except Exception as e:
45
  startup_log.append(f"✗ CUDA check: {e}")
46
 
47
+ # Prefer simulator + KernelBuilder from bundled original_performance_takehome.
48
+ # In Spaces, this keeps evaluation consistent and enables correctness checks.
49
+ THIS_DIR = os.path.dirname(os.path.abspath(__file__))
50
+ PERF_TAKEHOME_PATH = os.path.join(THIS_DIR, "original_performance_takehome")
51
+ if os.path.isdir(PERF_TAKEHOME_PATH):
52
+ sys.path.insert(0, PERF_TAKEHOME_PATH)
53
+
54
  # Import simulator components
55
  try:
56
  from problem import (
57
  Machine, Tree, Input, DebugInfo,
58
  build_mem_image, reference_kernel2,
59
+ SLOT_LIMITS, VLEN, N_CORES, SCRATCH_SIZE, CoreState
60
  )
61
+ from perf_takehome import KernelBuilder, HASH_STAGES
62
  startup_log.append("✓ VLIW Simulator: OK")
63
  SIMULATOR_AVAILABLE = True
64
  except Exception as e:
 
68
  # Constants
69
  BASELINE_CYCLES = 147734
70
  TARGET_CYCLES = 1363
71
+ SCORE_SCALE = 3000.0
72
+ PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
73
+ ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
74
 
75
  # Training state
76
  training_state = {
 
78
  "should_stop": False,
79
  "log": [],
80
  "best_cycles": BASELINE_CYCLES,
81
+ "best_code": None,
82
  "step": 0,
83
  }
84
  state_lock = threading.Lock()
85
 
86
+ _eval_context = {}
87
+
88
 
89
  def get_status():
90
  return "\n".join(startup_log)
91
 
92
 
93
+ def extract_code_block(text: str) -> str:
94
+ pattern = r"```python\s*(.*?)```"
95
+ matches = re.findall(pattern, text, re.DOTALL)
96
+ if matches:
97
+ return matches[-1].strip()
98
+ pattern = r"```\s*(.*?)```"
99
+ matches = re.findall(pattern, text, re.DOTALL)
100
+ if matches:
101
+ return matches[-1].strip()
102
+ return text.strip()
103
+
104
+
105
+ def _run_machine_with_cycle_limit(machine: Machine, max_cycles: int) -> bool:
106
+ for core in machine.cores:
107
+ if core.state == CoreState.PAUSED:
108
+ core.state = CoreState.RUNNING
109
+ while any(c.state == CoreState.RUNNING for c in machine.cores):
110
+ has_non_debug = False
111
+ for core in machine.cores:
112
+ if core.state != CoreState.RUNNING:
113
+ continue
114
+ if core.pc >= len(machine.program):
115
+ core.state = CoreState.STOPPED
116
+ continue
117
+ instr = machine.program[core.pc]
118
+ core.pc += 1
119
+ machine.step(instr, core)
120
+ if any(name != "debug" for name in instr.keys()):
121
+ has_non_debug = True
122
+ if has_non_debug:
123
+ machine.cycle += 1
124
+ if machine.cycle >= max_cycles:
125
+ for core in machine.cores:
126
+ core.state = CoreState.STOPPED
127
+ return False
128
+ return True
129
+
130
+
131
+ def _get_eval_context(seed: int) -> dict:
132
+ with state_lock:
133
+ cached = _eval_context.get(seed)
134
+ if cached is not None:
135
+ return cached
136
+ random.seed(seed)
137
+ forest = Tree.generate(10)
138
+ inp = Input.generate(forest, 256, 16)
139
+ mem0 = build_mem_image(forest, inp)
140
+ ref_mem = None
141
+ for ref_mem in reference_kernel2(list(mem0)):
142
+ pass
143
+ if ref_mem is None:
144
+ raise RuntimeError("Reference kernel produced no output")
145
+ inp_values_p = ref_mem[6]
146
+ expected = ref_mem[inp_values_p : inp_values_p + len(inp.values)]
147
+ ctx = {
148
+ "forest": forest,
149
+ "inp": inp,
150
+ "mem0": mem0,
151
+ "expected": expected,
152
+ "inp_values_p": inp_values_p,
153
+ }
154
+ with state_lock:
155
+ _eval_context[seed] = ctx
156
+ return ctx
157
 
158
 
159
+ def verify_perf_takehome_code(code: str, seed: int = 123) -> dict:
 
 
 
 
160
  if not SIMULATOR_AVAILABLE:
161
+ return {
162
+ "score": 0.0,
163
+ "correctness": 0.0,
164
+ "cycles": None,
165
+ "msg": "Simulator unavailable",
166
+ }
167
 
168
  try:
169
+ code = code.strip()
170
+ if not code:
171
+ return {
172
+ "score": 0.0,
173
+ "correctness": 0.0,
174
+ "cycles": None,
175
+ "msg": "Empty code",
176
+ }
177
+
178
+ if "OptimizedKernelBuilder" not in code:
179
+ return {
180
+ "score": 0.0,
181
+ "correctness": 0.0,
182
+ "cycles": None,
183
+ "msg": "Missing OptimizedKernelBuilder",
184
+ }
185
+
186
+ if "def run" not in code:
187
+ return {
188
+ "score": 0.0,
189
+ "correctness": 0.0,
190
+ "cycles": None,
191
+ "msg": "Missing run()",
192
+ }
193
+
194
+ safe_builtins = {
195
+ "abs": abs,
196
+ "all": all,
197
+ "any": any,
198
+ "dict": dict,
199
+ "enumerate": enumerate,
200
+ "int": int,
201
+ "len": len,
202
+ "list": list,
203
+ "max": max,
204
+ "min": min,
205
+ "range": range,
206
+ "sum": sum,
207
+ "tuple": tuple,
208
+ "zip": zip,
209
+ }
210
+ exec_globals = {
211
+ "__builtins__": safe_builtins,
212
+ "KernelBuilder": KernelBuilder,
213
+ "HASH_STAGES": HASH_STAGES,
214
+ "VLEN": VLEN,
215
+ "SLOT_LIMITS": SLOT_LIMITS,
216
+ }
217
+
218
+ exec(code, exec_globals)
219
+
220
+ if "OptimizedKernelBuilder" not in exec_globals:
221
+ return {
222
+ "score": 0.0,
223
+ "correctness": 0.0,
224
+ "cycles": None,
225
+ "msg": "OptimizedKernelBuilder not defined after exec",
226
+ }
227
+
228
+ ctx = _get_eval_context(seed)
229
+ forest = ctx["forest"]
230
+ inp = ctx["inp"]
231
+ mem0 = ctx["mem0"]
232
+
233
+ kb = exec_globals["OptimizedKernelBuilder"]()
234
+ kb.build_kernel(10, len(forest.values), 256, 16)
235
 
236
  machine = Machine(
237
+ list(mem0),
238
+ kb.instrs,
239
+ kb.debug_info(),
240
  n_cores=N_CORES,
241
  trace=False,
242
  )
243
  machine.enable_pause = False
244
  machine.enable_debug = False
245
 
246
+ ok = _run_machine_with_cycle_limit(machine, max_cycles=250000)
247
+ if not ok:
248
+ return {
249
+ "score": 0.0,
250
+ "correctness": 0.0,
251
+ "cycles": int(machine.cycle),
252
+ "msg": f"Exceeded cycle limit (cycles={machine.cycle})",
253
+ }
254
+ cycles = machine.cycle
255
+
256
+ if cycles <= 100:
257
+ return {
258
+ "score": 0.0,
259
+ "correctness": 0.0,
260
+ "cycles": int(cycles),
261
+ "msg": f"Suspiciously low cycles ({cycles})",
262
+ }
263
+ if cycles > 200000:
264
+ return {
265
+ "score": 0.0,
266
+ "correctness": 0.0,
267
+ "cycles": int(cycles),
268
+ "msg": f"Cycles too high ({cycles})",
269
+ }
270
+
271
+ inp_values_p = ctx["inp_values_p"]
272
+ expected = ctx["expected"]
273
+ actual = machine.mem[inp_values_p : inp_values_p + len(inp.values)]
274
+ if expected != actual:
275
+ return {
276
+ "score": 0.0,
277
+ "correctness": 0.0,
278
+ "cycles": int(cycles),
279
+ "msg": f"Incorrect output (cycles={cycles})",
280
+ }
281
+
282
+ score = SCORE_SCALE / cycles
283
+ return {
284
+ "score": float(score),
285
+ "correctness": 1.0,
286
+ "cycles": int(cycles),
287
+ "msg": f"Success: {cycles} cycles",
288
+ }
289
  except Exception as e:
290
+ return {
291
+ "score": 0.0,
292
+ "correctness": 0.0,
293
+ "cycles": None,
294
+ "msg": f"Execution error: {str(e)[:200]}",
295
+ }
296
 
297
 
298
+ def perf_takehome_reward_fn(completions, prompts=None, **kwargs):
 
 
 
 
299
  rewards = []
 
300
  for completion in completions:
 
301
  if isinstance(completion, list):
302
  text = completion[0].get("content", "") if completion else ""
303
  else:
304
  text = str(completion)
305
 
306
+ code = extract_code_block(text)
307
+ result = verify_perf_takehome_code(code)
308
+
309
+ reward = 0.0
310
+ if result.get("correctness", 0.0) > 0:
311
+ reward = float(result["score"]) + 1.0
312
+ cycles = result.get("cycles")
313
+ with state_lock:
314
+ if isinstance(cycles, int) and cycles < training_state["best_cycles"]:
315
+ training_state["best_cycles"] = cycles
316
+ training_state["best_code"] = code
317
+ rewards.append(float(reward))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  return rewards
319
 
320
 
321
  # Prompt template for VLIW optimization
322
+ PERF_TAKEHOME_PROMPT = f"""Write an optimized VLIW/SIMD kernel. OUTPUT ONLY ONE ```python CODE BLOCK.
323
+
324
+ ARCHITECTURE: 12 ALU + 6 VALU (VLEN=8) + 2 load + 2 store + 1 flow slots per cycle. 1536-word scratch.
325
+
326
+ API (KernelBuilder):
327
+ - alloc_scratch(name, length) -> addr
328
+ - scratch_const(val, name) -> addr
329
+ - add(engine, slot): engine in {{alu, valu, load, store, flow}}
330
+ - alu: (op, dst, src1, src2) where op in {{+,-,*,//,%,^,&,|,<<,>>,<,==,!=,<=,>=,>}}
331
+ - valu: same ops but on vectors (VLEN=8)
332
+ - load: (load,dst,addr), (vload,dst,addr), (const,dst,val), (vbroadcast,dst,scalar_addr)
333
+ - store: (store,addr,src), (vstore,addr,src)
334
+ - flow: (select,dst,cond,t,f), (vselect,dst,cond,t,f), (cond_jump,cond,pc), (jump,pc), (halt,)
335
+ - label(name): mark code position
336
+ - build(slots, vliw=True): pack slots into VLIW bundle
337
+
338
+ MEMORY: mem[4]=forest_values, mem[5]=inp_indices, mem[6]=inp_values (256 elements each)
339
+
340
+ ALGORITHM: 16 rounds x 256 items:
341
+ load idx,val
342
+ node = tree[idx]
343
+ val = hash(val ^ node) using HASH_STAGES
344
+ idx = 2*idx + (1 if val%2==0 else 2)
345
+ idx = 0 if idx >= n_nodes else idx
346
+ store idx,val
347
+
348
+ RULES:
349
+ - Output exactly one python code block.
350
+ - The code block must define:
351
+ - class OptimizedKernelBuilder(KernelBuilder): override build_kernel() and emit instructions using add()/build()
352
+ - def run(): return any tuple (ignored), but must exist
353
+ - No imports.
354
+
355
+ Baseline: {BASELINE_CYCLES:,} cycles. Target: <{TARGET_CYCLES:,} cycles.
356
+ """
357
 
358
 
359
+ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_continue):
360
+ """Run GRPO + LoRA training with correctness-gated perf_takehome rewards."""
361
  import torch
362
  from datasets import Dataset
363
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
364
  from peft import LoraConfig
365
+ from peft import PeftModel
366
  from trl import GRPOConfig, GRPOTrainer
367
  from transformers import TrainerCallback
368
 
 
378
  training_state["should_stop"] = False
379
  training_state["log"] = []
380
  training_state["best_cycles"] = BASELINE_CYCLES
381
+ training_state["best_code"] = None
382
  training_state["step"] = 0
383
 
384
  try:
385
  add_log(f"Starting VLIW optimization training")
386
+ add_log(f"Model: {model_name}")
387
+ add_log(f"Chunk steps: {chunk_steps}")
388
+ add_log(f"Auto-continue: {auto_continue} (max_total_steps={max_total_steps}, max_minutes={max_minutes})")
389
  add_log(f"Baseline: {BASELINE_CYCLES:,} cycles, Target: {TARGET_CYCLES:,} cycles")
390
+ add_log(f"Adapter dir: {ADAPTER_DIR}")
391
 
392
  # Load tokenizer
393
  add_log("Loading tokenizer...")
 
403
  bnb_4bit_quant_type="nf4",
404
  bnb_4bit_compute_dtype=torch.bfloat16,
405
  )
406
+ base_model = AutoModelForCausalLM.from_pretrained(
407
  model_name,
408
  quantization_config=bnb_config,
409
  device_map="auto",
410
  trust_remote_code=True,
411
  )
412
+ add_log(f"✓ Base model loaded on {next(base_model.parameters()).device}")
413
 
414
+ # Resume LoRA adapter if present
415
+ if os.path.isdir(ADAPTER_DIR) and os.path.exists(os.path.join(ADAPTER_DIR, "adapter_config.json")):
416
+ add_log("Loading existing LoRA adapter (resume)...")
417
+ model = PeftModel.from_pretrained(base_model, ADAPTER_DIR, is_trainable=True)
418
+ add_log("✓ Adapter loaded")
419
+ else:
420
+ model = base_model
421
+
422
+ # Create dataset with prompts
423
  add_log("Creating VLIW optimization dataset...")
424
+ prompts = [PERF_TAKEHOME_PROMPT] * 16
425
  dataset = Dataset.from_dict({"prompt": prompts})
426
  add_log(f"✓ Dataset ready: {len(prompts)} prompts")
427
 
 
436
  task_type="CAUSAL_LM",
437
  )
438
 
439
+ progress = {"step": 0}
440
+ start_time = time.time()
441
+ max_seconds = float(max_minutes) * 60.0 if auto_continue else float("inf")
442
+ total_target_steps = int(max_total_steps) if auto_continue else int(chunk_steps)
443
+
444
+ # Custom callback for logging + early stop
445
  class VLIWCallback(TrainerCallback):
446
  def on_step_end(self, args, state, control, **kwargs):
447
  with state_lock:
448
+ progress["step"] += 1
449
+ training_state["step"] = progress["step"]
450
  if training_state["should_stop"]:
451
  control.should_training_stop = True
452
+ if training_state["best_cycles"] <= TARGET_CYCLES:
453
+ control.should_training_stop = True
454
  return control
455
 
456
  def on_log(self, args, state, control, logs=None, **kwargs):
457
  if logs:
458
  loss = logs.get("loss", "N/A")
459
  reward = logs.get("reward", logs.get("mean_reward", "N/A"))
460
+ step = progress["step"]
461
  add_log(f"Step {step}: loss={loss:.4f}, reward={reward:.4f}" if isinstance(loss, float) else f"Step {step}: {logs}")
462
 
463
+ add_log("Creating GRPO trainer with perf_takehome rewards...")
464
+ output_dir = os.path.join(PERSIST_DIR, "grpo_perf_takehome_output")
465
+ os.makedirs(output_dir, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
 
467
+ add_log("✓ Trainer config ready")
468
  add_log("Starting training loop...")
469
+ add_log("(Stops early if target reached; can auto-continue in chunks)")
470
+
471
+ chunk_idx = 0
472
+ while True:
473
+ with state_lock:
474
+ if training_state["should_stop"]:
475
+ break
476
+ if training_state["best_cycles"] <= TARGET_CYCLES:
477
+ break
478
+
479
+ if progress["step"] >= total_target_steps:
480
+ break
481
+ if (time.time() - start_time) >= max_seconds:
482
+ break
483
+
484
+ remaining = total_target_steps - progress["step"]
485
+ this_chunk_steps = min(int(chunk_steps), int(remaining))
486
+ if this_chunk_steps <= 0:
487
+ break
488
+
489
+ chunk_idx += 1
490
+ add_log(f"Chunk {chunk_idx}: training {this_chunk_steps} steps...")
491
+
492
+ config = GRPOConfig(
493
+ output_dir=output_dir,
494
+ num_train_epochs=1,
495
+ max_steps=this_chunk_steps,
496
+ per_device_train_batch_size=1,
497
+ gradient_accumulation_steps=4,
498
+ learning_rate=1e-5,
499
+ logging_steps=1,
500
+ save_steps=999999,
501
+ report_to="none",
502
+ remove_unused_columns=False,
503
+ max_completion_length=512,
504
+ num_generations=4,
505
+ )
506
+
507
+ trainer = GRPOTrainer(
508
+ model=model,
509
+ args=config,
510
+ train_dataset=dataset,
511
+ reward_funcs=perf_takehome_reward_fn,
512
+ peft_config=lora_config,
513
+ processing_class=tokenizer,
514
+ callbacks=[VLIWCallback()],
515
+ )
516
+
517
+ train_result = trainer.train()
518
+ metrics = train_result.metrics
519
+ add_log(f"Chunk {chunk_idx} done: steps={metrics.get('train_steps', this_chunk_steps)}")
520
 
521
+ # Save adapter after each chunk so it persists across restarts
522
+ try:
523
+ os.makedirs(os.path.dirname(ADAPTER_DIR), exist_ok=True)
524
+ trainer.save_model(ADAPTER_DIR)
525
+ add_log(f"✓ Saved adapter to {ADAPTER_DIR}")
526
+ except Exception as e:
527
+ add_log(f"✗ Failed to save adapter: {str(e)[:120]}")
528
+
529
+ if not auto_continue:
530
+ break
531
 
532
  # Test generation
533
  add_log("Testing trained model...")
534
+ inputs = tokenizer(PERF_TAKEHOME_PROMPT, return_tensors="pt").to(model.device)
535
  with torch.no_grad():
536
  outputs = model.generate(
537
  **inputs,
 
542
  )
543
  result = tokenizer.decode(outputs[0], skip_special_tokens=True)
544
 
545
+ code = extract_code_block(result)
546
+ verify_out = verify_perf_takehome_code(code)
547
+ if verify_out.get("correctness", 0.0) > 0:
548
+ cycles = verify_out.get("cycles")
549
+ add_log(f"Generated kernel verified: {cycles:,} cycles")
550
+ speedup = BASELINE_CYCLES / max(int(cycles), 1) if isinstance(cycles, int) else 0.0
551
  add_log(f"Speedup: {speedup:.2f}x over baseline")
552
  else:
553
+ add_log(f"Generated kernel invalid: {verify_out.get('msg', '')[:160]}")
554
 
555
  add_log("\n✓ All done!")
556
 
 
570
  return "\n".join(log)
571
 
572
 
573
+ def start_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_continue):
574
  """Start training."""
575
  with state_lock:
576
  if training_state["is_training"]:
577
+ return "\n".join(training_state["log"][-200:]) or "Training already in progress. Please wait."
578
 
579
+ thread = threading.Thread(
580
+ target=run_training,
581
+ args=(
582
+ model_name,
583
+ int(chunk_steps),
584
+ int(max_total_steps),
585
+ float(max_minutes),
586
+ bool(auto_continue),
587
+ ),
588
+ daemon=True,
589
+ )
590
+ thread.start()
591
+ return "Training started. Logs will stream below."
592
 
593
 
594
  def stop_training():
 
604
  with gr.Blocks(title="VLIW Optimizer") as demo:
605
  gr.Markdown("# VLIW Kernel Optimizer - RL Training")
606
  gr.Markdown(f"""
607
+ Train a language model with reinforcement learning (LoRA) at test time to generate correct, fast VLIW/SIMD kernels.
608
 
609
  **Goal:** Reduce cycle count from **{BASELINE_CYCLES:,}** (baseline) to **<{TARGET_CYCLES:,}** (108x speedup)
610
 
611
  **How it works:**
612
+ 1. Model generates Python kernel builder code
613
+ 2. Simulator checks correctness vs reference and measures cycles
614
+ 3. GRPO updates LoRA weights; adapter is saved and reloaded from `{ADAPTER_DIR}`
615
  """)
616
 
617
  with gr.Row():
 
632
  value="Qwen/Qwen2.5-Coder-1.5B-Instruct",
633
  label="Model",
634
  )
635
+ chunk_steps_slider = gr.Slider(
636
  minimum=5,
637
  maximum=100,
638
  value=20,
639
  step=5,
640
+ label="Chunk Steps",
641
+ )
642
+ auto_continue_checkbox = gr.Checkbox(
643
+ value=False,
644
+ label="Auto-continue (chain chunks)",
645
+ )
646
+ max_total_steps_slider = gr.Slider(
647
+ minimum=5,
648
+ maximum=500,
649
+ value=100,
650
+ step=5,
651
+ label="Max Total Steps",
652
+ )
653
+ max_minutes_number = gr.Number(
654
+ value=60,
655
+ precision=0,
656
+ label="Max Minutes",
657
  )
658
 
659
  with gr.Row():
 
667
  value="Click 'Start Training' to begin VLIW optimization.",
668
  )
669
 
670
+ def poll_log():
671
+ with state_lock:
672
+ return "\n".join(training_state["log"][-400:]) if training_state["log"] else ""
673
+
674
  start_btn.click(
675
  start_training,
676
+ [model_dropdown, chunk_steps_slider, max_total_steps_slider, max_minutes_number, auto_continue_checkbox],
677
  [output_box],
678
  )
679
  stop_btn.click(stop_training, [], [output_box])
680
 
681
+ gr.Timer(1.0).tick(poll_log, outputs=[output_box])
682
+
683
  if __name__ == "__main__":
684
  demo.launch(server_name="0.0.0.0", server_port=7860)