Spaces:

CreativeEngineer
/

vliw-optimizer

Paused

File size: 32,065 Bytes

"""
HF Spaces app for VLIW kernel optimization via RL.
Uses actual simulator for correctness-gated cycle-count rewards.
"""
import os
import sys
import gradio as gr
import threading
import time
import random
import re
from copy import copy
from pathlib import Path

# Check imports at startup
# Collects one "[OK] ..." / "[ERR] ..." line per startup probe; get_status()
# joins these lines for display in the UI's "System Status" box.
startup_log: list[str] = []

def check_import(name, import_fn):
    """Probe one dependency and record the outcome in ``startup_log``.

    Args:
        name: Label used in the log line.
        import_fn: Zero-argument callable; its return value is shown after
            the label when it succeeds.

    Returns:
        True if ``import_fn`` completed without raising, otherwise False.
    """
    succeeded = False
    try:
        outcome = import_fn()
        line = f"[OK] {name}: {outcome}"
        succeeded = True
    except Exception as exc:
        # Keep the status box readable: only the first 80 characters of the error.
        line = f"[ERR] {name}: {str(exc)[:80]}"
    startup_log.append(line)
    return succeeded

# Record the installed version (or the import error) of each ML dependency.
check_import("torch", lambda: __import__("torch").__version__)
check_import("transformers", lambda: __import__("transformers").__version__)
check_import("datasets", lambda: __import__("datasets").__version__)
check_import("peft", lambda: __import__("peft").__version__)
check_import("trl", lambda: __import__("trl").__version__)
check_import("huggingface_hub", lambda: __import__("huggingface_hub").__version__)

# Confirm that the specific TRL classes used by run_training() can be imported.
try:
    from trl import GRPOConfig, GRPOTrainer
    startup_log.append("[OK] GRPOTrainer: OK")
except Exception as e:
    startup_log.append(f"[ERR] GRPOTrainer: {e}")

# Report the CUDA device name, or log an error line when no GPU is visible.
try:
    import torch
    if torch.cuda.is_available():
        startup_log.append(f"[OK] CUDA: {torch.cuda.get_device_name(0)}")
    else:
        startup_log.append("[ERR] CUDA: Not available")
except Exception as e:
    startup_log.append(f"[ERR] CUDA check: {e}")

# Prefer simulator + KernelBuilder from bundled original_performance_takehome.
# In Spaces, this keeps evaluation consistent and enables correctness checks.
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
PERF_TAKEHOME_PATH = os.path.join(THIS_DIR, "original_performance_takehome")
if os.path.isdir(PERF_TAKEHOME_PATH):
    # Put the bundled copy first on sys.path so it wins over any other
    # importable `problem` / `perf_takehome` module.
    sys.path.insert(0, PERF_TAKEHOME_PATH)

# Import simulator components
# If any of these imports fail, SIMULATOR_AVAILABLE is False and the imported
# names (Machine, Tree, CoreState, KernelBuilder, ...) are left undefined.
try:
    from problem import (
        Machine, Tree, Input, DebugInfo,
        build_mem_image, reference_kernel2,
        SLOT_LIMITS, VLEN, N_CORES, SCRATCH_SIZE, CoreState
    )
    from perf_takehome import KernelBuilder, HASH_STAGES
    startup_log.append("[OK] VLIW Simulator: OK")
    SIMULATOR_AVAILABLE = True
except Exception as e:
    startup_log.append(f"[ERR] VLIW Simulator: {e}")
    SIMULATOR_AVAILABLE = False

# Hugging Face Hub adapter persistence via dataset repo
# HF_HUB_AVAILABLE gates _try_download_adapter() / _try_upload_adapter().
try:
    from huggingface_hub import HfApi, snapshot_download
    startup_log.append("[OK] huggingface_hub: OK")
    HF_HUB_AVAILABLE = True
except Exception as e:
    startup_log.append(f"[ERR] huggingface_hub: {str(e)[:80]}")
    HF_HUB_AVAILABLE = False

# Constants
# Cycle count shown as the "Baseline" in the prompt/UI; also the initial best_cycles.
BASELINE_CYCLES = 147734
# Training stops once the best verified cycle count is <= this value.
TARGET_CYCLES = 1363
# A correct kernel scores SCORE_SCALE / cycles (see verify_perf_takehome_code).
SCORE_SCALE = 3000.0
# Partial-credit rewards granted to incorrect completions, one per stage flag
# reported by verify_perf_takehome_code (parse_ok / api_ok / exec_ok).
PARSE_REWARD = 0.02
API_REWARD = 0.05
EXEC_REWARD = 0.10
# Subtracted from the reward when a completion's code equals a prompt example verbatim.
COPY_PENALTY = 0.05
# Seeds the reward function samples from when building evaluation inputs.
SEED_POOL = [0, 1, 2, 3]
# Use "/data" when that directory exists, otherwise the current working directory.
PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
# Dataset repo and sub-directory used to sync the adapter; both overridable via environment.
ADAPTER_DATASET_REPO = os.environ.get("ADAPTER_DATASET_REPO", "CreativeEngineer/vliw-optimizer-adapters")
ADAPTER_DATASET_SUBDIR = os.environ.get("ADAPTER_DATASET_SUBDIR", "perf_takehome_latest")

# Training state
# Shared between the training thread and the Gradio handlers; the code in this
# file reads and writes it while holding state_lock.
training_state = {
    "is_training": False,
    "should_stop": False,
    "log": [],
    "best_cycles": BASELINE_CYCLES,
    "best_code": None,
    "step": 0,
}
state_lock = threading.Lock()

# Cache of evaluation fixtures keyed by seed (filled by _get_eval_context).
_eval_context = {}


def get_status():
    """Return every startup diagnostic line as one newline-separated string."""
    separator = "\n"
    return separator.join(startup_log)


def extract_code_block(text: str) -> str:
    """Pull the code out of a Markdown-fenced model response.

    Lookup order:
      1. the last closed ```python fence,
      2. the last closed generic ``` fence,
      3. everything after an unclosed ```python opener,
      4. everything after an unclosed ``` opener,
      5. the whole text.

    The result is always stripped of surrounding whitespace.
    """
    # Closed fences first; when several exist the last one wins.
    for fenced in (r"```python\s*(.*?)```", r"```\s*(.*?)```"):
        bodies = re.findall(fenced, text, re.DOTALL)
        if bodies:
            return bodies[-1].strip()

    # Unclosed fences (common when generation truncates): take what follows
    # the first opener, up to a later fence marker if one happens to exist.
    for opener in ("```python", "```"):
        _, found, tail = text.partition(opener)
        if found:
            return tail.partition("```")[0].strip()

    return text.strip()


def _hf_token() -> str | None:
    return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")


def _ensure_dir(path: str) -> None:
    Path(path).mkdir(parents=True, exist_ok=True)


def _adapter_exists(path: str) -> bool:
    return os.path.exists(os.path.join(path, "adapter_config.json"))


def _try_download_adapter(add_log) -> None:
    """Best-effort restore of the LoRA adapter from the Hub dataset repo.

    Downloads ``ADAPTER_DATASET_SUBDIR`` from ``ADAPTER_DATASET_REPO`` into
    the parent directory of ``ADAPTER_DIR``. If the downloaded folder is not
    ``ADAPTER_DIR`` itself (the sub-directory name was overridden), its
    contents are copied over ``ADAPTER_DIR``.

    Never raises: every failure is reported through ``add_log`` instead.

    Args:
        add_log: Callable taking one message string.
    """
    # Function-scope import: shutil is only needed for the optional copy step.
    import shutil

    if not HF_HUB_AVAILABLE:
        add_log("[ERR] Hub sync disabled: huggingface_hub not available")
        return

    adapters_root = os.path.dirname(ADAPTER_DIR)
    _ensure_dir(adapters_root)
    try:
        snapshot_download(
            repo_id=ADAPTER_DATASET_REPO,
            repo_type="dataset",
            allow_patterns=[f"{ADAPTER_DATASET_SUBDIR}/**"],
            local_dir=adapters_root,
            local_dir_use_symlinks=False,
            token=_hf_token(),
        )
        downloaded = os.path.join(adapters_root, ADAPTER_DATASET_SUBDIR)
        if not _adapter_exists(downloaded):
            add_log("[INFO] No adapter found in dataset yet")
            return
        if downloaded != ADAPTER_DIR:
            # copytree streams file contents instead of reading each file
            # fully into memory, and creates missing directories itself;
            # dirs_exist_ok=True overwrites into an existing ADAPTER_DIR.
            shutil.copytree(downloaded, ADAPTER_DIR, dirs_exist_ok=True)
        add_log(f"[OK] Downloaded adapter from dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
    except Exception as e:
        # Deliberately broad: a missing repo, missing token or network error
        # must not prevent training from starting with a fresh adapter.
        add_log(f"[INFO] Adapter download skipped: {str(e)[:160]}")


def _try_upload_adapter(add_log) -> None:
    """Best-effort push of ``ADAPTER_DIR`` to the Hub dataset repo.

    Skips, with a log line, when the hub client is unavailable, when no
    adapter has been saved yet, or when no token is configured. Upload
    errors are logged through ``add_log`` and never raised.

    Args:
        add_log: Callable taking one message string.
    """
    if not HF_HUB_AVAILABLE:
        add_log("[ERR] Hub sync disabled: huggingface_hub not available")
        return
    if not _adapter_exists(ADAPTER_DIR):
        add_log("[INFO] No adapter to upload yet")
        return

    hub_token = _hf_token()
    if hub_token is None:
        add_log("[INFO] No HF token set (HF_TOKEN/HUGGINGFACE_HUB_TOKEN); skipping upload")
        return

    # Both Hub calls address the same dataset repo.
    repo_ref = {"repo_id": ADAPTER_DATASET_REPO, "repo_type": "dataset"}
    try:
        client = HfApi(token=hub_token)
        client.create_repo(exist_ok=True, **repo_ref)
        client.upload_folder(
            folder_path=ADAPTER_DIR,
            path_in_repo=ADAPTER_DATASET_SUBDIR,
            commit_message="Update perf_takehome adapter",
            **repo_ref,
        )
        add_log(f"[OK] Uploaded adapter to dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
    except Exception as err:
        add_log(f"[INFO] Adapter upload skipped: {str(err)[:160]}")


def _run_machine_with_cycle_limit(machine: "Machine", max_cycles: int) -> bool:
    """Run ``machine`` to completion, aborting once ``max_cycles`` is reached.

    The ``Machine`` annotation is quoted on purpose: when the simulator
    import fails at startup the name ``Machine`` is undefined, and an
    unquoted annotation would raise ``NameError`` while this module loads,
    defeating the ``SIMULATOR_AVAILABLE = False`` fallback.

    Args:
        machine: Simulator instance; its ``cores``, ``program``, ``cycle``
            and ``step`` members are used.
        max_cycles: Cycle budget for the run.

    Returns:
        True if every core stopped before the budget was reached; False if
        the run was cut off (all cores are then forced to ``STOPPED``).
    """
    # Resume any core that was left paused.
    for core in machine.cores:
        if core.state == CoreState.PAUSED:
            core.state = CoreState.RUNNING
    while any(c.state == CoreState.RUNNING for c in machine.cores):
        has_non_debug = False
        for core in machine.cores:
            if core.state != CoreState.RUNNING:
                continue
            # Running off the end of the program stops the core.
            if core.pc >= len(machine.program):
                core.state = CoreState.STOPPED
                continue
            instr = machine.program[core.pc]
            core.pc += 1
            machine.step(instr, core)
            if any(name != "debug" for name in instr.keys()):
                has_non_debug = True
        # A pass in which every executed bundle was debug-only costs no cycle.
        if has_non_debug:
            machine.cycle += 1
            if machine.cycle >= max_cycles:
                for core in machine.cores:
                    core.state = CoreState.STOPPED
                return False
    return True


def _get_eval_context(seed: int) -> dict:
    """Return the cached evaluation fixture for ``seed``, building it if needed.

    The fixture holds the generated forest and input, the initial memory
    image, the slice of the reference kernel's final memory that a
    candidate must reproduce, and the start address of that slice.

    Args:
        seed: Seed for the global ``random`` module while the forest and
            input are generated.

    Returns:
        Dict with keys ``forest``, ``inp``, ``mem0``, ``expected`` and
        ``inp_values_p``.

    Raises:
        RuntimeError: If the reference kernel yields nothing.
    """
    with state_lock:
        cached = _eval_context.get(seed)
    if cached is not None:
        return cached

    # Generation goes through the global RNG, so seed it temporarily. The
    # finally block restores the caller's RNG state even when generation
    # raises; otherwise later random.choice / random.sample calls in this
    # process would silently become deterministic.
    saved_rng = random.getstate()
    try:
        random.seed(seed)
        # 10 / 256 / 16 match the arguments verify_perf_takehome_code passes
        # to build_kernel (presumably forest height, batch size, rounds —
        # confirm against problem.py).
        forest = Tree.generate(10)
        inp = Input.generate(forest, 256, 16)
    finally:
        random.setstate(saved_rng)

    mem0 = build_mem_image(forest, inp)

    # reference_kernel2 is iterated to exhaustion; only its last yielded
    # memory image is kept. It gets a copy so mem0 stays pristine.
    ref_mem = None
    for ref_mem in reference_kernel2(list(mem0)):
        pass
    if ref_mem is None:
        raise RuntimeError("Reference kernel produced no output")

    # Word 6 of the memory image holds the start address of the values
    # region that is compared against the candidate's output.
    inp_values_p = ref_mem[6]
    expected = ref_mem[inp_values_p : inp_values_p + len(inp.values)]
    ctx = {
        "forest": forest,
        "inp": inp,
        "mem0": mem0,
        "expected": expected,
        "inp_values_p": inp_values_p,
    }
    with state_lock:
        _eval_context[seed] = ctx
    return ctx


def _verify_result(
    msg,
    *,
    cycles=None,
    score=0.0,
    correctness=0.0,
    parse_ok=False,
    api_ok=False,
    exec_ok=False,
) -> dict:
    """Build the result dict returned by verify_perf_takehome_code."""
    return {
        "score": score,
        "correctness": correctness,
        "cycles": cycles,
        "msg": msg,
        "parse_ok": parse_ok,
        "api_ok": api_ok,
        "exec_ok": exec_ok,
    }


def verify_perf_takehome_code(code: str, seed: int = 123) -> dict:
    """Execute candidate kernel-builder code and grade it on the simulator.

    Stages, in order: the code must be non-empty and compile; mention
    ``OptimizedKernelBuilder`` and ``def run``; execute under a restricted
    builtins table; define ``OptimizedKernelBuilder``; build a kernel; run
    within the cycle limit; and reproduce the reference output.

    Args:
        code: Python source emitted by the model.
        seed: Selects the evaluation fixture (see ``_get_eval_context``).

    Returns:
        Dict with keys ``score`` (``SCORE_SCALE / cycles`` on success, else
        0.0), ``correctness`` (1.0 or 0.0), ``cycles`` (int or None),
        ``msg``, and the stage flags ``parse_ok`` / ``api_ok`` / ``exec_ok``.
        Never raises for failures inside the guarded section; any
        unexpected exception there yields an all-False result.
    """
    if not SIMULATOR_AVAILABLE:
        return _verify_result("Simulator unavailable")

    try:
        code = code.strip()
        if not code:
            return _verify_result("Empty code")

        try:
            compile(code, "<string>", "exec")
        except Exception as e:
            return _verify_result(f"Syntax error: {str(e)[:200]}")

        # Cheap textual checks for the required API surface.
        if "OptimizedKernelBuilder" not in code:
            return _verify_result("Missing OptimizedKernelBuilder", parse_ok=True)
        if "def run" not in code:
            return _verify_result("Missing run()", parse_ok=True)

        # NOTE(review): this exec()s model-generated code in-process. The
        # trimmed builtins table narrows what the code can reach by name,
        # but it should not be treated as a security sandbox.
        allowed = (
            abs, all, any, __build_class__, dict, enumerate, int, len,
            list, max, min, object, range, sum, tuple, zip,
        )
        exec_globals = {
            "__builtins__": {fn.__name__: fn for fn in allowed},
            "__name__": "__main__",
            "KernelBuilder": KernelBuilder,
            "HASH_STAGES": HASH_STAGES,
            "VLEN": VLEN,
            "SLOT_LIMITS": SLOT_LIMITS,
        }

        try:
            exec(code, exec_globals)
        except Exception as e:
            return _verify_result(
                f"Execution error: {str(e)[:200]}",
                parse_ok=True,
                api_ok=True,
            )

        def executed(msg, **fields):
            # Every outcome from here on has passed parse, API and exec.
            return _verify_result(
                msg, parse_ok=True, api_ok=True, exec_ok=True, **fields
            )

        if "OptimizedKernelBuilder" not in exec_globals:
            return executed("OptimizedKernelBuilder not defined after exec")

        ctx = _get_eval_context(seed)
        forest, inp, mem0 = ctx["forest"], ctx["inp"], ctx["mem0"]

        kb = exec_globals["OptimizedKernelBuilder"]()
        kb.build_kernel(10, len(forest.values), 256, 16)

        # The simulator gets a copy so the cached memory image stays pristine.
        machine = Machine(
            list(mem0),
            kb.instrs,
            kb.debug_info(),
            n_cores=N_CORES,
            trace=False,
        )
        machine.enable_pause = False
        machine.enable_debug = False

        if not _run_machine_with_cycle_limit(machine, max_cycles=250000):
            return executed(
                f"Exceeded cycle limit (cycles={machine.cycle})",
                cycles=int(machine.cycle),
            )
        cycles = machine.cycle

        # Sanity window on the cycle count before the output is compared.
        if cycles <= 100:
            return executed(f"Suspiciously low cycles ({cycles})", cycles=int(cycles))
        if cycles > 200000:
            return executed(f"Cycles too high ({cycles})", cycles=int(cycles))

        start = ctx["inp_values_p"]
        produced = machine.mem[start : start + len(inp.values)]
        if ctx["expected"] != produced:
            return executed(f"Incorrect output (cycles={cycles})", cycles=int(cycles))

        return executed(
            f"Success: {cycles} cycles",
            cycles=int(cycles),
            score=float(SCORE_SCALE / cycles),
            correctness=1.0,
        )
    except Exception as e:
        # NOTE(review): failures after a successful exec (e.g. build_kernel
        # raising) land here and report every stage flag as False.
        return _verify_result(f"Execution error: {str(e)[:200]}")


def perf_takehome_reward_fn(completions, prompts=None, **kwargs):
    """Score a batch of completions for the GRPO trainer.

    Each completion has its code block extracted and is verified against a
    seed drawn from ``SEED_POOL``. A correct kernel earns ``score + 1.0``;
    an incorrect one earns the sum of the partial-credit rewards for the
    stages it passed. Code identical to a prompt example loses
    ``COPY_PENALTY`` (floored at 0.0). The best correct cycle count and its
    code are recorded in ``training_state``.

    Args:
        completions: Iterable of completions; each is either a list of
            message dicts (the first one's ``content`` is used) or anything
            convertible with ``str``.
        prompts: Unused.
        **kwargs: Ignored.

    Returns:
        List of float rewards, one per completion, in input order.
    """
    stage_bonuses = (
        ("parse_ok", PARSE_REWARD),
        ("api_ok", API_REWARD),
        ("exec_ok", EXEC_REWARD),
    )
    rewards = []
    for completion in completions:
        if not isinstance(completion, list):
            text = str(completion)
        elif completion:
            text = completion[0].get("content", "")
        else:
            text = ""

        code = extract_code_block(text)
        result = verify_perf_takehome_code(code, seed=random.choice(SEED_POOL))
        correctness = result.get("correctness", 0.0)

        if correctness > 0:
            reward = float(result["score"]) + 1.0
        else:
            reward = 0.0
            for flag, bonus in stage_bonuses:
                if result.get(flag):
                    reward += bonus

        if code.strip() in EXAMPLE_CODE_SET:
            reward = max(0.0, reward - COPY_PENALTY)

        cycles = result.get("cycles")
        is_valid_candidate = (
            isinstance(cycles, int)
            and isinstance(correctness, (int, float))
            and correctness > 0
        )
        with state_lock:
            if is_valid_candidate and cycles < training_state["best_cycles"]:
                training_state["best_cycles"] = cycles
                training_state["best_code"] = code

        rewards.append(float(reward))
    return rewards


# Prompt template for VLIW optimization
# Few-shot snippets appended to the prompt by _select_examples(). Each entry
# is a short caption followed by one fenced python block that shows the
# required shape: an OptimizedKernelBuilder subclass plus a run() function.
EXAMPLE_POOL = [
    """Example format (not optimized):
```python
class OptimizedKernelBuilder(KernelBuilder):
    def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
        self.add("flow", ("halt",))

def run():
    return (0,)
```
""",
    """Example with scratch + const:
```python
class OptimizedKernelBuilder(KernelBuilder):
    def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
        tmp = self.alloc_scratch("tmp")
        self.add("load", ("const", tmp, 0))
        self.add("flow", ("halt",))

def run():
    return (0,)
```
""",
    """Example with load/store:
```python
class OptimizedKernelBuilder(KernelBuilder):
    def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
        addr = self.alloc_scratch("addr")
        val = self.alloc_scratch("val")
        self.add("load", ("const", addr, 4))
        self.add("load", ("load", val, addr))
        self.add("store", ("store", addr, val))
        self.add("flow", ("halt",))

def run():
    return (0,)
```
""",
    """Example with tiny loop:
```python
class OptimizedKernelBuilder(KernelBuilder):
    def build_kernel(self, forest_height, n_nodes, batch_size, rounds):
        tmp = self.alloc_scratch("tmp")
        for _ in range(2):
            self.add("load", ("const", tmp, 1))
        self.add("flow", ("halt",))

def run():
    return (0,)
```
""",
]

# Stripped code bodies of the examples above; perf_takehome_reward_fn applies
# COPY_PENALTY to completions whose extracted code is in this set.
EXAMPLE_CODE_SET = {
    extract_code_block(example) for example in EXAMPLE_POOL
}

def _select_examples() -> str:
    """Return two distinct random examples from ``EXAMPLE_POOL`` joined by a newline.

    Only one example is drawn when the pool holds fewer than two entries.
    """
    sample_size = 1 if len(EXAMPLE_POOL) < 2 else 2
    return "\n".join(random.sample(EXAMPLE_POOL, sample_size))

def build_prompt() -> str:
    """Assemble the task prompt given to the model.

    The text describes the architecture, the KernelBuilder API, the memory
    layout, the algorithm and the output rules, states the baseline and
    target cycle counts, and ends with a fresh random pick of examples.
    """
    example_section = _select_examples()
    prompt = f"""Write an optimized VLIW/SIMD kernel. OUTPUT ONLY ONE ```python CODE BLOCK.

ARCHITECTURE: 12 ALU + 6 VALU (VLEN=8) + 2 load + 2 store + 1 flow slots per cycle. 1536-word scratch.

API (KernelBuilder):
- alloc_scratch(name, length) -> addr
- scratch_const(val, name) -> addr
- add(engine, slot): engine in {{alu, valu, load, store, flow}}
  - alu: (op, dst, src1, src2) where op in {{+,-,*,//,%,^,&,|,<<,>>,<,==,!=,<=,>=,>}}
  - valu: same ops but on vectors (VLEN=8)
  - load: (load,dst,addr), (vload,dst,addr), (const,dst,val), (vbroadcast,dst,scalar_addr)
  - store: (store,addr,src), (vstore,addr,src)
  - flow: (select,dst,cond,t,f), (vselect,dst,cond,t,f), (cond_jump,cond,pc), (jump,pc), (halt,)
- label(name): mark code position
- build(slots, vliw=True): pack slots into VLIW bundle

MEMORY: mem[4]=forest_values, mem[5]=inp_indices, mem[6]=inp_values (256 elements each)

ALGORITHM: 16 rounds x 256 items:
  load idx,val
  node = tree[idx]
  val = hash(val ^ node) using HASH_STAGES
  idx = 2*idx + (1 if val%2==0 else 2)
  idx = 0 if idx >= n_nodes else idx
  store idx,val

RULES:
- Output exactly one python code block.
- The code block must define:
  - class OptimizedKernelBuilder(KernelBuilder): override build_kernel() and emit instructions using add()/build()
  - def run(): return any tuple (ignored), but must exist
- No imports.
- Examples are format-only. Do NOT copy them verbatim.

Baseline: {BASELINE_CYCLES:,} cycles. Target: <{TARGET_CYCLES:,} cycles.

{example_section}
"""
    return prompt


def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_continue):
    """Run GRPO + LoRA training with correctness-gated perf_takehome rewards.

    Loads the tokenizer and a 4-bit quantized base model, resumes the LoRA
    adapter from ``ADAPTER_DIR`` when one exists, then trains in chunks of
    ``chunk_steps`` steps. After each chunk the adapter is saved and an
    upload is attempted. Finally one completion is sampled and verified.

    Training stops when a stop is requested, when the best verified cycle
    count reaches ``TARGET_CYCLES``, or, with ``auto_continue``, when
    ``max_total_steps`` or ``max_minutes`` is exhausted. Without
    ``auto_continue`` exactly one chunk is run.

    Args:
        model_name: Hub id of the base causal LM.
        chunk_steps: Optimizer steps per trainer invocation.
        max_total_steps: Step budget across chunks (auto-continue only).
        max_minutes: Wall-clock budget in minutes (auto-continue only).
        auto_continue: Whether to chain chunks until a budget is hit.

    Returns:
        The full timestamped training log as one newline-joined string.
        Errors are written to the log rather than raised.
    """
    log = []

    def add_log(msg):
        log.append(f"[{time.strftime('%H:%M:%S')}] {msg}")
        with state_lock:
            # Publish a copy so UI readers never see the list mid-mutation.
            training_state["log"] = log.copy()

    with state_lock:
        training_state["is_training"] = True
        training_state["should_stop"] = False
        training_state["log"] = []
        training_state["best_cycles"] = BASELINE_CYCLES
        training_state["best_code"] = None
        training_state["step"] = 0

    # Bound up front so the cleanup in `finally` works even when loading fails.
    model = base_model = trainer = None

    try:
        # Heavy imports live inside the try so that a missing dependency is
        # logged and `is_training` is reset, instead of leaving the UI stuck
        # in the "training" state.
        import torch
        from datasets import Dataset
        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
        from peft import LoraConfig
        from peft import PeftModel
        from trl import GRPOConfig, GRPOTrainer
        from transformers import TrainerCallback

        add_log(f"Starting VLIW optimization training")
        add_log(f"Model: {model_name}")
        add_log(f"Chunk steps: {chunk_steps}")
        add_log(f"Auto-continue: {auto_continue} (max_total_steps={max_total_steps}, max_minutes={max_minutes})")
        add_log(f"Baseline: {BASELINE_CYCLES:,} cycles, Target: {TARGET_CYCLES:,} cycles")
        add_log(f"Adapter dir: {ADAPTER_DIR}")
        add_log(f"Adapter dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")

        # Load tokenizer
        add_log("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        add_log("[OK] Tokenizer ready")

        # Load model with 4-bit quantization
        add_log("Loading model (4-bit quantization)...")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )
        add_log(f"[OK] Base model loaded on {next(base_model.parameters()).device}")

        # Try to restore adapter from dataset before loading it
        _try_download_adapter(add_log)

        # Resume LoRA adapter if present
        resume_adapter = False
        if os.path.isdir(ADAPTER_DIR) and os.path.exists(os.path.join(ADAPTER_DIR, "adapter_config.json")):
            add_log("Loading existing LoRA adapter (resume)...")
            model = PeftModel.from_pretrained(base_model, ADAPTER_DIR, is_trainable=True)
            add_log("[OK] Adapter loaded")
            resume_adapter = True
        else:
            model = base_model

        # Create dataset with prompts
        add_log("Creating VLIW optimization dataset...")
        prompts = [build_prompt() for _ in range(16)]
        dataset = Dataset.from_dict({"prompt": prompts})
        add_log(f"[OK] Dataset ready: {len(prompts)} prompts")

        # LoRA config
        add_log("Setting up LoRA...")
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

        # Step counter shared by all chunks; each chunk's trainer starts
        # counting from zero, so the total is tracked here.
        progress = {"step": 0}
        start_time = time.time()
        max_seconds = float(max_minutes) * 60.0 if auto_continue else float("inf")
        total_target_steps = int(max_total_steps) if auto_continue else int(chunk_steps)

        # Custom callback for logging + early stop
        class VLIWCallback(TrainerCallback):
            def on_step_end(self, args, state, control, **kwargs):
                with state_lock:
                    progress["step"] += 1
                    training_state["step"] = progress["step"]
                    if training_state["should_stop"]:
                        control.should_training_stop = True
                    if training_state["best_cycles"] <= TARGET_CYCLES:
                        control.should_training_stop = True
                return control

            def on_log(self, args, state, control, logs=None, **kwargs):
                if not logs:
                    return
                loss = logs.get("loss")
                reward = logs.get("reward", logs.get("mean_reward"))
                step = progress["step"]
                # Format only when both metrics are numeric; a missing
                # reward used to reach the ':.4f' format spec as the string
                # "N/A" and raise ValueError inside the trainer.
                if isinstance(loss, float) and isinstance(reward, (int, float)):
                    add_log(f"Step {step}: loss={loss:.4f}, reward={reward:.4f}")
                else:
                    add_log(f"Step {step}: {logs}")

        add_log("Creating GRPO trainer with perf_takehome rewards...")
        output_dir = os.path.join(PERSIST_DIR, "grpo_perf_takehome_output")
        os.makedirs(output_dir, exist_ok=True)

        add_log("[OK] Trainer config ready")
        add_log("Starting training loop...")
        add_log("(Stops early if target reached; can auto-continue in chunks)")

        chunk_idx = 0
        while True:
            with state_lock:
                if training_state["should_stop"]:
                    break
                if training_state["best_cycles"] <= TARGET_CYCLES:
                    break

            if progress["step"] >= total_target_steps:
                break
            if (time.time() - start_time) >= max_seconds:
                break

            remaining = total_target_steps - progress["step"]
            this_chunk_steps = min(int(chunk_steps), int(remaining))
            if this_chunk_steps <= 0:
                break

            chunk_idx += 1
            add_log(f"Chunk {chunk_idx}: training {this_chunk_steps} steps...")

            config = GRPOConfig(
                output_dir=output_dir,
                num_train_epochs=1,
                max_steps=this_chunk_steps,
                per_device_train_batch_size=1,
                gradient_accumulation_steps=4,
                learning_rate=1e-5,
                logging_steps=1,
                save_steps=999999,
                report_to="none",
                remove_unused_columns=False,
                max_completion_length=2048,
                num_generations=4,
            )

            trainer_kwargs = {
                "model": model,
                "args": config,
                "train_dataset": dataset,
                "reward_funcs": perf_takehome_reward_fn,
                "processing_class": tokenizer,
                "callbacks": [VLIWCallback()],
            }
            # NOTE(review): when no adapter was resumed, every chunk passes
            # peft_config again with the same `model` object — confirm this
            # does not re-wrap an already LoRA-injected model on chunk 2+.
            if not resume_adapter:
                trainer_kwargs["peft_config"] = lora_config

            trainer = GRPOTrainer(**trainer_kwargs)

            train_result = trainer.train()
            metrics = train_result.metrics
            add_log(f"Chunk {chunk_idx} done: steps={metrics.get('train_steps', this_chunk_steps)}")

            # Save adapter after each chunk so it persists across restarts
            try:
                os.makedirs(os.path.dirname(ADAPTER_DIR), exist_ok=True)
                trainer.save_model(ADAPTER_DIR)
                add_log(f"[OK] Saved adapter to {ADAPTER_DIR}")
                _try_upload_adapter(add_log)
            except Exception as e:
                add_log(f"[ERR] Failed to save adapter: {str(e)[:120]}")

            if not auto_continue:
                break

        # Test generation
        add_log("Testing trained model...")
        inputs = tokenizer(build_prompt(), return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )
        # Decode only the newly generated tokens. The prompt itself contains
        # fenced example kernels, so decoding prompt + completion could make
        # extract_code_block return a prompt example instead of model output.
        prompt_len = inputs["input_ids"].shape[1]
        result = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        code = extract_code_block(result)
        verify_out = verify_perf_takehome_code(code)
        if verify_out.get("correctness", 0.0) > 0:
            cycles = verify_out.get("cycles")
            add_log(f"Generated kernel verified: {cycles:,} cycles")
            speedup = BASELINE_CYCLES / max(int(cycles), 1) if isinstance(cycles, int) else 0.0
            add_log(f"Speedup: {speedup:.2f}x over baseline")
        else:
            add_log(f"Generated kernel invalid: {verify_out.get('msg', '')[:160]}")

        add_log("\n[OK] All done!")

    except Exception as e:
        import traceback
        add_log(f"[ERR] Error: {e}")
        add_log(traceback.format_exc()[:800])
    finally:
        with state_lock:
            training_state["is_training"] = False
        # Drop every local reference to the model before asking CUDA to
        # release its cache; otherwise the weights are still alive and
        # nothing can be freed.
        model = base_model = trainer = None
        try:
            import gc
            import torch

            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as cleanup_err:
            # Cleanup is best-effort and must never mask the training outcome.
            add_log(f"[INFO] GPU cleanup skipped: {str(cleanup_err)[:120]}")

    return "\n".join(log)


def start_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_continue):
    """Start training in a background daemon thread.

    The UI values are converted before any shared state is touched. A blank
    or non-numeric field therefore produces an error message instead of
    leaving ``is_training`` stuck at True with no thread running.

    Args:
        model_name: Hub id of the base model.
        chunk_steps: Steps per chunk; converted with ``int``.
        max_total_steps: Total step budget; converted with ``int``.
        max_minutes: Wall-clock budget in minutes; converted with ``float``.
        auto_continue: Whether to chain chunks; converted with ``bool``.

    Returns:
        A status message for the log box: the current log tail when a run is
        already active, an error line for invalid settings, or a
        confirmation that training started.
    """
    with state_lock:
        if training_state["is_training"]:
            return "\n".join(training_state["log"][-200:]) or "Training already in progress. Please wait."

        try:
            thread_args = (
                model_name,
                int(chunk_steps),
                int(max_total_steps),
                float(max_minutes),
                bool(auto_continue),
            )
        except (TypeError, ValueError) as exc:
            # e.g. an emptied number field arrives as None.
            return f"[ERR] Invalid training settings: {exc}"

        training_state["is_training"] = True
        training_state["should_stop"] = False
        training_state["log"] = [f"[{time.strftime('%H:%M:%S')}] Starting training..."]
        training_state["step"] = 0

    thread = threading.Thread(target=run_training, args=thread_args, daemon=True)
    try:
        thread.start()
    except RuntimeError:
        # The thread could not be created: release the flag so the user can retry.
        with state_lock:
            training_state["is_training"] = False
        raise
    return "Training started. Logs will stream below."


def stop_training():
    """Ask the running training job to stop.

    Sets the shared ``should_stop`` flag when a run is active.

    Returns:
        A status message for the log box.
    """
    with state_lock:
        active = training_state["is_training"]
        if active:
            training_state["should_stop"] = True
    if not active:
        return "No training in progress"
    return "Stop requested. Training will stop after current step."


# Gradio UI
# Layout: header text, a row with the startup status box next to the
# training controls, then the log box and a manual refresh button.
with gr.Blocks(title="VLIW Optimizer") as demo:
    gr.Markdown("# VLIW Kernel Optimizer - RL Training")
    gr.Markdown(f"""
    Train a language model with reinforcement learning (LoRA) at test time to generate correct, fast VLIW/SIMD kernels.

    **Goal:** Reduce cycle count from **{BASELINE_CYCLES:,}** (baseline) to **<{TARGET_CYCLES:,}** (108x speedup)

    **How it works:**
    1. Model generates Python kernel builder code
    2. Simulator checks correctness vs reference and measures cycles
    3. GRPO updates LoRA weights; adapter is saved and reloaded from `{ADAPTER_DIR}`
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Snapshot of the startup probes, taken when the UI is built.
            status_box = gr.Textbox(
                label="System Status",
                value=get_status(),
                lines=12,
                interactive=False,
            )

        with gr.Column(scale=2):
            model_dropdown = gr.Dropdown(
                choices=[
                    "Qwen/Qwen2.5-Coder-1.5B-Instruct",
                    "Qwen/Qwen2.5-Coder-3B-Instruct",
                ],
                value="Qwen/Qwen2.5-Coder-1.5B-Instruct",
                label="Model",
            )
            chunk_steps_slider = gr.Slider(
                minimum=5,
                maximum=100,
                value=20,
                step=5,
                label="Chunk Steps",
            )
            auto_continue_checkbox = gr.Checkbox(
                value=False,
                label="Auto-continue (chain chunks)",
            )
            max_total_steps_slider = gr.Slider(
                minimum=5,
                maximum=500,
                value=100,
                step=5,
                label="Max Total Steps",
            )
            max_minutes_number = gr.Number(
                value=60,
                precision=0,
                label="Max Minutes",
            )

            with gr.Row():
                start_btn = gr.Button("Start Training", variant="primary")
                stop_btn = gr.Button("Stop", variant="stop")

    output_box = gr.Textbox(
        label="Training Log",
        lines=25,
        interactive=False,
        value="Click 'Start Training' to begin VLIW optimization.",
    )

    def poll_log():
        """Return the last 200 log lines, each cut to 400 characters."""
        with state_lock:
            if not training_state["log"]:
                return ""
            lines = training_state["log"][-200:]
            return "\n".join(line[:400] for line in lines)

    # The input order below must match start_training's parameter order.
    start_btn.click(
        start_training,
        [model_dropdown, chunk_steps_slider, max_total_steps_slider, max_minutes_number, auto_continue_checkbox],
        [output_box],
        queue=False,
    )
    stop_btn.click(stop_training, [], [output_box], queue=False)
    # The log box is only updated by this button and the two handlers above.
    refresh_btn = gr.Button("Refresh Log")
    refresh_btn.click(poll_log, outputs=[output_box], queue=False)

# Script entry point: serve the UI on all network interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)