""" HF Spaces app for VLIW kernel optimization via RL. Uses actual simulator for correctness-gated cycle-count rewards. """ import os import sys import gradio as gr import threading import time import random import re from copy import copy from pathlib import Path # Check imports at startup startup_log = [] def check_import(name, import_fn): try: result = import_fn() startup_log.append(f"[OK] {name}: {result}") return True except Exception as e: startup_log.append(f"[ERR] {name}: {str(e)[:80]}") return False check_import("torch", lambda: __import__("torch").__version__) check_import("transformers", lambda: __import__("transformers").__version__) check_import("datasets", lambda: __import__("datasets").__version__) check_import("peft", lambda: __import__("peft").__version__) check_import("trl", lambda: __import__("trl").__version__) check_import("huggingface_hub", lambda: __import__("huggingface_hub").__version__) try: from trl import GRPOConfig, GRPOTrainer startup_log.append("[OK] GRPOTrainer: OK") except Exception as e: startup_log.append(f"[ERR] GRPOTrainer: {e}") try: import torch if torch.cuda.is_available(): startup_log.append(f"[OK] CUDA: {torch.cuda.get_device_name(0)}") else: startup_log.append("[ERR] CUDA: Not available") except Exception as e: startup_log.append(f"[ERR] CUDA check: {e}") # Prefer simulator + KernelBuilder from bundled original_performance_takehome. # In Spaces, this keeps evaluation consistent and enables correctness checks. THIS_DIR = os.path.dirname(os.path.abspath(__file__)) PERF_TAKEHOME_PATH = os.path.join(THIS_DIR, "original_performance_takehome") if os.path.isdir(PERF_TAKEHOME_PATH): sys.path.insert(0, PERF_TAKEHOME_PATH) # Import simulator components try: from problem import ( Machine, Tree, Input, DebugInfo, build_mem_image, reference_kernel2, SLOT_LIMITS, VLEN, N_CORES, SCRATCH_SIZE, CoreState ) from perf_takehome import KernelBuilder, HASH_STAGES startup_log.append("[OK] VLIW Simulator: OK") SIMULATOR_AVAILABLE = True except Exception as e: startup_log.append(f"[ERR] VLIW Simulator: {e}") SIMULATOR_AVAILABLE = False # Hugging Face Hub adapter persistence via dataset repo try: from huggingface_hub import HfApi, snapshot_download startup_log.append("[OK] huggingface_hub: OK") HF_HUB_AVAILABLE = True except Exception as e: startup_log.append(f"[ERR] huggingface_hub: {str(e)[:80]}") HF_HUB_AVAILABLE = False # Constants BASELINE_CYCLES = 147734 TARGET_CYCLES = 1363 SCORE_SCALE = 3000.0 PARSE_REWARD = 0.02 API_REWARD = 0.05 EXEC_REWARD = 0.10 COPY_PENALTY = 0.05 SEED_POOL = [0, 1, 2, 3] PERSIST_DIR = "/data" if os.path.isdir("/data") else "." ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest") ADAPTER_DATASET_REPO = os.environ.get("ADAPTER_DATASET_REPO", "CreativeEngineer/vliw-optimizer-adapters") ADAPTER_DATASET_SUBDIR = os.environ.get("ADAPTER_DATASET_SUBDIR", "perf_takehome_latest") # Training state training_state = { "is_training": False, "should_stop": False, "log": [], "best_cycles": BASELINE_CYCLES, "best_code": None, "step": 0, } state_lock = threading.Lock() _eval_context = {} def get_status(): return "\n".join(startup_log) def extract_code_block(text: str) -> str: # Prefer closed fences pattern = r"```python\s*(.*?)```" matches = re.findall(pattern, text, re.DOTALL) if matches: return matches[-1].strip() pattern = r"```\s*(.*?)```" matches = re.findall(pattern, text, re.DOTALL) if matches: return matches[-1].strip() # Handle unclosed fences (common when generation truncates) if "```python" in text: after = text.split("```python", 1)[1] if "```" in after: after = after.split("```", 1)[0] return after.strip() if "```" in text: after = text.split("```", 1)[1] if "```" in after: after = after.split("```", 1)[0] return after.strip() return text.strip() def _hf_token() -> str | None: return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN") def _ensure_dir(path: str) -> None: Path(path).mkdir(parents=True, exist_ok=True) def _adapter_exists(path: str) -> bool: return os.path.exists(os.path.join(path, "adapter_config.json")) def _try_download_adapter(add_log) -> None: if not HF_HUB_AVAILABLE: add_log("[ERR] Hub sync disabled: huggingface_hub not available") return _ensure_dir(os.path.dirname(ADAPTER_DIR)) allow = [f"{ADAPTER_DATASET_SUBDIR}/**"] try: snapshot_download( repo_id=ADAPTER_DATASET_REPO, repo_type="dataset", allow_patterns=allow, local_dir=os.path.dirname(ADAPTER_DIR), local_dir_use_symlinks=False, token=_hf_token(), ) downloaded = os.path.join(os.path.dirname(ADAPTER_DIR), ADAPTER_DATASET_SUBDIR) if _adapter_exists(downloaded): if downloaded != ADAPTER_DIR: _ensure_dir(os.path.dirname(ADAPTER_DIR)) # Simple overwrite by copying files into ADAPTER_DIR _ensure_dir(ADAPTER_DIR) for root, _, files in os.walk(downloaded): rel = os.path.relpath(root, downloaded) dst_root = ADAPTER_DIR if rel == "." else os.path.join(ADAPTER_DIR, rel) _ensure_dir(dst_root) for name in files: src = os.path.join(root, name) dst = os.path.join(dst_root, name) with open(src, "rb") as fsrc, open(dst, "wb") as fdst: fdst.write(fsrc.read()) add_log(f"[OK] Downloaded adapter from dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}") else: add_log("[INFO] No adapter found in dataset yet") except Exception as e: add_log(f"[INFO] Adapter download skipped: {str(e)[:160]}") def _try_upload_adapter(add_log) -> None: if not HF_HUB_AVAILABLE: add_log("[ERR] Hub sync disabled: huggingface_hub not available") return if not _adapter_exists(ADAPTER_DIR): add_log("[INFO] No adapter to upload yet") return token = _hf_token() if token is None: add_log("[INFO] No HF token set (HF_TOKEN/HUGGINGFACE_HUB_TOKEN); skipping upload") return try: api = HfApi(token=token) api.create_repo(repo_id=ADAPTER_DATASET_REPO, repo_type="dataset", exist_ok=True) api.upload_folder( repo_id=ADAPTER_DATASET_REPO, repo_type="dataset", folder_path=ADAPTER_DIR, path_in_repo=ADAPTER_DATASET_SUBDIR, commit_message="Update perf_takehome adapter", ) add_log(f"[OK] Uploaded adapter to dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}") except Exception as e: add_log(f"[INFO] Adapter upload skipped: {str(e)[:160]}") def _run_machine_with_cycle_limit(machine: Machine, max_cycles: int) -> bool: for core in machine.cores: if core.state == CoreState.PAUSED: core.state = CoreState.RUNNING while any(c.state == CoreState.RUNNING for c in machine.cores): has_non_debug = False for core in machine.cores: if core.state != CoreState.RUNNING: continue if core.pc >= len(machine.program): core.state = CoreState.STOPPED continue instr = machine.program[core.pc] core.pc += 1 machine.step(instr, core) if any(name != "debug" for name in instr.keys()): has_non_debug = True if has_non_debug: machine.cycle += 1 if machine.cycle >= max_cycles: for core in machine.cores: core.state = CoreState.STOPPED return False return True def _get_eval_context(seed: int) -> dict: with state_lock: cached = _eval_context.get(seed) if cached is not None: return cached rng_state = random.getstate() random.seed(seed) forest = Tree.generate(10) inp = Input.generate(forest, 256, 16) random.setstate(rng_state) mem0 = build_mem_image(forest, inp) ref_mem = None for ref_mem in reference_kernel2(list(mem0)): pass if ref_mem is None: raise RuntimeError("Reference kernel produced no output") inp_values_p = ref_mem[6] expected = ref_mem[inp_values_p : inp_values_p + len(inp.values)] ctx = { "forest": forest, "inp": inp, "mem0": mem0, "expected": expected, "inp_values_p": inp_values_p, } with state_lock: _eval_context[seed] = ctx return ctx def verify_perf_takehome_code(code: str, seed: int = 123) -> dict: if not SIMULATOR_AVAILABLE: return { "score": 0.0, "correctness": 0.0, "cycles": None, "msg": "Simulator unavailable", "parse_ok": False, "api_ok": False, "exec_ok": False, } try: code = code.strip() if not code: return { "score": 0.0, "correctness": 0.0, "cycles": None, "msg": "Empty code", "parse_ok": False, "api_ok": False, "exec_ok": False, } try: compile(code, "", "exec") except Exception as e: return { "score": 0.0, "correctness": 0.0, "cycles": None, "msg": f"Syntax error: {str(e)[:200]}", "parse_ok": False, "api_ok": False, "exec_ok": False, } if "OptimizedKernelBuilder" not in code: return { "score": 0.0, "correctness": 0.0, "cycles": None, "msg": "Missing OptimizedKernelBuilder", "parse_ok": True, "api_ok": False, "exec_ok": False, } if "def run" not in code: return { "score": 0.0, "correctness": 0.0, "cycles": None, "msg": "Missing run()", "parse_ok": True, "api_ok": False, "exec_ok": False, } safe_builtins = { "abs": abs, "all": all, "any": any, "dict": dict, "enumerate": enumerate, "int": int, "len": len, "list": list, "max": max, "min": min, "range": range, "sum": sum, "tuple": tuple, "zip": zip, } exec_globals = { "__builtins__": safe_builtins, "KernelBuilder": KernelBuilder, "HASH_STAGES": HASH_STAGES, "VLEN": VLEN, "SLOT_LIMITS": SLOT_LIMITS, } try: exec(code, exec_globals) except Exception as e: return { "score": 0.0, "correctness": 0.0, "cycles": None, "msg": f"Execution error: {str(e)[:200]}", "parse_ok": True, "api_ok": True, "exec_ok": False, } if "OptimizedKernelBuilder" not in exec_globals: return { "score": 0.0, "correctness": 0.0, "cycles": None, "msg": "OptimizedKernelBuilder not defined after exec", "parse_ok": True, "api_ok": True, "exec_ok": True, } ctx = _get_eval_context(seed) forest = ctx["forest"] inp = ctx["inp"] mem0 = ctx["mem0"] kb = exec_globals["OptimizedKernelBuilder"]() kb.build_kernel(10, len(forest.values), 256, 16) machine = Machine( list(mem0), kb.instrs, kb.debug_info(), n_cores=N_CORES, trace=False, ) machine.enable_pause = False machine.enable_debug = False ok = _run_machine_with_cycle_limit(machine, max_cycles=250000) if not ok: return { "score": 0.0, "correctness": 0.0, "cycles": int(machine.cycle), "msg": f"Exceeded cycle limit (cycles={machine.cycle})", "parse_ok": True, "api_ok": True, "exec_ok": True, } cycles = machine.cycle if cycles <= 100: return { "score": 0.0, "correctness": 0.0, "cycles": int(cycles), "msg": f"Suspiciously low cycles ({cycles})", "parse_ok": True, "api_ok": True, "exec_ok": True, } if cycles > 200000: return { "score": 0.0, "correctness": 0.0, "cycles": int(cycles), "msg": f"Cycles too high ({cycles})", "parse_ok": True, "api_ok": True, "exec_ok": True, } inp_values_p = ctx["inp_values_p"] expected = ctx["expected"] actual = machine.mem[inp_values_p : inp_values_p + len(inp.values)] if expected != actual: return { "score": 0.0, "correctness": 0.0, "cycles": int(cycles), "msg": f"Incorrect output (cycles={cycles})", "parse_ok": True, "api_ok": True, "exec_ok": True, } score = SCORE_SCALE / cycles return { "score": float(score), "correctness": 1.0, "cycles": int(cycles), "msg": f"Success: {cycles} cycles", "parse_ok": True, "api_ok": True, "exec_ok": True, } except Exception as e: return { "score": 0.0, "correctness": 0.0, "cycles": None, "msg": f"Execution error: {str(e)[:200]}", "parse_ok": False, "api_ok": False, "exec_ok": False, } def perf_takehome_reward_fn(completions, prompts=None, **kwargs): rewards = [] for completion in completions: if isinstance(completion, list): text = completion[0].get("content", "") if completion else "" else: text = str(completion) code = extract_code_block(text) seed = random.choice(SEED_POOL) result = verify_perf_takehome_code(code, seed=seed) reward = 0.0 if result.get("correctness", 0.0) > 0: reward = float(result["score"]) + 1.0 else: if result.get("parse_ok"): reward += PARSE_REWARD if result.get("api_ok"): reward += API_REWARD if result.get("exec_ok"): reward += EXEC_REWARD if code.strip() in EXAMPLE_CODE_SET: reward = max(0.0, reward - COPY_PENALTY) cycles = result.get("cycles") correctness = result.get("correctness", 0.0) with state_lock: if isinstance(cycles, int) and isinstance(correctness, (int, float)) and correctness > 0 and cycles < training_state["best_cycles"]: training_state["best_cycles"] = cycles training_state["best_code"] = code rewards.append(float(reward)) return rewards # Prompt template for VLIW optimization EXAMPLE_POOL = [ """Example format (not optimized): ```python class OptimizedKernelBuilder(KernelBuilder): def build_kernel(self, forest_height, n_nodes, batch_size, rounds): self.add("flow", ("halt",)) def run(): return (0,) ``` """, """Example with scratch + const: ```python class OptimizedKernelBuilder(KernelBuilder): def build_kernel(self, forest_height, n_nodes, batch_size, rounds): tmp = self.alloc_scratch("tmp") self.add("load", ("const", tmp, 0)) self.add("flow", ("halt",)) def run(): return (0,) ``` """, """Example with load/store: ```python class OptimizedKernelBuilder(KernelBuilder): def build_kernel(self, forest_height, n_nodes, batch_size, rounds): addr = self.alloc_scratch("addr") val = self.alloc_scratch("val") self.add("load", ("const", addr, 4)) self.add("load", ("load", val, addr)) self.add("store", ("store", addr, val)) self.add("flow", ("halt",)) def run(): return (0,) ``` """, """Example with tiny loop: ```python class OptimizedKernelBuilder(KernelBuilder): def build_kernel(self, forest_height, n_nodes, batch_size, rounds): tmp = self.alloc_scratch("tmp") for _ in range(2): self.add("load", ("const", tmp, 1)) self.add("flow", ("halt",)) def run(): return (0,) ``` """, ] EXAMPLE_CODE_SET = { extract_code_block(example) for example in EXAMPLE_POOL } def _select_examples() -> str: k = 2 if len(EXAMPLE_POOL) >= 2 else 1 picks = random.sample(EXAMPLE_POOL, k) return "\n".join(picks) def build_prompt() -> str: examples = _select_examples() return f"""Write an optimized VLIW/SIMD kernel. OUTPUT ONLY ONE ```python CODE BLOCK. ARCHITECTURE: 12 ALU + 6 VALU (VLEN=8) + 2 load + 2 store + 1 flow slots per cycle. 1536-word scratch. API (KernelBuilder): - alloc_scratch(name, length) -> addr - scratch_const(val, name) -> addr - add(engine, slot): engine in {{alu, valu, load, store, flow}} - alu: (op, dst, src1, src2) where op in {{+,-,*,//,%,^,&,|,<<,>>,<,==,!=,<=,>=,>}} - valu: same ops but on vectors (VLEN=8) - load: (load,dst,addr), (vload,dst,addr), (const,dst,val), (vbroadcast,dst,scalar_addr) - store: (store,addr,src), (vstore,addr,src) - flow: (select,dst,cond,t,f), (vselect,dst,cond,t,f), (cond_jump,cond,pc), (jump,pc), (halt,) - label(name): mark code position - build(slots, vliw=True): pack slots into VLIW bundle MEMORY: mem[4]=forest_values, mem[5]=inp_indices, mem[6]=inp_values (256 elements each) ALGORITHM: 16 rounds x 256 items: load idx,val node = tree[idx] val = hash(val ^ node) using HASH_STAGES idx = 2*idx + (1 if val%2==0 else 2) idx = 0 if idx >= n_nodes else idx store idx,val RULES: - Output exactly one python code block. - The code block must define: - class OptimizedKernelBuilder(KernelBuilder): override build_kernel() and emit instructions using add()/build() - def run(): return any tuple (ignored), but must exist - No imports. - Examples are format-only. Do NOT copy them verbatim. Baseline: {BASELINE_CYCLES:,} cycles. Target: <{TARGET_CYCLES:,} cycles. {examples} """ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_continue): """Run GRPO + LoRA training with correctness-gated perf_takehome rewards.""" import torch from datasets import Dataset from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig from peft import LoraConfig from peft import PeftModel from trl import GRPOConfig, GRPOTrainer from transformers import TrainerCallback log = [] def add_log(msg): log.append(f"[{time.strftime('%H:%M:%S')}] {msg}") with state_lock: training_state["log"] = log.copy() with state_lock: training_state["is_training"] = True training_state["should_stop"] = False training_state["log"] = [] training_state["best_cycles"] = BASELINE_CYCLES training_state["best_code"] = None training_state["step"] = 0 try: add_log(f"Starting VLIW optimization training") add_log(f"Model: {model_name}") add_log(f"Chunk steps: {chunk_steps}") add_log(f"Auto-continue: {auto_continue} (max_total_steps={max_total_steps}, max_minutes={max_minutes})") add_log(f"Baseline: {BASELINE_CYCLES:,} cycles, Target: {TARGET_CYCLES:,} cycles") add_log(f"Adapter dir: {ADAPTER_DIR}") add_log(f"Adapter dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}") # Load tokenizer add_log("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token add_log("[OK] Tokenizer ready") # Load model with 4-bit quantization add_log("Loading model (4-bit quantization)...") bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, ) base_model = AutoModelForCausalLM.from_pretrained( model_name, quantization_config=bnb_config, device_map="auto", trust_remote_code=True, ) add_log(f"[OK] Base model loaded on {next(base_model.parameters()).device}") # Try to restore adapter from dataset before loading it _try_download_adapter(add_log) # Resume LoRA adapter if present resume_adapter = False if os.path.isdir(ADAPTER_DIR) and os.path.exists(os.path.join(ADAPTER_DIR, "adapter_config.json")): add_log("Loading existing LoRA adapter (resume)...") model = PeftModel.from_pretrained(base_model, ADAPTER_DIR, is_trainable=True) add_log("[OK] Adapter loaded") resume_adapter = True else: model = base_model # Create dataset with prompts add_log("Creating VLIW optimization dataset...") prompts = [build_prompt() for _ in range(16)] dataset = Dataset.from_dict({"prompt": prompts}) add_log(f"[OK] Dataset ready: {len(prompts)} prompts") # LoRA config add_log("Setting up LoRA...") lora_config = LoraConfig( r=16, lora_alpha=32, target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", ) progress = {"step": 0} start_time = time.time() max_seconds = float(max_minutes) * 60.0 if auto_continue else float("inf") total_target_steps = int(max_total_steps) if auto_continue else int(chunk_steps) # Custom callback for logging + early stop class VLIWCallback(TrainerCallback): def on_step_end(self, args, state, control, **kwargs): with state_lock: progress["step"] += 1 training_state["step"] = progress["step"] if training_state["should_stop"]: control.should_training_stop = True if training_state["best_cycles"] <= TARGET_CYCLES: control.should_training_stop = True return control def on_log(self, args, state, control, logs=None, **kwargs): if logs: loss = logs.get("loss", "N/A") reward = logs.get("reward", logs.get("mean_reward", "N/A")) step = progress["step"] add_log(f"Step {step}: loss={loss:.4f}, reward={reward:.4f}" if isinstance(loss, float) else f"Step {step}: {logs}") add_log("Creating GRPO trainer with perf_takehome rewards...") output_dir = os.path.join(PERSIST_DIR, "grpo_perf_takehome_output") os.makedirs(output_dir, exist_ok=True) add_log("[OK] Trainer config ready") add_log("Starting training loop...") add_log("(Stops early if target reached; can auto-continue in chunks)") chunk_idx = 0 while True: with state_lock: if training_state["should_stop"]: break if training_state["best_cycles"] <= TARGET_CYCLES: break if progress["step"] >= total_target_steps: break if (time.time() - start_time) >= max_seconds: break remaining = total_target_steps - progress["step"] this_chunk_steps = min(int(chunk_steps), int(remaining)) if this_chunk_steps <= 0: break chunk_idx += 1 add_log(f"Chunk {chunk_idx}: training {this_chunk_steps} steps...") config = GRPOConfig( output_dir=output_dir, num_train_epochs=1, max_steps=this_chunk_steps, per_device_train_batch_size=1, gradient_accumulation_steps=4, learning_rate=1e-5, logging_steps=1, save_steps=999999, report_to="none", remove_unused_columns=False, max_completion_length=2048, num_generations=4, ) trainer_kwargs = { "model": model, "args": config, "train_dataset": dataset, "reward_funcs": perf_takehome_reward_fn, "processing_class": tokenizer, "callbacks": [VLIWCallback()], } if not resume_adapter: trainer_kwargs["peft_config"] = lora_config trainer = GRPOTrainer(**trainer_kwargs) train_result = trainer.train() metrics = train_result.metrics add_log(f"Chunk {chunk_idx} done: steps={metrics.get('train_steps', this_chunk_steps)}") # Save adapter after each chunk so it persists across restarts try: os.makedirs(os.path.dirname(ADAPTER_DIR), exist_ok=True) trainer.save_model(ADAPTER_DIR) add_log(f"[OK] Saved adapter to {ADAPTER_DIR}") _try_upload_adapter(add_log) except Exception as e: add_log(f"[ERR] Failed to save adapter: {str(e)[:120]}") if not auto_continue: break # Test generation add_log("Testing trained model...") inputs = tokenizer(build_prompt(), return_tensors="pt").to(model.device) with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=1024, do_sample=True, temperature=0.7, top_p=0.9, ) result = tokenizer.decode(outputs[0], skip_special_tokens=True) code = extract_code_block(result) verify_out = verify_perf_takehome_code(code) if verify_out.get("correctness", 0.0) > 0: cycles = verify_out.get("cycles") add_log(f"Generated kernel verified: {cycles:,} cycles") speedup = BASELINE_CYCLES / max(int(cycles), 1) if isinstance(cycles, int) else 0.0 add_log(f"Speedup: {speedup:.2f}x over baseline") else: add_log(f"Generated kernel invalid: {verify_out.get('msg', '')[:160]}") add_log("\n[OK] All done!") except Exception as e: import traceback add_log(f"[ERR] Error: {e}") add_log(traceback.format_exc()[:800]) finally: with state_lock: training_state["is_training"] = False try: del model torch.cuda.empty_cache() except: pass return "\n".join(log) def start_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_continue): """Start training.""" with state_lock: if training_state["is_training"]: return "\n".join(training_state["log"][-200:]) or "Training already in progress. Please wait." training_state["is_training"] = True training_state["should_stop"] = False training_state["log"] = [f"[{time.strftime('%H:%M:%S')}] Starting training..."] training_state["step"] = 0 thread = threading.Thread( target=run_training, args=( model_name, int(chunk_steps), int(max_total_steps), float(max_minutes), bool(auto_continue), ), daemon=True, ) thread.start() return "Training started. Logs will stream below." def stop_training(): """Request stop.""" with state_lock: if not training_state["is_training"]: return "No training in progress" training_state["should_stop"] = True return "Stop requested. Training will stop after current step." # Gradio UI with gr.Blocks(title="VLIW Optimizer") as demo: gr.Markdown("# VLIW Kernel Optimizer - RL Training") gr.Markdown(f""" Train a language model with reinforcement learning (LoRA) at test time to generate correct, fast VLIW/SIMD kernels. **Goal:** Reduce cycle count from **{BASELINE_CYCLES:,}** (baseline) to **<{TARGET_CYCLES:,}** (108x speedup) **How it works:** 1. Model generates Python kernel builder code 2. Simulator checks correctness vs reference and measures cycles 3. GRPO updates LoRA weights; adapter is saved and reloaded from `{ADAPTER_DIR}` """) with gr.Row(): with gr.Column(scale=1): status_box = gr.Textbox( label="System Status", value=get_status(), lines=12, interactive=False, ) with gr.Column(scale=2): model_dropdown = gr.Dropdown( choices=[ "Qwen/Qwen2.5-Coder-1.5B-Instruct", "Qwen/Qwen2.5-Coder-3B-Instruct", ], value="Qwen/Qwen2.5-Coder-1.5B-Instruct", label="Model", ) chunk_steps_slider = gr.Slider( minimum=5, maximum=100, value=20, step=5, label="Chunk Steps", ) auto_continue_checkbox = gr.Checkbox( value=False, label="Auto-continue (chain chunks)", ) max_total_steps_slider = gr.Slider( minimum=5, maximum=500, value=100, step=5, label="Max Total Steps", ) max_minutes_number = gr.Number( value=60, precision=0, label="Max Minutes", ) with gr.Row(): start_btn = gr.Button("Start Training", variant="primary") stop_btn = gr.Button("Stop", variant="stop") output_box = gr.Textbox( label="Training Log", lines=25, interactive=False, value="Click 'Start Training' to begin VLIW optimization.", ) def poll_log(): with state_lock: if not training_state["log"]: return "" lines = training_state["log"][-200:] return "\n".join(line[:400] for line in lines) start_btn.click( start_training, [model_dropdown, chunk_steps_slider, max_total_steps_slider, max_minutes_number, auto_continue_checkbox], [output_box], queue=False, ) stop_btn.click(stop_training, [], [output_box], queue=False) refresh_btn = gr.Button("Refresh Log") refresh_btn.click(poll_log, outputs=[output_box], queue=False) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)