CreativeEngineer committed on
Commit
f4dcd8f
·
1 Parent(s): 61f3fc6

Debug version to check imports

Browse files
Files changed (1) hide show
  1. app.py +75 -523
app.py CHANGED
@@ -1,548 +1,100 @@
1
  """
2
  HF Spaces app for VLIW kernel optimization via RL.
3
- Deploy to HF Spaces Pro (A10G GPU).
4
-
5
- This is self-contained - includes verification logic inline.
6
  """
7
  import os
8
  import sys
9
- import re
10
- import threading
11
- import time
12
- import random
13
- from datetime import datetime
14
-
15
  import gradio as gr
16
 
17
- # Thread lock for safe state access
18
- training_state_lock = threading.Lock()
19
-
20
- # Add simulator path
21
- SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
22
- PERF_TAKEHOME_PATH = os.path.join(SCRIPT_DIR, "original_performance_takehome")
23
- if os.path.exists(PERF_TAKEHOME_PATH):
24
- sys.path.insert(0, PERF_TAKEHOME_PATH)
25
-
26
- # Constants
27
- BASELINE_CYCLES = 147734
28
- TARGET_CYCLES = 1363
29
- SCORE_SCALE = 3000.0
30
-
31
- # Training state (global)
32
- training_state = {
33
- "running": False,
34
- "step": 0,
35
- "total_steps": 0,
36
- "best_cycles": BASELINE_CYCLES,
37
- "best_code": None,
38
- "log": [],
39
- "start_time": None,
40
- "results": [],
41
- }
42
-
43
- SYSTEM_PROMPT = '''Write optimized VLIW/SIMD kernel. OUTPUT ONLY ONE ```python CODE BLOCK.
44
-
45
- ARCHITECTURE: 12 ALU + 6 VALU (VLEN=8) + 2 load + 2 store + 1 flow slots per cycle. 1536-word scratch.
46
-
47
- API:
48
- - alloc_scratch(name, length) -> addr
49
- - scratch_const(val, name) -> addr
50
- - add(engine, slot): engine in {alu, valu, load, store, flow}
51
- - alu: (op, dst, src1, src2) where op in {+,-,*,/,%,^,&,|,==,!=,<,>,<=,>=}
52
- - valu: same ops but on vectors (VLEN=8)
53
- - load: (load,dst,addr), (vload,dst,addr), (const,dst,val), (vbroadcast,dst,scalar_addr)
54
- - store: (store,addr,src), (vstore,addr,src)
55
- - flow: (select,dst,cond,t,f), (jump,label), (jump_if_zero,cond,label), (halt,)
56
- - label(name): mark code position
57
- - build(slots, vliw=True): pack slots into VLIW bundle
58
-
59
- MEMORY: mem[4]=forest_values, mem[5]=inp_indices, mem[6]=inp_values (256 elements each)
60
-
61
- ALGORITHM: 16 rounds x 256 items: load idx,val; val=hash(val^tree[idx]); idx=2*idx+(1 or 2 based on val%2); store. Hash is 16 stages using HASH_STAGES constant.
62
-
63
- OPTIMIZATION:
64
- 1. Use vload/vstore: process 8 elements per instruction (256/8 = 32 vector iterations)
65
- 2. Pack ops: 6 VALU slots = 6 vector ops per cycle
66
- 3. Unroll: minimize loop overhead
67
- 4. Pipeline: overlap loads with compute
68
-
69
- You MUST override build_kernel() with actual instructions. Do NOT just call super().
70
- '''
71
-
72
-
73
- def extract_code_block(text: str) -> str:
74
- """Extract python code from markdown code blocks."""
75
- pattern = r"```python\s*(.*?)```"
76
- matches = re.findall(pattern, text, re.DOTALL)
77
- if matches:
78
- return matches[-1].strip()
79
- pattern = r"```\s*(.*?)```"
80
- matches = re.findall(pattern, text, re.DOTALL)
81
- if matches:
82
- return matches[-1].strip()
83
- return text.strip()
84
 
85
-
86
- def verify_perf_takehome(generation: str, score_scale: float = SCORE_SCALE) -> dict:
87
- """
88
- Verify kernel code and return score.
89
- Self-contained verification using the simulator.
90
- """
91
  try:
92
- code = generation.strip()
93
-
94
- if not code:
95
- return {"score": 0.0, "correctness": 0.0, "performance": -1000000,
96
- "msg": "Empty code", "cycles": None}
97
-
98
- if "def run" not in code:
99
- return {"score": 0.0, "correctness": 0.0, "performance": -1000000,
100
- "msg": "No 'run' function defined", "cycles": None}
101
-
102
- # Build execution environment
103
- exec_globals = {
104
- "FOREST_HEIGHT": 10,
105
- "ROUNDS": 16,
106
- "BATCH_SIZE": 256,
107
- }
108
-
109
- # Setup imports
110
- setup_code = f'''
111
- import sys
112
- sys.path.insert(0, "{PERF_TAKEHOME_PATH}")
113
- from problem import Machine, Tree, Input, build_mem_image, N_CORES, VLEN, reference_kernel2
114
- from perf_takehome import KernelBuilder, HASH_STAGES, BASELINE
115
- import random
116
- '''
117
- full_code = setup_code + "\n" + code
118
- exec(full_code, exec_globals)
119
-
120
- if "run" not in exec_globals:
121
- return {"score": 0.0, "correctness": 0.0, "performance": -1000000,
122
- "msg": "No 'run' function after exec", "cycles": None}
123
-
124
- # Require OptimizedKernelBuilder
125
- if "OptimizedKernelBuilder" not in exec_globals:
126
- return {"score": 0.0, "correctness": 0.0, "performance": -1000000,
127
- "msg": "No OptimizedKernelBuilder class", "cycles": None}
128
-
129
- # Run verification
130
- random.seed(123)
131
- from problem import Tree, Input, Machine, build_mem_image, N_CORES, reference_kernel2
132
-
133
- forest = Tree.generate(10)
134
- inp = Input.generate(forest, 256, 16)
135
- mem = build_mem_image(forest, inp)
136
-
137
- # Get reference output
138
- ref_mem = None
139
- for ref_mem in reference_kernel2(list(mem)):
140
- pass
141
-
142
- if ref_mem is None:
143
- return {"score": 0.0, "correctness": 0.0, "performance": -1000000,
144
- "msg": "Reference kernel failed", "cycles": None}
145
-
146
- # Run submitted kernel
147
- kb = exec_globals["OptimizedKernelBuilder"]()
148
- kb.build_kernel(10, len(forest.values), 256, 16)
149
- machine = Machine(list(mem), kb.instrs, kb.debug_info(), n_cores=N_CORES)
150
- machine.enable_pause = False
151
- machine.enable_debug = False
152
- machine.run()
153
-
154
- cycles = machine.cycle
155
-
156
- # Validate cycles
157
- if cycles <= 100:
158
- return {"score": 0.0, "correctness": 0.0, "performance": -1000000,
159
- "msg": f"Suspiciously low cycles ({cycles})", "cycles": cycles}
160
-
161
- if cycles > 200000:
162
- return {"score": 0.0, "correctness": 0.0, "performance": -1000000,
163
- "msg": f"Cycles too high: {cycles}", "cycles": cycles}
164
-
165
- # Compare outputs
166
- inp_values_p = ref_mem[6]
167
- expected = ref_mem[inp_values_p : inp_values_p + len(inp.values)]
168
- actual = machine.mem[inp_values_p : inp_values_p + len(inp.values)]
169
-
170
- if expected != actual:
171
- return {"score": 0.0, "correctness": 0.0, "performance": -1000000,
172
- "msg": f"Incorrect output (cycles={cycles})", "cycles": cycles}
173
-
174
- # Success!
175
- score = score_scale / cycles
176
- return {
177
- "score": score,
178
- "correctness": 1.0,
179
- "performance": -cycles,
180
- "msg": f"Success: {cycles} cycles",
181
- "cycles": cycles,
182
- }
183
-
184
  except Exception as e:
185
- import traceback
186
- tb = traceback.format_exc()
187
- error_line = tb.strip().split('\n')[-1][:200]
188
- return {"score": 0.0, "correctness": 0.0, "performance": -1000000,
189
- "msg": f"Error: {error_line}", "cycles": None}
190
-
191
-
192
- def log(msg: str):
193
- """Add to training log (thread-safe)."""
194
- timestamp = datetime.now().strftime("%H:%M:%S")
195
- formatted = f"[{timestamp}] {msg}"
196
- with training_state_lock:
197
- training_state["log"].append(formatted)
198
- print(formatted)
199
-
200
-
201
- def reward_function(completions: list[str], **kwargs) -> list[float]:
202
- """Compute rewards for completions."""
203
- rewards = []
204
- for completion in completions:
205
- try:
206
- code = extract_code_block(completion)
207
- result = verify_perf_takehome(code)
208
- reward = result["score"]
209
-
210
- if result["correctness"] > 0:
211
- reward += 1.0
212
- cycles = result.get("cycles")
213
- if cycles:
214
- with training_state_lock:
215
- training_state["results"].append({
216
- "step": training_state["step"],
217
- "cycles": cycles,
218
- "time": time.time() - (training_state["start_time"] or time.time())
219
- })
220
- if cycles < training_state["best_cycles"]:
221
- training_state["best_cycles"] = cycles
222
- training_state["best_code"] = code
223
- speedup = BASELINE_CYCLES / cycles
224
- log(f"NEW BEST: {cycles:,} cycles ({speedup:.2f}x speedup)")
225
-
226
- rewards.append(reward)
227
-
228
- except Exception as e:
229
- log(f"Reward error: {str(e)[:100]}")
230
- rewards.append(0.0)
231
-
232
- return rewards
233
-
234
-
235
- def build_prompt(current_cycles: int = BASELINE_CYCLES, last_code: str = "") -> str:
236
- """Build training prompt."""
237
- prompt = f"""{SYSTEM_PROMPT}
238
-
239
- CURRENT: {current_cycles:,} cycles. TARGET: <{TARGET_CYCLES:,} cycles (need {current_cycles//TARGET_CYCLES}x speedup).
240
- """
241
- if last_code:
242
- prompt += f"""
243
- Previous best attempt:
244
- ```python
245
- {last_code[:2000]}
246
- ```
247
-
248
- Improve this code to reduce cycles further.
249
- """
250
- else:
251
- prompt += """
252
- Write a complete solution with:
253
- 1. A run() function that returns (cycles, code_string)
254
- 2. An OptimizedKernelBuilder class with build_kernel() method
255
- """
256
- return prompt
257
-
258
-
259
- def run_training(model_name: str, num_steps: int, batch_size: int, lr: float, lora_rank: int):
260
- """Main training loop."""
261
- global training_state
262
-
263
- with training_state_lock:
264
- training_state["running"] = True
265
- training_state["step"] = 0
266
- training_state["total_steps"] = num_steps
267
- training_state["best_cycles"] = BASELINE_CYCLES
268
- training_state["best_code"] = None
269
- training_state["log"] = []
270
- training_state["results"] = []
271
- training_state["start_time"] = time.time()
272
-
273
- log(f"Starting training: {model_name}")
274
- log(f"Steps: {num_steps}, Batch: {batch_size}, LR: {lr}, LoRA rank: {lora_rank}")
275
-
276
  try:
277
- import torch
278
- from datasets import Dataset
279
- from transformers import AutoTokenizer, BitsAndBytesConfig, TrainerCallback
280
- from peft import LoraConfig
281
- from trl import GRPOConfig, GRPOTrainer
282
-
283
- # Check GPU
284
- if torch.cuda.is_available():
285
- gpu_name = torch.cuda.get_device_name(0)
286
- gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
287
- log(f"GPU: {gpu_name} ({gpu_mem:.1f}GB)")
288
- else:
289
- log("WARNING: No GPU detected!")
290
-
291
- log("Loading tokenizer...")
292
- tokenizer = AutoTokenizer.from_pretrained(model_name)
293
- if tokenizer.pad_token is None:
294
- tokenizer.pad_token = tokenizer.eos_token
295
-
296
- # Create dataset
297
- prompt = build_prompt(BASELINE_CYCLES, "")
298
- dataset = Dataset.from_dict({"prompt": [prompt] * 64})
299
-
300
- # LoRA config
301
- peft_config = LoraConfig(
302
- r=lora_rank,
303
- lora_alpha=lora_rank * 2,
304
- lora_dropout=0.05,
305
- bias="none",
306
- task_type="CAUSAL_LM",
307
- target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
308
- "gate_proj", "up_proj", "down_proj"],
309
- )
310
-
311
- # Training config
312
- output_dir = f"./output/{datetime.now().strftime('%Y%m%d-%H%M%S')}"
313
- os.makedirs(output_dir, exist_ok=True)
314
-
315
- training_args = GRPOConfig(
316
- output_dir=output_dir,
317
- num_train_epochs=num_steps,
318
- per_device_train_batch_size=batch_size,
319
- gradient_accumulation_steps=4,
320
- learning_rate=lr,
321
- logging_steps=1,
322
- save_steps=10,
323
- max_completion_length=2048,
324
- max_prompt_length=2048,
325
- temperature=0.7,
326
- num_generations=4,
327
- beta=0.1,
328
- bf16=True,
329
- report_to="none",
330
- )
331
-
332
- # Quantization for 7B model on A10G
333
- quant_config = None
334
- if "7B" in model_name or "7b" in model_name:
335
- log("Using 4-bit quantization for 7B model")
336
- quant_config = BitsAndBytesConfig(
337
- load_in_4bit=True,
338
- bnb_4bit_compute_dtype=torch.bfloat16,
339
- bnb_4bit_use_double_quant=True,
340
- bnb_4bit_quant_type="nf4",
341
- )
342
-
343
- log("Loading model (this may take a few minutes)...")
344
-
345
- model_kwargs = {}
346
- if quant_config:
347
- model_kwargs["quantization_config"] = quant_config
348
-
349
- # Create stop callback
350
- class StopCallback(TrainerCallback):
351
- def on_step_end(self, args, state, control, **kwargs):
352
- if not training_state["running"]:
353
- log("Stop signal received, halting training...")
354
- control.should_training_stop = True
355
- return control
356
-
357
- trainer = GRPOTrainer(
358
- model=model_name,
359
- reward_funcs=[reward_function],
360
- args=training_args,
361
- train_dataset=dataset,
362
- peft_config=peft_config,
363
- processing_class=tokenizer,
364
- model_init_kwargs=model_kwargs,
365
- callbacks=[StopCallback()],
366
- )
367
-
368
- log("Model loaded! Starting training...")
369
-
370
- # Train
371
- trainer.train()
372
-
373
- log("Training complete!")
374
-
375
- # Save
376
- trainer.save_model(os.path.join(output_dir, "final"))
377
- log(f"Model saved to {output_dir}/final")
378
-
379
- # Save best code
380
- if training_state["best_code"]:
381
- with open(os.path.join(output_dir, "best_code.py"), "w") as f:
382
- f.write(training_state["best_code"])
383
- log("Best code saved!")
384
-
385
- except Exception as e:
386
- import traceback
387
- log(f"ERROR: {str(e)}")
388
- log(traceback.format_exc())
389
-
390
- finally:
391
- with training_state_lock:
392
- training_state["running"] = False
393
- elapsed = time.time() - training_state["start_time"]
394
- best = training_state["best_cycles"]
395
- log(f"Total time: {elapsed/60:.1f} minutes")
396
- log(f"Best result: {best:,} cycles")
397
-
398
-
399
- def start_training(model_name, num_steps, batch_size, lr, lora_rank):
400
- """Start training in background."""
401
- if training_state["running"]:
402
- return "Training already running!"
403
-
404
- thread = threading.Thread(
405
- target=run_training,
406
- args=(model_name, int(num_steps), int(batch_size), float(lr), int(lora_rank)),
407
- daemon=False # Non-daemon to ensure training completes
408
- )
409
- thread.start()
410
- return "Training started! Monitor progress below."
411
-
412
-
413
- def stop_training():
414
- """Signal training to stop."""
415
- with training_state_lock:
416
- training_state["running"] = False
417
- return "Stop signal sent. Training will stop after current step."
418
-
419
-
420
- def get_status():
421
- """Get current status as markdown."""
422
- if not training_state["start_time"]:
423
- return "### Status: Not started\n\nConfigure settings and click Start Training."
424
-
425
- with training_state_lock:
426
- elapsed = time.time() - training_state["start_time"]
427
- elapsed_str = f"{elapsed/60:.1f} min"
428
- best_cycles = max(training_state["best_cycles"], 1) # Prevent division by zero
429
- is_running = training_state["running"]
430
- log_lines = training_state["log"][-15:]
431
-
432
- speedup = BASELINE_CYCLES / best_cycles
433
- progress_pct = (1 - best_cycles / BASELINE_CYCLES) * 100
434
-
435
- status = f"""### Status: {'Running' if is_running else 'Stopped'}
436
-
437
- | Metric | Value |
438
- |--------|-------|
439
- | Elapsed | {elapsed_str} |
440
- | Best Cycles | **{best_cycles:,}** |
441
- | Speedup | **{speedup:.2f}x** |
442
- | Progress to Target | {progress_pct:.1f}% |
443
- | Target | {TARGET_CYCLES:,} cycles |
444
-
445
- ---
446
-
447
- ### Recent Log
448
- ```
449
- {chr(10).join(log_lines)}
450
- ```
451
- """
452
- return status
453
-
454
-
455
- def get_best_code():
456
- """Get best code found."""
457
- with training_state_lock:
458
- best_code = training_state["best_code"]
459
- if best_code:
460
- return best_code
461
- return "# No valid code found yet.\n# Start training to generate optimized kernels."
462
-
463
-
464
- def get_results_chart():
465
- """Get results as simple text chart."""
466
- with training_state_lock:
467
- results = list(training_state["results"][-20:])
468
-
469
- if not results:
470
- return "No results yet."
471
-
472
- lines = ["Cycles over time:", ""]
473
- for r in results:
474
- bar_len = max(1, int(50 * r["cycles"] / BASELINE_CYCLES))
475
- bar = "#" * bar_len
476
- lines.append(f"{r['cycles']:>7,} | {bar}")
477
-
478
- return "\n".join(lines)
479
-
480
 
481
- # Build Gradio UI
482
- with gr.Blocks(title="VLIW Kernel Optimizer", theme=gr.themes.Soft()) as demo:
483
- gr.Markdown("""
484
- # VLIW Kernel Optimization via Reinforcement Learning
485
 
486
- Train a language model to generate optimized VLIW/SIMD kernels.
 
487
 
488
- | Baseline | Target | Goal |
489
- |----------|--------|------|
490
- | 147,734 cycles | 1,363 cycles | 108x speedup |
491
- """)
492
 
493
- with gr.Row():
494
- with gr.Column(scale=1):
495
- gr.Markdown("### Configuration")
496
-
497
- model_dropdown = gr.Dropdown(
498
- choices=[
499
- "Qwen/Qwen2.5-Coder-7B-Instruct",
500
- "Qwen/Qwen2.5-Coder-3B-Instruct",
501
- "Qwen/Qwen2.5-Coder-1.5B-Instruct",
502
- "deepseek-ai/deepseek-coder-6.7b-instruct",
503
- "codellama/CodeLlama-7b-Instruct-hf",
504
- ],
505
- value="Qwen/Qwen2.5-Coder-7B-Instruct",
506
- label="Model"
507
- )
508
 
509
- steps_slider = gr.Slider(1, 100, value=50, step=1, label="Training Steps")
510
- batch_slider = gr.Slider(1, 8, value=4, step=1, label="Batch Size")
511
- lr_input = gr.Number(value=2e-4, label="Learning Rate")
512
- lora_slider = gr.Slider(8, 64, value=32, step=8, label="LoRA Rank")
513
 
514
- with gr.Row():
515
- start_btn = gr.Button("Start Training", variant="primary", size="lg")
516
- stop_btn = gr.Button("Stop", variant="stop")
517
-
518
- with gr.Column(scale=2):
519
- status_md = gr.Markdown("### Status: Not started")
520
- refresh_btn = gr.Button("Refresh", size="sm")
521
 
522
  with gr.Row():
523
  with gr.Column():
524
- gr.Markdown("### Best Code Found")
525
- code_output = gr.Code(language="python", lines=25)
526
- code_btn = gr.Button("Show Best Code")
 
 
 
 
527
 
528
  with gr.Column():
529
- gr.Markdown("### Results")
530
- results_output = gr.Textbox(lines=15, label="Cycles Progress")
531
- results_btn = gr.Button("Show Results")
532
-
533
- # Event handlers
534
- start_btn.click(
535
- start_training,
536
- inputs=[model_dropdown, steps_slider, batch_slider, lr_input, lora_slider],
537
- outputs=[status_md]
538
- )
539
- stop_btn.click(stop_training, outputs=[status_md])
540
- refresh_btn.click(get_status, outputs=[status_md])
541
- code_btn.click(get_best_code, outputs=[code_output])
542
- results_btn.click(get_results_chart, outputs=[results_output])
543
-
544
- # Auto-refresh
545
- demo.load(get_status, outputs=[status_md], every=5)
546
 
547
 
548
  if __name__ == "__main__":
 
1
  """
2
  HF Spaces app for VLIW kernel optimization via RL.
3
+ Minimal version for debugging.
 
 
4
  """
5
  import os
6
  import sys
 
 
 
 
 
 
7
  import gradio as gr
8
 
9
+ # Check imports on startup
10
+ startup_log = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
+ def check_import(name, import_fn):
 
 
 
 
 
13
  try:
14
+ result = import_fn()
15
+ startup_log.append(f"✓ {name}: {result}")
16
+ return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  except Exception as e:
18
+ startup_log.append(f"✗ {name}: {str(e)[:100]}")
19
+ return False
20
+
21
+ # Test imports
22
+ check_import("torch", lambda: __import__("torch").__version__)
23
+ check_import("transformers", lambda: __import__("transformers").__version__)
24
+ check_import("datasets", lambda: __import__("datasets").__version__)
25
+ check_import("peft", lambda: __import__("peft").__version__)
26
+ check_import("trl", lambda: __import__("trl").__version__)
27
+ check_import("accelerate", lambda: __import__("accelerate").__version__)
28
+
29
+ # Try GRPO import
30
+ try:
31
+ from trl import GRPOConfig, GRPOTrainer
32
+ startup_log.append("✓ GRPOTrainer: imported from trl")
33
+ except ImportError as e:
34
+ startup_log.append(f"✗ GRPOTrainer from trl: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  try:
36
+ from trl.trainer.grpo_trainer import GRPOConfig, GRPOTrainer
37
+ startup_log.append("✓ GRPOTrainer: imported from trl.trainer.grpo_trainer")
38
+ except ImportError as e2:
39
+ startup_log.append(f"✗ GRPOTrainer alt: {e2}")
40
+
41
+ # Check CUDA
42
+ try:
43
+ import torch
44
+ if torch.cuda.is_available():
45
+ startup_log.append(f"✓ CUDA: {torch.cuda.get_device_name(0)}")
46
+ else:
47
+ startup_log.append("✗ CUDA: Not available")
48
+ except:
49
+ startup_log.append("✗ CUDA: Could not check")
50
+
51
+ # Check simulator
52
+ try:
53
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
54
+ PERF_PATH = os.path.join(SCRIPT_DIR, "original_performance_takehome")
55
+ if os.path.exists(PERF_PATH):
56
+ sys.path.insert(0, PERF_PATH)
57
+ from problem import Machine, Tree
58
+ startup_log.append("✓ Simulator: loaded")
59
+ else:
60
+ startup_log.append(f"✗ Simulator: path not found ({PERF_PATH})")
61
+ except Exception as e:
62
+ startup_log.append(f"✗ Simulator: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
 
 
 
 
64
 
65
+ def get_startup_log():
66
+ return "\n".join(startup_log)
67
 
 
 
 
 
68
 
69
+ def dummy_train(model, steps):
70
+ return f"Would train {model} for {steps} steps\n\nImport status:\n" + get_startup_log()
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
 
 
 
 
72
 
73
+ # Simple UI
74
+ with gr.Blocks(title="VLIW Optimizer") as demo:
75
+ gr.Markdown("# VLIW Kernel Optimizer - Debug Mode")
76
+ gr.Markdown("Checking if all imports work...")
 
 
 
77
 
78
  with gr.Row():
79
  with gr.Column():
80
+ status = gr.Textbox(
81
+ label="Startup Log",
82
+ value=get_startup_log(),
83
+ lines=20
84
+ )
85
+ refresh_btn = gr.Button("Refresh Status")
86
+ refresh_btn.click(get_startup_log, outputs=[status])
87
 
88
  with gr.Column():
89
+ model = gr.Dropdown(
90
+ choices=["Qwen/Qwen2.5-Coder-1.5B-Instruct", "Qwen/Qwen2.5-Coder-3B-Instruct"],
91
+ value="Qwen/Qwen2.5-Coder-1.5B-Instruct",
92
+ label="Model"
93
+ )
94
+ steps = gr.Slider(1, 10, value=3, label="Steps")
95
+ train_btn = gr.Button("Test Train", variant="primary")
96
+ output = gr.Textbox(label="Output", lines=10)
97
+ train_btn.click(dummy_train, inputs=[model, steps], outputs=[output])
 
 
 
 
 
 
 
 
98
 
99
 
100
  if __name__ == "__main__":