CreativeEngineer committed on
Commit c5c47e3 · 1 Parent(s): 75d63f1

Revert to Gradio 5.49.1 and ASCII logs

Files changed (3)
  1. README.md +1 -2
  2. app.py +24 -24
  3. requirements.txt +3 -4
README.md CHANGED
@@ -4,8 +4,7 @@ emoji: "⚡"
  colorFrom: blue
  colorTo: purple
  sdk: gradio
- sdk_version: 4.44.1
- python_version: "3.10"
+ sdk_version: 5.49.1
  app_file: app.py
  pinned: false
  license: mit
app.py CHANGED
@@ -18,10 +18,10 @@ startup_log = []
  def check_import(name, import_fn):
      try:
          result = import_fn()
-         startup_log.append(f" {name}: {result}")
+         startup_log.append(f"[OK] {name}: {result}")
          return True
      except Exception as e:
-         startup_log.append(f" {name}: {str(e)[:80]}")
+         startup_log.append(f"[ERR] {name}: {str(e)[:80]}")
          return False

  check_import("torch", lambda: __import__("torch").__version__)
@@ -33,18 +33,18 @@ check_import("huggingface_hub", lambda: __import__("huggingface_hub").__version_

  try:
      from trl import GRPOConfig, GRPOTrainer
-     startup_log.append(" GRPOTrainer: OK")
+     startup_log.append("[OK] GRPOTrainer: OK")
  except Exception as e:
-     startup_log.append(f" GRPOTrainer: {e}")
+     startup_log.append(f"[ERR] GRPOTrainer: {e}")

  try:
      import torch
      if torch.cuda.is_available():
-         startup_log.append(f" CUDA: {torch.cuda.get_device_name(0)}")
+         startup_log.append(f"[OK] CUDA: {torch.cuda.get_device_name(0)}")
      else:
-         startup_log.append(" CUDA: Not available")
+         startup_log.append("[ERR] CUDA: Not available")
  except Exception as e:
-     startup_log.append(f" CUDA check: {e}")
+     startup_log.append(f"[ERR] CUDA check: {e}")

  # Prefer simulator + KernelBuilder from bundled original_performance_takehome.
  # In Spaces, this keeps evaluation consistent and enables correctness checks.
@@ -61,19 +61,19 @@ try:
          SLOT_LIMITS, VLEN, N_CORES, SCRATCH_SIZE, CoreState
      )
      from perf_takehome import KernelBuilder, HASH_STAGES
-     startup_log.append(" VLIW Simulator: OK")
+     startup_log.append("[OK] VLIW Simulator: OK")
      SIMULATOR_AVAILABLE = True
  except Exception as e:
-     startup_log.append(f" VLIW Simulator: {e}")
+     startup_log.append(f"[ERR] VLIW Simulator: {e}")
      SIMULATOR_AVAILABLE = False

  # Hugging Face Hub adapter persistence via dataset repo
  try:
      from huggingface_hub import HfApi, snapshot_download
-     startup_log.append(" huggingface_hub: OK")
+     startup_log.append("[OK] huggingface_hub: OK")
      HF_HUB_AVAILABLE = True
  except Exception as e:
-     startup_log.append(f" huggingface_hub: {str(e)[:80]}")
+     startup_log.append(f"[ERR] huggingface_hub: {str(e)[:80]}")
      HF_HUB_AVAILABLE = False

  # Constants
@@ -142,7 +142,7 @@ def _adapter_exists(path: str) -> bool:

  def _try_download_adapter(add_log) -> None:
      if not HF_HUB_AVAILABLE:
-         add_log(" Hub sync disabled: huggingface_hub not available")
+         add_log("[ERR] Hub sync disabled: huggingface_hub not available")
          return
      _ensure_dir(os.path.dirname(ADAPTER_DIR))
      allow = [f"{ADAPTER_DATASET_SUBDIR}/**"]
@@ -170,7 +170,7 @@ def _try_download_adapter(add_log) -> None:
                  dst = os.path.join(dst_root, name)
                  with open(src, "rb") as fsrc, open(dst, "wb") as fdst:
                      fdst.write(fsrc.read())
-             add_log(f" Downloaded adapter from dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
+             add_log(f"[OK] Downloaded adapter from dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
          else:
              add_log("ℹ No adapter found in dataset yet")
      except Exception as e:
@@ -179,7 +179,7 @@ def _try_download_adapter(add_log) -> None:

  def _try_upload_adapter(add_log) -> None:
      if not HF_HUB_AVAILABLE:
-         add_log(" Hub sync disabled: huggingface_hub not available")
+         add_log("[ERR] Hub sync disabled: huggingface_hub not available")
          return
      if not _adapter_exists(ADAPTER_DIR):
          add_log("ℹ No adapter to upload yet")
@@ -198,7 +198,7 @@ def _try_upload_adapter(add_log) -> None:
              path_in_repo=ADAPTER_DATASET_SUBDIR,
              commit_message="Update perf_takehome adapter",
          )
-         add_log(f" Uploaded adapter to dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
+         add_log(f"[OK] Uploaded adapter to dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
      except Exception as e:
          add_log(f"ℹ Adapter upload skipped: {str(e)[:160]}")

@@ -496,7 +496,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
          tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
          if tokenizer.pad_token is None:
              tokenizer.pad_token = tokenizer.eos_token
-         add_log(" Tokenizer ready")
+         add_log("[OK] Tokenizer ready")

          # Load model with 4-bit quantization
          add_log("Loading model (4-bit quantization)...")
@@ -511,7 +511,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
              device_map="auto",
              trust_remote_code=True,
          )
-         add_log(f" Base model loaded on {next(base_model.parameters()).device}")
+         add_log(f"[OK] Base model loaded on {next(base_model.parameters()).device}")

          # Try to restore adapter from dataset before loading it
          _try_download_adapter(add_log)
@@ -521,7 +521,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
          if os.path.isdir(ADAPTER_DIR) and os.path.exists(os.path.join(ADAPTER_DIR, "adapter_config.json")):
              add_log("Loading existing LoRA adapter (resume)...")
              model = PeftModel.from_pretrained(base_model, ADAPTER_DIR, is_trainable=True)
-             add_log(" Adapter loaded")
+             add_log("[OK] Adapter loaded")
              resume_adapter = True
          else:
              model = base_model
@@ -530,7 +530,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
          add_log("Creating VLIW optimization dataset...")
          prompts = [PERF_TAKEHOME_PROMPT] * 16
          dataset = Dataset.from_dict({"prompt": prompts})
-         add_log(f" Dataset ready: {len(prompts)} prompts")
+         add_log(f"[OK] Dataset ready: {len(prompts)} prompts")

          # LoRA config
          add_log("Setting up LoRA...")
@@ -571,7 +571,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
          output_dir = os.path.join(PERSIST_DIR, "grpo_perf_takehome_output")
          os.makedirs(output_dir, exist_ok=True)

-         add_log(" Trainer config ready")
+         add_log("[OK] Trainer config ready")
          add_log("Starting training loop...")
          add_log("(Stops early if target reached; can auto-continue in chunks)")

@@ -632,10 +632,10 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
              try:
                  os.makedirs(os.path.dirname(ADAPTER_DIR), exist_ok=True)
                  trainer.save_model(ADAPTER_DIR)
-                 add_log(f" Saved adapter to {ADAPTER_DIR}")
+                 add_log(f"[OK] Saved adapter to {ADAPTER_DIR}")
                  _try_upload_adapter(add_log)
              except Exception as e:
-                 add_log(f" Failed to save adapter: {str(e)[:120]}")
+                 add_log(f"[ERR] Failed to save adapter: {str(e)[:120]}")

              if not auto_continue:
                  break
@@ -663,11 +663,11 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
          else:
              add_log(f"Generated kernel invalid: {verify_out.get('msg', '')[:160]}")

-         add_log("\n All done!")
+         add_log("\n[OK] All done!")

      except Exception as e:
          import traceback
-         add_log(f" Error: {e}")
+         add_log(f"[ERR] Error: {e}")
          add_log(traceback.format_exc()[:800])
      finally:
          with state_lock:
requirements.txt CHANGED
@@ -1,10 +1,9 @@
  torch>=2.1.0
  transformers>=4.45.0
- huggingface_hub>=0.22.0,<0.23.0
- datasets==2.18.0
+ huggingface_hub>=0.30.0
+ datasets>=2.18.0
  peft>=0.13.0
  trl>=0.12.0
  accelerate>=0.34.0
  bitsandbytes>=0.44.0
- gradio>=4.0.0,<5.0.0
- audioop-lts>=0.2.2
+ gradio>=5.49.1,<6.0.0