CreativeEngineer committed on
Commit
f2d5eaa
·
1 Parent(s): 648e193

Persist LoRA adapter via HF dataset repo

Browse files
Files changed (2) hide show
  1. app.py +92 -0
  2. requirements.txt +1 -0
app.py CHANGED
@@ -10,6 +10,7 @@ import time
10
  import random
11
  import re
12
  from copy import copy
 
13
 
14
  # Check imports at startup
15
  startup_log = []
@@ -65,12 +66,23 @@ except Exception as e:
65
  startup_log.append(f"✗ VLIW Simulator: {e}")
66
  SIMULATOR_AVAILABLE = False
67
 
 
 
 
 
 
 
 
 
 
68
  # Constants
69
  BASELINE_CYCLES = 147734
70
  TARGET_CYCLES = 1363
71
  SCORE_SCALE = 3000.0
72
  PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
73
  ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
 
 
74
 
75
  # Training state
76
  training_state = {
@@ -102,6 +114,81 @@ def extract_code_block(text: str) -> str:
102
  return text.strip()
103
 
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  def _run_machine_with_cycle_limit(machine: Machine, max_cycles: int) -> bool:
106
  for core in machine.cores:
107
  if core.state == CoreState.PAUSED:
@@ -388,6 +475,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
388
  add_log(f"Auto-continue: {auto_continue} (max_total_steps={max_total_steps}, max_minutes={max_minutes})")
389
  add_log(f"Baseline: {BASELINE_CYCLES:,} cycles, Target: {TARGET_CYCLES:,} cycles")
390
  add_log(f"Adapter dir: {ADAPTER_DIR}")
 
391
 
392
  # Load tokenizer
393
  add_log("Loading tokenizer...")
@@ -411,6 +499,9 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
411
  )
412
  add_log(f"✓ Base model loaded on {next(base_model.parameters()).device}")
413
 
 
 
 
414
  # Resume LoRA adapter if present
415
  if os.path.isdir(ADAPTER_DIR) and os.path.exists(os.path.join(ADAPTER_DIR, "adapter_config.json")):
416
  add_log("Loading existing LoRA adapter (resume)...")
@@ -523,6 +614,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
523
  os.makedirs(os.path.dirname(ADAPTER_DIR), exist_ok=True)
524
  trainer.save_model(ADAPTER_DIR)
525
  add_log(f"✓ Saved adapter to {ADAPTER_DIR}")
 
526
  except Exception as e:
527
  add_log(f"✗ Failed to save adapter: {str(e)[:120]}")
528
 
 
10
  import random
11
  import re
12
  from copy import copy
13
+ from pathlib import Path
14
 
15
  # Check imports at startup
16
  startup_log = []
 
66
  startup_log.append(f"✗ VLIW Simulator: {e}")
67
  SIMULATOR_AVAILABLE = False
68
 
69
# Hugging Face Hub adapter persistence via dataset repo
# Optional dependency: if huggingface_hub cannot be imported, adapter
# sync to/from the dataset repo is disabled but the app still starts.
try:
    from huggingface_hub import HfApi, snapshot_download
    startup_log.append("✓ huggingface_hub: OK")
    HF_HUB_AVAILABLE = True  # checked by the _try_download/_try_upload helpers
except Exception as e:
    # Broad catch on purpose: any import-time failure should only disable
    # the Hub-sync feature, never crash startup.
    startup_log.append(f"✗ huggingface_hub: {str(e)[:80]}")
    HF_HUB_AVAILABLE = False
77
+
78
# Constants
BASELINE_CYCLES = 147734  # unoptimized cycle count, logged as "Baseline" in run_training
TARGET_CYCLES = 1363      # goal cycle count, logged as "Target" in run_training
SCORE_SCALE = 3000.0      # scaling factor for the score — exact formula not visible here; confirm where it is used
# Prefer /data when it exists (presumably a persistent volume, e.g. on HF Spaces
# — TODO confirm); otherwise fall back to the current directory.
PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
# Dataset repo / subdirectory used to persist the LoRA adapter across restarts;
# both overridable via environment variables.
ADAPTER_DATASET_REPO = os.environ.get("ADAPTER_DATASET_REPO", "CreativeEngineer/vliw-optimizer-adapters")
ADAPTER_DATASET_SUBDIR = os.environ.get("ADAPTER_DATASET_SUBDIR", "perf_takehome_latest")
86
 
87
  # Training state
88
  training_state = {
 
114
  return text.strip()
115
 
116
 
117
+ def _hf_token() -> str | None:
118
+ return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
119
+
120
+
121
+ def _ensure_dir(path: str) -> None:
122
+ Path(path).mkdir(parents=True, exist_ok=True)
123
+
124
+
125
+ def _adapter_exists(path: str) -> bool:
126
+ return os.path.exists(os.path.join(path, "adapter_config.json"))
127
+
128
+
129
def _try_download_adapter(add_log) -> None:
    """Best-effort restore of the LoRA adapter from the HF dataset repo.

    Downloads ``ADAPTER_DATASET_SUBDIR`` out of ``ADAPTER_DATASET_REPO`` into
    the local adapter area; if a valid adapter landed somewhere other than
    ``ADAPTER_DIR``, it is copied into place. Every failure is logged and
    swallowed so training can still start from scratch.

    Args:
        add_log: callable taking a single status-message string.
    """
    import shutil  # local import: only needed for the copy-into-place step

    if not HF_HUB_AVAILABLE:
        add_log("✗ Hub sync disabled: huggingface_hub not available")
        return
    local_root = os.path.dirname(ADAPTER_DIR)
    _ensure_dir(local_root)
    allow = [f"{ADAPTER_DATASET_SUBDIR}/**"]
    try:
        snapshot_download(
            repo_id=ADAPTER_DATASET_REPO,
            repo_type="dataset",
            allow_patterns=allow,
            local_dir=local_root,
            local_dir_use_symlinks=False,  # real files, not cache symlinks
            token=_hf_token(),
        )
        downloaded = os.path.join(local_root, ADAPTER_DATASET_SUBDIR)
        if _adapter_exists(downloaded):
            if downloaded != ADAPTER_DIR:
                # copytree streams each file in chunks, instead of reading
                # whole (potentially very large) weight files into memory.
                shutil.copytree(downloaded, ADAPTER_DIR, dirs_exist_ok=True)
            add_log(f"✓ Downloaded adapter from dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
        else:
            add_log("ℹ No adapter found in dataset yet")
    except Exception as e:
        # Best-effort: a missing repo, bad token or no network just means we
        # train without a resumed adapter.
        add_log(f"ℹ Adapter download skipped: {str(e)[:160]}")
164
+
165
+
166
def _try_upload_adapter(add_log) -> None:
    """Best-effort push of the locally saved LoRA adapter to the HF dataset repo.

    Skips (with a log message, never an exception) when the hub library is
    unavailable, no adapter has been saved yet, or no token is configured.

    Args:
        add_log: callable taking a single status-message string.
    """
    # Guard clauses: each precondition failure logs and bails out quietly.
    if not HF_HUB_AVAILABLE:
        add_log("✗ Hub sync disabled: huggingface_hub not available")
        return
    if not _adapter_exists(ADAPTER_DIR):
        add_log("ℹ No adapter to upload yet")
        return
    auth_token = _hf_token()
    if auth_token is None:
        add_log("ℹ No HF token set (HF_TOKEN/HUGGINGFACE_HUB_TOKEN); skipping upload")
        return
    try:
        hub = HfApi(token=auth_token)
        # Creating the repo is idempotent thanks to exist_ok=True.
        hub.create_repo(repo_id=ADAPTER_DATASET_REPO, repo_type="dataset", exist_ok=True)
        hub.upload_folder(
            repo_id=ADAPTER_DATASET_REPO,
            repo_type="dataset",
            folder_path=ADAPTER_DIR,
            path_in_repo=ADAPTER_DATASET_SUBDIR,
            commit_message="Update perf_takehome adapter",
        )
        add_log(f"✓ Uploaded adapter to dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
    except Exception as e:
        add_log(f"ℹ Adapter upload skipped: {str(e)[:160]}")
190
+
191
+
192
  def _run_machine_with_cycle_limit(machine: Machine, max_cycles: int) -> bool:
193
  for core in machine.cores:
194
  if core.state == CoreState.PAUSED:
 
475
  add_log(f"Auto-continue: {auto_continue} (max_total_steps={max_total_steps}, max_minutes={max_minutes})")
476
  add_log(f"Baseline: {BASELINE_CYCLES:,} cycles, Target: {TARGET_CYCLES:,} cycles")
477
  add_log(f"Adapter dir: {ADAPTER_DIR}")
478
+ add_log(f"Adapter dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
479
 
480
  # Load tokenizer
481
  add_log("Loading tokenizer...")
 
499
  )
500
  add_log(f"✓ Base model loaded on {next(base_model.parameters()).device}")
501
 
502
+ # Try to restore adapter from dataset before loading it
503
+ _try_download_adapter(add_log)
504
+
505
  # Resume LoRA adapter if present
506
  if os.path.isdir(ADAPTER_DIR) and os.path.exists(os.path.join(ADAPTER_DIR, "adapter_config.json")):
507
  add_log("Loading existing LoRA adapter (resume)...")
 
614
  os.makedirs(os.path.dirname(ADAPTER_DIR), exist_ok=True)
615
  trainer.save_model(ADAPTER_DIR)
616
  add_log(f"✓ Saved adapter to {ADAPTER_DIR}")
617
+ _try_upload_adapter(add_log)
618
  except Exception as e:
619
  add_log(f"✗ Failed to save adapter: {str(e)[:120]}")
620
 
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  torch>=2.1.0
2
  transformers>=4.45.0
 
3
  datasets>=2.18.0
4
  peft>=0.13.0
5
  trl>=0.12.0
 
1
  torch>=2.1.0
2
  transformers>=4.45.0
3
+ huggingface_hub>=0.23.0
4
  datasets>=2.18.0
5
  peft>=0.13.0
6
  trl>=0.12.0