Spaces:
Running
on
A10G
Running
on
A10G
Commit
·
f2d5eaa
1
Parent(s):
648e193
Persist LoRA adapter via HF dataset repo
Browse files- app.py +92 -0
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -10,6 +10,7 @@ import time
|
|
| 10 |
import random
|
| 11 |
import re
|
| 12 |
from copy import copy
|
|
|
|
| 13 |
|
| 14 |
# Check imports at startup
|
| 15 |
startup_log = []
|
|
@@ -65,12 +66,23 @@ except Exception as e:
|
|
| 65 |
startup_log.append(f"✗ VLIW Simulator: {e}")
|
| 66 |
SIMULATOR_AVAILABLE = False
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# Constants
|
| 69 |
BASELINE_CYCLES = 147734
|
| 70 |
TARGET_CYCLES = 1363
|
| 71 |
SCORE_SCALE = 3000.0
|
| 72 |
PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
|
| 73 |
ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# Training state
|
| 76 |
training_state = {
|
|
@@ -102,6 +114,81 @@ def extract_code_block(text: str) -> str:
|
|
| 102 |
return text.strip()
|
| 103 |
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
def _run_machine_with_cycle_limit(machine: Machine, max_cycles: int) -> bool:
|
| 106 |
for core in machine.cores:
|
| 107 |
if core.state == CoreState.PAUSED:
|
|
@@ -388,6 +475,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
|
|
| 388 |
add_log(f"Auto-continue: {auto_continue} (max_total_steps={max_total_steps}, max_minutes={max_minutes})")
|
| 389 |
add_log(f"Baseline: {BASELINE_CYCLES:,} cycles, Target: {TARGET_CYCLES:,} cycles")
|
| 390 |
add_log(f"Adapter dir: {ADAPTER_DIR}")
|
|
|
|
| 391 |
|
| 392 |
# Load tokenizer
|
| 393 |
add_log("Loading tokenizer...")
|
|
@@ -411,6 +499,9 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
|
|
| 411 |
)
|
| 412 |
add_log(f"✓ Base model loaded on {next(base_model.parameters()).device}")
|
| 413 |
|
|
|
|
|
|
|
|
|
|
| 414 |
# Resume LoRA adapter if present
|
| 415 |
if os.path.isdir(ADAPTER_DIR) and os.path.exists(os.path.join(ADAPTER_DIR, "adapter_config.json")):
|
| 416 |
add_log("Loading existing LoRA adapter (resume)...")
|
|
@@ -523,6 +614,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
|
|
| 523 |
os.makedirs(os.path.dirname(ADAPTER_DIR), exist_ok=True)
|
| 524 |
trainer.save_model(ADAPTER_DIR)
|
| 525 |
add_log(f"✓ Saved adapter to {ADAPTER_DIR}")
|
|
|
|
| 526 |
except Exception as e:
|
| 527 |
add_log(f"✗ Failed to save adapter: {str(e)[:120]}")
|
| 528 |
|
|
|
|
| 10 |
import random
|
| 11 |
import re
|
| 12 |
from copy import copy
|
| 13 |
+
from pathlib import Path
|
| 14 |
|
| 15 |
# Check imports at startup
|
| 16 |
startup_log = []
|
|
|
|
| 66 |
startup_log.append(f"✗ VLIW Simulator: {e}")
|
| 67 |
SIMULATOR_AVAILABLE = False
|
| 68 |
|
| 69 |
+
# Hugging Face Hub adapter persistence via dataset repo
|
| 70 |
+
try:
|
| 71 |
+
from huggingface_hub import HfApi, snapshot_download
|
| 72 |
+
startup_log.append("✓ huggingface_hub: OK")
|
| 73 |
+
HF_HUB_AVAILABLE = True
|
| 74 |
+
except Exception as e:
|
| 75 |
+
startup_log.append(f"✗ huggingface_hub: {str(e)[:80]}")
|
| 76 |
+
HF_HUB_AVAILABLE = False
|
| 77 |
+
|
| 78 |
# Constants
|
| 79 |
BASELINE_CYCLES = 147734
|
| 80 |
TARGET_CYCLES = 1363
|
| 81 |
SCORE_SCALE = 3000.0
|
| 82 |
PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
|
| 83 |
ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
|
| 84 |
+
ADAPTER_DATASET_REPO = os.environ.get("ADAPTER_DATASET_REPO", "CreativeEngineer/vliw-optimizer-adapters")
|
| 85 |
+
ADAPTER_DATASET_SUBDIR = os.environ.get("ADAPTER_DATASET_SUBDIR", "perf_takehome_latest")
|
| 86 |
|
| 87 |
# Training state
|
| 88 |
training_state = {
|
|
|
|
| 114 |
return text.strip()
|
| 115 |
|
| 116 |
|
| 117 |
+
def _hf_token() -> str | None:
|
| 118 |
+
return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def _ensure_dir(path: str) -> None:
    """Create directory *path*, including missing parents.

    Does nothing if the directory already exists.
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _adapter_exists(path: str) -> bool:
    """Report whether *path* contains an ``adapter_config.json`` entry."""
    config_file = os.path.join(path, "adapter_config.json")
    return os.path.exists(config_file)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _try_download_adapter(add_log) -> None:
    """Best-effort restore of the adapter from the Hub dataset repo.

    Downloads ``ADAPTER_DATASET_SUBDIR`` from ``ADAPTER_DATASET_REPO`` into
    the parent directory of ``ADAPTER_DIR``. If the downloaded folder is not
    ``ADAPTER_DIR`` itself, its contents are copied over ``ADAPTER_DIR``.

    Any failure during download or copy is logged and swallowed.

    Args:
        add_log: Callable taking a single message string; used for all
            status reporting.
    """
    # Function-scope import: keeps this helper self-contained.
    import shutil

    if not HF_HUB_AVAILABLE:
        add_log("✗ Hub sync disabled: huggingface_hub not available")
        return

    adapters_root = os.path.dirname(ADAPTER_DIR)
    _ensure_dir(adapters_root)
    try:
        # ``local_dir_use_symlinks`` is intentionally not passed: it is
        # deprecated and ignored by huggingface_hub >= 0.23, which always
        # writes real files into ``local_dir``.
        snapshot_download(
            repo_id=ADAPTER_DATASET_REPO,
            repo_type="dataset",
            allow_patterns=[f"{ADAPTER_DATASET_SUBDIR}/**"],
            local_dir=adapters_root,
            # An empty-string token is treated as "no token".
            token=_hf_token() or None,
        )
        downloaded = os.path.join(adapters_root, ADAPTER_DATASET_SUBDIR)
        if not _adapter_exists(downloaded):
            add_log("ℹ No adapter found in dataset yet")
            return

        # Compare resolved paths rather than raw strings: two spellings of
        # the same directory (e.g. a trailing slash in the subdir) must not
        # trigger a copy of the folder onto itself.
        if os.path.realpath(downloaded) != os.path.realpath(ADAPTER_DIR):
            # Overwrite files in ADAPTER_DIR; copytree copies file-to-file
            # instead of reading each file fully into memory.
            shutil.copytree(downloaded, ADAPTER_DIR, dirs_exist_ok=True)
        add_log(f"✓ Downloaded adapter from dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
    except Exception as e:
        # Deliberate best-effort boundary: training proceeds without a
        # restored adapter.
        add_log(f"ℹ Adapter download skipped: {str(e)[:160]}")
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _try_upload_adapter(add_log) -> None:
    """Best-effort upload of the local adapter to the Hub dataset repo.

    Uploads the contents of ``ADAPTER_DIR`` to ``ADAPTER_DATASET_SUBDIR``
    inside ``ADAPTER_DATASET_REPO``, creating the dataset repo first if it
    does not exist. The upload is skipped (with a log line) when the hub
    client is unavailable, no adapter has been saved yet, or no token is
    configured. Any failure during the upload is logged and swallowed.

    Args:
        add_log: Callable taking a single message string; used for all
            status reporting.
    """
    if not HF_HUB_AVAILABLE:
        add_log("✗ Hub sync disabled: huggingface_hub not available")
        return
    if not _adapter_exists(ADAPTER_DIR):
        add_log("ℹ No adapter to upload yet")
        return

    token = _hf_token()
    # Falsy check rather than ``is None``: an empty-string token is as
    # unusable as a missing one and must not reach the upload.
    if not token:
        add_log("ℹ No HF token set (HF_TOKEN/HUGGINGFACE_HUB_TOKEN); skipping upload")
        return

    try:
        api = HfApi(token=token)
        api.create_repo(repo_id=ADAPTER_DATASET_REPO, repo_type="dataset", exist_ok=True)
        api.upload_folder(
            repo_id=ADAPTER_DATASET_REPO,
            repo_type="dataset",
            folder_path=ADAPTER_DIR,
            path_in_repo=ADAPTER_DATASET_SUBDIR,
            commit_message="Update perf_takehome adapter",
        )
        add_log(f"✓ Uploaded adapter to dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
    except Exception as e:
        # Deliberate best-effort boundary: a failed upload must not abort
        # the caller, which has already saved the adapter locally.
        add_log(f"ℹ Adapter upload skipped: {str(e)[:160]}")
|
| 190 |
+
|
| 191 |
+
|
| 192 |
def _run_machine_with_cycle_limit(machine: Machine, max_cycles: int) -> bool:
|
| 193 |
for core in machine.cores:
|
| 194 |
if core.state == CoreState.PAUSED:
|
|
|
|
| 475 |
add_log(f"Auto-continue: {auto_continue} (max_total_steps={max_total_steps}, max_minutes={max_minutes})")
|
| 476 |
add_log(f"Baseline: {BASELINE_CYCLES:,} cycles, Target: {TARGET_CYCLES:,} cycles")
|
| 477 |
add_log(f"Adapter dir: {ADAPTER_DIR}")
|
| 478 |
+
add_log(f"Adapter dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
|
| 479 |
|
| 480 |
# Load tokenizer
|
| 481 |
add_log("Loading tokenizer...")
|
|
|
|
| 499 |
)
|
| 500 |
add_log(f"✓ Base model loaded on {next(base_model.parameters()).device}")
|
| 501 |
|
| 502 |
+
# Try to restore adapter from dataset before loading it
|
| 503 |
+
_try_download_adapter(add_log)
|
| 504 |
+
|
| 505 |
# Resume LoRA adapter if present
|
| 506 |
if os.path.isdir(ADAPTER_DIR) and os.path.exists(os.path.join(ADAPTER_DIR, "adapter_config.json")):
|
| 507 |
add_log("Loading existing LoRA adapter (resume)...")
|
|
|
|
| 614 |
os.makedirs(os.path.dirname(ADAPTER_DIR), exist_ok=True)
|
| 615 |
trainer.save_model(ADAPTER_DIR)
|
| 616 |
add_log(f"✓ Saved adapter to {ADAPTER_DIR}")
|
| 617 |
+
_try_upload_adapter(add_log)
|
| 618 |
except Exception as e:
|
| 619 |
add_log(f"✗ Failed to save adapter: {str(e)[:120]}")
|
| 620 |
|
requirements.txt
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
torch>=2.1.0
|
| 2 |
transformers>=4.45.0
|
|
|
|
| 3 |
datasets>=2.18.0
|
| 4 |
peft>=0.13.0
|
| 5 |
trl>=0.12.0
|
|
|
|
| 1 |
torch>=2.1.0
|
| 2 |
transformers>=4.45.0
|
| 3 |
+
huggingface_hub>=0.23.0
|
| 4 |
datasets>=2.18.0
|
| 5 |
peft>=0.13.0
|
| 6 |
trl>=0.12.0
|