Spaces:

CreativeEngineer
/

vliw-optimizer

Sleeping

App Files Files Community

CreativeEngineer committed on Jan 26

Commit

f2d5eaa

1 Parent(s): 648e193

Persist LoRA adapter via HF dataset repo

Browse files

Files changed (2) hide show

app.py +92 -0
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ import time
 import random
 import re
 from copy import copy
 # Check imports at startup
 startup_log = []
@@ -65,12 +66,23 @@ except Exception as e:
     startup_log.append(f"✗ VLIW Simulator: {e}")
     SIMULATOR_AVAILABLE = False
 # Constants
 BASELINE_CYCLES = 147734
 TARGET_CYCLES = 1363
 SCORE_SCALE = 3000.0
 PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
 ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
 # Training state
 training_state = {
@@ -102,6 +114,81 @@ def extract_code_block(text: str) -> str:
     return text.strip()
 def _run_machine_with_cycle_limit(machine: Machine, max_cycles: int) -> bool:
     for core in machine.cores:
         if core.state == CoreState.PAUSED:
@@ -388,6 +475,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
         add_log(f"Auto-continue: {auto_continue} (max_total_steps={max_total_steps}, max_minutes={max_minutes})")
         add_log(f"Baseline: {BASELINE_CYCLES:,} cycles, Target: {TARGET_CYCLES:,} cycles")
         add_log(f"Adapter dir: {ADAPTER_DIR}")
         # Load tokenizer
         add_log("Loading tokenizer...")
@@ -411,6 +499,9 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
         )
         add_log(f"✓ Base model loaded on {next(base_model.parameters()).device}")
         # Resume LoRA adapter if present
         if os.path.isdir(ADAPTER_DIR) and os.path.exists(os.path.join(ADAPTER_DIR, "adapter_config.json")):
             add_log("Loading existing LoRA adapter (resume)...")
@@ -523,6 +614,7 @@ def run_training(model_name, chunk_steps, max_total_steps, max_minutes, auto_con
                 os.makedirs(os.path.dirname(ADAPTER_DIR), exist_ok=True)
                 trainer.save_model(ADAPTER_DIR)
                 add_log(f"✓ Saved adapter to {ADAPTER_DIR}")
             except Exception as e:
                 add_log(f"✗ Failed to save adapter: {str(e)[:120]}")

 import random
 import re
 from copy import copy
+from pathlib import Path
 # Check imports at startup
 startup_log = []
     startup_log.append(f"✗ VLIW Simulator: {e}")
     SIMULATOR_AVAILABLE = False
+# Hugging Face Hub adapter persistence via dataset repo
+try:
+    from huggingface_hub import HfApi, snapshot_download
+    startup_log.append("✓ huggingface_hub: OK")
+    HF_HUB_AVAILABLE = True
+except Exception as e:
+    startup_log.append(f"✗ huggingface_hub: {str(e)[:80]}")
+    HF_HUB_AVAILABLE = False
 # Constants
 BASELINE_CYCLES = 147734
 TARGET_CYCLES = 1363
 SCORE_SCALE = 3000.0
 PERSIST_DIR = "/data" if os.path.isdir("/data") else "."
 ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapters", "perf_takehome_latest")
+ADAPTER_DATASET_REPO = os.environ.get("ADAPTER_DATASET_REPO", "CreativeEngineer/vliw-optimizer-adapters")  # Hub dataset repo id used for adapter sync; overridable via env
+ADAPTER_DATASET_SUBDIR = os.environ.get("ADAPTER_DATASET_SUBDIR", "perf_takehome_latest")  # folder inside that repo holding the adapter files; overridable via env
 # Training state
 training_state = {
     return text.strip()
+def _hf_token() -> str | None:
+    """Return the Hugging Face access token from the environment, if any.
+
+    ``HF_TOKEN`` takes precedence over ``HUGGINGFACE_HUB_TOKEN``. A variable that
+    is unset, empty, or whitespace-only counts as missing, so the result is either
+    a non-empty string or ``None`` -- never ``""``. Callers test the result with
+    ``is None``, which an empty string would slip past.
+    """
+    for var_name in ("HF_TOKEN", "HUGGINGFACE_HUB_TOKEN"):
+        # Strip so a secret pasted with a trailing newline/space still works.
+        value = os.environ.get(var_name, "").strip()
+        if value:
+            return value
+    return None
+def _ensure_dir(path: str) -> None:
+    """Create directory ``path`` together with any missing parents.
+
+    An already-existing directory is not an error.
+    """
+    directory = Path(path)
+    directory.mkdir(parents=True, exist_ok=True)
+def _adapter_exists(path: str) -> bool:
+    """Report whether ``path`` contains an ``adapter_config.json`` entry."""
+    config_path = os.path.join(path, "adapter_config.json")
+    return os.path.exists(config_path)
+def _try_download_adapter(add_log) -> None:
+    """Best-effort restore of the adapter from the Hub dataset repo into ``ADAPTER_DIR``.
+
+    Downloads only ``ADAPTER_DATASET_SUBDIR/**`` from ``ADAPTER_DATASET_REPO``
+    (repo type ``dataset``) into the parent directory of ``ADAPTER_DIR``. If the
+    downloaded folder contains ``adapter_config.json`` and is a different
+    directory than ``ADAPTER_DIR``, its files are copied over ``ADAPTER_DIR``.
+
+    Args:
+        add_log: Callable taking one message string; receives every status line.
+
+    Never raises: any failure (network, auth, filesystem) is logged and swallowed
+    so the caller can continue without a restored adapter.
+    """
+    if not HF_HUB_AVAILABLE:
+        add_log("✗ Hub sync disabled: huggingface_hub not available")
+        return
+
+    import shutil  # local import: this is the only place that needs it
+
+    adapters_root = os.path.dirname(ADAPTER_DIR)
+    try:
+        _ensure_dir(adapters_root)
+        # `local_dir_use_symlinks` is intentionally not passed: it is deprecated
+        # in the huggingface_hub versions this project requires (>=0.23.0), where
+        # `local_dir` downloads are always real files.
+        snapshot_download(
+            repo_id=ADAPTER_DATASET_REPO,
+            repo_type="dataset",
+            allow_patterns=[f"{ADAPTER_DATASET_SUBDIR}/**"],
+            local_dir=adapters_root,
+            token=_hf_token(),
+        )
+        downloaded = os.path.join(adapters_root, ADAPTER_DATASET_SUBDIR)
+        if not _adapter_exists(downloaded):
+            add_log("ℹ No adapter found in dataset yet")
+            return
+        # Compare resolved paths, not raw strings: if both names point at the same
+        # directory (e.g. the subdir has a trailing slash), copying a file onto
+        # itself would truncate it.
+        if os.path.realpath(downloaded) != os.path.realpath(ADAPTER_DIR):
+            # Overwrite/merge into ADAPTER_DIR; files are streamed, not loaded
+            # whole into memory.
+            shutil.copytree(downloaded, ADAPTER_DIR, dirs_exist_ok=True)
+        add_log(f"✓ Downloaded adapter from dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
+    except Exception as e:
+        add_log(f"ℹ Adapter download skipped: {str(e)[:160]}")
+def _try_upload_adapter(add_log) -> None:
+    """Best-effort upload of ``ADAPTER_DIR`` to the Hub dataset repo.
+
+    Creates ``ADAPTER_DATASET_REPO`` (repo type ``dataset``) if it does not exist
+    yet, then uploads the contents of ``ADAPTER_DIR`` under
+    ``ADAPTER_DATASET_SUBDIR`` in a single commit.
+
+    Args:
+        add_log: Callable taking one message string; receives every status line.
+
+    The upload is skipped (with a log line) when huggingface_hub is unavailable,
+    when ``ADAPTER_DIR`` has no ``adapter_config.json``, or when no token is
+    configured. Hub errors are logged and swallowed; nothing is raised from the
+    upload itself.
+    """
+    if not HF_HUB_AVAILABLE:
+        add_log("✗ Hub sync disabled: huggingface_hub not available")
+        return
+    if not _adapter_exists(ADAPTER_DIR):
+        add_log("ℹ No adapter to upload yet")
+        return
+    token = _hf_token()
+    # Falsy check rather than `is None`: an empty-string token (variable set but
+    # blank) must also skip the upload instead of being handed to HfApi.
+    if not token:
+        add_log("ℹ No HF token set (HF_TOKEN/HUGGINGFACE_HUB_TOKEN); skipping upload")
+        return
+    try:
+        api = HfApi(token=token)
+        api.create_repo(repo_id=ADAPTER_DATASET_REPO, repo_type="dataset", exist_ok=True)
+        api.upload_folder(
+            repo_id=ADAPTER_DATASET_REPO,
+            repo_type="dataset",
+            folder_path=ADAPTER_DIR,
+            path_in_repo=ADAPTER_DATASET_SUBDIR,
+            commit_message="Update perf_takehome adapter",
+        )
+        add_log(f"✓ Uploaded adapter to dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
+    except Exception as e:
+        add_log(f"ℹ Adapter upload skipped: {str(e)[:160]}")
 def _run_machine_with_cycle_limit(machine: Machine, max_cycles: int) -> bool:
     for core in machine.cores:
         if core.state == CoreState.PAUSED:
         add_log(f"Auto-continue: {auto_continue} (max_total_steps={max_total_steps}, max_minutes={max_minutes})")
         add_log(f"Baseline: {BASELINE_CYCLES:,} cycles, Target: {TARGET_CYCLES:,} cycles")
         add_log(f"Adapter dir: {ADAPTER_DIR}")
+        add_log(f"Adapter dataset: {ADAPTER_DATASET_REPO}/{ADAPTER_DATASET_SUBDIR}")
         # Load tokenizer
         add_log("Loading tokenizer...")
         )
         add_log(f"✓ Base model loaded on {next(base_model.parameters()).device}")
+        # Try to restore adapter from dataset before loading it
+        _try_download_adapter(add_log)
         # Resume LoRA adapter if present
         if os.path.isdir(ADAPTER_DIR) and os.path.exists(os.path.join(ADAPTER_DIR, "adapter_config.json")):
             add_log("Loading existing LoRA adapter (resume)...")
                 os.makedirs(os.path.dirname(ADAPTER_DIR), exist_ok=True)
                 trainer.save_model(ADAPTER_DIR)
                 add_log(f"✓ Saved adapter to {ADAPTER_DIR}")
+                _try_upload_adapter(add_log)
             except Exception as e:
                 add_log(f"✗ Failed to save adapter: {str(e)[:120]}")

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
 torch>=2.1.0
 transformers>=4.45.0
 datasets>=2.18.0
 peft>=0.13.0
 trl>=0.12.0

 torch>=2.1.0
 transformers>=4.45.0
+huggingface_hub>=0.23.0
 datasets>=2.18.0
 peft>=0.13.0
 trl>=0.12.0