Spaces:
Running on Zero
Running on Zero
Upload 132 files
Browse files- app.py +53 -7
- obliteratus/models/loader.py +16 -0
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -135,6 +135,11 @@ def _load_model_to_device(
|
|
| 135 |
|
| 136 |
model = AutoModelForCausalLM.from_pretrained(pretrained_path, **kwargs)
|
| 137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
# On MPS / CPU: model loaded without device_map, move to best device
|
| 139 |
if not dev.supports_device_map_auto():
|
| 140 |
target = dev.get_device()
|
|
@@ -243,9 +248,12 @@ def _recover_sessions_from_disk() -> None:
|
|
| 243 |
_obliterate_counter = idx + 1
|
| 244 |
except (ValueError, IndexError):
|
| 245 |
pass
|
| 246 |
-
# If we recovered sessions
|
| 247 |
-
# most recent checkpoint so chat_respond can reload from disk.
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
| 249 |
with _lock:
|
| 250 |
latest = _last_obliterated_label
|
| 251 |
if latest and latest in _session_models:
|
|
@@ -854,6 +862,13 @@ def _cleanup_disk():
|
|
| 854 |
# Clear session model cache (checkpoints are gone)
|
| 855 |
_session_models.clear()
|
| 856 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 857 |
# Also clear GPU
|
| 858 |
_clear_gpu()
|
| 859 |
|
|
@@ -1968,6 +1983,7 @@ def obliterate(model_choice: str, method_choice: str,
|
|
| 1968 |
on_stage=on_stage,
|
| 1969 |
on_log=on_log,
|
| 1970 |
)
|
|
|
|
| 1971 |
pipeline_ref[0] = pipeline
|
| 1972 |
pipeline.run_informed()
|
| 1973 |
else:
|
|
@@ -2023,6 +2039,7 @@ def obliterate(model_choice: str, method_choice: str,
|
|
| 2023 |
cot_aware=adv_cot_aware,
|
| 2024 |
n_sae_features=int(adv_n_sae_features),
|
| 2025 |
)
|
|
|
|
| 2026 |
pipeline_ref[0] = pipeline
|
| 2027 |
pipeline.run()
|
| 2028 |
except Exception as e:
|
|
@@ -2047,8 +2064,8 @@ def obliterate(model_choice: str, method_choice: str,
|
|
| 2047 |
worker = threading.Thread(target=run_pipeline, daemon=True)
|
| 2048 |
worker.start()
|
| 2049 |
|
| 2050 |
-
# Stream log updates while pipeline runs (max
|
| 2051 |
-
_max_pipeline_secs =
|
| 2052 |
_pipeline_start = time.time()
|
| 2053 |
status_msg = "**Obliterating\u2026** (0s)"
|
| 2054 |
while worker.is_alive():
|
|
@@ -2059,7 +2076,7 @@ def obliterate(model_choice: str, method_choice: str,
|
|
| 2059 |
else:
|
| 2060 |
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
|
| 2061 |
if time.time() - _pipeline_start > _max_pipeline_secs:
|
| 2062 |
-
log_lines.append("\nTIMEOUT: Pipeline exceeded
|
| 2063 |
break
|
| 2064 |
time.sleep(0.5)
|
| 2065 |
|
|
@@ -2392,6 +2409,17 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
|
|
| 2392 |
if not checkpoint or not Path(checkpoint).exists():
|
| 2393 |
_recover_sessions_from_disk()
|
| 2394 |
checkpoint = _state.get("output_dir")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2395 |
if checkpoint and Path(checkpoint).exists():
|
| 2396 |
try:
|
| 2397 |
is_preset = (_state.get("model_name") or "") in MODELS
|
|
@@ -2555,12 +2583,30 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
|
| 2555 |
global _skip_session_load
|
| 2556 |
if _skip_session_load > 0:
|
| 2557 |
_skip_session_load -= 1
|
| 2558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2559 |
yield (
|
| 2560 |
f"**Ready!** `{choice}` is loaded β just type in the chat below.",
|
| 2561 |
get_chat_header(),
|
| 2562 |
)
|
| 2563 |
return
|
|
|
|
| 2564 |
|
| 2565 |
if not choice or choice not in _bench_configs:
|
| 2566 |
# On ZeroGPU, global state may be lost between process restarts.
|
|
|
|
| 135 |
|
| 136 |
model = AutoModelForCausalLM.from_pretrained(pretrained_path, **kwargs)
|
| 137 |
|
| 138 |
+
# Compat: some custom model code (ChatGLM/GLM-4) accesses config.max_length
|
| 139 |
+
# which was removed from PretrainedConfig in newer transformers.
|
| 140 |
+
if not hasattr(model.config, "max_length"):
|
| 141 |
+
model.config.max_length = 20
|
| 142 |
+
|
| 143 |
# On MPS / CPU: model loaded without device_map, move to best device
|
| 144 |
if not dev.supports_device_map_auto():
|
| 145 |
target = dev.get_device()
|
|
|
|
| 248 |
_obliterate_counter = idx + 1
|
| 249 |
except (ValueError, IndexError):
|
| 250 |
pass
|
| 251 |
+
# If we recovered sessions and _state has no valid output_dir, set it to
|
| 252 |
+
# the most recent checkpoint so chat_respond can reload from disk.
|
| 253 |
+
# Also overwrite a stale output_dir that points to a non-existent path.
|
| 254 |
+
_cur_dir = _state.get("output_dir")
|
| 255 |
+
_needs_update = not _cur_dir or not Path(_cur_dir).exists()
|
| 256 |
+
if found_any and _needs_update:
|
| 257 |
with _lock:
|
| 258 |
latest = _last_obliterated_label
|
| 259 |
if latest and latest in _session_models:
|
|
|
|
| 862 |
# Clear session model cache (checkpoints are gone)
|
| 863 |
_session_models.clear()
|
| 864 |
|
| 865 |
+
# Clear stale output_dir reference (checkpoints were just deleted)
|
| 866 |
+
with _lock:
|
| 867 |
+
_state["output_dir"] = None
|
| 868 |
+
_state["model_name"] = None
|
| 869 |
+
_state["method"] = None
|
| 870 |
+
_state["status"] = "idle"
|
| 871 |
+
|
| 872 |
# Also clear GPU
|
| 873 |
_clear_gpu()
|
| 874 |
|
|
|
|
| 1983 |
on_stage=on_stage,
|
| 1984 |
on_log=on_log,
|
| 1985 |
)
|
| 1986 |
+
pipeline._bayesian_trials = int(adv_bayesian_trials)
|
| 1987 |
pipeline_ref[0] = pipeline
|
| 1988 |
pipeline.run_informed()
|
| 1989 |
else:
|
|
|
|
| 2039 |
cot_aware=adv_cot_aware,
|
| 2040 |
n_sae_features=int(adv_n_sae_features),
|
| 2041 |
)
|
| 2042 |
+
pipeline._bayesian_trials = int(adv_bayesian_trials)
|
| 2043 |
pipeline_ref[0] = pipeline
|
| 2044 |
pipeline.run()
|
| 2045 |
except Exception as e:
|
|
|
|
| 2064 |
worker = threading.Thread(target=run_pipeline, daemon=True)
|
| 2065 |
worker.start()
|
| 2066 |
|
| 2067 |
+
# Stream log updates while pipeline runs (max 400 hours for large-model Optuna optimization)
|
| 2068 |
+
_max_pipeline_secs = 400 * 60 * 60
|
| 2069 |
_pipeline_start = time.time()
|
| 2070 |
status_msg = "**Obliterating\u2026** (0s)"
|
| 2071 |
while worker.is_alive():
|
|
|
|
| 2076 |
else:
|
| 2077 |
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
|
| 2078 |
if time.time() - _pipeline_start > _max_pipeline_secs:
|
| 2079 |
+
log_lines.append("\nTIMEOUT: Pipeline exceeded 400-hour limit.")
|
| 2080 |
break
|
| 2081 |
time.sleep(0.5)
|
| 2082 |
|
|
|
|
| 2409 |
if not checkpoint or not Path(checkpoint).exists():
|
| 2410 |
_recover_sessions_from_disk()
|
| 2411 |
checkpoint = _state.get("output_dir")
|
| 2412 |
+
# If output_dir is still stale, scan session models for any valid checkpoint
|
| 2413 |
+
if not checkpoint or not Path(checkpoint).exists():
|
| 2414 |
+
for _sm in _session_models.values():
|
| 2415 |
+
_sm_dir = _sm.get("output_dir")
|
| 2416 |
+
if _sm_dir and Path(_sm_dir).exists():
|
| 2417 |
+
checkpoint = _sm_dir
|
| 2418 |
+
with _lock:
|
| 2419 |
+
_state["output_dir"] = _sm_dir
|
| 2420 |
+
_state["model_name"] = _sm.get("model_choice")
|
| 2421 |
+
_state["method"] = _sm.get("method")
|
| 2422 |
+
break
|
| 2423 |
if checkpoint and Path(checkpoint).exists():
|
| 2424 |
try:
|
| 2425 |
is_preset = (_state.get("model_name") or "") in MODELS
|
|
|
|
| 2583 |
global _skip_session_load
|
| 2584 |
if _skip_session_load > 0:
|
| 2585 |
_skip_session_load -= 1
|
| 2586 |
+
# Verify the model is actually usable — not just that status says "ready".
|
| 2587 |
+
# ZeroGPU can evict the model while status stays "ready", and the counter
|
| 2588 |
+
# can get out of sync if only one dropdown .change fires instead of both.
|
| 2589 |
+
with _lock:
|
| 2590 |
+
_model_ok = (
|
| 2591 |
+
_state.get("status") == "ready"
|
| 2592 |
+
and _state.get("model") is not None
|
| 2593 |
+
and _state.get("tokenizer") is not None
|
| 2594 |
+
)
|
| 2595 |
+
if choice and _model_ok:
|
| 2596 |
+
# Double-check model tensors aren't stale (meta device)
|
| 2597 |
+
try:
|
| 2598 |
+
_dev = next(_state["model"].parameters()).device
|
| 2599 |
+
if _dev.type == "meta":
|
| 2600 |
+
_model_ok = False
|
| 2601 |
+
except Exception:
|
| 2602 |
+
_model_ok = False
|
| 2603 |
+
if choice and _model_ok:
|
| 2604 |
yield (
|
| 2605 |
f"**Ready!** `{choice}` is loaded β just type in the chat below.",
|
| 2606 |
get_chat_header(),
|
| 2607 |
)
|
| 2608 |
return
|
| 2609 |
+
# Model is stale or evicted — fall through to normal loading path
|
| 2610 |
|
| 2611 |
if not choice or choice not in _bench_configs:
|
| 2612 |
# On ZeroGPU, global state may be lost between process restarts.
|
obliteratus/models/loader.py
CHANGED
|
@@ -465,6 +465,16 @@ def load_model(
|
|
| 465 |
f"If this model requires custom code, pass trust_remote_code=True explicitly."
|
| 466 |
) from e
|
| 467 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
# Memory estimation and warnings (skip for natively quantized models — estimate is wrong)
|
| 469 |
native_quant = getattr(config, "quantization_config", None)
|
| 470 |
est_gb = _estimate_model_memory_gb(config, torch_dtype) if native_quant is None else 0.0
|
|
@@ -629,6 +639,12 @@ def load_model(
|
|
| 629 |
|
| 630 |
model.eval()
|
| 631 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
# Free accelerator cache after loading
|
| 633 |
dev.empty_cache()
|
| 634 |
|
|
|
|
| 465 |
f"If this model requires custom code, pass trust_remote_code=True explicitly."
|
| 466 |
) from e
|
| 467 |
|
| 468 |
+
# ── Config compat: ensure generation-related attributes exist ──────
|
| 469 |
+
# Older PretrainedConfig had max_length (default 20) and other generation
|
| 470 |
+
# defaults. Newer transformers moved them to GenerationConfig, but some
|
| 471 |
+
# custom model code (ChatGLM, GLM-4) still accesses config.max_length
|
| 472 |
+
# directly. Patch them back so trust_remote_code models don't crash.
|
| 473 |
+
_gen_defaults = {"max_length": 20, "max_new_tokens": None}
|
| 474 |
+
for _attr, _default in _gen_defaults.items():
|
| 475 |
+
if not hasattr(config, _attr):
|
| 476 |
+
setattr(config, _attr, _default)
|
| 477 |
+
|
| 478 |
# Memory estimation and warnings (skip for natively quantized models — estimate is wrong)
|
| 479 |
native_quant = getattr(config, "quantization_config", None)
|
| 480 |
est_gb = _estimate_model_memory_gb(config, torch_dtype) if native_quant is None else 0.0
|
|
|
|
| 639 |
|
| 640 |
model.eval()
|
| 641 |
|
| 642 |
+
# Patch model.config with the same generation defaults (model.config may be
|
| 643 |
+
# a separate instance from the config we pre-patched above).
|
| 644 |
+
for _attr, _default in _gen_defaults.items():
|
| 645 |
+
if not hasattr(model.config, _attr):
|
| 646 |
+
setattr(model.config, _attr, _default)
|
| 647 |
+
|
| 648 |
# Free accelerator cache after loading
|
| 649 |
dev.empty_cache()
|
| 650 |
|
requirements.txt
CHANGED
|
@@ -13,3 +13,4 @@ numpy>=1.24
|
|
| 13 |
scikit-learn>=1.3
|
| 14 |
tqdm>=4.64
|
| 15 |
bitsandbytes>=0.46.1
|
|
|
|
|
|
| 13 |
scikit-learn>=1.3
|
| 14 |
tqdm>=4.64
|
| 15 |
bitsandbytes>=0.46.1
|
| 16 |
+
optuna>=3.0
|