Spaces:
Runtime error
Runtime error
Switch assistant back to gemma-4-E4B (bf16)
Browse files
VLM_MODEL_ID default -> prithivMLmods/gemma-4-E4B-it-Uncensored-MAX (fast,
~2s warm). Qwen 9B/4B + gemma 12B kept commented for revert. App reload.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -152,16 +152,15 @@ print("Pipelines loaded!")
|
|
| 152 |
# -----------------------------------------------------------------------------
|
| 153 |
# Other models, kept for easy revert (set VLM_MODEL_ID, and VLM_LOAD_8BIT for big ones):
|
| 154 |
# "rodrigomt/Qwen3.5-4B-Uncensored-Aggressive" # no generation_config; needed eos pinning
|
| 155 |
-
# "
|
| 156 |
# "OpenYourMind/gemma-4-12B-it-abliterated-uncensored" # gemma4_unified, ~24GB; needs VLM_LOAD_8BIT=1 (slow)
|
| 157 |
-
# Current:
|
| 158 |
-
#
|
| 159 |
-
#
|
| 160 |
-
# (
|
| 161 |
-
#
|
| 162 |
-
#
|
| 163 |
-
|
| 164 |
-
VLM_MODEL_ID = os.environ.get("VLM_MODEL_ID", "ccharnkij/Qwen3.5-9B-Uncensored")
|
| 165 |
VLM_LOAD_8BIT = os.environ.get("VLM_LOAD_8BIT", "0").lower() not in ("0", "false", "no", "")
|
| 166 |
|
| 167 |
print(f"Loading assistant: {VLM_MODEL_ID} (8bit={VLM_LOAD_8BIT}) ...")
|
|
|
|
| 152 |
# -----------------------------------------------------------------------------
|
| 153 |
# Other models, kept for easy revert (set VLM_MODEL_ID, and VLM_LOAD_8BIT for big ones):
|
| 154 |
# "rodrigomt/Qwen3.5-4B-Uncensored-Aggressive" # no generation_config; needed eos pinning
|
| 155 |
+
# "ccharnkij/Qwen3.5-9B-Uncensored" # 9B Qwen3.5 VL, ~18.8 GB bf16, thinking model
|
| 156 |
# "OpenYourMind/gemma-4-12B-it-abliterated-uncensored" # gemma4_unified, ~24GB; needs VLM_LOAD_8BIT=1 (slow)
|
| 157 |
+
# Current: gemma-4-E4B (model_type=gemma4 / Gemma4ForConditionalGeneration), multimodal,
|
| 158 |
+
# uncensored. Small/fast — loads full bf16 (~8 GB, fits the zero-a10g alongside the
|
| 159 |
+
# diffusion pipelines; warm calls ~2 s). Stop tokens are handled model-agnostically by
|
| 160 |
+
# _resolve_vlm_eos_ids() (this model ships a proper generation_config). NOTE: under-rates
|
| 161 |
+
# explicit content (~2 cap) — the known tradeoff for its speed. VLM_LOAD_8BIT=1 forces
|
| 162 |
+
# bitsandbytes 8-bit (only needed for the 12B); default is bf16.
|
| 163 |
+
VLM_MODEL_ID = os.environ.get("VLM_MODEL_ID", "prithivMLmods/gemma-4-E4B-it-Uncensored-MAX")
|
|
|
|
| 164 |
VLM_LOAD_8BIT = os.environ.get("VLM_LOAD_8BIT", "0").lower() not in ("0", "false", "no", "")
|
| 165 |
|
| 166 |
print(f"Loading assistant: {VLM_MODEL_ID} (8bit={VLM_LOAD_8BIT}) ...")
|