Spaces:
Runtime error
Runtime error
ImageStudio Maintainer Claude Opus 4.8 (1M context) committed on
Commit ·
c4415be
1
Parent(s): a597aa8
fix: VLM assistant never stops (no generation_config) -> pin eos/pad tokens
Browse files
The finetune ships no generation_config.json and its config has no
eos_token_id, so generate() had no stop token and ran to max_new_tokens
(endless output) even after the repetition fix. Pin eos_token_id to the
chat-template terminator <|im_end|> (+ <|endoftext|> fallback) and set
pad_token_id explicitly, derived from the tokenizer at load time.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
app.py
CHANGED
|
@@ -159,7 +159,23 @@ vlm_model = AutoModelForImageTextToText.from_pretrained(
|
|
| 159 |
)
|
| 160 |
vlm_model.to("cuda").eval()
|
| 161 |
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
|
| 165 |
# =============================================================================
|
|
@@ -679,6 +695,10 @@ def _vlm_chat_core(message, image, reasoning, max_new_tokens):
|
|
| 679 |
# keeping decoding deterministic (important for prompt rewrites).
|
| 680 |
repetition_penalty=1.3,
|
| 681 |
no_repeat_ngram_size=3,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 682 |
streamer=streamer,
|
| 683 |
)
|
| 684 |
except Exception as exc: # noqa: BLE001 - surfaced to the main thread
|
|
|
|
| 159 |
)
|
| 160 |
vlm_model.to("cuda").eval()
|
| 161 |
|
| 162 |
+
# This finetune ships NO generation_config.json and its config carries no
|
| 163 |
+
# eos_token_id, so generate() has no stop token and runs to max_new_tokens
|
| 164 |
+
# (endless output). Pin the stop tokens explicitly from the tokenizer: the chat
|
| 165 |
+
# template ends each assistant turn with <|im_end|>, so that's the real
|
| 166 |
+
# terminator (plus <|endoftext|> as a fallback).
|
| 167 |
+
_vlm_tokenizer = getattr(vlm_processor, "tokenizer", vlm_processor)
|
| 168 |
+
_VLM_EOS_IDS = sorted({
|
| 169 |
+
tid for tok in ("<|im_end|>", "<|endoftext|>")
|
| 170 |
+
for tid in (_vlm_tokenizer.convert_tokens_to_ids(tok),)
|
| 171 |
+
if isinstance(tid, int) and tid >= 0
|
| 172 |
+
} | ({_vlm_tokenizer.eos_token_id} if _vlm_tokenizer.eos_token_id is not None else set()))
|
| 173 |
+
_VLM_PAD_ID = (
|
| 174 |
+
_vlm_tokenizer.pad_token_id
|
| 175 |
+
if _vlm_tokenizer.pad_token_id is not None
|
| 176 |
+
else (_VLM_EOS_IDS[0] if _VLM_EOS_IDS else None)
|
| 177 |
+
)
|
| 178 |
+
print(f"Assistant loaded! (eos_ids={_VLM_EOS_IDS}, pad_id={_VLM_PAD_ID})")
|
| 179 |
|
| 180 |
|
| 181 |
# =============================================================================
|
|
|
|
| 695 |
# keeping decoding deterministic (important for prompt rewrites).
|
| 696 |
repetition_penalty=1.3,
|
| 697 |
no_repeat_ngram_size=3,
|
| 698 |
+
# Explicit stop tokens — the model has no generation_config, so
|
| 699 |
+
# without these generate() never stops and rambles to the budget.
|
| 700 |
+
eos_token_id=_VLM_EOS_IDS,
|
| 701 |
+
pad_token_id=_VLM_PAD_ID,
|
| 702 |
streamer=streamer,
|
| 703 |
)
|
| 704 |
except Exception as exc: # noqa: BLE001 - surfaced to the main thread
|