Spaces:
Running
Running
Fix: apply chat template in generate_text (fixes Qwen3 garbage output)
Browse files- ternary_quant/inference.py +14 -1
ternary_quant/inference.py
CHANGED
```diff
@@ -1090,7 +1090,20 @@ def generate_text(
 ) -> str:
     """Generate text using a quantized or full-precision model."""
     device = next(model.parameters()).device
-
+
+    # Apply chat template if the tokenizer has one (e.g. Qwen3, Phi, Mistral).
+    # Without this, instruction-tuned models produce garbage output.
+    if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None:
+        messages = [{"role": "user", "content": prompt}]
+        formatted = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False,  # Qwen3: disable chain-of-thought
+        )
+        inputs = tokenizer(formatted, return_tensors="pt").to(device)
+    else:
+        inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
     if tokenizer.pad_token_id is None:
         tokenizer.pad_token_id = tokenizer.eos_token_id
```