AsadIsmail committed on
Commit
0876865
·
verified ·
1 Parent(s): 0f0baa5

Fix: apply chat template in generate_text (fixes Qwen3 garbage output)

Browse files
Files changed (1) hide show
  1. ternary_quant/inference.py +14 -1
ternary_quant/inference.py CHANGED
@@ -1090,7 +1090,20 @@ def generate_text(
1090
  ) -> str:
1091
  """Generate text using a quantized or full-precision model."""
1092
  device = next(model.parameters()).device
1093
- inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
1094
 
1095
  if tokenizer.pad_token_id is None:
1096
  tokenizer.pad_token_id = tokenizer.eos_token_id
 
1090
  ) -> str:
1091
  """Generate text using a quantized or full-precision model."""
1092
  device = next(model.parameters()).device
1093
+
1094
+ # Apply chat template if the tokenizer has one (e.g. Qwen3, Phi, Mistral).
1095
+ # Without this, instruction-tuned models produce garbage output.
1096
+ if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None:
1097
+ messages = [{"role": "user", "content": prompt}]
1098
+ formatted = tokenizer.apply_chat_template(
1099
+ messages,
1100
+ tokenize=False,
1101
+ add_generation_prompt=True,
1102
+ enable_thinking=False, # Qwen3: disable chain-of-thought
1103
+ )
1104
+ inputs = tokenizer(formatted, return_tensors="pt").to(device)
1105
+ else:
1106
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
1107
 
1108
  if tokenizer.pad_token_id is None:
1109
  tokenizer.pad_token_id = tokenizer.eos_token_id