Spaces:
Running
Running
Fix: apply chat template in generate_text (fixes Qwen3 garbage output)
Browse files- ternary_quant/inference.py +14 -1
ternary_quant/inference.py
CHANGED
```diff
@@ -1090,7 +1090,20 @@ def generate_text(
 ) -> str:
     """Generate text using a quantized or full-precision model."""
     device = next(model.parameters()).device
-
+
+    # Apply chat template if the tokenizer has one (e.g. Qwen3, Phi, Mistral).
+    # Without this, instruction-tuned models produce garbage output.
+    if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None:
+        messages = [{"role": "user", "content": prompt}]
+        formatted = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False,  # Qwen3: disable chain-of-thought
+        )
+        inputs = tokenizer(formatted, return_tensors="pt").to(device)
+    else:
+        inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
     if tokenizer.pad_token_id is None:
         tokenizer.pad_token_id = tokenizer.eos_token_id
```