Spaces: gr0010/CustomThinker-Demo

Running on Zero

App Files Files Community

gr0010 committed 25 days ago

Commit

d938e2c

verified ·

1 Parent(s): d89decd

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -21

app.py CHANGED Viewed

@@ -34,13 +34,13 @@ def generate_and_parse(messages: list, temperature: float = 0.6,
     and parses it into thinking and answer parts.
     Decorated with @spaces.GPU for Zero GPU allocation.
     """
-    # Build prompt manually to preserve <think> tags in context
-    prompt_text = ""
-    for msg in messages:
-        role = msg["role"]
-        content = msg["content"]
-        prompt_text += f"<|im_start|>{role}\n{content}<|im_end|>\n"
-    prompt_text += "<|im_start|>assistant\n"
     # --- CONSOLE DEBUG OUTPUT ---
     print("\n" + "="*50)
@@ -279,15 +279,6 @@ Think using bullet points and short sentences to simulate thoughts and emoticons
         messages_for_model.extend(model_history)
         try:
-            # --- DEBUG: Print what model sees ---
-            print("\n" + "="*80)
-            print("--- MESSAGES SENT TO MODEL (model_history) ---")
-            for i, msg in enumerate(messages_for_model):
-                print(f"\n[Message {i}] Role: {msg['role']}")
-                content_preview = msg['content'][:200] + "..." if len(msg['content']) > 200 else msg['content']
-                print(f"Content: {content_preview}")
-            print("="*80 + "\n")
             # Generate response with hyperparameters
             thinking, answer = generate_and_parse(
                 messages_for_model,
@@ -298,11 +289,8 @@ Think using bullet points and short sentences to simulate thoughts and emoticons
                 max_new_tokens=max_tokens
             )
-            # Update model history with thinking AND answer (CHANGED)
-            if thinking and thinking.strip():
-                model_history.append({"role": "assistant", "content": f"<think>{thinking}</think>\n{answer}"})
-            else:
-                model_history.append({"role": "assistant", "content": answer})
             # Format response for display (with HTML formatting)
             if thinking and thinking.strip():

     and parses it into thinking and answer parts.
     Decorated with @spaces.GPU for Zero GPU allocation.
     """
+    # Apply chat template with enable_thinking=True for Qwen3
+    prompt_text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=True  # Explicitly enable thinking mode
+    )
     # --- CONSOLE DEBUG OUTPUT ---
     print("\n" + "="*50)
         messages_for_model.extend(model_history)
         try:
             # Generate response with hyperparameters
             thinking, answer = generate_and_parse(
                 messages_for_model,
                 max_new_tokens=max_tokens
             )
+            # Update model history with CLEAN answer (no HTML formatting)
+            model_history.append({"role": "assistant", "content": answer})
             # Format response for display (with HTML formatting)
             if thinking and thinking.strip():