OrbitMC committed on
Commit
eb4f3c5
Β·
verified Β·
1 Parent(s): b673820

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -45
app.py CHANGED
@@ -1,82 +1,152 @@
1
  import gradio as gr
2
  import torch
 
3
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
4
  from duckduckgo_search import DDGS
5
  from threading import Thread
6
 
7
- # --- MODEL CONFIG ---
8
- MODEL_ID = "Qwen/Qwen3-0.6B" # Pure HF Datacard
 
9
 
10
- print(f"Loading model {MODEL_ID}...")
11
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
12
  model = AutoModelForCausalLM.from_pretrained(
13
  MODEL_ID,
 
14
  device_map="auto",
15
- torch_dtype=torch.float16,
16
- low_cpu_mem_usage=True
17
  )
18
 
19
- # --- WEB SEARCH ---
20
- def search_web(query):
21
  try:
22
  with DDGS() as ddgs:
23
- results = [r for r in ddgs.text(query, max_results=3)]
24
  if not results: return ""
25
- context = "\n".join([f"Source: {r['title']}\nContent: {r['body']}" for r in results])
26
- return f"\n\nWeb Search Context:\n{context}\n"
27
- except Exception as e:
28
- print(f"Search error: {e}")
 
29
  return ""
30
 
31
- # --- INFERENCE ---
32
- def stream_response(message, history, search_enabled, temperature, max_new_tokens):
33
- # Prepare prompt
34
- context = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  if search_enabled:
36
- context = search_web(message)
 
 
 
 
 
 
 
 
 
 
37
 
38
- # Simple Chat Template
39
- full_prompt = f"User: {message}{context}\nAssistant:"
40
 
41
- inputs = tokenizer([full_prompt], return_tensors="pt").to(model.device)
 
 
 
 
 
 
42
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
43
 
44
- generation_kwargs = dict(
45
- inputs,
46
  streamer=streamer,
47
- max_new_tokens=max_new_tokens,
48
- do_sample=True,
49
  temperature=temperature,
50
- pad_token_id=tokenizer.eos_token_id
 
 
 
51
  )
52
 
53
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
54
  thread.start()
55
 
56
- partial_text = ""
57
  for new_text in streamer:
58
- # Handle the thinking process tags if present in output
59
- new_text = new_text.replace("<think>", "πŸ’­ *Thinking:* ").replace("</think>", "\n\n---\n\n")
60
- partial_text += new_text
61
- yield partial_text
 
 
62
 
63
- # --- CLEAN UI ---
64
- with gr.Blocks(theme=gr.themes.Default(primary_hue="orange", secondary_hue="gray")) as demo:
65
- gr.Markdown("# πŸ›Έ Qwen3 Pure-Python Explorer")
66
 
67
  with gr.Row():
68
  with gr.Column(scale=4):
69
- chatbot = gr.ChatInterface(
70
- fn=stream_response,
71
- additional_inputs=[
72
- gr.Checkbox(label="🌐 Enable Web Search", value=False),
73
- gr.Slider(0.1, 1.0, 0.7, label="Temperature"),
74
- gr.Slider(128, 4096, 1024, label="Max Tokens"),
75
- ],
76
- fill_height=True
77
- )
78
 
79
- gr.Markdown("### Features:\n- βœ… **Zero C++ / Zero llama-cpp**\n- βœ… **Native HuggingFace Transformers**\n- βœ… **DuckDuckGo Integration**")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  if __name__ == "__main__":
82
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
  import torch
3
+ import re
4
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
5
  from duckduckgo_search import DDGS
6
  from threading import Thread
7
 
8
# --- MODEL SETUP ---
# Model weights/tokenizer are loaded once at import time; every chat request
# shares these module-level objects.
MODEL_ID = "Qwen/Qwen3-0.6B"  # Official HF Repo
print("Loading model and tokenizer...")

# trust_remote_code lets the repo's custom tokenizer/model classes execute.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",   # use the checkpoint's native dtype
    device_map="auto",    # place weights on GPU/CPU automatically
    trust_remote_code=True
)
19
 
20
# --- SEARCH FUNCTION ---
def web_search(query):
    """Return a text blob of the top DuckDuckGo results for *query*.

    Returns "" when there are no results or the search fails, so the
    caller can proceed without web context.
    """
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=3))
        if not results:
            return ""
        blob = "\n\nSearch Results:\n"
        for r in results:
            blob += f"- {r['title']}: {r['body']}\n"
        return blob
    except Exception as e:
        # Best-effort degradation. The previous bare `except:` also caught
        # KeyboardInterrupt/SystemExit and hid every failure silently;
        # catch Exception only and log the cause.
        print(f"Search error: {e}")
        return ""
32
 
33
# --- UI HELPERS ---
# CSS injected into the Gradio Blocks app. `.thought-box` styles the
# collapsible <details> reasoning section emitted by parse_output().
CSS = """
.thought-box {
    background-color: rgba(255, 255, 255, 0.05);
    border-left: 4px solid #facc15;
    padding: 10px;
    margin: 10px 0;
    font-style: italic;
    color: #9ca3af;
}
details summary {
    cursor: pointer;
    color: #facc15;
    font-weight: bold;
}
"""
49
+
50
def parse_output(text):
    """Render a raw model reply, converting <think> tags into a
    collapsible HTML details section for the chat UI.

    Text without a <think> tag is returned untouched.
    """
    if "<think>" not in text:
        return text

    pieces = text.split("</think>")
    reasoning = pieces[0].replace("<think>", "").strip()

    if len(pieces) > 1:
        # Reasoning is complete: show it collapsed-style above the answer.
        final = pieces[1].strip()
        return f"<details open><summary>πŸ’­ Thought Process</summary><div class='thought-box'>{reasoning}</div></details>\n\n{final}"

    # No closing tag yet: the model is still mid-thought.
    return f"<details open><summary>πŸŒ€ Thinking...</summary><div class='thought-box'>{reasoning}</div></details>"
64
+
65
# --- GENERATION LOGIC ---
def chat(message, history, search_enabled, temperature, max_tokens):
    """Stream a model reply for *message*.

    Yields the full chatbot history (list of [user, assistant] pairs) so the
    gr.Chatbot output component can render it — the previous version yielded
    a bare string, which is not a valid Chatbot value.

    history: list of [user, assistant] pairs; the UI pre-appends the pending
    pair [message, None] before this generator runs.
    """
    # 1. Optional web search context, appended to the user turn.
    search_context = ""
    if search_enabled:
        search_context = web_search(message)

    # The UI already appended the pending [message, None] pair; keep only the
    # completed turns for the prompt. (The old code re-appended the current
    # message on top of it, so the prompt contained the user turn twice.)
    if history and history[-1][1] is None:
        past_turns = history[:-1]
    else:
        past_turns = history

    # 2. Build a ChatML-formatted conversation via the official template
    # (prevents the model from continuing as 'User').
    conversation = []
    for user_msg, assistant_msg in past_turns:
        conversation.append({"role": "user", "content": user_msg})
        if assistant_msg:
            # Strip the <details> UI wrapper before feeding text back.
            clean_assistant = re.sub(r'<details.*?</details>', '', assistant_msg, flags=re.DOTALL).strip()
            conversation.append({"role": "assistant", "content": clean_assistant})
    conversation.append({"role": "user", "content": message + search_context})

    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 3. Stream tokens off a background generation thread.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        # Stop generating once the model tries to start a new turn.
        eos_token_id=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|im_end|>")]
    )

    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        # Safety net: if the model hallucinates a fake user turn, stop showing it.
        if "User:" in new_text or "<|im_start|>" in new_text:
            break
        buffer += new_text
        # Yield Chatbot-format history: completed turns + the streaming pair.
        yield past_turns + [[message, parse_output(buffer)]]
116
 
117
# --- GRADIO UI ---
# Layout: chat column (left) + settings column (right); events wired below.
with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.HTML("<h1>🧠 Qwen3 Reasoning Lab</h1>")

    with gr.Row():
        with gr.Column(scale=4):
            chat_box = gr.Chatbot(height=600, label="Qwen3-0.6B")
            msg_input = gr.Textbox(placeholder="Ask a logic question...", show_label=False)

        with gr.Column(scale=1):
            search_toggle = gr.Checkbox(label="🌐 Web Search (DDG)", value=False)
            temp_slider = gr.Slider(0.1, 1.0, 0.7, label="Temperature")
            token_slider = gr.Slider(512, 4096, 1024, label="Max Tokens")
            gr.Markdown("""
            ### Tips:
            - **Thinking:** This model is trained for Chain-of-Thought.
            - **Self-Talk Fix:** We use stop sequences to prevent the AI from acting as 'User'.
            """)
            clear_btn = gr.Button("πŸ—‘ Clear Chat")

    # Set up logic
    # Step 1 (queue=False, instant): append the pending [message, None] pair
    # to the chatbot so the user's turn shows immediately.
    # NOTE(review): the lambda echoes x back into msg_input (it does NOT clear
    # the textbox). This appears deliberate — the .then() step reads `message`
    # from msg_input, so clearing it here would hand chat() an empty message.
    # Confirm before "fixing"; clearing would need a gr.State to carry the text.
    chat_event = msg_input.submit(
        lambda x, y: (x, y + [[x, None]]),
        [msg_input, chat_box],
        [msg_input, chat_box],
        queue=False
    ).then(
        chat,
        [msg_input, chat_box, search_toggle, temp_slider, token_slider],
        chat_box
    )

    # Reset the chat display (None clears a Chatbot).
    clear_btn.click(lambda: None, None, chat_box, queue=False)
150
 
151
if __name__ == "__main__":
    # Bind all interfaces on port 7860 (the port HF Spaces expects).
    demo.launch(server_name="0.0.0.0", server_port=7860)