OrbitMC committed on
Commit
b673820
Β·
verified Β·
1 Parent(s): 6cf0909

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -127
app.py CHANGED
@@ -1,148 +1,82 @@
1
- import time
2
  import gradio as gr
3
- from llama_cpp import Llama
 
4
  from duckduckgo_search import DDGS
5
-
6
- # --- Initialize Model ---
7
- print("Loading model from Hugging Face...")
8
- llm = Llama.from_pretrained(
9
- repo_id="unsloth/Qwen3-0.6B-GGUF",
10
- filename="Qwen3-0.6B-BF16.gguf",
11
- n_ctx=32768,
12
- n_threads=None, # Automatically use all CPU cores
13
- verbose=False
 
 
 
14
  )
15
 
16
- # --- Logic Functions ---
17
-
18
  def search_web(query):
19
  try:
20
  with DDGS() as ddgs:
21
  results = [r for r in ddgs.text(query, max_results=3)]
22
- if not results: return None
23
- return "\n".join([f"Source: {r['title']}\nContent: {r['body']}" for r in results])
 
24
  except Exception as e:
25
  print(f"Search error: {e}")
26
- return None
27
-
28
- def format_time(seconds_float):
29
- ts = int(round(seconds_float))
30
- m, s = divmod(ts, 60)
31
- h, m = divmod(m, 60)
32
- return f"{h}h {m}m {s}s" if h > 0 else f"{m}m {s}s" if m > 0 else f"{s}s"
33
-
34
- class ParserState:
35
- def __init__(self):
36
- self.answer = ""
37
- self.thought = ""
38
- self.in_think = False
39
- self.start_time = 0
40
- self.total_think_time = 0.0
41
-
42
- def format_ui_response(state):
43
- collapsible = ""
44
- if state.thought or state.in_think:
45
- status = f"πŸŒ€ Thinking ({format_time(state.total_think_time)})" if state.in_think else f"βœ… Thought for {format_time(state.total_think_time)}"
46
- open_tag = "open" if state.in_think else ""
47
- collapsible = f"<details {open_tag}><summary>{status}</summary><div style='color: #666; font-style: italic; border-left: 3px solid #facc15; padding-left: 10px; background: rgba(0,0,0,0.02);'>{state.thought}</div></details>"
48
- return f"{collapsible}\n\n{state.answer}"
49
-
50
- # --- Gradio Handlers ---
51
-
52
- def generate_response(history, search_enabled, temp, top_p, max_tok, active_gen):
53
- if not history: return history
54
-
55
- query = history[-1][0]
56
- prompt = query
57
 
 
 
 
 
58
  if search_enabled:
59
- history[-1][1] = "πŸ” Searching the web..."
60
- yield history
61
- context = search_web(query)
62
- if context:
63
- prompt = f"Context from Web:\n{context}\n\nUser Question: {query}\n\nAnswer using the context above:"
64
-
65
- state = ParserState()
66
- active_gen[0] = True
67
-
68
- try:
69
- # llama-cpp-python streaming completion
70
- stream = llm.create_chat_completion(
71
- messages=[{"role": "user", "content": prompt}],
72
- temperature=temp,
73
- top_p=top_p,
74
- max_tokens=max_tok,
75
- stream=True
76
- )
77
 
78
- for chunk in stream:
79
- if not active_gen[0]: break
80
-
81
- delta = chunk['choices'][0]['delta']
82
- if 'content' in delta:
83
- token = delta['content']
84
-
85
- # Logic to handle <think> tags
86
- if "<think>" in token:
87
- state.in_think = True
88
- state.start_time = time.perf_counter()
89
- token = token.replace("<think>", "")
90
-
91
- if "</think>" in token:
92
- state.total_think_time += (time.perf_counter() - state.start_time)
93
- state.in_think = False
94
- token = token.replace("</think>", "")
95
-
96
- if state.in_think:
97
- state.thought += token
98
- state.total_think_time = time.perf_counter() - state.start_time
99
- else:
100
- state.answer += token
101
-
102
- history[-1][1] = format_ui_response(state)
103
- yield history
104
-
105
- except Exception as e:
106
- history[-1][1] = f"Error: {str(e)}"
107
- yield history
108
 
109
- # --- UI Layout ---
 
 
 
 
 
110
 
111
- with gr.Blocks(theme=gr.themes.Soft(), css="footer {visibility: hidden}") as demo:
112
- active_gen = gr.State([False])
113
-
114
- gr.Markdown("# πŸš€ Qwen3 Reasoning Engine\n*Integrated Llama-CPP with Web Search*")
115
 
116
  with gr.Row():
117
  with gr.Column(scale=4):
118
- chatbot = gr.Chatbot(height=500, show_label=False, bubble_full_width=False)
119
- with gr.Column(scale=1):
120
- search_toggle = gr.Checkbox(label="🌐 Web Search", value=False)
121
- temp = gr.Slider(0.1, 1.2, 0.7, label="Temperature")
122
- max_tok = gr.Slider(512, 8192, 2048, step=128, label="Max Tokens")
123
- gr.Markdown("---")
124
- stop_btn = gr.Button("⏹ Stop", variant="secondary")
125
- clear_btn = gr.Button("πŸ—‘ Clear", variant="secondary")
126
-
127
- with gr.Row():
128
- msg = gr.Textbox(placeholder="Enter your prompt here...", container=False, scale=7)
129
- submit_btn = gr.Button("Send", variant="primary", scale=1)
130
-
131
- # Event Wiring
132
- sub_ev = submit_btn.click(
133
- lambda m, h: ("", h + [[m, None]]), [msg, chatbot], [msg, chatbot], queue=False
134
- ).then(
135
- generate_response, [chatbot, search_toggle, temp, gr.State(0.95), max_tok, active_gen], chatbot
136
- )
137
-
138
- msg.submit(
139
- lambda m, h: ("", h + [[m, None]]), [msg, chatbot], [msg, chatbot], queue=False
140
- ).then(
141
- generate_response, [chatbot, search_toggle, temp, gr.State(0.95), max_tok, active_gen], chatbot
142
- )
143
-
144
- stop_btn.click(lambda: [False], None, active_gen, cancels=[sub_ev])
145
- clear_btn.click(lambda: None, None, chatbot, queue=False)
146
 
147
  if __name__ == "__main__":
148
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
1
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from duckduckgo_search import DDGS
from threading import Thread

# --- MODEL CONFIG ---
# Hugging Face repo id of the checkpoint to serve; change here to swap models.
MODEL_ID = "Qwen/Qwen3-0.6B" # Pure HF Datacard

print(f"Loading model {MODEL_ID}...")
# Model and tokenizer are loaded once at import time so every Gradio request
# reuses the same in-memory weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",          # let accelerate place weights (GPU if present)
    torch_dtype=torch.float16,  # half precision to reduce memory footprint
    low_cpu_mem_usage=True      # avoid materializing a full fp32 copy in RAM
)
18
 
19
+ # --- WEB SEARCH ---
 
20
def search_web(query):
    """Fetch up to three DuckDuckGo results for *query*.

    Returns a formatted context block ready to splice into the model prompt,
    or "" when there are no hits or the lookup fails (best-effort: any
    exception is logged and swallowed so chat keeps working offline).
    """
    try:
        with DDGS() as ddgs:
            hits = list(ddgs.text(query, max_results=3))
            if not hits:
                return ""
            body = "\n".join(
                f"Source: {hit['title']}\nContent: {hit['body']}" for hit in hits
            )
            return f"\n\nWeb Search Context:\n{body}\n"
    except Exception as e:
        print(f"Search error: {e}")
        return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
# --- INFERENCE ---
def stream_response(message, history, search_enabled, temperature, max_new_tokens):
    """Generate a streamed reply for gr.ChatInterface.

    Args:
        message: Latest user message.
        history: Chat history supplied by gr.ChatInterface (currently unused;
            the prompt is built from the current message only).
        search_enabled: When True, prepend DuckDuckGo context to the prompt.
        temperature: Sampling temperature passed to model.generate.
        max_new_tokens: Upper bound on generated tokens.

    Yields:
        The accumulated response text after each streamed fragment.
    """
    # Optionally enrich the prompt with web search context.
    context = ""
    if search_enabled:
        context = search_web(message)

    # NOTE(review): a plain "User:/Assistant:" template is used instead of
    # tokenizer.apply_chat_template; kept as-is to preserve behavior, but
    # Qwen chat checkpoints usually expect their own template — confirm.
    full_prompt = f"User: {message}{context}\nAssistant:"

    inputs = tokenizer([full_prompt], return_tensors="pt").to(model.device)
    # skip_prompt: don't echo the input; skip_special_tokens: drop EOS etc.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,  # expands to input_ids / attention_mask kwargs
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id
    )

    # Run generate() on a worker thread so we can consume the streamer here.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        # Surface the model's <think>...</think> span as visible markdown.
        new_text = new_text.replace("<think>", "💭 *Thinking:* ").replace("</think>", "\n\n---\n\n")
        partial_text += new_text
        yield partial_text

    # Fix: the original never joined the worker. The streamer is exhausted
    # only once generate() finishes, so this join returns immediately but
    # guarantees the thread is reaped before the handler exits.
    thread.join()
62
 
63
# --- CLEAN UI ---
# Top-level Gradio layout; `demo` is launched by the __main__ guard below.
with gr.Blocks(theme=gr.themes.Default(primary_hue="orange", secondary_hue="gray")) as demo:
    gr.Markdown("# 🛸 Qwen3 Pure-Python Explorer")

    with gr.Row():
        with gr.Column(scale=4):
            # ChatInterface streams stream_response's generator straight into
            # the chat; additional_inputs map 1:1 onto stream_response's
            # trailing parameters (search_enabled, temperature, max_new_tokens).
            chatbot = gr.ChatInterface(
                fn=stream_response,
                additional_inputs=[
                    gr.Checkbox(label="🌐 Enable Web Search", value=False),
                    gr.Slider(0.1, 1.0, 0.7, label="Temperature"),
                    gr.Slider(128, 4096, 1024, label="Max Tokens"),
                ],
                fill_height=True
            )

    # NOTE(review): the rendered diff loses indentation, so this banner's
    # nesting (Blocks level vs. inside the Row) is ambiguous — kept at Blocks
    # level; confirm against the live app.
    gr.Markdown("### Features:\n- ✅ **Zero C++ / Zero llama-cpp**\n- ✅ **Native HuggingFace Transformers**\n- ✅ **DuckDuckGo Integration**")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
if __name__ == "__main__":
    # Bind to all interfaces on 7860 (the conventional Hugging Face Spaces port).
    demo.launch(server_name="0.0.0.0", server_port=7860)