# model_final.py — Dual local-model chat UI.
# Routes math/code questions to Qwen2.5-Coder and everything else to Mistral,
# streaming tokens into a Gradio Chatbot (type="messages").
from ctransformers import AutoModelForCausalLM
from llama_cpp import Llama
import gradio as gr
import re
import threading

# ==============================
# LOAD MODELS — OPTIMAL SPEED
# ==============================
print("Loading Mistral...")
mistral_model = AutoModelForCausalLM.from_pretrained(
    r"C:\Users\ksrvisitor\Downloads\optimizationmodel\quant_model.gguf",
    model_type="mistral",
    threads=8,
    batch_size=512,
    context_length=8192,
    gpu_layers=0,
    temperature=0.7,
    top_p=0.9,
    top_k=30,
    repetition_penalty=1.1,
    max_new_tokens=1024,
)

print("Loading Qwen2.5-Coder...")
qwen_model = Llama(
    r"C:\Users\ksrvisitor\Downloads\qwen2.5-coder-7b-instruct-q4_k_m.gguf",
    n_ctx=8192,
    n_threads=4,      # Fastest on CPU
    n_batch=512,      # Fastest on CPU
    n_gpu_layers=0,   # Change to 35–99 if GPU
    use_mlock=True,
    verbose=False,
)

# Cooperative cancellation flag shared by both streaming generators and the UI.
stop_event = threading.Event()

# ==============================
# SMART DETECTION — MATH + CODE = ALWAYS QWEN
# ==============================
# Hoisted out of is_coding_or_math so they are built/compiled once, not per call.
_HAS_DIGIT = re.compile(r"\d")
_NUMBER_LIST = re.compile(r"\d+\s*,\s*\d+")  # e.g. "2, 6, 12, 20"

_MATH_TRIGGERS = (
    "next number", "series", "sequence", "pattern", "find the next",
    "what comes next", "solve", "calculate", "equation", "math",
    "mathematics", "integral", "derivative", "factorial", "prime",
    "geometry", "algebra", "probability", "statistics",
    "seconds", "minutes", "hours", "number", "triangular",
)
_CODE_TRIGGERS = (
    "code", "program", "write a", "implement", "function", "class",
    "python", "java", "c++", "javascript", "sql", "debug",
    "algorithm", "leetcode", "binary search",
)


def is_coding_or_math(text: str) -> bool:
    """Return True when *text* looks like a math or coding request (route to Qwen)."""
    text = text.lower()
    # Any math or code keyword → Qwen.
    if any(trigger in text for trigger in _MATH_TRIGGERS + _CODE_TRIGGERS):
        return True
    # Contains digits plus math symbols → Qwen.
    if _HAS_DIGIT.search(text) and any(op in text for op in "+-*/=^()[]{}"):
        return True
    # Contains comma-separated numbers (like 2, 6, 12, 20) → Qwen.
    if _NUMBER_LIST.search(text):
        return True
    return False


# ==============================
# STREAMING GENERATORS
# ==============================
def stream_mistral(prompt):
    """Stream a Mistral reply, yielding a growing messages-format chunk per token."""
    stop_event.clear()
    system_prompt = (
        "You are a helpful, concise assistant. "
        "Do NOT repeat the user's question. "
        "Answer directly and clearly."
    )
    # BUGFIX: the Mistral/Llama instruct template uses <<SYS>> ... <</SYS>> for the
    # system section. The previous "<>...<>" markers were not real tags, so they
    # leaked into the prompt text — a classic cause of the model echoing the input.
    formatted_prompt = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]"

    yield [{"role": "assistant", "content": "**[Mistral]**\n\n"}]

    output = ""
    # BUGFIX: stop=[""] (empty string) is a degenerate stop sequence; stop on the
    # end-of-sequence token instead.
    for token in mistral_model(
        formatted_prompt,
        stream=True,
        max_new_tokens=800,
        stop=["</s>"],
    ):
        if stop_event.is_set():
            break
        output += token
        yield [{"role": "assistant", "content": f"**[Mistral]**\n\n{output.strip()}"}]


def stream_qwen(prompt):
    """Stream a Qwen2.5-Coder reply (math/code) using the ChatML prompt format."""
    stop_event.clear()
    resp = ""
    yield [{"role": "assistant", "content": "**[Qwen2.5-Coder]**\n\n"}]

    formatted = (
        "<|im_start|>system\n"
        "You are a world-class math and coding assistant. "
        "ALWAYS respond with clean LaTeX. Use $...$ for inline and $$...$$ for display. "
        "Use \\boxed{} for final answers.\n"
        "<|im_end|>\n"
        "<|im_start|>user\n" + prompt + "\n<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    for chunk in qwen_model(
        formatted,
        stream=True,
        max_tokens=800,
        temperature=0.1,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.05,
    ):
        if stop_event.is_set():
            break
        # Safe extraction — llama_cpp chunks may carry "text" or a chat "delta";
        # default to "" so a missing key never crashes the stream.
        choice = chunk["choices"][0]
        token = (
            choice.get("text")
            or choice.get("delta", {}).get("content", "")
            or ""
        )
        resp += token
        yield [{"role": "assistant", "content": f"**[Qwen2.5-Coder]**\n\n{resp}"}]


# ==============================
# MAIN CHAT — WORKS WITH MESSAGES FORMAT
# ==============================
def chat(message, history):
    """Route *message* to the right model and stream the growing history to Gradio.

    *history* may be messages-format dicts (Gradio type="messages") or legacy
    (user, assistant) tuples; both are normalized to role/content dicts.
    """
    stop_event.clear()

    messages = []
    for msg in history:
        if isinstance(msg, dict) and "role" in msg:
            messages.append(msg)
        elif isinstance(msg, (list, tuple)) and len(msg) == 2:
            # BUGFIX: a legacy entry is ONE (user, assistant) pair. The old code
            # iterated *inside* the pair ("for u, a in msg"), which tried to
            # unpack each string into two names and crashed/garbled the history.
            u, a = msg
            if u:
                messages.append({"role": "user", "content": u})
            if a:
                messages.append({"role": "assistant", "content": a})

    messages.append({"role": "user", "content": message})

    streamer = (
        stream_qwen(message) if is_coding_or_math(message) else stream_mistral(message)
    )

    partial = messages.copy()
    first = True
    for chunk in streamer:
        if stop_event.is_set():
            break
        if first:
            # First chunk appends the new assistant message ...
            partial.append(chunk[0])
            first = False
        else:
            # ... subsequent chunks replace it with the longer text.
            partial[-1] = chunk[0]
        yield partial


def stop():
    """Signal the active streaming generator to stop at its next token."""
    stop_event.set()


# ==============================
# UI
# ==============================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Dual Local AI — Clean Responses (No Echoing!)\n"
        "**Code/Math → Qwen2.5-Coder** | **Chat → Mistral**"
    )
    chatbot = gr.Chatbot(height=720, type="messages", show_copy_button=True)
    with gr.Row():
        txt = gr.Textbox(placeholder="Ask anything…", label="Message", lines=4, scale=8)
        send = gr.Button("Send", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")

    # Clear the textbox only after the streamed reply finishes.
    send.click(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)
    txt.submit(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)
    stop_btn.click(stop)

print("Launching FINAL version (no echoing, no crashes)...")
demo.launch(server_port=7860, inbrowser=True)