Spaces:
No application file
No application file
| # model_final.py ← FINAL VERSION: No more echoing, no crashes, super fast | |
| from ctransformers import AutoModelForCausalLM | |
| from llama_cpp import Llama | |
| import gradio as gr | |
| import re | |
| import threading | |
| # ============================== | |
| # LOAD MODELS – OPTIMAL SPEED | |
| # ============================== | |
# Both GGUF models are loaded once, at import time, and shared by every
# request handled by this process.

# Keyword options handed to ctransformers for the Mistral model (load-time
# configuration plus default sampling settings).
_mistral_options = {
    "model_type": "mistral",
    "threads": 8,
    "batch_size": 512,
    "context_length": 8192,
    "gpu_layers": 0,
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 30,
    "repetition_penalty": 1.1,
    "max_new_tokens": 1024,
}

# Keyword options handed to llama-cpp-python for the Qwen model.
_qwen_options = {
    "n_ctx": 8192,
    "n_threads": 4,
    "n_batch": 512,
    "n_gpu_layers": 0,  # CPU only; change to 35–99 if a GPU is available
    "use_mlock": True,
    "verbose": False,
}

print("Loading Mistral...")
mistral_model = AutoModelForCausalLM.from_pretrained(
    r"C:\Users\ksrvisitor\Downloads\optimizationmodel\quant_model.gguf",
    **_mistral_options,
)

print("Loading Qwen2.5-Coder...")
qwen_model = Llama(
    r"C:\Users\ksrvisitor\Downloads\qwen2.5-coder-7b-instruct-q4_k_m.gguf",
    **_qwen_options,
)

# Set by the Stop button; the streaming generators poll it between tokens.
stop_event = threading.Event()
| # ============================== | |
| # REQUEST ROUTING — MATH / CODE PROMPTS GO TO QWEN, EVERYTHING ELSE TO MISTRAL | |
| # (keyword and number-pattern heuristic) | |
| # ============================== | |
def is_coding_or_math(text: str) -> bool:
    """Return True when a prompt looks like a math or programming request.

    The check is a case-insensitive heuristic. A prompt qualifies when any
    one of these holds:

    * it contains one of the math or coding keywords below as a substring
      (plain substring search, so "aftermath" matches "math");
    * it contains a digit and at least one of the characters
      ``+-*/=^()[]{}``;
    * it contains two numbers separated by a comma, e.g. ``2, 6, 12``.
    """
    lowered = text.lower()

    keywords = (
        # Math and number-series vocabulary.
        "next number", "series", "sequence", "pattern", "find the next",
        "what comes next", "solve", "calculate", "equation", "math",
        "mathematics", "integral", "derivative", "factorial", "prime",
        "geometry", "algebra", "probability", "statistics", "seconds",
        "minutes", "hours", "number", "triangular",
        # Programming vocabulary.
        "code", "program", "write a", "implement", "function", "class",
        "python", "java", "c++", "javascript", "sql", "debug", "algorithm",
        "leetcode", "binary search",
    )
    for keyword in keywords:
        if keyword in lowered:
            return True

    # A digit alongside an operator or bracket suggests an expression.
    if re.search(r'\d', lowered) is not None:
        for symbol in "+-*/=^()[]{}":
            if symbol in lowered:
                return True

    # Comma-separated numbers suggest a number series.
    return re.search(r'\d+\s*[,]\s*\d+', lowered) is not None
| # ============================== | |
| # FIXED STREAMING (NO ECHOING!) | |
| # ============================== | |
def stream_mistral(prompt):
    """Stream a Mistral reply as a series of chat-message updates.

    Each yielded value is a one-element list holding an assistant message
    dict. The first update carries only the ``**[Mistral]**`` header; every
    later one carries the header followed by all text generated so far,
    with surrounding whitespace stripped. Generation is abandoned as soon
    as ``stop_event`` is found set.
    """
    stop_event.clear()

    header = "**[Mistral]**\n\n"
    instructions = (
        "You are a helpful, concise assistant. "
        "Do NOT repeat the user's question. "
        "Answer directly and clearly."
    )
    full_prompt = f"<s>[INST] <<SYS>>{instructions}<</SYS>> {prompt} [/INST]"

    # Show the header immediately, before the first token arrives.
    yield [{"role": "assistant", "content": header}]

    generated = ""
    token_stream = mistral_model(
        full_prompt,
        stream=True,
        max_new_tokens=800,
        stop=["</s>"],
    )
    for piece in token_stream:
        if stop_event.is_set():
            break
        generated += piece
        yield [{"role": "assistant", "content": header + generated.strip()}]
def stream_qwen(prompt):
    """Stream a Qwen2.5-Coder reply as a series of chat-message updates.

    Each yielded value is a one-element list holding an assistant message
    dict. The first update carries only the ``**[Qwen2.5-Coder]**`` header;
    every later one carries the header followed by all text generated so
    far. Generation is abandoned as soon as ``stop_event`` is found set.
    """
    stop_event.clear()

    header = "**[Qwen2.5-Coder]**\n\n"

    # Show the header immediately, before the first token arrives.
    yield [{"role": "assistant", "content": header}]

    system_turn = (
        "<|im_start|>system\n"
        "You are a world-class math and coding assistant. "
        "ALWAYS respond with clean LaTeX. Use $...$ for inline and $$...$$ for display. "
        "Use \\boxed{} for final answers.\n"
        "<|im_end|>\n"
    )
    user_turn = "<|im_start|>user\n" + prompt + "\n<|im_end|>\n"
    chatml_prompt = system_turn + user_turn + "<|im_start|>assistant\n"

    completion_stream = qwen_model(
        chatml_prompt,
        stream=True,
        max_tokens=800,
        temperature=0.1,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.05,
    )

    answer = ""
    for event in completion_stream:
        if stop_event.is_set():
            break

        # Accept either a completion-style chunk ("text") or a chat-style
        # chunk ("delta" -> "content"); fall back to an empty string.
        first_choice = event["choices"][0]
        piece = first_choice.get("text")
        if not piece:
            piece = first_choice.get("delta", {}).get("content", "")
        piece = piece or ""

        answer += piece
        yield [{"role": "assistant", "content": header + answer}]
| # ============================== | |
| # MAIN CHAT — WORKS WITH MESSAGES FORMAT | |
| # ============================== | |
def _history_to_messages(history):
    """Return chat history as a new flat list of role/content dicts.

    Entries that are dicts with a ``"role"`` key are kept as they are.
    Entries that are two-item lists or tuples are read as one legacy
    ``(user_text, assistant_text)`` pair and expanded into up to two
    messages; an empty or ``None`` side is skipped. Anything else is
    ignored. ``history`` may be ``None``.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict) and "role" in entry:
            messages.append(entry)
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            # Unpack the pair itself. Iterating over it would hand back the
            # two strings one at a time and fail to unpack them.
            user_text, assistant_text = entry
            if user_text:
                messages.append({"role": "user", "content": user_text})
            if assistant_text:
                messages.append({"role": "assistant", "content": assistant_text})
    return messages


def chat(message, history):
    """Answer one user message, streaming the growing conversation.

    Args:
        message: The text the user just submitted.
        history: Prior conversation, either as role/content dicts or as
            legacy ``(user, assistant)`` pairs.

    Yields:
        The full message list (history, the new user message, and the
        assistant reply so far) after every streamed update. The same list
        object is updated in place and yielded again each time.
    """
    stop_event.clear()

    conversation = _history_to_messages(history)
    conversation.append({"role": "user", "content": message})

    # Math and code prompts go to Qwen; everything else goes to Mistral.
    if is_coding_or_math(message):
        streamer = stream_qwen(message)
    else:
        streamer = stream_mistral(message)

    reply_added = False
    for update in streamer:
        if stop_event.is_set():
            break
        if reply_added:
            # Later updates replace the assistant message in place.
            conversation[-1] = update[0]
        else:
            conversation.append(update[0])
            reply_added = True
        yield conversation
def stop() -> None:
    """Signal the running generation to halt by setting ``stop_event``."""
    stop_event.set()
| # ============================== | |
| # UI | |
| # ============================== | |
# Build the page: a title, the chat transcript, and a row holding the
# message box with its Send and Stop buttons.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Dual Local AI — Clean Responses (No Echoing!)\n**Code/Math → Qwen2.5-Coder** | **Chat → Mistral**")
    chatbot = gr.Chatbot(height=720, type="messages", show_copy_button=True)

    with gr.Row():
        txt = gr.Textbox(placeholder="Ask anything…", label="Message", lines=4, scale=8)
        send = gr.Button("Send", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")

    # The Send button and the textbox's submit event share one pipeline:
    # run a chat turn, then blank the message box.
    for register in (send.click, txt.submit):
        register(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)

    stop_btn.click(stop)

print("Launching FINAL version (no echoing, no crashes)...")
demo.launch(server_port=7860, inbrowser=True)