"""Dual local-model chat UI.

Messages that look like coding or math questions are routed to
Qwen2.5-Coder (via llama-cpp-python); everything else goes to
Mistral-7B-Instruct (via ctransformers). Responses are streamed into a
Gradio chatbot that uses the "messages" history format.
"""
import re
import threading

import gradio as gr
from ctransformers import AutoModelForCausalLM
from llama_cpp import Llama  # was used below without ever being imported

# ==============================
# LOAD MODELS
# ==============================
print("Loading Mistral from HuggingFace Hub...")
mistral_model = AutoModelForCausalLM.from_pretrained(
    # A path to a local .gguf file can be passed here instead of the repo id.
    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    model_type="mistral",
    threads=8,
    batch_size=512,
    context_length=8192,
    gpu_layers=0,
    temperature=0.7,
    top_p=0.9,
    top_k=30,
    repetition_penalty=1.1,
    max_new_tokens=1024,
)

print("Loading Qwen2.5-Coder from HuggingFace Hub...")
# `Llama(model_path=...)` expects a file on disk; a Hub repo id has to go
# through `Llama.from_pretrained`, which downloads `filename` from `repo_id`
# and forwards the remaining keyword arguments to the constructor.
qwen_model = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
    filename="qwen2.5-coder-7b-instruct-q4_k_m.gguf",
    n_ctx=8192,
    n_threads=4,
    n_batch=512,
    n_gpu_layers=0,  # CPU only; raise (e.g. 35-99) to offload layers to a GPU
    use_mlock=True,
    verbose=False,
)

# Set by the Stop button; polled by the streaming loops below.
stop_event = threading.Event()

# ==============================
# ROUTING: MATH + CODE -> QWEN
# ==============================
_MATH_TRIGGERS = (
    # General math
    "next number", "series", "sequence", "pattern", "find the next",
    "solve", "calculate", "equation", "math", "mathematics",
    "integral", "derivative", "limit", "factorial", "prime", "composite",
    "geometry", "algebra", "probability", "statistics", "number",
    "compute", "simplify", "evaluate", "expression",
    "fraction", "decimal", "percentage", "ratio", "proportion",
    "root", "square root", "logarithm", "log", "ln",
    "exponent", "power", "base", "matrix", "determinant",
    "vector", "dot product", "cross product", "trigonometry",
    "sine", "cosine", "tan", "cot", "sec", "cosec",
    "triangle", "circle", "radius", "diameter", "area", "perimeter",
    "volume", "surface area", "integrate", "differentiate",
    "quadratic", "polynomial", "cubic", "linear equation",
    "graph", "intercept", "slope", "intersection", "domain", "range",
    "modulus", "absolute", "complex number", "imaginary", "real number",
    "mean", "median", "mode", "variance", "standard deviation",
    "correlation", "regression", "distribution", "normal distribution",
    "binomial", "poisson", "combinatorics", "permutation", "combination",
    "set theory", "subset", "union", "intersection", "probability of",
)

_CODE_TRIGGERS = (
    # General programming
    "code", "program", "coding", "script", "implement", "build",
    "function", "method", "class", "object", "module", "package",
    "syntax", "runtime", "variable", "parameter", "argument", "return",
    "loop", "for loop", "while loop", "if statement", "condition",
    "boolean", "string", "array", "list", "dictionary", "hashmap",
    "tuple", "stack", "queue", "tree", "graph", "linked list",
    "pointer", "reference", "memory", "heap", "stack memory",
    # Languages
    "python", "java", "javascript", "typescript", "c++", "c#",
    "c language", "go", "rust", "php", "sql", "html", "css",
    "react", "nodejs", "json", "xml", "yaml", "bash", "shell script",
    # Data science / ML
    "pandas", "numpy", "sklearn", "tensorflow", "pytorch",
    "dataframe", "dataset", "model training", "machine learning",
    "neural network", "deep learning",
    # Debugging & errors
    "debug", "traceback", "error", "bug", "fix this code",
    "segmentation fault", "stack overflow", "undefined variable",
    # Algorithms
    "algorithm", "time complexity", "space complexity", "big o notation",
    "sort", "merge sort", "quick sort", "binary search",
    "dynamic programming", "recursion", "graph traversal",
    "dfs", "bfs", "greedy algorithm",
    # DevOps / tools
    "docker", "kubernetes", "api", "rest api", "jwt", "server", "client",
    "database", "mongodb", "mysql", "postgres", "orm",
    "deploy", "deployment", "kafka",
    # Competitive coding
    "leetcode", "hackerrank", "codechef", "geeksforgeeks",
)

# Triggers this short fire inside ordinary words when matched as plain
# substrings ("go" in "good", "tan" in "important", "sec" in "second",
# "api" in "rapid"), so they are matched as whole words instead.
_SHORT_TRIGGER_MAX_LEN = 3

_MATH_OPERATORS = "+-*/=^()[]{}"
_DIGIT_PATTERN = re.compile(r"\d")
_NUMBER_LIST_PATTERN = re.compile(r"\d+\s*[,]\s*\d+")


def _compile_trigger_pattern(triggers):
    """Build one regex that matches any trigger in lower-cased text.

    Short triggers (<= ``_SHORT_TRIGGER_MAX_LEN`` characters) must stand
    alone as a word, optionally followed by a plural "s". Longer triggers
    keep plain substring semantics so inflected forms ("sequences",
    "implementation") still match.
    """
    alternatives = []
    for trigger in dict.fromkeys(t.lower() for t in triggers):  # de-duplicate
        escaped = re.escape(trigger)
        if len(trigger) <= _SHORT_TRIGGER_MAX_LEN:
            # Look-arounds rather than \b so "c++" and "c#" work too.
            alternatives.append(rf"(?<!\w){escaped}s?(?!\w)")
        else:
            alternatives.append(escaped)
    return re.compile("|".join(alternatives))


_TRIGGER_PATTERN = _compile_trigger_pattern(_MATH_TRIGGERS + _CODE_TRIGGERS)


def is_coding_or_math(text: str) -> bool:
    """Return True when *text* should be routed to the Qwen coder model.

    A message qualifies if it contains a math/code trigger keyword, if it
    contains a digit together with any of ``+-*/=^()[]{}``, or if it
    contains comma-separated numbers such as ``2, 6, 12``. Empty or
    ``None`` input returns False.
    """
    if not text:
        return False
    lowered = text.lower()

    if _TRIGGER_PATTERN.search(lowered):
        return True

    # Digits combined with an operator or bracket look like an expression.
    if _DIGIT_PATTERN.search(lowered) and any(
        op in lowered for op in _MATH_OPERATORS
    ):
        return True

    # Comma-separated numbers (e.g. "2, 6, 12, 20") look like a series.
    return bool(_NUMBER_LIST_PATTERN.search(lowered))


# ==============================
# STREAMING
# ==============================
_MISTRAL_SYSTEM_PROMPT = (
    "You are a helpful, concise assistant. "
    "Do NOT repeat the user's question. "
    "Answer directly and clearly."
)

_QWEN_SYSTEM_PROMPT = (
    "You are a world-class math and coding assistant. "
    "ALWAYS respond with clean LaTeX. Use $...$ for inline and $$...$$ for display. "
    "Use \\boxed{} for final answers.\n"
)


def stream_mistral(prompt):
    """Stream a Mistral answer to *prompt*.

    Yields one-element lists holding an assistant message dict whose
    content is the text generated so far, prefixed with a "[Mistral]"
    header. Stops early when ``stop_event`` is set.
    """
    stop_event.clear()

    # The system text is prepended to the instruction; the stray "<>"
    # markers that used to surround it were not meaningful delimiters.
    formatted_prompt = f"[INST] {_MISTRAL_SYSTEM_PROMPT}\n\n{prompt} [/INST]"

    yield [{"role": "assistant", "content": "**[Mistral]**\n\n"}]

    output = ""
    for token in mistral_model(
        formatted_prompt,
        stream=True,
        max_new_tokens=800,
        # An empty stop string matches at position 0 of any text, so it
        # cannot serve as a terminator; use the end-of-sequence marker.
        stop=["</s>"],
    ):
        if stop_event.is_set():
            break
        output += token
        yield [
            {"role": "assistant", "content": f"**[Mistral]**\n\n{output.strip()}"}
        ]


def stream_qwen(prompt):
    """Stream a Qwen2.5-Coder answer to *prompt*.

    Yields one-element lists holding an assistant message dict whose
    content is the text generated so far, prefixed with a
    "[Qwen2.5-Coder]" header. Stops early when ``stop_event`` is set.
    """
    stop_event.clear()

    yield [{"role": "assistant", "content": "**[Qwen2.5-Coder]**\n\n"}]

    formatted = (
        "<|im_start|>system\n"
        + _QWEN_SYSTEM_PROMPT
        + "<|im_end|>\n"
        "<|im_start|>user\n" + prompt + "\n<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    resp = ""
    for chunk in qwen_model(
        formatted,
        stream=True,
        max_tokens=800,
        temperature=0.1,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.05,
    ):
        if stop_event.is_set():
            break

        # Accept either a completion-style chunk ("text") or a chat-style
        # chunk ("delta" -> "content"); fall back to an empty string.
        choice = chunk["choices"][0]
        token = (
            choice.get("text")
            or choice.get("delta", {}).get("content", "")
            or ""
        )
        resp += token
        yield [{"role": "assistant", "content": f"**[Qwen2.5-Coder]**\n\n{resp}"}]


# ==============================
# MAIN CHAT (MESSAGES FORMAT)
# ==============================
def _normalize_history(history):
    """Convert chatbot history into a list of role/content message dicts.

    Dict entries that carry a "role" key are kept as they are. Two-item
    list/tuple entries are treated as ``(user, assistant)`` pairs and
    expanded; empty halves are dropped. Anything else is ignored.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict) and "role" in entry:
            messages.append(entry)
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            # Unpack the pair itself; iterating over it would try to
            # unpack each string into two names and fail.
            user_text, assistant_text = entry
            if user_text:
                messages.append({"role": "user", "content": user_text})
            if assistant_text:
                messages.append({"role": "assistant", "content": assistant_text})
    return messages


def chat(message, history):
    """Answer *message* and stream the growing conversation.

    Yields the full message list (prior history, the new user message and
    the partial assistant reply) after every generated chunk. The reply
    comes from Qwen when ``is_coding_or_math(message)`` is true, otherwise
    from Mistral. A blank message yields the history unchanged.
    """
    stop_event.clear()

    messages = _normalize_history(history)

    # Nothing to answer: leave the conversation as it is.
    if not message or not message.strip():
        yield messages
        return

    messages.append({"role": "user", "content": message})

    if is_coding_or_math(message):
        streamer = stream_qwen(message)
    else:
        streamer = stream_mistral(message)

    partial = messages.copy()
    first = True
    for chunk in streamer:
        if stop_event.is_set():
            break
        if first:
            # First chunk adds the assistant turn; later ones replace it.
            partial.append(chunk[0])
            first = False
        else:
            partial[-1] = chunk[0]
        yield partial


def stop():
    """Signal the running generation loops to stop."""
    stop_event.set()


# ==============================
# UI
# ==============================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Dual Local AI — Clean Responses (No Echoing!)\n"
        "**Code/Math → Qwen2.5-Coder** | **Chat → Mistral**"
    )
    chatbot = gr.Chatbot(height=720, type="messages", show_copy_button=True)

    with gr.Row():
        txt = gr.Textbox(
            placeholder="Ask anything…", label="Message", lines=4, scale=8
        )
        send = gr.Button("Send", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")

    # Clear the textbox once the streamed reply has finished.
    send.click(chat, [txt, chatbot], chatbot).then(
        lambda: gr.update(value=""), outputs=txt
    )
    txt.submit(chat, [txt, chatbot], chatbot).then(
        lambda: gr.update(value=""), outputs=txt
    )
    stop_btn.click(stop)

print("Launching FINAL version (no echoing, no crashes)...")
demo.launch(server_port=7860, inbrowser=True)