# Provenance: model/model_final.py — uploaded to Hugging Face by sayalimetkar
# (commit e69868f, verified). The raw page header lines were commented out
# because they are not valid Python.
# model_final.py ← FINAL VERSION: No more echoing, no crashes, super fast
from ctransformers import AutoModelForCausalLM
from llama_cpp import Llama
import gradio as gr
import re
import threading
# ==============================
# MODEL LOADING — CPU-tuned settings
# ==============================
# NOTE(review): both model paths are hard-coded absolute Windows paths —
# confirm they exist on the target machine, or make them configurable
# (env var / CLI argument) before sharing this script.
print("Loading Mistral...")
mistral_model = AutoModelForCausalLM.from_pretrained(
    r"C:\Users\ksrvisitor\Downloads\optimizationmodel\quant_model.gguf",
    model_type="mistral",
    threads=8,                # CPU worker threads
    batch_size=512,
    context_length=8192,
    gpu_layers=0,             # CPU-only; raise to offload layers to a GPU
    temperature=0.7,
    top_p=0.9,
    top_k=30,
    repetition_penalty=1.1,
    max_new_tokens=1024
)

print("Loading Qwen2.5-Coder...")
qwen_model = Llama(
    r"C:\Users\ksrvisitor\Downloads\qwen2.5-coder-7b-instruct-q4_k_m.gguf",
    n_ctx=8192,
    n_threads=4,       # Fastest on CPU
    n_batch=512,       # Fastest on CPU
    n_gpu_layers=0,    # Change to 35–99 if GPU
    use_mlock=True,    # lock model pages in RAM to avoid swapping
    verbose=False
)

# Shared cancellation flag: set by the Stop button, polled by the
# streaming generators below so an in-flight generation can be aborted.
stop_event = threading.Event()

# ==============================
# PROMPT ROUTING — math/code prompts → Qwen, everything else → Mistral
# ==============================
def is_coding_or_math(text: str) -> bool:
    """Routing decision: return True when the prompt looks like math or code.

    True sends the prompt to Qwen2.5-Coder; False sends it to Mistral.
    Detection is keyword- and pattern-based, deliberately broad.
    """
    lowered = text.lower()

    # Keywords that suggest a math / number-series question.
    math_words = (
        "next number", "series", "sequence", "pattern", "find the next", "what comes next",
        "solve", "calculate", "equation", "math", "mathematics", "integral", "derivative",
        "factorial", "prime", "geometry", "algebra", "probability", "statistics",
        "seconds", "minutes", "hours", "number", "triangular",
    )
    # Keywords that suggest a programming question.
    code_words = (
        "code", "program", "write a", "implement", "function", "class", "python", "java",
        "c++", "javascript", "sql", "debug", "algorithm", "leetcode", "binary search",
    )

    # Any keyword hit routes to Qwen.
    for word in math_words + code_words:
        if word in lowered:
            return True

    # A digit together with any math-ish symbol also routes to Qwen.
    if re.search(r'\d', lowered) is not None:
        if any(sym in lowered for sym in "+-*/=^()[]{}"):
            return True

    # Comma-separated numbers (e.g. "2, 6, 12, 20") look like a series.
    return re.search(r'\d+\s*[,]\s*\d+', lowered) is not None
# ==============================
# FIXED STREAMING (NO ECHOING!)
# ==============================
def stream_mistral(prompt):
    """Stream a Mistral reply as Gradio "messages"-format chunks.

    Yields a single-element list holding the assistant message; the content
    grows token by token. Aborts early when ``stop_event`` is set.
    """
    stop_event.clear()
    header = "**[Mistral]**\n\n"

    # The system prompt discourages the model from echoing the question.
    sys_text = (
        "You are a helpful, concise assistant. "
        "Do NOT repeat the user's question. "
        "Answer directly and clearly."
    )
    wrapped = f"<s>[INST] <<SYS>>{sys_text}<</SYS>> {prompt} [/INST]"

    # Show the model badge immediately, before any tokens arrive.
    yield [{"role": "assistant", "content": header}]

    pieces = []
    for token in mistral_model(
        wrapped,
        stream=True,
        max_new_tokens=800,
        stop=["</s>"]
    ):
        if stop_event.is_set():
            break
        pieces.append(token)
        yield [{"role": "assistant", "content": header + "".join(pieces).strip()}]
def stream_qwen(prompt):
    """Stream a Qwen2.5-Coder reply as Gradio "messages"-format chunks.

    Yields a single-element list holding the assistant message; the content
    grows chunk by chunk. Aborts early when ``stop_event`` is set.
    """
    stop_event.clear()
    header = "**[Qwen2.5-Coder]**\n\n"

    # Show the model badge immediately, before any tokens arrive.
    yield [{"role": "assistant", "content": header}]

    # ChatML-style prompt expected by the Qwen instruct models.
    chatml = (
        "<|im_start|>system\n"
        "You are a world-class math and coding assistant. "
        "ALWAYS respond with clean LaTeX. Use $...$ for inline and $$...$$ for display. "
        "Use \\boxed{} for final answers.\n"
        "<|im_end|>\n"
        "<|im_start|>user\n" + prompt + "\n<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    answer = ""
    for chunk in qwen_model(
        chatml,
        stream=True,
        max_tokens=800,
        temperature=0.1,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.05
    ):
        if stop_event.is_set():
            break
        first_choice = chunk["choices"][0]
        # Tolerate both completion-style ("text") and chat-style
        # ("delta".."content") payloads without raising KeyError.
        piece = (
            first_choice.get("text")
            or first_choice.get("delta", {}).get("content", "")
            or ""
        )
        answer += piece
        yield [{"role": "assistant", "content": header + answer}]
# ==============================
# MAIN CHAT — WORKS WITH MESSAGES FORMAT
# ==============================
def chat(message, history):
    """Route the user's message to the right model and stream the reply.

    Args:
        message: the new user prompt (str).
        history: prior conversation — either Gradio ``type="messages"``
            dicts ({"role", "content"}) or legacy (user, assistant) tuples.

    Yields the full message list (normalized history + user message +
    streaming assistant reply) so the Chatbot re-renders on every token.
    """
    stop_event.clear()

    # Normalize history into the "messages" format (list of role/content dicts).
    messages = []
    for msg in history:
        if isinstance(msg, dict) and "role" in msg:
            messages.append(msg)
        elif isinstance(msg, (list, tuple)) and len(msg) == 2:
            # Legacy tuple format: each entry is one (user, assistant) pair.
            # BUGFIX: the old code did "for u, a in msg", which iterates
            # *inside* the pair and tries to unpack each string into two
            # variables — a ValueError for any normal string. Unpack the
            # pair directly instead.
            u, a = msg
            if u:
                messages.append({"role": "user", "content": u})
            if a:
                messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": message})

    # Math/code prompts go to Qwen; general chat goes to Mistral.
    streamer = stream_qwen(message) if is_coding_or_math(message) else stream_mistral(message)

    partial = messages.copy()
    first = True
    for chunk in streamer:
        if stop_event.is_set():
            break
        if first:
            # First chunk appends the new assistant message...
            partial.append(chunk[0])
            first = False
        else:
            # ...subsequent chunks replace it with the grown content.
            partial[-1] = chunk[0]
        yield partial
def stop():
    """Signal the active streaming generator to abort (wired to the Stop button)."""
    stop_event.set()
# ==============================
# UI
# ==============================
# Build and launch the Gradio UI: a chatbot, a message box, Send/Stop buttons.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Dual Local AI — Clean Responses (No Echoing!)\n**Code/Math → Qwen2.5-Coder** | **Chat → Mistral**")
    # type="messages" makes the Chatbot consume role/content dicts,
    # matching what chat() yields.
    chatbot = gr.Chatbot(height=720, type="messages", show_copy_button=True)
    with gr.Row():
        txt = gr.Textbox(placeholder="Ask anything…", label="Message", lines=4, scale=8)
        send = gr.Button("Send", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")
    # Both the Send button and Enter submit run chat(), then clear the textbox.
    send.click(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)
    txt.submit(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)
    # Stop sets the shared event; the streaming loops poll it and break.
    stop_btn.click(stop)

print("Launching FINAL version (no echoing, no crashes)...")
demo.launch(server_port=7860, inbrowser=True)