optimized_model / app.py
sayalimetkar's picture
Update app.py
16107a7 verified
import re
import threading

import gradio as gr
from ctransformers import AutoModelForCausalLM
from llama_cpp import Llama
# ==============================
# LOAD MODELS – OPTIMAL SPEED
# ==============================
# Mistral-7B-Instruct (GGUF, Q4_K_M) is served through ctransformers and
# handles general chat. The first positional argument is a Hub repo id here;
# ctransformers also accepts a path to a local .gguf file in its place.
print("Loading Mistral from HuggingFace Hub...")
mistral_model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    model_type="mistral",
    threads=8,
    batch_size=512,
    context_length=8192,
    gpu_layers=0,  # CPU only
    temperature=0.7,
    top_p=0.9,
    top_k=30,
    repetition_penalty=1.1,
    max_new_tokens=1024,
)

# Qwen2.5-Coder (GGUF, Q4_K_M) is served through llama-cpp-python and handles
# code/math questions.
# Llama(model_path=...) only accepts a path to a local .gguf file, so a Hub
# repo id has to go through Llama.from_pretrained(): it downloads `filename`
# from `repo_id` (needs the huggingface_hub package installed) and forwards
# the remaining keyword arguments to the Llama constructor.
print("Loading Qwen2.5-Coder from HuggingFace Hub...")
qwen_model = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
    filename="qwen2.5-coder-7b-instruct-q4_k_m.gguf",
    n_ctx=8192,
    n_threads=4,  # author's note: fastest on CPU
    n_batch=512,  # author's note: fastest on CPU
    n_gpu_layers=0,  # change to 35-99 if a GPU is available
    use_mlock=True,
    verbose=False,
)

# Shared flag: stop() sets it, the streaming loops poll it to abort early.
stop_event = threading.Event()
# ==============================
# SMART DETECTION
# ==============================
# ==============================
# BULLETPROOF CODE DETECTION (Qwen will catch EVERYTHING now)
# ==============================
# ==============================
# BULLETPROOF DETECTION — MATH + CODE = ALWAYS QWEN
# ==============================
# Keywords that mark a message as math-related. Every entry must be lowercase
# because the message is lowercased before matching.
_MATH_TRIGGERS = (
    # General math
    "next number", "series", "sequence", "pattern", "find the next",
    "solve", "calculate", "equation", "math", "mathematics", "integral",
    "derivative", "limit", "factorial", "prime", "composite",
    "geometry", "algebra", "probability", "statistics", "number",
    "compute", "simplify", "evaluate", "expression", "fraction",
    "decimal", "percentage", "ratio", "proportion", "root", "square root",
    "logarithm", "log", "ln", "exponent", "power", "base",
    "matrix", "determinant", "vector", "dot product", "cross product",
    "trigonometry", "sine", "cosine", "tan", "cot", "sec", "cosec",
    "triangle", "circle", "radius", "diameter", "area", "perimeter",
    "volume", "surface area", "integrate", "differentiate",
    "quadratic", "polynomial", "cubic", "linear equation",
    "graph", "intercept", "slope", "intersection", "domain", "range",
    "modulus", "absolute", "complex number", "imaginary", "real number",
    "mean", "median", "mode", "variance", "standard deviation",
    "correlation", "regression", "distribution", "normal distribution",
    "binomial", "poisson", "combinatorics", "permutation", "combination",
    "set theory", "subset", "union", "probability of",
)

# Keywords that mark a message as programming-related (also all lowercase).
_CODE_TRIGGERS = (
    # General programming
    "code", "program", "coding", "script", "implement", "build",
    "function", "method", "class", "object", "module", "package",
    "syntax", "runtime", "variable", "parameter", "argument",
    "return", "loop", "for loop", "while loop", "if statement",
    "condition", "boolean", "string", "array", "list", "dictionary",
    "hashmap", "tuple", "stack", "queue", "tree", "graph", "linked list",
    "pointer", "reference", "memory", "heap", "stack memory",
    # Languages
    "python", "java", "javascript", "typescript", "c++", "c#", "c language",
    "go", "rust", "php", "sql", "html", "css", "react", "nodejs",
    "json", "xml", "yaml", "bash", "shell script",
    # Data science / ML
    "pandas", "numpy", "sklearn", "tensorflow", "pytorch",
    "dataframe", "dataset", "model training", "machine learning",
    "neural network", "deep learning",
    # Debugging & errors
    "debug", "traceback", "error", "bug", "fix this code",
    "segmentation fault", "stack overflow", "undefined variable",
    # Algorithms
    "algorithm", "time complexity", "space complexity",
    "big o notation", "sort", "merge sort", "quick sort",
    "binary search", "dynamic programming", "recursion",
    "graph traversal", "dfs", "bfs", "greedy algorithm",
    # DevOps / tools
    "docker", "kubernetes", "api", "rest api", "jwt",
    "server", "client", "database", "mongodb", "mysql",
    "postgres", "orm", "deploy", "deployment", "kafka",
    # Competitive coding
    "leetcode", "hackerrank", "codechef", "geeksforgeeks",
)

# Triggers this short are frequent fragments of everyday words ("go" in
# "good", "tan" in "important", "api" in "capital"), so they only count when
# they stand alone as a complete word.
_WHOLE_WORD_MAX_LEN = 3


def _compile_trigger_pattern(triggers):
    """Build one regex that finds any trigger starting at a word boundary.

    Triggers longer than ``_WHOLE_WORD_MAX_LEN`` may be followed by further
    letters so inflected forms still match ("program" -> "programming").
    Shorter triggers must be a whole word. Lookarounds are used instead of
    ``\\b`` because triggers such as "c++" and "c#" end in non-word characters.
    """
    whole_words = []
    prefixes = []
    for trigger in sorted(set(triggers)):
        bucket = whole_words if len(trigger) <= _WHOLE_WORD_MAX_LEN else prefixes
        bucket.append(re.escape(trigger))

    alternatives = []
    if whole_words:
        alternatives.append("(?:" + "|".join(whole_words) + r")(?!\w)")
    if prefixes:
        alternatives.append("(?:" + "|".join(prefixes) + ")")
    return re.compile(r"(?<!\w)(?:" + "|".join(alternatives) + ")")


# Compiled once at import time instead of rebuilding the lists on every call.
_TRIGGER_RE = _compile_trigger_pattern(_MATH_TRIGGERS + _CODE_TRIGGERS)
_DIGIT_RE = re.compile(r"\d")
_MATH_SYMBOL_RE = re.compile(r"[+\-*/=^()\[\]{}]")
_NUMBER_LIST_RE = re.compile(r"\d\s*,\s*\d")


def is_coding_or_math(text: str) -> bool:
    """Decide whether a message should be routed to the code/math model.

    The check is case-insensitive and returns True when any of these hold:

    * a math or coding trigger keyword appears at the start of a word
      (short triggers must match a whole word);
    * the text contains a digit together with an operator or bracket
      character from ``+-*/=^()[]{}``;
    * the text contains comma-separated digits, e.g. ``2, 6, 12, 20``.

    Args:
        text: The raw user message.

    Returns:
        True if the message looks like a coding or math request, else False.
        Empty input returns False.
    """
    if not text:
        return False
    text = text.lower()

    # Any math or code keyword -> Qwen
    if _TRIGGER_RE.search(text):
        return True
    # Digits combined with math symbols / brackets -> Qwen
    if _DIGIT_RE.search(text) and _MATH_SYMBOL_RE.search(text):
        return True
    # Comma-separated numbers (like 2, 6, 12, 20) -> Qwen
    return _NUMBER_LIST_RE.search(text) is not None
# ==============================
# FIXED STREAMING (NO ECHOING!)
# ==============================
def stream_mistral(prompt):
    """Stream a Mistral reply as successive single-message chat updates.

    Clears the shared stop flag, emits a header-only assistant message, then
    emits the growing (whitespace-stripped) reply after every generated token.
    Generation ends early as soon as ``stop_event`` is set.

    Args:
        prompt: The user's message text.

    Yields:
        A one-element list holding an assistant message dict
        (``{"role": "assistant", "content": ...}``).
    """
    stop_event.clear()

    header = "**[Mistral]**\n\n"
    instructions = " ".join((
        "You are a helpful, concise assistant.",
        "Do NOT repeat the user's question.",
        "Answer directly and clearly.",
    ))
    model_input = f"<s>[INST] <<SYS>>{instructions}<</SYS>> {prompt} [/INST]"

    # Show the model label right away, before the first token arrives.
    yield [{"role": "assistant", "content": header}]

    pieces = []
    token_stream = mistral_model(
        model_input,
        stream=True,
        max_new_tokens=800,
        stop=["</s>"],
    )
    for piece in token_stream:
        if stop_event.is_set():
            return
        pieces.append(piece)
        reply_so_far = "".join(pieces).strip()
        yield [{"role": "assistant", "content": header + reply_so_far}]
def stream_qwen(prompt):
    """Stream a Qwen2.5-Coder reply as successive single-message chat updates.

    Clears the shared stop flag, emits a header-only assistant message, then
    emits the accumulated reply after every streamed chunk. Generation ends
    early as soon as ``stop_event`` is set.

    Args:
        prompt: The user's message text.

    Yields:
        A one-element list holding an assistant message dict
        (``{"role": "assistant", "content": ...}``).
    """
    stop_event.clear()
    header = "**[Qwen2.5-Coder]**\n\n"
    answer = ""

    # Show the model label right away, before the first token arrives.
    yield [{"role": "assistant", "content": header}]

    system_turn = (
        "<|im_start|>system\n"
        "You are a world-class math and coding assistant. "
        "ALWAYS respond with clean LaTeX. Use $...$ for inline and $$...$$ for display. "
        "Use \\boxed{} for final answers.\n"
        "<|im_end|>\n"
    )
    user_turn = "<|im_start|>user\n" + prompt + "\n<|im_end|>\n"
    model_input = system_turn + user_turn + "<|im_start|>assistant\n"

    sampling = {
        "max_tokens": 800,
        "temperature": 0.1,
        "top_p": 0.9,
        "top_k": 20,
        "repeat_penalty": 1.05,
    }
    for event in qwen_model(model_input, stream=True, **sampling):
        if stop_event.is_set():
            return
        # Safe extraction: accept either a completion-style "text" field or a
        # chat-style "delta" -> "content" field, falling back to "".
        first_choice = event["choices"][0]
        piece = first_choice.get("text")
        if not piece:
            piece = first_choice.get("delta", {}).get("content", "") or ""
        answer += piece
        yield [{"role": "assistant", "content": header + answer}]
# ==============================
# MAIN CHAT — WORKS WITH MESSAGES FORMAT
# ==============================
def chat(message, history):
    """Route a user message to the right model and stream the conversation.

    The existing history is normalised to a list of ``{"role", "content"}``
    dicts, the new user message is appended, and the reply is streamed from
    Qwen when ``is_coding_or_math(message)`` is true, otherwise from Mistral.
    Only ``message`` is sent to the model; the history is carried along purely
    so the chat display keeps the earlier turns.

    Args:
        message: The text the user just submitted.
        history: Previous turns, either as message dicts (Gradio's
            ``type="messages"`` format) or as legacy ``[user, assistant]``
            pairs. ``None`` is treated as an empty history.

    Yields:
        The full message list (history + user message + partial assistant
        reply) after each streamed update.
    """
    stop_event.clear()

    messages = []
    for entry in history or []:
        if isinstance(entry, dict) and "role" in entry:
            messages.append(entry)
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            # Legacy format: each entry is ONE (user, assistant) pair, so the
            # pair itself is unpacked rather than iterated over.
            user_text, assistant_text = entry
            if user_text:
                messages.append({"role": "user", "content": user_text})
            if assistant_text:
                messages.append({"role": "assistant", "content": assistant_text})
        # Anything else (e.g. a dict without "role") is skipped.
    messages.append({"role": "user", "content": message})

    if is_coding_or_math(message):
        streamer = stream_qwen(message)
    else:
        streamer = stream_mistral(message)

    for update in streamer:
        if stop_event.is_set():
            break
        # Each update carries the whole reply so far, so it replaces the
        # previous assistant message rather than being appended to it.
        yield [*messages, update[0]]
def stop():
    """Signal the running generation to halt by setting the shared stop flag."""
    stop_event.set()
# ==============================
# UI
# ==============================
# Build the chat UI: a header, the message-format chatbot, and one input row
# with the text box plus Send/Stop buttons.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # The header previously contained mis-encoded bytes where an em dash and
    # arrows were intended; the real characters are restored here.
    gr.Markdown(
        "# Dual Local AI — Clean Responses (No Echoing!)\n"
        "**Code/Math → Qwen2.5-Coder** | **Chat → Mistral**"
    )
    chatbot = gr.Chatbot(height=720, type="messages", show_copy_button=True)
    with gr.Row():
        txt = gr.Textbox(placeholder="Ask anything…", label="Message", lines=4, scale=8)
        send = gr.Button("Send", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")

    # Clicking Send and pressing Enter behave the same: stream the reply into
    # the chatbot, then empty the input box once streaming has finished.
    send.click(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)
    txt.submit(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)
    # Stop only raises the shared flag; the streaming loops notice it and end.
    stop_btn.click(stop)

print("Launching FINAL version (no echoing, no crashes)...")
demo.launch(server_port=7860, inbrowser=True)