# Provenance: model/model_final.py — uploaded to Hugging Face by sayalimetkar
# (commit e69868f, verified). The raw page header lines were commented out
# because they are not valid Python.
# model_final.py ← FINAL VERSION: No more echoing, no crashes, super fast
from ctransformers import AutoModelForCausalLM
from llama_cpp import Llama
import gradio as gr
import re
import threading
# ==============================
# MODEL LOADING — CPU-tuned settings
# ==============================
# NOTE(review): both model paths are hard-coded absolute Windows paths —
# confirm they exist on the target machine, or make them configurable
# (env var / CLI argument) before sharing this script.
print("Loading Mistral...")
mistral_model = AutoModelForCausalLM.from_pretrained(
    r"C:\Users\ksrvisitor\Downloads\optimizationmodel\quant_model.gguf",
    model_type="mistral",
    threads=8,                # CPU worker threads
    batch_size=512,
    context_length=8192,
    gpu_layers=0,             # CPU-only; raise to offload layers to a GPU
    temperature=0.7,
    top_p=0.9,
    top_k=30,
    repetition_penalty=1.1,
    max_new_tokens=1024
)

print("Loading Qwen2.5-Coder...")
qwen_model = Llama(
    r"C:\Users\ksrvisitor\Downloads\qwen2.5-coder-7b-instruct-q4_k_m.gguf",
    n_ctx=8192,
    n_threads=4,       # Fastest on CPU
    n_batch=512,       # Fastest on CPU
    n_gpu_layers=0,    # Change to 35–99 if GPU
    use_mlock=True,    # lock model pages in RAM to avoid swapping
    verbose=False
)

# Shared cancellation flag: set by the Stop button, polled by the
# streaming generators below so an in-flight generation can be aborted.
stop_event = threading.Event()

# ==============================
# PROMPT ROUTING — math/code prompts → Qwen, everything else → Mistral
# ==============================
def is_coding_or_math(text: str) -> bool:
    """Routing decision: return True when the prompt looks like math or code.

    True sends the prompt to Qwen2.5-Coder; False sends it to Mistral.
    Detection is keyword- and pattern-based, deliberately broad.
    """
    lowered = text.lower()

    # Keywords that suggest a math / number-series question.
    math_words = (
        "next number", "series", "sequence", "pattern", "find the next", "what comes next",
        "solve", "calculate", "equation", "math", "mathematics", "integral", "derivative",
        "factorial", "prime", "geometry", "algebra", "probability", "statistics",
        "seconds", "minutes", "hours", "number", "triangular",
    )
    # Keywords that suggest a programming question.
    code_words = (
        "code", "program", "write a", "implement", "function", "class", "python", "java",
        "c++", "javascript", "sql", "debug", "algorithm", "leetcode", "binary search",
    )

    # Any keyword hit routes to Qwen.
    for word in math_words + code_words:
        if word in lowered:
            return True

    # A digit together with any math-ish symbol also routes to Qwen.
    if re.search(r'\d', lowered) is not None:
        if any(sym in lowered for sym in "+-*/=^()[]{}"):
            return True

    # Comma-separated numbers (e.g. "2, 6, 12, 20") look like a series.
    return re.search(r'\d+\s*[,]\s*\d+', lowered) is not None
# ==============================
# FIXED STREAMING (NO ECHOING!)
# ==============================
def stream_mistral(prompt):
    """Stream a Mistral reply as Gradio "messages"-format chunks.

    Yields a single-element list holding the assistant message; the content
    grows token by token. Aborts early when ``stop_event`` is set.
    """
    stop_event.clear()
    header = "**[Mistral]**\n\n"

    # The system prompt discourages the model from echoing the question.
    sys_text = (
        "You are a helpful, concise assistant. "
        "Do NOT repeat the user's question. "
        "Answer directly and clearly."
    )
    wrapped = f"<s>[INST] <<SYS>>{sys_text}<</SYS>> {prompt} [/INST]"

    # Show the model badge immediately, before any tokens arrive.
    yield [{"role": "assistant", "content": header}]

    pieces = []
    for token in mistral_model(
        wrapped,
        stream=True,
        max_new_tokens=800,
        stop=["</s>"]
    ):
        if stop_event.is_set():
            break
        pieces.append(token)
        yield [{"role": "assistant", "content": header + "".join(pieces).strip()}]
def stream_qwen(prompt):
    """Stream a Qwen2.5-Coder reply as Gradio "messages"-format chunks.

    Yields a single-element list holding the assistant message; the content
    grows chunk by chunk. Aborts early when ``stop_event`` is set.
    """
    stop_event.clear()
    header = "**[Qwen2.5-Coder]**\n\n"

    # Show the model badge immediately, before any tokens arrive.
    yield [{"role": "assistant", "content": header}]

    # ChatML-style prompt expected by the Qwen instruct models.
    chatml = (
        "<|im_start|>system\n"
        "You are a world-class math and coding assistant. "
        "ALWAYS respond with clean LaTeX. Use $...$ for inline and $$...$$ for display. "
        "Use \\boxed{} for final answers.\n"
        "<|im_end|>\n"
        "<|im_start|>user\n" + prompt + "\n<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    answer = ""
    for chunk in qwen_model(
        chatml,
        stream=True,
        max_tokens=800,
        temperature=0.1,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.05
    ):
        if stop_event.is_set():
            break
        first_choice = chunk["choices"][0]
        # Tolerate both completion-style ("text") and chat-style
        # ("delta".."content") payloads without raising KeyError.
        piece = (
            first_choice.get("text")
            or first_choice.get("delta", {}).get("content", "")
            or ""
        )
        answer += piece
        yield [{"role": "assistant", "content": header + answer}]
# ==============================
# MAIN CHAT — WORKS WITH MESSAGES FORMAT
# ==============================
def chat(message, history):
    """Route the user's message to the right model and stream the reply.

    Args:
        message: the new user prompt (str).
        history: prior conversation — either Gradio ``type="messages"``
            dicts ({"role", "content"}) or legacy (user, assistant) tuples.

    Yields the full message list (normalized history + user message +
    streaming assistant reply) so the Chatbot re-renders on every token.
    """
    stop_event.clear()

    # Normalize history into the "messages" format (list of role/content dicts).
    messages = []
    for msg in history:
        if isinstance(msg, dict) and "role" in msg:
            messages.append(msg)
        elif isinstance(msg, (list, tuple)) and len(msg) == 2:
            # Legacy tuple format: each entry is one (user, assistant) pair.
            # BUGFIX: the old code did "for u, a in msg", which iterates
            # *inside* the pair and tries to unpack each string into two
            # variables — a ValueError for any normal string. Unpack the
            # pair directly instead.
            u, a = msg
            if u:
                messages.append({"role": "user", "content": u})
            if a:
                messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": message})

    # Math/code prompts go to Qwen; general chat goes to Mistral.
    streamer = stream_qwen(message) if is_coding_or_math(message) else stream_mistral(message)

    partial = messages.copy()
    first = True
    for chunk in streamer:
        if stop_event.is_set():
            break
        if first:
            # First chunk appends the new assistant message...
            partial.append(chunk[0])
            first = False
        else:
            # ...subsequent chunks replace it with the grown content.
            partial[-1] = chunk[0]
        yield partial
def stop():
    """Signal the active streaming generator to abort (wired to the Stop button)."""
    stop_event.set()
# ==============================
# UI
# ==============================
# Build and launch the Gradio UI: a chatbot, a message box, Send/Stop buttons.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Dual Local AI — Clean Responses (No Echoing!)\n**Code/Math → Qwen2.5-Coder** | **Chat → Mistral**")
    # type="messages" makes the Chatbot consume role/content dicts,
    # matching what chat() yields.
    chatbot = gr.Chatbot(height=720, type="messages", show_copy_button=True)
    with gr.Row():
        txt = gr.Textbox(placeholder="Ask anything…", label="Message", lines=4, scale=8)
        send = gr.Button("Send", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")
    # Both the Send button and Enter submit run chat(), then clear the textbox.
    send.click(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)
    txt.submit(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)
    # Stop sets the shared event; the streaming loops poll it and break.
    stop_btn.click(stop)

print("Launching FINAL version (no echoing, no crashes)...")
demo.launch(server_port=7860, inbrowser=True)