import gradio as gr
from threading import Thread
from llama_cpp import Llama
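
# Dependencies (assumed package names): pip install gradio llama-cpp-python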
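
# Quantized GGUF chat model pulled from the Hugging Face Hub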
MODEL_REPO = "hieupt/TinyLlama-1.1B-Chat-v1.0-Q4_K_M-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0-q4_k_m.gguf"
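
# Global model handle, filled in by the background loader thread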
llm = None


def load_model():
    global llm
    print("🔄 Loading the VibeThinker model...")
    llm = Llama.from_pretrained(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
        chat_format="chatml",
        n_ctx=4096,
        n_threads=6,
        n_batch=256,
        verbose=False
    )
    print("✅ The model is ready to use!")
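

# Load the model in a background thread so the UI starts immediately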
Thread(target=load_model).start()
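

# Streaming chat handler: yields the growing message list so Gradio updates live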
def chat_stream(message, history):
    # While the model is still loading, echo the user's turn with a placeholder reply
    if llm is None:
        yield history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": "⏳ The model is still loading..."},
        ]
        return

    # Append the new user turn to the conversation history
    messages = history + [{"role": "user", "content": message}]

    # Request a streamed chat completion from llama.cpp
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stream=True,
    )

    # Accumulate streamed tokens and re-yield the growing assistant reply
    partial = ""
    for chunk in stream:
        if "choices" in chunk:
            delta = chunk["choices"][0]["delta"].get("content", "")
            partial += delta
            yield messages + [{"role": "assistant", "content": partial}]
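

# Build the Gradio UI: a streaming chatbot, a message box, and a clear button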
with gr.Blocks(title="VibeThinker GGUF Chat") as demo:
    gr.Markdown("### 🤖 **VibeThinker** model chat with llama.cpp and real-time streaming")

    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Message")
    clear = gr.Button("Clear conversation")

    msg.submit(chat_stream, [msg, chatbot], chatbot)
    clear.click(lambda: [], None, chatbot, queue=False)

demo.queue().launch(server_name="0.0.0.0", server_port=7860)