import gradio as gr
from threading import Thread
from llama_cpp import Llama
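
# Dependencies (assumed package names): pip install gradio llama-cpp-python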
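
# Quantized GGUF chat model pulled from the Hugging Face Hub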
MODEL_REPO = "hieupt/TinyLlama-1.1B-Chat-v1.0-Q4_K_M-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0-q4_k_m.gguf"
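
# Global model handle, filled in by the background loader thread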
llm = None


def load_model():
    global llm
    print("🔄 Loading the VibeThinker model...")
    llm = Llama.from_pretrained(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
        chat_format="chatml",
        n_ctx=4096,
        n_threads=6,
        n_batch=256,
        verbose=False
    )
    print("✅ The model is ready to use!")
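

# Load the model in a background thread so the UI starts immediately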
Thread(target=load_model).start()
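

# Streaming chat handler: yields the growing message list so Gradio updates live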
def chat_stream(message, history):
    # While the model is still loading, echo the user's turn with a placeholder reply
    if llm is None:
        yield history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": "⏳ The model is still loading..."},
        ]
        return

    # Append the new user turn to the conversation history
    messages = history + [{"role": "user", "content": message}]

    # Request a streamed chat completion from llama.cpp
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stream=True,
    )

    # Accumulate streamed tokens and re-yield the growing assistant reply
    partial = ""
    for chunk in stream:
        if "choices" in chunk:
            delta = chunk["choices"][0]["delta"].get("content", "")
            partial += delta
            yield messages + [{"role": "assistant", "content": partial}]
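

# Build the Gradio UI: a streaming chatbot, a message box, and a clear button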
with gr.Blocks(title="VibeThinker GGUF Chat") as demo:
    gr.Markdown("### 🤖 **VibeThinker** model chat with llama.cpp and real-time streaming")

    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Message")
    clear = gr.Button("Clear conversation")

    msg.submit(chat_stream, [msg, chatbot], chatbot)
    clear.click(lambda: [], None, chatbot, queue=False)

demo.queue().launch(server_name="0.0.0.0", server_port=7860)