import gradio as gr
import os
import time
from typing import Iterator
import threading

# Global variables
llm = None
model_loading = True
model_error = None


def load_model():
    """Load the GGUF model"""
    global llm, model_loading, model_error
    try:
        print("🔄 Loading model...")
        from llama_cpp import Llama

        # Initialize model with optimized settings for CPU-only inference
        llm = Llama.from_pretrained(
            repo_id="Tohirju/Ameena_Qwen3-8B_e3_Quantised_gguf",
            filename="Ameena_Qwen3-8B_e3.gguf",
            # CPU-optimized settings
            n_ctx=2048,          # Context length
            n_threads=None,      # Use all available CPU threads
            n_gpu_layers=0,      # CPU only
            use_mmap=True,       # Memory mapping for efficiency
            use_mlock=False,     # Don't lock memory (can cause issues on some systems)
            n_batch=512,         # Batch size for prompt processing
            verbose=False,       # Reduce output noise
            # Additional optimizations
            offload_kqv=False,   # Keep KV cache on CPU
            f16_kv=True,         # Use 16-bit for KV cache
        )
        model_loading = False
        print("✅ Model loaded successfully!")
    except Exception as e:
        model_error = f"Model loading failed: {str(e)}"
        model_loading = False
        print(f"❌ {model_error}")


def chat_with_model(
    message: str,
    history: list,
    system_message: str = "Шумо ёвари хуб ҳастед ва ба забони тоҷикӣ ҷавоб медиҳед.",
    max_tokens: int = 150,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> Iterator[str]:
    """Chat function that streams responses"""
    # Check if model is ready
    if model_loading:
        yield "⏳ Model is still loading, please wait..."
        return
    if model_error:
        yield f"❌ Model error: {model_error}"
        return
    if llm is None:
        yield "❌ Model not loaded. Please refresh the page."
        return

    try:
        # Build conversation history
        messages = []

        # Add system message if provided
        if system_message.strip():
            messages.append({"role": "system", "content": system_message})

        # Add conversation history
        for user_msg, assistant_msg in history:
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

        # Add current message
        messages.append({"role": "user", "content": message})

        # Generate response with streaming
        response_stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True,
            stop=["</s>", "User:", "Human:", "Assistant:"],
            repeat_penalty=1.1,
        )

        # Stream the response
        partial_response = ""
        for chunk in response_stream:
            if chunk["choices"][0]["delta"].get("content"):
                partial_response += chunk["choices"][0]["delta"]["content"]
                yield partial_response

    except Exception as e:
        yield f"❌ Generation error: {str(e)}"


def get_model_status():
    """Get current model status"""
    if model_loading:
        return "🔄 Loading model... Please wait."
    elif model_error:
        return f"❌ Error: {model_error}"
    elif llm is not None:
        return "✅ Model ready!"
    else:
        return "❓ Unknown status"
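
# ---------------------------------------------------------------------------
# Local debugging sketch (commented out; not part of the app's control flow).
# Because load_model() runs in a daemon thread below, a failed download or an
# out-of-memory error only surfaces through get_model_status(). For diagnosis
# it can help to run the pipeline synchronously from a REPL -- the prompt text
# here is just an example:
#
#   load_model()                          # blocks until the GGUF is in memory
#   for text in chat_with_model("Салом!", history=[]):
#       print(text)
# ---------------------------------------------------------------------------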

# Load model in background thread
model_thread = threading.Thread(target=load_model, daemon=True)
model_thread.start()

# Create Gradio interface
with gr.Blocks(
    title="🇹🇯 Ameena Qwen3-8B Tajik Language Model",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 800px !important;
        margin: auto !important;
    }
    """
) as demo:
    gr.Markdown("""
    # 🇹🇯 Ameena Qwen3-8B - Tajik Language Model

    **Model**: Quantized GGUF (4GB) | **Backend**: CPU Only | **Language**: Tajik

    Base model: Qwen3-8B fine-tuned for Tajik language
    """)

    # Model status
    status_display = gr.Markdown(get_model_status())

    # Main chat interface
    chatbot = gr.Chatbot(
        height=400,
        show_label=False,
        show_copy_button=True,
    )

    with gr.Row():
        msg = gr.Textbox(
            placeholder="Салом! Саволи худро дар ин ҷо бинависед... (Hello! Write your question here...)",
            show_label=False,
            scale=4,
        )
        submit_btn = gr.Button("Send", scale=1, variant="primary")

    # Advanced settings
    with gr.Accordion("⚙️ Settings", open=False):
        system_msg = gr.Textbox(
            value="Шумо ёвари хуб ҳастед ва ба забони тоҷикӣ ҷавоб медиҳед.",
            label="System Message (Tajik)",
            info="Instructions for the model in Tajik language",
        )
        with gr.Row():
            max_tokens = gr.Slider(
                minimum=50,
                maximum=300,
                value=150,
                step=10,
                label="Max Tokens",
                info="Maximum response length",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Response creativity (higher = more creative)",
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p",
                info="Nucleus sampling parameter",
            )

    # Example prompts
    gr.Examples(
        examples=[
            ["Салом! Чӣ хел ҳастед?"],
            ["Тоҷикистон дар куҷо ҷойгир аст?"],
            ["Барномасозӣ чист ва чӣ гуна кор мекунад?"],
            ["Оиди забони тоҷикӣ маълумот диҳед"],
            ["Шеър дар бораи табиат нависед"],
        ],
        inputs=msg,
        label="💡 Example Questions",
    )

    def respond(message, history, system_message, max_tokens, temperature, top_p):
        """Handle user message and stream the model's response"""
        if not message.strip():
            yield history, ""
            return

        # Add user message to history
        history.append([message, None])

        # Generate response
        response_generator = chat_with_model(
            message,
            history[:-1],
            system_message,
            max_tokens,
            temperature,
            top_p,
        )

        # Stream response
        for partial_response in response_generator:
            history[-1][1] = partial_response
            yield history, ""

    def clear_chat():
        """Clear chat history"""
        return [], ""

    def update_status():
        """Update model status display"""
        return get_model_status()

    # Event handlers
    submit_btn.click(
        respond,
        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, msg],
    )

    msg.submit(
        respond,
        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, msg],
    )

    # Clear button
    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
    clear_btn.click(clear_chat, outputs=[chatbot, msg])

    # Refresh status button
    refresh_btn = gr.Button("🔄 Refresh Status", variant="secondary")
    refresh_btn.click(update_status, outputs=status_display)

    # Auto-refresh status every 5 seconds during loading
    demo.load(update_status, outputs=status_display, every=5)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        share=False,
        quiet=False,
    )
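
# ---------------------------------------------------------------------------
# Dependency sketch (an assumption -- the original script does not pin its
# requirements). A minimal requirements.txt for this app would look roughly
# like:
#
#   gradio
#   llama-cpp-python
#   huggingface-hub   # used by Llama.from_pretrained to fetch the GGUF
# ---------------------------------------------------------------------------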