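"""Gradio chat app for Ameena Qwen3-8B, a Tajik-language fine-tune of Qwen3-8B.

Loads the quantized GGUF build from Tohirju/Ameena_Qwen3-8B_e3_Quantised_gguf
and runs CPU-only inference via llama-cpp-python, streaming responses into a
chat UI.
"""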
import threading
from typing import Iterator

import gradio as gr
# Global model state: written by the background loader thread, read by the UI.
llm = None
model_loading = True
model_error = None
def load_model():
    """Load the quantized GGUF model from the Hugging Face Hub."""
    global llm, model_loading, model_error
    try:
        print("🔄 Loading model...")
        from llama_cpp import Llama

        # Initialize the model with settings tuned for CPU-only inference.
        llm = Llama.from_pretrained(
            repo_id="Tohirju/Ameena_Qwen3-8B_e3_Quantised_gguf",
            filename="Ameena_Qwen3-8B_e3.gguf",
            # CPU-optimized settings
            n_ctx=2048,  # Context length
            n_threads=None,  # Use all available CPU threads
            n_gpu_layers=0,  # CPU only
            use_mmap=True,  # Memory-map the model file for efficiency
            use_mlock=False,  # Don't lock memory (can cause issues on some systems)
            n_batch=512,  # Batch size for prompt processing
            verbose=False,  # Reduce log noise
            # Additional optimizations
            offload_kqv=False,  # Keep the KV cache on the CPU
            f16_kv=True,  # 16-bit KV cache (accepted by older llama-cpp-python builds)
        )
        model_loading = False
        print("✅ Model loaded successfully!")
    except Exception as e:
        model_error = f"Model loading failed: {e}"
        model_loading = False
        print(f"❌ {model_error}")
def chat_with_model(
    message: str,
    history: list,
    system_message: str = "Шумо ёвари хуб ҳастед ва ба забони тоҷикӣ ҷавоб медиҳед.",
    max_tokens: int = 150,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> Iterator[str]:
    """Stream a chat response to `message`, given the prior `history`.

    The default system message is Tajik for "You are a good assistant and
    you answer in the Tajik language."
    """
    # Check whether the model is ready.
    if model_loading:
        yield "⏳ Model is still loading, please wait..."
        return
    if model_error:
        yield f"❌ Model error: {model_error}"
        return
    if llm is None:
        yield "❌ Model not loaded. Please refresh the page."
        return

    try:
        # Build the conversation in chat-completion format.
        messages = []

        # Add the system message if provided.
        if system_message.strip():
            messages.append({"role": "system", "content": system_message})

        # Add the conversation history ([user, assistant] pairs).
        for user_msg, assistant_msg in history:
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

        # Add the current message.
        messages.append({"role": "user", "content": message})

        # Generate the response with streaming enabled.
        response_stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True,
            stop=["</s>", "User:", "Human:", "Assistant:"],
            repeat_penalty=1.1,
        )

        # Accumulate streamed deltas, yielding the growing response each time.
        partial_response = ""
        for chunk in response_stream:
            if chunk["choices"][0]["delta"].get("content"):
                partial_response += chunk["choices"][0]["delta"]["content"]
                yield partial_response
    except Exception as e:
        yield f"❌ Generation error: {e}"
def get_model_status():
    """Return a human-readable status string for the model."""
    if model_loading:
        return "🔄 Loading model... Please wait."
    elif model_error:
        return f"❌ Error: {model_error}"
    elif llm is not None:
        return "✅ Model ready!"
    else:
        return "❓ Unknown status"
# Load the model in a background daemon thread so the UI starts immediately.
model_thread = threading.Thread(target=load_model, daemon=True)
model_thread.start()
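# While the model loads, chat_with_model() yields a "please wait" message and
# get_model_status() reports progress, so the interface below stays responsive.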
# Create the Gradio interface.
with gr.Blocks(
    title="🇹🇯 Ameena Qwen3-8B Tajik Language Model",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 800px !important;
        margin: auto !important;
    }
    """,
) as demo:
    gr.Markdown("""
    # 🇹🇯 Ameena Qwen3-8B - Tajik Language Model

    **Model**: Quantized GGUF (4 GB) | **Backend**: CPU only | **Language**: Tajik

    Base model: Qwen3-8B, fine-tuned for Tajik.
    """)
    # Model status
    status_display = gr.Markdown(get_model_status())

    # Main chat interface ([user, assistant] tuple history format)
    chatbot = gr.Chatbot(
        height=400,
        show_label=False,
        show_copy_button=True,
    )

    with gr.Row():
        msg = gr.Textbox(
            placeholder="Салом! Саволи худро дар ин ҷо бинависед... (Hello! Write your question here...)",
            show_label=False,
            scale=4,
        )
        submit_btn = gr.Button("Send", scale=1, variant="primary")
    # Advanced settings
    with gr.Accordion("⚙️ Settings", open=False):
        # Default system message (Tajik): "You are a good assistant and you
        # answer in the Tajik language."
        system_msg = gr.Textbox(
            value="Шумо ёвари хуб ҳастед ва ба забони тоҷикӣ ҷавоб медиҳед.",
            label="System Message (Tajik)",
            info="Instructions for the model, in Tajik",
        )
        with gr.Row():
            max_tokens = gr.Slider(
                minimum=50,
                maximum=300,
                value=150,
                step=10,
                label="Max Tokens",
                info="Maximum response length",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Response creativity (higher = more creative)",
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p",
                info="Nucleus sampling parameter",
            )
    # Example prompts
    gr.Examples(
        examples=[
            ["Салом! Чӣ хел ҳастед?"],  # "Hello! How are you?"
            ["Тоҷикистон дар куҷо ҷойгир аст?"],  # "Where is Tajikistan located?"
            ["Барномасозӣ чист ва чӣ гуна кор мекунад?"],  # "What is programming and how does it work?"
            ["Оиди забони тоҷикӣ маълумот диҳед"],  # "Tell me about the Tajik language"
            ["Шеър дар бораи табиат нависед"],  # "Write a poem about nature"
        ],
        inputs=msg,
        label="💡 Example Questions",
    )
    def respond(message, history, system_message, max_tokens, temperature, top_p):
        """Handle a user message and stream the model's response into the chat."""
        # Ignore empty input (a generator handler must yield, not return, a value).
        if not message.strip():
            yield history, ""
            return

        # Append the user message with an empty assistant slot.
        history.append([message, None])

        # Stream the response, filling in the assistant slot as tokens arrive.
        for partial_response in chat_with_model(
            message, history[:-1], system_message, max_tokens, temperature, top_p
        ):
            history[-1][1] = partial_response
            yield history, ""
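    # Gradio treats generator event handlers as streaming updates: each
    # `yield history, ""` repaints the Chatbot with the partial response
    # and keeps the input box cleared.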
    def clear_chat():
        """Clear the chat history and the input box."""
        return [], ""

    def update_status():
        """Refresh the model status display."""
        return get_model_status()
    # Event handlers: the Send button and pressing Enter both submit.
    submit_btn.click(
        respond,
        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, msg],
    )
    msg.submit(
        respond,
        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, msg],
    )

    # Clear button
    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
    clear_btn.click(clear_chat, outputs=[chatbot, msg])

    # Refresh status button
    refresh_btn = gr.Button("🔄 Refresh Status", variant="secondary")
    refresh_btn.click(update_status, outputs=status_display)

    # Poll the model status every 5 seconds so the display updates during loading.
    demo.load(update_status, outputs=status_display, every=5)
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # Listen on all interfaces (as on Hugging Face Spaces)
        server_port=7860,
        show_error=True,
        share=False,
        quiet=False,
    )
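# To run locally (a sketch; the package set is inferred from the imports above):
#     pip install gradio llama-cpp-python huggingface_hub
#     python app.py
# then open http://localhost:7860 in a browser.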