import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
# ----------------------------------------
# Global model cache
# ----------------------------------------
loaded_models = {} # Cache loaded Llama models
current_model_name = None
MODEL_CONFIGS = {
    "1B Model (Datangtang/GGUF1B)": {
        "repo_id": "Datangtang/GGUF1B",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf",
    },
    "3B Model (Datangtang/GGUF3B)": {
        "repo_id": "Datangtang/GGUF3B",
        "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf",
    },
}
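# Note: the repo IDs and filenames above are assumed to exist under the
# Datangtang account; Q4_K_M denotes a 4-bit k-quant GGUF, a common
# size/quality trade-off for CPU-only inference.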
# ----------------------------------------
# Load model function
# ----------------------------------------
def load_model(model_choice):
    global loaded_models, current_model_name
    if model_choice in loaded_models:
        print(f"Reusing already loaded model: {model_choice}")
        current_model_name = model_choice
        return loaded_models[model_choice]
    print(f"Downloading model: {model_choice}")
    cfg = MODEL_CONFIGS[model_choice]
    model_path = hf_hub_download(
        repo_id=cfg["repo_id"],
        filename=cfg["filename"],
        local_dir="./model",
        token=os.environ.get("HF_TOKEN"),  # falls back to anonymous access if unset
    )
    print(f"Model downloaded to: {model_path}")
    print("Loading GGUF model into memory...")
    llm = Llama(
        model_path=model_path,
        n_ctx=1024,       # context window (tokens)
        n_threads=6,      # CPU threads for inference
        n_batch=512,      # prompt-processing batch size
        n_gpu_layers=0,   # CPU-only
        use_mmap=True,    # memory-map the weights instead of copying them
        use_mlock=True,   # pin weights in RAM (may fail on low-memory hosts)
        verbose=False,
    )
    loaded_models[model_choice] = llm
    current_model_name = model_choice
    print("Model loaded successfully!")
    return llm
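# Optional sketch: eagerly loading every model at startup would hide the
# first-request download delay, at the cost of holding all of them in RAM:
#
#   for _name in MODEL_CONFIGS:
#       load_model(_name)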
# ----------------------------------------
# Chat function
# ----------------------------------------
def chat(message, history, model_choice):
    llm = load_model(model_choice)
    # System prompt
    conversation = "System: You are a helpful assistant.\n"
    # Convert ChatInterface history into a plain-text prompt, keeping only
    # the last 3 messages to stay well within n_ctx=1024
    for msg in history[-3:]:
        # ChatInterface "messages" format: {"role": "...", "content": "..."}
        if isinstance(msg, dict):
            role = msg.get("role")
            content = msg.get("content", "")
            if role == "user":
                conversation += f"User: {content}\n"
            elif role == "assistant":
                conversation += f"Assistant: {content}\n"
        # Safety: old (user, assistant) tuple format
        elif isinstance(msg, (list, tuple)):
            human, assistant = msg
            conversation += f"User: {human}\n"
            if assistant:
                conversation += f"Assistant: {assistant}\n"
    # Add the current message and cue the model to answer
    conversation += f"User: {message}\nAssistant:"
    # Generate the model response (plain completion, not chat-template based)
    response = llm(
        conversation,
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        repeat_penalty=1.1,
        stop=["User:", "Assistant:"],  # cut generation off if the model starts a new turn
        echo=False,  # return only the completion, not the prompt
    )
    return response["choices"][0]["text"].strip()
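# Sketch (not wired up here): llama-cpp-python can also stream tokens via
# llm(conversation, stream=True, ...), which yields partial completions, and
# gr.ChatInterface accepts a generator fn that yields the growing reply:
#
#   def chat_stream(message, history, model_choice):
#       ...build `conversation` as in chat() above...
#       partial = ""
#       for chunk in llm(conversation, max_tokens=128, stream=True):
#           partial += chunk["choices"][0]["text"]
#           yield partial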
# ----------------------------------------
# Gradio UI
# ----------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 Datangtang GGUF Model Demo")
    gr.Markdown("Switch between **1B** and **3B** GGUF models in real time.")
    model_choice = gr.Dropdown(
        label="Select Model",
        choices=list(MODEL_CONFIGS.keys()),
        value="1B Model (Datangtang/GGUF1B)",
    )
    status = gr.Markdown()
    chat_iface = gr.ChatInterface(
        fn=chat,
        # Pass the dropdown as an additional input so each request reads the
        # current selection; calling model_choice.value inside a lambda would
        # freeze the value chosen at startup.
        additional_inputs=[model_choice],
        examples=[
            "Explain deep learning in one paragraph.",
            "What is the difference between supervised and unsupervised learning?",
            "Explain what a transformer model is.",
        ],
        cache_examples=False,
    )
    model_choice.change(
        fn=lambda x: f"🔄 Switched to: {x}",
        inputs=[model_choice],
        outputs=[status],  # the handler returns a string, so it needs a target component
    )
if __name__ == "__main__":
    demo.launch()