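"""Gradio Space: chat demo for the Datangtang GGUF models (1B / 3B), served with llama-cpp-python."""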
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
# ----------------------------------------
# Global model cache
# ----------------------------------------
loaded_models = {} # Cache loaded Llama models
current_model_name = None
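# Note: once loaded, every selected model stays resident in loaded_models,
# so switching back is instant, but the process needs enough RAM to hold
# every model that has been used in the session.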
MODEL_CONFIGS = {
    "1B Model (Datangtang/GGUF1B)": {
        "repo_id": "Datangtang/GGUF1B",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf",
    },
    "3B Model (Datangtang/GGUF3B)": {
        "repo_id": "Datangtang/GGUF3B",
        "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf",
    },
}
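# Q4_K_M is a 4-bit k-quant GGUF variant, a common middle ground between
# file size and output quality for CPU inference.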
# ----------------------------------------
# Load model function
# ----------------------------------------
def load_model(model_choice):
    """Download (if needed) and load the selected GGUF model, caching it by name."""
    global loaded_models, current_model_name

    if model_choice in loaded_models:
        print(f"Reusing already loaded model: {model_choice}")
        current_model_name = model_choice
        return loaded_models[model_choice]

    print(f"Downloading model: {model_choice}")
    cfg = MODEL_CONFIGS[model_choice]
    model_path = hf_hub_download(
        repo_id=cfg["repo_id"],
        filename=cfg["filename"],
        local_dir="./model",
        token=os.environ.get("HF_TOKEN"),  # None works for public repos; avoids a KeyError if unset
    )
    print(f"Model downloaded to: {model_path}")

    print("Loading GGUF model into memory...")
    llm = Llama(
        model_path=model_path,
        n_ctx=1024,        # context window (tokens)
        n_threads=6,       # CPU threads for inference
        n_batch=512,       # prompt-processing batch size
        n_gpu_layers=0,    # CPU-only
        use_mmap=True,     # memory-map the weights
        use_mlock=True,    # pin weights in RAM to avoid paging
        verbose=False,
    )
    loaded_models[model_choice] = llm
    current_model_name = model_choice
    print("Model loaded successfully!")
    return llm
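# Optional warm-up (a sketch, not part of the original app): pre-loading
# both models at startup avoids the download + load pause on the first
# message, at the cost of startup time and RAM:
#
#     for _name in MODEL_CONFIGS:
#         load_model(_name)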
# ----------------------------------------
# Chat function
# ----------------------------------------
def chat(message, history, model_choice):
    """Build a plain-text prompt from the chat history and generate a reply."""
    llm = load_model(model_choice)

    # System prompt
    conversation = "System: You are a helpful assistant.\n"

    # Convert ChatInterface history into a text prompt; keep only the last
    # few messages so the prompt stays well inside n_ctx=1024
    for msg in history[-3:]:
        # Messages format: {"role": "...", "content": "..."}
        if isinstance(msg, dict):
            role = msg.get("role")
            content = msg.get("content", "")
            if role == "user":
                conversation += f"User: {content}\n"
            elif role == "assistant":
                conversation += f"Assistant: {content}\n"
        # Safety net: old (user, assistant) tuple format
        elif isinstance(msg, (list, tuple)):
            human, assistant = msg
            conversation += f"User: {human}\n"
            if assistant:
                conversation += f"Assistant: {assistant}\n"

    # Add the current user message
    conversation += f"User: {message}\nAssistant:"

    # Generate the model response
    response = llm(
        conversation,
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        repeat_penalty=1.1,
        stop=["User:", "Assistant:"],  # cut generation if the model starts a new turn
        echo=False,
    )
    return response["choices"][0]["text"].strip()
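# Alternative (a sketch, not used here): llama-cpp-python can apply the
# chat template embedded in the GGUF instead of this hand-rolled
# "User:/Assistant:" prompt, via create_chat_completion:
#
#     response = llm.create_chat_completion(
#         messages=[{"role": "system", "content": "You are a helpful assistant."},
#                   {"role": "user", "content": message}],
#         max_tokens=128,
#     )
#     text = response["choices"][0]["message"]["content"]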
# ----------------------------------------
# Gradio UI
# ----------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 Datangtang GGUF Model Demo")
    gr.Markdown("Switch between **1B** and **3B** GGUF models in real-time.")

    model_choice = gr.Dropdown(
        label="Select Model",
        choices=list(MODEL_CONFIGS.keys()),
        value="1B Model (Datangtang/GGUF1B)",
    )
    status = gr.Markdown()  # shows which model is active after a switch

    # Pass the dropdown as an additional input so chat() receives the live
    # selection on every message; a lambda reading model_choice.value would
    # capture only the initial value and never see later switches.
    chat_iface = gr.ChatInterface(
        fn=chat,
        type="messages",  # dict-style history, matching chat(); needs a recent Gradio
        additional_inputs=[model_choice],
        examples=[
            "Explain deep learning in one paragraph.",
            "What is the difference between supervised and unsupervised learning?",
            "Explain what a transformer model is.",
        ],
        cache_examples=False,
    )

    model_choice.change(
        fn=lambda x: f"🔄 Switched to: {x}",
        inputs=[model_choice],
        outputs=[status],  # the original passed outputs=[], which discarded the message
    )
if __name__ == "__main__":
    demo.launch()
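# To run locally (assuming the dependencies are installed):
#     pip install gradio llama-cpp-python huggingface_hub
#     python app.py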