# NOTE: the lines "Spaces: / Sleeping / Sleeping" were Hugging Face Space
# page-listing residue captured with the source, not part of the program.
# Standard-library import first, then third-party packages.
import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# ----------------------------------------
# Global model cache
# ----------------------------------------
# Maps a dropdown label to its loaded Llama instance so each GGUF file is
# downloaded and loaded at most once per process.
loaded_models = {}
# Label of the most recently loaded model (None until the first load).
current_model_name = None
# Registry of selectable GGUF checkpoints: UI label -> Hub download spec.
MODEL_CONFIGS = {
    "1B Model (Datangtang/GGUF1B)": {
        "repo_id": "Datangtang/GGUF1B",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf",
    },
    "3B Model (Datangtang/GGUF3B)": {
        "repo_id": "Datangtang/GGUF3B",
        "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf",
    },
}
# ----------------------------------------
# Load model function
# ----------------------------------------
def load_model(model_choice):
    """Return a (cached) ``llama_cpp.Llama`` for the chosen model.

    Downloads the GGUF file from the Hugging Face Hub on first use and
    memoizes the loaded model in ``loaded_models`` so later calls are free.

    Args:
        model_choice: A key of ``MODEL_CONFIGS`` (the dropdown label).

    Returns:
        The ready-to-use ``llama_cpp.Llama`` instance.

    Raises:
        KeyError: If ``model_choice`` is not a known configuration.
    """
    global loaded_models, current_model_name
    if model_choice in loaded_models:
        # Bug fix: keep the "current model" marker accurate on cache hits
        # too (previously it was only set on the first load).
        current_model_name = model_choice
        return loaded_models[model_choice]
    cfg = MODEL_CONFIGS[model_choice]
    model_path = hf_hub_download(
        repo_id=cfg["repo_id"],
        filename=cfg["filename"],
        local_dir="./model",
        # Bug fix: os.environ["HF_TOKEN"] raised KeyError when the variable
        # was unset; token=None is accepted and works for public repos.
        token=os.environ.get("HF_TOKEN"),
    )
    llm = Llama(
        model_path=model_path,
        n_ctx=1024,       # small prompt window; chat() trims history to fit
        n_threads=6,
        n_batch=512,
        n_gpu_layers=0,   # CPU-only inference
        use_mmap=True,
        use_mlock=True,
        verbose=False,
    )
    loaded_models[model_choice] = llm
    current_model_name = model_choice
    return llm
# ----------------------------------------
# Chat function (Gradio 4.x message format)
# ----------------------------------------
def chat(messages, model_choice):
    """Generate an assistant reply for the running conversation.

    Only the last three messages are kept, so the prompt stays well
    inside the model's 1024-token context window.
    """
    llm = load_model(model_choice)
    # Assemble the plain-text prompt line by line, then join once.
    role_prefix = {"user": "User", "assistant": "Assistant"}
    lines = ["System: You are a helpful assistant."]
    for msg in messages[-3:]:
        prefix = role_prefix.get(msg["role"])
        if prefix is not None:
            lines.append(f"{prefix}: {msg['content']}")
    lines.append("Assistant:")
    prompt = "\n".join(lines)
    # Run the model; stop tokens prevent it from writing further turns.
    completion = llm(
        prompt,
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        repeat_penalty=1.1,
        stop=["User:", "Assistant:"],
    )
    return completion["choices"][0]["text"].strip()
# ----------------------------------------
# Gradio UI (Gradio 4.x messages format)
# ----------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 Datangtang GGUF Model Demo (Gradio 4.x Compatible)")
    model_choice = gr.Dropdown(
        label="Select Model",
        choices=list(MODEL_CONFIGS.keys()),
        value="1B Model (Datangtang/GGUF1B)",
    )
    chatbot = gr.Chatbot(label="Chat", type="messages")
    msg_box = gr.Textbox(label="Message")

    def add_user_message(user_msg, messages):
        """Append the user's turn to the history and clear the textbox."""
        return messages + [{"role": "user", "content": user_msg}], ""

    def add_bot_reply(messages, model_choice):
        """Run the model on the updated history and append its reply."""
        reply = chat(messages, model_choice)
        return messages + [{"role": "assistant", "content": reply}]

    # Two-step event chain: first show the user's message immediately,
    # then generate and display the model reply.
    msg_box.submit(
        add_user_message, [msg_box, chatbot], [chatbot, msg_box]
    ).then(
        add_bot_reply, [chatbot, model_choice], chatbot
    )

demo.launch()