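"""Gradio chat demo for Datangtang's GGUF builds of Llama 3.2.

Downloads a 1B or 3B Q4_K_M-quantized model from the Hugging Face Hub and
runs it on CPU via llama-cpp-python, with a dropdown to switch models.
"""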
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# ----------------------------------------
# Global model cache
# ----------------------------------------
loaded_models = {}         # Cache of loaded Llama instances, keyed by model choice
current_model_name = None  # Name of the most recently selected model (informational)

MODEL_CONFIGS = {
    "1B Model (Datangtang/GGUF1B)": {
        "repo_id": "Datangtang/GGUF1B",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
    },
    "3B Model (Datangtang/GGUF3B)": {
        "repo_id": "Datangtang/GGUF3B",
        "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf"
    }
}
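
# Each entry maps a dropdown label to a Hugging Face repo and a GGUF file
# inside it. Adding another model only requires a new entry here, e.g. (a
# hypothetical repo and filename, shown purely for illustration):
#
#   "8B Model (Datangtang/GGUF8B)": {
#       "repo_id": "Datangtang/GGUF8B",
#       "filename": "llama-3.1-8b-instruct.Q4_K_M.gguf",
#   },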


# ----------------------------------------
# Load model function
# ----------------------------------------
def load_model(model_choice):
    global loaded_models, current_model_name

    if model_choice in loaded_models:
        print(f"Reusing already loaded model: {model_choice}")
        current_model_name = model_choice
        return loaded_models[model_choice]

    print(f"Downloading model: {model_choice}")

    cfg = MODEL_CONFIGS[model_choice]

    model_path = hf_hub_download(
        repo_id=cfg["repo_id"],
        filename=cfg["filename"],
        local_dir="./model",
        token=os.environ.get("HF_TOKEN")  # None is fine for public repos and avoids a KeyError
    )

    print(f"Model downloaded to: {model_path}")
    print("Loading GGUF model into memory...")

    llm = Llama(
        model_path=model_path,
        n_ctx=1024,        # context window in tokens; prompts must fit within this
        n_threads=6,       # CPU threads used for inference
        n_batch=512,       # batch size for prompt processing
        n_gpu_layers=0,    # no layers offloaded to GPU: CPU-only inference
        use_mmap=True,     # memory-map the weights instead of copying them
        use_mlock=True,    # pin the weights in RAM to prevent swapping
        verbose=False,
    )

    loaded_models[model_choice] = llm
    current_model_name = model_choice

    print("Model loaded successfully!")
    return llm


# ----------------------------------------
# Chat function
# ----------------------------------------
def chat(message, history, model_choice):
    llm = load_model(model_choice)  # cached after the first call, so cheap on later messages

    # System prompt
    conversation = "System: You are a helpful assistant.\n"

    # Convert ChatInterface history into a plain-text prompt, keeping only
    # the last few messages so the prompt fits the 1024-token context
    for msg in history[-3:]:
        # ChatInterface format: {"role": "...", "content": "..."}
        if isinstance(msg, dict):
            role = msg.get("role")
            content = msg.get("content", "")
            if role == "user":
                conversation += f"User: {content}\n"
            elif role == "assistant":
                conversation += f"Assistant: {content}\n"

        # Safety: old tuple format
        elif isinstance(msg, (list, tuple)):
            human, assistant = msg
            conversation += f"User: {human}\n"
            if assistant:
                conversation += f"Assistant: {assistant}\n"

    # Add current message
    conversation += f"User: {message}\nAssistant:"

    # Generate model response
    response = llm(
        conversation,
        max_tokens=128,    # cap on tokens generated per reply
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        repeat_penalty=1.1,
        stop=["User:", "Assistant:"],  # cut off if the model starts a new turn
        echo=False         # return only the completion, not the prompt
    )

    return response["choices"][0]["text"].strip()

# ----------------------------------------
# Gradio UI
# ----------------------------------------
with gr.Blocks() as demo:

    gr.Markdown("# 🦙 Datangtang GGUF Model Demo")
    gr.Markdown("Switch between **1B** and **3B** GGUF models in real-time.")

    model_choice = gr.Dropdown(
        label="Select Model",
        choices=list(MODEL_CONFIGS.keys()),
        value="1B Model (Datangtang/GGUF1B)",
    )

    chat_iface = gr.ChatInterface(
        # Pass the dropdown via additional_inputs so chat() always receives
        # the current selection; a closure over model_choice.value would
        # capture only the initial value and switching models would never work.
        fn=chat,
        type="messages",  # history arrives as {"role", "content"} dicts (recent Gradio)
        additional_inputs=[model_choice],
        # With additional_inputs set, each example pairs a message with a
        # value for the dropdown.
        examples=[
            ["Explain deep learning in one paragraph.", "1B Model (Datangtang/GGUF1B)"],
            ["What is the difference between supervised and unsupervised learning?", "1B Model (Datangtang/GGUF1B)"],
            ["Explain what a transformer model is.", "1B Model (Datangtang/GGUF1B)"],
        ],
        cache_examples=False,
    )

    status = gr.Markdown()

    model_choice.change(
        # The returned string needs a real output component to land in,
        # otherwise Gradio has nowhere to display it.
        fn=lambda x: f"🔄 Switched to: {x}",
        inputs=[model_choice],
        outputs=[status],
    )


if __name__ == "__main__":
    demo.launch()
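
# A minimal way to run this locally (assuming the file is saved as app.py):
#   pip install gradio llama-cpp-python huggingface_hub
#   HF_TOKEN=<token> python app.py   # token only needed if the repos are gated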