Datangtang committed on
Commit
540a0fc
·
verified ·
1 Parent(s): 1686e7e

go back again to 1b & 3b(没招了)

Browse files
Files changed (1) hide show
  1. app.py +114 -64
app.py CHANGED
@@ -1,78 +1,128 @@
1
- import os
2
  import gradio as gr
3
- from huggingface_hub import hf_hub_download
4
  from llama_cpp import Llama
 
 
5
 
6
- # ============ 下载模型 ==============
7
- # 从环境变量读取 HF Token(在 Spaces → Settings → Secrets 设置)
8
- HF_TOKEN = os.environ.get("HF_Token")
9
-
10
- # 模型仓库与文件
11
- REPO_ID = "Datangtang/GGUF3B"
12
- FILE_NAME = "llama-3.2-3b-instruct.Q4_K_M.gguf"
13
-
14
- model_path = hf_hub_download(
15
- repo_id=REPO_ID,
16
- filename=FILE_NAME,
17
- token=HF_TOKEN
18
- )
19
-
20
- # ============ 加载模型 ==============
21
- llm = Llama(
22
- model_path=model_path,
23
- n_ctx=4096,
24
- n_threads=4,
25
- chat_format="llama-3",
26
- )
27
-
28
-
29
- # ============ 推理函数 ==============
30
- def chat_fn(history, user_input):
31
- """
32
- history Gradio 聊天历史
33
- user_input 为当前用户输入
34
- """
35
- messages = []
36
-
37
- # 组织对话历史,适配 llama_cpp 的聊天格式
38
- for role, text in history:
39
- if role == "user":
40
- messages.append({"role": "user", "content": text})
41
- else:
42
- messages.append({"role": "assistant", "content": text})
43
-
44
- # 新输入
45
- messages.append({"role": "user", "content": user_input})
46
-
47
- # 调用 LLM
48
- result = llm.create_chat_completion(
49
- messages=messages,
50
- max_tokens=512,
51
- temperature=0.7,
52
- top_p=0.95
53
  )
54
 
55
- output = result["choices"][0]["message"]["content"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- # 返回:更新后的历史记录
58
- history.append(("user", user_input))
59
- history.append(("assistant", output))
60
- return history, ""
61
 
 
62
 
63
- # ============ Gradio UI ==============
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  with gr.Blocks() as demo:
65
- gr.Markdown("# 💬 Chat with Your Fine-tuned LLM")
66
 
67
- chatbot = gr.Chatbot(height=500)
68
- user_input = gr.Textbox(show_label=False, placeholder="Enter message...")
69
- submit = gr.Button("Send")
70
 
71
- submit.click(
72
- fn=chat_fn,
73
- inputs=[chatbot, user_input],
74
- outputs=[chatbot, user_input]
75
  )
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  if __name__ == "__main__":
78
- demo.launch()
 
 
1
  import gradio as gr
 
2
  from llama_cpp import Llama
3
+ from huggingface_hub import hf_hub_download
4
+ import os
5
 
6
# ----------------------------------------
# Global model cache
# ----------------------------------------
# Maps display name -> loaded Llama instance, so each model is downloaded
# and loaded at most once per process.
loaded_models = {}
# Display name of the most recently requested model (None until first load).
current_model_name = None

# Display name -> Hugging Face Hub location of the corresponding GGUF file.
MODEL_CONFIGS = {
    "1B Model (Datangtang/GGUF1B)": {
        "repo_id": "Datangtang/GGUF1B",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
    },
    "3B Model (Datangtang/GGUF3B)": {
        # BUG FIX: repo id was misspelled "Datangtang/GGGF3B", which would make
        # hf_hub_download fail (repository not found) whenever the 3B model was
        # selected. The label above and the rest of the app use "GGUF3B".
        "repo_id": "Datangtang/GGUF3B",
        "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf"
    }
}
22
+
23
+
24
# ----------------------------------------
# Load model function
# ----------------------------------------
def load_model(model_choice):
    """Return a ``Llama`` instance for *model_choice*, loading it on first use.

    Looks up the repo/filename in ``MODEL_CONFIGS``, downloads the GGUF file
    from the Hub into ``./model``, loads it CPU-only, and caches the instance
    in the module-level ``loaded_models`` dict so switching back to an
    already-used model is instant.

    Raises:
        KeyError: if *model_choice* is not a key of ``MODEL_CONFIGS``.
    """
    global loaded_models, current_model_name

    # Fast path: this model was already loaded in this process.
    if model_choice in loaded_models:
        print(f"Reusing already loaded model: {model_choice}")
        current_model_name = model_choice
        return loaded_models[model_choice]

    print(f"Downloading model: {model_choice}")

    cfg = MODEL_CONFIGS[model_choice]

    model_path = hf_hub_download(
        repo_id=cfg["repo_id"],
        filename=cfg["filename"],
        local_dir="./model",
        # BUG FIX: os.environ["HF_TOKEN"] raised KeyError when the Space
        # secret was not configured; .get() yields None, which
        # hf_hub_download accepts for public repositories.
        token=os.environ.get("HF_TOKEN")
    )

    print(f"Model downloaded to: {model_path}")
    print("Loading GGUF model into memory...")

    llm = Llama(
        model_path=model_path,
        n_ctx=1024,        # context window in tokens
        n_threads=6,       # CPU threads for inference
        n_batch=512,
        n_gpu_layers=0,    # CPU-only (no GPU on this Space)
        use_mmap=True,
        use_mlock=True,
        verbose=False,
    )

    loaded_models[model_choice] = llm
    current_model_name = model_choice

    print("Model loaded successfully!")
    return llm
65
+
66
+
67
# ----------------------------------------
# Chat function
# ----------------------------------------
def chat(message, history, model_choice):
    """Generate one assistant reply for *message* using the selected model.

    The prompt is a plain-text transcript: a fixed system line, the last
    three (user, assistant) exchanges from *history*, then the new message.
    Returns the model's completion text, stripped of surrounding whitespace.
    """
    llm = load_model(model_choice)

    # Assemble the transcript from its pieces, newest turn last.
    segments = ["System: You are a helpful assistant.\n"]
    segments.extend(
        f"User: {human}\nAssistant: {assistant}\n"
        for human, assistant in history[-3:]
    )
    segments.append(f"User: {message}\nAssistant:")
    prompt = "".join(segments)

    completion = llm(
        prompt,
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        repeat_penalty=1.1,
        stop=["User:", "Assistant:"],  # cut off before the model invents a new turn
        echo=False,
    )

    return completion["choices"][0]["text"].strip()
94
+
95
+
96
# ----------------------------------------
# Gradio UI
# ----------------------------------------
with gr.Blocks() as demo:

    gr.Markdown("# 🦙 Datangtang GGUF Model Demo")
    gr.Markdown("Switch between **1B** and **3B** GGUF models in real-time.")

    model_choice = gr.Dropdown(
        label="Select Model",
        choices=list(MODEL_CONFIGS.keys()),
        value="1B Model (Datangtang/GGUF1B)",
    )

    # BUG FIX: the previous wrapper read ``model_choice.value``, which is the
    # component's *initial* value, so the dropdown selection never reached
    # chat() and switching models had no effect. Registering the component as
    # an additional input makes Gradio pass the live selection as chat()'s
    # third argument on every submit.
    chat_iface = gr.ChatInterface(
        fn=chat,
        additional_inputs=[model_choice],
        # With additional_inputs, each example is a list of input values;
        # the model choice falls back to the dropdown's current value.
        examples=[
            ["Explain deep learning in one paragraph."],
            ["What is the difference between supervised and unsupervised learning?"],
            ["Explain what a transformer model is."],
        ],
        cache_examples=False,
    )

    # BUG FIX: the handler returned a string while declaring ``outputs=[]``,
    # a return/outputs mismatch; log the switch to the console instead.
    model_choice.change(
        fn=lambda x: print(f"🔄 Switched to: {x}"),
        inputs=[model_choice],
        outputs=[],
    )


if __name__ == "__main__":
    demo.launch()