Datangtang committed on
Commit 9aa1169 · verified · 1 Parent(s): fd571bb
Files changed (1)
  1. app.py +57 -57
app.py CHANGED
@@ -1,78 +1,78 @@
 
  import gradio as gr
- from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
- import os
 
- print("Downloading GGUF model from HuggingFace...")
 
- # Download model
  model_path = hf_hub_download(
-     repo_id="Datangtang/GGUF3B",
-     filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
-     local_dir="./model"
  )
 
- print(f"Model downloaded to: {model_path}")
- print("Loading GGUF model with optimized settings...")
-
- # Load with optimized settings
  llm = Llama(
      model_path=model_path,
-     n_ctx=1024,      # Reduced from 2048 (faster)
-     n_threads=6,     # Increased from 4 (use more CPU)
-     n_batch=512,     # Added: larger batch for faster processing
-     n_gpu_layers=0,
-     verbose=False,
-     use_mlock=True,  # Keep model in RAM
-     use_mmap=True,   # Use memory mapping
  )
 
- print("Model loaded successfully!")
-
- def chat(message, history):
-     llm_message = message["content"]
 
-     conversation = "System: You are a helpful assistant.\n"
 
-     for msg in history[-3:]:
-         if msg["role"] == "user":
-             conversation += f"User: {msg['content']}\n"
          else:
-             conversation += f"Assistant: {msg['content']}\n"
 
-     conversation += f"User: {llm_message}\nAssistant:"
 
-     response = llm(
-         conversation,
-         max_tokens=128,
          temperature=0.7,
-         top_p=0.9,
-         top_k=40,
-         repeat_penalty=1.1,
-         stop=["User:", "Assistant:"]
      )
 
-     reply = response["choices"][0]["text"].strip()
-
-     return {"role": "assistant", "content": reply}
-
-
- # Create interface WITHOUT example caching
- demo = gr.ChatInterface(
-     fn=chat,
-     title="Bit & Sugar/llama-3.2-3b-finetome-1000steps-gguf",
-     description=(
-         "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
-         "Optimized with GGUF Q4_K_M quantization | "
-         "ID2223 Lab 2"
-     ),
-     examples=[
-         "What is machine learning?",
-         "Explain AI briefly",
-         "What is LoRA?",
-     ],
-     cache_examples=False,  # IMPORTANT: Disable caching
- )
 
  if __name__ == "__main__":
-     demo.launch()
 
+ import os
  import gradio as gr
  from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
 
+ # ============ Download the model ==============
+ # Read the HF token from an environment variable (set under Spaces → Settings → Secrets)
+ HF_TOKEN = os.environ.get("HF_Token")
+
+ # Model repo and file
+ REPO_ID = "Datangtang/GGUF3B"
+ FILE_NAME = "llama-3.2-3b-instruct.Q4_K_M.gguf"
 
  model_path = hf_hub_download(
+     repo_id=REPO_ID,
+     filename=FILE_NAME,
+     token=HF_TOKEN
  )
 
+ # ============ Load the model ==============
  llm = Llama(
      model_path=model_path,
+     n_ctx=4096,
+     n_threads=4,
+     chat_format="llama-3",
  )
 
+ # ============ Inference function ==============
+ def chat_fn(history, user_input):
+     """
+     history is the Gradio chat history, a list of (user, assistant) pairs
+     user_input is the current user message
+     """
+     messages = []
 
+     # Flatten the (user, assistant) pairs into llama_cpp chat messages
+     for user_msg, assistant_msg in history:
+         messages.append({"role": "user", "content": user_msg})
+         if assistant_msg is not None:
+             messages.append({"role": "assistant", "content": assistant_msg})
 
+     # Append the new input
+     messages.append({"role": "user", "content": user_input})
 
+     # Call the LLM
+     result = llm.create_chat_completion(
+         messages=messages,
+         max_tokens=512,
          temperature=0.7,
+         top_p=0.95
      )
 
+     output = result["choices"][0]["message"]["content"]
+
+     # Return the updated history and clear the textbox
+     history.append((user_input, output))
+     return history, ""
+
+
+ # ============ Gradio UI ==============
+ with gr.Blocks() as demo:
+     gr.Markdown("# 💬 Chat with Your Fine-tuned LLM")
+
+     chatbot = gr.Chatbot(height=500)
+     user_input = gr.Textbox(show_label=False, placeholder="Enter message...")
+     submit = gr.Button("Send")
+
+     submit.click(
+         fn=chat_fn,
+         inputs=[chatbot, user_input],
+         outputs=[chatbot, user_input]
+     )
 
  if __name__ == "__main__":
+     demo.launch()
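
A note on the new download path: `os.environ.get("HF_Token")` returns `None` when the secret is missing, and if the repo is private or gated, `hf_hub_download` will then fail at runtime with an authentication error. A minimal fail-fast guard, assuming the secret keeps the `HF_Token` name used in this commit:

```python
import os

# Assumes the Space secret is named "HF_Token", as in this commit.
HF_TOKEN = os.environ.get("HF_Token")
if HF_TOKEN is None:
    raise RuntimeError(
        "HF_Token is not set. Add it under Spaces → Settings → Secrets "
        "so hf_hub_download can authenticate against Datangtang/GGUF3B."
    )
```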
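To sanity-check the `chat_format="llama-3"` load and the `create_chat_completion` call without launching the UI, a short local smoke test along these lines should work; the model path here is hypothetical, so substitute the path that `hf_hub_download` returns:

```python
from llama_cpp import Llama

# Hypothetical local path; replace with the path returned by hf_hub_download.
llm = Llama(
    model_path="./llama-3.2-3b-instruct.Q4_K_M.gguf",
    n_ctx=4096,
    n_threads=4,
    chat_format="llama-3",
    verbose=False,
)

result = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Reply with one short sentence."}],
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
)
print(result["choices"][0]["message"]["content"])
```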
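One gap in the new UI wiring: only the Send button triggers `chat_fn`, so pressing Enter in the textbox does nothing. If that is unintended, a `.submit` handler mirroring the click binding would cover it; this fragment assumes it is placed inside the same `gr.Blocks()` context, after `submit.click(...)`:

```python
    # Inside `with gr.Blocks() as demo:`, after submit.click(...):
    user_input.submit(
        fn=chat_fn,
        inputs=[chatbot, user_input],
        outputs=[chatbot, user_input],
    )
```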