Marcus719 committed on
Commit
8665c7a
·
verified ·
1 Parent(s): 49dc795

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -153
app.py CHANGED
@@ -1,230 +1,211 @@
1
- # app.py
2
  import os
3
  import traceback
4
  import time
5
  from huggingface_hub import snapshot_download
6
  import gradio as gr
7
 
8
- # Try to import llama-cpp-python; Space should install it from requirements.
9
- # If the import fails, the app will still start and show the error in status.
10
  try:
11
  from llama_cpp import Llama
12
- except Exception as e: # pragma: no cover
13
  Llama = None
14
  Llama_import_error = e
15
 
16
- # ---------- Configuration ----------
17
- MODEL_REPO = "Marcus719/Llama-3.2-3B-Instruct-FineTome-Lab2-GGUF"
18
- GGUF_ALLOW_PATTERNS = ["*.gguf"]
19
- DEFAULT_N_CTX = 4096
20
- DEFAULT_MAX_TOKENS = 256
21
- DEFAULT_N_THREADS = 4 # adjust based on Space CPU / available threads
22
- # -----------------------------------
 
 
 
23
 
24
  def log(msg: str):
25
  print(f"[app] {time.strftime('%Y-%m-%d %H:%M:%S')} - {msg}", flush=True)
26
 
27
- def find_gguf_in_dir(local_dir: str):
28
- for f in os.listdir(local_dir):
29
- if f.endswith(".gguf"):
30
- return os.path.join(local_dir, f)
31
- return None
32
-
33
- def load_model_from_hub(repo_id: str, n_ctx=DEFAULT_N_CTX, n_threads=DEFAULT_N_THREADS):
34
- """
35
- Downloads the model files using huggingface_hub.snapshot_download and returns
36
- an initialized Llama instance (from llama_cpp).
37
- """
38
  if Llama is None:
39
- raise RuntimeError(f"llama-cpp-python is not available: {Llama_import_error}")
40
-
41
- log(f"Starting snapshot_download for repo: {repo_id} (this may take a while on first run)")
42
- local_dir = snapshot_download(repo_id=repo_id, allow_patterns=GGUF_ALLOW_PATTERNS)
43
- log(f"Downloaded/located files at: {local_dir}")
44
-
45
- gguf_path = find_gguf_in_dir(local_dir)
46
- if gguf_path is None:
47
- raise FileNotFoundError(f"No .gguf file found in {local_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- log(f"Found GGUF file: {gguf_path}. Initializing Llama loader...")
50
- # init the model
51
- llm = Llama(model_path=gguf_path, n_ctx=n_ctx, n_threads=n_threads)
52
- log("Llama model initialized successfully.")
 
53
  return llm, gguf_path
54
 
55
- # The Gradio app uses a simple state pattern: we store the Llama instance and gguf path in a state dict.
56
  def init_model(state):
57
- """
58
- Called by the Init button. Downloads and initializes the model if not already loaded.
59
- Returns a status message for the status Label and the state object for persistence.
60
- """
61
  try:
62
  if state.get("llm") is not None:
63
- return "✅ Ready (model already loaded)", state
64
-
65
- # show immediate feedback to user via return
66
- log("Init requested - loading model now.")
67
- state["status"] = "Downloading model from Hub..."
68
- # download and load
69
- llm, gguf_path = load_model_from_hub(MODEL_REPO)
70
- # save into state
71
  state["llm"] = llm
72
  state["gguf_path"] = gguf_path
73
- state["status"] = "✅ Ready"
74
- return "✅ Ready", state
75
  except Exception as exc:
76
  tb = traceback.format_exc()
77
- log(f"Error during init: {exc}\n{tb}")
78
- state["status"] = f"❌ Init failed: {exc}"
79
- return f"❌ Init failed: {exc}", state
80
 
81
  def generate_response(prompt: str, max_tokens: int, state):
82
- """
83
- Main generate function wired to the Generate button.
84
- Returns (output_text, status_text, state)
85
- """
86
  try:
87
  if not prompt or prompt.strip() == "":
88
- return "Please provide a prompt.", "⚠️ Idle", state
89
-
90
- # Lazy load if model not initialized
91
  if state.get("llm") is None:
92
- # try to load on-the-fly
93
- log("Model not loaded, attempting lazy-load...")
94
- # provide immediate user-visible status by returning early while we load,
95
- # but Gradio can't stream two-stage responses easily, so we'll block here and update status after.
96
  try:
97
- llm, gguf_path = load_model_from_hub(MODEL_REPO)
 
98
  state["llm"] = llm
99
  state["gguf_path"] = gguf_path
100
- log("Lazy-load successful.")
101
  except Exception as e:
102
- tb = traceback.format_exc()
103
- log(f"Lazy-load failed: {e}\n{tb}")
104
- return f"Error loading model: {e}", f"❌ Error: {e}", state
105
 
106
  llm = state.get("llm")
107
- if llm is None:
108
- return "Model not initialized.", "❌ Not initialized", state
109
-
110
- log(f"Generating for prompt (len={len(prompt)}), max_tokens={max_tokens}")
111
- status_msg = "Processing..."
112
- # Call the model synchronously; this will block until generation is done
113
- out = llm(prompt, max_tokens=max_tokens)
114
- # llama_cpp returns different shapes depending on version; handle safely
115
- text = ""
116
- if isinstance(out, dict):
117
- # common shape: {"choices":[{"text": "..."}], ...}
118
- try:
119
- choices = out.get("choices")
120
- if choices and isinstance(choices, list) and len(choices) > 0:
121
- text = choices[0].get("text", "")
122
- else:
123
- text = str(out)
124
- except Exception:
125
- text = str(out)
126
- else:
127
- # fallback
128
- text = str(out)
129
-
130
- log("Generation completed.")
131
- return text, "✅ Done", state
132
 
133
  except Exception as exc:
134
  tb = traceback.format_exc()
135
- log(f"Error during generation: {exc}\n{tb}")
136
- return f"Error: {exc}\n\n{tb}", f"❌ Error: {exc}", state
 
 
 
 
 
137
 
138
- # ---------------- Gradio UI ----------------
139
 
140
- # 使用 Soft 主题,配色更具现代感
141
  theme = gr.themes.Soft(
142
  primary_hue="indigo",
143
  secondary_hue="slate",
144
- neutral_hue="slate",
145
- font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"]
146
  )
147
 
148
- # 自定义 CSS 稍微调整一下边距和圆角
149
  custom_css = """
150
- #response-box {
151
- font-family: 'Inter', sans-serif;
152
- background-color: #f9fafb;
153
- border-radius: 8px;
154
- padding: 10px;
155
- }
156
  """
157
 
158
  with gr.Blocks(title="Llama 3.2 Lab2 Project", theme=theme, css=custom_css) as demo:
159
 
160
- # --- 标题和介绍区域 ---
161
  with gr.Row():
162
  with gr.Column(scale=1):
163
  gr.Markdown("# 🦙 Llama 3.2 (3B) Fine-Tuned Chatbot")
164
  gr.Markdown(
165
- """
166
- **ID2223 Lab 2 Project** | Fine-tuned on the **FineTome-100k** dataset.
167
- Running locally on CPU via **GGUF** quantization (4-bit).
168
  """
169
  )
170
  with gr.Column(scale=0, min_width=150):
171
- # 状态显示放在右上角,醒目
172
- status_label = gr.Label(value="⚪ Not initialized", label="System Status", show_label=False)
173
 
174
- # --- 主体布局:左侧控制,右侧输出 ---
175
  with gr.Row():
176
-
177
- # 左侧:控制面板
178
  with gr.Column(scale=4):
179
  with gr.Group():
180
  prompt_in = gr.Textbox(
181
  lines=5,
182
- label="User Instruction",
183
- placeholder="E.g., Explain quantum entanglement to a 5-year-old...",
184
  elem_id="prompt-input"
185
  )
186
 
187
- # 参数折叠起来,保持界面清爽
188
- with gr.Accordion("⚙️ Advanced Parameters", open=False):
189
  max_tokens = gr.Slider(
190
  minimum=16,
191
  maximum=1024,
192
  step=16,
193
  value=DEFAULT_MAX_TOKENS,
194
- label="Max Generation Tokens",
195
- info="Longer generations require more CPU time."
196
  )
197
 
198
- # 按钮区域
199
  with gr.Row():
200
- init_btn = gr.Button("🚀 1. Load Model", variant="secondary", scale=1)
201
- gen_btn = gr.Button("✨ 2. Generate", variant="primary", scale=2)
202
 
203
- with gr.Row():
204
- clear_btn = gr.Button("🗑️ Clear History", variant="stop")
205
 
206
- # 右侧:回复展
207
  with gr.Column(scale=6):
208
  output_txt = gr.Textbox(
209
- label="Llama Response",
210
  lines=15,
211
- placeholder="The model response will appear here...",
212
- show_copy_button=True, # 允许复制内容
213
- elem_id="response-box"
214
  )
215
 
216
- # --- 底部版权/说明 ---
217
  with gr.Row():
218
  gr.Markdown(
219
- "⚠️ *Note: Inference is running on CPU. Generation speed depends on the Space hardware.*",
220
  elem_classes=["footer-text"]
221
  )
222
 
223
- # --- 状态管理 (Hidden) ---
224
  state = gr.State({"llm": None, "gguf_path": None, "status": "Not initialized"})
225
 
226
- # --- 事件绑定 ---
227
- # 点击 Load Model
228
  init_btn.click(
229
  fn=init_model,
230
  inputs=state,
@@ -232,7 +213,6 @@ with gr.Blocks(title="Llama 3.2 Lab2 Project", theme=theme, css=custom_css) as d
232
  show_progress=True
233
  )
234
 
235
- # 点击 Generate
236
  gen_btn.click(
237
  fn=generate_response,
238
  inputs=[prompt_in, max_tokens, state],
@@ -240,22 +220,9 @@ with gr.Blocks(title="Llama 3.2 Lab2 Project", theme=theme, css=custom_css) as d
240
  show_progress=True
241
  )
242
 
243
- # 点击 Clear
244
- def clear_all():
245
- return "", "⚪ Ready", {"llm": None, "gguf_path": None, "status": "Not initialized"}
246
-
247
- # 注意:Clear 按钮逻辑稍微修改,避免清空掉已加载的模型对象
248
- # 这里的 clear_all 只是重置了 UI,实际你可以保留 state 中的 llm 以免重复加载
249
- # 改进版 Clear 逻辑:
250
- def soft_clear(current_state):
251
- # 保持模型加载状态,只清空文本
252
- status = "✅ Ready" if current_state.get("llm") else "⚪ Not initialized"
253
- return "", status, current_state
254
-
255
  clear_btn.click(fn=soft_clear, inputs=[state], outputs=[prompt_in, status_label, state])
256
- # 同时也清空输出框
257
  clear_btn.click(lambda: "", outputs=[output_txt])
258
 
259
- # Launch configuration
260
  if __name__ == "__main__":
261
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 
 
1
  import os
2
  import traceback
3
  import time
4
  from huggingface_hub import snapshot_download
5
  import gradio as gr
6
 
7
# Try to import llama_cpp; if it is missing, keep the app alive and surface
# the error in the UI status instead of crashing at import time.
try:
    from llama_cpp import Llama
except Exception as e:
    Llama = None
    Llama_import_error = e

# ---------- Configuration ----------
# ★★★ Point this at your own model repository ★★★
MODEL_REPO = "Marcus719/Llama-3.2-3B-Instruct-FineTome-Lab2-GGUF"
# Download only the q4_k_m file so the Space disk is not filled with other quants.
GGUF_FILENAME = "unsloth.Q4_K_M.gguf"

DEFAULT_N_CTX = 2048       # context window length
DEFAULT_MAX_TOKENS = 256   # default generation length
DEFAULT_N_THREADS = 2      # 2 threads recommended on the free CPU tier
# -----------------------------------
24
 
25
def log(msg: str):
    """Print a timestamped app log line, flushed so it appears immediately in Space logs."""
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    print(f"[app] {stamp} - {msg}", flush=True)
27
 
28
def load_model_from_hub(repo_id: str, filename: str, n_ctx=DEFAULT_N_CTX, n_threads=DEFAULT_N_THREADS):
    """Download a single GGUF file from the Hub and load it with llama.cpp.

    Parameters
    ----------
    repo_id : str
        Hugging Face repository that contains the GGUF file.
    filename : str
        Exact GGUF filename to fetch; used as an allow-pattern so no other
        repo files are downloaded.
    n_ctx : int
        Context window size passed to ``Llama``.
    n_threads : int
        CPU thread count passed to ``Llama``.

    Returns
    -------
    tuple
        ``(llm, gguf_path)`` — the initialized ``Llama`` instance and the
        resolved local path of the GGUF file.

    Raises
    ------
    RuntimeError
        If llama-cpp-python failed to import.
    FileNotFoundError
        If the requested file is not present after the download.
    """
    if Llama is None:
        raise RuntimeError(f"llama-cpp-python 未安装或加载失败: {Llama_import_error}")

    # BUG FIX: these messages previously interpolated a literal "(unknown)"
    # placeholder instead of the requested filename.
    log(f"开始下载模型: {repo_id} / {filename} ...")

    # Download only the requested GGUF; allow_patterns prevents pulling every
    # quantization in the repo and blowing the Space disk quota.
    local_dir = snapshot_download(
        repo_id=repo_id,
        allow_patterns=[filename],
        local_dir_use_symlinks=False  # NOTE(review): deprecated in recent huggingface_hub — confirm before upgrading
    )

    # snapshot_download may preserve sub-directory structure, so try the
    # direct join first, then fall back to a recursive search.
    gguf_path = os.path.join(local_dir, filename)
    if not os.path.exists(gguf_path):
        for root, _dirs, files in os.walk(local_dir):
            if filename in files:
                gguf_path = os.path.join(root, filename)
                break

    if not os.path.exists(gguf_path):
        raise FileNotFoundError(f"在 {local_dir} 中找不到 {filename}")

    log(f"模型路径: {gguf_path}。正在加载到内存...")

    # Initialize the model; verbose=False keeps llama.cpp banner noise out of logs.
    llm = Llama(model_path=gguf_path, n_ctx=n_ctx, n_threads=n_threads, verbose=False)
    log("Llama 模型加载成功!")
    return llm, gguf_path
62
 
 
63
  def init_model(state):
64
+ """初始化按钮的回调函数"""
 
 
 
65
  try:
66
  if state.get("llm") is not None:
67
+ return "✅ 系统就绪 (模型已加载)", state
68
+
69
+ log("收到加载请求...")
70
+ # 下载并加载
71
+ llm, gguf_path = load_model_from_hub(MODEL_REPO, GGUF_FILENAME)
72
+
73
+ # 更新状态
 
74
  state["llm"] = llm
75
  state["gguf_path"] = gguf_path
76
+
77
+ return "✅ 系统就绪", state
78
  except Exception as exc:
79
  tb = traceback.format_exc()
80
+ log(f"初始化错误: {exc}\n{tb}")
81
+ return f"❌ 初始化失败: {exc}", state
 
82
 
83
def generate_response(prompt: str, max_tokens: int, state):
    """Callback for the "Generate" button.

    Lazily loads the model if it has not been initialized, wraps the user
    prompt in the Llama 3 chat template, and runs a blocking completion.

    Parameters
    ----------
    prompt : str
        Raw user instruction.
    max_tokens : int
        Generation length cap forwarded to the model.
    state : dict
        Mutable gr.State dict holding ``llm`` and ``gguf_path``.

    Returns
    -------
    tuple
        ``(output_text, status_text, state)``.
    """
    try:
        if not prompt or prompt.strip() == "":
            return "⚠️ 请输入指令。", "⚠️ 空闲", state

        # Lazy-load: if the user skipped the Init button, load the model now.
        if state.get("llm") is None:
            try:
                log("未检测到模型,尝试自动加载...")
                llm, gguf_path = load_model_from_hub(MODEL_REPO, GGUF_FILENAME)
                state["llm"] = llm
                state["gguf_path"] = gguf_path
            except Exception as e:
                # FIX: dropped pointless f-prefix on a placeholder-free string.
                return f"❌ 模型加载失败: {e}", "❌ 错误", state

        llm = state.get("llm")

        log(f"正在生成 (Prompt 长度={len(prompt)})...")

        # Build a Llama-3-style chat prompt (system + user + assistant header).
        # apply_chat_template would be stricter; plain concatenation is used
        # here for simplicity and Llama 3 generally handles it.
        system_prompt = "You are a helpful AI assistant."
        full_prompt = f"<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

        # Blocking inference; stop at the end-of-turn token.
        output = llm(
            full_prompt,
            max_tokens=max_tokens,
            stop=["<|eot_id|>"],
            echo=False
        )

        # Assumes the llama_cpp completion dict shape {"choices":[{"text":...}]};
        # any deviation is caught by the outer handler and reported in the UI.
        text = output['choices'][0]['text']
        log("生成完成。")
        return text, "✅ 生成完毕", state

    except Exception as exc:
        tb = traceback.format_exc()
        log(f"生成错误: {exc}\n{tb}")
        # FIX: dropped pointless f-prefix on a placeholder-free string.
        return f"运行出错: {exc}", "❌ 异常", state
126
+
127
def soft_clear(current_state):
    """Clear-button callback: reset the text fields but keep the loaded model in state."""
    if current_state.get("llm"):
        status = "✅ 系统就绪"
    else:
        status = "⚪ 未初始化"
    return "", status, current_state
131
 
132
# ---------------- Gradio UI setup ----------------

# Theme: Soft preset with indigo/slate hues.
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="slate",
    neutral_hue="slate"
)

# Custom CSS for the footer note.
custom_css = """
.footer-text { font-size: 0.8em; color: gray; text-align: center; }
"""
145
 
146
  with gr.Blocks(title="Llama 3.2 Lab2 Project", theme=theme, css=custom_css) as demo:
147
 
148
+ # 标题
149
  with gr.Row():
150
  with gr.Column(scale=1):
151
  gr.Markdown("# 🦙 Llama 3.2 (3B) Fine-Tuned Chatbot")
152
  gr.Markdown(
153
+ f"""
154
+ **ID2223 Lab 2 Project** | Fine-tuned on **FineTome-100k**.
155
+ Running on CPU (GGUF 4-bit) | Model: `{MODEL_REPO}`
156
  """
157
  )
158
  with gr.Column(scale=0, min_width=150):
159
+ status_label = gr.Label(value="⚪ 未初始化", label="系统状态", show_label=False)
 
160
 
161
+ # 主体布局
162
  with gr.Row():
163
+ # 左侧:输入与控制
 
164
  with gr.Column(scale=4):
165
  with gr.Group():
166
  prompt_in = gr.Textbox(
167
  lines=5,
168
+ label="用户指令 (User Input)",
169
+ placeholder="例如:请解释量子力学...",
170
  elem_id="prompt-input"
171
  )
172
 
173
+ with gr.Accordion("⚙️ 高级参数 (Advanced)", open=False):
 
174
  max_tokens = gr.Slider(
175
  minimum=16,
176
  maximum=1024,
177
  step=16,
178
  value=DEFAULT_MAX_TOKENS,
179
+ label="最大生成长度 (Max Tokens)",
180
+ info="生成的越长,CPU 耗时越久。"
181
  )
182
 
 
183
  with gr.Row():
184
+ init_btn = gr.Button("🚀 1. 加载模型 (Load)", variant="secondary")
185
+ gen_btn = gr.Button("✨ 2. 生成回复 (Generate)", variant="primary")
186
 
187
+ clear_btn = gr.Button("🗑️ 清空历史 (Clear)", variant="stop")
 
188
 
189
+ # 右侧:输出显示
190
  with gr.Column(scale=6):
191
  output_txt = gr.Textbox(
192
+ label="模型回复 (Response)",
193
  lines=15,
194
+ show_copy_button=True,
195
+ interactive=False
 
196
  )
197
 
198
+ # 底部说明
199
  with gr.Row():
200
  gr.Markdown(
201
+ "⚠️ *注意:推理在免费 CPU 上运行,速度可能较慢。首次运行时需要下载模型(约2GB),请耐心等待。*",
202
  elem_classes=["footer-text"]
203
  )
204
 
205
+ # 状态存储
206
  state = gr.State({"llm": None, "gguf_path": None, "status": "Not initialized"})
207
 
208
+ # 事件绑定
 
209
  init_btn.click(
210
  fn=init_model,
211
  inputs=state,
 
213
  show_progress=True
214
  )
215
 
 
216
  gen_btn.click(
217
  fn=generate_response,
218
  inputs=[prompt_in, max_tokens, state],
 
220
  show_progress=True
221
  )
222
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  clear_btn.click(fn=soft_clear, inputs=[state], outputs=[prompt_in, status_label, state])
 
224
  clear_btn.click(lambda: "", outputs=[output_txt])
225
 
226
# Entry point: bind to all interfaces on the standard Gradio Space port.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)