wy-wu committed on
Commit
d782c6d
·
verified ·
1 Parent(s): 323b107

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -5
app.py CHANGED
@@ -1,28 +1,38 @@
 
1
  import gradio as gr
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
4
 
 
 
 
 
 
5
  MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
6
 
7
  def load_pipe(model_id=MODEL_ID):
8
  tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
9
  model = AutoModelForCausalLM.from_pretrained(
10
  model_id,
11
- torch_dtype=torch.float32, # CPU 環境用 float32 比較穩定
12
  low_cpu_mem_usage=True
13
  )
14
  return pipeline(
15
  "text-generation",
16
  model=model,
17
  tokenizer=tokenizer,
18
- device=-1 # -1 = CPU
19
  )
20
 
21
  pipe = load_pipe()
22
 
23
  SYSTEM_PROMPT = "你是一個助理,請使用繁體中文並簡潔回答。"
 
24
 
25
  def chat(history, user_msg):
 
 
 
26
  prompt = ""
27
  for role, text in history:
28
  prompt += f"{role}: {text}\n"
@@ -30,12 +40,14 @@ def chat(history, user_msg):
30
 
31
  out = pipe(
32
  prompt,
33
- max_new_tokens=256,
34
  do_sample=True,
35
  temperature=0.7,
36
  top_p=0.9,
37
- repetition_penalty=1.05,
 
38
  eos_token_id=pipe.tokenizer.eos_token_id,
 
39
  )[0]["generated_text"]
40
 
41
  reply = out.split("assistant:")[-1].strip()
@@ -44,7 +56,7 @@ def chat(history, user_msg):
44
  return history, ""
45
 
46
  with gr.Blocks() as demo:
47
- gr.Markdown("## Chatbot 範例 - Qwen2.5-1.5B-Instruct (CPU)")
48
  chatbox = gr.Chatbot(height=350)
49
  msg = gr.Textbox(label="輸入訊息")
50
  clear = gr.Button("清空對話")
 
1
import os

# Cap the BLAS/OpenMP thread pools to avoid oversubscription on small shared
# CPU hosts (e.g. free Spaces vCPUs).
# FIX: these environment variables must be set BEFORE torch is imported —
# the native libraries size their thread pools at import time and ignore
# later changes, so the original post-import assignment had no effect.
# NOTE(review): on a genuinely multicore host, 1 thread will slow matmuls;
# confirm the deployment target before keeping this limit.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Small, fast Qwen instruct model so CPU-only inference stays responsive.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
12
 
13
def load_pipe(model_id=MODEL_ID):
    """Build and return a CPU text-generation pipeline for *model_id*.

    Loads the fast tokenizer and the causal-LM weights, then wraps both in
    a transformers ``pipeline``. float32 is used because half precision is
    slow/unstable on CPU, and ``device=-1`` pins execution to the CPU.
    """
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,  # float32 is the safe choice on CPU
        low_cpu_mem_usage=True,
    )
    return pipeline(
        "text-generation",
        model=lm,
        tokenizer=tok,
        device=-1,  # -1 == CPU
    )
26
 
27
# Build the generation pipeline once at import time so every request reuses it.
pipe = load_pipe()

# System prompt for the assistant (answer in Traditional Chinese, concisely).
SYSTEM_PROMPT = "你是一個助理,請使用繁體中文並簡潔回答。"

# Keep only the most recent turns so the prompt stays short on CPU.
MAX_TURNS = 3
31
 
32
  def chat(history, user_msg):
33
+ # 🔹 縮短歷史,避免輸入過大拖慢
34
+ history = history[-2*MAX_TURNS:]
35
+
36
  prompt = ""
37
  for role, text in history:
38
  prompt += f"{role}: {text}\n"
 
40
 
41
  out = pipe(
42
  prompt,
43
+ max_new_tokens=128, # 🔹 限制輸出長度,加快生成
44
  do_sample=True,
45
  temperature=0.7,
46
  top_p=0.9,
47
+ top_k=50,
48
+ repetition_penalty=1.1, # 🔹 減少重複
49
  eos_token_id=pipe.tokenizer.eos_token_id,
50
+ num_return_sequences=1
51
  )[0]["generated_text"]
52
 
53
  reply = out.split("assistant:")[-1].strip()
 
56
  return history, ""
57
 
58
  with gr.Blocks() as demo:
59
+ gr.Markdown("## Chatbot 範例 - Qwen2.5-0.5B-Instruct (CPU)")
60
  chatbox = gr.Chatbot(height=350)
61
  msg = gr.Textbox(label="輸入訊息")
62
  clear = gr.Button("清空對話")