OzTianlu commited on
Commit
ac31c36
·
verified ·
1 Parent(s): 64284ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -15
app.py CHANGED
@@ -1,53 +1,66 @@
1
  import gradio as gr
2
  import torch
 
3
  from threading import Thread
4
  from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
5
 
6
  MODEL_ID = "NoesisLab/Spartacus-1B-Instruct"
7
 
 
8
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
9
- model = AutoModelForCausalLM.from_pretrained(
10
- MODEL_ID,
11
- torch_dtype=torch.float16,
12
- device_map="auto",
13
- trust_remote_code=True,
14
- )
15
 
 
 
16
 
 
17
  def respond(message, history):
 
 
 
 
 
 
 
 
 
 
18
  messages = [{"role": "system", "content": "You are Spartacus, a helpful assistant."}]
19
  for msg in history:
20
- messages.append({"role": msg["role"], "content": msg["content"]})
 
 
 
21
  messages.append({"role": "user", "content": message})
22
 
23
  input_ids = tokenizer.apply_chat_template(
24
  messages, add_generation_prompt=True, return_tensors="pt"
25
- ).to(model.device)
26
 
27
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
28
-
29
  generate_kwargs = dict(
30
  input_ids=input_ids,
31
  streamer=streamer,
32
- temperature=0.5,
33
- top_p=0.9,
 
34
  do_sample=True,
35
  )
36
 
 
37
  thread = Thread(target=model.generate, kwargs=generate_kwargs)
38
  thread.start()
39
 
40
  response = ""
41
  for token in streamer:
42
- response += token
43
- yield response
44
-
45
 
46
  demo = gr.ChatInterface(
47
  fn=respond,
 
48
  title="Spartacus Chat",
49
  description="Chat with NoesisLab/Spartacus-1B-Instruct",
50
  )
51
 
52
  if __name__ == "__main__":
53
- demo.launch()
 
1
  import gradio as gr
2
  import torch
3
+ import spaces # 必须导入
4
  from threading import Thread
5
  from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
6
 
7
MODEL_ID = "NoesisLab/Spartacus-1B-Instruct"

# The tokenizer can stay at module level: it does not allocate GPU memory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Keep the model as a module-level global but initialize it lazily inside the
# GPU-decorated function, so it is loaded once the (Zero)GPU environment is
# ready and is never reloaded on subsequent calls.
model = None
14
 
15
@spaces.GPU  # required on Spaces ZeroGPU: allocates a GPU for the duration of the call
def respond(message, history):
    """Stream a chat completion for *message* given the prior *history*.

    Yields the **cumulative** response text: gr.ChatInterface replaces the
    displayed assistant message with each yielded value, so every yield must
    carry the full text so far, not a single token fragment.

    Parameters:
        message: the new user turn (str).
        history: prior turns in Gradio "messages" format
                 (list of {"role": ..., "content": ...} dicts).
    """
    global model
    # Lazy-load on first call, once the GPU environment is ready.
    # (device_map="auto" is unreliable on ZeroGPU; move to CUDA explicitly.)
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            trust_remote_code=True,
        ).to("cuda")

    messages = [{"role": "system", "content": "You are Spartacus, a helpful assistant."}]
    for msg in history:
        # Normalize roles coming from Gradio's "messages" history format.
        role = "assistant" if msg["role"] == "assistant" else "user"
        messages.append({"role": role, "content": msg["content"]})

    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=2048,  # cap generation length to avoid ZeroGPU timeouts
        temperature=0.6,
        top_p=0.95,
        do_sample=True,
    )

    # Run generation in a background thread so we can consume the streamer here.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    # BUG FIX: accumulate and yield the full response each time. Yielding bare
    # tokens would make the UI show only the most recent fragment, because
    # ChatInterface replaces (not appends to) the message on each yield.
    response = ""
    for token in streamer:
        response += token
        yield response
    thread.join()  # make sure the worker thread has finished before returning
 
 
57
 
58
# Chat UI wiring. With type="messages", history entries are passed to
# respond() as {"role": ..., "content": ...} dicts, which is exactly the
# format the handler iterates over.
demo = gr.ChatInterface(
    fn=respond,
    type="messages",  # use the openai-style "messages" history format
    title="Spartacus Chat",
    description="Chat with NoesisLab/Spartacus-1B-Instruct",
)

if __name__ == "__main__":
    demo.launch()