caobin committed on
Commit
a90f54e
·
verified ·
1 Parent(s): 31dc697

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -26
app.py CHANGED
@@ -1,55 +1,70 @@
1
  import gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
 
4
 
5
- MODEL_ID = "caobin/llm-caobin"
6
-
7
- tokenizer = AutoTokenizer.from_pretrained(
8
- MODEL_ID,
9
- trust_remote_code=True
10
- )
11
 
 
 
12
  model = AutoModelForCausalLM.from_pretrained(
13
  MODEL_ID,
14
  torch_dtype=torch.float16,
15
- device_map="auto",
16
  trust_remote_code=True
17
  )
18
 
19
- def chat_fn(message, history):
 
 
 
 
 
 
 
 
20
  full_prompt = ""
21
- for user_msg, bot_msg in history:
22
  full_prompt += f"<|user|>{user_msg}<|assistant|>{bot_msg}"
23
- full_prompt += f"<|user|>{message}<|assistant|>"
24
 
25
- inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
26
 
27
- output_ids = model.generate(
 
 
28
  **inputs,
29
- max_new_tokens=512,
30
- temperature=0.7,
31
- top_p=0.9,
 
32
  do_sample=True,
33
  )
34
 
35
- output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
36
- if "<|assistant|>" in output_text:
37
- output_text = output_text.split("<|assistant|>")[-1]
38
 
39
- return output_text.strip()
 
 
 
 
40
 
 
 
 
 
 
 
 
 
41
 
42
- with gr.Blocks(title="caobin LLM chatbot") as demo:
 
43
  gr.Markdown("# 🤖 caobin 自定义 LLM 对话 Demo")
44
 
45
  chatbot = gr.Chatbot(height=450)
46
  msg = gr.Textbox(label="输入你的问题")
47
 
48
- def respond(message, chat_history):
49
- response = chat_fn(message, chat_history)
50
- chat_history.append((message, response))
51
- return "", chat_history
52
-
53
  msg.submit(respond, [msg, chatbot], [msg, chatbot])
54
 
55
  demo.launch()
 
1
  import gradio as gr
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
3
  import torch
4
+ import threading
5
 
6
+ MODEL_ID = "caobin/llm-caobin"
 
 
 
 
 
7
 
8
+ # 加载 tokenizer 和模型
9
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
10
  model = AutoModelForCausalLM.from_pretrained(
11
  MODEL_ID,
12
  torch_dtype=torch.float16,
 
13
  trust_remote_code=True
14
  )
15
 
16
+ # 判断是否有 GPU
17
+ device = "cuda" if torch.cuda.is_available() else "cpu"
18
+ model.to(device)
19
+ model.eval()
20
+
21
# Streaming generation: yields the accumulated reply as tokens arrive.
def generate_stream(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9, max_history=3, history=None):
    """Generate a reply to *prompt*, yielding the partial text as it streams.

    history is a list of (user_msg, bot_msg) pairs; only the most recent
    max_history turns are folded into the prompt to bound its length.
    """
    # BUG FIX: the original used a mutable default (history=[]), which is
    # shared across calls and silently accumulates state. Use None as the
    # sentinel instead (backward compatible: omitting history still means
    # "no prior turns").
    if history is None:
        history = []

    # Keep only the most recent max_history conversation turns.
    recent_history = history[-max_history:]
    full_prompt = ""
    for user_msg, bot_msg in recent_history:
        full_prompt += f"<|user|>{user_msg}<|assistant|>{bot_msg}"
    full_prompt += f"<|user|>{prompt}<|assistant|>"

    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    # BUG FIX: skip_prompt=True so the streamer yields only newly generated
    # text. Without it the echoed prompt leaks into the reply — the
    # pre-streaming implementation stripped it by splitting on
    # "<|assistant|>", and that safeguard was dropped in the rewrite.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    # Run generation on a background thread; this generator consumes the
    # streamer on the calling thread.
    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    # Accumulate and yield the growing reply text.
    output_text = ""
    for new_text in streamer:
        output_text += new_text
        yield output_text.strip()
51
 
52
# Gradio callback: streams the partial reply back into the Chatbot.
def respond(message, chat_history):
    """Yield ("", updated_history) pairs as the model's reply streams in."""
    for partial_reply in generate_stream(message, history=chat_history):
        yield "", chat_history + [(message, partial_reply)]
60
 
61
# Assemble the Gradio UI: a chat window plus a text box wired to respond().
with gr.Blocks(title="caobin LLM Chatbot") as demo:
    gr.Markdown("# 🤖 caobin 自定义 LLM 对话 Demo")

    chatbot = gr.Chatbot(height=450)
    msg = gr.Textbox(label="输入你的问题")

    # Submitting the textbox streams generator output into the chatbot
    # and clears the input field.
    msg.submit(respond, [msg, chatbot], [msg, chatbot])

demo.launch()