| import torch |
| import re |
| import gradio as gr |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer |
| from threading import Thread |
|
|
| |
# Checkpoint identifier shared by the tokenizer and the model loader.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Loader options: bfloat16 weights, device placement delegated to
# `device_map="auto"`.
_model_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_id, **_model_load_options)
|
|
| |
# Characters deleted from the streamed text before it is displayed:
# newline, hyphen, asterisk, ASCII double quote, curly double quotes, hash.
_DISPLAY_STRIP_TABLE = str.maketrans("", "", '\n-*"”“#')


def _build_messages(message, history):
    """Return the chat-template message list for *history* plus *message*.

    History entries may be dicts (appended unchanged) or ``(user, assistant)``
    sequences, whose empty/falsy halves are skipped. Entries of any other type
    are ignored.
    """
    messages = [{"role": "system", "content": "你是一个乐于助人的 AI 助手。"}]
    for turn in history:
        if isinstance(turn, dict):
            messages.append(turn)
        elif isinstance(turn, (list, tuple)):
            # zip() stops at the shorter side, so a pair that is missing its
            # assistant half does not raise IndexError.
            for role, content in zip(("user", "assistant"), turn):
                if content:
                    messages.append({"role": role, "content": content})
    messages.append({"role": "user", "content": message})
    return messages


def stream_response(message, history):
    """Generate a reply to *message* and yield it incrementally.

    Args:
        message: The new user message.
        history: Earlier conversation turns, as dicts or
            ``(user, assistant)`` sequences.

    Yields:
        The cumulative reply text so far, with the characters listed in
        ``_DISPLAY_STRIP_TABLE`` removed.

    Raises:
        Exception: Re-raises whatever ``model.generate`` raised in the worker
            thread, once the streamer has been drained.
    """
    messages = _build_messages(message, history)

    model_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generate_kwargs = {
        **model_inputs,
        "streamer": streamer,
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "pad_token_id": tokenizer.eos_token_id,
    }

    worker_errors = []

    def _generate():
        # Generation runs in a worker thread while this generator consumes the
        # streamer. If generate() fails, the streamer would never receive its
        # end signal and the consumer loop below would block forever, so the
        # stream is closed explicitly and the error is kept for the caller.
        try:
            model.generate(**generate_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    display_text = ""
    for new_text in streamer:
        # Deleting characters is context-free, so cleaning only the new chunk
        # gives the same result as re-cleaning the whole accumulated text.
        display_text += new_text.translate(_DISPLAY_STRIP_TABLE)
        yield display_text

    thread.join()
    if worker_errors:
        raise worker_errors[0]
|
|
| |
# Example prompts passed to the chat UI.
_example_prompts = [
    "给我写一首关于春天的诗",
    "如何用 Python 实现快速排序?",
]

# Chat UI backed by the streaming generator; example caching is turned off.
demo = gr.ChatInterface(
    fn=stream_response,
    examples=_example_prompts,
    cache_examples=False,
    title="Llama-3.2 流式对话助手 ⚡",
)


# Launch the UI only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()