|
|
| import gradio as gr |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
| import torch |
| import os |
|
|
| |
| |
| |
| |
|
|
| |
| |
# Hugging Face model repository to serve; the MODEL_ID environment variable
# overrides the default.
MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-3B-Instruct")

# Half precision when CUDA is available, full precision otherwise.
_load_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# NOTE(review): trust_remote_code=True permits code shipped in the model
# repository to run locally -- confirm it is required for the configured
# MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# device_map="auto" leaves weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=_load_dtype,
    device_map="auto",
)
|
|
def hf_chat(system_prompt: str, user_text: str, max_tokens: int = 220) -> str:
    """Generate one assistant reply for a system prompt and a user message.

    The prompt is rendered with the tokenizer's own chat template so that the
    model sees the conversation markup it was trained on. Tokenizers without
    a chat template fall back to the plain ``<|system|>``/``<|user|>``/
    ``<|assistant|>`` layout.

    Args:
        system_prompt: Instruction text for the system role. ``None`` or a
            blank string means no system message is sent.
        user_text: The user's message. ``None`` is treated as an empty string.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded reply, truncated at the first leaked role/control marker
        and stripped of surrounding whitespace.
    """
    system_text = (system_prompt or "").strip()
    user_message = (user_text or "").strip()

    if getattr(tokenizer, "chat_template", None):
        messages = []
        if system_text:
            messages.append({"role": "system", "content": system_text})
        messages.append({"role": "user", "content": user_message})
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        # Legacy hand-written layout, kept for tokenizers without a template.
        prompt = (
            f"<|system|>\n{system_text}\n"
            f"<|user|>\n{user_message}\n"
            "<|assistant|>\n"
        )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        # Greedy decoding: temperature only matters when sampling, so it is
        # not passed here.
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            use_cache=True,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids = output_ids[0][prompt_length:]
    text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Cut the reply at any role/control marker that survived decoding so a
    # hallucinated follow-up turn is not shown to the user.
    for marker in (
        "<|assistant|>",
        "<|user|>",
        "<|system|>",
        "<|im_end|>",
        "<|im_start|>",
    ):
        text = text.partition(marker)[0]

    return text.strip()
|
|
|
|
def predict(
    message: str,
    history: list,
    system_prompt_input: str = "You are a helpful AI assistant.",
) -> str:
    """Chat callback: answer ``message`` under the given system prompt.

    Args:
        message: The latest user message.
        history: Prior conversation turns. Accepted so the callback can be
            invoked as ``fn(message, history, ...)``, but not forwarded to
            the model; each reply is generated from ``message`` alone.
        system_prompt_input: System prompt text. Defaults to a generic
            assistant prompt so the callback also works when it is invoked
            with only ``(message, history)``.

    Returns:
        The model's reply as plain text.
    """
    return hf_chat(system_prompt_input, message)
|
|
|
|
# Demo page: a heading, an editable system prompt, and the chat interface.
with gr.Blocks() as demo:
    gr.Markdown("# MezayaAI Qwen2.5-3B-Instruct Demo")
    system_prompt_input = gr.Textbox(
        label="System Prompt",
        value="You are a helpful AI assistant.",
        lines=2,
    )
    chatbot = gr.ChatInterface(
        predict,
        chatbot=gr.Chatbot(height=300),
        textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
        # Pass the system-prompt textbox to predict as its third argument;
        # without this the callback only receives (message, history).
        additional_inputs=[system_prompt_input],
        title="Qwen2.5-3B-Instruct Chat",
        description="Ask Qwen2.5-3B-Instruct anything!",
        theme="soft",
    )


# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(debug=True)
|
|