Spaces:
Runtime error
Runtime error
from threading import Thread

import gradio as gr
# NOTE: keep unsloth ahead of transformers, matching the original import order.
from unsloth import FastLanguageModel
from transformers import TextIteratorStreamer, TextStreamer
import torch
# Build the model/tokenizer pair used by the chat handler
def initialize_model(model_name, max_seq_length, dtype, load_in_4bit):
    """Load a pretrained model with its tokenizer and switch it to inference mode.

    Args:
        model_name: Model identifier forwarded to
            ``FastLanguageModel.from_pretrained`` (e.g. a LoRA model name).
        max_seq_length: Maximum sequence length forwarded to the loader.
        dtype: Tensor dtype forwarded to the loader.
        load_in_4bit: Whether the loader should use 4-bit loading.

    Returns:
        The ``(model, tokenizer)`` pair, with the model already passed
        through ``FastLanguageModel.for_inference``.
    """
    load_options = {
        "model_name": model_name,
        "max_seq_length": max_seq_length,
        "dtype": dtype,
        "load_in_4bit": load_in_4bit,
    }
    model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

    # Unsloth's inference mode (noted by the original author as ~2x faster).
    FastLanguageModel.for_inference(model)
    return model, tokenizer
# Settings for the model that backs the chat UI
model_name = "DominusDeorum/llama-3.2-lora_model"  # Replace with your model
max_seq_length = 2048  # Adjust as needed
dtype = torch.float16  # torch.bfloat16 etc. are also possible
load_in_4bit = True  # Set to True if using 4-bit inference

# Load once at module import; `respond` reads these two globals on every call.
model, tokenizer = initialize_model(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
    """Stream the assistant's reply for a Gradio chat turn.

    Args:
        message: The user's new message.
        history: Previous turns as ``(user, assistant)`` pairs; empty
            entries within a pair are skipped.
        system_message: Content of the leading system message.
        max_tokens: Upper bound on newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        The reply accumulated so far, growing as new text is generated.
    """
    # Rebuild the conversation in chat-template form: system, past turns, new message.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # `generate` returns whole sequences, not a token iterator, and a plain
    # TextStreamer only prints to stdout. An iterator streamer lets us consume
    # decoded text here while generation runs on a worker thread.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": int(max_tokens),  # sliders may deliver floats
        "use_cache": True,
        # Without sampling enabled, temperature/top_p are ignored by generate.
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
    }
    worker = Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()

    response = ""
    for chunk in streamer:
        response += chunk
        yield response

    worker.join()
# Extra controls for the chat UI; their values are passed to `respond` after
# (message, history), in this order: system message, max tokens, temperature, top-p.
_additional_inputs = [
    gr.Textbox(label="System message", value="You are a friendly Chatbot."),
    gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        minimum=0.1,
        maximum=1.0,
        step=0.05,
        value=0.95,
    ),
]

# Chat interface wired to the streaming handler
demo = gr.ChatInterface(fn=respond, additional_inputs=_additional_inputs)

if __name__ == "__main__":
    demo.launch()