import os
import torch
import gradio as gr
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
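
# meta-llama/Llama-2-7b-chat-hf is a gated repo, so the Space has to be
# authenticated before the weights can download. HF_TOKEN is an assumed
# secret name -- use whatever secret your Space actually defines.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)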

# Base checkpoint the fine-tune started from, kept for reference; the repo
# below is the merged result (adapter folded into the base weights, per the
# "(Merged)" title), so only it is loaded.
base_model = "meta-llama/Llama-2-7b-chat-hf"
adapter_model = "olacode55/zimble-llama2-finetunedhybride"

# Load the tokenizer from the fine-tuned repo so any tokens added during
# training are picked up.
tokenizer = AutoTokenizer.from_pretrained(adapter_model)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the merged model directly: fp16 on GPU, fp32 on CPU.
# device_map="auto" lets accelerate place the weights automatically.
model = AutoModelForCausalLM.from_pretrained(
    adapter_model,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True,
)

def generate(prompt):
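    # Llama-2 chat checkpoints expect the [INST] ... [/INST] template.
    # Whether the fine-tune kept that format is an assumption -- drop this
    # wrapping if it was trained on plain prompts.
    prompt = f"[INST] {prompt} [/INST]"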
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=250,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )
    # Skip the echoed prompt tokens and return only the newly generated text.
    return tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

demo = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(label="Enter your prompt", lines=4, placeholder="Type something..."),
    outputs=gr.Textbox(label="Model output"),
    title="🦙 Zimble LLaMA 2 (Merged)",
    description="Fine-tuned and merged version of LLaMA 2 running in a Hugging Face Space.",
)

demo.launch()