Spaces:

drumwell
/

llm3

Sleeping

File size: 2,357 Bytes

facdd51
756f6d5
af78cd0
756f6d5
 
facdd51
af78cd0
 
 
756f6d5
af78cd0
756f6d5
 
 
 
af78cd0
 
 
756f6d5
facdd51
 
 
 
 
 
 
 
 
 
 
 
 
756f6d5
 
 
 
 
 
 
 
 
facdd51
 
756f6d5
 
 
 
 
 
 
 
 
facdd51
 
 
 
 
 
 
d7d0400
facdd51
d7d0400
facdd51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
756f6d5

from threading import Thread

import gradio as gr
import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# Load the base model, then wrap it with the fine-tuned PEFT adapter.
# Both happen once at import time; `tokenizer` and `model` are module globals
# used by respond().
BASE_MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
ADAPTER_ID = "drumwell/autotrain-2duhi-5mmyz"

print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    # Passing `load_in_8bit=True` directly to from_pretrained is deprecated;
    # quantization settings are supplied through `quantization_config`.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

print("Loading your fine-tuned adapter...")
model = PeftModel.from_pretrained(model, ADAPTER_ID)
print("Model loaded!")

def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken,
):
    """Stream a chat reply from the module-level model.

    Builds the conversation (system prompt, prior history, new user message),
    renders it with the tokenizer's chat template, and runs ``model.generate``
    on a background thread while yielding the reply text accumulated so far
    each time the streamer produces a new piece.

    Args:
        message: The latest user message.
        history: Earlier turns as role/content dicts, appended after the
            system message unchanged.
        system_message: Content of the leading ``system`` message.
        max_tokens: Passed to ``generate`` as ``max_new_tokens``.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.
        hf_token: Not read by this function.
            NOTE(review): presumably the ``gr.OAuthToken`` annotation makes
            Gradio require a logged-in user -- confirm that gating is intended
            before removing the parameter.

    Yields:
        The full reply text generated so far (grows with every yield).

    Raises:
        Exception: Whatever ``model.generate`` raised on the worker thread,
            re-raised here once the stream has been drained.
    """
    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        repetition_penalty=1.1,
    )

    worker_errors: list[Exception] = []

    def _generate():
        # generate() only signals end-of-stream when it finishes normally.
        # If it raises, close the stream ourselves so the consumer loop below
        # does not block forever on the streamer's queue.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # thread boundary: re-raised by the caller below
            worker_errors.append(exc)
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    response = ""
    for token in streamer:
        response += token
        yield response

    thread.join()
    if worker_errors:
        # Surface the failure in the request context instead of losing it
        # inside the worker thread.
        raise worker_errors[0]

# Extra controls for the chat UI, listed in the same order as the extra
# parameters of respond(): system_message, max_tokens, temperature, top_p.
_system_message_box = gr.Textbox(
    value="You are a BMW E30 M3 and 320is technical expert assistant. Answer accurately based on factory specifications.",
    label="System message",
)
_max_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2048,
    value=512,
    step=1,
    label="Max new tokens",
)
_temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=4.0,
    value=0.3,
    step=0.1,
    label="Temperature",
)
_top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)

# Chat interface that streams replies from respond() using the
# role/content "messages" history format.
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        _system_message_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
)

# Page layout: a sidebar holding the login button, with the chat interface
# rendered in the main area.
demo = gr.Blocks()
with demo:
    sidebar = gr.Sidebar()
    with sidebar:
        gr.LoginButton()
    chatbot.render()

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()