Spaces:

maxdougly
/

iris

Runtime error

File size: 2,374 Bytes

985b94e
 
 
 
 
 
 
 
 
f8b7a6c
985b94e
 
93a46f5
9d94d25
93a46f5
d8856f3
 
 
 
 
 
 
 
93a46f5
7f05911
d8856f3
9d94d25
 
93a46f5
7f05911
d8856f3
f8b7a6c
 
71235da
93a46f5
9d94d25
 
fc5bec5
9d94d25
 
 
 
 
93a46f5
985b94e
 
 
 
93a46f5
 
9c7b0c8
 
93a46f5
985b94e

import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Prevent CUDA initialization outside ZeroGPU

import spaces  # Import spaces first
import gradio as gr
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Load the PEFT model and its tokenizer once at import time so that every
# chat request reuses the same objects instead of reloading them.
MODEL_ID = "eforse01/lora_model"

model = AutoPeftModelForCausalLM.from_pretrained(MODEL_ID)
model = model.to("cuda")  # Move model to CUDA
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

@spaces.GPU(duration=120)  # Decorate the function for ZeroGPU
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, min_p):
    """Generate the assistant's reply to ``message`` given the chat so far.

    Args:
        message: The latest user message.
        history: Previous turns as ``(user_text, assistant_text)`` pairs;
            empty/falsy entries in a pair are skipped.
        system_message: Text placed in the leading ``system`` message.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        min_p: Min-p sampling threshold forwarded to ``model.generate``.

    Yields:
        The decoded text of the newly generated tokens only (the prompt is
        not echoed back).
    """
    # Construct messages for the chat template
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        # Index rather than unpack so an unexpected history shape fails
        # loudly instead of silently producing garbage messages.
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})

    # Tokenize the input messages
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",  # Return tensors for PyTorch
    )

    # Ensure input_ids is moved to the same device as the model
    input_ids = inputs.to("cuda")  # Move input_ids to CUDA
    print("Input IDs shape:", input_ids.shape)

    # Remember where the prompt ends so only the completion is decoded.
    prompt_length = input_ids.shape[-1]

    # Generate response
    output = model.generate(
        input_ids=input_ids,  # Pass tensor explicitly as input_ids
        max_new_tokens=int(max_tokens),  # Slider values may arrive as floats
        use_cache=True,
        # Sampling must be enabled explicitly; otherwise decoding may be
        # greedy and temperature / min_p would have no effect.
        do_sample=True,
        temperature=temperature,
        min_p=min_p,
    )

    # Debug output
    print("Generated Output Shape:", output.shape)
    print("Generated Output:", output)

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on the word "assistant" would truncate any reply containing that word
    # and depends on how the chat template renders role headers.
    new_tokens = output[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Yield the response
    yield response


# Gradio Interface
# Extra controls for the chat UI. Their order matches the trailing parameters
# of `respond`: system_message, max_tokens, temperature, min_p.
_system_message_box = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
_max_tokens_slider = gr.Slider(minimum=1, maximum=2048, value=2048, step=1, label="Max new tokens")
_temperature_slider = gr.Slider(minimum=0.1, maximum=4.0, value=1.5, step=0.1, label="Temperature")
_min_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.99, step=0.01, label="Min-p")

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        _system_message_box,
        _max_tokens_slider,
        _temperature_slider,
        _min_p_slider,
    ],
)

# Only start the web server when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()