import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import os
from huggingface_hub import login
import spaces
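
# Gradio demo that serves the PEFT-adapted Llama-3.2-3B-Instruct model
# ("Plutuslearn") on a Hugging Face ZeroGPU Space.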
# Authenticate with Hugging Face (HF_TOKEN should be set as a Space secret; the Llama base model is gated)
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)
# Model repository IDs
base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
peft_model_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct" # Replace with your model repo (e.g., ubiodee/my-finetuned-model)
# Load the tokenizer from the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(peft_model_id, token=hf_token)
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    token=hf_token,
    low_cpu_mem_usage=True,
    trust_remote_code=True
)
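# Match the embedding matrix to the fine-tuned tokenizer, which may have added tokens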
base_model.resize_token_embeddings(len(tokenizer))
# Load the PEFT adapter
model = PeftModel.from_pretrained(base_model, peft_model_id, token=hf_token)
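model.eval()  # inference only: put the adapted model in eval mode (disables any dropout)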
# Define the prediction function with proper device handling
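# spaces.GPU requests a ZeroGPU device for each call and holds it for at most `duration` seconds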
@spaces.GPU(duration=120)
def predict(text, max_length=100):
    try:
        # Wrap the raw text in a single-turn chat message for the Llama chat template
        messages = [{"role": "user", "content": text}]
        inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
        # Handle inputs based on type: a dict of tensors or a bare input_ids tensor
        if isinstance(inputs, dict):
            inputs = {key: val.to("cuda:0") for key, val in inputs.items()}
            outputs = model.generate(**inputs, max_length=max_length)
        else:
            # If inputs is a tensor (e.g., input_ids)
            inputs = inputs.to("cuda:0")
            outputs = model.generate(input_ids=inputs, max_length=max_length)
        # Note: decoding outputs[0] returns the full sequence, prompt included
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error during inference: {str(e)}"
# Create Gradio interface
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Slider(label="Max Length", minimum=50, maximum=500, value=100, step=1)
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="LearnPlutus Demo",
    description="Test the fine-tuned Llama-3.2-3B-Instruct model on ZeroGPU.",
    flagging_mode="never"
)
# Launch the app
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,
    debug=True
)
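
# Third-party packages used above (for requirements.txt): gradio, torch, transformers,
# peft, huggingface_hub, and spaces (the ZeroGPU helper library).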