Spaces:
Runtime error
Runtime error
File size: 3,335 Bytes
f58f290 05d9482 97ff0e4 f58f290 6bf78d4 05d9482 6bf78d4 f58f290 a3bf046 f58f290 6bf78d4 a3bf046 f58f290 a3bf046 f58f290 6bf78d4 f58f290 6bf78d4 a3bf046 f58f290 6bf78d4 f58f290 6bf78d4 a3bf046 f58f290 05d9482 f58f290 05d9482 a3bf046 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 | import os
import torch
import gradio as gr
from datasets import load_dataset, Dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
# Base checkpoint to fine-tune.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Hugging Face Hub dataset of product question/answer pairs.
DATASET_NAME = "embedding-data/Amazon-QA"
# Local directory where the trained LoRA adapter and tokenizer are saved.
FINETUNED_MODEL_NAME = "tinyllama-shopify-lora"
def load_data():
    """Load the Amazon-QA dataset and return a 90/10 train/test split.

    Returns:
        datasets.DatasetDict with 'train' and 'test' splits; each row has
        string 'question' and 'answer' columns.
    """
    dataset = load_dataset(DATASET_NAME)
    df = dataset['train'].to_pandas()
    # Keep only the two columns we train on, under uniform names.
    df = df.rename(columns={'query': 'question', 'pos': 'answer'})[['question', 'answer']]
    df = df.dropna().astype(str)
    # Strip "[^...^]" citation markers from answers. regex=True is required:
    # since pandas 2.0, Series.str.replace defaults to literal matching, so
    # the original call silently matched nothing.
    df['answer'] = df['answer'].str.replace(r'\[\^.*?\^\]', '', regex=True).str.strip()
    return Dataset.from_pandas(df).train_test_split(test_size=0.1)
def setup_model():
    """Build the base TinyLlama model wrapped with LoRA adapters.

    Returns:
        (tokenizer, peft_model) pair ready for causal-LM fine-tuning.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # The base checkpoint ships no pad token; reuse EOS so padding works.
    tokenizer.pad_token = tokenizer.eos_token

    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float32
    )

    # Low-rank adapters on the attention projection layers only.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    peft_model = get_peft_model(base_model, lora_config)
    return tokenizer, peft_model
def train_model():
    """Fine-tune the LoRA-wrapped model on the Q&A data and save the result.

    Side effects: writes checkpoints under ./results and the final adapter
    plus tokenizer under FINETUNED_MODEL_NAME.
    """
    tokenizer, model = setup_model()
    splits = load_data()

    def tokenize_batch(batch):
        # Format each pair as a single prompt+completion training string.
        prompts = [
            f"Question: {question}\nAnswer: {answer}"
            for question, answer in zip(batch["question"], batch["answer"])
        ]
        return tokenizer(prompts, truncation=True, max_length=256, padding="max_length")

    tokenized = splits.map(tokenize_batch, batched=True, remove_columns=["question", "answer"])

    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=2,
        num_train_epochs=1,
        learning_rate=2e-5,
        logging_steps=10,
        save_strategy="steps",
        save_steps=100,
        optim="adamw_torch",
        no_cuda=True  # run training on CPU
    )
    # mlm=False -> plain causal-LM labels (inputs shifted, not masked).
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        data_collator=collator,
    )

    print("Starting training...")
    trainer.train()
    # Persist adapter weights and tokenizer so inference can reload them.
    model.save_pretrained(FINETUNED_MODEL_NAME)
    tokenizer.save_pretrained(FINETUNED_MODEL_NAME)
    print("Training complete!")
def generate_response(message, history):
    """Generate an answer to *message* with the fine-tuned model.

    Args:
        message: user question from the chat UI.
        history: chat history supplied by gr.ChatInterface (unused here).

    Returns:
        str: the text following the last "Answer:" marker in the decoded
        generation.
    """
    # Load the tokenizer/model once and cache them on the function itself;
    # the original reloaded both from disk on EVERY chat message, which is
    # far too slow for an interactive interface.
    cached = getattr(generate_response, "_cache", None)
    if cached is None:
        tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL_NAME)
        # NOTE(review): FINETUNED_MODEL_NAME holds a PEFT adapter saved by
        # save_pretrained; recent transformers versions resolve the base
        # model via adapter_config.json -- confirm the installed version
        # supports loading adapters through AutoModelForCausalLM.
        model = AutoModelForCausalLM.from_pretrained(
            FINETUNED_MODEL_NAME, torch_dtype=torch.float32
        )
        cached = (tokenizer, model)
        generate_response._cache = cached
    tokenizer, model = cached

    prompt = f"Question: {message}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt")
    # Inference only: skip gradient tracking to save time and memory.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).split("Answer:")[-1]
def create_interface():
    """Return a Gradio chat interface, training the model first if needed.

    Training runs only when no saved model directory exists; subsequent
    launches reuse the saved weights.
    """
    if not os.path.exists(FINETUNED_MODEL_NAME):
        print("Model not found. Starting training...")
        train_model()

    example_questions = [
        "What's your return policy?",
        "Do you ship internationally?",
        "Is this waterproof?"
    ]
    return gr.ChatInterface(fn=generate_response, examples=example_questions)
if __name__ == "__main__":
    # Build (training on first run if necessary) and serve the chat app.
    app = create_interface()
    app.launch()