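# Fine-tune TinyLlama-1.1B with a LoRA adapter on Amazon product Q&A pairs,
# then serve the result through a Gradio chat interface. Training runs
# automatically on first launch if no fine-tuned adapter is found on disk.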
import os
import numpy as np
import torch
import gradio as gr
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model

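# Base checkpoint, source dataset, and output directory for the adapter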
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
DATASET_NAME = "embedding-data/Amazon-QA"
FINETUNED_MODEL_NAME = "tinyllama-shopify-lora"

def load_data():
    dataset = load_dataset(DATASET_NAME)
    df = dataset['train'].to_pandas()
    df = df.rename(columns={'query': 'question', 'pos': 'answer'})[['question', 'answer']]
    # 'pos' may arrive as a list of candidate answers; keep the first entry
    df['answer'] = df['answer'].map(
        lambda a: (a[0] if len(a) else None) if isinstance(a, (list, np.ndarray)) else a
    )
    df = df.dropna().astype(str)
    # Strip citation-style markers such as [^1^] (regex=True is required on pandas >= 2.0)
    df['answer'] = df['answer'].str.replace(r'\[\^.*?\^\]', '', regex=True).str.strip()
    return Dataset.from_pandas(df, preserve_index=False).train_test_split(test_size=0.1)

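# Build the tokenizer and wrap the base model with trainable LoRA adapters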
def setup_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
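    # The Llama tokenizer has no pad token; reuse EOS for padding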
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float32
    )

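    # Low-rank updates on the attention projections only; base weights stay frozen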
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    return tokenizer, get_peft_model(model, peft_config)

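# Tokenize the Q&A pairs and run a single LoRA fine-tuning epoch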
def train_model():
    tokenizer, model = setup_model()
    data = load_data()

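    # Render each pair as a "Question: ...\nAnswer: ..." prompt and tokenize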
    def tokenize_function(examples):
        text = [f"Question: {q}\nAnswer: {a}" for q, a in zip(examples["question"], examples["answer"])]
        return tokenizer(text, truncation=True, max_length=256, padding="max_length")

    tokenized_data = data.map(tokenize_function, batched=True, remove_columns=["question", "answer"])

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./results",
            per_device_train_batch_size=2,
            num_train_epochs=1,
            learning_rate=2e-5,
            logging_steps=10,
            save_strategy="steps",
            save_steps=100,
            optim="adamw_torch",
            no_cuda=True  # force CPU training; on transformers >= 4.33 this is spelled use_cpu=True
        ),
        train_dataset=tokenized_data["train"],
        # mlm=False selects the causal-LM objective: the collator copies input_ids into labels
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    print("Starting training...")
    trainer.train()
    model.save_pretrained(FINETUNED_MODEL_NAME)
    tokenizer.save_pretrained(FINETUNED_MODEL_NAME)
    print("Training complete!")

_inference_cache = {}

def generate_response(message, history):
    # Lazy-load once: the saved artifact is a LoRA adapter, so it must be
    # loaded via PEFT, which also pulls in the base model weights
    if "model" not in _inference_cache:
        _inference_cache["tokenizer"] = AutoTokenizer.from_pretrained(FINETUNED_MODEL_NAME)
        _inference_cache["model"] = AutoPeftModelForCausalLM.from_pretrained(
            FINETUNED_MODEL_NAME, torch_dtype=torch.float32
        )
    tokenizer, model = _inference_cache["tokenizer"], _inference_cache["model"]

    prompt = f"Question: {message}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).split("Answer:")[-1].strip()

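# Train on first run if no adapter exists yet, then build the chat UI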
def create_interface():
    if not os.path.exists(FINETUNED_MODEL_NAME):
        print("Model not found. Starting training...")
        train_model()

    return gr.ChatInterface(
        fn=generate_response,
        examples=[
            "What's your return policy?",
            "Do you ship internationally?",
            "Is this waterproof?"
        ]
    )

if __name__ == "__main__":
    create_interface().launch()