|
|
import gradio as gr |
|
|
from transformers import pipeline, Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer |
|
|
import torch |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
# Base checkpoint to fine-tune. The previous value,
# "huggingface/transformer_model", is not a real Hub repo id, so
# from_pretrained() raised an OSError at import time. "gpt2" is a small,
# publicly available causal LM compatible with AutoModelForCausalLM.
model_name = "gpt2"

# Loaded once at module import so every Gradio request fine-tunes the same
# in-memory model rather than reloading weights per upload.
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
|
|
|
|
|
|
def upload_and_finetune(file):
    """Fine-tune the module-level causal LM on an uploaded CSV of texts.

    Args:
        file: The uploaded dataset — either a filepath string (Gradio
            ``type="filepath"``) or a tempfile-like object with a ``.name``
            attribute (legacy ``type="file"``). The CSV must contain a
            ``text`` column.

    Returns:
        A status string for the Gradio UI (success message, or an error
        message when the CSV is unusable).
    """
    # Accept both a plain path string and a file wrapper with .name.
    file_path = file if isinstance(file, str) else file.name

    data = pd.read_csv(file_path)

    # Guard against unusable input instead of crashing mid-training.
    if 'text' not in data.columns:
        return "Error: CSV must contain a 'text' column."

    # astype(str) defends against non-string cells, which the tokenizer rejects.
    texts = data['text'].astype(str).tolist()
    if not texts:
        return "Error: CSV contains no rows to train on."

    # Causal-LM tokenizers (e.g. GPT-2) often ship without a pad token, and
    # padding=True raises without one. Reuse EOS as the pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")

    class CustomDataset(torch.utils.data.Dataset):
        """Wraps tokenizer output; labels mirror input_ids for LM loss."""

        def __init__(self, encodings):
            self.encodings = encodings

        def __len__(self):
            return len(self.encodings['input_ids'])

        def __getitem__(self, idx):
            # Values are already tensors (return_tensors="pt"); index them
            # directly instead of re-wrapping with torch.tensor().
            item = {key: val[idx] for key, val in self.encodings.items()}
            # Without labels the model returns no loss and Trainer.train()
            # fails; for causal LM the labels are the input ids themselves.
            item['labels'] = item['input_ids'].clone()
            return item

    train_dataset = CustomDataset(encodings)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=4,
        logging_dir='./logs',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Save the tokenizer alongside the weights so the fine-tuned checkpoint
    # can actually be reloaded with from_pretrained().
    model.save_pretrained('./fine_tuned_model')
    tokenizer.save_pretrained('./fine_tuned_model')

    return f"File {file_path} uploaded and model fine-tuned successfully!"
|
|
|
|
|
|
|
|
# Single-file upload UI: the dataset goes straight into upload_and_finetune
# and the returned status string is shown back to the user.
# NOTE(review): type="file" was removed in Gradio 4.x in favour of
# "filepath"/"binary" — confirm the pinned gradio version before upgrading.
_dataset_input = gr.File(
    label="Upload Dataset for Fine-Tuning",
    file_count="single",
    type="file",
)

interface = gr.Interface(
    fn=upload_and_finetune,
    inputs=[_dataset_input],
    outputs="text",
)
|
|
|
|
|
# Start the web UI only when executed as a script, not when imported.
if __name__ == "__main__":
    interface.launch()
|
|
|