from datasets import DatasetDict, load_dataset, load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd


model_name = 't5-small'


tokenizer = AutoTokenizer.from_pretrained(model_name)
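
# Hedged assumption: a CUDA GPU is available; fall back to CPU so the script
# still runs (slowly) elsewhere. The `.to(device)` calls below rely on this.
device = 'cuda' if torch.cuda.is_available() else 'cpu'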

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
original_model = original_model.to(device)


# Quick look at the raw CSV with pandas; not used further downstream.
data = pd.read_csv("text-to-sql_from_spider.csv")


# 60/40 split, then halve the 40% into validation and test (60/20/20 overall).
# A fixed seed keeps these splits reproducible across runs, so they stay in
# sync with any tokenized copy cached on disk below.
dataset = load_dataset("csv", data_files="text-to-sql_from_spider.csv")
dataset = dataset["train"].train_test_split(test_size=0.4, seed=42)
test_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)
print(dataset["train"])
dataset = DatasetDict({"train": dataset["train"],
                       "test": test_dataset["test"],
                       "validation": test_dataset["train"]})


def tokenize_function(example):
    # Build the text-to-SQL prompt: schema first, then the question, then an
    # answer cue for the model to complete.
    start_prompt = "Tables:\n"
    middle_prompt = "\n\nQuestion:\n"
    end_prompt = "\n\nAnswer:\n"

    data_zip = zip(example['schema'], example['question'])
    prompt = [start_prompt + context + middle_prompt + question + end_prompt for context, question in data_zip]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids

    labels = tokenizer(example['sql'], padding="max_length", truncation=True, return_tensors="pt").input_ids
    # Mask pad positions with -100 so the loss ignores padding instead of
    # teaching the model to emit pad tokens.
    labels[labels == tokenizer.pad_token_id] = -100
    example['labels'] = labels

    return example
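
# Aside (not wired in): DataCollatorForSeq2Seq pads each batch dynamically
# instead of always padding to max_length, which is usually faster and also
# masks label padding with -100 on its own. A minimal sketch:
#     from transformers import DataCollatorForSeq2Seq
#     data_collator = DataCollatorForSeq2Seq(tokenizer, model=finetuned_model)
# and then pass data_collator=data_collator to the Trainer below.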


# Tokenize once and cache on disk; later runs load the cached copy.
try:
    tokenized_datasets = load_from_disk("tokenized_datasets")
    print("Loaded Tokenized Dataset")
except FileNotFoundError:
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    # Drop the raw text columns; the Trainer expects only model inputs.
    tokenized_datasets = tokenized_datasets.remove_columns(['sql', 'question', 'schema'])

    tokenized_datasets.save_to_disk("tokenized_datasets")
    print("Tokenized and Saved Dataset")


# Reuse a previously fine-tuned checkpoint if one exists; otherwise start
# from the base model and train below.
try:
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("finetuned_model_2_epoch")
    finetuned_model = finetuned_model.to(device)
    to_train = False
except OSError:  # raised when the checkpoint directory does not exist
    to_train = True
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    finetuned_model = finetuned_model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)


if to_train:
    output_dir = f'./sql-training-{str(int(time.time()))}'

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=5e-3,
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        logging_steps=50,
        evaluation_strategy='steps',  # renamed to eval_strategy in recent transformers releases
        eval_steps=500,
    )
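
    # Note: the plain Trainer reports only eval loss during training. For
    # generation metrics (e.g. ROUGE) at eval time, the usual route would be
    # Seq2SeqTrainer with Seq2SeqTrainingArguments(predict_with_generate=True).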

    trainer = Trainer(
        model=finetuned_model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
    )

    trainer.train()

    finetuned_model.save_pretrained("finetuned_model_2_epoch")
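    # Optional but conventional: persist the tokenizer next to the weights so
    # the checkpoint folder can be reloaded standalone.
    tokenizer.save_pretrained("finetuned_model_2_epoch")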


# Compare the base and fine-tuned models on the held-out test split.
questions = dataset['test']['question']
contexts = dataset['test']['schema']
human_baseline_answers = dataset['test']['sql']

original_model_answers = []
finetuned_model_answers = []

for idx, question in enumerate(questions):
    prompt = f"""Tables:
{contexts[idx]}

Question:
{question}

Answer:
"""

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    input_ids = input_ids.to(device)

    original_model_outputs = original_model.generate(input_ids=input_ids,
                                                     generation_config=GenerationConfig(max_new_tokens=300))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_answers.append(original_model_text_output)

    finetuned_model_outputs = finetuned_model.generate(input_ids=input_ids,
                                                       generation_config=GenerationConfig(max_new_tokens=300))
    finetuned_model_text_output = tokenizer.decode(finetuned_model_outputs[0], skip_special_tokens=True)
    finetuned_model_answers.append(finetuned_model_text_output)

# Collect references and both models' predictions side by side.
zipped_summaries = list(zip(human_baseline_answers, original_model_answers, finetuned_model_answers))

df = pd.DataFrame(zipped_summaries,
                  columns=['human_baseline_answers', 'original_model_answers', 'finetuned_model_answers'])
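
# Eyeball a few predictions (optional).
print(df.head(10))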


# ROUGE as a rough lexical-overlap score between generated and reference SQL.
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_answers,
    references=human_baseline_answers[0:len(original_model_answers)],
    use_aggregator=True,
    use_stemmer=True,
)
print('ORIGINAL MODEL:')
print(original_model_results)

finetuned_model_results = rouge.compute(
    predictions=finetuned_model_answers,
    references=human_baseline_answers[0:len(finetuned_model_answers)],
    use_aggregator=True,
    use_stemmer=True,
)
print('FINE-TUNED MODEL:')
print(finetuned_model_results)
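

# ROUGE rewards n-gram overlap, which only loosely tracks SQL correctness. As
# a complementary check, a hedged sketch of whitespace/case-normalized exact
# match (assumption: those are the only differences worth forgiving):
def normalized_exact_match(predictions, references):
    def norm(s):
        return ' '.join(s.lower().split())
    matches = sum(norm(p) == norm(r) for p, r in zip(predictions, references))
    return matches / len(references)

print('EXACT MATCH (original):', normalized_exact_match(original_model_answers, human_baseline_answers))
print('EXACT MATCH (fine-tuned):', normalized_exact_match(finetuned_model_answers, human_baseline_answers))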