Vigen1 committed on
Commit
14eefac
·
verified ·
1 Parent(s): 7580af0

Upload t5.py

Browse files
Files changed (1) hide show
  1. t5.py +152 -0
t5.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.model_selection import train_test_split
2
+ from datasets import Dataset, DatasetDict, load_dataset, interleave_datasets, load_from_disk
3
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
4
+ import torch
5
+ import time
6
+ import evaluate
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
model_name = 't5-small'

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Untouched pretrained checkpoint; the script later generates with it as the
# baseline that the fine-tuned model is compared against.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
original_model = original_model.to('cuda')

# Raw CSV as a DataFrame; not used further in the visible part of the script.
data = pd.read_csv("text-to-sql_from_spider.csv")

# Fixed seed so the splits are identical on every run. The script reuses a
# tokenized dataset and a fine-tuned checkpoint cached on disk by earlier
# runs; with an unseeded shuffle the fresh "test" split could contain rows
# that the cached model was trained on.
split_seed = 42

# 60% train, then the remaining 40% is halved into validation and test.
dataset = load_dataset("csv", data_files="text-to-sql_from_spider.csv")
dataset = dataset["train"].train_test_split(test_size=0.4, seed=split_seed)
test_dataset = dataset["test"].train_test_split(test_size=0.5, seed=split_seed)
print(dataset["train"])
dataset = DatasetDict({"train": dataset["train"],
                       "test": test_dataset["test"],
                       "validation": test_dataset["train"]})
27
+
28
+
29
def tokenize_function(example):
    """Tokenize one batch of text-to-SQL rows for seq2seq training.

    Meant to be passed to ``Dataset.map(..., batched=True)``, so every value
    in ``example`` is a list with one entry per row.

    Args:
        example: Batch with the columns ``schema``, ``question`` and ``sql``.

    Returns:
        The same batch with three added columns: ``input_ids`` and
        ``attention_mask`` for the prompt, and ``labels`` for the target SQL.
    """
    start_prompt = "Tables:\n"
    middle_prompt = "\n\nQuestion:\n"
    end_prompt = "\n\nAnswer:\n"

    prompts = [
        start_prompt + context + middle_prompt + question + end_prompt
        for context, question in zip(example['schema'], example['question'])
    ]

    model_inputs = tokenizer(prompts, padding="max_length", truncation=True)
    target_ids = tokenizer(example['sql'], padding="max_length", truncation=True)["input_ids"]

    example['input_ids'] = model_inputs["input_ids"]
    # Inputs are padded to max_length; without the mask the encoder would
    # attend to the padding positions.
    example['attention_mask'] = model_inputs["attention_mask"]

    # -100 is the label value the loss ignores, so padded target positions
    # do not contribute to training.
    pad_id = tokenizer.pad_token_id
    example['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_ids
    ]

    return example
44
+
45
# Reuse the tokenized dataset cached by an earlier run; build and cache it
# only when no cache directory exists. Catching just FileNotFoundError keeps
# real failures (corrupt cache, Ctrl-C, bugs in tokenize_function) visible
# instead of silently re-tokenizing.
try:
    tokenized_datasets = load_from_disk("tokenized_datasets")
except FileNotFoundError:
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    # Drop the raw text columns so only model inputs remain.
    tokenized_datasets = tokenized_datasets.remove_columns(['sql', 'question', 'schema'])

    tokenized_datasets.save_to_disk("tokenized_datasets")
    print("Tokenized and Saved Dataset")
else:
    print("Loaded Tokenized Dataset")
54
+ # tokenized_datasets = dataset.map(tokenize_function, batched=True)
55
+
56
+ # print(tokenized_datasets["train"][0]["input_ids"])
57
+
58
+
59
# Load a previously fine-tuned checkpoint if one was saved; otherwise start
# from the base model and flag that training is required. Only the load call
# sits inside the try, and only OSError (what from_pretrained raises for a
# missing or unreadable checkpoint) is treated as "no checkpoint", so CUDA
# errors and interrupts are no longer mistaken for a missing model.
try:
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("finetuned_model_2_epoch")
except OSError:
    to_train = True
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    finetuned_model = finetuned_model.to('cuda')
    # Reload the base tokenizer alongside the base model.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
    finetuned_model = finetuned_model.to('cuda')
    to_train = False
69
+
70
# Fine-tune only when no saved checkpoint could be loaded.
if to_train:
    # Timestamped output directory, one per training run.
    run_stamp = int(time.time())
    output_dir = f'./sql-training-{run_stamp}'

    hyperparameters = {
        'learning_rate': 5e-3,
        'num_train_epochs': 2,
        'per_device_train_batch_size': 16,  # batch size per device during training
        'per_device_eval_batch_size': 16,   # batch size for evaluation
        'weight_decay': 0.01,
        'logging_steps': 50,
        'evaluation_strategy': 'steps',     # evaluate on a step schedule
        'eval_steps': 500,                  # number of steps between evaluations
    }
    training_args = TrainingArguments(output_dir=output_dir, **hyperparameters)

    trainer = Trainer(
        model=finetuned_model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
    )
    trainer.train()

    # Persist the weights under the name the loading step looks for.
    finetuned_model.save_pretrained("finetuned_model_2_epoch")
95
+
96
# Generate SQL for every held-out question with both the base model and the
# fine-tuned model, collecting the decoded outputs for scoring.
questions = dataset['test']['question']
contexts = dataset['test']['schema']
human_baseline_answers = dataset['test']['sql']

original_model_answers = []
finetuned_model_answers = []

# Loop-invariant: build the generation settings once instead of twice per
# example.
generation_config = GenerationConfig(max_new_tokens=300)

for context, question in zip(contexts, questions):
    # Same layout as the training prompt built in tokenize_function; explicit
    # "\n" escapes keep source indentation from leaking into the prompt.
    prompt = f"Tables:\n{context}\n\nQuestion:\n{question}\n\nAnswer:\n"

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    input_ids = input_ids.to('cuda')

    original_model_outputs = original_model.generate(input_ids=input_ids,
                                                     generation_config=generation_config)
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_answers.append(original_model_text_output)

    finetuned_model_outputs = finetuned_model.generate(input_ids=input_ids,
                                                       generation_config=generation_config)
    finetuned_model_text_output = tokenizer.decode(finetuned_model_outputs[0], skip_special_tokens=True)
    finetuned_model_answers.append(finetuned_model_text_output)
127
+
128
# Side-by-side table of the reference SQL and both models' outputs.
zipped_summaries = list(zip(human_baseline_answers, original_model_answers, finetuned_model_answers))
comparison_columns = ['human_baseline_answers', 'original_model_answers', 'finetuned_model_answers']
df = pd.DataFrame(zipped_summaries, columns=comparison_columns)

rouge = evaluate.load('rouge')

# Score each model's predictions against the reference SQL with ROUGE; the
# references are sliced to the number of predictions.
original_count = len(original_model_answers)
original_model_results = rouge.compute(
    predictions=original_model_answers,
    references=human_baseline_answers[:original_count],
    use_aggregator=True,
    use_stemmer=True,
)
print('ORIGINAL MODEL:', original_model_results, sep='\n')

finetuned_count = len(finetuned_model_answers)
finetuned_model_results = rouge.compute(
    predictions=finetuned_model_answers,
    references=human_baseline_answers[:finetuned_count],
    use_aggregator=True,
    use_stemmer=True,
)
print('FINE-TUNED MODEL:', finetuned_model_results, sep='\n')