| | from sklearn.model_selection import train_test_split |
| | from datasets import Dataset, DatasetDict, load_dataset, interleave_datasets, load_from_disk |
| | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer |
| | import torch |
| | import time |
| | import evaluate |
| | import pandas as pd |
| | import numpy as np |
# Name of the base checkpoint; the tokenizer is loaded from the same name.
model_name = 't5-small'

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Baseline model, loaded in bfloat16 and placed on the GPU.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
original_model = original_model.to('cuda')

# Fine-tuned checkpoint identified by "finetuned_model_2_epoch".
# Put it on the GPU as well so it shares a device with the inputs it is later
# asked to generate from; a CPU-resident model fed CUDA tensors makes
# generate() fail with a device-mismatch error.
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("finetuned_model_2_epoch")
finetuned_model = finetuned_model.to('cuda')
| | |
| | |
| |
|
# NOTE(review): `data` is not defined in the visible part of this script; it
# must be provided before this point (presumably a loaded dataset with
# "question" and "sql" columns - confirm).

# Table schema shown to the model; hard-coded rather than read from `data`.
context = "CREATE TABLE table_name_11 (date VARCHAR, away_team VARCHAR)"

# First question and its reference SQL from `data`.
question = data["question"][0]
answer = data["sql"][0]

# Prompt layout: a "Tables:" section, a "Question:" section, and a trailing
# "Answer:" header for the model to complete.
prompt = (
    "Tables:\n"
    f"{context}\n"
    "\n"
    "Question:\n"
    f"{question}\n"
    "\n"
    "Answer:\n"
)
| |
|
# Tokenize the prompt into PyTorch tensors and place them on whichever device
# the fine-tuned model occupies, so generate() never receives tensors that
# live on a different device than the model weights.
inputs = tokenizer(prompt, return_tensors='pt')
inputs = inputs.to(finetuned_model.device)

# Generate at most 200 new tokens. The attention mask is passed explicitly
# alongside the input ids instead of leaving generate() to infer it.
generated_ids = finetuned_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=200,
)

# Decode the first (only) generated sequence, dropping special tokens.
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
| |
|
# Print the prompt, the reference answer, and the model output, each preceded
# by a 100-character dashed separator line.
dash_line = '-' * 100
print(
    dash_line,
    f'INPUT PROMPT:\n{prompt}',
    dash_line,
    f'BASELINE HUMAN ANSWER:\n{answer}\n',
    dash_line,
    f'MODEL GENERATION - ZERO SHOT:\n{output}',
    sep='\n',
)