from datasets import Dataset
from transformers import T5Tokenizer
import pandas as pd

print("Loading processed dataset...")
train = pd.read_csv("../data/processed/train.csv")
val   = pd.read_csv("../data/processed/validation.csv")

# drop any stray pandas index column that survived the CSV round-trip
train = train.drop(columns=[c for c in train.columns if "index" in c.lower()], errors="ignore")
val   = val.drop(columns=[c for c in val.columns if "index" in c.lower()], errors="ignore")
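
# Quick sanity check (assumes the processed CSVs expose the "input" and
# "sql" columns used by tokenize() below):
assert {"input", "sql"} <= set(train.columns), f"unexpected columns: {list(train.columns)}"
print(f"Loaded {len(train)} train rows and {len(val)} validation rows")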

print("Loading tokenizer (t5-small)...")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

SQL_PREFIX = "translate English to SQL: "

# ------------------------------------------------------
# TOKENIZATION FUNCTION
# ------------------------------------------------------
def tokenize(example):
    """Tokenize one row: the prefixed input text and its SQL target."""
    # input = schema + question
    input_text = SQL_PREFIX + example["input"]

    # target = real SQL
    target_sql = example["sql"]

    # text_target tokenizes the SQL into the "labels" field; max_length,
    # padding, and truncation apply to both the inputs and the targets here
    model_inputs = tokenizer(
        input_text,
        text_target=target_sql,
        max_length=256,
        padding="max_length",
        truncation=True,
    )

    return model_inputs
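
# Optional sanity check on a made-up row (illustrative only, not from the
# real dataset): tokenize it and decode the result back to confirm the
# prefix, padding, and labels come out as expected.
_sample = {"input": "tables: users(id, name) | question: list all user names",
           "sql": "SELECT name FROM users"}
_enc = tokenize(_sample)
print("sample input :", tokenizer.decode(_enc["input_ids"], skip_special_tokens=True))
print("sample labels:", tokenizer.decode(_enc["labels"], skip_special_tokens=True))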


# ------------------------------------------------------
# DATASET CONVERSION
# ------------------------------------------------------
print("Preparing dataset...")
train_ds = Dataset.from_pandas(train, preserve_index=False)
val_ds   = Dataset.from_pandas(val, preserve_index=False)

print("Tokenizing train...")
train_ds = train_ds.map(tokenize, remove_columns=train_ds.column_names)

print("Tokenizing validation...")
val_ds = val_ds.map(tokenize, remove_columns=val_ds.column_names)

# save tokenized dataset
train_ds.save_to_disk("../data/tokenized/train")
val_ds.save_to_disk("../data/tokenized/validation")

print("DONE ✔ Tokenized dataset saved correctly")