Spaces:
Sleeping
Sleeping
File size: 1,719 Bytes
dc59b01 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | from datasets import Dataset
from transformers import T5Tokenizer
import pandas as pd
def _drop_index_columns(frame):
    """Return *frame* without leftover index columns from a CSV round-trip.

    Drops every column whose name contains "index" (case-insensitive) and
    every column that pandas auto-named "Unnamed: N". The latter is the name
    ``read_csv`` gives to an index that was written by ``DataFrame.to_csv``
    without ``index=False``; a substring check for "index" alone misses it.
    """
    leftovers = [
        name
        for name in frame.columns
        if "index" in name.lower() or name.startswith("Unnamed:")
    ]
    # The names come from the frame itself, so no errors="ignore" is needed.
    return frame.drop(columns=leftovers)


print("Loading processed dataset...")
train = pd.read_csv("../data/processed/train.csv")
val = pd.read_csv("../data/processed/validation.csv")

# remove hidden pandas index column if exists
train = _drop_index_columns(train)
val = _drop_index_columns(val)

print("Loading tokenizer (t5-small)...")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Task prefix that tokenize() prepends to every model input.
SQL_PREFIX = "translate English to SQL: "
# ------------------------------------------------------
# TOKENIZATION FUNCTION
# ------------------------------------------------------
def tokenize(example):
    """Tokenize one dataset row into model inputs and training labels.

    Args:
        example: A single row (mapping) with an ``"input"`` string
            (schema + question, per the original comment) and an ``"sql"``
            string holding the target query.

    Returns:
        The tokenizer output for the prefixed input, with the tokenized SQL
        target stored under ``"labels"``. Both sides are truncated/padded to
        256 tokens.

    Padding positions in ``"labels"`` are replaced by -100 so that the
    cross-entropy loss ignores them; left as pad token ids they would be
    scored as real targets. NOTE(review): any consumer that decodes the
    stored labels must map -100 back to the pad token id first.
    """
    # input = schema + question
    input_text = SQL_PREFIX + example["input"]

    # target = real SQL
    target_sql = example["sql"]

    model_inputs = tokenizer(
        input_text,
        text_target=target_sql,
        max_length=256,
        padding="max_length",
        truncation=True,
    )

    # Mask the padded tail of the target so it does not contribute to the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        -100 if token_id == pad_id else token_id
        for token_id in model_inputs["labels"]
    ]
    return model_inputs
# ------------------------------------------------------
# DATASET CONVERSION
# ------------------------------------------------------
print("Preparing dataset...")
# Wrap both pandas frames as Hugging Face datasets (train first, then validation).
train_ds, val_ds = [Dataset.from_pandas(frame) for frame in (train, val)]

# Tokenize each split; the raw columns are dropped so that only the fields
# returned by tokenize() remain.
print("Tokenizing train...")
train_ds = train_ds.map(
    tokenize,
    remove_columns=train_ds.column_names,
)

print("Tokenizing validation...")
val_ds = val_ds.map(
    tokenize,
    remove_columns=val_ds.column_names,
)

# Persist the tokenized splits, train before validation.
for _split, _out_dir in (
    (train_ds, "../data/tokenized/train"),
    (val_ds, "../data/tokenized/validation"),
):
    _split.save_to_disk(_out_dir)

print("DONE ✔ Tokenized dataset saved correctly")
|