Spaces:
Running
Running
| from datasets import Dataset | |
| from transformers import T5Tokenizer | |
| import pandas as pd | |
# Read the processed train/validation splits from CSV.
print("Loading processed dataset...")
train = pd.read_csv("../data/processed/train.csv")
val = pd.read_csv("../data/processed/validation.csv")

# Drop every column whose name contains "index" (case-insensitive), such as
# a pandas index that was written out to the CSV.
_train_index_columns = [name for name in train.columns if "index" in name.lower()]
train = train.drop(columns=_train_index_columns, errors="ignore")
_val_index_columns = [name for name in val.columns if "index" in name.lower()]
val = val.drop(columns=_val_index_columns, errors="ignore")

print("Loading tokenizer (t5-small)...")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Task prefix prepended to every model input in tokenize().
SQL_PREFIX = "translate English to SQL: "
# ------------------------------------------------------
# TOKENIZATION FUNCTION
# ------------------------------------------------------
def tokenize(example):
    """Tokenize one dataset row into fixed-length model inputs and labels.

    Args:
        example: Mapping with an "input" string (schema + question) and an
            "sql" string (the target query).

    Returns:
        The tokenizer output for the prefixed input text, including a
        "labels" entry built from the target SQL. Inputs and labels are
        both padded/truncated to 256 tokens. Padding positions in
        "labels" are set to -100 so the loss ignores them.
    """
    # input = task prefix + (schema + question)
    input_text = SQL_PREFIX + example["input"]
    # target = real SQL
    target_sql = example["sql"]

    model_inputs = tokenizer(
        input_text,
        text_target=target_sql,
        max_length=256,
        padding="max_length",
        truncation=True,
    )

    # With padding="max_length" the labels are filled with the pad token id.
    # Left as-is, the training loss would be computed on (and dominated by)
    # padding positions. -100 is the index ignored by the cross-entropy loss.
    # NOTE(review): code that decodes these labels (e.g. an evaluation metric)
    # must map -100 back to tokenizer.pad_token_id first.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        -100 if token_id == pad_token_id else token_id
        for token_id in model_inputs["labels"]
    ]
    return model_inputs
# ------------------------------------------------------
# DATASET CONVERSION
# ------------------------------------------------------
print("Preparing dataset...")
train_ds = Dataset.from_pandas(train)
val_ds = Dataset.from_pandas(val)

# Tokenize each split row by row; the original columns are removed so that
# only the tokenizer outputs remain in the saved datasets.
print("Tokenizing train...")
_train_source_columns = train_ds.column_names
train_ds = train_ds.map(tokenize, remove_columns=_train_source_columns)

print("Tokenizing validation...")
_val_source_columns = val_ds.column_names
val_ds = val_ds.map(tokenize, remove_columns=_val_source_columns)

# Persist the tokenized splits to disk.
train_ds.save_to_disk("../data/tokenized/train")
val_ds.save_to_disk("../data/tokenized/validation")

print("DONE ✔ Tokenized dataset saved correctly")