import pandas as pd
from datasets import Dataset
def load_data(tokenizer, path="data/ats_dataset.csv", test_size=0.2, seed=None):
    """Load the ATS CSV dataset, tokenize it, and split into train/test sets.

    Args:
        tokenizer: A callable (e.g. a Hugging Face tokenizer) applied to each
            example's "text" field; must accept ``padding`` and ``truncation``
            keyword arguments.
        path: Location of the input CSV. Defaults to the original hard-coded
            path, so existing callers are unaffected.
        test_size: Fraction of examples held out for the test split.
        seed: Optional shuffle seed for a reproducible split. ``None`` keeps
            the original non-deterministic behavior.

    Returns:
        A ``datasets.DatasetDict`` with "train" and "test" splits of the
        tokenized dataset.
    """
    df = pd.read_csv(path)
    dataset = Dataset.from_pandas(df)

    def preprocess(example):
        # Pad to max length and truncate longer texts so every example ends
        # up with a uniform sequence length for batching.
        return tokenizer(example["text"], padding="max_length", truncation=True)

    tokenized_dataset = dataset.map(preprocess)
    return tokenized_dataset.train_test_split(test_size=test_size, seed=seed)