import pandas as pd
from datasets import Dataset


def load_data(tokenizer, csv_path="data/ats_dataset.csv", test_size=0.2):
    """Load a CSV dataset, tokenize its "text" column, and split train/test.

    Args:
        tokenizer: A callable (e.g. a Hugging Face tokenizer) applied to each
            example's "text" field with ``padding="max_length"`` and
            ``truncation=True``.
        csv_path: Path to the source CSV file. Assumes it contains a "text"
            column — TODO confirm against the actual data file.
        test_size: Fraction of rows held out for the test split.

    Returns:
        The result of ``Dataset.train_test_split`` — a dict-like object with
        "train" and "test" splits of tokenized examples.
    """
    df = pd.read_csv(csv_path)
    dataset = Dataset.from_pandas(df)

    def preprocess(example):
        # Pad/truncate every example to the tokenizer's max length so all
        # rows end up the same sequence length.
        return tokenizer(example["text"], padding="max_length", truncation=True)

    tokenized = dataset.map(preprocess)
    return tokenized.train_test_split(test_size=test_size)