from datasets import load_dataset
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a padding token; reuse the EOS token so that
# padding="max_length" in tokenize_function() does not raise an error.
tokenizer.pad_token = tokenizer.eos_token
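
# A quick, illustrative sanity check (not part of the original pipeline):
# with the pad token set, fixed-length padding fills sequences out to the
# tokenizer's model_max_length (1024 for GPT-2).
sample = tokenizer("Hello world", padding="max_length", truncation=True)
assert len(sample["input_ids"]) == tokenizer.model_max_length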

# Hugging Face Hub dataset identifiers, grouped by task category.
dataset_names = {
    "conversation": [
        "bavard/personachat_truecased",
        "li2017dailydialog/daily_dialog",
        "ssbuild/alpaca_convai2",
    ],
    "coding": [
        "lvwerra/stack-exchange-paired",
        "iamtarun/python_code_instructions_18k_alpaca",
        "code-search-net/code_search_net",
    ],
    "math": [
        "allenai/math_qa",
        "qfq/openaimath",
        "meta-math/MetaMathQA",
    ],
}

def tokenize_function(examples):
    # NOTE: this assumes every dataset exposes a "text" column; most of the
    # datasets listed above use other field names, so the relevant columns
    # need to be mapped into "text" first (see the sketch below).
    return tokenizer(examples["text"], padding="max_length", truncation=True)
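
# The datasets above use heterogeneous schemas, so a preprocessing step that
# produces the "text" column is needed first. Below is a minimal, illustrative
# sketch (the helper name and the join-all-string-columns strategy are
# assumptions, not part of the original code): it concatenates every
# string-valued column of a batch into a single "text" field.
def add_text_column(examples):
    # With batched=True, each value in `examples` is a list of column entries.
    text_keys = [
        k for k, v in examples.items()
        if isinstance(v, list) and v and isinstance(v[0], str)
    ]
    batch_size = len(next(iter(examples.values())))
    examples["text"] = [
        " ".join(examples[k][i] for k in text_keys) for i in range(batch_size)
    ]
    return examples
# It would be applied before tokenization, e.g.:
#   dataset = dataset.map(add_text_column, batched=True)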

def load_and_process_datasets(dataset_names):
    """Download each dataset and tokenize every split with the GPT-2 tokenizer."""
    processed_datasets = {}

    for category, names in dataset_names.items():
        category_data = []

        for name in names:
            print(f"Loading dataset: {name}")
            # Some of these datasets (e.g. code_search_net) define multiple
            # configurations, in which case load_dataset() needs an explicit
            # config name as a second argument.
            dataset = load_dataset(name)

            # map() tokenizes every split of the returned DatasetDict in batches.
            tokenized_data = dataset.map(tokenize_function, batched=True)
            category_data.append(tokenized_data)

        processed_datasets[category] = category_data

    return processed_datasets

processed_datasets = load_and_process_datasets(dataset_names)
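
# A brief usage sketch of the result (illustrative): `processed_datasets`
# maps each category to a list of tokenized DatasetDicts, one per Hub
# dataset, each keyed by its original splits.
for category, tokenized_list in processed_datasets.items():
    for tokenized in tokenized_list:
        print(category, {split: len(ds) for split, ds in tokenized.items()})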