Spaces:
Runtime error
Runtime error
Add apply_chat_template process
Browse files- spanish_medica_llm.py +11 -1
spanish_medica_llm.py
CHANGED
|
@@ -403,6 +403,10 @@ def tokenize(element, tokenizer):
|
|
| 403 |
input_batch.append(input_ids)
|
| 404 |
return {"input_ids": input_batch}
|
| 405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
def splitDatasetInTestValid(dataset):
|
| 407 |
"""
|
| 408 |
"""
|
|
@@ -705,8 +709,14 @@ def run_finnetuning_process():
|
|
| 705 |
tokenizer = loadSpanishTokenizer()
|
| 706 |
medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning())
|
| 707 |
print ( tokenizer.apply_chat_template(medicalSpanishDataset[5]['raw_text'], tokenize=False))
|
| 708 |
-
|
| 709 |
print('----------------------------------------------------------')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 710 |
medicalSpanishDataset = tokenizer.apply_chat_template(medicalSpanishDataset, tokenize=False)
|
| 711 |
medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
|
| 712 |
train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
|
|
|
|
| 403 |
input_batch.append(input_ids)
|
| 404 |
return {"input_ids": input_batch}
|
| 405 |
|
| 406 |
+
def apply_chat_template(example, tokenizer):
|
| 407 |
+
example['raw_text'] = tokenizer.apply_chat_template(example['raw_text'], tokenize=False)
|
| 408 |
+
return example
|
| 409 |
+
|
| 410 |
def splitDatasetInTestValid(dataset):
|
| 411 |
"""
|
| 412 |
"""
|
|
|
|
| 709 |
tokenizer = loadSpanishTokenizer()
|
| 710 |
medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning())
|
| 711 |
print ( tokenizer.apply_chat_template(medicalSpanishDataset[5]['raw_text'], tokenize=False))
|
| 712 |
+
|
| 713 |
print('----------------------------------------------------------')
|
| 714 |
+
medicalSpanishDataset = medicalSpanishDataset.map(apply_chat_template,
|
| 715 |
+
num_proc = os.cpu_count(),
|
| 716 |
+
fn_kwargs = {'tokenizer':tokenizer},
|
| 717 |
+
remove_columns = [col for col in medicalSpanishDataset.features if col not in ['raw_text']],
|
| 718 |
+
desc = 'Applying chat template'
|
| 719 |
+
)
|
| 720 |
medicalSpanishDataset = tokenizer.apply_chat_template(medicalSpanishDataset, tokenize=False)
|
| 721 |
medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
|
| 722 |
train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
|