Spaces:
Running
Running
| #!/usr/bin/env python | |
| # coding=utf-8 | |
| # Copyright 2020 The HuggingFace Inc. team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """ | |
| Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. | |
| Here is the full list of checkpoints on the hub that can be fine-tuned by this script: | |
| https://huggingface.co/models?filter=text-generation | |
| """ | |
| # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. | |
| import logging | |
| import math | |
| import os | |
| # disable logging until training starts | |
| os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' | |
| import sys | |
| from dataclasses import dataclass, field | |
| from itertools import chain | |
| from typing import Optional | |
| import datasets | |
| import evaluate | |
| import torch | |
| from datasets import load_dataset | |
| import transformers | |
| from transformers import ( | |
| CONFIG_MAPPING, | |
| MODEL_FOR_CAUSAL_LM_MAPPING, | |
| AutoConfig, | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| HfArgumentParser, | |
| Trainer, | |
| TrainingArguments, | |
| default_data_collator, | |
| is_torch_tpu_available, | |
| set_seed, | |
| ) | |
| from transformers.testing_utils import CaptureLogger | |
| from transformers.trainer_utils import get_last_checkpoint | |
| from transformers.utils import check_min_version, send_example_telemetry | |
| from transformers.utils.versions import require_version | |
| from transformers import AutoModel, AutoTokenizer | |
| from datasets import load_dataset | |
| from transformers.testing_utils import CaptureLogger | |
| from itertools import chain | |
| logger = logging.getLogger(__name__) | |
def get_score(submission_folder="../env"):
    """Evaluate a fine-tuned causal LM on the babyLM test split and return its perplexity.

    Loads the model and tokenizer saved under ``<submission_folder>/output/``,
    tokenizes and chunks the ``babyLM-10M`` test split into ``block_size``-token
    examples, runs ``Trainer.evaluate`` with token-level accuracy as an extra
    metric, and returns ``exp(eval_loss)``.

    Args:
        submission_folder: Directory containing the ``babyLM_for_hf.py`` dataset
            script and the trained model/tokenizer under ``output/``.

    Returns:
        float: Perplexity of the model on the test split (``inf`` if the loss
        overflows ``math.exp``).
    """
    training_args = TrainingArguments("test_trainer")
    training_args.report_to = []  # disable wandb/tensorboard/etc. reporting

    raw_datasets = load_dataset(
        submission_folder + "/babyLM_for_hf.py", "babyLM-10M", split="test"
    )
    model = AutoModelForCausalLM.from_pretrained(submission_folder + "/output/")
    tokenizer = AutoTokenizer.from_pretrained(submission_folder + "/output/")

    # Preprocessing the dataset: first we tokenize all the texts.
    column_names = list(raw_datasets.features)
    text_column_name = "text" if "text" in column_names else column_names[0]

    # Force logger creation before tokenize_function is pickled by the datasets
    # Hasher, to avoid a _LazyModule error.
    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

    def tokenize_function(examples):
        # Tokenize one batch of raw texts; long inputs are chunked later.
        with CaptureLogger(tok_logger) as cl:
            output = tokenizer(examples[text_column_name])
        # clm input could be much much longer than block_size
        if "Token indices sequence length is longer than the" in cl.out:
            tok_logger.warning(
                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                " before being passed to the model."
            )
        return output

    with training_args.main_process_first(desc="dataset map tokenization"):
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=column_names,
        )

    # Cap the chunk length at 1024 tokens; some tokenizers report an extremely
    # large model_max_length that would not fit the model's context window.
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        logger.warning(
            "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
            " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
            " override this default with `--block_size xxx`."
        )
        block_size = 1024

    # Main data processing function that will concatenate all texts from our
    # dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; if total_length < block_size this batch
        # is excluded entirely (empty chunk list). Padding could be added here
        # instead if the model supported it.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        # Causal LM: labels are the inputs themselves (shift happens in-model).
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together,
    # so group_texts throws away a remainder for each of those groups of 1,000
    # texts. You can adjust that batch_size here, but a higher value might be
    # slower to preprocess.
    with training_args.main_process_first(desc="grouping texts together"):
        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
        )

    eval_dataset = lm_datasets

    def preprocess_logits_for_metrics(logits, labels):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first.
        if isinstance(logits, tuple):
            logits = logits[0]
        # Reduce to predicted token ids so compute_metrics gets small tensors.
        return logits.argmax(dim=-1)

    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # preds have the same shape as the labels after the argmax(-1) done in
        # preprocess_logits_for_metrics, but we still need to shift the labels
        # by one position for next-token accuracy.
        labels = labels[:, 1:].reshape(-1)
        preds = preds[:, :-1].reshape(-1)
        return metric.compute(predictions=preds, references=labels)

    # Initialize our Trainer (evaluation only: no train dataset).
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=None,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        # Data collator would default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    transformers.utils.logging.set_verbosity(transformers.utils.logging.WARNING)

    # Evaluation
    metrics = trainer.evaluate()
    metrics["eval_samples"] = len(eval_dataset)
    try:
        perplexity = math.exp(metrics["eval_loss"])
    except OverflowError:
        perplexity = float("inf")
    metrics["perplexity"] = perplexity
    return perplexity
if __name__ == "__main__":
    # Script entry point: evaluate the submitted model and print its perplexity.
    score = get_score()
    print(score)