| |
| """korscideberta.ipynb |
| |
| Automatically generated by Colaboratory. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/1vJNUG_F5El5LY8xmmwRVXo66bYBfXtdz |
| """ |
|
|
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
# Optional one-time Colab environment setup, kept as reference inside bare
# string literals so nothing here runs on import.
'''
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# %cd Mecab-ko-for-Google-Colab/
! bash install_mecab-ko_on_colab_light_220429.sh
# %cd ..
!pip install datasets transformers[sentencepiece]
'''

# Alternative dependency pinning (disabled).
'''
!pip install -U accelerate; pip install -U transformers; pip install pydantic==1.8
'''

# Show the current working directory (IPython shell magic).
!pwd
| |
|
|
# Disabled reference commands. The Korean comments inside the string say:
# "[Required] download this code and the tokenizer in a Linux terminal" and
# "[Required] install the libraries (for Mecab etc. see the
# KorSciDeBERTa environment-setup + fine-tuning PDF)".
'''
#[ํ์]๋ฆฌ๋
์ค ํฐ๋ฏธ๋์์ ๋ณธ ์ฝ๋ ๋ฐ ํ ํฌ๋์ด์ ๋ค์ด๋ก๋
#git clone https://huggingface.co/kisti/korscideberta
#cd korscideberta

#[ํ์]๋ผ์ด๋ธ๋ฌ๋ฆฌ ์ค์น(Mecab ๋ฑ ์์ธํ ์ค์น ๋ฐฉ๋ฒ์ KorSciDeBERTaํ๊ฒฝ์ค์น+ํ์ธํ๋.pdf ์ฐธ์กฐ)
!apt install git-lfs

'''
|
|
from datasets import load_dataset
import datasets
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub; required for push_to_hub below.
notebook_login()
| |
|
|
# Base model repository on the Hugging Face Hub.
model_repository = "kisti/korscideberta"

from transformers import AutoTokenizer
# Project-local tokenizer implementation shipped with the korscideberta repo.
from tokenization_korscideberta_v2 import DebertaV2Tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained(model_repository)
# Smoke-test: tokenize a sentence wrapped in the model's special tokens.
out = tokenizer.tokenize("<cls> ํ๊ตญ์ด ๋ชจ๋ธ์ <s> ํ๊ตญ์ด ๋ชจ๋ธ์ ๊ณต์ ํฉ๋๋ค. <s>")
print(str(out))
|
|
| |
| |
| |
# Load the abstract-annotation CSV, shuffle deterministically, and hold out
# 10% of the rows as the test split.
dataset = load_dataset('csv', data_files='data/Abstract_Annotation_Data_tagsentence.csv', split='train')
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.1)
print("dataset:", str(dataset))

from datasets import ClassLabel
# Collect the distinct 'tag' values; sorting makes the label-id mapping stable
# across runs.
labels = [x for x in dataset['train']['tag']]
labels = list(set(labels))
labels.sort()
num_labels = len(labels)
print('Labels: '+str(labels)[:200])
ClassLabels = ClassLabel(num_classes=len(labels), names=labels)
|
|
def preprocess_function(example):
    """Tokenize a single example and attach its integer class label.

    The sentence is wrapped in the model's '<cls>' / '<s>' special tokens and
    truncated to 512 tokens; the string 'tag' is mapped to its class id via
    the module-level ClassLabels feature.
    """
    wrapped = '<cls>' + example["sentence"] + '<s>'
    encoded = tokenizer(wrapped, max_length=512, truncation=True)
    encoded['labels'] = ClassLabels.str2int(example['tag'])
    return encoded
| |
# Tokenize every example (batched=False: preprocess_function gets one row at
# a time).
tokenized_datasets = dataset.map(preprocess_function, batched=False)
# Cast 'labels' to a ClassLabel feature so downstream tooling knows the names.
tokenized_datasets = tokenized_datasets.cast_column("labels", ClassLabel(names=labels))

# Spot-check one tokenized example, then persist the processed dataset.
random_id = 1
print("Input IDS:", tokenized_datasets["train"][random_id]["input_ids"])
print("Labels:", tokenized_datasets["train"][random_id]["labels"])
tokenized_datasets.save_to_disk('data/tok')
|
|
| |
from transformers import AutoModelForSequenceClassification

num_labels = len(labels)


def model_init():
    """Return a fresh classification model on top of the pretrained encoder.

    A factory (rather than a bare instance) keeps re-initialization possible,
    e.g. for repeated training runs.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_repository,
        num_labels=num_labels,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
    )


model = model_init()
|
|
| |
from transformers import DataCollatorWithPadding
# Dynamically pad each batch to its longest sequence at collate time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
from collections import Counter
# Show the label distribution of the held-out split.
print("Test:", Counter(tokenized_datasets["test"]["labels"]))

from datasets import load_metric
# NOTE(review): datasets.load_metric is deprecated (removed in datasets 3.x);
# the 'evaluate' library is the replacement — confirm the pinned version.
accuracy = load_metric("accuracy")
|
|
import numpy as np


def compute_metrics(pred):
    """Compute classification accuracy for a transformers EvalPrediction.

    Args:
        pred: object with `predictions` (logits, shape [n, num_labels]) and
            `label_ids` (gold class ids, length n).

    Returns:
        dict with a single 'accuracy' key, a float in [0.0, 1.0].
    """
    pred_classes = np.argmax(pred.predictions, axis=-1)
    labels = np.asarray(pred.label_ids)
    # Compute accuracy directly with numpy instead of going through the
    # deprecated datasets.load_metric("accuracy") handle (removed in
    # datasets>=3.0); the result is identical: the mean of exact matches.
    return {"accuracy": float(np.mean(pred_classes == labels))}
|
|
| |
| |
| |
|
|
import gc
gc.collect()
from transformers import TrainingArguments

# Fine-tuning hyper-parameters. Checkpoints/logs go to ./deberta_sent4455 and
# are pushed to the Hub (requires the notebook_login() above). Evaluation and
# logging happen on step intervals rather than per epoch.
training_args = TrainingArguments(
    output_dir="deberta_sent4455",
    num_train_epochs=4,
    learning_rate=1.5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    fp16=True,
    # BUGFIX: Apex AMP optimization levels are "O0"-"O3" with a capital
    # letter O; the original "01" (zero-one) is invalid and fails at runtime
    # when the Apex fp16 backend is selected.
    fp16_opt_level="O1",
    warmup_steps=500,
    logging_steps=200,
    save_steps=2000,
    eval_steps=500,
    push_to_hub=True,
    evaluation_strategy="steps",
)
|
|
| |
import gc
gc.collect()

from transformers import Trainer
# Wire the model, datasets, collator and metric together and fine-tune.
trainer = Trainer(
    args=training_args,
    compute_metrics=compute_metrics,
    model=model,

    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)
train_metrics = trainer.train().metrics
trainer.save_metrics("train", train_metrics)
# Upload the final model and training card to the Hub repo from output_dir.
trainer.push_to_hub()
| |
|
|
| |
| |
# --- One-time Mecab (Korean morphological analyzer) installation ---
# Download and build mecab-ko from source (IPython shell magics).
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh); cd mecab-0.996-ko-0.9.2;
!chmod 775 ./configure; ./configure; make; chmod 775 tests/*.sh; make check; make install

!pwd

# Build and install the mecab-ko dictionary.
!cd mecab-ko-dic-2.1.1-20180720; chmod 775 ./autogen.sh; ./autogen.sh; ./configure; make

!pwd
!ls

# Unpack the korscideberta distribution next to this notebook.
! unzip korscideberta.zip -d korscideberta; cd korscideberta

!pwd

# Install project requirements and a CUDA-11.1 PyTorch stack, then install
# the korscideberta package itself ("pip install .").
! pip3 install -r requirements.txt; pip install --upgrade nltk;
!pip uninstall -y torch torchtext torch-tensorrt; pip install --upgrade pip; pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html --default-timeout=100; pip install setuptools_scm six mlflow; pip install "numpy<1.24.0"; pip install .