init

Files changed (7) hide show

README.md +51 -0
all_results.json +7 -0
config.json +36 -0
pytorch_model.bin +3 -0
special_tokens_map.json +9 -0
tokenizer_config.json +17 -0
vocab.txt +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,51 @@

+---
+language:
+- ko
+metrics:
+- accuracy
+pipeline_tag: text-classification
+# Optional. Add this if you want to encode your eval results in a structured way.
+model-index:
+- name: ko-answerable
+  results:
+  - task:
+      type: text-classification             # Required. Example: automatic-speech-recognition
+      name: text-classification             # Optional. Example: Speech Recognition
+    metrics:
+      - type: eval_accuracy
+        value: 0.892
+        name: eval_accuracy
+        verified: false
+      - type: test_accuracy
+        value: 0.837
+        name: test_accuracy
+        verified: false
+---
+# ko-answerable: Passage로 Question에 답변할 수 있는가?에 대한 이진 분류
+## Model Details
+SelfCheckGPT의 Answerable model에 감명받아 제작하게 되었습니다. (https://arxiv.org/abs/2303.08896)
+[monologg/kobigbird-bert-base](https://huggingface.co/monologg/kobigbird-bert-base) 모델을 사용하여 [BigBirdForSequenceClassification](https://huggingface.co/docs/transformers/v4.33.0/en/model_doc/big_bird#transformers.BigBirdForSequenceClassification)으로 Fine-Tune 되었습니다.
+Max Seq Len: 4096
+Input Text Style: \<BOS\>Question\<SEP\>Title\<SEP\>Passage\<EOS\>
+Return: 1: 응답 불가능, 0: 응답 가능 (sigmoid score 사용 가능)
+사용된 데이터셋 (해당 데이터셋에서 'is_impossible'을 기준으로 50:50으로 랜덤 추출(0,1 비중이 맞도록))
+1.   KLUE
+2.   AIHub-도서자료 기계독해
+3.   AIHub-뉴스 기사 기계독해 데이터
+4.   AIHub-행정 문서 대상 기계독해 데이터
+5.   표기반 질의응답 데이터 (매튜님에게 개인적으로 받음)
+`AIHub-기계독해` 데이터도 존재하나, 데이터 구조가 다른 데이터셋과 달라 전처리가 복잡하므로 제외함.
+예측 시간: 건당 평균 0.05초 이내 (RTX 3090 사용)
+사용 GPU MEM: About 20GB (Seq가 길면 많이 먹음)

all_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 10.0,
+    "train_loss": 0.07339318460044116,
+    "train_runtime": 488254.3014,
+    "train_samples_per_second": 19.153,
+    "train_steps_per_second": 0.299
+}

config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_name_or_path": "monologg/kobigbird-bert-base",
+  "architectures": [
+    "BigBirdForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "attention_type": "original_full",
+  "block_size": 64,
+  "bos_token_id": 5,
+  "classifier_dropout": null,
+  "eos_token_id": 6,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu_new",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 4096,
+  "model_type": "big_bird",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "num_random_blocks": 3,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "rescale_embeddings": false,
+  "sep_token_id": 3,
+  "tokenizer_class": "BertTokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.32.0",
+  "type_vocab_size": 2,
+  "use_bias": true,
+  "use_cache": true,
+  "vocab_size": 32500
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6d0e8c491d76d948cc56b8a8b5b76a6e01f4a28951fde29001066d70afb4b37
+size 457450105

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "[CLS]",
+  "eos_token": "</s>",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "eos_token": "</s>",
+  "mask_token": "[MASK]",
+  "model_max_length": 4096,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff