YooSungHyun committed · Commit 3b5ca91 · 1 Parent(s): 25d4c6f
README.md ADDED
@@ -0,0 +1,51 @@
+ ---
+ language:
+ - ko
+ metrics:
+ - accuracy
+ pipeline_tag: text-classification
+
+ # Optional. Add this if you want to encode your eval results in a structured way.
+ model-index:
+ - name: ko-answerable
+   results:
+   - task:
+       type: text-classification # Required. Example: automatic-speech-recognition
+       name: text-classification # Optional. Example: Speech Recognition
+     metrics:
+     - type: eval_accuracy
+       value: 0.892
+       name: eval_accuracy
+       verified: false
+     - type: test_accuracy
+       value: 0.837
+       name: test_accuracy
+       verified: false
+ ---
+ # ko-answerable: binary classification of whether a Passage can answer a Question
+
+ ## Model Details
+
+ Inspired by the Answerable model in SelfCheckGPT (https://arxiv.org/abs/2303.08896).
+
+ Fine-tuned from [monologg/kobigbird-bert-base](https://huggingface.co/monologg/kobigbird-bert-base) as a [BigBirdForSequenceClassification](https://huggingface.co/docs/transformers/v4.33.0/en/model_doc/big_bird#transformers.BigBirdForSequenceClassification) model.
+
+ Max Seq Len: 4096
+
+ Input Text Style: \<BOS\>Question\<SEP\>Title\<SEP\>Passage\<EOS\>
+
+ Return: 1 = unanswerable, 0 = answerable (a sigmoid score can also be used)
+
+ Datasets used (examples were randomly sampled from each dataset at a 50:50 ratio on 'is_impossible', so that the 0/1 labels are balanced):
+
+ 1. KLUE
+ 2. AIHub - book-material machine reading comprehension
+ 3. AIHub - news-article machine reading comprehension data
+ 4. AIHub - administrative-document machine reading comprehension data
+ 5. Table-based question answering data (received personally from Matthew)
+
+ An `AIHub-기계독해` (machine reading comprehension) dataset also exists, but it was excluded because its structure differs enough to make preprocessing complicated.
+
+ Inference time: under 0.05 seconds per example on average (on an RTX 3090)
+
+ GPU memory used: about 20 GB (long sequences consume considerably more)
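A minimal inference sketch for the input style and label convention above. The repo id `YooSungHyun/ko-answerable` and the manual special-token assembly (with `add_special_tokens=False`) are assumptions, not confirmed by the card:

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

repo_id = "YooSungHyun/ko-answerable"  # assumed repo id for this card

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(repo_id)
model.eval()

question = "한국의 수도는 어디인가?"
title = "대한민국"
passage = "대한민국의 수도는 서울이다."

# Card's input style: <BOS>Question<SEP>Title<SEP>Passage<EOS>
text = (
    f"{tokenizer.bos_token}{question}"
    f"{tokenizer.sep_token}{title}"
    f"{tokenizer.sep_token}{passage}"
    f"{tokenizer.eos_token}"
)
inputs = tokenizer(
    text,
    add_special_tokens=False,  # the template above already supplies them
    truncation=True,
    max_length=4096,
    return_tensors="pt",
)

with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, 2)

pred = logits.argmax(dim=-1).item()                     # 1 = unanswerable, 0 = answerable
prob_unanswerable = logits.softmax(dim=-1)[0, 1].item()  # score in [0, 1]
print(pred, prob_unanswerable)
```

The card mentions a sigmoid score; with two logits, a softmax over the pair (or a sigmoid on the logit difference) yields the same ranking of answerability.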
all_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "epoch": 10.0,
+     "train_loss": 0.07339318460044116,
+     "train_runtime": 488254.3014,
+     "train_samples_per_second": 19.153,
+     "train_steps_per_second": 0.299
+ }
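For scale, these figures imply roughly 488254.3 × 0.299 ≈ 146k optimizer steps and 488254.3 × 19.153 ≈ 9.35M samples seen across the 10 epochs (about 935k samples per epoch), with an effective batch size of about 19.153 / 0.299 ≈ 64 samples per step. These are derived, rounded estimates, not figures reported by the author.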
config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_name_or_path": "monologg/kobigbird-bert-base",
+   "architectures": [
+     "BigBirdForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "attention_type": "original_full",
+   "block_size": 64,
+   "bos_token_id": 5,
+   "classifier_dropout": null,
+   "eos_token_id": 6,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu_new",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 4096,
+   "model_type": "big_bird",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "num_random_blocks": 3,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "rescale_embeddings": false,
+   "sep_token_id": 3,
+   "tokenizer_class": "BertTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.32.0",
+   "type_vocab_size": 2,
+   "use_bias": true,
+   "use_cache": true,
+   "vocab_size": 32500
+ }
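Note that `attention_type` is `original_full`, i.e. full quadratic self-attention over up to 4096 tokens, which is consistent with the card's ~20 GB memory note. BigBird also supports `block_sparse` attention; a hedged sketch of overriding it at load time (the repo id is assumed, and since the checkpoint was trained with full attention, accuracy under block-sparse attention is untested):

```python
from transformers import AutoModelForSequenceClassification

# Config overrides passed to from_pretrained update the loaded config.
# This checkpoint was trained with "original_full"; accuracy under
# block_sparse attention is an untested assumption.
model = AutoModelForSequenceClassification.from_pretrained(
    "YooSungHyun/ko-answerable",    # assumed repo id
    attention_type="block_sparse",  # replaces "original_full" from config.json
    block_size=64,
    num_random_blocks=3,
)
```

Transformers falls back to full attention (with a warning) when the input sequence is too short for block-sparse attention to apply.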
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6d0e8c491d76d948cc56b8a8b5b76a6e01f4a28951fde29001066d70afb4b37
+ size 457450105
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "[CLS]",
+   "eos_token": "</s>",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "eos_token": "</s>",
+   "mask_token": "[MASK]",
+   "model_max_length": 4096,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
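One detail worth calling out: the BOS/EOS tokens are `<s>`/`</s>` rather than BERT's usual `[CLS]`/`[SEP]`, which matters when assembling the `<BOS>…<EOS>` input template from the card. A quick check (same assumed repo id) that the tokenizer's special tokens resolve to the ids declared in config.json:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("YooSungHyun/ko-answerable")  # assumed repo id

# Expected ids from config.json: bos=5, eos=6, sep=3, pad=0.
print(tokenizer.bos_token, tokenizer.bos_token_id)  # <s> 5
print(tokenizer.eos_token, tokenizer.eos_token_id)  # </s> 6
print(tokenizer.sep_token, tokenizer.sep_token_id)  # [SEP] 3
print(tokenizer.pad_token, tokenizer.pad_token_id)  # [PAD] 0
print(tokenizer.model_max_length)                   # 4096
```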
vocab.txt ADDED
The diff for this file is too large to render. See raw diff