multimolecule-splicebert-finetuned/eval_metrics_splicebert.csv ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,step,eval_loss,eval_seq_accuracy,eval_F1,eval_ideal_threshold
2
+ 0,1000,0.006815282162278891,0.4026603852016073,0.6171703296703297,0.4752318561077118
3
+ 1,2000,0.0037505668587982655,0.6617708188998199,0.7406869859700048,0.5850344896316528
4
+ 2,3000,0.003279034746810794,0.6894138838852709,0.7554345953767182,0.592792809009552
5
+ 3,4000,0.0029538299422711134,0.709436053761951,0.7667828668070272,0.5514923930168152
6
+ 4,5000,0.002750302664935589,0.7192046556741029,0.7722696660417968,0.4618184566497803
7
+ 5,6000,0.002676873467862606,0.7314673687127615,0.7782719854306027,0.5094878077507019
8
+ 6,7000,0.0027683533262461424,0.7301510322848829,0.7810940135632897,0.5408661961555481
9
+ 7,8000,0.002569204429164529,0.735277816267147,0.7846375222694258,0.4193652868270874
10
+ 8,9000,0.002614411525428295,0.744353609533047,0.7860623781676414,0.5125839710235596
11
+ 9,10000,0.002571880118921399,0.7435915200221699,0.7914545895225235,0.5403604507446289
12
+ 10,11000,0.002500804141163826,0.7438686434806706,0.792448951623955,0.4591529965400696
13
+ 11,12000,0.0024977801367640495,0.7506581682139393,0.7933212996389891,0.5195016264915466
14
+ 12,13000,0.002506256103515625,0.750519606484689,0.7949502408833562,0.5634196996688843
15
+ 13,14000,0.002466367557644844,0.750173202161563,0.7933188390792698,0.47746482491493225
16
+ 14,15000,0.0024979824665933847,0.7483026188166828,0.7927380664756842,0.5055601596832275
17
+ 15,16000,0.0024439790286123753,0.7586254676458363,0.7993190188312139,0.5041056871414185
18
+ 16,17000,0.0024133624974638224,0.7607038935845919,0.8025286392847164,0.5179569125175476
19
+ 17,18000,0.0023823431693017483,0.7622973534709713,0.8020388483262157,0.46387553215026855
20
+ 18,19000,0.0023969090543687344,0.7575862546764583,0.8003836141937252,0.4617359936237335
21
+ 19,20000,0.0023910230956971645,0.7623666343355965,0.8018734717773874,0.4711677134037018
22
+ 20,21000,0.0023792830761522055,0.7651378689206041,0.8037143550119539,0.4961932897567749
23
+ 21,22000,0.002382500097155571,0.7649993071913538,0.8038154699965314,0.4926639199256897
24
+ 22,23000,0.002373825293034315,0.7658306775668561,0.8052274792088895,0.506659209728241
25
+ 23,24000,0.0023699423763900995,0.7652071497852293,0.8055755827925979,0.46972033381462097
26
+ 24,25000,0.002361688995733857,0.7708188998198697,0.8070101647388713,0.5162570476531982
27
+ 25,26000,0.00235151220113039,0.768740473881114,0.8066535256077739,0.48440903425216675
28
+ 26,27000,0.0023619181010872126,0.7682555078287377,0.8066019819757605,0.4802227020263672
multimolecule-splicebert-finetuned/final_model/config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "multimolecule/splicebert",
3
+ "architectures": [
4
+ "SpliceBertForTokenPrediction"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head": {
10
+ "act": null,
11
+ "bias": true,
12
+ "dropout": 0.0,
13
+ "hidden_size": 512,
14
+ "layer_norm_eps": 1e-12,
15
+ "num_labels": 2,
16
+ "output_name": null,
17
+ "problem_type": "single_label_classification",
18
+ "transform": null,
19
+ "transform_act": "gelu"
20
+ },
21
+ "hidden_act": "gelu",
22
+ "hidden_dropout": 0.1,
23
+ "hidden_size": 512,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 2048,
26
+ "layer_norm_eps": 1e-12,
27
+ "lm_head": {
28
+ "act": null,
29
+ "bias": true,
30
+ "dropout": 0.0,
31
+ "hidden_size": null,
32
+ "layer_norm_eps": 1e-12,
33
+ "output_name": null,
34
+ "transform": "nonlinear",
35
+ "transform_act": "gelu"
36
+ },
37
+ "mask_token_id": 4,
38
+ "max_position_embeddings": 1026,
39
+ "model_type": "splicebert",
40
+ "null_token_id": 5,
41
+ "num_attention_heads": 16,
42
+ "num_hidden_layers": 6,
43
+ "pad_token_id": 0,
44
+ "position_embedding_type": "absolute",
45
+ "problem_type": "single_label_classification",
46
+ "torch_dtype": "float32",
47
+ "transformers_version": "4.46.3",
48
+ "type_vocab_size": 2,
49
+ "unk_token_id": 3,
50
+ "use_cache": true,
51
+ "vocab_size": 26
52
+ }
multimolecule-splicebert-finetuned/final_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7285e3200d1b8a727c904eabee8190dbff4eecc090be018200ef726cc13c0ee
3
+ size 78887280
multimolecule-splicebert-finetuned/final_model/special_tokens_map.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<null>"
4
+ ],
5
+ "bos_token": {
6
+ "content": "<cls>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "cls_token": {
13
+ "content": "<cls>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eos_token": {
20
+ "content": "<eos>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "mask_token": {
27
+ "content": "<mask>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ },
33
+ "pad_token": {
34
+ "content": "<pad>",
35
+ "lstrip": false,
36
+ "normalized": false,
37
+ "rstrip": false,
38
+ "single_word": false
39
+ },
40
+ "sep_token": {
41
+ "content": "<eos>",
42
+ "lstrip": false,
43
+ "normalized": false,
44
+ "rstrip": false,
45
+ "single_word": false
46
+ },
47
+ "unk_token": {
48
+ "content": "<unk>",
49
+ "lstrip": false,
50
+ "normalized": false,
51
+ "rstrip": false,
52
+ "single_word": false
53
+ }
54
+ }
multimolecule-splicebert-finetuned/final_model/tokenizer_config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<cls>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<eos>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<null>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "additional_special_tokens": [
53
+ "<null>"
54
+ ],
55
+ "bos_token": "<cls>",
56
+ "clean_up_tokenization_spaces": true,
57
+ "cls_token": "<cls>",
58
+ "codon": false,
59
+ "eos_token": "<eos>",
60
+ "mask_token": "<mask>",
61
+ "model_max_length": 1026,
62
+ "nmers": 1,
63
+ "pad_token": "<pad>",
64
+ "replace_T_with_U": true,
65
+ "sep_token": "<eos>",
66
+ "tokenizer_class": "RnaTokenizer",
67
+ "unk_token": "<unk>"
68
+ }
multimolecule-splicebert-finetuned/final_model/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce47261154b49be1dcb1e977db3183bb281e4c8a0cc142d3caa177772d66d24a
3
+ size 5368
multimolecule-splicebert-finetuned/final_model/vocab.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <pad>
2
+ <cls>
3
+ <eos>
4
+ <unk>
5
+ <mask>
6
+ <null>
7
+ A
8
+ C
9
+ G
10
+ U
11
+ N
12
+ R
13
+ Y
14
+ S
15
+ W
16
+ K
17
+ M
18
+ B
19
+ D
20
+ H
21
+ V
22
+ .
23
+ X
24
+ *
25
+ -
26
+ I