koichi12 committed on
Commit
77c6d8e
·
verified ·
1 Parent(s): 08d7121

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. scripts/yans/eval/lm-evaluation-harness/models/abeja-gpt-neox-japanese-2.7b/harness.jsquad-1.2.sh +3 -0
  2. scripts/yans/eval/lm-evaluation-harness/models/abeja-gpt-neox-japanese-2.7b/harness.sh +3 -0
  3. scripts/yans/eval/lm-evaluation-harness/models/abeja-gpt-neox-japanese-2.7b/result.json +59 -0
  4. scripts/yans/eval/lm-evaluation-harness/models/abeja-gpt-neox-japanese-2.7b/result.jsquad-1.2.json +22 -0
  5. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-1b_1.3.0/README.md +6 -0
  6. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-1b_1.3.0/harness.sh +12 -0
  7. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-1b_1.3.0/result.json +48 -0
  8. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-3b_1.3.0/README.md +6 -0
  9. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-3b_1.3.0/harness.sh +12 -0
  10. scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-1.7b-instruction-sft/harness.sh +3 -0
  11. scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-1.7b-instruction-sft/result.json +71 -0
  12. scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-1.7b/harness.sh +3 -0
  13. scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-1.7b/result.json +71 -0
  14. scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-3.6b-instruction-sft/harness.sh +3 -0
  15. scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-3.6b-instruction-sft/result.json +71 -0
  16. scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-3.6b/harness.sh +3 -0
  17. scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-3.6b/result.json +71 -0
  18. scripts/yans/eval/lm-evaluation-harness/models/rinna/harness.conf +4 -0
  19. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/result.json +71 -0
  20. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-bilingual-gpt-neox-4b/harness.jsquad-1.2.sh +3 -0
  21. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-bilingual-gpt-neox-4b/harness.sh +3 -0
  22. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-bilingual-gpt-neox-4b/result.json +71 -0
  23. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-bilingual-gpt-neox-4b/result.jsquad-1.2.json +22 -0
  24. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-1b/harness.jsquad-1.2.sh +3 -0
  25. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-1b/harness.sh +3 -0
  26. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-1b/result.json +71 -0
  27. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-1b/result.jsquad-1.2.json +22 -0
  28. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/harness.jsquad-1.2.sh +3 -0
  29. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/harness.sh +3 -0
  30. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.json +71 -0
  31. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json +22 -0
  32. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.mgsm.json +0 -0
  33. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b/harness.jsquad-1.2.sh +3 -0
  34. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b/harness.sh +3 -0
  35. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b/result.json +71 -0
  36. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b/result.jsquad-1.2.json +22 -0
  37. scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b/result.mgsm.json +0 -0
  38. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-compact-v1/harness.sh +13 -0
  39. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-compact-v1/result.json +71 -0
  40. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1-sl2k-300b/harness.sh +13 -0
  41. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1-sl2k-300b/result.json +48 -0
  42. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1-sl2k-slw-300b/harness.sh +13 -0
  43. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1-sl2k-slw-300b/result.json +71 -0
  44. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1_rp-sl2k-slw-300b/harness.sh +18 -0
  45. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1_rp-sl2k-slw-300b/result.json +71 -0
  46. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-rp_then_jav1-294b/harness.sh +18 -0
  47. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-rp_then_jav1-294b/result.json +71 -0
  48. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-3b-ja50_rp50-700b/harness_template-0.1.sh +12 -0
  49. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-3b-ja50_rp50-700b/harness_template-0.2.sh +12 -0
  50. scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-3b-ja50_rp50-700b/result_template-0.1.json +71 -0
scripts/yans/eval/lm-evaluation-harness/models/abeja-gpt-neox-japanese-2.7b/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=abeja/gpt-neox-japanese-2.7b,device_map=auto,torch_dtype=auto"
2
+ TASK="jsquad-1.2-0.2"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3" --device "cuda" --output_path "models/abeja-gpt-neox-japanese-2.7b/result.jsquad-1.2.json"
scripts/yans/eval/lm-evaluation-harness/models/abeja-gpt-neox-japanese-2.7b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=abeja/gpt-neox-japanese-2.7b"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,xlsum_ja"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1" --device "cuda" --output_path "models/abeja-gpt-neox-japanese-2.7b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/abeja-gpt-neox-japanese-2.7b/result.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.20017873100983022,
5
+ "acc_stderr": 0.011966979264632673,
6
+ "acc_norm": 0.22609472743521,
7
+ "acc_norm_stderr": 0.012510314229861862
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.3972884141331142,
11
+ "acc_stderr": 0.009920570907906705,
12
+ "acc_norm": 0.34798685291700904,
13
+ "acc_norm_stderr": 0.009656917922100158
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.749912800837112,
17
+ "acc_stderr": 0.005719527388015089,
18
+ "acc_norm": 0.749912800837112,
19
+ "acc_norm_stderr": 0.005719527388015089
20
+ },
21
+ "jsquad-1.1-0.2": {
22
+ "exact_match": 13.665015758667266,
23
+ "f1": 22.909453892411364
24
+ },
25
+ "xlsum_ja": {
26
+ "rouge2": 6.149952794206885
27
+ },
28
+ "xwinograd_ja": {
29
+ "acc": 0.6037539103232534,
30
+ "acc_stderr": 0.01580264261655725
31
+ }
32
+ },
33
+ "versions": {
34
+ "jcommonsenseqa-1.1-0.2": 1.1,
35
+ "jnli-1.1-0.2": 1.1,
36
+ "jsquad-1.1-0.2": 1.1,
37
+ "marc_ja-1.1-0.2": 1.1,
38
+ "xlsum_ja": 1.0,
39
+ "xwinograd_ja": 1.0
40
+ },
41
+ "config": {
42
+ "model": "hf-causal",
43
+ "model_args": "pretrained=abeja/gpt-neox-japanese-2.7b",
44
+ "num_fewshot": [
45
+ 2,
46
+ 3,
47
+ 3,
48
+ 3,
49
+ 1,
50
+ 0
51
+ ],
52
+ "batch_size": null,
53
+ "device": "cuda",
54
+ "no_cache": false,
55
+ "limit": null,
56
+ "bootstrap_iters": 100000,
57
+ "description_dict": {}
58
+ }
59
+ }
scripts/yans/eval/lm-evaluation-harness/models/abeja-gpt-neox-japanese-2.7b/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.2-0.2": {
4
+ "exact_match": 15.803692030616839,
5
+ "f1": 25.18326978234071
6
+ }
7
+ },
8
+ "versions": {
9
+ "jsquad-1.2-0.2": 1.2
10
+ },
11
+ "config": {
12
+ "model": "hf-causal",
13
+ "model_args": "pretrained=abeja/gpt-neox-japanese-2.7b,device_map=auto,torch_dtype=auto",
14
+ "num_fewshot": 3,
15
+ "batch_size": null,
16
+ "device": "cuda",
17
+ "no_cache": false,
18
+ "limit": null,
19
+ "bootstrap_iters": 100000,
20
+ "description_dict": {}
21
+ }
22
+ }
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-1b_1.3.0/README.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # cyberagent-open-calm-instruct-1b_1.3.0
2
+ - This is a supervised finetuned version of the base model [`cyberagent/open-calm-1b`](https://huggingface.co/cyberagent/open-calm-1b).
3
+ - The base model was fine-tuned on the datasets below by [Stability AI Japan](https://ja.stability.ai/).
4
+ - [japanese_hh-rlhf-49k](https://huggingface.co/datasets/fujiki/japanese_hh-rlhf-49k)
5
+ - [databricks-dolly-15k-ja](https://huggingface.co/datasets/kunishou/databricks-dolly-15k-ja)
6
+ - [japanese_alpaca_data](https://huggingface.co/datasets/fujiki/japanese_alpaca_data)
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-1b_1.3.0/harness.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -eu
3
+ PROJECT_DIR=""
4
+ MODEL_ARGS="pretrained=${PROJECT_DIR}/instruction_tuning/outputs/open-calm-instruct-1b_1.3.0,tokenizer=cyberagent/open-calm-1b"
5
+ TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
6
+ python main.py \
7
+ --model hf-causal \
8
+ --model_args $MODEL_ARGS \
9
+ --tasks $TASK \
10
+ --num_fewshot "2,3,3,3" \
11
+ --device "cuda" \
12
+ --output_path "models/open-calm-instruct-1b_1.3.0/result.json"
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-1b_1.3.0/result.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.3": {
4
+ "acc": 0.7015192135835567,
5
+ "acc_stderr": 0.013685386698397504,
6
+ "acc_norm": 0.6255585344057194,
7
+ "acc_norm_stderr": 0.014474549079455518
8
+ },
9
+ "jnli-1.1-0.3": {
10
+ "acc": 0.3011503697617091,
11
+ "acc_stderr": 0.00930063317508552,
12
+ "acc_norm": 0.25842235004108466,
13
+ "acc_norm_stderr": 0.008875080429298606
14
+ },
15
+ "marc_ja-1.1-0.3": {
16
+ "acc": 0.877431906614786,
17
+ "acc_stderr": 0.004361701432875794,
18
+ "acc_norm": 0.877431906614786,
19
+ "acc_norm_stderr": 0.004361701432875794
20
+ },
21
+ "jsquad-1.1-0.3": {
22
+ "exact_match": 35.929761368752814,
23
+ "f1": 45.27144783040928
24
+ }
25
+ },
26
+ "versions": {
27
+ "jcommonsenseqa-1.1-0.3": 1.1,
28
+ "jnli-1.1-0.3": 1.1,
29
+ "jsquad-1.1-0.3": 1.1,
30
+ "marc_ja-1.1-0.3": 1.1
31
+ },
32
+ "config": {
33
+ "model": "hf-causal",
34
+ "model_args": "pretrained=${PROJECT_DIR}/instruction_tuning/outputs/open-calm-instruct-1b_1.3.0,tokenizer=cyberagent/open-calm-1b",
35
+ "num_fewshot": [
36
+ 2,
37
+ 3,
38
+ 3,
39
+ 3
40
+ ],
41
+ "batch_size": null,
42
+ "device": "cuda",
43
+ "no_cache": false,
44
+ "limit": null,
45
+ "bootstrap_iters": 100000,
46
+ "description_dict": {}
47
+ }
48
+ }
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-3b_1.3.0/README.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # cyberagent-open-calm-instruct-3b_1.3.0
2
+ - This is a supervised finetuned version of the base model [`cyberagent/open-calm-3b`](https://huggingface.co/cyberagent/open-calm-3b).
3
+ - The base model was fine-tuned on the datasets below by [Stability AI Japan](https://ja.stability.ai/).
4
+ - [japanese_hh-rlhf-49k](https://huggingface.co/datasets/fujiki/japanese_hh-rlhf-49k)
5
+ - [databricks-dolly-15k-ja](https://huggingface.co/datasets/kunishou/databricks-dolly-15k-ja)
6
+ - [japanese_alpaca_data](https://huggingface.co/datasets/fujiki/japanese_alpaca_data)
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-3b_1.3.0/harness.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -eu
3
+ PROJECT_DIR=""
4
+ MODEL_ARGS="pretrained=${PROJECT_DIR}/instruction_tuning/outputs/open-calm-instruct-3b_1.3.0,tokenizer=cyberagent/open-calm-3b"
5
+ TASK="jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jsquad-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
6
+ python main.py \
7
+ --model hf-causal \
8
+ --model_args $MODEL_ARGS \
9
+ --tasks $TASK \
10
+ --num_fewshot "3,3,3,2,1,1,0,5" \
11
+ --device "cuda" \
12
+ --output_path "models/community/cyberagent-open-calm-instruct-3b_1.3.0/result.json"
scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-1.7b-instruction-sft/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=line-corporation/japanese-large-lm-1.7b-instruction-sft,use_fast=False,device_map=auto,torch_dtype=auto"
2
+ TASK="jcommonsenseqa-1.1-0.5,jnli-1.1-0.5,marc_ja-1.1-0.5,jsquad-1.1-0.5,jaqket_v2-0.2-0.5,xlsum_ja-1.0-0.5,xwinograd_ja,mgsm-1.0-0.5"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/line-corporation/line-corporation-japanese-large-lm-1.7b-instruction-sft/result.json"
scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-1.7b-instruction-sft/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.5": {
4
+ "acc": 0.22430741733690795,
5
+ "acc_stderr": 0.012475148816050531,
6
+ "acc_norm": 0.23681858802502234,
7
+ "acc_norm_stderr": 0.01271454677969028
8
+ },
9
+ "jnli-1.1-0.5": {
10
+ "acc": 0.34346754313886607,
11
+ "acc_stderr": 0.009627197865307401,
12
+ "acc_norm": 0.3011503697617091,
13
+ "acc_norm_stderr": 0.009300633175085522
14
+ },
15
+ "marc_ja-1.1-0.5": {
16
+ "acc": 0.8036788114609126,
17
+ "acc_stderr": 0.005283057698929343,
18
+ "acc_norm": 0.8036788114609126,
19
+ "acc_norm_stderr": 0.005283057698929343
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6329509906152242,
23
+ "acc_stderr": 0.015572714283682185
24
+ },
25
+ "jsquad-1.1-0.5": {
26
+ "exact_match": 30.977037370553806,
27
+ "f1": 48.12415333506568
28
+ },
29
+ "jaqket_v2-0.2-0.5": {
30
+ "exact_match": 25.257731958762886,
31
+ "f1": 40.58191140665372
32
+ },
33
+ "xlsum_ja-1.0-0.5": {
34
+ "rouge2": 1.0385441084792033
35
+ },
36
+ "mgsm-1.0-0.5": {
37
+ "acc": 0.016,
38
+ "acc_stderr": 0.007951661188874354
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.5": 1.1,
43
+ "jnli-1.1-0.5": 1.1,
44
+ "marc_ja-1.1-0.5": 1.1,
45
+ "jsquad-1.1-0.5": 1.1,
46
+ "jaqket_v2-0.2-0.5": 0.2,
47
+ "xlsum_ja-1.0-0.5": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm-1.0-0.5": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=line-corporation/japanese-large-lm-1.7b-instruction-sft,use_fast=False,device_map=auto,torch_dtype=auto",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-1.7b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=line-corporation/japanese-large-lm-1.7b,use_fast=False,device_map=auto,torch_dtype=auto"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.2-0.2,xlsum_ja,xwinograd_ja,mgsm"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/line-corporation/line-corporation-japanese-large-lm-1.7b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-1.7b/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.30831099195710454,
5
+ "acc_stderr": 0.013811124479483034,
6
+ "acc_norm": 0.26005361930294907,
7
+ "acc_norm_stderr": 0.013119300343161644
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.35949055053410023,
11
+ "acc_stderr": 0.009728266419780814,
12
+ "acc_norm": 0.300328677074774,
13
+ "acc_norm_stderr": 0.00929339473482123
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.745136186770428,
17
+ "acc_stderr": 0.005796054001130057,
18
+ "acc_norm": 0.745136186770428,
19
+ "acc_norm_stderr": 0.005796054001130057
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6611053180396246,
23
+ "acc_stderr": 0.015292727421996942
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 56.55110310670869,
27
+ "f1": 69.46989310703984
28
+ },
29
+ "jaqket_v2-0.2-0.2": {
30
+ "exact_match": 52.06185567010309,
31
+ "f1": 60.433303332787865
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 8.408787633129647
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.0,
38
+ "acc_stderr": 0.0
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.2-0.2": 0.2,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=line-corporation/japanese-large-lm-1.7b,use_fast=False,device_map=auto,torch_dtype=auto",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-3.6b-instruction-sft/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=line-corporation/japanese-large-lm-3.6b-instruction-sft,use_fast=False,device_map=auto,torch_dtype=auto"
2
+ TASK="jcommonsenseqa-1.1-0.5,jnli-1.1-0.5,marc_ja-1.1-0.5,jsquad-1.1-0.5,jaqket_v2-0.2-0.5,xlsum_ja-1.0-0.5,xwinograd_ja,mgsm-1.0-0.5"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/line-corporation/line-corporation-japanese-large-lm-3.6b-instruction-sft/result.json"
scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-3.6b-instruction-sft/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.5": {
4
+ "acc": 0.3360142984807864,
5
+ "acc_stderr": 0.014126590011265207,
6
+ "acc_norm": 0.26720285969615726,
7
+ "acc_norm_stderr": 0.013234012242081952
8
+ },
9
+ "jnli-1.1-0.5": {
10
+ "acc": 0.4256368118323747,
11
+ "acc_stderr": 0.010024017935515625,
12
+ "acc_norm": 0.3019720624486442,
13
+ "acc_norm_stderr": 0.009307836171755053
14
+ },
15
+ "marc_ja-1.1-0.5": {
16
+ "acc": 0.5509373894587902,
17
+ "acc_stderr": 0.006615536639080702,
18
+ "acc_norm": 0.5509373894587902,
19
+ "acc_norm_stderr": 0.006615536639080702
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6465067778936392,
23
+ "acc_stderr": 0.015445228301221386
24
+ },
25
+ "jsquad-1.1-0.5": {
26
+ "exact_match": 44.371904547501124,
27
+ "f1": 59.516773934435584
28
+ },
29
+ "jaqket_v2-0.2-0.5": {
30
+ "exact_match": 39.86254295532646,
31
+ "f1": 51.98299576521227
32
+ },
33
+ "xlsum_ja-1.0-0.5": {
34
+ "rouge2": 6.577976426409143
35
+ },
36
+ "mgsm-1.0-0.5": {
37
+ "acc": 0.024,
38
+ "acc_stderr": 0.009699087026964249
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.5": 1.1,
43
+ "jnli-1.1-0.5": 1.1,
44
+ "marc_ja-1.1-0.5": 1.1,
45
+ "jsquad-1.1-0.5": 1.1,
46
+ "jaqket_v2-0.2-0.5": 0.2,
47
+ "xlsum_ja-1.0-0.5": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm-1.0-0.5": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=line-corporation/japanese-large-lm-3.6b-instruction-sft,use_fast=False,device_map=auto,torch_dtype=auto",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-3.6b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=line-corporation/japanese-large-lm-3.6b,use_fast=False,device_map=auto,torch_dtype=auto"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.2-0.2,xlsum_ja,xwinograd_ja,mgsm"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/line-corporation/line-corporation-japanese-large-lm-3.6b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/line-corporation/line-corporation-japanese-large-lm-3.6b/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.24039320822162646,
5
+ "acc_stderr": 0.01278011066769292,
6
+ "acc_norm": 0.2421805183199285,
7
+ "acc_norm_stderr": 0.0128124322893179
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.29950698438783896,
11
+ "acc_stderr": 0.009286120768078254,
12
+ "acc_norm": 0.30156121610517667,
13
+ "acc_norm_stderr": 0.009304239098715018
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.7939511850017686,
17
+ "acc_stderr": 0.005379506895071017,
18
+ "acc_norm": 0.7939511850017686,
19
+ "acc_norm_stderr": 0.005379506895071017
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.7028154327424401,
23
+ "acc_stderr": 0.014765597190000436
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 62.26924808644755,
27
+ "f1": 74.52057820837234
28
+ },
29
+ "jaqket_v2-0.2-0.2": {
30
+ "exact_match": 67.18213058419244,
31
+ "f1": 74.29659878113482
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 8.610239752200977
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.028,
38
+ "acc_stderr": 0.010454721651927288
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.2-0.2": 0.2,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=line-corporation/japanese-large-lm-3.6b,use_fast=False,device_map=auto,torch_dtype=auto",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/rinna/harness.conf ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [DEFAULT]
2
+ # Recent Rinna models use the 0.4 prompt, though note that older ones used
3
+ # other prompts.
4
+ prompt = 0.4
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.5": {
4
+ "acc": 0.49508489722966936,
5
+ "acc_stderr": 0.014952992585674197,
6
+ "acc_norm": 0.4941912421805183,
7
+ "acc_norm_stderr": 0.014952705953248754
8
+ },
9
+ "jnli-1.1-0.5": {
10
+ "acc": 0.47082990961380444,
11
+ "acc_stderr": 0.010119489683056362,
12
+ "acc_norm": 0.45028759244042726,
13
+ "acc_norm_stderr": 0.010086528162038566
14
+ },
15
+ "marc_ja-1.1-0.5": {
16
+ "acc": 0.9527767951892465,
17
+ "acc_stderr": 0.0028211996518060353,
18
+ "acc_norm": 0.9527767951892465,
19
+ "acc_norm_stderr": 0.0028211996518060353
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6465067778936392,
23
+ "acc_stderr": 0.015445228301221378
24
+ },
25
+ "jsquad-1.1-0.5": {
26
+ "exact_match": 55.9882935614588,
27
+ "f1": 70.04345164121641
28
+ },
29
+ "jaqket_v2-0.1-0.5": {
30
+ "exact_match": 61.16838487972509,
31
+ "f1": 65.03049022378916
32
+ },
33
+ "xlsum_ja-1.0-0.5": {
34
+ "rouge2": 5.506882882949979
35
+ },
36
+ "mgsm-1.0-0.5": {
37
+ "acc": 0.028,
38
+ "acc_stderr": 0.010454721651927302
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.5": 1.1,
43
+ "jnli-1.1-0.5": 1.1,
44
+ "marc_ja-1.1-0.5": 1.1,
45
+ "jsquad-1.1-0.5": 1.1,
46
+ "jaqket_v2-0.1-0.5": 0.1,
47
+ "xlsum_ja-1.0-0.5": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm-1.0-0.5": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=rinna/bilingual-gpt-neox-4b-instruction-sft,use_fast=False,device_map=auto,torch_dtype=auto",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-bilingual-gpt-neox-4b/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=rinna/bilingual-gpt-neox-4b,use_fast=False,device_map=auto,torch_dtype=auto"
2
+ TASK="jsquad-1.2-0.2"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-bilingual-gpt-neox-4b/result.jsquad-1.2.json"
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-bilingual-gpt-neox-4b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=rinna/bilingual-gpt-neox-4b,use_fast=False,device_map=auto,torch_dtype=auto"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/rinna/rinna-bilingual-gpt-neox-4b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-bilingual-gpt-neox-4b/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.20822162645218945,
5
+ "acc_stderr": 0.01214349876971715,
6
+ "acc_norm": 0.22788203753351208,
7
+ "acc_norm_stderr": 0.012545153313075156
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.5521774856203779,
11
+ "acc_stderr": 0.010081409479626453,
12
+ "acc_norm": 0.5304026294165982,
13
+ "acc_norm_stderr": 0.01011799843670741
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.5955076052352317,
17
+ "acc_stderr": 0.0065276873249124285,
18
+ "acc_norm": 0.5955076052352317,
19
+ "acc_norm_stderr": 0.0065276873249124285
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6642335766423357,
23
+ "acc_stderr": 0.01525795361580425
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 50.78793336334984,
27
+ "f1": 61.684710792645284
28
+ },
29
+ "jaqket_v2-0.1-0.2": {
30
+ "exact_match": 59.450171821305844,
31
+ "f1": 65.22394415435645
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 5.54788534415756
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.024,
38
+ "acc_stderr": 0.009699087026964261
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.1-0.2": 0.1,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=rinna/bilingual-gpt-neox-4b,use_fast=False,device_map=auto,torch_dtype=auto",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-bilingual-gpt-neox-4b/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.2-0.2": {
4
+ "exact_match": 51.32823052678973,
5
+ "f1": 61.9390389728309
6
+ }
7
+ },
8
+ "versions": {
9
+ "jsquad-1.2-0.2": 1.2
10
+ },
11
+ "config": {
12
+ "model": "hf-causal",
13
+ "model_args": "pretrained=rinna/bilingual-gpt-neox-4b,use_fast=False,device_map=auto,torch_dtype=auto",
14
+ "num_fewshot": 2,
15
+ "batch_size": null,
16
+ "device": "cuda",
17
+ "no_cache": false,
18
+ "limit": null,
19
+ "bootstrap_iters": 100000,
20
+ "description_dict": {}
21
+ }
22
+ }
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-1b/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=rinna/japanese-gpt-1b,use_fast=False"
2
+ TASK="jsquad-1.2-0.2"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-1b/result.jsquad-1.2.json"
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-1b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=rinna/japanese-gpt-1b,use_fast=False"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-1b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-1b/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.34763181411974975,
5
+ "acc_stderr": 0.014242467674129443,
6
+ "acc_norm": 0.257372654155496,
7
+ "acc_norm_stderr": 0.013075122531072186
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.37674609695973704,
11
+ "acc_stderr": 0.009823942907406482,
12
+ "acc_norm": 0.3011503697617091,
13
+ "acc_norm_stderr": 0.009300633175085522
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.8786187652598535,
17
+ "acc_stderr": 0.0043130554527802374,
18
+ "acc_norm": 0.8786187652598535,
19
+ "acc_norm_stderr": 0.0043130554527802374
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6454640250260688,
23
+ "acc_stderr": 0.015455512877686553
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 26.181900045024765,
27
+ "f1": 44.67532835280053
28
+ },
29
+ "jaqket_v2-0.1-0.2": {
30
+ "exact_match": 37.02749140893471,
31
+ "f1": 57.99059569678122
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 5.335027032779865
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.02,
38
+ "acc_stderr": 0.008872139507342681
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.1-0.2": 0.1,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=rinna/japanese-gpt-1b,use_fast=False",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-1b/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.2-0.2": {
4
+ "exact_match": 30.189104007203962,
5
+ "f1": 47.12467642283419
6
+ }
7
+ },
8
+ "versions": {
9
+ "jsquad-1.2-0.2": 1.2
10
+ },
11
+ "config": {
12
+ "model": "hf-causal",
13
+ "model_args": "pretrained=rinna/japanese-gpt-1b,use_fast=False",
14
+ "num_fewshot": 2,
15
+ "batch_size": null,
16
+ "device": "cuda",
17
+ "no_cache": false,
18
+ "limit": null,
19
+ "bootstrap_iters": 100000,
20
+ "description_dict": {}
21
+ }
22
+ }
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=rinna/japanese-gpt-neox-3.6b-instruction-ppo,use_fast=False,device_map=auto,torch_dtype=auto"
2
+ TASK="jsquad-1.2-0.4"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json"
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=rinna/japanese-gpt-neox-3.6b-instruction-ppo,use_fast=False,device_map=auto,torch_dtype=auto"
2
+ TASK="jcommonsenseqa-1.1-0.4,jnli-1.1-0.4,marc_ja-1.1-0.4,jsquad-1.1-0.4,jaqket_v2-0.1-0.4,xlsum_ja-1.0-0.4,xwinograd_ja,mgsm-1.0-0.4"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.json"
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.4": {
4
+ "acc": 0.44057193923145666,
5
+ "acc_stderr": 0.014847715520097282,
6
+ "acc_norm": 0.4226988382484361,
7
+ "acc_norm_stderr": 0.014773923335599326
8
+ },
9
+ "jnli-1.1-0.4": {
10
+ "acc": 0.5419063270336894,
11
+ "acc_stderr": 0.01010108912658305,
12
+ "acc_norm": 0.5312243221035333,
13
+ "acc_norm_stderr": 0.01011696986287914
14
+ },
15
+ "marc_ja-1.1-0.4": {
16
+ "acc": 0.8960585978374608,
17
+ "acc_stderr": 0.004030616889059545,
18
+ "acc_norm": 0.8960585978374608,
19
+ "acc_norm_stderr": 0.004030616889059545
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6913451511991658,
23
+ "acc_stderr": 0.014924550437257583
24
+ },
25
+ "jsquad-1.1-0.4": {
26
+ "exact_match": 51.62089149031968,
27
+ "f1": 63.676339985467465
28
+ },
29
+ "jaqket_v2-0.1-0.4": {
30
+ "exact_match": 50.945017182130584,
31
+ "f1": 55.79263424624247
32
+ },
33
+ "xlsum_ja-1.0-0.4": {
34
+ "rouge2": 6.633741717885442
35
+ },
36
+ "mgsm-1.0-0.4": {
37
+ "acc": 0.044,
38
+ "acc_stderr": 0.012997373846574957
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.4": 1.1,
43
+ "jnli-1.1-0.4": 1.1,
44
+ "marc_ja-1.1-0.4": 1.1,
45
+ "jsquad-1.1-0.4": 1.1,
46
+ "jaqket_v2-0.1-0.4": 0.1,
47
+ "xlsum_ja-1.0-0.4": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm-1.0-0.4": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=rinna/japanese-gpt-neox-3.6b-instruction-ppo,use_fast=False,device_map=auto,torch_dtype=auto",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.2-0.4": {
4
+ "exact_match": 52.633948671769474,
5
+ "f1": 64.387511749343
6
+ }
7
+ },
8
+ "versions": {
9
+ "jsquad-1.2-0.4": 1.2
10
+ },
11
+ "config": {
12
+ "model": "hf-causal",
13
+ "model_args": "pretrained=rinna/japanese-gpt-neox-3.6b-instruction-ppo,use_fast=False,device_map=auto,torch_dtype=auto",
14
+ "num_fewshot": 2,
15
+ "batch_size": null,
16
+ "device": "cuda",
17
+ "no_cache": false,
18
+ "limit": null,
19
+ "bootstrap_iters": 100000,
20
+ "description_dict": {}
21
+ }
22
+ }
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.mgsm.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=rinna/japanese-gpt-neox-3.6b,use_fast=False,device_map=auto,torch_dtype=auto"
2
+ TASK="jsquad-1.2-0.2"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-neox-3.6b/result.jsquad-1.2.json"
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=rinna/japanese-gpt-neox-3.6b,use_fast=False"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-neox-3.6b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.3163538873994638,
5
+ "acc_stderr": 0.013908534121227658,
6
+ "acc_norm": 0.2725647899910634,
7
+ "acc_norm_stderr": 0.01331714516405031
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.3442892358258012,
11
+ "acc_stderr": 0.009632673153167076,
12
+ "acc_norm": 0.3311421528348398,
13
+ "acc_norm_stderr": 0.009541202050062205
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.7481688175793513,
17
+ "acc_stderr": 0.005732757658862212,
18
+ "acc_norm": 0.7481688175793513,
19
+ "acc_norm_stderr": 0.005732757658862212
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.708029197080292,
23
+ "acc_stderr": 0.014689686963716971
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 47.90634849167042,
27
+ "f1": 58.804568288439675
28
+ },
29
+ "jaqket_v2-0.1-0.2": {
30
+ "exact_match": 68.38487972508591,
31
+ "f1": 72.4344388906244
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 5.157849646982534
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.012,
38
+ "acc_stderr": 0.006900323023694271
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.1-0.2": 0.1,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=rinna/japanese-gpt-neox-3.6b,use_fast=False",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.2-0.2": {
4
+ "exact_match": 49.0094552003602,
5
+ "f1": 59.80363888369063
6
+ }
7
+ },
8
+ "versions": {
9
+ "jsquad-1.2-0.2": 1.2
10
+ },
11
+ "config": {
12
+ "model": "hf-causal",
13
+ "model_args": "pretrained=rinna/japanese-gpt-neox-3.6b,use_fast=False,device_map=auto,torch_dtype=auto",
14
+ "num_fewshot": 2,
15
+ "batch_size": null,
16
+ "device": "cuda",
17
+ "no_cache": false,
18
+ "limit": null,
19
+ "bootstrap_iters": 100000,
20
+ "description_dict": {}
21
+ }
22
+ }
scripts/yans/eval/lm-evaluation-harness/models/rinna/rinna-japanese-gpt-neox-3.6b/result.mgsm.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-compact-v1/harness.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -eu
3
+ PROJECT_DIR="/fsx/proj-jp-stablegpt"
4
+ MODEL_ARGS="pretrained=${PROJECT_DIR}/hf_model/1b-compact-v1,tokenizer=${PROJECT_DIR}/tokenizers/compact-hf/,use_fast=False"
5
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
6
+ NUM_FEW_SHOTS="3,3,3,2,1,1,0,5"
7
+ python main.py \
8
+ --model hf-causal \
9
+ --model_args $MODEL_ARGS \
10
+ --tasks $TASK \
11
+ --num_fewshot $NUM_FEW_SHOTS \
12
+ --device "cuda" \
13
+ --output_path "models/stablelm/stablelm-jp-1b-compact-v1/result.json"
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-compact-v1/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.4709562109025916,
5
+ "acc_stderr": 0.014928465632785326,
6
+ "acc_norm": 0.3485254691689008,
7
+ "acc_norm_stderr": 0.014250991444953297
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.4449465899753492,
11
+ "acc_stderr": 0.010075121089036965,
12
+ "acc_norm": 0.4026294165981923,
13
+ "acc_norm_stderr": 0.009942683448992417
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.757063132193931,
17
+ "acc_stderr": 0.005663981049607239,
18
+ "acc_norm": 0.757063132193931,
19
+ "acc_norm_stderr": 0.005663981049607239
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6339937434827946,
23
+ "acc_stderr": 0.015563382319228687
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 29.536244934714095,
27
+ "f1": 39.00936796569676
28
+ },
29
+ "jaqket_v2-0.1-0.2": {
30
+ "exact_match": 33.24742268041237,
31
+ "f1": 38.13348879070528
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 4.3964148234614
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.012,
38
+ "acc_stderr": 0.0069003230236942764
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.1-0.2": 0.1,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=/fsx/proj-jp-stablegpt/hf_model/1b-compact-v1,tokenizer=/fsx/proj-jp-stablegpt/tokenizers/compact-hf/,use_fast=False",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1-sl2k-300b/harness.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -eu
3
+ PROJECT_DIR=""
4
+ MODEL_ARGS="pretrained=${PROJECT_DIR}/hf_model/1b-jav1-sl2k-300b,tokenizer=${PROJECT_DIR}/tokenizers/nai-hf-tokenizer/,use_fast=False"
5
+ TASK="jsquad-1.1-0.2,jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2"
6
+ NUM_FEW_SHOTS="2,3,3,3"
7
+ python main.py \
8
+ --model hf-causal \
9
+ --model_args $MODEL_ARGS \
10
+ --tasks $TASK \
11
+ --num_fewshot $NUM_FEW_SHOTS \
12
+ --device "cuda" \
13
+ --output_path "models/stablelm-jp-1b-jav1-sl2k-300b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1-sl2k-300b/result.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.2555853440571939,
5
+ "acc_stderr": 0.013045313758426092,
6
+ "acc_norm": 0.23056300268096513,
7
+ "acc_norm_stderr": 0.012596805983976347
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.37880032867707475,
11
+ "acc_stderr": 0.009834442099385492,
12
+ "acc_norm": 0.3648315529991783,
13
+ "acc_norm_stderr": 0.009759320919777338
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.4899186416696144,
17
+ "acc_stderr": 0.006648783869548029,
18
+ "acc_norm": 0.4899186416696144,
19
+ "acc_norm_stderr": 0.006648783869548029
20
+ },
21
+ "jsquad-1.1-0.2": {
22
+ "exact_match": 34.5114813147231,
23
+ "f1": 44.58786913290027
24
+ }
25
+ },
26
+ "versions": {
27
+ "jcommonsenseqa-1.1-0.2": 1.1,
28
+ "jnli-1.1-0.2": 1.1,
29
+ "jsquad-1.1-0.2": 1.1,
30
+ "marc_ja-1.1-0.2": 1.1
31
+ },
32
+ "config": {
33
+ "model": "hf-causal",
34
+ "model_args": "pretrained=${PROJECT_DIR}/hf_model/1b-jav1-sl2k-300b,tokenizer=${PROJECT_DIR}/tokenizers/nai-hf-tokenizer/,use_fast=False",
35
+ "num_fewshot": [
36
+ 2,
37
+ 3,
38
+ 3,
39
+ 3
40
+ ],
41
+ "batch_size": null,
42
+ "device": "cuda",
43
+ "no_cache": false,
44
+ "limit": null,
45
+ "bootstrap_iters": 100000,
46
+ "description_dict": {}
47
+ }
48
+ }
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1-sl2k-slw-300b/harness.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -eu
3
+ PROJECT_DIR=""
4
+ MODEL_ARGS="pretrained=${PROJECT_DIR}/hf_model/1b-jav1-sl2k-slw-300b,tokenizer=${PROJECT_DIR}/tokenizers/nai-hf-tokenizer/,use_fast=False"
5
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
6
+ NUM_FEW_SHOTS="3,3,3,2,1,1,0,5"
7
+ python main.py \
8
+ --model hf-causal \
9
+ --model_args $MODEL_ARGS \
10
+ --tasks $TASK \
11
+ --num_fewshot $NUM_FEW_SHOTS \
12
+ --device "cuda" \
13
+ --output_path "models/stablelm/stablelm-jp-1b-jav1-sl2k-slw-300b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1-sl2k-slw-300b/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.28596961572832885,
5
+ "acc_stderr": 0.013514419338665247,
6
+ "acc_norm": 0.2421805183199285,
7
+ "acc_norm_stderr": 0.012812432289317909
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.3751027115858669,
11
+ "acc_stderr": 0.009815408241248628,
12
+ "acc_norm": 0.34880854560394414,
13
+ "acc_norm_stderr": 0.009662218404461801
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.7518311824206487,
17
+ "acc_stderr": 0.00570483124396955,
18
+ "acc_norm": 0.7518311824206487,
19
+ "acc_norm_stderr": 0.00570483124396955
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6527632950990615,
23
+ "acc_stderr": 0.015381826969142634
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 42.390814948221525,
27
+ "f1": 52.94897262881226
28
+ },
29
+ "jaqket_v2-0.1-0.2": {
30
+ "exact_match": 42.439862542955325,
31
+ "f1": 48.18551246386296
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 8.371640364702019
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.008,
38
+ "acc_stderr": 0.00564548367669017
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.1-0.2": 0.1,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=${PROJECT_DIR}/hf_model/1b-jav1-sl2k-slw-300b,tokenizer=${PROJECT_DIR}/tokenizers/nai-hf-tokenizer/,use_fast=False",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1_rp-sl2k-slw-300b/harness.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -eu
3
+
4
+ if [ -z ${JP_LLM_PATH+x} ]; then
5
+ echo "Error: The JP_LLM_PATH environment variable is not set"
6
+ exit 1
7
+ fi
8
+
9
+ MODEL_ARGS="pretrained=$JP_LLM_PATH/hf_model/1b-jav1_rp-sl2k-slw,tokenizer=$JP_LLM_PATH/tokenizers/nai-hf-tokenizer/,use_fast=False"
10
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
11
+ NUM_FEW_SHOTS="3,3,3,2,1,1,0,5"
12
+ python main.py \
13
+ --model hf-causal \
14
+ --model_args $MODEL_ARGS \
15
+ --tasks $TASK \
16
+ --num_fewshot $NUM_FEW_SHOTS \
17
+ --device "cuda" \
18
+ --output_path "models/stablelm/stablelm-jp-1b-jav1_rp-sl2k-slw-300b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-jav1_rp-sl2k-slw-300b/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.34137622877569257,
5
+ "acc_stderr": 0.014181247513525478,
6
+ "acc_norm": 0.2645218945487042,
7
+ "acc_norm_stderr": 0.013191518316844342
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.3373048479868529,
11
+ "acc_stderr": 0.00958511072017679,
12
+ "acc_norm": 0.3360723089564503,
13
+ "acc_norm_stderr": 0.009576475494957559
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.6860830136030694,
17
+ "acc_stderr": 0.006129213801621414,
18
+ "acc_norm": 0.6860830136030694,
19
+ "acc_norm_stderr": 0.006129213801621414
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6016684045881127,
23
+ "acc_stderr": 0.015816785549652837
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 24.53849617289509,
27
+ "f1": 33.53058791900235
28
+ },
29
+ "jaqket_v2-0.1-0.2": {
30
+ "exact_match": 23.969072164948454,
31
+ "f1": 27.900030000545463
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 7.9292934294551545
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.016,
38
+ "acc_stderr": 0.007951661188874313
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.1-0.2": 0.1,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=/fsx/proj-jp-stablegpt/hf_model/1b-jav1_rp-sl2k-slw,tokenizer=/fsx/proj-jp-stablegpt/tokenizers/nai-hf-tokenizer/,use_fast=False",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-rp_then_jav1-294b/harness.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -eu
3
+
4
+ if [ -z ${JP_LLM_PATH+x} ]; then
5
+ echo "Error: The JP_LLM_PATH environment variable is not set"
6
+ exit 1
7
+ fi
8
+
9
+ MODEL_ARGS="pretrained=$JP_LLM_PATH/hf_model/1b-rp_then_jav1-294b,tokenizer=$JP_LLM_PATH/tokenizers/nai-hf-tokenizer/,use_fast=False"
10
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
11
+ NUM_FEW_SHOTS="3,3,3,2,1,1,0,5"
12
+ python main.py \
13
+ --model hf-causal \
14
+ --model_args $MODEL_ARGS \
15
+ --tasks $TASK \
16
+ --num_fewshot $NUM_FEW_SHOTS \
17
+ --device "cuda" \
18
+ --output_path "models/stablelm/stablelm-jp-1b-rp_then_jav1-294b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-1b-rp_then_jav1-294b/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.2680965147453083,
5
+ "acc_stderr": 0.013248038756079302,
6
+ "acc_norm": 0.24039320822162646,
7
+ "acc_norm_stderr": 0.012780110667692907
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.3278553820870994,
11
+ "acc_stderr": 0.009517030628219573,
12
+ "acc_norm": 0.31183237469186526,
13
+ "acc_norm_stderr": 0.009391536814742456
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.7771189396581792,
17
+ "acc_stderr": 0.005496539565709208,
18
+ "acc_norm": 0.7771189396581792,
19
+ "acc_norm_stderr": 0.005496539565709208
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6861313868613139,
23
+ "acc_stderr": 0.01499321721472398
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 54.02971634398919,
27
+ "f1": 64.2854711987419
28
+ },
29
+ "jaqket_v2-0.1-0.2": {
30
+ "exact_match": 59.450171821305844,
31
+ "f1": 65.37892424490362
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 9.662662093427816
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.008,
38
+ "acc_stderr": 0.0056454836766901585
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.1-0.2": 0.1,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=${PROJECT_DIR}/hf_model/1b-rp_then_jav1-294b,tokenizer=${PROJECT_DIR}/tokenizers/nai-hf-tokenizer/,use_fast=False",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-3b-ja50_rp50-700b/harness_template-0.1.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ PROJECT_DIR=""
3
+ MODEL_ARGS="pretrained=${PROJECT_DIR}/hf_model/3b-ja50_rp50-700b,tokenizer=${PROJECT_DIR}/tokenizers/nai-hf-tokenizer/,use_fast=False"
4
+ TASK="jcommonsenseqa-1.1-0.1,jnli,marc_ja,jsquad-1.1-0.1,jaqket_v2-0.1-0.1,xlsum_ja,xwinograd_ja,mgsm"
5
+ NUM_FEW_SHOTS="3,3,3,2,1,1,0,5"
6
+ python main.py \
7
+ --model hf-causal \
8
+ --model_args $MODEL_ARGS \
9
+ --tasks $TASK \
10
+ --num_fewshot $NUM_FEW_SHOTS \
11
+ --device "cuda" \
12
+ --output_path "models/stablelm/stablelm-jp-3b-ja50_rp50-700b/result_template-0.1.json"
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-3b-ja50_rp50-700b/harness_template-0.2.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ PROJECT_DIR=""
3
+ MODEL_ARGS="pretrained=${PROJECT_DIR}/hf_model/3b-ja50_rp50-700b,tokenizer=${PROJECT_DIR}/tokenizers/nai-hf-tokenizer/,use_fast=False"
4
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
5
+ NUM_FEW_SHOTS="3,3,3,2,1,1,0,5"
6
+ python main.py \
7
+ --model hf-causal \
8
+ --model_args $MODEL_ARGS \
9
+ --tasks $TASK \
10
+ --num_fewshot $NUM_FEW_SHOTS \
11
+ --device "cuda" \
12
+ --output_path "models/stablelm/stablelm-jp-3b-ja50_rp50-700b/result_template-0.2.json"
scripts/yans/eval/lm-evaluation-harness/models/stabilityai/experiments/stablelm-jp-3b-ja50_rp50-700b/result_template-0.1.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.1": {
4
+ "acc": 0.4280607685433423,
5
+ "acc_stderr": 0.014798127177394432,
6
+ "acc_norm": 0.40214477211796246,
7
+ "acc_norm_stderr": 0.014664536048234705
8
+ },
9
+ "jnli": {
10
+ "acc": 0.36442070665571075,
11
+ "acc_stderr": 0.009756978284439256,
12
+ "acc_norm": 0.3245686113393591,
13
+ "acc_norm_stderr": 0.00949232990976085
14
+ },
15
+ "marc_ja": {
16
+ "acc": 0.7539239623299616,
17
+ "acc_stderr": 0.005688627090173545,
18
+ "acc_norm": 0.7539239623299616,
19
+ "acc_norm_stderr": 0.005688627090173545
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6819603753910324,
23
+ "acc_stderr": 0.015046567305192259
24
+ },
25
+ "jsquad-1.1-0.1": {
26
+ "exact_match": 57.29401170643854,
27
+ "f1": 66.44109170808048
28
+ },
29
+ "jaqket_v2-0.1-0.1": {
30
+ "exact_match": 52.40549828178694,
31
+ "f1": 58.039235010884475
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 8.644546504860047
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.016,
38
+ "acc_stderr": 0.00795166118887434
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.1": 1.1,
43
+ "jnli": 1.1,
44
+ "marc_ja": 1.1,
45
+ "jsquad-1.1-0.1": 1.1,
46
+ "jaqket_v2-0.1-0.1": 0.1,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=/PROJECT_DIR/hf_model/3b-ja50_rp50-700b,tokenizer=/PROJECT_DIR/tokenizers/nai-hf-tokenizer/,use_fast=False",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": true,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }