Jellyfish042 commited on
Commit
6a5b21f
·
1 Parent(s): ff57e83

2026-01 update

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +38 -12
  2. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-ao3_english-2026-01-16_18-20-47.json +29 -0
  3. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_18-20-58.json +29 -0
  4. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_cs-2026-01-16_18-21-10.json +29 -0
  5. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_math-2026-01-16_18-21-22.json +29 -0
  6. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_other-2026-01-16_18-21-34.json +29 -0
  7. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_physics-2026-01-16_18-21-46.json +29 -0
  8. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-bbc_news-2026-01-16_18-21-56.json +29 -0
  9. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-biorxiv_all-2026-01-16_18-22-08.json +29 -0
  10. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_cpp-2026-01-16_18-22-18.json +29 -0
  11. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_javascript-2026-01-16_18-22-29.json +29 -0
  12. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_markdown-2026-01-16_18-22-39.json +29 -0
  13. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_other-2026-01-16_18-22-50.json +29 -0
  14. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_python-2026-01-16_18-23-00.json +29 -0
  15. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-wikipedia_english-2026-01-16_18-23-10.json +29 -0
  16. data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_18-23-19.json +29 -0
  17. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_18-02-38.json +28 -0
  18. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_18-03-10.json +28 -0
  19. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_18-03-45.json +28 -0
  20. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_18-04-18.json +28 -0
  21. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_other-2026-01-16_18-04-52.json +28 -0
  22. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_physics-2026-01-16_18-05-26.json +28 -0
  23. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-bbc_news-2026-01-16_18-05-58.json +28 -0
  24. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-biorxiv_all-2026-01-16_18-06-32.json +28 -0
  25. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_cpp-2026-01-16_18-07-05.json +28 -0
  26. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_javascript-2026-01-16_18-07-41.json +28 -0
  27. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_markdown-2026-01-16_18-08-13.json +28 -0
  28. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_other-2026-01-16_18-08-45.json +28 -0
  29. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_python-2026-01-16_18-09-17.json +28 -0
  30. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-wikipedia_english-2026-01-16_18-09-48.json +28 -0
  31. data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_18-10-19.json +28 -0
  32. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_19-28-57.json +29 -0
  33. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_19-29-24.json +29 -0
  34. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_19-29-54.json +29 -0
  35. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_19-30-23.json +29 -0
  36. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_other-2026-01-16_19-30-53.json +29 -0
  37. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_physics-2026-01-16_19-31-23.json +29 -0
  38. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-bbc_news-2026-01-16_19-31-49.json +29 -0
  39. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-biorxiv_all-2026-01-16_19-32-19.json +29 -0
  40. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_cpp-2026-01-16_19-32-46.json +29 -0
  41. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_javascript-2026-01-16_19-33-14.json +29 -0
  42. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_markdown-2026-01-16_19-33-41.json +29 -0
  43. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_other-2026-01-16_19-34-08.json +29 -0
  44. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_python-2026-01-16_19-34-35.json +29 -0
  45. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-wikipedia_english-2026-01-16_19-35-00.json +29 -0
  46. data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_19-35-26.json +29 -0
  47. data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_19-12-03.json +29 -0
  48. data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_19-13-12.json +29 -0
  49. data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_19-14-24.json +29 -0
  50. data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_19-15-34.json +29 -0
app.py CHANGED
@@ -158,7 +158,7 @@ def update_table(
158
  )
159
 
160
  styler = styler.hide(axis="index")
161
- widths = [250, 80, 80, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70]
162
 
163
  table_styles = []
164
  table_styles.append(
@@ -565,13 +565,15 @@ if __name__ == "__main__":
565
  color_selector = gr.CheckboxGroup(label="Colored Columns", choices=["Average", "Individual Tests"], value=initial_colors)
566
  with gr.Column():
567
  # Data Source 分组定义
568
- code_cols = ["github cpp", "github javascript", "github python", "github markdown"]
569
- science_cols = ["arxiv math", "arxiv physics", "arxiv cs"]
570
  knowledge_cols = ["wikipedia english", "bbc news", "ao3 english"]
 
571
 
572
  initial_code = [c for c in code_cols if c in initial_columns]
573
  initial_science = [c for c in science_cols if c in initial_columns]
574
  initial_knowledge = [c for c in knowledge_cols if c in initial_columns]
 
575
 
576
  with gr.Column(elem_classes=["data-source-box"]):
577
  gr.Markdown("Data Sources")
@@ -597,31 +599,40 @@ if __name__ == "__main__":
597
  choices=initial_knowledge, value=initial_knowledge, show_label=False, scale=3, elem_classes=["aligned-checkboxes"]
598
  )
599
 
600
- # # 多语言 (Multilingual) - Coming Soon
601
- # with gr.Row():
602
- # gr.Checkbox(label="🌍 Multilingual (Coming Soon)", value=False, interactive=False, scale=0, min_width=250)
 
 
 
 
 
 
 
603
 
604
  table = gr.HTML(initial_data)
605
 
606
  def update_table_wrapper(
607
- period, models_size, metric, code_sel, science_sel, knowledge_sel, color_columns, size_range, midpoint
608
  ):
609
- visible_columns = code_sel + science_sel + knowledge_sel
610
  return update_table(data_manager, period, models_size, metric, visible_columns, color_columns, size_range, midpoint)
611
 
612
- def update_column_choices(period, cur_code, cur_science, cur_knowledge):
613
  if not period:
614
  empty = gr.update(choices=[], value=[])
615
- return empty, empty, empty
616
  columns = data_manager.get_available_columns(period)
617
 
618
  new_code = [c for c in code_cols if c in columns]
619
  new_science = [c for c in science_cols if c in columns]
620
  new_knowledge = [c for c in knowledge_cols if c in columns]
 
621
 
622
  sel_code = [c for c in cur_code if c in new_code] if cur_code else new_code
623
  sel_science = [c for c in cur_science if c in new_science] if cur_science else new_science
624
  sel_knowledge = [c for c in cur_knowledge if c in new_knowledge] if cur_knowledge else new_knowledge
 
625
 
626
  if not sel_code:
627
  sel_code = new_code
@@ -629,11 +640,14 @@ if __name__ == "__main__":
629
  sel_science = new_science
630
  if not sel_knowledge:
631
  sel_knowledge = new_knowledge
 
 
632
 
633
  return (
634
  gr.update(choices=new_code, value=sel_code),
635
  gr.update(choices=new_science, value=sel_science),
636
  gr.update(choices=new_knowledge, value=sel_knowledge),
 
637
  )
638
 
639
  # 总开关功能
@@ -648,6 +662,11 @@ if __name__ == "__main__":
648
  toggle_knowledge.change(
649
  lambda enabled: toggle_group(enabled, knowledge_cols, initial_columns), inputs=[toggle_knowledge], outputs=[colfilter_knowledge]
650
  )
 
 
 
 
 
651
 
652
  shared_inputs = [
653
  period_selector,
@@ -656,6 +675,7 @@ if __name__ == "__main__":
656
  colfilter_code,
657
  colfilter_science,
658
  colfilter_knowledge,
 
659
  color_selector,
660
  size_range_slider,
661
  midpoint_slider,
@@ -663,8 +683,8 @@ if __name__ == "__main__":
663
 
664
  period_selector.change(
665
  update_column_choices,
666
- inputs=[period_selector, colfilter_code, colfilter_science, colfilter_knowledge],
667
- outputs=[colfilter_code, colfilter_science, colfilter_knowledge],
668
  )
669
  period_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
670
  model_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
@@ -672,6 +692,7 @@ if __name__ == "__main__":
672
  colfilter_code.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
673
  colfilter_science.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
674
  colfilter_knowledge.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
 
675
  color_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
676
  size_range_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
677
  midpoint_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
@@ -968,12 +989,17 @@ if __name__ == "__main__":
968
  "github javascript",
969
  "github python",
970
  "github markdown",
 
971
  "arxiv math",
972
  "arxiv physics",
973
  "arxiv cs",
 
 
974
  "wikipedia english",
 
975
  "bbc news",
976
  "ao3 english",
 
977
  ]
978
  initial_datasets = all_datasets[:4]
979
 
 
158
  )
159
 
160
  styler = styler.hide(axis="index")
161
+ widths = [250, 80, 80, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70]
162
 
163
  table_styles = []
164
  table_styles.append(
 
565
  color_selector = gr.CheckboxGroup(label="Colored Columns", choices=["Average", "Individual Tests"], value=initial_colors)
566
  with gr.Column():
567
  # Data Source 分组定义
568
+ code_cols = ["github cpp", "github javascript", "github python", "github markdown", "github other"]
569
+ science_cols = ["arxiv math", "arxiv physics", "arxiv cs", "arxiv other", "biorxiv all"]
570
  knowledge_cols = ["wikipedia english", "bbc news", "ao3 english"]
571
+ multilingual_cols = ["wikipedia nonenglish", "ao3 nonenglish"]
572
 
573
  initial_code = [c for c in code_cols if c in initial_columns]
574
  initial_science = [c for c in science_cols if c in initial_columns]
575
  initial_knowledge = [c for c in knowledge_cols if c in initial_columns]
576
+ initial_multilingual = [c for c in multilingual_cols if c in initial_columns]
577
 
578
  with gr.Column(elem_classes=["data-source-box"]):
579
  gr.Markdown("Data Sources")
 
599
  choices=initial_knowledge, value=initial_knowledge, show_label=False, scale=3, elem_classes=["aligned-checkboxes"]
600
  )
601
 
602
+ # 多语言 (Multilingual)
603
+ with gr.Row():
604
+ toggle_multilingual = gr.Checkbox(label="🌍 Multilingual", value=True, scale=0, min_width=150)
605
+ colfilter_multilingual = gr.CheckboxGroup(
606
+ choices=initial_multilingual,
607
+ value=initial_multilingual,
608
+ show_label=False,
609
+ scale=3,
610
+ elem_classes=["aligned-checkboxes"],
611
+ )
612
 
613
  table = gr.HTML(initial_data)
614
 
615
  def update_table_wrapper(
616
+ period, models_size, metric, code_sel, science_sel, knowledge_sel, multilingual_sel, color_columns, size_range, midpoint
617
  ):
618
+ visible_columns = code_sel + science_sel + knowledge_sel + multilingual_sel
619
  return update_table(data_manager, period, models_size, metric, visible_columns, color_columns, size_range, midpoint)
620
 
621
+ def update_column_choices(period, cur_code, cur_science, cur_knowledge, cur_multilingual):
622
  if not period:
623
  empty = gr.update(choices=[], value=[])
624
+ return empty, empty, empty, empty
625
  columns = data_manager.get_available_columns(period)
626
 
627
  new_code = [c for c in code_cols if c in columns]
628
  new_science = [c for c in science_cols if c in columns]
629
  new_knowledge = [c for c in knowledge_cols if c in columns]
630
+ new_multilingual = [c for c in multilingual_cols if c in columns]
631
 
632
  sel_code = [c for c in cur_code if c in new_code] if cur_code else new_code
633
  sel_science = [c for c in cur_science if c in new_science] if cur_science else new_science
634
  sel_knowledge = [c for c in cur_knowledge if c in new_knowledge] if cur_knowledge else new_knowledge
635
+ sel_multilingual = [c for c in cur_multilingual if c in new_multilingual] if cur_multilingual else new_multilingual
636
 
637
  if not sel_code:
638
  sel_code = new_code
 
640
  sel_science = new_science
641
  if not sel_knowledge:
642
  sel_knowledge = new_knowledge
643
+ if not sel_multilingual:
644
+ sel_multilingual = new_multilingual
645
 
646
  return (
647
  gr.update(choices=new_code, value=sel_code),
648
  gr.update(choices=new_science, value=sel_science),
649
  gr.update(choices=new_knowledge, value=sel_knowledge),
650
+ gr.update(choices=new_multilingual, value=sel_multilingual),
651
  )
652
 
653
  # 总开关功能
 
662
  toggle_knowledge.change(
663
  lambda enabled: toggle_group(enabled, knowledge_cols, initial_columns), inputs=[toggle_knowledge], outputs=[colfilter_knowledge]
664
  )
665
+ toggle_multilingual.change(
666
+ lambda enabled: toggle_group(enabled, multilingual_cols, initial_columns),
667
+ inputs=[toggle_multilingual],
668
+ outputs=[colfilter_multilingual],
669
+ )
670
 
671
  shared_inputs = [
672
  period_selector,
 
675
  colfilter_code,
676
  colfilter_science,
677
  colfilter_knowledge,
678
+ colfilter_multilingual,
679
  color_selector,
680
  size_range_slider,
681
  midpoint_slider,
 
683
 
684
  period_selector.change(
685
  update_column_choices,
686
+ inputs=[period_selector, colfilter_code, colfilter_science, colfilter_knowledge, colfilter_multilingual],
687
+ outputs=[colfilter_code, colfilter_science, colfilter_knowledge, colfilter_multilingual],
688
  )
689
  period_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
690
  model_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
 
692
  colfilter_code.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
693
  colfilter_science.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
694
  colfilter_knowledge.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
695
+ colfilter_multilingual.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
696
  color_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
697
  size_range_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
698
  midpoint_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
 
989
  "github javascript",
990
  "github python",
991
  "github markdown",
992
+ "github other",
993
  "arxiv math",
994
  "arxiv physics",
995
  "arxiv cs",
996
+ "arxiv other",
997
+ "biorxiv all",
998
  "wikipedia english",
999
+ "wikipedia nonenglish",
1000
  "bbc news",
1001
  "ao3 english",
1002
+ "ao3 nonenglish",
1003
  ]
1004
  initial_datasets = all_datasets[:4]
1005
 
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-ao3_english-2026-01-16_18-20-47.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 6654.736,
3
+ "avg tokens": 2256.77,
4
+ "avg character count": 8933.384,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 9114.402,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_english",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 1.0747052433462232,
26
+ "bpb": 1.0533608925330764,
27
+ "compression_rate": 13.167011156663456,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_18-20-58.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5003.824,
3
+ "avg tokens": 1407.03,
4
+ "avg character count": 3053.742,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 5339.418,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_nonenglish",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 2.3639823109749205,
26
+ "bpb": 1.3520185290384037,
27
+ "compression_rate": 16.900231612980047,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_cs-2026-01-16_18-21-10.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5986.608,
3
+ "avg tokens": 2635.674,
4
+ "avg character count": 11948.316,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 11958.88,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_cs",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7228507911362735,
26
+ "bpb": 0.72221225343395,
27
+ "compression_rate": 9.027653167924376,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_math-2026-01-16_18-21-22.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 3917.952,
3
+ "avg tokens": 2318.0,
4
+ "avg character count": 8202.032,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 8205.778,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_math",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.6891475089149856,
26
+ "bpb": 0.688832907841401,
27
+ "compression_rate": 8.610411348017513,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_other-2026-01-16_18-21-34.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5534.56,
3
+ "avg tokens": 2546.324,
4
+ "avg character count": 11405.326,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 11413.15,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_other",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7000836508752509,
26
+ "bpb": 0.6996037260092457,
27
+ "compression_rate": 8.745046575115571,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_physics-2026-01-16_18-21-46.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5696.432,
3
+ "avg tokens": 2712.318,
4
+ "avg character count": 11143.33,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 11149.444,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_physics",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7375007468289282,
26
+ "bpb": 0.7370963249074304,
27
+ "compression_rate": 9.21370406134288,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-bbc_news-2026-01-16_18-21-56.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1958.056,
3
+ "avg tokens": 722.098,
4
+ "avg character count": 3105.01,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 3106.754,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-bbc_news",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.9097805420861382,
26
+ "bpb": 0.9092698298554955,
27
+ "compression_rate": 11.365872873193695,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-biorxiv_all-2026-01-16_18-22-08.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5977.352,
3
+ "avg tokens": 2737.27,
4
+ "avg character count": 11858.768,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 11866.026,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-biorxiv_all",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7271831347107665,
26
+ "bpb": 0.7267383442483377,
27
+ "compression_rate": 9.084229303104221,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_cpp-2026-01-16_18-22-18.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1378.518,
3
+ "avg tokens": 1518.258,
4
+ "avg character count": 4492.018,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 4535.59,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_cpp",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.44273666810243684,
26
+ "bpb": 0.4384834348731195,
27
+ "compression_rate": 5.481042935913994,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_javascript-2026-01-16_18-22-29.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1599.716,
3
+ "avg tokens": 1584.092,
4
+ "avg character count": 5019.138,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 5077.09,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_javascript",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.4598204592164489,
26
+ "bpb": 0.45457187877912913,
27
+ "compression_rate": 5.682148484739114,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_markdown-2026-01-16_18-22-39.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 2504.239,
3
+ "avg tokens": 1425.684,
4
+ "avg character count": 4490.514,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 4834.316,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_markdown",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.8045522598305531,
26
+ "bpb": 0.7473349252512117,
27
+ "compression_rate": 9.341686565640146,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_other-2026-01-16_18-22-50.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1503.309,
3
+ "avg tokens": 1650.22,
4
+ "avg character count": 4986.25,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 5064.612,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_other",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.43495942626698353,
26
+ "bpb": 0.42822953450802287,
27
+ "compression_rate": 5.352869181350286,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_python-2026-01-16_18-23-00.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1655.09,
3
+ "avg tokens": 1614.264,
4
+ "avg character count": 5199.592,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 5309.418,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_python",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.4592264422333357,
26
+ "bpb": 0.4497272837107409,
27
+ "compression_rate": 5.621591046384261,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-wikipedia_english-2026-01-16_18-23-10.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1904.752,
3
+ "avg tokens": 790.536,
4
+ "avg character count": 3116.258,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 3133.736,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_english",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.8818192410651926,
26
+ "bpb": 0.8769010103350553,
27
+ "compression_rate": 10.961262629188191,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_18-23-19.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 2765.368,
3
+ "avg tokens": 1077.904,
4
+ "avg character count": 2679.128,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 3649.448,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_nonenglish",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 1.4891347855843509,
26
+ "bpb": 1.09320168415416,
27
+ "compression_rate": 13.665021051927,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_18-02-38.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 6531.872,
3
+ "avg tokens": 2420.562,
4
+ "avg character count": 8933.384,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 9114.402,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_english",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 1.0548633465349162,
25
+ "bpb": 1.033913068802701,
26
+ "compression_rate": 12.923913360033762,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_18-03-10.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 7302.8,
3
+ "avg tokens": 2571.754,
4
+ "avg character count": 3053.742,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 5339.418,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_nonenglish",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 3.4500993681207914,
25
+ "bpb": 1.9731950831727207,
26
+ "compression_rate": 24.66493853965901,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_18-03-45.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5684.576,
3
+ "avg tokens": 2800.896,
4
+ "avg character count": 11948.316,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 11958.88,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_cs",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.6863820478765726,
25
+ "bpb": 0.6857757252147709,
26
+ "compression_rate": 8.572196565184637,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_18-04-18.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 3585.648,
3
+ "avg tokens": 2590.618,
4
+ "avg character count": 8202.032,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 8205.778,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_math",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.630696952654346,
25
+ "bpb": 0.6304090347037697,
26
+ "compression_rate": 7.880112933797122,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_other-2026-01-16_18-04-52.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5165.728,
3
+ "avg tokens": 2752.038,
4
+ "avg character count": 11405.326,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 11413.15,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_other",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.6534289478600843,
25
+ "bpb": 0.6529810059607788,
26
+ "compression_rate": 8.162262574509734,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_physics-2026-01-16_18-05-26.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5413.056,
3
+ "avg tokens": 2990.946,
4
+ "avg character count": 11143.33,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 11149.444,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_physics",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.7008128671819149,
25
+ "bpb": 0.700428563725173,
26
+ "compression_rate": 8.755357046564663,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-bbc_news-2026-01-16_18-05-58.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1816.568,
3
+ "avg tokens": 767.29,
4
+ "avg character count": 3105.01,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 3106.754,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-bbc_news",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.8440403235537348,
25
+ "bpb": 0.8435665150950421,
26
+ "compression_rate": 10.544581438688025,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-biorxiv_all-2026-01-16_18-06-32.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5778.264,
3
+ "avg tokens": 3031.108,
4
+ "avg character count": 11858.768,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 11866.026,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-biorxiv_all",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.7029628050525337,
25
+ "bpb": 0.7025328292511095,
26
+ "compression_rate": 8.781660365638869,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_cpp-2026-01-16_18-07-05.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1377.0095,
3
+ "avg tokens": 1647.56,
4
+ "avg character count": 4492.018,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 4535.59,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_cpp",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.4422521853000123,
25
+ "bpb": 0.43800360634603014,
26
+ "compression_rate": 5.475045079325377,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_javascript-2026-01-16_18-07-41.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1635.156,
3
+ "avg tokens": 1763.988,
4
+ "avg character count": 5019.138,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 5077.09,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_javascript",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.47000729055065504,
25
+ "bpb": 0.4646424334175352,
26
+ "compression_rate": 5.8080304177191895,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_markdown-2026-01-16_18-08-13.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 2584.2485,
3
+ "avg tokens": 1611.29,
4
+ "avg character count": 4490.514,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 4834.316,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_markdown",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.8302574038016011,
25
+ "bpb": 0.7712119967695,
26
+ "compression_rate": 9.64014995961875,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_other-2026-01-16_18-08-45.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1572.714,
3
+ "avg tokens": 1769.476,
4
+ "avg character count": 4986.25,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 5064.612,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_other",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.4550406996313151,
25
+ "bpb": 0.4480001011995875,
26
+ "compression_rate": 5.600001264994844,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_python-2026-01-16_18-09-17.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1671.52,
3
+ "avg tokens": 1723.414,
4
+ "avg character count": 5199.592,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 5309.418,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_python",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.46378516136395326,
25
+ "bpb": 0.45419170514484264,
26
+ "compression_rate": 5.677396314310533,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-wikipedia_english-2026-01-16_18-09-48.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1829.912,
3
+ "avg tokens": 873.848,
4
+ "avg character count": 3116.258,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 3133.736,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_english",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.8471715010962523,
25
+ "bpb": 0.8424465135745975,
26
+ "compression_rate": 10.53058141968247,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_18-10-19.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 3810.54,
3
+ "avg tokens": 1922.526,
4
+ "avg character count": 2679.128,
5
+ "parameters count": 0.521411104,
6
+ "avg bytes": 3649.448,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_nonenglish",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 2.0519539048186686,
25
+ "bpb": 1.5063777209893197,
26
+ "compression_rate": 18.829721512366497,
27
+ "track_byte_wise_data": false
28
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_19-28-57.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5744.024,
3
+ "avg tokens": 2356.426,
4
+ "avg character count": 8933.384,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 9114.402,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_english",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.9276299932418877,
26
+ "bpb": 0.9092066533325156,
27
+ "compression_rate": 11.365083166656445,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_19-29-24.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 4660.24,
3
+ "avg tokens": 1586.388,
4
+ "avg character count": 3053.742,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 5339.418,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_nonenglish",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 2.201661154528569,
26
+ "bpb": 1.2591831426856603,
27
+ "compression_rate": 15.739789283570754,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_19-29-54.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5158.16,
3
+ "avg tokens": 2821.348,
4
+ "avg character count": 11948.316,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 11958.88,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_cs",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.622820140688597,
26
+ "bpb": 0.6222699660931305,
27
+ "compression_rate": 7.778374576164131,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_19-30-23.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 3187.696,
3
+ "avg tokens": 2614.604,
4
+ "avg character count": 8202.032,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 8205.778,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_math",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.5606992524610469,
26
+ "bpb": 0.560443288992413,
27
+ "compression_rate": 7.0055411124051625,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_other-2026-01-16_19-30-53.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 4676.384,
3
+ "avg tokens": 2773.23,
4
+ "avg character count": 11405.326,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 11413.15,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_other",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.5915303083921052,
26
+ "bpb": 0.5911247995594988,
27
+ "compression_rate": 7.389059994493735,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_physics-2026-01-16_19-31-23.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 4866.016,
3
+ "avg tokens": 3017.018,
4
+ "avg character count": 11143.33,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 11149.444,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_physics",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.6299891641086057,
26
+ "bpb": 0.6296436981150225,
27
+ "compression_rate": 7.8705462264377815,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-bbc_news-2026-01-16_19-31-49.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1665.212,
3
+ "avg tokens": 758.108,
4
+ "avg character count": 3105.01,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 3106.754,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-bbc_news",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7737150909107514,
26
+ "bpb": 0.7732807600565711,
27
+ "compression_rate": 9.666009500707139,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-biorxiv_all-2026-01-16_19-32-19.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5181.144,
3
+ "avg tokens": 3046.746,
4
+ "avg character count": 11858.768,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 11866.026,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-biorxiv_all",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.6303193345996487,
26
+ "bpb": 0.6299337920658195,
27
+ "compression_rate": 7.874172400822744,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_cpp-2026-01-16_19-32-46.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1163.8585,
3
+ "avg tokens": 1648.624,
4
+ "avg character count": 4492.018,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 4535.59,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_cpp",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.37379478137586886,
26
+ "bpb": 0.3702038513724714,
27
+ "compression_rate": 4.627548142155892,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_javascript-2026-01-16_19-33-14.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1356.928,
3
+ "avg tokens": 1760.75,
4
+ "avg character count": 5019.138,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 5077.09,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_javascript",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.3900337660457592,
26
+ "bpb": 0.3855817597173537,
27
+ "compression_rate": 4.819771996466922,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_markdown-2026-01-16_19-33-41.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 2199.5465,
3
+ "avg tokens": 1533.14,
4
+ "avg character count": 4490.514,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 4834.316,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_markdown",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7066618270769618,
26
+ "bpb": 0.6564061653716217,
27
+ "compression_rate": 8.20507706714527,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_other-2026-01-16_19-34-08.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1299.344,
3
+ "avg tokens": 1774.094,
4
+ "avg character count": 4986.25,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 5064.612,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_other",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.37594527855779986,
26
+ "bpb": 0.37012848076196736,
27
+ "compression_rate": 4.626606009524592,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_python-2026-01-16_19-34-35.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1415.762,
3
+ "avg tokens": 1702.552,
4
+ "avg character count": 5199.592,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 5309.418,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-github_python",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.39282174764463074,
26
+ "bpb": 0.3846961788427735,
27
+ "compression_rate": 4.808702235534669,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-wikipedia_english-2026-01-16_19-35-00.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1620.662,
3
+ "avg tokens": 851.824,
4
+ "avg character count": 3116.258,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 3133.736,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_english",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7502976423509188,
26
+ "bpb": 0.7461129560234777,
27
+ "compression_rate": 9.32641195029347,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_19-35-26.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 2320.77,
3
+ "avg tokens": 1091.968,
4
+ "avg character count": 2679.128,
5
+ "parameters count": 1.554859392,
6
+ "avg bytes": 3649.448,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_nonenglish",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 1.2497213160565226,
26
+ "bpb": 0.9174437805508886,
27
+ "compression_rate": 11.468047256886107,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_19-12-03.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5660.136,
3
+ "avg tokens": 2356.426,
4
+ "avg character count": 8933.384,
5
+ "parameters count": 1.554872208,
6
+ "avg bytes": 9114.402,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_english",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.9140825176615149,
26
+ "bpb": 0.8959282395001991,
27
+ "compression_rate": 11.199102993752488,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_19-13-12.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 4594.32,
3
+ "avg tokens": 1586.388,
4
+ "avg character count": 3053.742,
5
+ "parameters count": 1.554872208,
6
+ "avg bytes": 5339.418,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_nonenglish",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 2.1705182298494705,
26
+ "bpb": 1.2413717525499937,
27
+ "compression_rate": 15.51714690687492,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_19-14-24.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5065.248,
3
+ "avg tokens": 2821.348,
4
+ "avg character count": 11948.316,
5
+ "parameters count": 1.554872208,
6
+ "avg bytes": 11958.88,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_cs",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.6116015152656441,
26
+ "bpb": 0.6110612507586614,
27
+ "compression_rate": 7.638265634483267,
28
+ "track_byte_wise_data": false
29
+ }
data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_19-15-34.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 3112.224,
3
+ "avg tokens": 2614.604,
4
+ "avg character count": 8202.032,
5
+ "parameters count": 1.554872208,
6
+ "avg bytes": 8205.778,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_math",
11
+ "chunk_size": 4000,
12
+ "bos_mode": "add_default_eos",
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.5474241176985915,
26
+ "bpb": 0.5471742144298339,
27
+ "compression_rate": 6.839677680372924,
28
+ "track_byte_wise_data": false
29
+ }