Spaces:
Running
Running
Commit ·
6a5b21f
1
Parent(s): ff57e83
2026-01 update
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- app.py +38 -12
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-ao3_english-2026-01-16_18-20-47.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_18-20-58.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_cs-2026-01-16_18-21-10.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_math-2026-01-16_18-21-22.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_other-2026-01-16_18-21-34.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_physics-2026-01-16_18-21-46.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-bbc_news-2026-01-16_18-21-56.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-biorxiv_all-2026-01-16_18-22-08.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_cpp-2026-01-16_18-22-18.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_javascript-2026-01-16_18-22-29.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_markdown-2026-01-16_18-22-39.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_other-2026-01-16_18-22-50.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_python-2026-01-16_18-23-00.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-wikipedia_english-2026-01-16_18-23-10.json +29 -0
- data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_18-23-19.json +29 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_18-02-38.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_18-03-10.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_18-03-45.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_18-04-18.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_other-2026-01-16_18-04-52.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_physics-2026-01-16_18-05-26.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-bbc_news-2026-01-16_18-05-58.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-biorxiv_all-2026-01-16_18-06-32.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_cpp-2026-01-16_18-07-05.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_javascript-2026-01-16_18-07-41.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_markdown-2026-01-16_18-08-13.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_other-2026-01-16_18-08-45.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_python-2026-01-16_18-09-17.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-wikipedia_english-2026-01-16_18-09-48.json +28 -0
- data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_18-10-19.json +28 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_19-28-57.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_19-29-24.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_19-29-54.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_19-30-23.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_other-2026-01-16_19-30-53.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_physics-2026-01-16_19-31-23.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-bbc_news-2026-01-16_19-31-49.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-biorxiv_all-2026-01-16_19-32-19.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_cpp-2026-01-16_19-32-46.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_javascript-2026-01-16_19-33-14.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_markdown-2026-01-16_19-33-41.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_other-2026-01-16_19-34-08.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_python-2026-01-16_19-34-35.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-wikipedia_english-2026-01-16_19-35-00.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_19-35-26.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_19-12-03.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_19-13-12.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_19-14-24.json +29 -0
- data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_19-15-34.json +29 -0
app.py
CHANGED
|
@@ -158,7 +158,7 @@ def update_table(
|
|
| 158 |
)
|
| 159 |
|
| 160 |
styler = styler.hide(axis="index")
|
| 161 |
-
widths = [250, 80, 80, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70]
|
| 162 |
|
| 163 |
table_styles = []
|
| 164 |
table_styles.append(
|
|
@@ -565,13 +565,15 @@ if __name__ == "__main__":
|
|
| 565 |
color_selector = gr.CheckboxGroup(label="Colored Columns", choices=["Average", "Individual Tests"], value=initial_colors)
|
| 566 |
with gr.Column():
|
| 567 |
# Data Source 分组定义
|
| 568 |
-
code_cols = ["github cpp", "github javascript", "github python", "github markdown"]
|
| 569 |
-
science_cols = ["arxiv math", "arxiv physics", "arxiv cs"]
|
| 570 |
knowledge_cols = ["wikipedia english", "bbc news", "ao3 english"]
|
|
|
|
| 571 |
|
| 572 |
initial_code = [c for c in code_cols if c in initial_columns]
|
| 573 |
initial_science = [c for c in science_cols if c in initial_columns]
|
| 574 |
initial_knowledge = [c for c in knowledge_cols if c in initial_columns]
|
|
|
|
| 575 |
|
| 576 |
with gr.Column(elem_classes=["data-source-box"]):
|
| 577 |
gr.Markdown("Data Sources")
|
|
@@ -597,31 +599,40 @@ if __name__ == "__main__":
|
|
| 597 |
choices=initial_knowledge, value=initial_knowledge, show_label=False, scale=3, elem_classes=["aligned-checkboxes"]
|
| 598 |
)
|
| 599 |
|
| 600 |
-
#
|
| 601 |
-
|
| 602 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 603 |
|
| 604 |
table = gr.HTML(initial_data)
|
| 605 |
|
| 606 |
def update_table_wrapper(
|
| 607 |
-
period, models_size, metric, code_sel, science_sel, knowledge_sel, color_columns, size_range, midpoint
|
| 608 |
):
|
| 609 |
-
visible_columns = code_sel + science_sel + knowledge_sel
|
| 610 |
return update_table(data_manager, period, models_size, metric, visible_columns, color_columns, size_range, midpoint)
|
| 611 |
|
| 612 |
-
def update_column_choices(period, cur_code, cur_science, cur_knowledge):
|
| 613 |
if not period:
|
| 614 |
empty = gr.update(choices=[], value=[])
|
| 615 |
-
return empty, empty, empty
|
| 616 |
columns = data_manager.get_available_columns(period)
|
| 617 |
|
| 618 |
new_code = [c for c in code_cols if c in columns]
|
| 619 |
new_science = [c for c in science_cols if c in columns]
|
| 620 |
new_knowledge = [c for c in knowledge_cols if c in columns]
|
|
|
|
| 621 |
|
| 622 |
sel_code = [c for c in cur_code if c in new_code] if cur_code else new_code
|
| 623 |
sel_science = [c for c in cur_science if c in new_science] if cur_science else new_science
|
| 624 |
sel_knowledge = [c for c in cur_knowledge if c in new_knowledge] if cur_knowledge else new_knowledge
|
|
|
|
| 625 |
|
| 626 |
if not sel_code:
|
| 627 |
sel_code = new_code
|
|
@@ -629,11 +640,14 @@ if __name__ == "__main__":
|
|
| 629 |
sel_science = new_science
|
| 630 |
if not sel_knowledge:
|
| 631 |
sel_knowledge = new_knowledge
|
|
|
|
|
|
|
| 632 |
|
| 633 |
return (
|
| 634 |
gr.update(choices=new_code, value=sel_code),
|
| 635 |
gr.update(choices=new_science, value=sel_science),
|
| 636 |
gr.update(choices=new_knowledge, value=sel_knowledge),
|
|
|
|
| 637 |
)
|
| 638 |
|
| 639 |
# 总开关功能
|
|
@@ -648,6 +662,11 @@ if __name__ == "__main__":
|
|
| 648 |
toggle_knowledge.change(
|
| 649 |
lambda enabled: toggle_group(enabled, knowledge_cols, initial_columns), inputs=[toggle_knowledge], outputs=[colfilter_knowledge]
|
| 650 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
|
| 652 |
shared_inputs = [
|
| 653 |
period_selector,
|
|
@@ -656,6 +675,7 @@ if __name__ == "__main__":
|
|
| 656 |
colfilter_code,
|
| 657 |
colfilter_science,
|
| 658 |
colfilter_knowledge,
|
|
|
|
| 659 |
color_selector,
|
| 660 |
size_range_slider,
|
| 661 |
midpoint_slider,
|
|
@@ -663,8 +683,8 @@ if __name__ == "__main__":
|
|
| 663 |
|
| 664 |
period_selector.change(
|
| 665 |
update_column_choices,
|
| 666 |
-
inputs=[period_selector, colfilter_code, colfilter_science, colfilter_knowledge],
|
| 667 |
-
outputs=[colfilter_code, colfilter_science, colfilter_knowledge],
|
| 668 |
)
|
| 669 |
period_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 670 |
model_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
|
@@ -672,6 +692,7 @@ if __name__ == "__main__":
|
|
| 672 |
colfilter_code.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 673 |
colfilter_science.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 674 |
colfilter_knowledge.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
|
|
|
| 675 |
color_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 676 |
size_range_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 677 |
midpoint_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
|
@@ -968,12 +989,17 @@ if __name__ == "__main__":
|
|
| 968 |
"github javascript",
|
| 969 |
"github python",
|
| 970 |
"github markdown",
|
|
|
|
| 971 |
"arxiv math",
|
| 972 |
"arxiv physics",
|
| 973 |
"arxiv cs",
|
|
|
|
|
|
|
| 974 |
"wikipedia english",
|
|
|
|
| 975 |
"bbc news",
|
| 976 |
"ao3 english",
|
|
|
|
| 977 |
]
|
| 978 |
initial_datasets = all_datasets[:4]
|
| 979 |
|
|
|
|
| 158 |
)
|
| 159 |
|
| 160 |
styler = styler.hide(axis="index")
|
| 161 |
+
widths = [250, 80, 80, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70]
|
| 162 |
|
| 163 |
table_styles = []
|
| 164 |
table_styles.append(
|
|
|
|
| 565 |
color_selector = gr.CheckboxGroup(label="Colored Columns", choices=["Average", "Individual Tests"], value=initial_colors)
|
| 566 |
with gr.Column():
|
| 567 |
# Data Source 分组定义
|
| 568 |
+
code_cols = ["github cpp", "github javascript", "github python", "github markdown", "github other"]
|
| 569 |
+
science_cols = ["arxiv math", "arxiv physics", "arxiv cs", "arxiv other", "biorxiv all"]
|
| 570 |
knowledge_cols = ["wikipedia english", "bbc news", "ao3 english"]
|
| 571 |
+
multilingual_cols = ["wikipedia nonenglish", "ao3 nonenglish"]
|
| 572 |
|
| 573 |
initial_code = [c for c in code_cols if c in initial_columns]
|
| 574 |
initial_science = [c for c in science_cols if c in initial_columns]
|
| 575 |
initial_knowledge = [c for c in knowledge_cols if c in initial_columns]
|
| 576 |
+
initial_multilingual = [c for c in multilingual_cols if c in initial_columns]
|
| 577 |
|
| 578 |
with gr.Column(elem_classes=["data-source-box"]):
|
| 579 |
gr.Markdown("Data Sources")
|
|
|
|
| 599 |
choices=initial_knowledge, value=initial_knowledge, show_label=False, scale=3, elem_classes=["aligned-checkboxes"]
|
| 600 |
)
|
| 601 |
|
| 602 |
+
# 多语言 (Multilingual)
|
| 603 |
+
with gr.Row():
|
| 604 |
+
toggle_multilingual = gr.Checkbox(label="🌍 Multilingual", value=True, scale=0, min_width=150)
|
| 605 |
+
colfilter_multilingual = gr.CheckboxGroup(
|
| 606 |
+
choices=initial_multilingual,
|
| 607 |
+
value=initial_multilingual,
|
| 608 |
+
show_label=False,
|
| 609 |
+
scale=3,
|
| 610 |
+
elem_classes=["aligned-checkboxes"],
|
| 611 |
+
)
|
| 612 |
|
| 613 |
table = gr.HTML(initial_data)
|
| 614 |
|
| 615 |
def update_table_wrapper(
|
| 616 |
+
period, models_size, metric, code_sel, science_sel, knowledge_sel, multilingual_sel, color_columns, size_range, midpoint
|
| 617 |
):
|
| 618 |
+
visible_columns = code_sel + science_sel + knowledge_sel + multilingual_sel
|
| 619 |
return update_table(data_manager, period, models_size, metric, visible_columns, color_columns, size_range, midpoint)
|
| 620 |
|
| 621 |
+
def update_column_choices(period, cur_code, cur_science, cur_knowledge, cur_multilingual):
|
| 622 |
if not period:
|
| 623 |
empty = gr.update(choices=[], value=[])
|
| 624 |
+
return empty, empty, empty, empty
|
| 625 |
columns = data_manager.get_available_columns(period)
|
| 626 |
|
| 627 |
new_code = [c for c in code_cols if c in columns]
|
| 628 |
new_science = [c for c in science_cols if c in columns]
|
| 629 |
new_knowledge = [c for c in knowledge_cols if c in columns]
|
| 630 |
+
new_multilingual = [c for c in multilingual_cols if c in columns]
|
| 631 |
|
| 632 |
sel_code = [c for c in cur_code if c in new_code] if cur_code else new_code
|
| 633 |
sel_science = [c for c in cur_science if c in new_science] if cur_science else new_science
|
| 634 |
sel_knowledge = [c for c in cur_knowledge if c in new_knowledge] if cur_knowledge else new_knowledge
|
| 635 |
+
sel_multilingual = [c for c in cur_multilingual if c in new_multilingual] if cur_multilingual else new_multilingual
|
| 636 |
|
| 637 |
if not sel_code:
|
| 638 |
sel_code = new_code
|
|
|
|
| 640 |
sel_science = new_science
|
| 641 |
if not sel_knowledge:
|
| 642 |
sel_knowledge = new_knowledge
|
| 643 |
+
if not sel_multilingual:
|
| 644 |
+
sel_multilingual = new_multilingual
|
| 645 |
|
| 646 |
return (
|
| 647 |
gr.update(choices=new_code, value=sel_code),
|
| 648 |
gr.update(choices=new_science, value=sel_science),
|
| 649 |
gr.update(choices=new_knowledge, value=sel_knowledge),
|
| 650 |
+
gr.update(choices=new_multilingual, value=sel_multilingual),
|
| 651 |
)
|
| 652 |
|
| 653 |
# 总开关功能
|
|
|
|
| 662 |
toggle_knowledge.change(
|
| 663 |
lambda enabled: toggle_group(enabled, knowledge_cols, initial_columns), inputs=[toggle_knowledge], outputs=[colfilter_knowledge]
|
| 664 |
)
|
| 665 |
+
toggle_multilingual.change(
|
| 666 |
+
lambda enabled: toggle_group(enabled, multilingual_cols, initial_columns),
|
| 667 |
+
inputs=[toggle_multilingual],
|
| 668 |
+
outputs=[colfilter_multilingual],
|
| 669 |
+
)
|
| 670 |
|
| 671 |
shared_inputs = [
|
| 672 |
period_selector,
|
|
|
|
| 675 |
colfilter_code,
|
| 676 |
colfilter_science,
|
| 677 |
colfilter_knowledge,
|
| 678 |
+
colfilter_multilingual,
|
| 679 |
color_selector,
|
| 680 |
size_range_slider,
|
| 681 |
midpoint_slider,
|
|
|
|
| 683 |
|
| 684 |
period_selector.change(
|
| 685 |
update_column_choices,
|
| 686 |
+
inputs=[period_selector, colfilter_code, colfilter_science, colfilter_knowledge, colfilter_multilingual],
|
| 687 |
+
outputs=[colfilter_code, colfilter_science, colfilter_knowledge, colfilter_multilingual],
|
| 688 |
)
|
| 689 |
period_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 690 |
model_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
|
|
|
| 692 |
colfilter_code.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 693 |
colfilter_science.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 694 |
colfilter_knowledge.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 695 |
+
colfilter_multilingual.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 696 |
color_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 697 |
size_range_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
| 698 |
midpoint_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
|
|
|
|
| 989 |
"github javascript",
|
| 990 |
"github python",
|
| 991 |
"github markdown",
|
| 992 |
+
"github other",
|
| 993 |
"arxiv math",
|
| 994 |
"arxiv physics",
|
| 995 |
"arxiv cs",
|
| 996 |
+
"arxiv other",
|
| 997 |
+
"biorxiv all",
|
| 998 |
"wikipedia english",
|
| 999 |
+
"wikipedia nonenglish",
|
| 1000 |
"bbc news",
|
| 1001 |
"ao3 english",
|
| 1002 |
+
"ao3 nonenglish",
|
| 1003 |
]
|
| 1004 |
initial_datasets = all_datasets[:4]
|
| 1005 |
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-ao3_english-2026-01-16_18-20-47.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 6654.736,
|
| 3 |
+
"avg tokens": 2256.77,
|
| 4 |
+
"avg character count": 8933.384,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 9114.402,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_english",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 1.0747052433462232,
|
| 26 |
+
"bpb": 1.0533608925330764,
|
| 27 |
+
"compression_rate": 13.167011156663456,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_18-20-58.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5003.824,
|
| 3 |
+
"avg tokens": 1407.03,
|
| 4 |
+
"avg character count": 3053.742,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 5339.418,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_nonenglish",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 2.3639823109749205,
|
| 26 |
+
"bpb": 1.3520185290384037,
|
| 27 |
+
"compression_rate": 16.900231612980047,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_cs-2026-01-16_18-21-10.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5986.608,
|
| 3 |
+
"avg tokens": 2635.674,
|
| 4 |
+
"avg character count": 11948.316,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 11958.88,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_cs",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.7228507911362735,
|
| 26 |
+
"bpb": 0.72221225343395,
|
| 27 |
+
"compression_rate": 9.027653167924376,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_math-2026-01-16_18-21-22.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 3917.952,
|
| 3 |
+
"avg tokens": 2318.0,
|
| 4 |
+
"avg character count": 8202.032,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 8205.778,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_math",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.6891475089149856,
|
| 26 |
+
"bpb": 0.688832907841401,
|
| 27 |
+
"compression_rate": 8.610411348017513,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_other-2026-01-16_18-21-34.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5534.56,
|
| 3 |
+
"avg tokens": 2546.324,
|
| 4 |
+
"avg character count": 11405.326,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 11413.15,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_other",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.7000836508752509,
|
| 26 |
+
"bpb": 0.6996037260092457,
|
| 27 |
+
"compression_rate": 8.745046575115571,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-arxiv_physics-2026-01-16_18-21-46.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5696.432,
|
| 3 |
+
"avg tokens": 2712.318,
|
| 4 |
+
"avg character count": 11143.33,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 11149.444,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_physics",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.7375007468289282,
|
| 26 |
+
"bpb": 0.7370963249074304,
|
| 27 |
+
"compression_rate": 9.21370406134288,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-bbc_news-2026-01-16_18-21-56.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1958.056,
|
| 3 |
+
"avg tokens": 722.098,
|
| 4 |
+
"avg character count": 3105.01,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 3106.754,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-bbc_news",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.9097805420861382,
|
| 26 |
+
"bpb": 0.9092698298554955,
|
| 27 |
+
"compression_rate": 11.365872873193695,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-biorxiv_all-2026-01-16_18-22-08.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5977.352,
|
| 3 |
+
"avg tokens": 2737.27,
|
| 4 |
+
"avg character count": 11858.768,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 11866.026,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-biorxiv_all",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.7271831347107665,
|
| 26 |
+
"bpb": 0.7267383442483377,
|
| 27 |
+
"compression_rate": 9.084229303104221,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_cpp-2026-01-16_18-22-18.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1378.518,
|
| 3 |
+
"avg tokens": 1518.258,
|
| 4 |
+
"avg character count": 4492.018,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 4535.59,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_cpp",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.44273666810243684,
|
| 26 |
+
"bpb": 0.4384834348731195,
|
| 27 |
+
"compression_rate": 5.481042935913994,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_javascript-2026-01-16_18-22-29.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1599.716,
|
| 3 |
+
"avg tokens": 1584.092,
|
| 4 |
+
"avg character count": 5019.138,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 5077.09,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_javascript",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.4598204592164489,
|
| 26 |
+
"bpb": 0.45457187877912913,
|
| 27 |
+
"compression_rate": 5.682148484739114,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_markdown-2026-01-16_18-22-39.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 2504.239,
|
| 3 |
+
"avg tokens": 1425.684,
|
| 4 |
+
"avg character count": 4490.514,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 4834.316,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_markdown",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.8045522598305531,
|
| 26 |
+
"bpb": 0.7473349252512117,
|
| 27 |
+
"compression_rate": 9.341686565640146,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_other-2026-01-16_18-22-50.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1503.309,
|
| 3 |
+
"avg tokens": 1650.22,
|
| 4 |
+
"avg character count": 4986.25,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 5064.612,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_other",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.43495942626698353,
|
| 26 |
+
"bpb": 0.42822953450802287,
|
| 27 |
+
"compression_rate": 5.352869181350286,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-github_python-2026-01-16_18-23-00.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1655.09,
|
| 3 |
+
"avg tokens": 1614.264,
|
| 4 |
+
"avg character count": 5199.592,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 5309.418,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_python",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.4592264422333357,
|
| 26 |
+
"bpb": 0.4497272837107409,
|
| 27 |
+
"compression_rate": 5.621591046384261,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-wikipedia_english-2026-01-16_18-23-10.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1904.752,
|
| 3 |
+
"avg tokens": 790.536,
|
| 4 |
+
"avg character count": 3116.258,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 3133.736,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_english",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.8818192410651926,
|
| 26 |
+
"bpb": 0.8769010103350553,
|
| 27 |
+
"compression_rate": 10.961262629188191,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_18-23-19.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 2765.368,
|
| 3 |
+
"avg tokens": 1077.904,
|
| 4 |
+
"avg character count": 2679.128,
|
| 5 |
+
"parameters count": 0.360748032,
|
| 6 |
+
"avg bytes": 3649.448,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 9 |
+
"tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_nonenglish",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 1.4891347855843509,
|
| 26 |
+
"bpb": 1.09320168415416,
|
| 27 |
+
"compression_rate": 13.665021051927,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_18-02-38.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 6531.872,
|
| 3 |
+
"avg tokens": 2420.562,
|
| 4 |
+
"avg character count": 8933.384,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 9114.402,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_english",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 1.0548633465349162,
|
| 25 |
+
"bpb": 1.033913068802701,
|
| 26 |
+
"compression_rate": 12.923913360033762,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_18-03-10.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 7302.8,
|
| 3 |
+
"avg tokens": 2571.754,
|
| 4 |
+
"avg character count": 3053.742,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 5339.418,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_nonenglish",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 3.4500993681207914,
|
| 25 |
+
"bpb": 1.9731950831727207,
|
| 26 |
+
"compression_rate": 24.66493853965901,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_18-03-45.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5684.576,
|
| 3 |
+
"avg tokens": 2800.896,
|
| 4 |
+
"avg character count": 11948.316,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 11958.88,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_cs",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.6863820478765726,
|
| 25 |
+
"bpb": 0.6857757252147709,
|
| 26 |
+
"compression_rate": 8.572196565184637,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_18-04-18.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 3585.648,
|
| 3 |
+
"avg tokens": 2590.618,
|
| 4 |
+
"avg character count": 8202.032,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 8205.778,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_math",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.630696952654346,
|
| 25 |
+
"bpb": 0.6304090347037697,
|
| 26 |
+
"compression_rate": 7.880112933797122,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_other-2026-01-16_18-04-52.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5165.728,
|
| 3 |
+
"avg tokens": 2752.038,
|
| 4 |
+
"avg character count": 11405.326,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 11413.15,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_other",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.6534289478600843,
|
| 25 |
+
"bpb": 0.6529810059607788,
|
| 26 |
+
"compression_rate": 8.162262574509734,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-arxiv_physics-2026-01-16_18-05-26.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5413.056,
|
| 3 |
+
"avg tokens": 2990.946,
|
| 4 |
+
"avg character count": 11143.33,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 11149.444,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_physics",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.7008128671819149,
|
| 25 |
+
"bpb": 0.700428563725173,
|
| 26 |
+
"compression_rate": 8.755357046564663,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-bbc_news-2026-01-16_18-05-58.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1816.568,
|
| 3 |
+
"avg tokens": 767.29,
|
| 4 |
+
"avg character count": 3105.01,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 3106.754,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-bbc_news",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.8440403235537348,
|
| 25 |
+
"bpb": 0.8435665150950421,
|
| 26 |
+
"compression_rate": 10.544581438688025,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-biorxiv_all-2026-01-16_18-06-32.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5778.264,
|
| 3 |
+
"avg tokens": 3031.108,
|
| 4 |
+
"avg character count": 11858.768,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 11866.026,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-biorxiv_all",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.7029628050525337,
|
| 25 |
+
"bpb": 0.7025328292511095,
|
| 26 |
+
"compression_rate": 8.781660365638869,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_cpp-2026-01-16_18-07-05.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1377.0095,
|
| 3 |
+
"avg tokens": 1647.56,
|
| 4 |
+
"avg character count": 4492.018,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 4535.59,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_cpp",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.4422521853000123,
|
| 25 |
+
"bpb": 0.43800360634603014,
|
| 26 |
+
"compression_rate": 5.475045079325377,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_javascript-2026-01-16_18-07-41.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1635.156,
|
| 3 |
+
"avg tokens": 1763.988,
|
| 4 |
+
"avg character count": 5019.138,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 5077.09,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_javascript",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.47000729055065504,
|
| 25 |
+
"bpb": 0.4646424334175352,
|
| 26 |
+
"compression_rate": 5.8080304177191895,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_markdown-2026-01-16_18-08-13.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 2584.2485,
|
| 3 |
+
"avg tokens": 1611.29,
|
| 4 |
+
"avg character count": 4490.514,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 4834.316,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_markdown",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.8302574038016011,
|
| 25 |
+
"bpb": 0.7712119967695,
|
| 26 |
+
"compression_rate": 9.64014995961875,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_other-2026-01-16_18-08-45.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1572.714,
|
| 3 |
+
"avg tokens": 1769.476,
|
| 4 |
+
"avg character count": 4986.25,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 5064.612,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_other",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.4550406996313151,
|
| 25 |
+
"bpb": 0.4480001011995875,
|
| 26 |
+
"compression_rate": 5.600001264994844,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-github_python-2026-01-16_18-09-17.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1671.52,
|
| 3 |
+
"avg tokens": 1723.414,
|
| 4 |
+
"avg character count": 5199.592,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 5309.418,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_python",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.46378516136395326,
|
| 25 |
+
"bpb": 0.45419170514484264,
|
| 26 |
+
"compression_rate": 5.677396314310533,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-wikipedia_english-2026-01-16_18-09-48.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1829.912,
|
| 3 |
+
"avg tokens": 873.848,
|
| 4 |
+
"avg character count": 3116.258,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 3133.736,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_english",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 0.8471715010962523,
|
| 25 |
+
"bpb": 0.8424465135745975,
|
| 26 |
+
"compression_rate": 10.53058141968247,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-0.5B-Base-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_18-10-19.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 3810.54,
|
| 3 |
+
"avg tokens": 1922.526,
|
| 4 |
+
"avg character count": 2679.128,
|
| 5 |
+
"parameters count": 0.521411104,
|
| 6 |
+
"avg bytes": 3649.448,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-0.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-0.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_nonenglish",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"torch_dtype": "torch.bfloat16"
|
| 17 |
+
},
|
| 18 |
+
"tokenizer_args": {
|
| 19 |
+
"trust_remote_code": true
|
| 20 |
+
},
|
| 21 |
+
"requirements": [],
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"enable_chunking": true,
|
| 24 |
+
"bpc": 2.0519539048186686,
|
| 25 |
+
"bpb": 1.5063777209893197,
|
| 26 |
+
"compression_rate": 18.829721512366497,
|
| 27 |
+
"track_byte_wise_data": false
|
| 28 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_19-28-57.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5744.024,
|
| 3 |
+
"avg tokens": 2356.426,
|
| 4 |
+
"avg character count": 8933.384,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 9114.402,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_english",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.9276299932418877,
|
| 26 |
+
"bpb": 0.9092066533325156,
|
| 27 |
+
"compression_rate": 11.365083166656445,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_19-29-24.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 4660.24,
|
| 3 |
+
"avg tokens": 1586.388,
|
| 4 |
+
"avg character count": 3053.742,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 5339.418,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_nonenglish",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 2.201661154528569,
|
| 26 |
+
"bpb": 1.2591831426856603,
|
| 27 |
+
"compression_rate": 15.739789283570754,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_19-29-54.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5158.16,
|
| 3 |
+
"avg tokens": 2821.348,
|
| 4 |
+
"avg character count": 11948.316,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 11958.88,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_cs",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.622820140688597,
|
| 26 |
+
"bpb": 0.6222699660931305,
|
| 27 |
+
"compression_rate": 7.778374576164131,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_19-30-23.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 3187.696,
|
| 3 |
+
"avg tokens": 2614.604,
|
| 4 |
+
"avg character count": 8202.032,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 8205.778,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_math",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.5606992524610469,
|
| 26 |
+
"bpb": 0.560443288992413,
|
| 27 |
+
"compression_rate": 7.0055411124051625,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_other-2026-01-16_19-30-53.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 4676.384,
|
| 3 |
+
"avg tokens": 2773.23,
|
| 4 |
+
"avg character count": 11405.326,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 11413.15,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_other",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.5915303083921052,
|
| 26 |
+
"bpb": 0.5911247995594988,
|
| 27 |
+
"compression_rate": 7.389059994493735,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-arxiv_physics-2026-01-16_19-31-23.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 4866.016,
|
| 3 |
+
"avg tokens": 3017.018,
|
| 4 |
+
"avg character count": 11143.33,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 11149.444,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_physics",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.6299891641086057,
|
| 26 |
+
"bpb": 0.6296436981150225,
|
| 27 |
+
"compression_rate": 7.8705462264377815,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-bbc_news-2026-01-16_19-31-49.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1665.212,
|
| 3 |
+
"avg tokens": 758.108,
|
| 4 |
+
"avg character count": 3105.01,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 3106.754,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-bbc_news",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.7737150909107514,
|
| 26 |
+
"bpb": 0.7732807600565711,
|
| 27 |
+
"compression_rate": 9.666009500707139,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-biorxiv_all-2026-01-16_19-32-19.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5181.144,
|
| 3 |
+
"avg tokens": 3046.746,
|
| 4 |
+
"avg character count": 11858.768,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 11866.026,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-biorxiv_all",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.6303193345996487,
|
| 26 |
+
"bpb": 0.6299337920658195,
|
| 27 |
+
"compression_rate": 7.874172400822744,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_cpp-2026-01-16_19-32-46.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1163.8585,
|
| 3 |
+
"avg tokens": 1648.624,
|
| 4 |
+
"avg character count": 4492.018,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 4535.59,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_cpp",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.37379478137586886,
|
| 26 |
+
"bpb": 0.3702038513724714,
|
| 27 |
+
"compression_rate": 4.627548142155892,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_javascript-2026-01-16_19-33-14.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1356.928,
|
| 3 |
+
"avg tokens": 1760.75,
|
| 4 |
+
"avg character count": 5019.138,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 5077.09,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_javascript",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.3900337660457592,
|
| 26 |
+
"bpb": 0.3855817597173537,
|
| 27 |
+
"compression_rate": 4.819771996466922,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_markdown-2026-01-16_19-33-41.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 2199.5465,
|
| 3 |
+
"avg tokens": 1533.14,
|
| 4 |
+
"avg character count": 4490.514,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 4834.316,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_markdown",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.7066618270769618,
|
| 26 |
+
"bpb": 0.6564061653716217,
|
| 27 |
+
"compression_rate": 8.20507706714527,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_other-2026-01-16_19-34-08.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1299.344,
|
| 3 |
+
"avg tokens": 1774.094,
|
| 4 |
+
"avg character count": 4986.25,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 5064.612,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_other",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.37594527855779986,
|
| 26 |
+
"bpb": 0.37012848076196736,
|
| 27 |
+
"compression_rate": 4.626606009524592,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-github_python-2026-01-16_19-34-35.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1415.762,
|
| 3 |
+
"avg tokens": 1702.552,
|
| 4 |
+
"avg character count": 5199.592,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 5309.418,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-github_python",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.39282174764463074,
|
| 26 |
+
"bpb": 0.3846961788427735,
|
| 27 |
+
"compression_rate": 4.808702235534669,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-wikipedia_english-2026-01-16_19-35-00.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 1620.662,
|
| 3 |
+
"avg tokens": 851.824,
|
| 4 |
+
"avg character count": 3116.258,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 3133.736,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_english",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.7502976423509188,
|
| 26 |
+
"bpb": 0.7461129560234777,
|
| 27 |
+
"compression_rate": 9.32641195029347,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Base-UncheatableEval-2026-01-wikipedia_nonenglish-2026-01-16_19-35-26.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 2320.77,
|
| 3 |
+
"avg tokens": 1091.968,
|
| 4 |
+
"avg character count": 2679.128,
|
| 5 |
+
"parameters count": 1.554859392,
|
| 6 |
+
"avg bytes": 3649.448,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-wikipedia_nonenglish",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 1.2497213160565226,
|
| 26 |
+
"bpb": 0.9174437805508886,
|
| 27 |
+
"compression_rate": 11.468047256886107,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-ao3_english-2026-01-16_19-12-03.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5660.136,
|
| 3 |
+
"avg tokens": 2356.426,
|
| 4 |
+
"avg character count": 8933.384,
|
| 5 |
+
"parameters count": 1.554872208,
|
| 6 |
+
"avg bytes": 9114.402,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_english",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.9140825176615149,
|
| 26 |
+
"bpb": 0.8959282395001991,
|
| 27 |
+
"compression_rate": 11.199102993752488,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-ao3_nonenglish-2026-01-16_19-13-12.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 4594.32,
|
| 3 |
+
"avg tokens": 1586.388,
|
| 4 |
+
"avg character count": 3053.742,
|
| 5 |
+
"parameters count": 1.554872208,
|
| 6 |
+
"avg bytes": 5339.418,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-ao3_nonenglish",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 2.1705182298494705,
|
| 26 |
+
"bpb": 1.2413717525499937,
|
| 27 |
+
"compression_rate": 15.51714690687492,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-arxiv_cs-2026-01-16_19-14-24.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 5065.248,
|
| 3 |
+
"avg tokens": 2821.348,
|
| 4 |
+
"avg character count": 11948.316,
|
| 5 |
+
"parameters count": 1.554872208,
|
| 6 |
+
"avg bytes": 11958.88,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_cs",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.6116015152656441,
|
| 26 |
+
"bpb": 0.6110612507586614,
|
| 27 |
+
"compression_rate": 7.638265634483267,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|
data/2026-01/Falcon-H1-1.5B-Deep-Base-UncheatableEval-2026-01-arxiv_math-2026-01-16_19-15-34.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"neg_log_prob_sum": 3112.224,
|
| 3 |
+
"avg tokens": 2614.604,
|
| 4 |
+
"avg character count": 8202.032,
|
| 5 |
+
"parameters count": 1.554872208,
|
| 6 |
+
"avg bytes": 8205.778,
|
| 7 |
+
"sample_count": 500,
|
| 8 |
+
"model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
|
| 9 |
+
"tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
|
| 10 |
+
"data_path": "Jellyfish042/UncheatableEval-2026-01-arxiv_math",
|
| 11 |
+
"chunk_size": 4000,
|
| 12 |
+
"bos_mode": "add_default_eos",
|
| 13 |
+
"model_args": {
|
| 14 |
+
"device_map": "auto",
|
| 15 |
+
"trust_remote_code": true,
|
| 16 |
+
"attn_implementation": "flash_attention_2",
|
| 17 |
+
"torch_dtype": "torch.bfloat16"
|
| 18 |
+
},
|
| 19 |
+
"tokenizer_args": {
|
| 20 |
+
"trust_remote_code": true
|
| 21 |
+
},
|
| 22 |
+
"requirements": [],
|
| 23 |
+
"batch_size": 1,
|
| 24 |
+
"enable_chunking": true,
|
| 25 |
+
"bpc": 0.5474241176985915,
|
| 26 |
+
"bpb": 0.5471742144298339,
|
| 27 |
+
"compression_rate": 6.839677680372924,
|
| 28 |
+
"track_byte_wise_data": false
|
| 29 |
+
}
|