Update curated.py
Browse files- curated.py +36 -318
curated.py
CHANGED
|
@@ -455,34 +455,6 @@ data_sources = [
|
|
| 455 |
"Europarl",
|
| 456 |
]
|
| 457 |
|
| 458 |
-
def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
    """Render one FreeLaw sample document (raw vs. extracted) via ``view_data``.

    Args:
        data_source: Source name. The sample JSON files are loaded only when
            this equals ``"Freelaw"``; any other value produces ten empty
            placeholder documents instead.
        doc_id: Index of the sample to show. Coerced with ``int()`` and
            clamped to the range 0..9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected raw/extracted pair.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "Freelaw":
        # Context managers close the sample files as soon as they are parsed
        # instead of leaving the handles open until garbage collection.
        with open("data/curated_samples/freelaw_raw.json") as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open("data/curated_samples/freelaw_extract.json") as extract_file:
            extracted_sample_doc = json.load(extract_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        # The labels passed on are fixed; the data_source argument only
        # decides whether the sample files are loaded.
        data_source="Freelaw",
        data_sources="Freelaw",
        target=target,
    )
|
| 479 |
-
|
| 480 |
-
# FreeLaw example viewer: the output of get_freelaw_data() (called with a
# target produced by gen_random_id()) inside a bordered, padded Div, which is
# itself wrapped in an outer Div.
freelaw_examples = Div(
    Div(get_freelaw_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
|
| 486 |
|
| 487 |
|
| 488 |
def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
|
|
@@ -513,261 +485,7 @@ wiki_examples = Div(
|
|
| 513 |
),
|
| 514 |
)
|
| 515 |
|
| 516 |
-
def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
    """Render one StackExchange sample document (raw vs. extracted) via ``view_data``.

    Args:
        data_source: Source name. The sample JSON files are loaded only when
            this equals ``"StackExchange"``; any other value produces ten
            empty placeholder documents instead.
        doc_id: Index of the sample to show. Coerced with ``int()`` and
            clamped to the range 0..9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected raw/extracted pair.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "StackExchange":
        # Context managers close the sample files as soon as they are parsed
        # instead of leaving the handles open until garbage collection.
        with open("data/curated_samples/stackexchange_raw.json") as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open("data/curated_samples/stackexchange_extract.json") as extract_file:
            extracted_sample_doc = json.load(extract_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        # The labels passed on are fixed; the data_source argument only
        # decides whether the sample files are loaded.
        data_source="StackExchange",
        data_sources="StackExchange",
        target=target,
    )
|
| 537 |
-
|
| 538 |
-
# StackExchange example viewer: the output of get_se_data() (called with a
# target produced by gen_random_id()) inside a bordered, padded Div, which is
# itself wrapped in an outer Div.
se_examples = Div(
    Div(get_se_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
|
| 544 |
-
|
| 545 |
-
def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
    """Render one PhilPapers sample document via ``view_data``.

    Only a raw sample file exists for this source, so the same loaded list is
    used for both the raw and the extracted side.

    Args:
        data_source: Source name. The sample JSON file is loaded only when
            this equals ``"PhilPapers"``; any other value produces ten empty
            placeholder documents instead.
        doc_id: Index of the sample to show. Coerced with ``int()`` and
            clamped to the range 0..9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected document.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "PhilPapers":
        # The context manager closes the sample file as soon as it is parsed
        # instead of leaving the handle open until garbage collection.
        with open("data/curated_samples/philpapers_raw.json") as sample_file:
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        # The labels passed on are fixed; the data_source argument only
        # decides whether the sample file is loaded.
        data_source="PhilPapers",
        data_sources="PhilPapers",
        target=target,
    )
|
| 565 |
-
|
| 566 |
-
# PhilPapers example viewer: the output of get_phil_data() (called with a
# target produced by gen_random_id()) inside a bordered, padded Div, which is
# itself wrapped in an outer Div.
phil_examples = Div(
    Div(get_phil_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
|
| 572 |
-
|
| 573 |
-
def get_arx_data(data_source: str = "Arxiv", doc_id: int = 3, target: str = "foo"):
    """Render one ArXiv sample document (raw vs. extracted) via ``view_data``.

    Args:
        data_source: Source name. The sample JSON files are loaded only when
            this equals ``"Arxiv"``; any other value produces ten empty
            placeholder documents instead.
        doc_id: Index of the sample to show. Coerced with ``int()`` and
            clamped to the range 0..9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected raw/extracted pair.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "Arxiv":
        # Context managers close the sample files as soon as they are parsed
        # instead of leaving the handles open until garbage collection.
        with open("data/curated_samples/arxiv_raw.json") as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open("data/curated_samples/arxiv_extract.json") as extract_file:
            extracted_sample_doc = json.load(extract_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        # The labels passed on are fixed; the data_source argument only
        # decides whether the sample files are loaded.
        data_source="Arxiv",
        data_sources="Arxiv",
        target=target,
    )
|
| 594 |
-
|
| 595 |
-
# ArXiv example viewer: the output of get_arx_data() (called with a target
# produced by gen_random_id()) inside a bordered, padded Div, which is itself
# wrapped in an outer Div.
arx_examples = Div(
    Div(get_arx_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
|
| 601 |
-
|
| 602 |
-
def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
    """Render one S2ORC sample document via ``view_data``.

    Only a raw sample file exists for this source, so the same loaded list is
    used for both the raw and the extracted side.

    Args:
        data_source: Source name. The sample JSON file is loaded only when
            this equals ``"S2ORC"``; any other value produces ten empty
            placeholder documents instead.
        doc_id: Index of the sample to show. Coerced with ``int()`` and
            clamped to the range 0..9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected document.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "S2ORC":
        # The context manager closes the sample file as soon as it is parsed
        # instead of leaving the handle open until garbage collection.
        with open("data/curated_samples/s2orc_raw.json") as sample_file:
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        # The labels passed on are fixed; the data_source argument only
        # decides whether the sample file is loaded.
        data_source="S2ORC",
        data_sources="S2ORC",
        target=target,
    )
|
| 622 |
-
|
| 623 |
-
# S2ORC example viewer: the output of get_S2ORC_data() (called with a target
# produced by gen_random_id()) inside a bordered, padded Div, which is itself
# wrapped in an outer Div.
s2o_examples = Div(
    Div(get_S2ORC_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
|
| 629 |
-
|
| 630 |
-
def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
    """Render one S2ORC Abstract sample document via ``view_data``.

    Only a raw sample file exists for this source, so the same loaded list is
    used for both the raw and the extracted side.

    Args:
        data_source: Source name. The sample JSON file is loaded when this is
            ``"S2ORC Abstract"`` (the default) or ``"S2ORC"``; any other value
            produces ten empty placeholder documents instead.
        doc_id: Index of the sample to show. Coerced with ``int()`` and
            clamped to the range 0..9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected document.
    """
    doc_id = max(0, min(int(doc_id), 9))

    # The guard used to test only for "S2ORC", so the default argument
    # ("S2ORC Abstract") never matched and a default call always rendered the
    # empty placeholders. "S2ORC" is still accepted so existing callers that
    # pass it keep getting the abstract samples.
    if data_source in ("S2ORC Abstract", "S2ORC"):
        # The context manager closes the sample file as soon as it is parsed
        # instead of leaving the handle open until garbage collection.
        with open("data/curated_samples/s2orc_abstract_raw.json") as sample_file:
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        # The labels passed on are fixed; the data_source argument only
        # decides whether the sample file is loaded.
        data_source="S2ORC Abstract",
        data_sources="S2ORC Abstract",
        target=target,
    )
|
| 650 |
-
|
| 651 |
-
# S2ORC Abstract example viewer: the output of get_S2ORCA_data() (called with
# a target produced by gen_random_id()) inside a bordered, padded Div, which
# is itself wrapped in an outer Div.
s2oa_examples = Div(
    Div(get_S2ORCA_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
|
| 657 |
-
|
| 658 |
-
def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
    """Render one PubMed sample document (raw vs. extracted) via ``view_data``.

    Args:
        data_source: Source name. The sample JSON files are loaded only when
            this equals ``"Pubmed"``; any other value produces ten empty
            placeholder documents instead.
        doc_id: Index of the sample to show. Coerced with ``int()`` and
            clamped to the range 0..9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected raw/extracted pair.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "Pubmed":
        # Context managers close the sample files as soon as they are parsed
        # instead of leaving the handles open until garbage collection.
        with open("data/curated_samples/pubmed_raw.json") as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open("data/curated_samples/pubmed_extract.json") as extract_file:
            extracted_sample_doc = json.load(extract_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        # The labels passed on are fixed; the data_source argument only
        # decides whether the sample files are loaded.
        data_source="Pubmed",
        data_sources="Pubmed",
        target=target,
    )
|
| 679 |
-
|
| 680 |
-
# PubMed example viewer: the output of get_pubmed_data() (called with a target
# produced by gen_random_id()) inside a bordered, padded Div, which is itself
# wrapped in an outer Div.
pubmed_examples = Div(
    Div(get_pubmed_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
|
| 686 |
-
|
| 687 |
-
def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
    """Render one DM Math sample document (raw vs. extracted) via ``view_data``.

    Args:
        data_source: Source name. The sample JSON files are loaded only when
            this equals ``"DM Math"``; any other value produces ten empty
            placeholder documents instead.
        doc_id: Index of the sample to show. Coerced with ``int()`` and
            clamped to the range 0..9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected raw/extracted pair.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "DM Math":
        # Context managers close the sample files as soon as they are parsed
        # instead of leaving the handles open until garbage collection.
        with open("data/curated_samples/dm_maths_raw.json") as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open("data/curated_samples/dm_maths_extract.json") as extract_file:
            extracted_sample_doc = json.load(extract_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        # The labels passed on are fixed; the data_source argument only
        # decides whether the sample files are loaded.
        data_source="DM Math",
        data_sources="DM Math",
        target=target,
    )
|
| 708 |
-
|
| 709 |
-
# DM Math example viewer: the output of get_dmm_data() (called with a target
# produced by gen_random_id()) inside a bordered, padded Div, which is itself
# wrapped in an outer Div.
dmm_examples = Div(
    Div(get_dmm_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
|
| 715 |
-
|
| 716 |
-
def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
    """Render one PG19 sample document via ``view_data``.

    Only a raw sample file exists for this source, so the same loaded list is
    used for both the raw and the extracted side.

    Args:
        data_source: Source name. The sample JSON file is loaded only when
            this equals ``"PG19"``; any other value produces ten empty
            placeholder documents instead.
        doc_id: Index of the sample to show. Coerced with ``int()`` and
            clamped to the range 0..9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected document.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "PG19":
        # The context manager closes the sample file as soon as it is parsed
        # instead of leaving the handle open until garbage collection.
        with open("data/curated_samples/pg19_raw.json") as sample_file:
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        # The labels passed on are fixed; the data_source argument only
        # decides whether the sample file is loaded.
        data_source="PG19",
        data_sources="PG19",
        target=target,
    )
|
| 736 |
-
|
| 737 |
-
# PG19 example viewer: the output of get_pg19_data() (called with a target
# produced by gen_random_id()) inside a bordered, padded Div, which is itself
# wrapped in an outer Div.
pg19_examples = Div(
    Div(get_pg19_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
|
| 743 |
-
|
| 744 |
-
def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
    """Render one Europarl sample document via ``view_data``.

    Only a raw sample file exists for this source, so the same loaded list is
    used for both the raw and the extracted side.

    Args:
        data_source: Source name. The sample JSON file is loaded only when
            this equals ``"Europarl"``; any other value produces ten empty
            placeholder documents instead.
        doc_id: Index of the sample to show. Coerced with ``int()`` and
            clamped to the range 0..9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected document.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "Europarl":
        # The context manager closes the sample file as soon as it is parsed
        # instead of leaving the handle open until garbage collection.
        with open("data/curated_samples/europarl_raw.json") as sample_file:
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        # The labels passed on are fixed; the data_source argument only
        # decides whether the sample file is loaded.
        data_source="Europarl",
        data_sources="Europarl",
        target=target,
    )
|
| 764 |
-
|
| 765 |
-
# Europarl example viewer: the output of get_eu_data() (called with a target
# produced by gen_random_id()) inside a bordered, padded Div, which is itself
# wrapped in an outer Div.
eu_examples = Div(
    Div(get_eu_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
|
| 771 |
|
| 772 |
filtering_process = Div(
|
| 773 |
Section(
|
|
@@ -803,10 +521,10 @@ filtering_process = Div(
|
|
| 803 |
Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
| 804 |
),
|
| 805 |
table_div_arx,
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
),
|
| 811 |
),
|
| 812 |
Section(
|
|
@@ -845,10 +563,10 @@ filtering_process = Div(
|
|
| 845 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
|
| 846 |
),
|
| 847 |
table_div_s2o,
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
),
|
| 853 |
),
|
| 854 |
Section(
|
|
@@ -881,10 +599,10 @@ filtering_process = Div(
|
|
| 881 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
| 882 |
),
|
| 883 |
table_div_med,
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
),
|
| 889 |
),
|
| 890 |
Section(
|
|
@@ -898,10 +616,10 @@ filtering_process = Div(
|
|
| 898 |
Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
|
| 899 |
),
|
| 900 |
table_div_phil,
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
),
|
| 906 |
),
|
| 907 |
Section(
|
|
@@ -913,10 +631,10 @@ filtering_process = Div(
|
|
| 913 |
H4("Filtering"),
|
| 914 |
P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
|
| 915 |
table_div_up,
|
| 916 |
-
|
| 917 |
-
|
| 918 |
-
|
| 919 |
-
|
| 920 |
),
|
| 921 |
),
|
| 922 |
Section(
|
|
@@ -977,10 +695,10 @@ filtering_process = Div(
|
|
| 977 |
Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
|
| 978 |
),
|
| 979 |
table_div_freelaw,
|
| 980 |
-
|
| 981 |
-
|
| 982 |
-
|
| 983 |
-
|
| 984 |
|
| 985 |
),
|
| 986 |
),
|
|
@@ -1006,10 +724,10 @@ filtering_process = Div(
|
|
| 1006 |
Li("Minimum Word Count Filter: 10"),
|
| 1007 |
),
|
| 1008 |
table_div_se,
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
),
|
| 1014 |
),
|
| 1015 |
Section(
|
|
@@ -1058,10 +776,10 @@ filtering_process = Div(
|
|
| 1058 |
Li("None"),
|
| 1059 |
),
|
| 1060 |
table_div_dmm,
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
|
| 1064 |
-
|
| 1065 |
),
|
| 1066 |
),
|
| 1067 |
Section(
|
|
@@ -1079,10 +797,10 @@ filtering_process = Div(
|
|
| 1079 |
Li("Unigram Log Probability"),
|
| 1080 |
),
|
| 1081 |
table_div_pg19,
|
| 1082 |
-
Details(
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
-
),
|
| 1086 |
),
|
| 1087 |
),
|
| 1088 |
)
|
|
|
|
| 455 |
"Europarl",
|
| 456 |
]
|
| 457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
| 459 |
|
| 460 |
def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
|
|
|
|
| 485 |
),
|
| 486 |
)
|
| 487 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
|
| 490 |
filtering_process = Div(
|
| 491 |
Section(
|
|
|
|
| 521 |
Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
| 522 |
),
|
| 523 |
table_div_arx,
|
| 524 |
+
# Details(
|
| 525 |
+
# Summary("ArXiv Filtering Examples"),
|
| 526 |
+
# arx_examples,
|
| 527 |
+
# ),
|
| 528 |
),
|
| 529 |
),
|
| 530 |
Section(
|
|
|
|
| 563 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
|
| 564 |
),
|
| 565 |
table_div_s2o,
|
| 566 |
+
# Details(
|
| 567 |
+
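# Summary("FreeLaw Filtering Examples -- need to update"),  # TODO: copied from the FreeLaw section; this block follows table_div_s2o, so it presumably should use an S2ORC summary and examples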
|
| 568 |
+
# freelaw_examples,
|
| 569 |
+
# ),
|
| 570 |
),
|
| 571 |
),
|
| 572 |
Section(
|
|
|
|
| 599 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
| 600 |
),
|
| 601 |
table_div_med,
|
| 602 |
+
# Details(
|
| 603 |
+
# Summary("PubMed Filtering Examples"),
|
| 604 |
+
# pubmed_examples,
|
| 605 |
+
# ),
|
| 606 |
),
|
| 607 |
),
|
| 608 |
Section(
|
|
|
|
| 616 |
Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
|
| 617 |
),
|
| 618 |
table_div_phil,
|
| 619 |
+
# Details(
|
| 620 |
+
# Summary("Phil Papers Filtering Examples"),
|
| 621 |
+
# phil_examples,
|
| 622 |
+
# ),
|
| 623 |
),
|
| 624 |
),
|
| 625 |
Section(
|
|
|
|
| 631 |
H4("Filtering"),
|
| 632 |
P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
|
| 633 |
table_div_up,
|
| 634 |
+
# Details(
|
| 635 |
+
# Summary("EuroParl Filtering Examples"),
|
| 636 |
+
# eu_examples,
|
| 637 |
+
# ),
|
| 638 |
),
|
| 639 |
),
|
| 640 |
Section(
|
|
|
|
| 695 |
Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
|
| 696 |
),
|
| 697 |
table_div_freelaw,
|
| 698 |
+
# Details(
|
| 699 |
+
# Summary("FreeLaw Filtering Examples"),
|
| 700 |
+
# freelaw_examples,
|
| 701 |
+
# ),
|
| 702 |
|
| 703 |
),
|
| 704 |
),
|
|
|
|
| 724 |
Li("Minimum Word Count Filter: 10"),
|
| 725 |
),
|
| 726 |
table_div_se,
|
| 727 |
+
# Details(
|
| 728 |
+
# Summary("StackExchange Filtering Examples"),
|
| 729 |
+
# se_examples,
|
| 730 |
+
# ),
|
| 731 |
),
|
| 732 |
),
|
| 733 |
Section(
|
|
|
|
| 776 |
Li("None"),
|
| 777 |
),
|
| 778 |
table_div_dmm,
|
| 779 |
+
# Details(
|
| 780 |
+
# Summary("DM Math Filtering Examples"),
|
| 781 |
+
# dmm_examples,
|
| 782 |
+
# ),
|
| 783 |
),
|
| 784 |
),
|
| 785 |
Section(
|
|
|
|
| 797 |
Li("Unigram Log Probability"),
|
| 798 |
),
|
| 799 |
table_div_pg19,
|
| 800 |
+
#Details(
|
| 801 |
+
# Summary("PG-19 Filtering Examples"),
|
| 802 |
+
# pg19_examples,
|
| 803 |
+
#),
|
| 804 |
),
|
| 805 |
),
|
| 806 |
)
|