TxT360

Paused

App Files Community

victormiller committed on Oct 2, 2024

Commit

ed0e179

verified ·

1 Parent(s): 6a7bb93

Update curated.py

Browse files

Files changed (1) hide show

curated.py +36 -318

curated.py CHANGED Viewed

@@ -455,34 +455,6 @@ data_sources = [
     "Europarl",
 ]
-def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-    if data_source == "Freelaw":
-        raw_sample_doc = json.load(open("data/curated_samples/freelaw_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/freelaw_extract.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="Freelaw",
-        data_sources="Freelaw",
-        target=target,
-    )
-freelaw_examples = Div(
-    Div(
-        get_freelaw_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
 def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
@@ -513,261 +485,7 @@ wiki_examples = Div(
     ),
 )
-def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-    if data_source == "StackExchange":
-        raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/stackexchange_extract.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="StackExchange",
-        data_sources="StackExchange",
-        target=target,
-    )
-se_examples = Div(
-    Div(
-        get_se_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-    if data_source == "PhilPapers":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/philpapers_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="PhilPapers",
-        data_sources="PhilPapers",
-        target=target,
-    )
-phil_examples = Div(
-    Div(
-        get_phil_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-def get_arx_data(data_source: str = "Arxiv", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-    if data_source == "Arxiv":
-        raw_sample_doc = json.load(open("data/curated_samples/arxiv_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/arxiv_extract.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="Arxiv",
-        data_sources="Arxiv",
-        target=target,
-    )
-arx_examples = Div(
-    Div(
-        get_arx_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-    if data_source == "S2ORC":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/s2orc_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="S2ORC",
-        data_sources="S2ORC",
-        target=target,
-    )
-s2o_examples = Div(
-    Div(
-        get_S2ORC_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-    if data_source == "S2ORC":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/s2orc_abstract_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="S2ORC Abstract",
-        data_sources="S2ORC Abstract",
-        target=target,
-    )
-s2oa_examples = Div(
-    Div(
-        get_S2ORCA_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-    if data_source == "Pubmed":
-        raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/pubmed_extract.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="Pubmed",
-        data_sources="Pubmed",
-        target=target,
-    )
-pubmed_examples = Div(
-    Div(
-        get_pubmed_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-    if data_source == "DM Math":
-        raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/dm_maths_extract.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="DM Math",
-        data_sources="DM Math",
-        target=target,
-    )
-dmm_examples = Div(
-    Div(
-        get_dmm_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-    if data_source == "PG19":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/pg19_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="PG19",
-        data_sources="PG19",
-        target=target,
-    )
-pg19_examples = Div(
-    Div(
-        get_pg19_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
-def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-    if data_source == "Europarl":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/europarl_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source="Europarl",
-        data_sources="Europarl",
-        target=target,
-    )
-eu_examples = Div(
-    Div(
-        get_eu_data(target=gen_random_id()),
-        style="border: 1px solid #ccc; padding: 20px;",
-    ),
-)
 filtering_process = Div(
     Section(
@@ -803,10 +521,10 @@ filtering_process = Div(
             Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
         ),
         table_div_arx,
-        Details(
-            Summary("ArXiv Filtering Examples"),
-            arx_examples,
-        ),
         ),
     ),
     Section(
@@ -845,10 +563,10 @@ filtering_process = Div(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
         ),
         table_div_s2o,
-        Details(
-            Summary("FreeLaw Filtering Examples -- need to update"),
-            freelaw_examples,
-        ),
         ),
     ),
     Section(
@@ -881,10 +599,10 @@ filtering_process = Div(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
         table_div_med,
-        Details(
-            Summary("PubMed Filtering Examples"),
-            pubmed_examples,
-        ),
         ),
     ),
     Section(
@@ -898,10 +616,10 @@ filtering_process = Div(
             Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
         ),
         table_div_phil,
-        Details(
-            Summary("Phil Papers Filtering Examples"),
-            phil_examples,
-        ),
         ),
     ),
     Section(
@@ -913,10 +631,10 @@ filtering_process = Div(
         H4("Filtering"),
         P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
         table_div_up,
-        Details(
-            Summary("EuroParl Filtering Examples"),
-            eu_examples,
-        ),
     ),
     ),
     Section(
@@ -977,10 +695,10 @@ filtering_process = Div(
             Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
         ),
         table_div_freelaw,
-        Details(
-            Summary("FreeLaw Filtering Examples"),
-            freelaw_examples,
-        ),
         ),
     ),
@@ -1006,10 +724,10 @@ filtering_process = Div(
             Li("Minimum Word Count Filter: 10"),
         ),
         table_div_se,
-        Details(
-            Summary("StackExchange Filtering Examples"),
-            se_examples,
-        ),
         ),
     ),
     Section(
@@ -1058,10 +776,10 @@ filtering_process = Div(
             Li("None"),
         ),
         table_div_dmm,
-        Details(
-            Summary("DM Math Filtering Examples"),
-            dmm_examples,
-        ),
        ),
     ),
     Section(
@@ -1079,10 +797,10 @@ filtering_process = Div(
             Li("Unigram Log Probability"),
         ),
         table_div_pg19,
-        Details(
-            Summary("PG-19 Filtering Examples"),
-            pg19_examples,
-        ),
         ),
     ),
 )

     "Europarl",
 ]
 def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
     ),
 )
 filtering_process = Div(
     Section(
             Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
         ),
         table_div_arx,
+       # Details(
+       #     Summary("ArXiv Filtering Examples"),
+       #     arx_examples,
+       # ),
         ),
     ),
     Section(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
         ),
         table_div_s2o,
+       # Details(
+      #      Summary("FreeLaw Filtering Examples -- need to update"),
+      #      freelaw_examples,
+      #  ),
         ),
     ),
     Section(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
         table_div_med,
+      #  Details(
+      #      Summary("PubMed Filtering Examples"),
+      #      pubmed_examples,
+      #  ),
         ),
     ),
     Section(
             Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
         ),
         table_div_phil,
+      #  Details(
+      #      Summary("Phil Papers Filtering Examples"),
+       #     phil_examples,
+       # ),
         ),
     ),
     Section(
         H4("Filtering"),
         P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
         table_div_up,
+      #  Details(
+      #      Summary("EuroParl Filtering Examples"),
+      #      eu_examples,
+      #  ),
     ),
     ),
     Section(
             Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
         ),
         table_div_freelaw,
+      #  Details(
+      #      Summary("FreeLaw Filtering Examples"),
+      #      freelaw_examples,
+      #  ),
         ),
     ),
             Li("Minimum Word Count Filter: 10"),
         ),
         table_div_se,
+       # Details(
+       #     Summary("StackExchange Filtering Examples"),
+       #     se_examples,
+       # ),
         ),
     ),
     Section(
             Li("None"),
         ),
         table_div_dmm,
+       # Details(
+       #     Summary("DM Math Filtering Examples"),
+       #     dmm_examples,
+       # ),
        ),
     ),
     Section(
             Li("Unigram Log Probability"),
         ),
         table_div_pg19,
+        #Details(
+        #    Summary("PG-19 Filtering Examples"),
+        #    pg19_examples,
+        #),
         ),
     ),
 )