Spaces:
Running
Running
Update curated.py
Browse files- curated.py +4 -3
curated.py
CHANGED
|
@@ -599,8 +599,8 @@ filtering_process = Div(
|
|
| 599 |
Section(
|
| 600 |
H3("FreeLaw"),
|
| 601 |
H4("Download and Extraction"),
|
| 602 |
-
P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"),". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function.",
|
| 603 |
-
|
| 604 |
("html", html2text),
|
| 605 |
("html_lawbox", html2text),
|
| 606 |
("html_columbia", html2text),
|
|
@@ -608,7 +608,8 @@ filtering_process = Div(
|
|
| 608 |
("html_with_citations", html2text),
|
| 609 |
("xml_harvard", html2text),
|
| 610 |
plain_text
|
| 611 |
-
""", language ="SQL"),
|
|
|
|
| 612 |
H4("Filtering"),
|
| 613 |
Ol(
|
| 614 |
Li("Language Filter: English"),
|
|
|
|
| 599 |
Section(
|
| 600 |
H3("FreeLaw"),
|
| 601 |
H4("Download and Extraction"),
|
| 602 |
+
P("The dataset was downloaded from: ", A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"),". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
|
| 603 |
+
D_code("""
|
| 604 |
("html", html2text),
|
| 605 |
("html_lawbox", html2text),
|
| 606 |
("html_columbia", html2text),
|
|
|
|
| 608 |
("html_with_citations", html2text),
|
| 609 |
("xml_harvard", html2text),
|
| 610 |
plain_text
|
| 611 |
+
""", language ="SQL"),
|
| 612 |
+
P("All content was downloaded, leading to a high number of documents filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."),
|
| 613 |
H4("Filtering"),
|
| 614 |
Ol(
|
| 615 |
Li("Language Filter: English"),
|