Spaces:

lhoestq
/

Common-Crawl-Pipeline-Creator

Running

App Files Files Community

lhoestq HF Staff committed on Oct 11, 2024

Commit

905f549

1 Parent(s): c361455

add excluded tab

Browse files

Files changed (1) hide show

app.py +128 -62

app.py CHANGED Viewed

@@ -1,15 +1,13 @@
 import ast
 import glob
-import tempfile
-from dataclasses import asdict
 from itertools import islice
 from functools import partial
-from typing import Optional
 import gradio as gr
 import nltk
 import pandas as pd
-from datatrove.utils.typeshelper import Languages
 from datatrove.executor.local import LocalPipelineExecutor
 from datatrove.pipeline.extractors import Trafilatura
 from datatrove.pipeline.filters.base_filter import BaseFilter
@@ -23,8 +21,7 @@ from datatrove.pipeline.filters import (
 )
 from datatrove.pipeline.formatters import PIIFormatter
 from datatrove.pipeline.readers import JsonlReader, WarcReader
-from datatrove.pipeline.writers.jsonl import JsonlWriter
-from difflib import Differ
 nltk.download('punkt_tab')
@@ -114,6 +111,9 @@ blocks = sorted(glob.glob("images/*.png"))
 def prepare_as_list_or_none(text: str) -> Optional[list[str]]:
     """Split a comma-separated string into a list of stripped, non-empty items.

     Returns None when `text` is falsy or when no non-empty item remains
     after stripping.
     """
     if not text:
         return None
     stripped_items = [piece.strip() for piece in text.split(",")]
     kept_items = [item for item in stripped_items if item]
     if kept_items:
         return kept_items
     return None
 def build_code_snippet(steps, params=None):
     # TODO
     return (
@@ -183,8 +183,8 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
             language_filtering_checkbox = gr.Checkbox(True, label="Enable")
             with gr.Accordion("Parameters", open=True) as acc:
                 with gr.Row():
-                    languages_textbox = gr.Textbox("", label="languages", info="list of languages to keep. empty for all")
-                    languages_textbox.prepare_parameter = prepare_as_list_or_none
                     language_threshold_slider = gr.Slider(0, 1, value=0.65, step=0.05, label="language_threshold", info="minimum score to accept a document")
             language_filtering_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=language_filtering_checkbox, outputs=acc)
         language_filtering_parameters_components = [languages_textbox, language_threshold_slider]
@@ -196,7 +196,7 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
             with gr.Accordion("Parameters", open=True) as acc:
                 with gr.Group():
                     with gr.Row():
-                        language_dropdown1 = gr.Dropdown([v for k, v in vars(Languages).items() if not k.startswith("__")], value=Languages.english, label="language", info="tokenizer language")
                         top_n_grams_textbox = gr.Textbox("(2, 0.2), (3, 0.18), (4, 0.16)", label="top_n_grams")
                         top_n_grams_textbox.prepare_parameter = ast.literal_eval
                         dup_n_grams_textbox = gr.Textbox("(5, 0.15), (6, 0.14), (7, 0.13), (8, 0.12), (9, 0.11), (10, 0.10)", label="dup_n_grams")
@@ -250,7 +250,7 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
                     with gr.Row():
                         split_paragraph_checkbox = gr.Checkbox(True, label="split_paragraph", info="disable to apply the filters to each sentence instead of to each line")
                     with gr.Row():
-                        language_dropdown2 = gr.Dropdown([v for k, v in vars(Languages).items() if not k.startswith("__")], value=Languages.english, label="language", info="tokenizer language")
                         min_num_sentences_slider = gr.Slider(0, 10, value=5, step=1, label="min_num_sentences", info="remove documents that do not have at least this number of sentences (after line filtering)")
                         min_words_per_line_slider = gr.Slider(0, 10, value=3, step=1, label="min_words_per_line", info="drop lines without this min number of words")
                         max_word_length_slider = gr.Slider(0, 2000, value=1000, step=10, label="max_word_length", info=" drop lines where at least one word has more than this number of characters")
@@ -271,7 +271,7 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
             with gr.Accordion("Parameters", open=True) as acc:
                 with gr.Group():
                     with gr.Row():
-                        language_dropdown2 = gr.Dropdown([v for k, v in vars(Languages).items() if not k.startswith("__")], value=Languages.english, label="language", info="tokenizer language")
                         min_doc_words_slider = gr.Slider(0, 1000, value=50, step=10, label="min_doc_words")
                         max_doc_words_slider = gr.Slider(0, 200_000, value=100_000, step=10_000, label="max_doc_words")
                     with gr.Row():
@@ -289,7 +289,9 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
             gopher_filtering_quality_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=gopher_filtering_quality_checkbox, outputs=acc)
         gopher_filtering_quality_parameters_components = [language_dropdown2, min_doc_words_slider, max_doc_words_slider, min_avg_word_length_slider, max_avg_word_length_slider, max_symbol_word_ratio_slider, max_bullet_lines_ratio_slider, max_ellipsis_lines_ratio_slider, max_non_alpha_words_ratio_slider, min_stop_words_slider, stop_words_textbox]
-    view_pipeline_results_button = gr.Button("View Pipeline Results", variant="primary")
     steps = [
         URLFilter,
@@ -313,7 +315,15 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
     ]
     with gr.Tab("Output") as output_tab:
-        output_dataframe_diff = gr.DataFrame(datatype="markdown")
     with gr.Tab("Python code") as code_tab:
         python_code_markdown = gr.Markdown(build_code_snippet(steps))
@@ -338,7 +348,7 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
         pii_removal_checkbox
     ] + sum(steps_parameters_components, [])
-    @view_pipeline_results_button.click(inputs=inputs, outputs=[output_tab, output_dataframe_diff])
     def view_pipeline_results(*args):
         enable_steps, steps_parameters = args[:len(steps)], args[len(steps):]
         steps_parameters_iter = iter(steps_parameters)
@@ -357,53 +367,109 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
             for step_parameters_components in steps_parameters_components
         ]
-        with tempfile.TemporaryDirectory() as output_path:
-            steps_to_run = [
-                step(**step_parameters, **({"exclusion_writer": JsonlWriter(f"{output_path}/base_processing/removed/{step.__name__}/{DUMP_TO_PROCESS}")} if issubclass(step, BaseFilter) and False else {}))
-                for step, step_parameters, enable_step in zip(steps, steps_parameters, enable_steps)
-                if enable_step
-            ]
-            output_docs = []
-            if steps_parameters[:2] == default_steps_parameters[:2] and all(enable_steps[:2]):
-                num_warc_samples = 2000
-                default_output_docs = default_output_docs_2k
-                pipeline = LocalPipelineExecutor(
-                    pipeline=[
-                        JsonlReader(data_folder=f"output_text_extraction-2k/base_processing/output/{DUMP_TO_PROCESS}", glob_pattern="*.jsonl.gz")
-                    ] + steps_to_run[2:] + [
-                        lambda data, rank, world_size: map(output_docs.append, data)
-                    ],
-                    logging_dir="logs",
-                    skip_completed=False
-                )
-            else:
-                num_warc_samples = 200
-                default_output_docs = default_output_docs_200
-                pipeline = LocalPipelineExecutor(
-                    pipeline=[
-                        WarcReader(data_folder="data", glob_pattern="*.warc.gz"),
-                        lambda data, rank, world_size: islice(data, num_warc_samples),
-                    ] + steps_to_run + [
-                        lambda data, rank, world_size: map(output_docs.append, data)
-                    ],
-                    logging_dir="logs",
-                    skip_completed=False
-                )
-            pipeline.run()
-            out = [doc.text[:1_000] + f" [+{len(doc.text) - 1000} chars]" if len(doc.text) > 1_000 else doc.text for doc in output_docs]
-            default_out = [doc["text"][:1_000] + f" [+{len(doc['text']) - 1000} chars]" if len(doc["text"]) > 1_000 else doc["text"] for doc in default_output_docs]
-            output_diff = []
-            for text_diff in Differ().compare(default_out, out[:len(default_out) * 10]):
-                opcode, text = text_diff[0], text_diff[2:]
-                if opcode == "-":
-                    text = f'<div class="diffDeletion">\n\n{text}\n\n</div>'
-                elif opcode == "+":
-                    text = f'<div class="diffInsertion">\n\n{text}\n\n</div>'
-                output_diff.append(text)
-            return {
-                output_tab: gr.Tab(f"Output: kept {len(out)/num_warc_samples*100:.02f}% of data"),
-                output_dataframe_diff: pd.DataFrame({"text": output_diff}),
-            }
-demo.launch()

 import ast
 import glob
 from itertools import islice
 from functools import partial
+from typing import Optional, Type
 import gradio as gr
 import nltk
 import pandas as pd
+from datatrove.data import Document
 from datatrove.executor.local import LocalPipelineExecutor
 from datatrove.pipeline.extractors import Trafilatura
 from datatrove.pipeline.filters.base_filter import BaseFilter
 )
 from datatrove.pipeline.formatters import PIIFormatter
 from datatrove.pipeline.readers import JsonlReader, WarcReader
+from datatrove.utils.typeshelper import Languages
 nltk.download('punkt_tab')
 def prepare_as_list_or_none(text: str) -> Optional[list[str]]:
     """Split a comma-separated string into a list of stripped, non-empty items.

     Returns None when `text` is falsy or when no non-empty item remains
     after stripping.
     """
     if not text:
         return None
     stripped_items = [piece.strip() for piece in text.split(",")]
     kept_items = [item for item in stripped_items if item]
     if kept_items:
         return kept_items
     return None
+def non_empty_list_or_none(input_list: list[str]) -> Optional[list[str]]:  # normalise an empty selection to None
+    return input_list or None  # any falsy value (e.g. []) becomes None; otherwise the list is returned unchanged
 def build_code_snippet(steps, params=None):
     # TODO
     return (
             language_filtering_checkbox = gr.Checkbox(True, label="Enable")
             with gr.Accordion("Parameters", open=True) as acc:
                 with gr.Row():
+                    languages_textbox = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), multiselect=True, label="languages", info="list of languages to keep. empty for all")
+                    languages_textbox.prepare_parameter = non_empty_list_or_none
                     language_threshold_slider = gr.Slider(0, 1, value=0.65, step=0.05, label="language_threshold", info="minimum score to accept a document")
             language_filtering_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=language_filtering_checkbox, outputs=acc)
         language_filtering_parameters_components = [languages_textbox, language_threshold_slider]
             with gr.Accordion("Parameters", open=True) as acc:
                 with gr.Group():
                     with gr.Row():
+                        language_dropdown1 = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), value=Languages.english, label="language", info="tokenizer language")
                         top_n_grams_textbox = gr.Textbox("(2, 0.2), (3, 0.18), (4, 0.16)", label="top_n_grams")
                         top_n_grams_textbox.prepare_parameter = ast.literal_eval
                         dup_n_grams_textbox = gr.Textbox("(5, 0.15), (6, 0.14), (7, 0.13), (8, 0.12), (9, 0.11), (10, 0.10)", label="dup_n_grams")
                     with gr.Row():
                         split_paragraph_checkbox = gr.Checkbox(True, label="split_paragraph", info="disable to apply the filters to each sentence instead of to each line")
                     with gr.Row():
+                        language_dropdown2 = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), value=Languages.english, label="language", info="tokenizer language")
                         min_num_sentences_slider = gr.Slider(0, 10, value=5, step=1, label="min_num_sentences", info="remove documents that do not have at least this number of sentences (after line filtering)")
                         min_words_per_line_slider = gr.Slider(0, 10, value=3, step=1, label="min_words_per_line", info="drop lines without this min number of words")
                         max_word_length_slider = gr.Slider(0, 2000, value=1000, step=10, label="max_word_length", info=" drop lines where at least one word has more than this number of characters")
             with gr.Accordion("Parameters", open=True) as acc:
                 with gr.Group():
                     with gr.Row():
+                        language_dropdown2 = gr.Dropdown(sorted(v for k, v in vars(Languages).items() if not k.startswith("__")), value=Languages.english, label="language", info="tokenizer language")
                         min_doc_words_slider = gr.Slider(0, 1000, value=50, step=10, label="min_doc_words")
                         max_doc_words_slider = gr.Slider(0, 200_000, value=100_000, step=10_000, label="max_doc_words")
                     with gr.Row():
             gopher_filtering_quality_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=gopher_filtering_quality_checkbox, outputs=acc)
         gopher_filtering_quality_parameters_components = [language_dropdown2, min_doc_words_slider, max_doc_words_slider, min_avg_word_length_slider, max_avg_word_length_slider, max_symbol_word_ratio_slider, max_bullet_lines_ratio_slider, max_ellipsis_lines_ratio_slider, max_non_alpha_words_ratio_slider, min_stop_words_slider, stop_words_textbox]
+    with gr.Row():
+        view_pipeline_results_button = gr.Button("Run Pipeline & Stream Results", variant="primary", scale=4)
+        stop_button = gr.Button("Stop")
     steps = [
         URLFilter,
     ]
     with gr.Tab("Output") as output_tab:
+        output_dataframe = gr.DataFrame(datatype="markdown")
+    with gr.Tab("Excluded") as excluded_tab:
+        excluded_dataframes: dict[Type, gr.DataFrame] = {}
+        excluded_tabs: dict[Type, gr.Tab] = {}
+        for step in steps:
+            if issubclass(step, BaseFilter) and step is not URLFilter:
+                with gr.Tab(step.__name__) as t:
+                    excluded_dataframes[step] = gr.DataFrame(datatype="markdown")
+                    excluded_tabs[step] = t
     with gr.Tab("Python code") as code_tab:
         python_code_markdown = gr.Markdown(build_code_snippet(steps))
         pii_removal_checkbox
     ] + sum(steps_parameters_components, [])
+    @view_pipeline_results_button.click(inputs=inputs, outputs=[output_tab, output_dataframe, excluded_tab] + list(excluded_dataframes.values()) + list(excluded_tabs.values()))
     def view_pipeline_results(*args):
         enable_steps, steps_parameters = args[:len(steps)], args[len(steps):]
         steps_parameters_iter = iter(steps_parameters)
             for step_parameters_components in steps_parameters_components
         ]
+        class ExclusionWriter:  # in-memory collector handed to steps as their `exclusion_writer`
+            def __init__(self) -> None:
+                self.docs: list[Document] = []  # every document passed to write(), in call order
+            def __enter__(self):
+                return self  # usable in a `with` block; nothing to set up
+            def __exit__(self, exc_type, exc_val, exc_tb):
+                return  # returns None, so exceptions raised inside the `with` block propagate
+            def write(self, doc, rank):
+                self.docs.append(doc)  # `rank` is accepted but not used
+        steps_to_run = [
+            step(**step_parameters, **({"exclusion_writer": ExclusionWriter()} if step in excluded_dataframes else {}))
+            for step, step_parameters, enable_step in zip(steps, steps_parameters, enable_steps)
+            if enable_step
+        ]
+        output_docs: list[Document] = []
+        num_warc_samples = 0
+        def increment_num_warc_samples(data, rank, world_size, num_warc_samples_per_doc=1):
+            nonlocal num_warc_samples
+            for x in data:
+                num_warc_samples += num_warc_samples_per_doc
+                yield x
+        if steps_parameters[:2] == default_steps_parameters[:2] and all(enable_steps[:2]):
+            pipeline_executor = LocalPipelineExecutor(
+                pipeline=[
+                    JsonlReader(data_folder=f"output_text_extraction-2k/base_processing/output/{DUMP_TO_PROCESS}", glob_pattern="*.jsonl.gz"),
+                    partial(increment_num_warc_samples, num_warc_samples_per_doc=2000 / 1687)
+                ] + steps_to_run[2:] + [
+                    lambda data, rank, world_size: map(output_docs.append, data)
+                ],
+                logging_dir="logs",
+                skip_completed=False
+            )
+        else:
+            pipeline_executor = LocalPipelineExecutor(
+                pipeline=[
+                    WarcReader(data_folder="data", glob_pattern="*.warc.gz"),
+                    lambda data, rank, world_size: islice(data, num_warc_samples),
+                ] + steps_to_run + [
+                    lambda data, rank, world_size: map(output_docs.append, data)
+                ],
+                logging_dir="logs",
+                skip_completed=False
+            )
+        from threading import Thread
+        thread = Thread(target=pipeline_executor.run)
+        thread.start()
+        while thread.is_alive():
+            thread.join(timeout=1)
+            if num_warc_samples:
+                yield {
+                    output_tab: gr.Tab(f"Output (~{len(output_docs)/num_warc_samples*100:.03f}% of data)"),
+                    excluded_tab: gr.Tab(f"Excluded (~{100 - len(output_docs)/num_warc_samples*100:.03f}% of data)"),
+                    output_dataframe: pd.DataFrame({"text": [doc.text for doc in output_docs]}),
+                    **{
+                        excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
+                        for step_to_run in pipeline_executor.pipeline
+                        if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
+                    },
+                    **{
+                        excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} (~{len(step_to_run.exclusion_writer.docs)/num_warc_samples*100:.03f}% of data)")
+                        for step_to_run in pipeline_executor.pipeline
+                        if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
+                    },
+                }
+            else:
+                yield {
+                    output_tab: gr.Tab("Output (loading...)"),
+                    excluded_tab: gr.Tab("Excluded (loading...)"),
+                    **{
+                        excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
+                        for step_to_run in pipeline_executor.pipeline
+                        if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
+                    },
+                    **{
+                        excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} (~{len(step_to_run.exclusion_writer.docs)/num_warc_samples*100:.03f}% of data)")
+                        for step_to_run in pipeline_executor.pipeline
+                        if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
+                    },
+                }
+        yield {
+            output_tab: gr.Tab(f"Output (~{len(output_docs)/num_warc_samples*100:.03f}% of data)"),
+            excluded_tab: gr.Tab(f"Excluded (~{100 - len(output_docs)/num_warc_samples*100:.03f}% of data)"),
+            output_dataframe: pd.DataFrame({"text": [doc.text for doc in output_docs]}),
+            **{
+                excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
+                for step_to_run in pipeline_executor.pipeline
+                if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
+            },
+            **{
+                excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} (~{len(step_to_run.exclusion_writer.docs)/num_warc_samples*100:.03f}% of data)")
+                for step_to_run in pipeline_executor.pipeline
+                if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
+            },
+        }
+if __name__ == "__main__":
+    demo.launch()