Spaces:

minishlab
/

semantic-deduplication

Running

App Files Files Community

Pringled committed on Oct 12, 2024

Commit

1744dee

1 Parent(s): ed5b7bd

Updates

Browse files

Files changed (1) hide show

app.py +42 -8

app.py CHANGED Viewed

@@ -201,7 +201,7 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
     deduplication_type = gr.Radio(
         choices=["Single dataset", "Cross-dataset"],
         label="Deduplication Type",
-        value="Single dataset",
     )
     with gr.Row():
@@ -209,7 +209,7 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
         dataset1_split = gr.Textbox(value=default_dataset_split, label="Dataset 1 Split")
         dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
-    dataset2_inputs = gr.Column(visible=False)
     with dataset2_inputs:
         gr.Markdown("### Dataset 2")
         with gr.Row():
@@ -245,8 +245,6 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
 demo.launch()
 # import gradio as gr
 # from datasets import load_dataset
 # import numpy as np
@@ -270,7 +268,16 @@ demo.launch()
 #     batch_size: int = 1024,
 #     progress=None
 # ) -> tuple[np.ndarray, dict[int, int]]:
-#     """Deduplicate embeddings within one dataset or across two datasets."""
 #     if embeddings_b is None:
 #         reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
 #         duplicate_to_original = {}
@@ -298,13 +305,27 @@ demo.launch()
 #         return duplicate_indices_in_b, duplicate_to_original
 # def display_word_differences(x: str, y: str) -> str:
-#     """Display word-level differences between two texts, avoiding Markdown issues."""
 #     diff = ndiff(x.split(), y.split())
 #     formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
 #     return f"```\n{formatted_diff}\n```"
 # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
-#     """Load texts from a specified dataset and split."""
 #     ds = load_dataset(dataset_name, split=dataset_split)
 #     return [example[text_column] for example in ds]
@@ -319,7 +340,20 @@ demo.launch()
 #     threshold: float = default_threshold,
 #     progress: gr.Progress = gr.Progress(track_tqdm=True)
 # ):
-#     """Perform deduplication on one or two datasets."""
 #     try:
 #         threshold = float(threshold)

     deduplication_type = gr.Radio(
         choices=["Single dataset", "Cross-dataset"],
         label="Deduplication Type",
+        value="Cross-dataset",  # Set "Cross-dataset" as the default value
     )
     with gr.Row():
         dataset1_split = gr.Textbox(value=default_dataset_split, label="Dataset 1 Split")
         dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
+    dataset2_inputs = gr.Column(visible=True)  # Make dataset2_inputs visible by default
     with dataset2_inputs:
         gr.Markdown("### Dataset 2")
         with gr.Row():
 demo.launch()
 # import gradio as gr
 # from datasets import load_dataset
 # import numpy as np
 #     batch_size: int = 1024,
 #     progress=None
 # ) -> tuple[np.ndarray, dict[int, int]]:
+#     """
+#     Deduplicate embeddings within one dataset or across two datasets.
+#     :param embeddings_a: Embeddings of Dataset 1.
+#     :param embeddings_b: Optional, embeddings of Dataset 2.
+#     :param threshold: Similarity threshold for deduplication.
+#     :param batch_size: Batch size for similarity computation.
+#     :param progress: Gradio progress tracker for feedback.
+#     :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
+#     """
 #     if embeddings_b is None:
 #         reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
 #         duplicate_to_original = {}
 #         return duplicate_indices_in_b, duplicate_to_original
 # def display_word_differences(x: str, y: str) -> str:
+#     """
+#     Display the word-level differences between two texts, formatted to avoid
+#     misinterpretation of Markdown syntax.
+#     :param x: First text.
+#     :param y: Second text.
+#     :return: A string showing word-level differences, wrapped in a code block.
+#     """
 #     diff = ndiff(x.split(), y.split())
 #     formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
 #     return f"```\n{formatted_diff}\n```"
 # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
+#     """
+#     Load texts from a specified dataset and split.
+#     :param dataset_name: Name of the dataset.
+#     :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
+#     :param text_column: Name of the text column.
+#     :return: A list of texts from the dataset.
+#     """
 #     ds = load_dataset(dataset_name, split=dataset_split)
 #     return [example[text_column] for example in ds]
 #     threshold: float = default_threshold,
 #     progress: gr.Progress = gr.Progress(track_tqdm=True)
 # ):
+#     """
+#     Perform deduplication on one or two datasets based on the deduplication type.
+#     :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
+#     :param dataset1_name: Name of the first dataset.
+#     :param dataset1_split: Split of the first dataset.
+#     :param dataset1_text_column: Text column of the first dataset.
+#     :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
+#     :param dataset2_split: Optional, split of the second dataset.
+#     :param dataset2_text_column: Optional, text column of the second dataset.
+#     :param threshold: Similarity threshold for deduplication.
+#     :param progress: Gradio progress tracker.
+#     :return: Status updates and result text for the Gradio interface.
+#     """
 #     try:
 #         threshold = float(threshold)