Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -4,6 +4,26 @@ import os
|
|
| 4 |
from datasets import load_dataset
|
| 5 |
from datasets import DownloadConfig
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
def update_db_hub(texts, topics, dates):
|
| 9 |
api_token = os.getenv("hf_key")
|
|
@@ -20,11 +40,12 @@ def update_db_hub(texts, topics, dates):
|
|
| 20 |
|
| 21 |
try:
|
| 22 |
# Load the dataset (use_auth_token=True if it's private)
|
| 23 |
-
|
| 24 |
# print("Dataset loaded successfully!", dataset)
|
| 25 |
# print(dataset)
|
| 26 |
-
#
|
| 27 |
-
updated_dataset = new_dataset
|
|
|
|
| 28 |
except Exception as e:
|
| 29 |
updated_dataset = new_dataset
|
| 30 |
print(f"Failed to load dataset: {e}")
|
|
|
|
| 4 |
from datasets import load_dataset
|
| 5 |
from datasets import DownloadConfig
|
| 6 |
|
| 7 |
+
def remove_duplicates(original_dataset: Dataset, new_dataset: Dataset, unique_key: str = "text") -> Dataset:
    """
    Drop rows from new_dataset whose unique key already appears in original_dataset.

    Args:
        original_dataset (Dataset): The dataset already on the hub (e.g. dataset['train']).
        new_dataset (Dataset): The incoming dataset whose rows may duplicate existing ones.
        unique_key (str): Column name that uniquely identifies each row.

    Returns:
        Dataset: new_dataset restricted to rows not present in original_dataset.
    """
    # Materialize the existing key column as a set so each membership
    # test inside the filter callback is O(1) instead of O(n).
    seen_keys = set(original_dataset[unique_key])

    # Keep only rows whose key value has not been seen before.
    return new_dataset.filter(lambda row: row[unique_key] not in seen_keys)
|
| 26 |
+
|
| 27 |
|
| 28 |
def update_db_hub(texts, topics, dates):
|
| 29 |
api_token = os.getenv("hf_key")
|
|
|
|
| 40 |
|
| 41 |
try:
|
| 42 |
# Load the dataset (use_auth_token=True if it's private)
|
| 43 |
+
dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
|
| 44 |
# print("Dataset loaded successfully!", dataset)
|
| 45 |
# print(dataset)
|
| 46 |
+
# deduplicated_new_dataset = remove_duplicates(dataset['train'], new_dataset, unique_key="text")
|
| 47 |
+
updated_dataset = concatenate_datasets([dataset['train'], new_dataset])
|
| 48 |
+
# updated_dataset = new_dataset
|
| 49 |
except Exception as e:
|
| 50 |
updated_dataset = new_dataset
|
| 51 |
print(f"Failed to load dataset: {e}")
|