Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -4,6 +4,26 @@ import os
|
|
| 4 |
from datasets import load_dataset
|
| 5 |
from datasets import DownloadConfig
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
def update_db_hub(texts, topics, dates):
|
| 9 |
api_token = os.getenv("hf_key")
|
|
@@ -20,11 +40,12 @@ def update_db_hub(texts, topics, dates):
|
|
| 20 |
|
| 21 |
try:
|
| 22 |
# Load the dataset (use_auth_token=True if it's private)
|
| 23 |
-
|
| 24 |
# print("Dataset loaded successfully!", dataset)
|
| 25 |
# print(dataset)
|
| 26 |
-
#
|
| 27 |
-
updated_dataset = new_dataset
|
|
|
|
| 28 |
except Exception as e:
|
| 29 |
updated_dataset = new_dataset
|
| 30 |
print(f"Failed to load dataset: {e}")
|
|
|
|
| 4 |
from datasets import load_dataset
|
| 5 |
from datasets import DownloadConfig
|
| 6 |
|
| 7 |
+
def remove_duplicates(original_dataset: Dataset, new_dataset: Dataset, unique_key: str = "text") -> Dataset:
    """
    Drop rows from new_dataset whose unique key already appears in original_dataset.

    Args:
        original_dataset (Dataset): The dataset already on the hub (e.g. dataset['train']).
        new_dataset (Dataset): The incoming dataset whose rows may duplicate existing ones.
        unique_key (str): Column name that uniquely identifies each row.

    Returns:
        Dataset: new_dataset restricted to rows not present in original_dataset.
    """
    # Materialize the existing key column as a set so each membership
    # test inside the filter callback is O(1) instead of O(n).
    seen_keys = set(original_dataset[unique_key])

    # Keep only rows whose key value has not been seen before.
    return new_dataset.filter(lambda row: row[unique_key] not in seen_keys)
|
| 26 |
+
|
| 27 |
|
| 28 |
def update_db_hub(texts, topics, dates):
|
| 29 |
api_token = os.getenv("hf_key")
|
|
|
|
| 40 |
|
| 41 |
try:
|
| 42 |
# Load the dataset (use_auth_token=True if it's private)
|
| 43 |
+
dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
|
| 44 |
# print("Dataset loaded successfully!", dataset)
|
| 45 |
# print(dataset)
|
| 46 |
+
# deduplicated_new_dataset = remove_duplicates(dataset['train'], new_dataset, unique_key="text")
|
| 47 |
+
updated_dataset = concatenate_datasets([dataset['train'], new_dataset])
|
| 48 |
+
# updated_dataset = new_dataset
|
| 49 |
except Exception as e:
|
| 50 |
updated_dataset = new_dataset
|
| 51 |
print(f"Failed to load dataset: {e}")
|