from datasets import Dataset, DownloadConfig, concatenate_datasets, load_dataset
from huggingface_hub import login
import gc
import os
def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
    """
    Remove rows from new_dataset whose unique key already exists in original_dataset.

    Args:
        original_dataset: The original dataset (e.g., dataset['train']).
        new_dataset: The new dataset to be added.
        unique_key: The column name that uniquely identifies each entry.
        batch_size: The batch size used when iterating over large datasets.

    Returns:
        A new dataset with duplicates removed.
    """
    # Extract unique keys from the original dataset in batches to save memory
    original_ids = set()
    for batch in original_dataset.iter(batch_size=batch_size):
        original_ids.update(batch[unique_key])

    # With batched=True the filter callback receives a dict of lists and must
    # return one boolean per row, not a single boolean
    def filter_function(batch):
        return [key not in original_ids for key in batch[unique_key]]

    deduplicated_new_dataset = new_dataset.filter(filter_function, batched=True, batch_size=batch_size)
    del original_ids
    return deduplicated_new_dataset
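
# Hypothetical usage sketch (not part of the original Space): two tiny
# in-memory datasets showing that rows whose "text" already exists in the
# original split are filtered out.
# existing = Dataset.from_dict({"text": ["a", "b"]})
# incoming = Dataset.from_dict({"text": ["b", "c"]})
# deduped = remove_duplicates(existing, incoming)  # keeps only the "c" row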

def update_db_hub(texts, topics, dates):
    api_token = os.getenv("hf_key")
    login(token=api_token)
    dataset_name = "Danielrahmai1991/row_data"  # replace with your own dataset repository

    new_rows = {
        'text': texts,
        "topic": topics,
        "date": dates
    }
    new_dataset = Dataset.from_dict(new_rows)

    try:
        # Load the existing dataset; the token is required because the repo is private
        dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
        # deduplicated_new_dataset = remove_duplicates(
        #     dataset['train'],
        #     new_dataset,
        #     unique_key="text",
        #     batch_size=1000  # adjust batch size based on available memory
        # )
        updated_dataset = concatenate_datasets([dataset['train'], new_dataset])
    except Exception as e:
        # If the dataset can't be loaded (e.g., it doesn't exist yet), start from the new rows
        updated_dataset = new_dataset
        print(f"Failed to load dataset: {e}")

    gc.collect()
    print("updated_dataset", updated_dataset)

    # Push the updated dataset back to the Hub
    try:
        updated_dataset.push_to_hub(dataset_name, private=True)  # set private=False for a public repo
        print(f"Updated dataset pushed to the Hugging Face Hub: {dataset_name}")
    except Exception as e:
        print(f"Failed to push dataset: {e}")