| | import os |
| | import datasets |
| | import pandas as pd |
| | from datetime import datetime |
| |
|
| | from config import BACKUP_FOLDER, HF_DATASET_REPO_ID, HF_TOKEN, RESULTS_CSV_FILE, CSV_HEADERS |
| |
|
def main():
    """
    Back up the preference dataset from the Hugging Face Hub, then reset it.

    Steps:
      1. Load the current results CSV from the Hub dataset repo; if loading
         fails (file absent, token problems), fall back to an empty frame so
         the backup/reset still proceeds.
      2. Write a timestamped CSV backup of whatever was loaded to
         BACKUP_FOLDER.
      3. Push an empty CSV with the same CSV_HEADERS columns back to the Hub,
         replacing RESULTS_CSV_FILE in place.
    """
    print(f"Attempting to load dataset '{HF_DATASET_REPO_ID}' from Hugging Face Hub (file: {RESULTS_CSV_FILE})...")
    try:
        dataset = datasets.load_dataset(HF_DATASET_REPO_ID, data_files=RESULTS_CSV_FILE, token=HF_TOKEN, split='train')
        print(f"Successfully loaded dataset. It has {len(dataset)} entries.")
        dataset_df = dataset.to_pandas()
    except Exception as e:
        # Best-effort: the remote file may simply not exist yet (first run),
        # so continue with an empty frame rather than aborting the reset.
        print(f"Error loading dataset from Hugging Face Hub: {e}")
        print("This could be due to the dataset/file not existing, or token issues.")
        print("Attempting to proceed by creating an empty structure for backup and remote reset.")
        dataset_df = pd.DataFrame(columns=CSV_HEADERS)

    # Ensure the local backup destination exists before writing into it.
    if not os.path.exists(BACKUP_FOLDER):
        os.makedirs(BACKUP_FOLDER)
        print(f"Created backup folder: {BACKUP_FOLDER}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_filename = f"preferences_backup_{timestamp}.csv"
    backup_filepath = os.path.join(BACKUP_FOLDER, backup_filename)

    try:
        dataset_df.to_csv(backup_filepath, index=False)
        print(f"Successfully backed up current preferences (or empty structure) to: {backup_filepath}")
    except Exception as e:
        # A failed backup is reported but does not stop the remote reset.
        print(f"Error saving backup to {backup_filepath}: {e}")

    print(f"Creating an empty dataset structure using predefined CSV_HEADERS: {CSV_HEADERS}")
    empty_df = pd.DataFrame(columns=CSV_HEADERS)

    print(f"Attempting to push the empty dataset to '{HF_DATASET_REPO_ID}' (file: {RESULTS_CSV_FILE}) on Hugging Face Hub...")
    # The temp path is assigned BEFORE the try so the cleanup in `finally`
    # can never hit a NameError, no matter where the try body fails.
    temp_empty_csv_path = "_temp_empty_prefs.csv"
    try:
        # Write the empty CSV locally, then upload the file verbatim so the
        # remote file keeps the exact header-only CSV layout.
        empty_df.to_csv(temp_empty_csv_path, index=False)

        from huggingface_hub import HfApi
        # Prefer an explicit HF_HUB_TOKEN env var; fall back to the config token.
        api = HfApi(token=os.getenv("HF_HUB_TOKEN", HF_TOKEN))

        api.upload_file(
            path_or_fileobj=temp_empty_csv_path,
            path_in_repo=RESULTS_CSV_FILE,
            repo_id=HF_DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Reset {RESULTS_CSV_FILE} to empty by script"
        )

        print(f"Successfully pushed empty dataset to replace {RESULTS_CSV_FILE} in Hugging Face Hub: {HF_DATASET_REPO_ID}")
        print("The remote dataset CSV should now be empty but retain its structure based on CSV_HEADERS.")
        print(f"IMPORTANT: The old data (if any) is backed up at {backup_filepath}")
    except Exception as e:
        print(f"Error pushing empty dataset to Hugging Face Hub: {e}")
        print("The remote dataset might not have been cleared. Please check the Hugging Face Hub.")
    finally:
        # Always remove the temp file, even when the upload fails partway.
        if os.path.exists(temp_empty_csv_path):
            os.remove(temp_empty_csv_path)
| |
|
| |
|
# Script entry point: run the backup-and-reset flow when executed directly.
if __name__ == "__main__":
    main()