| """ |
| This script is used to curate the data for the project. |
| |
| Implement your functions to to clean the data and prepare it for model training. |
| |
| Note: the competition requires that you use FiftyOne for data curation and you are only allowed to |
| use the approaved dataset from the hub, Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set, which can |
| be found here: https://huggingface.co/datasets/Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set |
| """ |
|
|
| import fiftyone as fo |
| import fiftyone.utils.huggingface as fouh |
|
|
| |
|
|
| def shuffle_data(dataset): |
| """Shuffle the dataset""" |
| return dataset.shuffle(seed=51) |
|
|
| def take_random_sample(dataset): |
| """Take a sample from the dataset""" |
| return dataset.take(size=10,seed=51) |
|
|
| def prepare_dataset(name): |
| """ |
| Prepare the dataset for model training. |
| |
| Args: |
| name (str): The name of the dataset to load. Must be "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set". |
| |
| Returns: |
| fiftyone.core.dataset.Dataset: The curated dataset. |
| |
| Raises: |
| ValueError: If the provided dataset name is not the approved one. |
| |
| Note: |
| The following code block MUST NOT be removed from your submission: |
| |
| APPROVED_DATASET = "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set" |
| |
| if name != APPROVED_DATASET: |
| raise ValueError(f"Only the approved dataset '{APPROVED_DATASET}' is allowed for this competition.") |
| |
| This ensures that only the approved dataset is used for the competition. |
| """ |
| APPROVED_DATASET = "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set" |
| Vox |
| if name != APPROVED_DATASET: |
| raise ValueError(f"Only the approved dataset '{APPROVED_DATASET}' is allowed for this competition.") |
| |
| |
| dataset = fouh.load_from_hub(name, split="train") |
| |
| |
| dataset = shuffle_data(dataset) |
| dataset = take_random_sample(dataset) |
| |
| |
| curated_dataset = dataset.clone() |
| return curated_dataset |