Buckets:
| from dagster import AssetExecutionContext, MaterializeResult | |
| from dagster_hf_datasets import hf_dataset_asset | |
| from datasets import Dataset | |
def imdb_train(
    context: AssetExecutionContext,
    dataset: Dataset,
) -> MaterializeResult:
    """Wrap the IMDb training split in a ``MaterializeResult``.

    Logs the row count and column names of ``dataset``, then returns the
    dataset as the result value together with metadata describing it:
    row count, column names, source dataset id, split name and
    fingerprint.

    NOTE(review): nothing in this body loads from the Hub or records a
    revision -- ``dataset`` arrives already loaded, presumably supplied by
    the ``hf_dataset_asset`` integration; confirm against the decorator,
    which is not visible here.
    """
    row_count = len(dataset)
    context.log.info("Loaded IMDb train split: %s rows", row_count)

    column_names = dataset.column_names
    context.log.info("Columns: %s", column_names)

    # Metadata attached to the materialization alongside the dataset value.
    metadata = {
        "rows": row_count,
        "columns": column_names,
        "source_dataset": "stanfordnlp/imdb",
        "split": "train",
        # `_fingerprint` is a private attribute of `Dataset`.
        "fingerprint": dataset._fingerprint,
    }
    return MaterializeResult(value=dataset, metadata=metadata)
def imdb_test(
    context: AssetExecutionContext,
    dataset: Dataset,
) -> MaterializeResult:
    """Wrap the IMDb test split in a ``MaterializeResult``.

    Logs the row count of ``dataset``, then returns the dataset as the
    result value together with metadata describing it: row count, column
    names, source dataset id, split name and fingerprint.

    NOTE(review): ``dataset`` arrives already loaded, presumably supplied
    by the ``hf_dataset_asset`` integration; confirm against the
    decorator, which is not visible here.
    """
    row_count = len(dataset)
    context.log.info("Loaded IMDb test split: %s rows", row_count)

    # Metadata attached to the materialization alongside the dataset value.
    metadata = {
        "rows": row_count,
        "columns": dataset.column_names,
        "source_dataset": "stanfordnlp/imdb",
        "split": "test",
        # `_fingerprint` is a private attribute of `Dataset`.
        "fingerprint": dataset._fingerprint,
    }
    return MaterializeResult(value=dataset, metadata=metadata)
Xet Storage Details
- Size: 1.74 kB
- Xet hash: d06862d7c51aecc1ed8964785550fd52de8b7af5d7e3d5aff0f8d90127df9895
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.