AINovice2005's picture
download
raw
1.74 kB
from dagster import AssetExecutionContext, MaterializeResult
from dagster_hf_datasets import hf_dataset_asset
from datasets import Dataset
@hf_dataset_asset(
    path="stanfordnlp/imdb",
    split="train",
    group_name="basic_hub_ingestion",
    io_manager_key="hf_parquet_io_manager",
)
def imdb_train(
    context: AssetExecutionContext,
    dataset: Dataset,
) -> MaterializeResult:
    """Expose the ``train`` split of ``stanfordnlp/imdb`` as a Dagster asset.

    The ``dataset`` argument is supplied by the ``hf_dataset_asset``
    decorator (presumably loaded from the Hugging Face Hub using the
    ``path``/``split`` given above -- confirm against dagster_hf_datasets).

    Args:
        context: Dagster execution context; used here only for logging.
        dataset: The dataset object to materialize.

    Returns:
        A ``MaterializeResult`` carrying the unmodified dataset as its value
        together with metadata entries ``rows``, ``columns``,
        ``source_dataset``, ``split`` and ``fingerprint``.
    """
    row_count = len(dataset)
    context.log.info("Loaded IMDb train split: %s rows", row_count)

    column_names = dataset.column_names
    context.log.info("Columns: %s", column_names)

    asset_metadata = {
        "rows": row_count,
        "columns": column_names,
        "source_dataset": "stanfordnlp/imdb",
        "split": "train",
        # Underscore-prefixed (non-public) attribute of the dataset object,
        # recorded verbatim.
        "fingerprint": dataset._fingerprint,
    }
    return MaterializeResult(value=dataset, metadata=asset_metadata)
@hf_dataset_asset(
    path="stanfordnlp/imdb",
    split="test",
    group_name="basic_hub_ingestion",
    io_manager_key="hf_parquet_io_manager",
)
def imdb_test(
    context: AssetExecutionContext,
    dataset: Dataset,
) -> MaterializeResult:
    """Expose the ``test`` split of ``stanfordnlp/imdb`` as a Dagster asset.

    The ``dataset`` argument is supplied by the ``hf_dataset_asset``
    decorator (presumably loaded from the Hugging Face Hub using the
    ``path``/``split`` given above -- confirm against dagster_hf_datasets).

    Args:
        context: Dagster execution context; used here only for logging.
        dataset: The dataset object to materialize.

    Returns:
        A ``MaterializeResult`` carrying the unmodified dataset as its value
        together with metadata entries ``rows``, ``columns``,
        ``source_dataset``, ``split`` and ``fingerprint``.
    """
    n_rows = len(dataset)
    context.log.info("Loaded IMDb test split: %s rows", n_rows)

    # Keyword order fixes the insertion order of the resulting metadata dict.
    split_metadata = dict(
        rows=n_rows,
        columns=dataset.column_names,
        source_dataset="stanfordnlp/imdb",
        split="test",
        # Underscore-prefixed (non-public) attribute of the dataset object,
        # recorded verbatim.
        fingerprint=dataset._fingerprint,
    )
    return MaterializeResult(value=dataset, metadata=split_metadata)

Xet Storage Details

Size:
1.74 kB
·
Xet hash:
d06862d7c51aecc1ed8964785550fd52de8b7af5d7e3d5aff0f8d90127df9895

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.