Mastering HuggingFace Datasets — A Comprehensive Guide (Full Tutorial)
Author: AYI-NEDJIMI | AI & Cybersecurity Consultant
This tutorial covers everything you need to know about HuggingFace datasets: browsing, loading, streaming, transforming, creating, uploading, versioning, and preprocessing for model training.
1. Browse and Search Datasets
The Hub hosts over 100,000 datasets covering all AI domains.
1.1 Hub Navigation
Visit huggingface.co/datasets to explore:
- By task: text-classification, question-answering, image-classification
- By language: English, French, multilingual
- By size: from small datasets to terabytes
- By license: MIT, Apache 2.0, CC-BY
1.2 Programmatic Search
from huggingface_hub import HfApi

api = HfApi()

# Most downloaded datasets on the Hub
datasets = api.list_datasets(sort="downloads", direction=-1, limit=10)
for ds in datasets:
    print(f"{ds.id:40s} | {ds.downloads:>12,} DL")

# Search by language tag
en_datasets = api.list_datasets(language="en", sort="downloads", direction=-1, limit=10)
for ds in en_datasets:
    print(f" {ds.id}")

# Free-text search over dataset names/descriptions
cybersec_ds = api.list_datasets(search="cybersecurity", limit=10)
for ds in cybersec_ds:
    print(f" {ds.id}")

# Detailed information for one dataset repo
info = api.dataset_info("squad_v2")
print(f"Dataset: {info.id}")
print(f"Downloads: {info.downloads:,}")
print(f"Likes: {info.likes}")
2. Load Datasets with the datasets Library
2.1 Standard Loading
from datasets import load_dataset

# Download and cache the full IMDB dataset (every split).
dataset = load_dataset("imdb")
print(dataset)
# DatasetDict({
#   train: Dataset({features: ['text', 'label'], num_rows: 25000}),
#   test: Dataset({features: ['text', 'label'], num_rows: 25000})
# })

# Pick one split out of the DatasetDict.
train_split = dataset["train"]
print(f"Number of examples: {len(train_split)}")
print(f"Columns: {train_split.column_names}")
print(f"First example: {train_split[0]}")

# Ask for a single split directly.
train_only = load_dataset("imdb", split="train")

# Slice syntax: an absolute range of rows...
first_100 = load_dataset("imdb", split="train[:100]")
print(f"First 100 examples: {len(first_100)}")

# ...or a percentage of the split.
one_tenth = load_dataset("imdb", split="train[:10%]")
print(f"10% of train: {len(one_tenth)}")
2.2 Streaming Loading
Streaming is essential for large datasets:
from datasets import load_dataset

# Streaming returns an IterableDataset: nothing is downloaded up front,
# examples are fetched lazily as you iterate.
dataset = load_dataset("wikipedia", "20220301.en", streaming=True)

# Iterate over the first few examples
for i, example in enumerate(dataset['train']):
    print(f"Article: {example['title']}")
    print(f"Text: {example['text'][:200]}...")
    if i >= 2:
        break

# Lazy transformations compose with streaming: filter() is applied on the fly.
streamed = load_dataset("imdb", split="train", streaming=True)
filtered = streamed.filter(lambda x: len(x['text']) > 500)
for i, example in enumerate(filtered):
    print(f" Long review ({len(example['text'])} chars): {example['text'][:100]}...")
    if i >= 2:
        break
2.3 Load from Different Formats
from datasets import load_dataset
# Local files load through generic builders ("csv", "json", "parquet", ...).
# From a CSV file
# dataset = load_dataset("csv", data_files="data.csv")
# From a JSON/JSONL file (one JSON object per line)
# dataset = load_dataset("json", data_files="data.jsonl")
# From a Parquet file
# dataset = load_dataset("parquet", data_files="data.parquet")
# From multiple files, each mapped to a named split
# dataset = load_dataset("csv", data_files={"train": "train.csv", "test": "test.csv"})
# From an image folder (labels inferred from sub-directory names)
# dataset = load_dataset("imagefolder", data_dir="./images")
# From an audio folder (same layout convention as imagefolder)
# dataset = load_dataset("audiofolder", data_dir="./audio")
3. Filter, Map, Select, Shuffle
3.1 Filter
from datasets import load_dataset

dataset = load_dataset("imdb", split="train")


def is_positive(example):
    """Predicate: keep only reviews labelled 1 (positive)."""
    return example["label"] == 1


# Filter positive reviews
positive = dataset.filter(is_positive)
print(f"Positive reviews: {len(positive)}")

# Several conditions can be combined in a single predicate.
long_positive = dataset.filter(lambda x: len(x["text"]) > 1000 and x["label"] == 1)
print(f"Long positive reviews: {len(long_positive)}")

# Batched filtering returns one boolean per row and runs much faster.
positive_batch = dataset.filter(lambda batch: [label == 1 for label in batch["label"]], batched=True)
print(f"Positive (batch): {len(positive_batch)}")
3.2 Map (Transform)
# Add a derived column: map() calls the function once per example and
# merges the returned dict back into the row.
def add_length(example):
    """Attach a 'text_length' column holding len(example['text'])."""
    example['text_length'] = len(example['text'])
    return example
# Apply row-by-row; the result is a new (cached) dataset with the extra column.
dataset_with_length = dataset.map(add_length)
print(dataset_with_length[0]['text_length'])
# Batch transformation (much faster): with batched=True the function
# receives whole columns as lists instead of single values.
def tokenize_batch(batch):
    """Add a 'num_words' column: whitespace word count of each text."""
    batch['num_words'] = [len(text.split()) for text in batch['text']]
    return batch
# batched=True hands the function slices of up to 1000 rows at a time.
dataset_processed = dataset.map(tokenize_batch, batched=True, batch_size=1000)
print(f"Words in first example: {dataset_processed[0]['num_words']}")
# With a HuggingFace tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of texts to fixed-length (512) BERT inputs."""
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)


# remove_columns drops the raw text so only model inputs remain.
tokenized = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
print(f"Columns after tokenization: {tokenized.column_names}")
3.3 Select and Shuffle
# select() keeps only the rows at the given indices.
subset = dataset.select(range(100))
print(f"Subset: {len(subset)} examples")

# Draw 500 distinct random row indices, then select them.
import random

sample_size = 500
indices = random.sample(range(len(dataset)), sample_size)
random_subset = dataset.select(indices)

# A fixed seed makes the shuffle reproducible.
shuffled = dataset.shuffle(seed=42)

# Sort rows by the 'label' column.
sorted_ds = dataset.sort("label")
3.4 Rename and Remove Columns
# rename_column returns a new dataset; the original is untouched.
renamed = dataset.rename_column("label", "sentiment")
print(f"Columns: {renamed.column_names}")
# remove_columns takes a list and drops every column named in it.
no_text = dataset.remove_columns(["text"])
print(f"Remaining columns: {no_text.column_names}")
4. Process Large Datasets (Memory-Efficient)
4.1 Arrow Format
HuggingFace datasets use Apache Arrow internally, enabling:
- Memory-mapping: data stays on disk
- Zero-copy: no duplication in memory
- Fast access: O(1) indexing
from datasets import load_dataset

# Thanks to Arrow memory-mapping, loading does NOT read the data into RAM.
dataset = load_dataset("imdb", split="train")
# dataset_size is the size of the underlying Arrow data, not RAM usage,
# so the old "Size in memory" label was misleading.
print(f"Arrow data size: {dataset.dataset_size / 1e6:.1f} MB")

# Automatic caching: map() results are written to disk and reused on re-runs.
dataset_processed = dataset.map(lambda x: {"upper": x["text"].upper()})
4.2 Batch Processing
# Batch processing with num_proc for parallelism
def heavy_processing(batch):
    """Add a 'processed' column: lower-cased, whitespace-stripped texts."""
    batch['processed'] = [text.lower().strip() for text in batch['text']]
    return batch
# Use multiple worker processes to run map() in parallel over shards.
processed = dataset.map(
    heavy_processing,
    batched=True,
    batch_size=1000,
    num_proc=4,  # 4 parallel processes
)
4.3 Streaming for Very Large Datasets
from datasets import load_dataset
# The Pile (800 GB) with streaming
# pile = load_dataset("EleutherAI/pile", streaming=True)
# take(N) stops the stream after N examples
# samples = pile['train'].take(1000)
# Streaming shuffle keeps a reservoir of buffer_size examples in memory
# shuffled_stream = pile['train'].shuffle(seed=42, buffer_size=10000)
5. Create Your Own Dataset
5.1 From a Python Dictionary
from datasets import Dataset, DatasetDict

# Build a small in-memory dataset: one key per column, one list per key.
texts = [
    "Alert: phishing attempt detected",
    "Security update installed successfully",
    "Suspicious connection from unknown IP",
    "Firewall configured correctly",
]
data = {
    "text": texts,
    "label": [1, 0, 1, 0],  # 1=threat, 0=normal
    "category": ["phishing", "update", "intrusion", "firewall"],
}
dataset = Dataset.from_dict(data)
print(dataset)

# Repeat the rows to get 100 examples, then carve out a held-out test set.
full_data = Dataset.from_dict({
    "text": data["text"] * 25,
    "label": data["label"] * 25,
})
ds_dict = full_data.train_test_split(test_size=0.2, seed=42)
print(ds_dict)
5.2 From a JSONL File
import json

# Write a small JSONL corpus (one JSON object per line).
records = [
    {"text": "SQL injection detected in form", "label": "attack", "severity": "high"},
    {"text": "SSL certificate renewed", "label": "maintenance", "severity": "low"},
    {"text": "DDoS ongoing on main server", "label": "attack", "severity": "critical"},
]
# Explicit encoding avoids platform-dependent defaults.
with open("cybersec_data.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")

# Load it back through the generic "json" builder.
from datasets import load_dataset

dataset = load_dataset("json", data_files="cybersec_data.jsonl")
print(dataset)
5.3 From a CSV File
import csv

# Write a header plus two data rows; newline='' is required by the csv module,
# and an explicit encoding avoids platform-dependent defaults.
with open("cybersec_data.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label", "severity"])
    writer.writerow(["Brute force attack detected", "attack", "high"])
    writer.writerow(["System update completed", "maintenance", "low"])

dataset = load_dataset("csv", data_files="cybersec_data.csv")
print(dataset)
5.4 From a Pandas DataFrame
import pandas as pd
from datasets import Dataset

# Datasets convert directly from pandas; column dtypes become features.
frame = pd.DataFrame(
    {
        "text": ["Malware detected", "Firewall updated", "Unauthorized access attempt"],
        "label": [1, 0, 1],
        "timestamp": ["2026-01-15", "2026-01-16", "2026-01-17"],
    }
)
dataset = Dataset.from_pandas(frame)
print(dataset)
6. Dataset Cards and Metadata
A good dataset card improves discoverability and usability:
---
language:
- en
- fr
license: mit
size_categories:
- 1K<n<10K
task_categories:
- text-classification
tags:
- cybersecurity
- threat-detection
pretty_name: CyberSec Threat Dataset
---
# CyberSec Threat Dataset
## Dataset Description
Dataset for cybersecurity threat classification.
## Dataset Structure
### Data Fields
- `text`: alert text (string)
- `label`: category (string: attack, maintenance, normal)
- `severity`: severity (string: low, medium, high, critical)
### Data Splits
| Split | Examples |
|-------|----------|
| train | 8,000 |
| test | 2,000 |
7. Upload and Version Datasets
7.1 Push to Hub
import os

from datasets import Dataset
from huggingface_hub import HfApi

# Method 1: push_to_hub
dataset = Dataset.from_dict({"text": ["example 1", "example 2"], "label": [0, 1]})
# dataset.push_to_hub("my-username/my-dataset")

# Method 2: with revision (versioning)
# dataset.push_to_hub("my-username/my-dataset", revision="v1.0")

# Method 3: upload files
# SECURITY: never hard-code API tokens in source code; read them from the
# environment (or rely on `huggingface-cli login`'s stored credentials).
api = HfApi(token=os.environ.get("HF_TOKEN"))
# api.upload_file(
#     path_or_fileobj="data.parquet",
#     path_in_repo="data/train-00000-of-00001.parquet",
#     repo_id="my-username/my-dataset",
#     repo_type="dataset"
# )
8. Dataset Viewer Features
The HuggingFace Dataset Viewer offers:
- Visual preview of data directly on the Hub
- Filtering by column and value
- Statistics on distributions
- Search within data
- API to access data without downloading
import requests

# Dataset Viewer API: fetch rows over HTTP without downloading the dataset.
url = "https://datasets-server.huggingface.co/rows"
params = {
    "dataset": "imdb",
    "config": "plain_text",
    "split": "train",
    "offset": 0,
    "length": 5
}
response = requests.get(url, params=params)
response.raise_for_status()  # fail loudly on HTTP errors instead of a confusing KeyError below
data = response.json()
for row in data['rows']:
    print(f" Label: {row['row']['label']} | Text: {row['row']['text'][:80]}...")
9. Data Preprocessing for Training
9.1 Tokenization for Text Models
from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("imdb", split="train[:1000]")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize(examples):
    """Tokenize a batch of reviews to fixed 256-token BERT inputs."""
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )


tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])
tokenized.set_format("torch")  # Convert columns to PyTorch tensors
print(f"Columns: {tokenized.column_names}")
print(f"Shape input_ids: {tokenized[0]['input_ids'].shape}")
9.2 Instruction Format for Fine-tuning
from datasets import Dataset

# Instruction format expected by SFTTrainer: parallel lists of
# instruction / input / output strings, one entry per training example.
instructions = [
    "Classify this security alert",
    "Summarize this incident report",
]
model_inputs = [
    "Failed login attempt 5 times from IP 192.168.1.100",
    "On January 15, a DDoS attack targeted our servers for 3 hours...",
]
model_outputs = [
    "Classification: Brute Force Attack - Severity: Medium",
    "Summary: 3-hour DDoS attack on 01/15, moderate impact, CDN mitigation.",
]
instruction_data = {
    "instruction": instructions,
    "input": model_inputs,
    "output": model_outputs,
}
dataset = Dataset.from_dict(instruction_data)
# Format for chat template
def format_instruction(example):
    """Collapse instruction/input/output into one Alpaca-style prompt string."""
    return {
        "text": f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"
    }
# map() adds the formatted 'text' column alongside the original columns.
formatted = dataset.map(format_instruction)
print(formatted[0]['text'])
9.3 Data Augmentation
def augment_text(examples):
    """Double a batch by appending a lower-cased copy of every text.

    Designed for dataset.map(..., batched=True): returning longer lists
    than the input grows the dataset.
    """
    augmented_texts = []
    augmented_labels = []
    for text, label in zip(examples['text'], examples['label']):
        # Keep the original example...
        augmented_texts.append(text)
        augmented_labels.append(label)
        # ...and add a lower-cased variant with the same label.
        augmented_texts.append(text.lower())
        augmented_labels.append(label)
    return {"text": augmented_texts, "label": augmented_labels}
Conclusion
The HuggingFace datasets library is a powerful and flexible tool for managing your training data. From streaming loading to advanced preprocessing, it simplifies every step of the data pipeline.
Explore our CyberSec AI collection: CyberSec AI Portfolio
Tutorial written by AYI-NEDJIMI - AI & Cybersecurity Consultant