Mastering HuggingFace Datasets - Comprehensive Guide (Full Tutorial)

#1
by AYI-NEDJIMI - opened

Mastering HuggingFace Datasets - Comprehensive Guide

Author: AYI-NEDJIMI | AI & Cybersecurity Consultant

This tutorial covers everything you need to know about HuggingFace datasets: browsing, loading, streaming, transforming, creating, uploading, versioning, and preprocessing for model training.


1. Browse and Search Datasets

The Hub hosts over 100,000 datasets covering all AI domains.

1.1 Hub Navigation

Visit huggingface.co/datasets to explore:

  • By task: text-classification, question-answering, image-classification
  • By language: English, French, multilingual
  • By size: from small datasets to terabytes
  • By license: MIT, Apache 2.0, CC-BY

1.2 Programmatic Search

from huggingface_hub import HfApi

api = HfApi()

# Top datasets ranked by download count
for ds in api.list_datasets(sort="downloads", direction=-1, limit=10):
    print(f"{ds.id:40s} | {ds.downloads:>12,} DL")

# Restrict results to a single language tag
for ds in api.list_datasets(language="en", sort="downloads", direction=-1, limit=10):
    print(f"  {ds.id}")

# Free-text search over dataset names and descriptions
for ds in api.list_datasets(search="cybersecurity", limit=10):
    print(f"  {ds.id}")

# Full metadata for a single repository
info = api.dataset_info("squad_v2")
print(f"Dataset: {info.id}")
print(f"Downloads: {info.downloads:,}")
print(f"Likes: {info.likes}")

2. Load Datasets with the datasets Library

2.1 Standard Loading

from datasets import load_dataset

# Fetch every available split at once (returns a DatasetDict)
dataset = load_dataset("imdb")
print(dataset)
# DatasetDict({
#     train: Dataset({features: ['text', 'label'], num_rows: 25000}),
#     test: Dataset({features: ['text', 'label'], num_rows: 25000})
# })

# Index the DatasetDict by split name to get a plain Dataset
train_data = dataset['train']
print(f"Number of examples: {len(train_data)}")
print(f"Columns: {train_data.column_names}")
print(f"First example: {train_data[0]}")

# Or ask for a single split up front
train_only = load_dataset("imdb", split="train")

# Split slicing: absolute row counts...
small = load_dataset("imdb", split="train[:100]")
print(f"First 100 examples: {len(small)}")

# ...or percentages of the split
ten_percent = load_dataset("imdb", split="train[:10%]")
print(f"10% of train: {len(ten_percent)}")

2.2 Streaming Loading

Streaming is essential for large datasets:

from datasets import load_dataset

# Streaming: iterate over examples without downloading the whole dataset.
# NOTE: the legacy "wikipedia" script dataset is deprecated (loading scripts
# now require trust_remote_code); use the parquet-backed "wikimedia/wikipedia"
# repository and one of its dated configs instead.
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", streaming=True)

# Iterate over examples lazily
for i, example in enumerate(dataset['train']):
    print(f"Article: {example['title']}")
    print(f"Text: {example['text'][:200]}...")
    if i >= 2:
        break

# Transformations compose lazily with streaming
streamed = load_dataset("imdb", split="train", streaming=True)
filtered = streamed.filter(lambda x: len(x['text']) > 500)
for i, example in enumerate(filtered):
    print(f"  Long review ({len(example['text'])} chars): {example['text'][:100]}...")
    if i >= 2:
        break

2.3 Load from Different Formats

from datasets import load_dataset

# Generic "builder" loaders: the first argument names the file format and
# data_files / data_dir point at local (or remote) files.

# From a CSV file
# dataset = load_dataset("csv", data_files="data.csv")

# From a JSON/JSONL file
# dataset = load_dataset("json", data_files="data.jsonl")

# From a Parquet file
# dataset = load_dataset("parquet", data_files="data.parquet")

# From multiple files (dict keys become split names)
# dataset = load_dataset("csv", data_files={"train": "train.csv", "test": "test.csv"})

# From an image folder (class labels inferred from sub-directory names)
# dataset = load_dataset("imagefolder", data_dir="./images")

# From an audio folder
# dataset = load_dataset("audiofolder", data_dir="./audio")

3. Filter, Map, Select, Shuffle

3.1 Filter

from datasets import load_dataset

dataset = load_dataset("imdb", split="train")

# Keep only the rows whose label marks a positive review
positive = dataset.filter(lambda x: x['label'] == 1)
print(f"Positive reviews: {len(positive)}")

# A predicate can combine any number of conditions
long_positive = dataset.filter(lambda x: x['label'] == 1 and len(x['text']) > 1000)
print(f"Long positive reviews: {len(long_positive)}")

# batched=True hands the predicate whole batches -> one bool per row, fewer calls
positive_batch = dataset.filter(lambda batch: [lab == 1 for lab in batch['label']], batched=True)
print(f"Positive (batch): {len(positive_batch)}")

3.2 Map (Transform)

# Add a column: map() merges any keys the callback writes into the example
def add_length(example):
    """Attach `text_length` (character count of `text`) to the example."""
    example.update(text_length=len(example['text']))
    return example

# map() leaves the source dataset untouched and returns a new one
dataset_with_length = dataset.map(add_length)
print(dataset_with_length[0]['text_length'])

# Batch transformation (much faster): the callback receives columns as lists
def tokenize_batch(batch):
    """Add a `num_words` column: whitespace-separated word count per row."""
    batch['num_words'] = [len(entry.split()) for entry in batch['text']]
    return batch

# batch_size controls how many rows each callback invocation sees
dataset_processed = dataset.map(tokenize_batch, batched=True, batch_size=1000)
print(f"Words in first example: {dataset_processed[0]['num_words']}")

# With HuggingFace tokenizer: the canonical preprocessing step before training
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Encode a batch of texts to fixed 512-token inputs (pad + truncate)."""
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

# remove_columns drops the raw text, leaving only the tokenizer's outputs
tokenized = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
print(f"Columns after tokenization: {tokenized.column_names}")

3.3 Select and Shuffle

# Pick rows by explicit index
subset = dataset.select(range(100))
print(f"Subset: {len(subset)} examples")

# Draw a random sample of indices, then select them
import random
indices = random.sample(range(len(dataset)), 500)
random_subset = dataset.select(indices)

# Reproducible shuffle thanks to the fixed seed
shuffled = dataset.shuffle(seed=42)

# Order rows by the values of one column
sorted_ds = dataset.sort("label")

3.4 Rename and Remove Columns

# Give a column a new name (returns a new dataset)
renamed = dataset.rename_column("label", "sentiment")
print(f"Columns: {renamed.column_names}")

# Drop columns entirely
no_text = dataset.remove_columns(["text"])
print(f"Remaining columns: {no_text.column_names}")

4. Process Large Datasets (Memory-Efficient)

4.1 Arrow Format

HuggingFace datasets use Apache Arrow internally, enabling:

  • Memory-mapping: data stays on disk
  • Zero-copy: no duplication in memory
  • Fast access: O(1) indexing

from datasets import load_dataset

# Arrow files are memory-mapped: rows are read from disk on demand,
# so loading does NOT pull the whole dataset into RAM.
dataset = load_dataset("imdb", split="train")
# dataset_size is the size of the Arrow data on disk, not resident memory
print(f"Dataset size (memory-mapped): {dataset.dataset_size / 1e6:.1f} MB")

# Every map() result is cached to disk and reused on the next run
dataset_processed = dataset.map(lambda x: {"upper": x["text"].upper()})

4.2 Batch Processing

# Batch processing with num_proc for parallelism
def heavy_processing(batch):
    """Lowercase and strip every text; results land in a `processed` column."""
    cleaned = [entry.lower().strip() for entry in batch['text']]
    batch['processed'] = cleaned
    return batch

# num_proc forks worker processes; each one handles a share of the batches
processed = dataset.map(
    heavy_processing,
    batched=True,
    batch_size=1000,
    num_proc=4  # 4 parallel processes
)

4.3 Streaming for Very Large Datasets

from datasets import load_dataset

# The Pile (800 GB) with streaming — nothing is downloaded up front
# pile = load_dataset("EleutherAI/pile", streaming=True)

# Take only N examples from the stream
# samples = pile['train'].take(1000)

# Streaming shuffle: approximate, using a rolling buffer of examples
# shuffled_stream = pile['train'].shuffle(seed=42, buffer_size=10000)

5. Create Your Own Dataset

5.1 From a Python Dictionary

from datasets import Dataset, DatasetDict

# Build a Dataset directly from in-memory columns
data = {
    "text": [
        "Alert: phishing attempt detected",
        "Security update installed successfully",
        "Suspicious connection from unknown IP",
        "Firewall configured correctly",
    ],
    "label": [1, 0, 1, 0],  # 1=threat, 0=normal
    "category": ["phishing", "update", "intrusion", "firewall"]
}

dataset = Dataset.from_dict(data)
print(dataset)

# Replicate the toy rows to reach a workable size...
full_data = Dataset.from_dict({
    "text": data["text"] * 25,  # 100 examples
    "label": data["label"] * 25,
})

# ...then carve out a reproducible 80/20 train/test split
ds_dict = full_data.train_test_split(test_size=0.2, seed=42)
print(ds_dict)

5.2 From a JSONL File

import json

# Example records to materialize as one JSON object per line (JSONL)
records = [
    {"text": "SQL injection detected in form", "label": "attack", "severity": "high"},
    {"text": "SSL certificate renewed", "label": "maintenance", "severity": "low"},
    {"text": "DDoS ongoing on main server", "label": "attack", "severity": "critical"},
]

# Explicit encoding avoids platform-dependent defaults when writing text
with open("cybersec_data.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")

# The generic "json" builder understands JSON Lines files
from datasets import load_dataset
dataset = load_dataset("json", data_files="cybersec_data.jsonl")
print(dataset)

5.3 From a CSV File

import csv

from datasets import load_dataset

# newline='' is required by the csv module; set the encoding explicitly too
with open("cybersec_data.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label", "severity"])
    writer.writerow(["Brute force attack detected", "attack", "high"])
    writer.writerow(["System update completed", "maintenance", "low"])

# The "csv" builder reads the header row as column names
dataset = load_dataset("csv", data_files="cybersec_data.csv")
print(dataset)

5.4 From a Pandas DataFrame

import pandas as pd
from datasets import Dataset

# An existing DataFrame converts directly; the schema carries over
df = pd.DataFrame({
    "text": ["Malware detected", "Firewall updated", "Unauthorized access attempt"],
    "label": [1, 0, 1],
    "timestamp": ["2026-01-15", "2026-01-16", "2026-01-17"]
})

dataset = Dataset.from_pandas(df)
print(dataset)

6. Dataset Cards and Metadata

A good dataset card improves discoverability and usability:

---
language:
  - en
  - fr
license: mit
size_categories:
  - 1K<n<10K
task_categories:
  - text-classification
tags:
  - cybersecurity
  - threat-detection
pretty_name: CyberSec Threat Dataset
---

# CyberSec Threat Dataset

## Dataset Description
Dataset for cybersecurity threat classification.

## Dataset Structure
### Data Fields
- `text`: alert text (string)
- `label`: category (string: attack, maintenance, normal)
- `severity`: severity (string: low, medium, high, critical)

### Data Splits
| Split | Examples |
|-------|----------|
| train | 8,000 |
| test  | 2,000 |

7. Upload and Version Datasets

7.1 Push to Hub

import os

from datasets import Dataset
from huggingface_hub import HfApi

# Method 1: push_to_hub
dataset = Dataset.from_dict({"text": ["example 1", "example 2"], "label": [0, 1]})
# dataset.push_to_hub("my-username/my-dataset")

# Method 2: with revision (versioning)
# dataset.push_to_hub("my-username/my-dataset", revision="v1.0")

# Method 3: upload raw files with the low-level API.
# SECURITY: never hardcode tokens in source code — read them from the
# environment (or rely on `huggingface-cli login`, which stores one locally).
api = HfApi(token=os.environ.get("HF_TOKEN"))
# api.upload_file(
#     path_or_fileobj="data.parquet",
#     path_in_repo="data/train-00000-of-00001.parquet",
#     repo_id="my-username/my-dataset",
#     repo_type="dataset"
# )

8. Dataset Viewer Features

The HuggingFace Dataset Viewer offers:

  • Visual preview of data directly on the Hub
  • Filtering by column and value
  • Statistics on distributions
  • Search within data
  • API to access data without downloading

import requests

# Dataset Viewer API: page through rows without downloading the dataset
url = "https://datasets-server.huggingface.co/rows"
params = {
    "dataset": "imdb",
    "config": "plain_text",
    "split": "train",
    "offset": 0,
    "length": 5
}
# Always set a timeout: requests.get blocks indefinitely by default
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()  # surface HTTP errors instead of failing on .json()
data = response.json()
for row in data['rows']:
    print(f"  Label: {row['row']['label']} | Text: {row['row']['text'][:80]}...")

9. Data Preprocessing for Training

9.1 Tokenization for Text Models

from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("imdb", split="train[:1000]")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Fixed-length encoding: every review padded/truncated to 256 tokens
def tokenize(examples):
    """Encode a batch of texts with the BERT tokenizer."""
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])
tokenized.set_format("torch")  # Convert to PyTorch tensors

print(f"Columns: {tokenized.column_names}")
print(f"Shape input_ids: {tokenized[0]['input_ids'].shape}")

9.2 Instruction Format for Fine-tuning

from datasets import Dataset

# Instruction/input/output triples, the layout SFTTrainer expects
instruction_data = {
    "instruction": [
        "Classify this security alert",
        "Summarize this incident report",
    ],
    "input": [
        "Failed login attempt 5 times from IP 192.168.1.100",
        "On January 15, a DDoS attack targeted our servers for 3 hours...",
    ],
    "output": [
        "Classification: Brute Force Attack - Severity: Medium",
        "Summary: 3-hour DDoS attack on 01/15, moderate impact, CDN mitigation.",
    ]
}

dataset = Dataset.from_dict(instruction_data)

# Format for chat template: collapse the three fields into one prompt string
def format_instruction(example):
    """Render an example in the classic Alpaca-style prompt layout."""
    prompt = (
        f"### Instruction:\n{example['instruction']}\n\n"
        f"### Input:\n{example['input']}\n\n"
        f"### Response:\n{example['output']}"
    )
    return {"text": prompt}

formatted = dataset.map(format_instruction)  # adds the rendered `text` column
print(formatted[0]['text'])

9.3 Data Augmentation

def augment_text(examples):
    """Double the batch: each text followed by its lowercase copy, labels duplicated."""
    texts, labels = [], []
    for text, label in zip(examples['text'], examples['label']):
        texts.extend((text, text.lower()))
        labels.extend((label, label))
    return {"text": texts, "label": labels}

Conclusion

The HuggingFace datasets library is a powerful and flexible tool for managing your training data. From streaming loading to advanced preprocessing, it simplifies every step of the data pipeline.

Explore our CyberSec AI collection: CyberSec AI Portfolio


Tutorial written by AYI-NEDJIMI - AI & Cybersecurity Consultant

Sign up or log in to comment