# gliner-training-scripts / train_gliner_pii.py
# Uploaded by kzfastino ("Upload train_gliner_pii.py with huggingface_hub",
# commit a762a42, verified)
# /// script
# dependencies = [
# "gliner>=0.2.0",
# "datasets>=2.14.0",
# "torch>=2.0.0",
# "transformers>=4.30.0",
# "trackio>=0.1.0",
# "huggingface-hub>=0.19.0",
# "accelerate>=0.20.0"
# ]
# ///
"""
Fine-tune GLiNER large-v2.1 on nvidia/Nemotron-PII for PII detection
"""
from datasets import load_dataset
from gliner import GLiNER, GLiNERConfig
from gliner.training import Trainer, TrainingArguments
import json
import os
from huggingface_hub import HfApi
import torch
# ---------------------------------------------------------------------------
# Run banner and training configuration.
# ---------------------------------------------------------------------------
print("=" * 80)
print("GLiNER Fine-tuning for PII Detection")
print("Model: urchade/gliner_largev2.1")
print("Dataset: nvidia/Nemotron-PII")
print("=" * 80)

# Configuration
BASE_MODEL = "urchade/gliner_largev2.1"  # pre-trained GLiNER checkpoint to fine-tune
OUTPUT_DIR = "gliner-pii-detector"       # local dir for converted data + checkpoints
HUB_MODEL_ID = "gliner-pii-nemotron"     # Will be prefixed with username
MAX_SAMPLES = 10000                      # Limit for demo, set to None for full dataset
BATCH_SIZE = 4
LEARNING_RATE = 5e-6                     # encoder learning rate (see others_lr below)
NUM_EPOCHS = 3
# NOTE(review): MAX_LENGTH is printed but never passed to the trainer or
# tokenizer later in this script — confirm whether it was meant to be used.
MAX_LENGTH = 384

# Fix: the header line was an f-string with no placeholders (ruff F541);
# a plain string prints identically.
print("\nConfiguration:")
print(f" Base model: {BASE_MODEL}")
print(f" Output directory: {OUTPUT_DIR}")
print(f" Max samples: {MAX_SAMPLES if MAX_SAMPLES else 'All'}")
print(f" Batch size: {BATCH_SIZE}")
print(f" Learning rate: {LEARNING_RATE}")
print(f" Epochs: {NUM_EPOCHS}")
print(f" Max length: {MAX_LENGTH}")
# Load the Nemotron-PII dataset
print("\n" + "=" * 80)
print("Loading nvidia/Nemotron-PII dataset...")
print("=" * 80)
# Downloads from the Hugging Face Hub on first run (network access required).
train_dataset = load_dataset("nvidia/Nemotron-PII", split="train")
eval_dataset = load_dataset("nvidia/Nemotron-PII", split="test")
print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")
# Limit dataset size for faster training (optional)
if MAX_SAMPLES:
    # Keep at most MAX_SAMPLES training rows and an eval split 10x smaller;
    # min() guards against asking for more rows than the split contains.
    train_dataset = train_dataset.select(range(min(MAX_SAMPLES, len(train_dataset))))
    eval_dataset = eval_dataset.select(range(min(MAX_SAMPLES // 10, len(eval_dataset))))
    print(f"\nLimited to {len(train_dataset)} train and {len(eval_dataset)} eval samples")
# Print example to understand format
print("\nExample from dataset:")
example = train_dataset[0]
print(f"Keys: {list(example.keys())}")
# NOTE(review): assumes every row carries 'text' and 'spans' columns — the
# converter below depends on the same schema; verify against the dataset card.
print(f"Text snippet: {example['text'][:200]}...")
print(f"Spans: {example['spans'][:3] if len(example['spans']) > 0 else 'No spans'}...")
def convert_nemotron_to_gliner_format(examples):
    """
    Convert batched Nemotron-PII columns into GLiNER training examples.

    Input (column-oriented, as yielded by a Hugging Face Dataset):
        examples["text"]  -> list of raw strings
        examples["spans"] -> list of span lists; each span is a dict with
                             character-level "start"/"end" (end exclusive)
                             and a "label".

    Output: a list of dicts of the form
        {"tokenized_text": [word, ...], "ner": [[first_tok, last_tok, label], ...]}
    where token indices refer to whitespace-split words and are inclusive.

    Rows with empty text, no spans, or no span that maps cleanly onto whole
    tokens are silently dropped.
    """
    converted = []
    for text, spans in zip(examples["text"], examples["spans"]):
        if not text or not spans:
            continue

        # Whitespace word split; GLiNER applies its own subword tokenization
        # downstream, so word granularity is sufficient here.
        tokens = text.split()

        # Map every character position covered by a word to that word's index.
        # find() with a moving cursor recovers each word's true offset even
        # when words repeat or are separated by runs of whitespace.
        char_to_token = {}
        cursor = 0
        for token_idx, word in enumerate(tokens):
            word_start = text.find(word, cursor)
            if word_start >= 0:
                word_end = word_start + len(word)
                for pos in range(word_start, word_end):
                    char_to_token[pos] = token_idx
                cursor = word_end

        # Translate each character span into an inclusive token span,
        # processing spans in order of their start offset.
        annotations = []
        for span in sorted(spans, key=lambda s: s["start"]):
            first_char = span["start"]
            last_char = span["end"] - 1  # end is exclusive
            label = span["label"]
            first_tok = char_to_token.get(first_char)
            last_tok = char_to_token.get(last_char)
            # Spans whose boundary lands on whitespace (unmapped chars)
            # cannot be expressed at word granularity and are skipped.
            if first_tok is not None and last_tok is not None:
                annotations.append([first_tok, last_tok, label])

        # Only keep rows that still carry at least one valid annotation.
        if annotations:
            converted.append({"tokenized_text": tokens, "ner": annotations})
    return converted
print("\n" + "=" * 80)
print("Converting dataset to GLiNER format...")
print("=" * 80)
# Passing the whole Dataset gives the converter column-style access
# (examples["text"] / examples["spans"] yield parallel lists).
train_data = convert_nemotron_to_gliner_format(train_dataset)
eval_data = convert_nemotron_to_gliner_format(eval_dataset)
print(f"Converted train samples: {len(train_data)}")
print(f"Converted eval samples: {len(eval_data)}")
if len(train_data) > 0:
    print("\nExample converted data:")
    print(f"Tokens (first 10): {train_data[0]['tokenized_text'][:10]}")
    print(f"NER annotations: {train_data[0]['ner'][:3]}")

# Save converted data to JSON files (for inspection/reproducibility only —
# the trainer below consumes the in-memory lists, not these files).
print("\nSaving converted data...")
os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(f"{OUTPUT_DIR}/train_data.json", "w") as f:
    json.dump(train_data, f, indent=2)
with open(f"{OUTPUT_DIR}/eval_data.json", "w") as f:
    json.dump(eval_data, f, indent=2)
print(f"Saved to {OUTPUT_DIR}/train_data.json and {OUTPUT_DIR}/eval_data.json")

# Load pre-trained GLiNER model
print("\n" + "=" * 80)
print(f"Loading base model: {BASE_MODEL}")
print("=" * 80)
# Downloads the checkpoint from the Hub on first run.
model = GLiNER.from_pretrained(BASE_MODEL)
print("Model loaded successfully!")
print(f"Model config: {model.config}")
# Configure training arguments
print("\n" + "=" * 80)
print("Configuring training...")
print("=" * 80)
# gliner.training.TrainingArguments extends the transformers version with the
# `others_*` knobs — presumably a separate LR / weight decay for the
# non-encoder parameters (verify against the gliner docs).
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    others_lr=1e-5,
    others_weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,  # first 10% of steps warm up the LR
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    # NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in
    # transformers >= 4.46; the script deps pin only ">=4.30,", so a fresh
    # install may reject this kwarg — confirm the resolved version.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,  # keep only the 3 most recent checkpoints on disk
    dataloader_num_workers=0,
    use_cpu=False,  # expects an accelerator to be available
    report_to=["trackio"],
    logging_steps=50,
    # Trackio configuration
    project_name="gliner-pii-detection",
    run_name="nemotron-pii-finetune",
)
print("Training arguments configured!")
# Initialize trainer
print("\n" + "=" * 80)
print("Initializing trainer...")
print("=" * 80)
# The gliner Trainer is handed plain Python lists of converted examples
# (not datasets.Dataset objects); the model's data processor supplies the
# underlying transformer tokenizer and the batch collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,  # list of {"tokenized_text", "ner"} dicts
    eval_dataset=eval_data,
    tokenizer=model.data_processor.transformer_tokenizer,
    data_collator=model.data_processor.collate_fn,
)
print("Trainer initialized!")

# Start training (blocking; logs/evals per the steps configured above)
print("\n" + "=" * 80)
print("Starting training...")
print("=" * 80)
trainer.train()
print("\n" + "=" * 80)
print("Training completed!")
print("=" * 80)

# Save the final model locally first, so a failed Hub push loses nothing.
print("\nSaving final model...")
model.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

# Push to Hub — best-effort: any failure (missing token, no network) is
# reported but does not crash the run.
print("\n" + "=" * 80)
print("Pushing model to Hugging Face Hub...")
print("=" * 80)
try:
    api = HfApi()
    # Derive the full repo id from the authenticated user's namespace.
    username = api.whoami()["name"]
    full_repo_id = f"{username}/{HUB_MODEL_ID}"
    print(f"Pushing to: {full_repo_id}")
    model.push_to_hub(
        full_repo_id,
        # NOTE(review): `use_auth_token` is deprecated in recent
        # huggingface_hub releases in favor of `token` — confirm what the
        # installed gliner/huggingface_hub versions accept.
        use_auth_token=True,
        commit_message="Fine-tuned GLiNER on nvidia/Nemotron-PII for PII detection"
    )
    print(f"\n✅ Model successfully pushed to: https://huggingface.co/{full_repo_id}")
except Exception as e:
    print(f"\n❌ Failed to push to Hub: {e}")
    print("Model is saved locally in:", OUTPUT_DIR)

print("\n" + "=" * 80)
print("Training completed successfully!")
print("=" * 80)