# gliner-training-scripts / train_gliner_pii.py
# Uploaded by kzfastino ("Upload train_gliner_pii.py with huggingface_hub",
# commit a762a42, verified)
# /// script
# dependencies = [
# "gliner>=0.2.0",
# "datasets>=2.14.0",
# "torch>=2.0.0",
# "transformers>=4.30.0",
# "trackio>=0.1.0",
# "huggingface-hub>=0.19.0",
# "accelerate>=0.20.0"
# ]
# ///
"""
Fine-tune GLiNER large-v2.1 on nvidia/Nemotron-PII for PII detection
"""
from datasets import load_dataset
from gliner import GLiNER, GLiNERConfig
from gliner.training import Trainer, TrainingArguments
import json
import os
from huggingface_hub import HfApi
import torch
# ---------------------------------------------------------------------------
# Run banner and training configuration.
# ---------------------------------------------------------------------------
print("=" * 80)
print("GLiNER Fine-tuning for PII Detection")
print("Model: urchade/gliner_largev2.1")
print("Dataset: nvidia/Nemotron-PII")
print("=" * 80)

# Configuration
BASE_MODEL = "urchade/gliner_largev2.1"  # pre-trained GLiNER checkpoint to fine-tune
OUTPUT_DIR = "gliner-pii-detector"       # local dir for converted data + checkpoints
HUB_MODEL_ID = "gliner-pii-nemotron"     # Will be prefixed with username
MAX_SAMPLES = 10000                      # Limit for demo, set to None for full dataset
BATCH_SIZE = 4
LEARNING_RATE = 5e-6                     # encoder learning rate (see others_lr below)
NUM_EPOCHS = 3
# NOTE(review): MAX_LENGTH is printed but never passed to the trainer or
# tokenizer later in this script — confirm whether it was meant to be used.
MAX_LENGTH = 384

# Fix: the header line was an f-string with no placeholders (ruff F541);
# a plain string prints identically.
print("\nConfiguration:")
print(f" Base model: {BASE_MODEL}")
print(f" Output directory: {OUTPUT_DIR}")
print(f" Max samples: {MAX_SAMPLES if MAX_SAMPLES else 'All'}")
print(f" Batch size: {BATCH_SIZE}")
print(f" Learning rate: {LEARNING_RATE}")
print(f" Epochs: {NUM_EPOCHS}")
print(f" Max length: {MAX_LENGTH}")
# Load the Nemotron-PII dataset
print("\n" + "=" * 80)
print("Loading nvidia/Nemotron-PII dataset...")
print("=" * 80)
# Downloads from the Hugging Face Hub on first run (network access required).
train_dataset = load_dataset("nvidia/Nemotron-PII", split="train")
eval_dataset = load_dataset("nvidia/Nemotron-PII", split="test")
print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")
# Limit dataset size for faster training (optional)
if MAX_SAMPLES:
    # Keep at most MAX_SAMPLES training rows and an eval split 10x smaller;
    # min() guards against asking for more rows than the split contains.
    train_dataset = train_dataset.select(range(min(MAX_SAMPLES, len(train_dataset))))
    eval_dataset = eval_dataset.select(range(min(MAX_SAMPLES // 10, len(eval_dataset))))
    print(f"\nLimited to {len(train_dataset)} train and {len(eval_dataset)} eval samples")
# Print example to understand format
print("\nExample from dataset:")
example = train_dataset[0]
print(f"Keys: {list(example.keys())}")
# NOTE(review): assumes every row carries 'text' and 'spans' columns — the
# converter below depends on the same schema; verify against the dataset card.
print(f"Text snippet: {example['text'][:200]}...")
print(f"Spans: {example['spans'][:3] if len(example['spans']) > 0 else 'No spans'}...")
def convert_nemotron_to_gliner_format(examples):
    """
    Convert batched Nemotron-PII columns into GLiNER training examples.

    Input (column-oriented, as yielded by a Hugging Face Dataset):
        examples["text"]  -> list of raw strings
        examples["spans"] -> list of span lists; each span is a dict with
                             character-level "start"/"end" (end exclusive)
                             and a "label".

    Output: a list of dicts of the form
        {"tokenized_text": [word, ...], "ner": [[first_tok, last_tok, label], ...]}
    where token indices refer to whitespace-split words and are inclusive.

    Rows with empty text, no spans, or no span that maps cleanly onto whole
    tokens are silently dropped.
    """
    converted = []
    for text, spans in zip(examples["text"], examples["spans"]):
        if not text or not spans:
            continue

        # Whitespace word split; GLiNER applies its own subword tokenization
        # downstream, so word granularity is sufficient here.
        tokens = text.split()

        # Map every character position covered by a word to that word's index.
        # find() with a moving cursor recovers each word's true offset even
        # when words repeat or are separated by runs of whitespace.
        char_to_token = {}
        cursor = 0
        for token_idx, word in enumerate(tokens):
            word_start = text.find(word, cursor)
            if word_start >= 0:
                word_end = word_start + len(word)
                for pos in range(word_start, word_end):
                    char_to_token[pos] = token_idx
                cursor = word_end

        # Translate each character span into an inclusive token span,
        # processing spans in order of their start offset.
        annotations = []
        for span in sorted(spans, key=lambda s: s["start"]):
            first_char = span["start"]
            last_char = span["end"] - 1  # end is exclusive
            label = span["label"]
            first_tok = char_to_token.get(first_char)
            last_tok = char_to_token.get(last_char)
            # Spans whose boundary lands on whitespace (unmapped chars)
            # cannot be expressed at word granularity and are skipped.
            if first_tok is not None and last_tok is not None:
                annotations.append([first_tok, last_tok, label])

        # Only keep rows that still carry at least one valid annotation.
        if annotations:
            converted.append({"tokenized_text": tokens, "ner": annotations})
    return converted
print("\n" + "=" * 80)
print("Converting dataset to GLiNER format...")
print("=" * 80)
# Passing the whole Dataset gives the converter column-style access
# (examples["text"] / examples["spans"] yield parallel lists).
train_data = convert_nemotron_to_gliner_format(train_dataset)
eval_data = convert_nemotron_to_gliner_format(eval_dataset)
print(f"Converted train samples: {len(train_data)}")
print(f"Converted eval samples: {len(eval_data)}")
if len(train_data) > 0:
    print("\nExample converted data:")
    print(f"Tokens (first 10): {train_data[0]['tokenized_text'][:10]}")
    print(f"NER annotations: {train_data[0]['ner'][:3]}")

# Save converted data to JSON files (for inspection/reproducibility only —
# the trainer below consumes the in-memory lists, not these files).
print("\nSaving converted data...")
os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(f"{OUTPUT_DIR}/train_data.json", "w") as f:
    json.dump(train_data, f, indent=2)
with open(f"{OUTPUT_DIR}/eval_data.json", "w") as f:
    json.dump(eval_data, f, indent=2)
print(f"Saved to {OUTPUT_DIR}/train_data.json and {OUTPUT_DIR}/eval_data.json")

# Load pre-trained GLiNER model
print("\n" + "=" * 80)
print(f"Loading base model: {BASE_MODEL}")
print("=" * 80)
# Downloads the checkpoint from the Hub on first run.
model = GLiNER.from_pretrained(BASE_MODEL)
print("Model loaded successfully!")
print(f"Model config: {model.config}")
# Configure training arguments
print("\n" + "=" * 80)
print("Configuring training...")
print("=" * 80)
# gliner.training.TrainingArguments extends the transformers version with the
# `others_*` knobs — presumably a separate LR / weight decay for the
# non-encoder parameters (verify against the gliner docs).
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    others_lr=1e-5,
    others_weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,  # first 10% of steps warm up the LR
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    # NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in
    # transformers >= 4.46; the script deps pin only ">=4.30,", so a fresh
    # install may reject this kwarg — confirm the resolved version.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,  # keep only the 3 most recent checkpoints on disk
    dataloader_num_workers=0,
    use_cpu=False,  # expects an accelerator to be available
    report_to=["trackio"],
    logging_steps=50,
    # Trackio configuration
    project_name="gliner-pii-detection",
    run_name="nemotron-pii-finetune",
)
print("Training arguments configured!")
# Initialize trainer
print("\n" + "=" * 80)
print("Initializing trainer...")
print("=" * 80)
# The gliner Trainer is handed plain Python lists of converted examples
# (not datasets.Dataset objects); the model's data processor supplies the
# underlying transformer tokenizer and the batch collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,  # list of {"tokenized_text", "ner"} dicts
    eval_dataset=eval_data,
    tokenizer=model.data_processor.transformer_tokenizer,
    data_collator=model.data_processor.collate_fn,
)
print("Trainer initialized!")

# Start training (blocking; logs/evals per the steps configured above)
print("\n" + "=" * 80)
print("Starting training...")
print("=" * 80)
trainer.train()
print("\n" + "=" * 80)
print("Training completed!")
print("=" * 80)

# Save the final model locally first, so a failed Hub push loses nothing.
print("\nSaving final model...")
model.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

# Push to Hub — best-effort: any failure (missing token, no network) is
# reported but does not crash the run.
print("\n" + "=" * 80)
print("Pushing model to Hugging Face Hub...")
print("=" * 80)
try:
    api = HfApi()
    # Derive the full repo id from the authenticated user's namespace.
    username = api.whoami()["name"]
    full_repo_id = f"{username}/{HUB_MODEL_ID}"
    print(f"Pushing to: {full_repo_id}")
    model.push_to_hub(
        full_repo_id,
        # NOTE(review): `use_auth_token` is deprecated in recent
        # huggingface_hub releases in favor of `token` — confirm what the
        # installed gliner/huggingface_hub versions accept.
        use_auth_token=True,
        commit_message="Fine-tuned GLiNER on nvidia/Nemotron-PII for PII detection"
    )
    print(f"\n✅ Model successfully pushed to: https://huggingface.co/{full_repo_id}")
except Exception as e:
    print(f"\n❌ Failed to push to Hub: {e}")
    print("Model is saved locally in:", OUTPUT_DIR)

print("\n" + "=" * 80)
print("Training completed successfully!")
print("=" * 80)