| """ |
| Fine-tune GLiNER large-v2.1 on nvidia/Nemotron-PII for PII detection |
| """ |
|
|
| from datasets import load_dataset |
| from gliner import GLiNER, GLiNERConfig |
| from gliner.training import Trainer, TrainingArguments |
| import json |
| import os |
| from huggingface_hub import HfApi |
| import torch |
|
|
| print("=" * 80) |
| print("GLiNER Fine-tuning for PII Detection") |
| print("Model: urchade/gliner_largev2.1") |
| print("Dataset: nvidia/Nemotron-PII") |
| print("=" * 80) |
|
|
| |
| BASE_MODEL = "urchade/gliner_largev2.1" |
| OUTPUT_DIR = "gliner-pii-detector" |
| HUB_MODEL_ID = "gliner-pii-nemotron" |
| MAX_SAMPLES = 10000 |
| BATCH_SIZE = 4 |
| LEARNING_RATE = 5e-6 |
| NUM_EPOCHS = 3 |
| MAX_LENGTH = 384 |

print("\nConfiguration:")
print(f"  Base model: {BASE_MODEL}")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  Max samples: {MAX_SAMPLES if MAX_SAMPLES else 'All'}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Epochs: {NUM_EPOCHS}")
print(f"  Max length: {MAX_LENGTH}")

print("\n" + "=" * 80)
print("Loading nvidia/Nemotron-PII dataset...")
print("=" * 80)

train_dataset = load_dataset("nvidia/Nemotron-PII", split="train")
eval_dataset = load_dataset("nvidia/Nemotron-PII", split="test")

print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")

# Optionally cap the sample counts for faster experimentation
if MAX_SAMPLES:
    train_dataset = train_dataset.select(range(min(MAX_SAMPLES, len(train_dataset))))
    eval_dataset = eval_dataset.select(range(min(MAX_SAMPLES // 10, len(eval_dataset))))
    print(f"\nLimited to {len(train_dataset)} train and {len(eval_dataset)} eval samples")

# Inspect one raw example
print("\nExample from dataset:")
example = train_dataset[0]
print(f"Keys: {list(example.keys())}")
print(f"Text snippet: {example['text'][:200]}...")
print(f"Spans: {example['spans'][:3] if len(example['spans']) > 0 else 'No spans'}")


def convert_nemotron_to_gliner_format(examples):
    """
    Convert Nemotron-PII format to GLiNER training format.

    Nemotron format:
        {
            "text": "John Doe lives at...",
            "spans": [
                {"start": 0, "end": 8, "label": "PERSON", "text": "John Doe"},
                ...
            ]
        }

    GLiNER format:
        {
            "tokenized_text": ["John", "Doe", "lives", "at", ...],
            "ner": [[0, 1, "PERSON"], ...]  # inclusive token indices
        }
    """
    converted = []

    for text, spans in zip(examples["text"], examples["spans"]):
        # Skip empty or unannotated examples
        if not text or not spans:
            continue

        # Process spans in document order
        sorted_spans = sorted(spans, key=lambda x: x["start"])

        # Simple whitespace tokenization
        tokens = text.split()

        # Map every character position to the index of the token covering it
        char_to_token = {}
        current_pos = 0
        for token_idx, token in enumerate(tokens):
            # Locate this token's character offsets in the original text
            token_start = text.find(token, current_pos)
            if token_start >= 0:
                token_end = token_start + len(token)
                for char_pos in range(token_start, token_end):
                    char_to_token[char_pos] = token_idx
                current_pos = token_end

        # Convert character spans to inclusive token spans. end_char is
        # exclusive, so the last covered character is end_char - 1; spans
        # whose boundaries fall on whitespace are silently dropped.
        ner_annotations = []
        for span in sorted_spans:
            start_char = span["start"]
            end_char = span["end"]
            label = span["label"]

            start_token = char_to_token.get(start_char)
            end_token = char_to_token.get(end_char - 1)

            if start_token is not None and end_token is not None:
                ner_annotations.append([start_token, end_token, label])

        if ner_annotations:
            converted.append({
                "tokenized_text": tokens,
                "ner": ner_annotations,
            })

    return converted


print("\n" + "=" * 80)
print("Converting dataset to GLiNER format...")
print("=" * 80)

train_data = convert_nemotron_to_gliner_format(train_dataset)
eval_data = convert_nemotron_to_gliner_format(eval_dataset)

print(f"Converted train samples: {len(train_data)}")
print(f"Converted eval samples: {len(eval_data)}")

if len(train_data) > 0:
    print("\nExample converted data:")
    print(f"Tokens (first 10): {train_data[0]['tokenized_text'][:10]}")
    print(f"NER annotations: {train_data[0]['ner'][:3]}")

print("\nSaving converted data...")
os.makedirs(OUTPUT_DIR, exist_ok=True)

with open(f"{OUTPUT_DIR}/train_data.json", "w") as f:
    json.dump(train_data, f, indent=2)

with open(f"{OUTPUT_DIR}/eval_data.json", "w") as f:
    json.dump(eval_data, f, indent=2)

print(f"Saved to {OUTPUT_DIR}/train_data.json and {OUTPUT_DIR}/eval_data.json")

print("\n" + "=" * 80)
print(f"Loading base model: {BASE_MODEL}")
print("=" * 80)

model = GLiNER.from_pretrained(BASE_MODEL)

print("Model loaded successfully!")
print(f"Model config: {model.config}")

print("\n" + "=" * 80)
print("Configuring training...")
print("=" * 80)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    others_lr=1e-5,
    others_weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,
    dataloader_num_workers=0,
    use_cpu=False,
    report_to=["trackio"],
    logging_steps=50,
    # Experiment-tracking metadata
    project_name="gliner-pii-detection",
    run_name="nemotron-pii-finetune",
)
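
# Note on the two learning rates: in GLiNER's TrainingArguments, others_lr and
# others_weight_decay govern the non-backbone parameters (the span/label
# scoring layers), while learning_rate applies to the pretrained encoder,
# which typically wants smaller updates.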

print("Training arguments configured!")

print("\n" + "=" * 80)
print("Initializing trainer...")
print("=" * 80)

# Batch collation via GLiNER's DataCollator, following the library's published
# fine-tuning example (replacing model.data_processor.collate_fn, which the
# data processor does not expose).
data_collator = DataCollator(
    model.config,
    data_processor=model.data_processor,
    prepare_labels=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=model.data_processor.transformer_tokenizer,
    data_collator=data_collator,
)

print("Trainer initialized!")

print("\n" + "=" * 80)
print("Starting training...")
print("=" * 80)

trainer.train()

print("\n" + "=" * 80)
print("Training completed!")
print("=" * 80)

print("\nSaving final model...")
model.save_pretrained(OUTPUT_DIR)

print(f"Model saved to {OUTPUT_DIR}")

print("\n" + "=" * 80)
print("Pushing model to Hugging Face Hub...")
print("=" * 80)

try:
    api = HfApi()
    username = api.whoami()["name"]
    full_repo_id = f"{username}/{HUB_MODEL_ID}"

    print(f"Pushing to: {full_repo_id}")

    model.push_to_hub(
        full_repo_id,
        token=True,  # use_auth_token is deprecated in huggingface_hub
        commit_message="Fine-tuned GLiNER on nvidia/Nemotron-PII for PII detection",
    )

    print(f"\n✅ Model successfully pushed to: https://huggingface.co/{full_repo_id}")

except Exception as e:
    print(f"\n❌ Failed to push to Hub: {e}")
    print("Model is saved locally in:", OUTPUT_DIR)

print("\n" + "=" * 80)
print("Training completed successfully!")
print("=" * 80)