Upload 12 files

- Bert_base_NER_PII43k/.gitattributes +35 -0
- Bert_base_NER_PII43k/README.md +54 -0
- Bert_base_NER_PII43k/config.json +84 -0
- Bert_base_NER_PII43k/model.safetensors +3 -0
- Bert_base_NER_PII43k/special_tokens_map.json +7 -0
- Bert_base_NER_PII43k/tokenizer.json +0 -0
- Bert_base_NER_PII43k/tokenizer_config.json +55 -0
- Bert_base_NER_PII43k/training_args.bin +3 -0
- Bert_base_NER_PII43k/vocab.txt +0 -0
- README.md +83 -7
- app.py +202 -0
- requirements.txt +4 -0
Bert_base_NER_PII43k/.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
Bert_base_NER_PII43k/README.md
ADDED
@@ -0,0 +1,54 @@
+---
+license: apache-2.0
+base_model: google-bert/bert-base-uncased
+tags:
+- generated_from_trainer
+model-index:
+- name: Bert_base_NER_PII43k
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/tuevu_smu/huggingface/runs/5vg4k8gw)
+# Bert_base_NER_PII43k
+
+This model is a fine-tuned version of [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) on the [ai4privacy/pii-masking-43k](https://huggingface.co/datasets/ai4privacy/pii-masking-43k) dataset.
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 16
+- eval_batch_size: 64
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 500
+- num_epochs: 3
+
+### Training results
+
+
+
+### Framework versions
+
+- Transformers 4.41.0.dev0
+- Pytorch 1.13.1
+- Datasets 2.18.0
+- Tokenizers 0.19.1
Bert_base_NER_PII43k/config.json
ADDED
@@ -0,0 +1,84 @@
+{
+  "_name_or_path": "google-bert/bert-base-uncased",
+  "architectures": [
+    "BertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "ACCOUNTNAME",
+    "1": "ACCOUNTNUM",
+    "2": "USERNAME",
+    "3": "NEARBYGPSCOORDINATE",
+    "4": "ORDINALDIRECTION",
+    "5": "COINADDRESS",
+    "6": "CREDITCARDISSUER",
+    "7": "CREDITCARDNUM",
+    "8": "CURRENCY",
+    "9": "DISPLAYNAME",
+    "10": "EMAIL",
+    "11": "GENDER",
+    "12": "GEO",
+    "13": "IBAN",
+    "14": "ZIPCODE",
+    "15": "IP",
+    "16": "JOB",
+    "17": "MAC",
+    "18": "NAME",
+    "19": "NUM",
+    "20": "PASSWORD",
+    "21": "STREET",
+    "22": "URL",
+    "23": "USERAGENT",
+    "24": "ADDRESS",
+    "25": "BIC",
+    "26": "O"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "ACCOUNTNAME": 0,
+    "ACCOUNTNUM": 1,
+    "ADDRESS": 24,
+    "BIC": 25,
+    "COINADDRESS": 5,
+    "CREDITCARDISSUER": 6,
+    "CREDITCARDNUM": 7,
+    "CURRENCY": 8,
+    "DISPLAYNAME": 9,
+    "EMAIL": 10,
+    "GENDER": 11,
+    "GEO": 12,
+    "IBAN": 13,
+    "IP": 15,
+    "JOB": 16,
+    "MAC": 17,
+    "NAME": 18,
+    "NEARBYGPSCOORDINATE": 3,
+    "NUM": 19,
+    "O": 26,
+    "ORDINALDIRECTION": 4,
+    "PASSWORD": 20,
+    "STREET": 21,
+    "URL": 22,
+    "USERAGENT": 23,
+    "USERNAME": 2,
+    "ZIPCODE": 14
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.0.dev0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}
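An aside on the config above: `id2label` and `label2id` must be mutual inverses, or `AutoModelForTokenClassification` will map logits to the wrong names. A quick sanity-check sketch (over a hypothetical subset of the 27 labels; note Hugging Face configs store `id2label` keys as strings):

```python
# Subset of the maps from config.json above, for illustration only.
id2label = {"0": "ACCOUNTNAME", "1": "ACCOUNTNUM", "2": "USERNAME", "14": "ZIPCODE", "26": "O"}
label2id = {"ACCOUNTNAME": 0, "ACCOUNTNUM": 1, "USERNAME": 2, "ZIPCODE": 14, "O": 26}

# Each direction must round-trip through the other map.
assert all(label2id[name] == int(idx) for idx, name in id2label.items())
assert all(id2label[str(idx)] == name for name, idx in label2id.items())
print("label maps are consistent")
```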
Bert_base_NER_PII43k/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a55d6ddc6954f3e07a94d6211ac7f9668ffd2aa8cc824d02ae463edc025148f
+size 134
Bert_base_NER_PII43k/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
Bert_base_NER_PII43k/tokenizer.json
ADDED
The diff for this file is too large to render.
Bert_base_NER_PII43k/tokenizer_config.json
ADDED
@@ -0,0 +1,55 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
Bert_base_NER_PII43k/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:244bc12c4638255ef18230e73685dca6d51a1931705da8015039fdd9a6225f9e
+size 129
Bert_base_NER_PII43k/vocab.txt
ADDED
The diff for this file is too large to render.
README.md
CHANGED
@@ -1,14 +1,90 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: PII Detection with BERT
+emoji: π
+colorFrom: blue
+colorTo: purple
 sdk: gradio
-sdk_version:
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: Train Bert base model with PII data
 ---
 
-
+# PII Detection with BERT
+
+This Space demonstrates a BERT model fine-tuned for detecting Personal Identifiable Information (PII) in text.
+
+## Model Details
+
+- **Base Model**: [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)
+- **Training Dataset**: [ai4privacy/pii-masking-43k](https://huggingface.co/datasets/ai4privacy/pii-masking-43k)
+- **Task**: Token Classification / Named Entity Recognition (NER)
+- **Number of Entity Types**: 27
+
+## Detectable PII Types
+
+The model can identify 27 different types of personal information:
+
+### Identity Information
+- NAME, USERNAME, DISPLAYNAME, GENDER, JOB
+
+### Contact Information
+- EMAIL, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE
+
+### Financial Information
+- CREDITCARDNUM, CREDITCARDISSUER, IBAN, BIC
+- ACCOUNTNAME, ACCOUNTNUM, CURRENCY, COINADDRESS
+
+### Technical Information
+- IP, MAC, URL, USERAGENT, PASSWORD
+
+### Other
+- NUM, ORDINALDIRECTION
+
+## How It Works
+
+1. **Input**: User provides text that may contain personal information
+2. **Tokenization**: Text is split into tokens using BERT tokenizer
+3. **Classification**: Each token is classified into one of 27 entity types or "O" (no entity)
+4. **Visualization**: Detected entities are highlighted with different colors
+
+## Training Details
+
+- Learning Rate: 5e-05
+- Batch Size: 16 (train), 64 (eval)
+- Epochs: 3
+- Optimizer: Adam (β1=0.9, β2=0.999, ε=1e-08)
+- Warmup Steps: 500
+
+## Use Cases
+
+- **Data Privacy**: Identify PII before sharing documents
+- **Data Anonymization**: Find information that needs masking
+- **Compliance**: Help meet GDPR, CCPA requirements
+- **Security**: Detect sensitive information leaks
+
+## Limitations
+
+- Maximum input length: 512 tokens
+- Optimized for English text
+- May not detect all variations of PII
+- Performance depends on text format and quality
+
+## Example Usage
+
+```python
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+
+model_name = "your-username/your-space-name"  # Update after deployment
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForTokenClassification.from_pretrained(model_name)
+
+text = "My name is John Smith and my email is john@example.com"
+inputs = tokenizer(text, return_tensors="pt")
+outputs = model(**inputs)
+```
+
+## License
+
+Apache 2.0
+
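The "Example Usage" snippet in the README stops at raw `outputs`; turning the logits into label names takes an `argmax` plus the config's `id2label` map, as `app.py` below does. A minimal sketch with dummy logits, so it runs without downloading the model (label ids 10/18/26 are EMAIL/NAME/O per config.json; the token texts in the comments are illustrative):

```python
import torch

# Hypothetical subset of the model's id2label map (full map in config.json).
id2label = {10: "EMAIL", 18: "NAME", 26: "O"}

# Dummy logits for 4 tokens over 27 classes, standing in for outputs.logits.
logits = torch.full((1, 4, 27), -1.0)
logits[0, 0, 26] = 5.0  # e.g. "my"   -> O
logits[0, 1, 18] = 5.0  # e.g. "john" -> NAME
logits[0, 2, 26] = 5.0  # e.g. "is"   -> O
logits[0, 3, 10] = 5.0  # e.g. "john@example.com" -> EMAIL

# Highest-scoring class per token, then map ids to label names.
predictions = torch.argmax(logits, dim=2)
labels = [id2label[p.item()] for p in predictions[0]]
print(labels)  # ['O', 'NAME', 'O', 'EMAIL']
```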
app.py
ADDED
@@ -0,0 +1,202 @@
+"""
+HuggingFace Space App for PII Detection
+This app uses a BERT model to identify Personal Identifiable Information in text.
+"""
+
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+import torch
+
+# Load the model and tokenizer
+MODEL_PATH = "./Bert_base_NER_PII43k"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
+
+# Entity label colors for visualization
+ENTITY_COLORS = {
+    "NAME": "#FF6B6B",
+    "EMAIL": "#4ECDC4",
+    "CREDITCARDNUM": "#FFE66D",
+    "IP": "#95E1D3",
+    "PASSWORD": "#F38181",
+    "STREET": "#AA96DA",
+    "ACCOUNTNAME": "#FCBAD3",
+    "ACCOUNTNUM": "#FFFFD2",
+    "USERNAME": "#A8E6CF",
+    "ZIPCODE": "#FFD3B6",
+    "IBAN": "#FFAAA5",
+    "URL": "#FF8B94",
+    "JOB": "#C7CEEA",
+    "GENDER": "#FFDAC1",
+    "ADDRESS": "#B5EAD7",
+    "MAC": "#C9CBA3",
+    "GEO": "#FFE2E2",
+    "NEARBYGPSCOORDINATE": "#F7D9C4",
+    "COINADDRESS": "#FAACA8",
+    "CREDITCARDISSUER": "#DCD6F7",
+    "CURRENCY": "#A6D9F7",
+    "DISPLAYNAME": "#FAD9A1",
+    "NUM": "#D4F1F4",
+    "BIC": "#FFB6B9",
+    "USERAGENT": "#C2E9FB",
+    "ORDINALDIRECTION": "#F6EAC2",
+}
+
+
+def detect_pii(text):
+    """
+    Detect PII entities in the input text.
+
+    Args:
+        text (str): Input text to analyze
+
+    Returns:
+        list: Highlighted entities for Gradio display
+        str: Summary of detected entities
+    """
+    if not text.strip():
+        return None, "Please enter some text to analyze."
+
+    # Tokenize input
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+
+    # Get predictions
+    with torch.no_grad():
+        outputs = model(**inputs)
+        predictions = torch.argmax(outputs.logits, dim=2)
+
+    # Convert tokens back to words and align with predictions
+    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+    predicted_labels = [model.config.id2label[pred.item()] for pred in predictions[0]]
+
+    # Reconstruct words and their labels
+    highlighted_entities = []
+    current_word = ""
+    current_label = None
+
+    for token, label in zip(tokens, predicted_labels):
+        # Skip special tokens
+        if token in ["[CLS]", "[SEP]", "[PAD]"]:
+            continue
+
+        # Handle subword tokens (starting with ##)
+        if token.startswith("##"):
+            current_word += token[2:]
+        else:
+            # Save previous word if it exists
+            if current_word:
+                if current_label and current_label != "O":
+                    highlighted_entities.append((current_word, current_label))
+                else:
+                    highlighted_entities.append((current_word, None))
+            current_word = " "  # Add space between words
+
+            current_word += token
+        current_label = label
+
+    # Add the last word
+    if current_word.strip():
+        if current_label and current_label != "O":
+            highlighted_entities.append((current_word, current_label))
+        else:
+            highlighted_entities.append((current_word, None))
+
+    # Create summary
+    detected_entities = {}
+    for word, label in highlighted_entities:
+        if label and label != "O":
+            if label not in detected_entities:
+                detected_entities[label] = []
+            detected_entities[label].append(word.strip())
+
+    if detected_entities:
+        summary = "**Detected PII:**\n\n"
+        for entity_type, words in detected_entities.items():
+            summary += f"- **{entity_type}**: {', '.join(words)}\n"
+    else:
+        summary = "No PII detected in the text."
+
+    return highlighted_entities, summary
+
+
+# Example texts for users to try
+examples = [
+    ["My name is John Smith and my email is john.smith@example.com. I live at 123 Main Street."],
+    ["Please send the payment to IBAN GB29 NWBK 6016 1331 9268 19 or call me at my office."],
+    ["Contact Sarah Johnson at sarah.j@company.org for more details about the project."],
+    ["My credit card number is 4532-1234-5678-9010 and my username is mike_user123."],
+]
+
+# Create Gradio interface
+with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # π Personal Identifiable Information (PII) Detector
+
+        This tool uses a fine-tuned BERT model to automatically detect and highlight personal information in text.
+        It can identify **27 different types** of PII including names, emails, addresses, credit cards, and more.
+
+        ### How to use:
+        1. Enter or paste text in the box below
+        2. Click "Detect PII" to analyze
+        3. View highlighted entities and summary
+        """
+    )
+
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="Input Text",
+                placeholder="Enter text to analyze for PII...",
+                lines=6,
+            )
+            detect_btn = gr.Button("π Detect PII", variant="primary")
+
+        with gr.Column():
+            output_highlighted = gr.HighlightedText(
+                label="Highlighted PII Entities",
+                combine_adjacent=True,
+                color_map=ENTITY_COLORS,
+            )
+            output_summary = gr.Markdown(label="Summary")
+
+    gr.Markdown("### π Try these examples:")
+    gr.Examples(
+        examples=examples,
+        inputs=input_text,
+    )
+
+    gr.Markdown(
+        """
+        ### π·οΈ Detectable Entity Types:
+
+        **Identity**: NAME, USERNAME, DISPLAYNAME, GENDER, JOB
+        **Contact**: EMAIL, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE
+        **Financial**: CREDITCARDNUM, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, ACCOUNTNUM, CURRENCY, COINADDRESS
+        **Technical**: IP, MAC, URL, USERAGENT, PASSWORD
+        **Other**: NUM, ORDINALDIRECTION
+
+        ---
+        **Model**: BERT-base fine-tuned on [ai4privacy/pii-masking-43k](https://huggingface.co/datasets/ai4privacy/pii-masking-43k) dataset
+        **Base Model**: [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)
+        """
+    )
+
+    # Connect the button to the function
+    detect_btn.click(
+        fn=detect_pii,
+        inputs=input_text,
+        outputs=[output_highlighted, output_summary]
+    )
+
+    # Also trigger on Enter key
+    input_text.submit(
+        fn=detect_pii,
+        inputs=input_text,
+        outputs=[output_highlighted, output_summary]
+    )
+
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()
+
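The word-reconstruction loop in `detect_pii` above is the subtlest part of the app: BERT's WordPiece tokenizer splits rare words into `##`-prefixed continuation pieces that must be glued back together before highlighting. A simplified standalone sketch of that merging idea (not the exact `app.py` logic; here each merged word simply keeps the label of its last piece, and the token/label lists are made up for illustration):

```python
def merge_wordpieces(tokens, labels):
    """Merge BERT WordPiece tokens ('##' continuations) back into whole words,
    skipping special tokens; each word keeps the label of its last piece."""
    words = []
    for token, label in zip(tokens, labels):
        if token in ("[CLS]", "[SEP]", "[PAD]"):
            continue  # special tokens carry no surface text
        if token.startswith("##") and words:
            prev_text, _ = words[-1]
            words[-1] = (prev_text + token[2:], label)  # glue continuation piece
        else:
            words.append((token, label))
    return words

# Illustrative tokenization of "johnson @ example . com"
tokens = ["[CLS]", "john", "##son", "@", "example", ".", "com", "[SEP]"]
labels = ["O", "NAME", "NAME", "EMAIL", "EMAIL", "EMAIL", "EMAIL", "O"]
merged = merge_wordpieces(tokens, labels)
print(merged)  # [('johnson', 'NAME'), ('@', 'EMAIL'), ('example', 'EMAIL'), ('.', 'EMAIL'), ('com', 'EMAIL')]
```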
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+gradio==4.44.0
+transformers==4.45.0
+torch==2.1.0
+