AmitHirpara committed on
Commit
de46a17
Β·
1 Parent(s): 94f7fb3

add binary files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,6 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  *.pt filter=lfs diff=lfs merge=lfs -text
3
+ *.pkl filter=lfs diff=lfs merge=lfs -text
4
+ saved_lstm/** filter=lfs diff=lfs merge=lfs -text
5
+ saved_transformer/** filter=lfs diff=lfs merge=lfs -text
6
+ *.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import pickle
6
+ import os
7
+ import math
8
+ from typing import List, Tuple
9
+ from collections import Counter
10
+ import warnings
11
+ warnings.filterwarnings('ignore')
12
+
13
# Vocabulary used at inference time; must match the class pickled at training.
class Vocabulary:
    """Maps tokens/labels to integer ids and back.

    Ids 0-3 are reserved for the special tokens <pad>, <unk>, <start>
    and <end>; remaining ids are assigned by descending frequency.
    """

    def __init__(self, max_size=100000):
        specials = ['<pad>', '<unk>', '<start>', '<end>']
        self.word2idx = {w: i for i, w in enumerate(specials)}
        self.idx2word = {i: w for i, w in enumerate(specials)}
        self.word_count = Counter()
        self.max_size = max_size

    def add_sentence(self, sentence):
        """Accumulate lower-cased token frequencies from one sentence."""
        self.word_count.update(word.lower() for word in sentence)

    def build(self):
        """Assign ids to the most frequent words, up to max_size entries total."""
        budget = self.max_size - len(self.word2idx)
        for word, _ in self.word_count.most_common(budget):
            if word in self.word2idx:
                continue
            new_id = len(self.word2idx)
            self.word2idx[word] = new_id
            self.idx2word[new_id] = word

    def __len__(self):
        return len(self.word2idx)

    def encode(self, sentence):
        """Convert tokens to ids; unknown tokens map to <unk>."""
        unk = self.word2idx['<unk>']
        return [self.word2idx.get(word.lower(), unk) for word in sentence]

    def decode(self, indices):
        """Convert ids back to tokens; unknown ids map to '<unk>'."""
        return [self.idx2word.get(idx, '<unk>') for idx in indices]
+
43
# Custom Transformer components; submodule names (w_q/w_k/w_v/w_o) must stay
# as-is so the pickled state_dict keys still line up with the saved model.
class MultiHeadAttention(nn.Module):
    """Standard multi-head scaled dot-product attention with optional padding mask."""

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected, batch):
        # (batch, seq, d_model) -> (batch, heads, seq, d_k)
        return projected.view(batch, -1, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Attend `query` over `key`/`value`; `mask` is True at padding positions."""
        batch = query.size(0)

        q = self._split_heads(self.w_q(query), batch)
        k = self._split_heads(self.w_k(key), batch)
        v = self._split_heads(self.w_v(value), batch)

        # Scaled dot-product scores; masked positions are pushed to a large
        # negative value so softmax assigns them ~zero weight.
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask.unsqueeze(1).unsqueeze(1), -1e9)

        weights = self.dropout(F.softmax(scores, dim=-1))

        # Merge heads back into a single (batch, seq, d_model) tensor.
        merged = torch.matmul(weights, v).transpose(1, 2).contiguous().view(
            batch, -1, self.d_model
        )
        return self.w_o(merged)
+
87
class FeedForward(nn.Module):
    """Position-wise feed-forward sublayer: Linear -> GELU -> Dropout -> Linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = F.gelu(self.w_1(x))
        return self.w_2(self.dropout(hidden))
+
97
class EncoderLayer(nn.Module):
    """One post-norm transformer encoder layer: self-attention + FFN sublayers."""

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Residual connection + post-layer-norm around each sublayer.
        attended = self.self_attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))
        x = self.norm2(x + self.dropout(self.feed_forward(x)))
        return x
+
117
class TransformerEncoder(nn.Module):
    """Stack of EncoderLayer blocks followed by a final LayerNorm."""

    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList(
            EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        )
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        for block in self.layers:
            x = block(x, mask)
        return self.norm(x)
+
131
class PositionalEncoding(nn.Module):
    """Adds sinusoidal position information; also scales input by sqrt(d_model)."""

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        self.d_model = d_model
        positions = torch.arange(0, max_len).unsqueeze(1).float()
        freqs = torch.exp(
            torch.arange(0, d_model, 2).float()
            * -(torch.log(torch.tensor(10000.0)) / d_model)
        )
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(positions * freqs)
        table[:, 1::2] = torch.cos(positions * freqs)
        # Buffer (not a parameter): moves with .to(device) but is never trained.
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        scale = torch.sqrt(torch.tensor(self.d_model, dtype=x.dtype))
        return x * scale + self.pe[:, :x.size(1)]
+
146
class TransformerPIIDetector(nn.Module):
    """Token-level PII tagger: embedding + positional encoding + encoder + linear head."""

    def __init__(self, vocab_size, num_classes, d_model=256, num_heads=8,
                 d_ff=512, num_layers=4, dropout=0.1, max_len=512):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        # Attribute name chosen to match the checkpoint's state_dict keys.
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        self.dropout = nn.Dropout(dropout)

        # Hand-rolled encoder matching the saved model's structure.
        self.encoder = TransformerEncoder(num_layers, d_model, num_heads, d_ff, dropout)
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """x: (batch, seq) token ids; returns per-token class logits."""
        padding_mask = x.eq(0)  # id 0 is <pad>
        h = self.positional_encoding(self.embedding(x))
        h = self.encoder(self.dropout(h), padding_mask)
        return self.classifier(h)
+
167
def create_transformer_pii_model(**kwargs):
    """Factory used by the checkpoint loader; forwards config to the model class."""
    return TransformerPIIDetector(**kwargs)
+
170
class PIIDetector:
    """Wraps the trained transformer checkpoint with tokenize/predict/render helpers.

    Loads vocabularies, model config and weights from `model_dir` on
    construction and raises if any artifact is missing or unreadable.
    """

    def __init__(self, model_dir='saved_transformer'):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model_dir = model_dir
        self.model = None
        self.text_vocab = None
        self.label_vocab = None
        self.load_model()

        # Single color for all PII highlighting
        self.highlight_color = '#FF6B6B'

    def load_model(self):
        """Load the trained model and vocabularies from self.model_dir."""
        try:
            # Pickled Vocabulary instances for tokens and labels.
            vocab_path = os.path.join(self.model_dir, 'vocabularies.pkl')
            with open(vocab_path, 'rb') as f:
                vocabs = pickle.load(f)
            self.text_vocab = vocabs['text_vocab']
            self.label_vocab = vocabs['label_vocab']

            # Hyper-parameters saved at training time.
            config_path = os.path.join(self.model_dir, 'model_config.pkl')
            with open(config_path, 'rb') as f:
                model_config = pickle.load(f)

            # Rebuild the architecture and restore the weights.
            self.model = create_transformer_pii_model(**model_config)
            model_path = os.path.join(self.model_dir, 'pii_transformer_model.pt')
            self.model.load_state_dict(torch.load(model_path, map_location=self.device))
            self.model.to(self.device)
            self.model.eval()

            print(f"Model loaded successfully from {self.model_dir}")
            print(f"Using device: {self.device}")

        except Exception as e:
            print(f"Error loading model: {str(e)}")
            raise

    def tokenize(self, text: str) -> List[str]:
        """Split text into word and punctuation tokens."""
        import re
        # \w+ keeps words/numbers whole; every other non-space char is its own token.
        tokens = re.findall(r'\w+|[^\w\s]', text)
        return tokens

    def predict(self, text: str) -> List[Tuple[str, str]]:
        """Return (token, BIO-label) pairs for `text`; empty list for blank input."""
        if not text.strip():
            return []

        tokens = self.tokenize(text)

        # Wrap with the same sentinel tokens used during training.
        tokens_with_special = ['<start>'] + tokens + ['<end>']
        token_ids = self.text_vocab.encode(tokens_with_special)

        input_tensor = torch.tensor([token_ids]).to(self.device)

        with torch.no_grad():
            outputs = self.model(input_tensor)
            predictions = torch.argmax(outputs, dim=-1)

        # Map class ids back to labels, dropping the <start>/<end> positions.
        predicted_labels = []
        for idx in predictions[0][1:-1]:
            label = self.label_vocab.idx2word.get(idx.item(), 'O')
            predicted_labels.append(label.upper())

        return list(zip(tokens, predicted_labels))

    def create_highlighted_html(self, token_label_pairs: List[Tuple[str, str]]) -> str:
        """Render tokens as HTML, wrapping detected PII entities in <mark> tags.

        User-derived text is HTML-escaped before interpolation so input
        containing markup cannot inject HTML into the rendered page.
        """
        import html as html_lib

        html_parts = ['<div style="font-family: Arial, sans-serif; line-height: 1.8; padding: 20px; background-color: white; border-radius: 8px; color: black;">']

        i = 0
        while i < len(token_label_pairs):
            token, label = token_label_pairs[i]

            # Check if this is the start of a PII entity
            if label != 'O':
                # Greedily collect this token plus its I- continuation tokens.
                entity_tokens = [token]
                entity_label = label
                j = i + 1

                while j < len(token_label_pairs):
                    next_token, next_label = token_label_pairs[j]
                    if next_label.startswith('I-') and next_label.replace('I-', 'B-') == entity_label:
                        entity_tokens.append(next_token)
                        j += 1
                    else:
                        break

                # Re-join the entity, keeping punctuation attached to the left.
                entity_text = ''
                for k, tok in enumerate(entity_tokens):
                    if k > 0 and tok not in '.,!?;:':
                        entity_text += ' '
                    entity_text += tok

                label_display = entity_label.replace('B-', '').replace('I-', '').replace('_', ' ')
                # Escape user text and the label before embedding them in markup.
                html_parts.append(
                    f'<mark style="background-color: {self.highlight_color}; padding: 2px 4px; '
                    f'border-radius: 3px; margin: 0 2px; font-weight: 500;" '
                    f'title="{html_lib.escape(label_display)}">{html_lib.escape(entity_text)}</mark>'
                )

                i = j
            else:
                # Insert a space before ordinary tokens, except after '(' and
                # before closing punctuation.
                if i > 0 and token not in '.,!?;:' and len(token_label_pairs) > i-1:
                    prev_token, _ = token_label_pairs[i-1]
                    if prev_token not in '(':
                        html_parts.append(' ')

                html_parts.append(f'<span style="color: black;">{html_lib.escape(token)}</span>')
                i += 1

        html_parts.append('</div>')

        return ''.join(html_parts)

    def get_statistics(self, token_label_pairs: List[Tuple[str, str]]) -> str:
        """Build a Markdown summary of how many tokens were flagged as PII."""
        stats = {}
        total_tokens = len(token_label_pairs)
        pii_tokens = 0

        for _, label in token_label_pairs:
            if label != 'O':
                pii_tokens += 1
                # Clean up label for display
                label_clean = label.replace('B-', '').replace('I-', '').replace('_', ' ')
                stats[label_clean] = stats.get(label_clean, 0) + 1

        # Guard against division by zero on an empty token list (e.g. the
        # caller passed predictions for whitespace-only input).
        pii_pct = (pii_tokens / total_tokens * 100) if total_tokens else 0.0

        stats_text = f"### Detection Summary\n\n"
        stats_text += f"**Total tokens:** {total_tokens}\n\n"
        stats_text += f"**PII tokens:** {pii_tokens} ({pii_pct:.1f}%)\n\n"

        if not stats:
            stats_text += "*No PII detected in the text.*"

        return stats_text
+
326
# Initialize the detector once at import time so Gradio callbacks reuse it.
print("Initializing PII Detector...")
detector = PIIDetector()

def detect_pii(text):
    """Gradio callback: return (highlighted_html, stats_markdown) for `text`."""
    # Treat whitespace-only input the same as empty input; otherwise it would
    # reach the statistics path with zero tokens.
    if not text or not text.strip():
        return "<p style='color: #6c757d; padding: 20px;'>Please enter some text to analyze.</p>", "No text provided."

    try:
        # Get predictions
        token_label_pairs = detector.predict(text)

        # Create highlighted HTML
        highlighted_html = detector.create_highlighted_html(token_label_pairs)

        # Get statistics
        stats = detector.get_statistics(token_label_pairs)

        return highlighted_html, stats

    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        error_html = f'<div style="color: #dc3545; padding: 20px; background-color: #f8d7da; border-radius: 8px;">Error: {str(e)}</div>'
        error_stats = f"Error occurred: {str(e)}"
        return error_html, error_stats
+
352
# Sample inputs for quick manual testing of the detector.
examples = [
    "My name is John Smith and my email is john.smith@email.com. You can reach me at 555-123-4567.",
    "Student ID: 12345678. Please send the documents to 123 Main Street, Anytown, USA 12345.",
    "Contact Sarah Johnson at sarah_j_2023@gmail.com or visit her profile at linkedin.com/in/sarahjohnson",
    "The project was completed by student A1B2C3D4 who lives at 456 Oak Avenue.",
    "For verification, my phone number is (555) 987-6543 and my username is cool_user_99.",
    "Hi, I'm Emily Chen. My student number is STU-2023-98765 and I live at 789 Pine Street, Apt 4B.",
    "You can reach me at my personal website: www.johndoe.com or call me at +1-555-0123.",
]
+
363
# ---- Gradio UI: input box, detect/clear buttons, HTML + stats outputs ----
with gr.Blocks(title="PII Detection System", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # πŸ”’ PII Detection System

        Enter or paste text below to analyze it for PII content.
        """
    )

    with gr.Column():
        text_box = gr.Textbox(
            label="Input Text",
            placeholder="Enter text to analyze for PII...",
            lines=8,
            max_lines=20,
        )

        with gr.Row():
            detect_button = gr.Button("πŸ” Detect PII", variant="primary", scale=2)
            reset_button = gr.Button("πŸ—‘οΈ Clear", scale=1)

        html_view = gr.HTML(
            label="Highlighted Text",
            value="<p style='color: #6c757d; padding: 20px;'>Results will appear here after analysis...</p>",
        )

        stats_view = gr.Markdown(
            label="Detection Statistics",
            value="*Statistics will appear here...*",
        )

    # Wire buttons: run detection / restore the placeholder values.
    detect_button.click(
        fn=detect_pii,
        inputs=[text_box],
        outputs=[html_view, stats_view],
    )

    reset_button.click(
        fn=lambda: ("", "<p style='color: #6c757d; padding: 20px;'>Results will appear here after analysis...</p>", "*Statistics will appear here...*"),
        outputs=[text_box, html_view, stats_view],
    )

# Start the local server when executed as a script.
if __name__ == "__main__":
    print("\nLaunching Gradio interface...")
    demo.launch(
        share=False,
        server_name="127.0.0.1",
        server_port=7860,
        show_error=True,
    )
data_augmentation.py ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from faker import Faker
3
+ import pandas as pd
4
+ import numpy as np
5
+ from collections import Counter
6
+ import torch
7
+
8
class PIIDataAugmenter:
    """
    Generates synthetic PII examples to augment training data.

    Produces template-based sentences containing fake names, emails, phone
    numbers, addresses, IDs, URLs and usernames, labelled with BIO tags.
    """

    def __init__(self, seed=42):
        """Seed every randomness source so augmentation is reproducible."""
        random.seed(seed)
        np.random.seed(seed)
        self.fake = Faker()
        Faker.seed(seed)

        self._init_templates()
        self._init_context_phrases()
        self._init_generators()

    def _init_templates(self):
        """Build the sentence templates, ten per PII type."""
        self.templates = {
            'NAME_STUDENT': [
                "My name is {name}",
                "I am {name}",
                "This is {name} speaking",
                "Student: {name}",
                "{name} here",
                "Submitted by {name}",
                "Author: {name}",
                "Contact {name} for more information",
                "Please call {name}",
                "{name} is my name"
            ],
            'EMAIL': [
                "Email me at {email}",
                "My email is {email}",
                "Contact: {email}",
                "Send to {email}",
                "Reach me at {email}",
                "Email address: {email}",
                "You can email {email}",
                "Write to {email}",
                "My contact email is {email}",
                "Send your response to {email}"
            ],
            'PHONE_NUM': [
                "Call me at {phone}",
                "My phone number is {phone}",
                "Phone: {phone}",
                "Contact number: {phone}",
                "Reach me at {phone}",
                "My number is {phone}",
                "You can call {phone}",
                "Mobile: {phone}",
                "Tel: {phone}",
                "Phone contact: {phone}"
            ],
            'STREET_ADDRESS': [
                "I live at {address}",
                "My address is {address}",
                "Located at {address}",
                "Address: {address}",
                "Find me at {address}",
                "Residence: {address}",
                "Mail to {address}",
                "Home address: {address}",
                "Visit us at {address}",
                "Ship to {address}"
            ],
            'ID_NUM': [
                "ID: {id_num}",
                "Student ID: {id_num}",
                "ID number {id_num}",
                "Reference number: {id_num}",
                "Account: {id_num}",
                "Member ID: {id_num}",
                "Registration: {id_num}",
                "Code: {id_num}",
                "Identification: {id_num}",
                "Number: {id_num}"
            ],
            'URL_PERSONAL': [
                "Visit my website at {url}",
                "Check out {url}",
                "My portfolio: {url}",
                "Website: {url}",
                "Link: {url}",
                "Find me online at {url}",
                "Personal site: {url}",
                "URL: {url}",
                "Web: {url}",
                "Online at {url}"
            ],
            'USERNAME': [
                "Username: {username}",
                "User: {username}",
                "Handle: {username}",
                "My username is {username}",
                "Find me as {username}",
                "Account: {username}",
                "Login: {username}",
                "Profile: {username}",
                "Known as {username}",
                "Tag me @{username}"
            ]
        }

    def _init_context_phrases(self):
        """Build the optional lead-ins, sign-offs and connector phrases."""
        self.context_prefix = [
            "Hello everyone,",
            "Dear Sir/Madam,",
            "To whom it may concern,",
            "Please note that",
            "For your reference,",
            "As requested,",
            "I would like to inform you that",
            "This is to confirm that",
            "Please be advised that",
            "I am writing to tell you that"
        ]

        self.context_suffix = [
            "Thank you.",
            "Best regards.",
            "Please let me know if you need anything else.",
            "Looking forward to your response.",
            "Have a great day!",
            "Thanks for your attention.",
            "Feel free to contact me.",
            "I appreciate your help.",
            "Hope this helps.",
            "Let me know if you have questions."
        ]

        self.connectors = [
            " and ", " or ", ", ", ". Also, ", ". Additionally, "
        ]

    def _init_generators(self):
        """Map each PII tag to its value generator and template placeholder name."""
        spec = {
            'NAME_STUDENT': (self.generate_name, 'name'),
            'EMAIL': (self.generate_email, 'email'),
            'PHONE_NUM': (self.generate_phone, 'phone'),
            'STREET_ADDRESS': (self.generate_address, 'address'),
            'ID_NUM': (self.generate_id_num, 'id_num'),
            'URL_PERSONAL': (self.generate_url, 'url'),
            'USERNAME': (self.generate_username, 'username'),
        }
        self.generators = {tag: gen for tag, (gen, _) in spec.items()}
        self.format_keys = {tag: key for tag, (_, key) in spec.items()}

    # ========== PII value generators ==========

    def generate_name(self):
        """Random realistic full name."""
        return self.fake.name()

    def generate_email(self):
        """Random realistic email address."""
        return self.fake.email()

    def generate_phone(self):
        """Random phone number in one of several common formats."""
        formats = [
            "555-{:03d}-{:04d}",
            "(555) {:03d}-{:04d}",
            "555.{:03d}.{:04d}",
            "+1-555-{:03d}-{:04d}",
            "555{:03d}{:04d}"
        ]
        chosen = random.choice(formats)
        area = random.randint(100, 999)
        number = random.randint(1000, 9999)
        return chosen.format(area, number)

    def generate_address(self):
        """Random street address collapsed onto a single line."""
        return self.fake.address().replace('\n', ', ')

    def generate_id_num(self):
        """Random ID in one of several numeric / prefixed formats."""
        formats = [
            "{:06d}",         # 6-digit ID
            "{:08d}",         # 8-digit ID
            "ID{:05d}",       # ID prefix
            "STU{:06d}",      # Student ID
            "{:04d}-{:04d}",  # Hyphenated
            "A{:07d}",        # Letter prefix
        ]
        chosen = random.choice(formats)

        if '-' in chosen:
            return chosen.format(
                random.randint(1000, 9999),
                random.randint(1000, 9999)
            )
        return chosen.format(random.randint(10000, 9999999))

    def generate_url(self):
        """Random personal-profile style URL."""
        domains = ['github.com', 'linkedin.com', 'portfolio.com',
                   'personal.com', 'website.com']
        handle = self.fake.user_name()
        site = random.choice(domains)
        return f"https://{site}/{handle}"

    def generate_username(self):
        """Random username."""
        return self.fake.user_name()

    # ========== Example construction ==========

    def create_synthetic_example(self, pii_type, add_context=True):
        """
        Build one sentence containing a single PII entity, BIO-labelled.

        Args:
            pii_type: PII tag to generate (key of self.templates)
            add_context: randomly wrap the sentence in context phrases

        Returns:
            Tuple of (tokens, labels)
        """
        value = self.generators[pii_type]()

        template = random.choice(self.templates[pii_type])
        sentence = template.format(**{self.format_keys[pii_type]: value})

        if add_context and random.random() > 0.3:
            sentence = self._add_context(sentence)

        return self._tokenize_and_label(sentence, value, pii_type)

    def create_mixed_example(self, pii_types, num_pii=2):
        """
        Build a sentence containing several PII entities.

        Args:
            pii_types: candidate PII tags
            num_pii: how many entities to embed

        Returns:
            Tuple of (tokens, labels)
        """
        chosen = random.sample(pii_types, min(num_pii, len(pii_types)))

        tokens_out = []
        labels_out = []

        # Optional greeting / preamble.
        if random.random() > 0.3:
            prefix_words = random.choice(self.context_prefix).split()
            tokens_out += prefix_words
            labels_out += ['O'] * len(prefix_words)

        for position, tag in enumerate(chosen):
            # Optionally glue consecutive entities with a connector phrase.
            if position > 0 and random.random() > 0.5:
                joiner_words = random.choice(self.connectors).strip().split()
                tokens_out += joiner_words
                labels_out += ['O'] * len(joiner_words)

            entity_tokens, entity_labels = self.create_synthetic_example(tag, add_context=False)
            tokens_out += entity_tokens
            labels_out += entity_labels

        # Optional sign-off.
        if random.random() > 0.3:
            suffix_words = random.choice(self.context_suffix).split()
            tokens_out += suffix_words
            labels_out += ['O'] * len(suffix_words)

        return tokens_out, labels_out

    def _add_context(self, sentence):
        """Randomly prepend/append context phrases around `sentence`."""
        if random.random() > 0.5:
            sentence = random.choice(self.context_prefix) + " " + sentence
        if random.random() > 0.5:
            sentence = sentence + " " + random.choice(self.context_suffix)
        return sentence

    def _tokenize_and_label(self, sentence, pii_value, pii_type):
        """
        Whitespace-tokenize `sentence` and BIO-label the first occurrence of
        `pii_value` with B-/I-{pii_type}; every other token is labelled 'O'.

        Returns:
            Tuple of (tokens, labels)
        """
        tokens = sentence.split()
        labels = ['O'] * len(tokens)

        value_tokens = pii_value.split()
        span = len(value_tokens)

        for start in range(len(tokens) - span + 1):
            window = tokens[start:start + span]
            if window == value_tokens or ' '.join(window).lower() == pii_value.lower():
                labels[start] = f'B-{pii_type}'
                for offset in range(1, span):
                    labels[start + offset] = f'I-{pii_type}'
                break

        return tokens, labels

    # ========== Dataset-level augmentation ==========

    def augment_dataset(self, original_data, target_samples_per_class=1000, mix_ratio=0.3):
        """
        Top up every PII class with synthetic examples.

        Args:
            original_data: DataFrame with 'tokens' and 'labels' columns
            target_samples_per_class: desired entity count per PII class
            mix_ratio: fraction of synthetic examples with multiple entities

        Returns:
            Shuffled DataFrame of original plus synthetic rows
        """
        current = self._analyze_label_distribution(original_data)
        print("\nOriginal label distribution:")
        self._print_distribution(current)

        new_tokens, new_labels = self._generate_synthetic_data(
            current, target_samples_per_class, mix_ratio
        )

        new_tokens, new_labels = self._add_non_pii_examples(new_tokens, new_labels)

        combined = self._combine_and_shuffle(original_data, new_tokens, new_labels)

        print("\nAugmented label distribution:")
        self._print_distribution(self._analyze_label_distribution(combined))

        return combined

    def _analyze_label_distribution(self, data):
        """Count entity tokens per base class (B-/I- prefixes stripped)."""
        counts = Counter()
        for sentence_labels in data['labels']:
            for tag in sentence_labels:
                if tag == 'O':
                    continue
                counts[tag.split('-')[1] if '-' in tag else tag] += 1
        return counts

    def _print_distribution(self, label_counts):
        """Pretty-print class counts with percentages."""
        total = sum(label_counts.values())
        for tag, count in label_counts.most_common():
            share = (count / total * 100) if total > 0 else 0
            print(f" {tag:15} : {count:6,} ({share:5.2f}%)")

    def _generate_synthetic_data(self, label_counts, target_samples, mix_ratio):
        """Create enough synthetic rows to lift each class toward the target."""
        synthetic_tokens = []
        synthetic_labels = []

        for tag in self.templates.keys():
            shortfall = max(0, target_samples - label_counts.get(tag, 0))
            if shortfall == 0:
                continue

            print(f"\nGenerating {shortfall} synthetic examples for {tag}")

            # Single-entity sentences.
            for _ in range(int(shortfall * (1 - mix_ratio))):
                toks, labs = self.create_synthetic_example(tag)
                synthetic_tokens.append(toks)
                synthetic_labels.append(labs)

            # Multi-entity sentences that always include this class.
            for _ in range(int(shortfall * mix_ratio)):
                others = [t for t in self.templates.keys() if t != tag]
                combo = [tag] + random.sample(others, min(1, len(others)))
                toks, labs = self.create_mixed_example(combo, num_pii=2)
                synthetic_tokens.append(toks)
                synthetic_labels.append(labs)

        return synthetic_tokens, synthetic_labels

    def _add_non_pii_examples(self, synthetic_tokens, synthetic_labels):
        """Pad the synthetic set with ~10% PII-free sentences for balance."""
        for _ in range(int(len(synthetic_tokens) * 0.1)):
            words = self.fake.text(max_nb_chars=100).split()
            synthetic_tokens.append(words)
            synthetic_labels.append(['O'] * len(words))

        return synthetic_tokens, synthetic_labels

    def _combine_and_shuffle(self, original_data, synthetic_tokens, synthetic_labels):
        """Concatenate original and synthetic rows, then shuffle deterministically."""
        frame = pd.DataFrame({
            'tokens': original_data['tokens'].tolist() + synthetic_tokens,
            'labels': original_data['labels'].tolist() + synthetic_labels
        })

        # Fixed random_state keeps the shuffle reproducible across runs.
        frame = frame.sample(frac=1, random_state=42).reset_index(drop=True)

        print(f"\nTotal augmented samples: {len(frame):,}")

        return frame
+
474
def calculate_class_weights(data, label_vocab):
    """
    Inverse-frequency class weights for a balanced token-classification loss.

    Args:
        data: dataset with a 'labels' column of per-sentence label lists
        label_vocab: Vocabulary mapping lower-cased label strings to class ids

    Returns:
        1-D float tensor of length len(label_vocab); padding class weight is 0
    """
    # Count how often each class id occurs across the whole dataset.
    counts = Counter(
        label_vocab.word2idx.get(label.lower(), 0)
        for sentence_labels in data['labels']
        for label in sentence_labels
    )

    total = sum(counts.values())
    num_classes = len(label_vocab)

    weights = torch.zeros(num_classes)

    # Inverse frequency: rarer classes receive larger weights.
    for class_id, count in counts.items():
        if count > 0:
            weights[class_id] = total / (num_classes * count)

    # Rescale so the weights average to 1 across classes.
    weights = weights / weights.sum() * num_classes

    # Clamp extremes to keep the loss numerically stable.
    weights = torch.clamp(weights, min=0.1, max=10.0)

    # Never penalise the padding class.
    weights[0] = 0.0

    return weights
+
515
if __name__ == '__main__':
    # Demo: augment the training set and write the result next to it.
    print("Loading original training data...")
    source_df = pd.read_json('train.json')
    print(f"Original dataset size: {len(source_df):,}")

    augmenter = PIIDataAugmenter(seed=42)

    banner = "=" * 60
    print("\n" + banner)
    print("Starting data augmentation...")
    print(banner)

    augmented = augmenter.augment_dataset(
        source_df,
        target_samples_per_class=2000,
        mix_ratio=0.3,
    )

    output_path = './train_augmented.json'
    augmented.to_json(output_path, orient='records', lines=True)
    print(f"\nSaved augmented data to {output_path}")
lstm.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence
5
+
6
class LSTMCell(nn.Module):
    """
    Single-timestep LSTM cell implemented from scratch.

    Keeps a separate input-to-hidden matrix, hidden-to-hidden matrix, and bias
    for each of the four gates: input (i), forget (f), candidate/"input node"
    (n), and output (o), following the standard LSTM update equations.
    """
    def __init__(self, input_size: int, hidden_size: int):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Register one (W_i*, W_h*, b_*) triple per gate. Registration order
        # and parameter names match the conventional layout (i, f, n, o) so
        # saved state_dicts stay compatible.
        for gate in ('i', 'f', 'n', 'o'):
            setattr(self, f'W_i{gate}', nn.Parameter(torch.Tensor(input_size, hidden_size)))
            setattr(self, f'W_h{gate}', nn.Parameter(torch.Tensor(hidden_size, hidden_size)))
            setattr(self, f'b_{gate}', nn.Parameter(torch.Tensor(hidden_size)))

        # Xavier-uniform for weight matrices, zeros for biases.
        for name, param in self.named_parameters():
            if 'W_' in name:
                nn.init.xavier_uniform_(param)
            elif 'b_' in name:
                nn.init.zeros_(param)

    def forward(self, input: torch.Tensor, states: tuple[torch.Tensor, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Run one LSTM time step.

        Args:
            input: current-step features, shape [batch_size, input_size].
            states: (hidden, cell) from the previous step, each of shape
                [batch_size, hidden_size].

        Returns:
            (new_hidden, new_cell), each [batch_size, hidden_size].
        """
        h_prev, c_prev = states

        def preact(W_x, W_h, b):
            # Affine combination of the current input and previous hidden state.
            return input @ W_x + h_prev @ W_h + b

        i_t = torch.sigmoid(preact(self.W_ii, self.W_hi, self.b_i))
        f_t = torch.sigmoid(preact(self.W_if, self.W_hf, self.b_f))
        n_t = torch.tanh(preact(self.W_in, self.W_hn, self.b_n))
        o_t = torch.sigmoid(preact(self.W_io, self.W_ho, self.b_o))

        # c_t = f_t * c_{t-1} + i_t * n_t ; h_t = o_t * tanh(c_t)
        c_t = f_t * c_prev + i_t * n_t
        h_t = o_t * torch.tanh(c_t)

        return h_t, c_t
76
+
77
class BidirectionalLSTM(nn.Module):
    """
    Multi-layer bidirectional LSTM built from custom LSTMCell instances.

    Accepts either a padded tensor or a PackedSequence. Output feature size is
    hidden_size * 2 (forward and backward directions concatenated).
    """
    def __init__(self, input_size: int, hidden_size: int, num_layers: int = 1,
                 batch_first: bool = True, dropout: float = 0.0):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        # Dropout only applies between stacked layers, so a single-layer
        # network silently disables it (mirrors nn.LSTM semantics).
        self.dropout = dropout if num_layers > 1 else 0.0

        # Create forward and backward cells for each layer
        self.forward_cells = nn.ModuleList()
        self.backward_cells = nn.ModuleList()
        self.dropout_layers = nn.ModuleList() if self.dropout > 0 else None

        for layer in range(num_layers):
            # Input size is input_size for first layer, hidden_size * 2 for others (bidirectional)
            layer_input_size = input_size if layer == 0 else hidden_size * 2

            self.forward_cells.append(LSTMCell(layer_input_size, hidden_size))
            self.backward_cells.append(LSTMCell(layer_input_size, hidden_size))

            # One dropout module per inter-layer gap (num_layers - 1 total).
            if self.dropout > 0 and layer < num_layers - 1:
                self.dropout_layers.append(nn.Dropout(dropout))

    def forward(self, input, states=None, lengths=None):
        """
        Run the full stack over a sequence batch.

        Args:
            input: padded tensor ([batch, seq, feat] if batch_first) or a
                PackedSequence.
            states: optional (h0, c0), each [num_layers * 2, batch, hidden].
            lengths: optional per-sequence lengths (currently only recorded;
                see note in _forward_unpacked).

        Returns:
            (output, (h_n, c_n)); output is packed iff the input was packed.
        """
        # Handle PackedSequence input: unpack, run the padded path, re-pack.
        is_packed = isinstance(input, PackedSequence)
        if is_packed:
            padded, lengths = pad_packed_sequence(input, batch_first=self.batch_first)
            outputs, (h_n, c_n) = self._forward_unpacked(padded, states, lengths)
            packed_out = pack_padded_sequence(
                outputs, lengths,
                batch_first=self.batch_first,
                enforce_sorted=False
            )
            return packed_out, (h_n, c_n)
        else:
            return self._forward_unpacked(input, states, lengths)

    def _forward_unpacked(self, input: torch.Tensor, states, lengths=None):
        """Core recurrence over a padded [batch, seq, feat] tensor.

        NOTE(review): `lengths` is accepted but never used below -- every
        timestep, including padding, is processed in both directions, so the
        final states for short sequences include padding steps. Confirm this
        is acceptable for the downstream token-level loss (padding positions
        appear to be masked there).
        """
        # Normalize to batch-first layout for the time loop.
        if not self.batch_first:
            input = input.transpose(0, 1)

        batch_size, seq_len, _ = input.size()

        # Initialize states if not provided (zeros on the input's device/dtype)
        if states is None:
            h_t_forward = [input.new_zeros(batch_size, self.hidden_size)
                           for _ in range(self.num_layers)]
            c_t_forward = [input.new_zeros(batch_size, self.hidden_size)
                           for _ in range(self.num_layers)]
            h_t_backward = [input.new_zeros(batch_size, self.hidden_size)
                            for _ in range(self.num_layers)]
            c_t_backward = [input.new_zeros(batch_size, self.hidden_size)
                            for _ in range(self.num_layers)]
        else:
            h0, c0 = states
            # h0 and c0 are [num_layers * 2, batch_size, hidden_size]
            # laid out as (layer0-fwd, layer0-bwd, layer1-fwd, ...).
            h_t_forward = []
            c_t_forward = []
            h_t_backward = []
            c_t_backward = []

            for layer in range(self.num_layers):
                h_t_forward.append(h0[layer * 2])
                c_t_forward.append(c0[layer * 2])
                h_t_backward.append(h0[layer * 2 + 1])
                c_t_backward.append(c0[layer * 2 + 1])

        # Process through layers; each layer consumes the previous layer's
        # concatenated (forward + backward) output.
        layer_input = input
        for layer_idx in range(self.num_layers):
            # Forward direction: left-to-right over time.
            forward_output = input.new_zeros(batch_size, seq_len, self.hidden_size)
            for t in range(seq_len):
                x = layer_input[:, t, :]
                h, c = self.forward_cells[layer_idx](x, (h_t_forward[layer_idx], c_t_forward[layer_idx]))
                h_t_forward[layer_idx] = h
                c_t_forward[layer_idx] = c
                forward_output[:, t, :] = h

            # Backward direction: right-to-left, written back at position t so
            # forward/backward features line up per timestep.
            backward_output = input.new_zeros(batch_size, seq_len, self.hidden_size)
            for t in reversed(range(seq_len)):
                x = layer_input[:, t, :]
                h, c = self.backward_cells[layer_idx](x, (h_t_backward[layer_idx], c_t_backward[layer_idx]))
                h_t_backward[layer_idx] = h
                c_t_backward[layer_idx] = c
                backward_output[:, t, :] = h

            # Concatenate forward and backward -> [batch, seq, 2*hidden]
            layer_output = torch.cat([forward_output, backward_output], dim=2)

            # Apply dropout between layers (except last layer)
            if self.dropout > 0 and layer_idx < self.num_layers - 1:
                layer_output = self.dropout_layers[layer_idx](layer_output)

            layer_input = layer_output

        # Final output
        outputs = layer_output

        # Stack hidden and cell states into [num_layers * 2, batch, hidden],
        # same (fwd, bwd) interleaving expected for incoming states.
        h_n = []
        c_n = []
        for layer in range(self.num_layers):
            h_n.extend([h_t_forward[layer], h_t_backward[layer]])
            c_n.extend([c_t_forward[layer], c_t_backward[layer]])
        h_n = torch.stack(h_n, dim=0)
        c_n = torch.stack(c_n, dim=0)

        # Restore the caller's time-major layout if requested.
        if not self.batch_first:
            outputs = outputs.transpose(0, 1)

        return outputs, (h_n, c_n)
196
+
197
class LSTM(nn.Module):
    """
    Bidirectional LSTM tagger for PII detection (token-level sequence labeling).

    Pipeline: token-id embedding -> dropout -> multi-layer bidirectional LSTM
    -> dropout -> linear projection to per-token class logits.
    """
    def __init__(self, vocab_size: int, num_classes: int, embed_size: int = 128,
                 hidden_size: int = 256, num_layers: int = 2, dropout: float = 0.1,
                 max_len: int = 512):
        super().__init__()

        self.vocab_size = vocab_size
        self.num_classes = num_classes
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Token embeddings; index 0 is the padding token.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_dropout = nn.Dropout(dropout)

        # Custom multi-layer bidirectional recurrent encoder.
        self.lstm = BidirectionalLSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0
        )

        # Encoder concatenates both directions, hence the doubled input width.
        # NOTE(review): max_len is accepted but unused here -- presumably kept
        # for interface parity with the transformer variant.
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.output_dropout = nn.Dropout(dropout)

    def forward(self, input_ids, lengths=None):
        """
        Compute per-token class logits.

        Args:
            input_ids: token ids, shape [batch_size, seq_len].
            lengths: optional true sequence lengths; when given, sequences are
                packed so the recurrent encoder can skip trailing padding.

        Returns:
            logits of shape [batch_size, seq_len, num_classes].
        """
        embedded = self.embed_dropout(self.embedding(input_ids))

        if lengths is None:
            encoded, _ = self.lstm(embedded)
        else:
            # pack_padded_sequence requires lengths on the CPU.
            packed = pack_padded_sequence(
                embedded, lengths.cpu(),
                batch_first=True,
                enforce_sorted=False
            )
            packed_out, _ = self.lstm(packed)
            encoded, _ = pad_packed_sequence(packed_out, batch_first=True)

        return self.fc(self.output_dropout(encoded))
260
+
261
def create_lstm_pii_model(vocab_size: int, num_classes: int, d_model: int = 256,
                          num_heads: int = 8, d_ff: int = 512, num_layers: int = 4,
                          dropout: float = 0.1, max_len: int = 512):
    """
    Factory for the bidirectional LSTM PII-detection model.

    The signature deliberately mirrors the transformer factory, so
    ``num_heads`` and ``d_ff`` are accepted but ignored.

    Args:
        vocab_size: size of the token vocabulary.
        num_classes: number of output classes (PII tags).
        d_model: hidden dimension; the embedding uses half of it.
        num_heads: ignored (interface compatibility).
        d_ff: ignored (interface compatibility).
        num_layers: number of stacked LSTM layers.
        dropout: dropout rate.
        max_len: maximum sequence length.

    Returns:
        A configured LSTM model instance.
    """
    config = dict(
        vocab_size=vocab_size,
        num_classes=num_classes,
        embed_size=d_model // 2,  # embedding is half the recurrent width
        hidden_size=d_model,
        num_layers=num_layers,
        dropout=dropout,
        max_len=max_len,
    )
    return LSTM(**config)
lstm_training.ipynb ADDED
@@ -0,0 +1,1350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "6bce68a8",
7
+ "metadata": {
8
+ "execution": {
9
+ "iopub.execute_input": "2025-08-03T18:03:08.438040Z",
10
+ "iopub.status.busy": "2025-08-03T18:03:08.437435Z",
11
+ "iopub.status.idle": "2025-08-03T18:03:15.190888Z",
12
+ "shell.execute_reply": "2025-08-03T18:03:15.190285Z"
13
+ },
14
+ "papermill": {
15
+ "duration": 6.758353,
16
+ "end_time": "2025-08-03T18:03:15.192202",
17
+ "exception": false,
18
+ "start_time": "2025-08-03T18:03:08.433849",
19
+ "status": "completed"
20
+ },
21
+ "tags": []
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "import torch\n",
26
+ "import torch.nn as nn\n",
27
+ "import torch.optim as optim\n",
28
+ "from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler\n",
29
+ "from torch.nn.utils.rnn import pad_sequence\n",
30
+ "import pandas as pd\n",
31
+ "import numpy as np\n",
32
+ "from sklearn.model_selection import train_test_split\n",
33
+ "from collections import Counter\n",
34
+ "import pickle\n",
35
+ "from tqdm import tqdm\n",
36
+ "import matplotlib.pyplot as plt\n",
37
+ "import os\n",
38
+ "from datetime import datetime\n",
39
+ "from lstm import create_lstm_pii_model\n",
40
+ "from data_augmentation import calculate_class_weights"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 4,
46
+ "id": "1207cd93",
47
+ "metadata": {
48
+ "execution": {
49
+ "iopub.execute_input": "2025-08-03T18:03:15.199050Z",
50
+ "iopub.status.busy": "2025-08-03T18:03:15.198726Z",
51
+ "iopub.status.idle": "2025-08-03T18:03:15.205267Z",
52
+ "shell.execute_reply": "2025-08-03T18:03:15.204584Z"
53
+ },
54
+ "papermill": {
55
+ "duration": 0.010986,
56
+ "end_time": "2025-08-03T18:03:15.206321",
57
+ "exception": false,
58
+ "start_time": "2025-08-03T18:03:15.195335",
59
+ "status": "completed"
60
+ },
61
+ "tags": []
62
+ },
63
+ "outputs": [],
64
+ "source": [
65
+ "class Vocabulary:\n",
66
+ " \"\"\"Vocabulary class for encoding/decoding text and labels\"\"\"\n",
67
+ " def __init__(self, max_size=100000):\n",
68
+ " self.word2idx = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}\n",
69
+ " self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<start>', 3: '<end>'}\n",
70
+ " self.word_count = Counter()\n",
71
+ " self.max_size = max_size\n",
72
+ " \n",
73
+ " def add_sentence(self, sentence):\n",
74
+ " for word in sentence:\n",
75
+ " self.word_count[word.lower()] += 1\n",
76
+ " \n",
77
+ " def build(self):\n",
78
+ " most_common = self.word_count.most_common(self.max_size - len(self.word2idx))\n",
79
+ " for word, _ in most_common:\n",
80
+ " if word not in self.word2idx:\n",
81
+ " idx = len(self.word2idx)\n",
82
+ " self.word2idx[word] = idx\n",
83
+ " self.idx2word[idx] = word\n",
84
+ " \n",
85
+ " def __len__(self):\n",
86
+ " return len(self.word2idx)\n",
87
+ " \n",
88
+ " def encode(self, sentence):\n",
89
+ " return [self.word2idx.get(word.lower(), self.word2idx['<unk>']) for word in sentence]\n",
90
+ " \n",
91
+ " def decode(self, indices):\n",
92
+ " return [self.idx2word.get(idx, '<unk>') for idx in indices]"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 5,
98
+ "id": "f4056292",
99
+ "metadata": {
100
+ "execution": {
101
+ "iopub.execute_input": "2025-08-03T18:03:15.212478Z",
102
+ "iopub.status.busy": "2025-08-03T18:03:15.212272Z",
103
+ "iopub.status.idle": "2025-08-03T18:03:15.217352Z",
104
+ "shell.execute_reply": "2025-08-03T18:03:15.216675Z"
105
+ },
106
+ "papermill": {
107
+ "duration": 0.009321,
108
+ "end_time": "2025-08-03T18:03:15.218370",
109
+ "exception": false,
110
+ "start_time": "2025-08-03T18:03:15.209049",
111
+ "status": "completed"
112
+ },
113
+ "tags": []
114
+ },
115
+ "outputs": [],
116
+ "source": [
117
+ "class PIIDataset(Dataset):\n",
118
+ " \"\"\"PyTorch Dataset for PII detection\"\"\"\n",
119
+ " def __init__(self, tokens, labels, text_vocab, label_vocab, max_len=512):\n",
120
+ " self.tokens = tokens\n",
121
+ " self.labels = labels\n",
122
+ " self.text_vocab = text_vocab\n",
123
+ " self.label_vocab = label_vocab\n",
124
+ " self.max_len = max_len\n",
125
+ " \n",
126
+ " def __len__(self):\n",
127
+ " return len(self.tokens)\n",
128
+ " \n",
129
+ " def __getitem__(self, idx):\n",
130
+ " # Add start and end tokens\n",
131
+ " tokens = ['<start>'] + self.tokens[idx] + ['<end>']\n",
132
+ " labels = ['<start>'] + self.labels[idx] + ['<end>']\n",
133
+ " \n",
134
+ " # Truncate if too long\n",
135
+ " if len(tokens) > self.max_len:\n",
136
+ " tokens = tokens[:self.max_len-1] + ['<end>']\n",
137
+ " labels = labels[:self.max_len-1] + ['<end>']\n",
138
+ " \n",
139
+ " # Encode\n",
140
+ " token_ids = self.text_vocab.encode(tokens)\n",
141
+ " label_ids = self.label_vocab.encode(labels)\n",
142
+ " \n",
143
+ " return torch.tensor(token_ids), torch.tensor(label_ids)"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 6,
149
+ "id": "499deba2",
150
+ "metadata": {
151
+ "execution": {
152
+ "iopub.execute_input": "2025-08-03T18:03:15.224549Z",
153
+ "iopub.status.busy": "2025-08-03T18:03:15.224344Z",
154
+ "iopub.status.idle": "2025-08-03T18:03:15.227931Z",
155
+ "shell.execute_reply": "2025-08-03T18:03:15.227258Z"
156
+ },
157
+ "papermill": {
158
+ "duration": 0.00789,
159
+ "end_time": "2025-08-03T18:03:15.229026",
160
+ "exception": false,
161
+ "start_time": "2025-08-03T18:03:15.221136",
162
+ "status": "completed"
163
+ },
164
+ "tags": []
165
+ },
166
+ "outputs": [],
167
+ "source": [
168
+ "def collate_fn(batch):\n",
169
+ " \"\"\"Custom collate function for padding sequences\"\"\"\n",
170
+ " tokens, labels = zip(*batch)\n",
171
+ " tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=0)\n",
172
+ " labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)\n",
173
+ " return tokens_padded, labels_padded"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 7,
179
+ "id": "7ade0505",
180
+ "metadata": {
181
+ "execution": {
182
+ "iopub.execute_input": "2025-08-03T18:03:15.237394Z",
183
+ "iopub.status.busy": "2025-08-03T18:03:15.236977Z",
184
+ "iopub.status.idle": "2025-08-03T18:03:15.250346Z",
185
+ "shell.execute_reply": "2025-08-03T18:03:15.249624Z"
186
+ },
187
+ "papermill": {
188
+ "duration": 0.018587,
189
+ "end_time": "2025-08-03T18:03:15.251405",
190
+ "exception": false,
191
+ "start_time": "2025-08-03T18:03:15.232818",
192
+ "status": "completed"
193
+ },
194
+ "tags": []
195
+ },
196
+ "outputs": [],
197
+ "source": [
198
+ "class F1ScoreMetric:\n",
199
+ " \"\"\"Custom F1 score metric with beta parameter\"\"\"\n",
200
+ " def __init__(self, beta=5, num_classes=20, ignore_index=0, label_vocab=None):\n",
201
+ " self.beta = beta\n",
202
+ " self.num_classes = num_classes\n",
203
+ " self.ignore_index = ignore_index\n",
204
+ " self.label_vocab = label_vocab\n",
205
+ " self.reset()\n",
206
+ " \n",
207
+ " def reset(self):\n",
208
+ " self.true_positives = 0\n",
209
+ " self.false_positives = 0\n",
210
+ " self.false_negatives = 0\n",
211
+ " self.class_metrics = {}\n",
212
+ " \n",
213
+ " def update(self, predictions, targets):\n",
214
+ " mask = (targets != self.ignore_index) & (targets != 2) & (targets != 3)\n",
215
+ " o_idx = self.label_vocab.word2idx.get('o', -1) if self.label_vocab else -1\n",
216
+ " \n",
217
+ " for class_id in range(1, self.num_classes):\n",
218
+ " if class_id == o_idx:\n",
219
+ " continue\n",
220
+ " \n",
221
+ " pred_mask = (predictions == class_id) & mask\n",
222
+ " true_mask = (targets == class_id) & mask\n",
223
+ " \n",
224
+ " tp = ((pred_mask) & (true_mask)).sum().item()\n",
225
+ " fp = ((pred_mask) & (~true_mask)).sum().item()\n",
226
+ " fn = ((~pred_mask) & (true_mask)).sum().item()\n",
227
+ " \n",
228
+ " self.true_positives += tp\n",
229
+ " self.false_positives += fp\n",
230
+ " self.false_negatives += fn\n",
231
+ " \n",
232
+ " if class_id not in self.class_metrics:\n",
233
+ " self.class_metrics[class_id] = {'tp': 0, 'fp': 0, 'fn': 0}\n",
234
+ " self.class_metrics[class_id]['tp'] += tp\n",
235
+ " self.class_metrics[class_id]['fp'] += fp\n",
236
+ " self.class_metrics[class_id]['fn'] += fn\n",
237
+ " \n",
238
+ " def compute(self):\n",
239
+ " beta_squared = self.beta ** 2\n",
240
+ " precision = self.true_positives / (self.true_positives + self.false_positives + 1e-8)\n",
241
+ " recall = self.true_positives / (self.true_positives + self.false_negatives + 1e-8)\n",
242
+ " f1 = (1 + beta_squared) * precision * recall / (beta_squared * precision + recall + 1e-8)\n",
243
+ " return f1\n",
244
+ " \n",
245
+ " def get_class_metrics(self):\n",
246
+ " results = {}\n",
247
+ " for class_id, metrics in self.class_metrics.items():\n",
248
+ " if self.label_vocab and class_id in self.label_vocab.idx2word:\n",
249
+ " class_name = self.label_vocab.idx2word[class_id]\n",
250
+ " precision = metrics['tp'] / (metrics['tp'] + metrics['fp'] + 1e-8)\n",
251
+ " recall = metrics['tp'] / (metrics['tp'] + metrics['fn'] + 1e-8)\n",
252
+ " f1 = 2 * precision * recall / (precision + recall + 1e-8)\n",
253
+ " results[class_name] = {\n",
254
+ " 'precision': precision,\n",
255
+ " 'recall': recall,\n",
256
+ " 'f1': f1,\n",
257
+ " 'support': metrics['tp'] + metrics['fn']\n",
258
+ " }\n",
259
+ " return results"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 8,
265
+ "id": "361b5505",
266
+ "metadata": {
267
+ "execution": {
268
+ "iopub.execute_input": "2025-08-03T18:03:15.258002Z",
269
+ "iopub.status.busy": "2025-08-03T18:03:15.257703Z",
270
+ "iopub.status.idle": "2025-08-03T18:03:15.265171Z",
271
+ "shell.execute_reply": "2025-08-03T18:03:15.264658Z"
272
+ },
273
+ "papermill": {
274
+ "duration": 0.011955,
275
+ "end_time": "2025-08-03T18:03:15.266159",
276
+ "exception": false,
277
+ "start_time": "2025-08-03T18:03:15.254204",
278
+ "status": "completed"
279
+ },
280
+ "tags": []
281
+ },
282
+ "outputs": [],
283
+ "source": [
284
+ "class FocalLoss(nn.Module):\n",
285
+ " \"\"\"Focal Loss for addressing class imbalance\"\"\"\n",
286
+ " def __init__(self, alpha=None, gamma=2.0, reduction='mean', ignore_index=-100):\n",
287
+ " super(FocalLoss, self).__init__()\n",
288
+ " self.alpha = alpha\n",
289
+ " self.gamma = gamma\n",
290
+ " self.reduction = reduction\n",
291
+ " self.ignore_index = ignore_index\n",
292
+ " \n",
293
+ " def forward(self, inputs, targets):\n",
294
+ " ce_loss = nn.functional.cross_entropy(\n",
295
+ " inputs, targets, \n",
296
+ " weight=self.alpha, \n",
297
+ " reduction='none',\n",
298
+ " ignore_index=self.ignore_index\n",
299
+ " )\n",
300
+ " \n",
301
+ " pt = torch.exp(-ce_loss)\n",
302
+ " focal_loss = (1 - pt) ** self.gamma * ce_loss\n",
303
+ " \n",
304
+ " if self.reduction == 'mean':\n",
305
+ " return focal_loss.mean()\n",
306
+ " elif self.reduction == 'sum':\n",
307
+ " return focal_loss.sum()\n",
308
+ " else:\n",
309
+ " return focal_loss"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": 9,
315
+ "id": "1de646e9",
316
+ "metadata": {
317
+ "execution": {
318
+ "iopub.execute_input": "2025-08-03T18:03:15.272639Z",
319
+ "iopub.status.busy": "2025-08-03T18:03:15.272459Z",
320
+ "iopub.status.idle": "2025-08-03T18:03:15.277673Z",
321
+ "shell.execute_reply": "2025-08-03T18:03:15.277165Z"
322
+ },
323
+ "papermill": {
324
+ "duration": 0.009528,
325
+ "end_time": "2025-08-03T18:03:15.278705",
326
+ "exception": false,
327
+ "start_time": "2025-08-03T18:03:15.269177",
328
+ "status": "completed"
329
+ },
330
+ "tags": []
331
+ },
332
+ "outputs": [],
333
+ "source": [
334
+ "def train_epoch(model, dataloader, optimizer, criterion, device, f1_metric):\n",
335
+ " \"\"\"Train for one epoch\"\"\"\n",
336
+ " model.train()\n",
337
+ " total_loss = 0\n",
338
+ " f1_metric.reset()\n",
339
+ " \n",
340
+ " progress_bar = tqdm(dataloader, desc='Training')\n",
341
+ " for batch_idx, (tokens, labels) in enumerate(progress_bar):\n",
342
+ " tokens = tokens.to(device)\n",
343
+ " labels = labels.to(device)\n",
344
+ " \n",
345
+ " # Forward pass\n",
346
+ " optimizer.zero_grad()\n",
347
+ " outputs = model(tokens)\n",
348
+ " \n",
349
+ " # Reshape for loss calculation\n",
350
+ " outputs_flat = outputs.view(-1, outputs.size(-1))\n",
351
+ " labels_flat = labels.view(-1)\n",
352
+ " \n",
353
+ " # Calculate loss and backward pass\n",
354
+ " loss = criterion(outputs_flat, labels_flat)\n",
355
+ " loss.backward()\n",
356
+ " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)\n",
357
+ " optimizer.step()\n",
358
+ " \n",
359
+ " # Update metrics\n",
360
+ " total_loss += loss.item()\n",
361
+ " predictions = torch.argmax(outputs, dim=-1)\n",
362
+ " f1_metric.update(predictions, labels)\n",
363
+ " \n",
364
+ " # Update progress bar\n",
365
+ " progress_bar.set_postfix({\n",
366
+ " 'loss': f\"{loss.item():.4f}\",\n",
367
+ " 'f1': f\"{f1_metric.compute():.4f}\"\n",
368
+ " })\n",
369
+ " \n",
370
+ " return total_loss / len(dataloader), f1_metric.compute()"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": 10,
376
+ "id": "d1ce3b0f",
377
+ "metadata": {
378
+ "execution": {
379
+ "iopub.execute_input": "2025-08-03T18:03:15.284917Z",
380
+ "iopub.status.busy": "2025-08-03T18:03:15.284718Z",
381
+ "iopub.status.idle": "2025-08-03T18:03:15.289392Z",
382
+ "shell.execute_reply": "2025-08-03T18:03:15.288854Z"
383
+ },
384
+ "papermill": {
385
+ "duration": 0.008891,
386
+ "end_time": "2025-08-03T18:03:15.290379",
387
+ "exception": false,
388
+ "start_time": "2025-08-03T18:03:15.281488",
389
+ "status": "completed"
390
+ },
391
+ "tags": []
392
+ },
393
+ "outputs": [],
394
+ "source": [
395
+ "def evaluate(model, dataloader, criterion, device, f1_metric):\n",
396
+ " \"\"\"Evaluate model on validation/test set\"\"\"\n",
397
+ " model.eval()\n",
398
+ " total_loss = 0\n",
399
+ " f1_metric.reset()\n",
400
+ " \n",
401
+ " with torch.no_grad():\n",
402
+ " for tokens, labels in tqdm(dataloader, desc='Evaluating'):\n",
403
+ " tokens = tokens.to(device)\n",
404
+ " labels = labels.to(device)\n",
405
+ " \n",
406
+ " # Forward pass\n",
407
+ " outputs = model(tokens)\n",
408
+ " outputs_flat = outputs.view(-1, outputs.size(-1))\n",
409
+ " labels_flat = labels.view(-1)\n",
410
+ " \n",
411
+ " # Calculate loss\n",
412
+ " loss = criterion(outputs_flat, labels_flat)\n",
413
+ " total_loss += loss.item()\n",
414
+ " \n",
415
+ " # Update metrics\n",
416
+ " predictions = torch.argmax(outputs, dim=-1)\n",
417
+ " f1_metric.update(predictions, labels)\n",
418
+ " \n",
419
+ " return total_loss / len(dataloader), f1_metric.compute()"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": 11,
425
+ "id": "da3ff80c",
426
+ "metadata": {
427
+ "execution": {
428
+ "iopub.execute_input": "2025-08-03T18:03:15.296567Z",
429
+ "iopub.status.busy": "2025-08-03T18:03:15.296378Z",
430
+ "iopub.status.idle": "2025-08-03T18:03:15.300725Z",
431
+ "shell.execute_reply": "2025-08-03T18:03:15.300185Z"
432
+ },
433
+ "papermill": {
434
+ "duration": 0.008576,
435
+ "end_time": "2025-08-03T18:03:15.301673",
436
+ "exception": false,
437
+ "start_time": "2025-08-03T18:03:15.293097",
438
+ "status": "completed"
439
+ },
440
+ "tags": []
441
+ },
442
+ "outputs": [],
443
+ "source": [
444
+ "def create_balanced_sampler(dataset, label_vocab):\n",
445
+ " \"\"\"Create a weighted sampler to balance classes during training\"\"\"\n",
446
+ " sample_weights = []\n",
447
+ " \n",
448
+ " for idx in range(len(dataset)):\n",
449
+ " _, labels = dataset[idx]\n",
450
+ " \n",
451
+ " # Give higher weight to samples with rare PII\n",
452
+ " min_weight = 1.0\n",
453
+ " for label_id in labels:\n",
454
+ " if label_id > 3: # Skip special tokens\n",
455
+ " label_name = label_vocab.idx2word.get(label_id.item(), 'O')\n",
456
+ " if label_name != 'o' and 'B-' in label_name:\n",
457
+ " min_weight = 10.0\n",
458
+ " break\n",
459
+ " \n",
460
+ " sample_weights.append(min_weight)\n",
461
+ " \n",
462
+ " sampler = WeightedRandomSampler(\n",
463
+ " weights=sample_weights,\n",
464
+ " num_samples=len(sample_weights),\n",
465
+ " replacement=True\n",
466
+ " )\n",
467
+ " \n",
468
+ " return sampler"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": 12,
474
+ "id": "69b37e68",
475
+ "metadata": {
476
+ "execution": {
477
+ "iopub.execute_input": "2025-08-03T18:03:15.307761Z",
478
+ "iopub.status.busy": "2025-08-03T18:03:15.307589Z",
479
+ "iopub.status.idle": "2025-08-03T18:03:15.311849Z",
480
+ "shell.execute_reply": "2025-08-03T18:03:15.311334Z"
481
+ },
482
+ "papermill": {
483
+ "duration": 0.008327,
484
+ "end_time": "2025-08-03T18:03:15.312778",
485
+ "exception": false,
486
+ "start_time": "2025-08-03T18:03:15.304451",
487
+ "status": "completed"
488
+ },
489
+ "tags": []
490
+ },
491
+ "outputs": [],
492
+ "source": [
493
+ "def print_label_distribution(data, title=\"Label Distribution\"):\n",
494
+ " \"\"\"Print label distribution statistics\"\"\"\n",
495
+ " label_counts = Counter()\n",
496
+ " for label_seq in data.labels:\n",
497
+ " for label in label_seq:\n",
498
+ " if label not in ['<pad>', '<start>', '<end>']:\n",
499
+ " label_counts[label] += 1\n",
500
+ " \n",
501
+ " print(f\"\\n{title}:\")\n",
502
+ " print(\"-\" * 50)\n",
503
+ " total = sum(label_counts.values())\n",
504
+ " for label, count in label_counts.most_common():\n",
505
+ " percentage = (count / total) * 100\n",
506
+ " print(f\" {label:20} : {count:8,} ({percentage:5.2f}%)\")\n",
507
+ " print(\"-\" * 50)\n",
508
+ " print(f\" {'Total':20} : {total:8,}\")"
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": 13,
514
+ "id": "4b1b4f86",
515
+ "metadata": {
516
+ "execution": {
517
+ "iopub.execute_input": "2025-08-03T18:03:15.319812Z",
518
+ "iopub.status.busy": "2025-08-03T18:03:15.319647Z",
519
+ "iopub.status.idle": "2025-08-03T18:03:15.323992Z",
520
+ "shell.execute_reply": "2025-08-03T18:03:15.323517Z"
521
+ },
522
+ "papermill": {
523
+ "duration": 0.00942,
524
+ "end_time": "2025-08-03T18:03:15.325043",
525
+ "exception": false,
526
+ "start_time": "2025-08-03T18:03:15.315623",
527
+ "status": "completed"
528
+ },
529
+ "tags": []
530
+ },
531
+ "outputs": [],
532
+ "source": [
533
+ "def save_model(model, text_vocab, label_vocab, config, save_dir):\n",
534
+ " \"\"\"Save model and all necessary components for deployment\"\"\"\n",
535
+ " os.makedirs(save_dir, exist_ok=True)\n",
536
+ " \n",
537
+ " # Save model state\n",
538
+ " model_path = os.path.join(save_dir, 'pii_lstm_model.pt')\n",
539
+ " torch.save(model.state_dict(), model_path)\n",
540
+ " \n",
541
+ " # Save vocabularies\n",
542
+ " vocab_path = os.path.join(save_dir, 'vocabularies.pkl')\n",
543
+ " with open(vocab_path, 'wb') as f:\n",
544
+ " pickle.dump({\n",
545
+ " 'text_vocab': text_vocab,\n",
546
+ " 'label_vocab': label_vocab\n",
547
+ " }, f)\n",
548
+ " \n",
549
+ " # Save model configuration\n",
550
+ " config_path = os.path.join(save_dir, 'model_config.pkl')\n",
551
+ " with open(config_path, 'wb') as f:\n",
552
+ " pickle.dump(config, f)\n",
553
+ " \n",
554
+ " print(f\"\\nModel saved for deployment in '{save_dir}/' directory\")\n",
555
+ " print(\"Files saved:\")\n",
556
+ " print(f\" - {model_path}\")\n",
557
+ " print(f\" - {vocab_path}\")\n",
558
+ " print(f\" - {config_path}\")"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": 14,
564
+ "id": "31d2f1b1",
565
+ "metadata": {
566
+ "execution": {
567
+ "iopub.execute_input": "2025-08-03T18:03:15.331818Z",
568
+ "iopub.status.busy": "2025-08-03T18:03:15.331643Z",
569
+ "iopub.status.idle": "2025-08-03T18:03:15.347264Z",
570
+ "shell.execute_reply": "2025-08-03T18:03:15.346735Z"
571
+ },
572
+ "papermill": {
573
+ "duration": 0.020356,
574
+ "end_time": "2025-08-03T18:03:15.348292",
575
+ "exception": false,
576
+ "start_time": "2025-08-03T18:03:15.327936",
577
+ "status": "completed"
578
+ },
579
+ "tags": []
580
+ },
581
+ "outputs": [],
582
+ "source": [
583
+ "def train_lstm_pii_model(\n",
584
+ " data_path,\n",
585
+ " num_epochs=30,\n",
586
+ " batch_size=32,\n",
587
+ " learning_rate=3e-4,\n",
588
+ " use_focal_loss=True,\n",
589
+ " focal_gamma=2.0,\n",
590
+ " device='cuda',\n",
591
+ "):\n",
592
+ " \"\"\"Main training function for LSTM model\"\"\"\n",
593
+ " \n",
594
+ " # Load data\n",
595
+ " print(\"Loading augmented data...\")\n",
596
+ " data = pd.read_json(data_path, lines=True)\n",
597
+ " print(f\"Total samples: {len(data)}\")\n",
598
+ " \n",
599
+ " # Print initial label distribution\n",
600
+ " print_label_distribution(data, \"Label Distribution in Augmented Data\")\n",
601
+ " \n",
602
+ " # Build vocabularies\n",
603
+ " print(\"\\nBuilding vocabularies...\")\n",
604
+ " text_vocab = Vocabulary(max_size=100000)\n",
605
+ " label_vocab = Vocabulary(max_size=50)\n",
606
+ " \n",
607
+ " for tokens in data.tokens:\n",
608
+ " text_vocab.add_sentence(tokens)\n",
609
+ " for labels in data.labels:\n",
610
+ " label_vocab.add_sentence(labels)\n",
611
+ " \n",
612
+ " text_vocab.build()\n",
613
+ " label_vocab.build()\n",
614
+ " \n",
615
+ " print(f\"\\nVocabulary sizes:\")\n",
616
+ " print(f\" - Text vocabulary: {len(text_vocab):,}\")\n",
617
+ " print(f\" - Label vocabulary: {len(label_vocab)}\")\n",
618
+ " \n",
619
+ " # Calculate class weights\n",
620
+ " class_weights = calculate_class_weights(data, label_vocab)\n",
621
+ " class_weights = class_weights.to(device)\n",
622
+ " \n",
623
+ " # Split data\n",
624
+ " X_train, X_val, y_train, y_val = train_test_split(\n",
625
+ " data.tokens.tolist(),\n",
626
+ " data.labels.tolist(),\n",
627
+ " test_size=0.2,\n",
628
+ " random_state=42\n",
629
+ " )\n",
630
+ " \n",
631
+ " print(f\"\\nData split:\")\n",
632
+ " print(f\" - Train samples: {len(X_train):,}\")\n",
633
+ " print(f\" - Validation samples: {len(X_val):,}\")\n",
634
+ " \n",
635
+ " # Create datasets and dataloaders\n",
636
+ " max_seq_len = 512\n",
637
+ " train_dataset = PIIDataset(X_train, y_train, text_vocab, label_vocab, max_len=max_seq_len)\n",
638
+ " val_dataset = PIIDataset(X_val, y_val, text_vocab, label_vocab, max_len=max_seq_len)\n",
639
+ " \n",
640
+ " # Use balanced sampler for training\n",
641
+ " train_sampler = create_balanced_sampler(train_dataset, label_vocab)\n",
642
+ " \n",
643
+ " train_loader = DataLoader(\n",
644
+ " train_dataset, \n",
645
+ " batch_size=batch_size,\n",
646
+ " sampler=train_sampler,\n",
647
+ " collate_fn=collate_fn,\n",
648
+ " num_workers=0\n",
649
+ " )\n",
650
+ " \n",
651
+ " val_loader = DataLoader(\n",
652
+ " val_dataset, \n",
653
+ " batch_size=batch_size,\n",
654
+ " shuffle=False, \n",
655
+ " collate_fn=collate_fn,\n",
656
+ " num_workers=0\n",
657
+ " )\n",
658
+ " \n",
659
+ " # Model configuration\n",
660
+ " model_config = {\n",
661
+ " 'vocab_size': len(text_vocab),\n",
662
+ " 'num_classes': len(label_vocab),\n",
663
+ " 'd_model': 256,\n",
664
+ " 'num_heads': 8, # Not used by LSTM, kept for compatibility\n",
665
+ " 'd_ff': 512, # Not used by LSTM, kept for compatibility\n",
666
+ " 'num_layers': 2, # Number of LSTM layers\n",
667
+ " 'dropout': 0.1,\n",
668
+ " 'max_len': max_seq_len\n",
669
+ " }\n",
670
+ " \n",
671
+ " # Create model\n",
672
+ " print(\"\\nCreating LSTM model...\")\n",
673
+ " model = create_lstm_pii_model(**model_config).to(device)\n",
674
+ " print(f\"Model parameters: {sum(p.numel() for p in model.parameters()):,}\")\n",
675
+ " \n",
676
+ " # Print model architecture\n",
677
+ " print(\"\\nModel Architecture:\")\n",
678
+ " print(f\" - Embedding: {model_config['vocab_size']} -> {model_config['d_model'] // 2}\")\n",
679
+ " print(f\" - Bidirectional LSTM: {model_config['num_layers']} layers, hidden size: {model_config['d_model']}\")\n",
680
+ " print(f\" - Output: {model_config['d_model'] * 2} -> {model_config['num_classes']}\")\n",
681
+ " \n",
682
+ " # Setup loss function\n",
683
+ " if use_focal_loss:\n",
684
+ " criterion = FocalLoss(\n",
685
+ " alpha=class_weights,\n",
686
+ " gamma=focal_gamma,\n",
687
+ " ignore_index=0\n",
688
+ " )\n",
689
+ " print(f\"\\nUsing Focal Loss with gamma={focal_gamma}\")\n",
690
+ " else:\n",
691
+ " criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=0)\n",
692
+ " print(\"\\nUsing Cross Entropy Loss\")\n",
693
+ " \n",
694
+ " # Setup optimizer and scheduler\n",
695
+ " optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)\n",
696
+ " scheduler = optim.lr_scheduler.ReduceLROnPlateau(\n",
697
+ " optimizer, \n",
698
+ " mode='min',\n",
699
+ " patience=3, \n",
700
+ " factor=0.5,\n",
701
+ " min_lr=1e-6\n",
702
+ " )\n",
703
+ " \n",
704
+ " # Metrics\n",
705
+ " f1_metric_train = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
706
+ " f1_metric_val = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
707
+ " \n",
708
+ " # Training loop\n",
709
+ " train_losses, train_f1s, val_losses, val_f1s = [], [], [], []\n",
710
+ " best_val_f1 = 0\n",
711
+ " patience = 7\n",
712
+ " patience_counter = 0\n",
713
+ " \n",
714
+ " print(\"\\nStarting training...\")\n",
715
+ " print(\"=\" * 60)\n",
716
+ " \n",
717
+ " for epoch in range(num_epochs):\n",
718
+ " print(f\"\\nEpoch {epoch+1}/{num_epochs}\")\n",
719
+ " \n",
720
+ " # Train and validate\n",
721
+ " train_loss, train_f1 = train_epoch(\n",
722
+ " model, train_loader, optimizer, criterion, device, f1_metric_train\n",
723
+ " )\n",
724
+ " val_loss, val_f1 = evaluate(\n",
725
+ " model, val_loader, criterion, device, f1_metric_val\n",
726
+ " )\n",
727
+ " \n",
728
+ " # Step scheduler based on validation loss\n",
729
+ " scheduler.step(val_loss)\n",
730
+ " \n",
731
+ " # Store metrics\n",
732
+ " train_losses.append(train_loss)\n",
733
+ " train_f1s.append(train_f1)\n",
734
+ " val_losses.append(val_loss)\n",
735
+ " val_f1s.append(val_f1)\n",
736
+ " \n",
737
+ " # Print epoch results\n",
738
+ " print(f\"Train Loss: {train_loss:.4f}, Train F1: {train_f1:.4f}\")\n",
739
+ " print(f\"Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}\")\n",
740
+ " print(f\"Learning rate: {optimizer.param_groups[0]['lr']:.6f}\")\n",
741
+ " \n",
742
+ " # Save best model\n",
743
+ " if val_f1 > best_val_f1:\n",
744
+ " best_val_f1 = val_f1\n",
745
+ " patience_counter = 0\n",
746
+ " \n",
747
+ " # Save complete checkpoint\n",
748
+ " checkpoint = {\n",
749
+ " 'epoch': epoch,\n",
750
+ " 'model_state_dict': model.state_dict(),\n",
751
+ " 'optimizer_state_dict': optimizer.state_dict(),\n",
752
+ " 'scheduler_state_dict': scheduler.state_dict(),\n",
753
+ " 'train_loss': train_loss,\n",
754
+ " 'val_loss': val_loss,\n",
755
+ " 'train_f1': train_f1,\n",
756
+ " 'val_f1': val_f1,\n",
757
+ " 'text_vocab': text_vocab,\n",
758
+ " 'label_vocab': label_vocab,\n",
759
+ " 'model_config': model_config\n",
760
+ " }\n",
761
+ " torch.save(checkpoint, 'best_lstm_checkpoint.pt')\n",
762
+ " \n",
763
+ " print(f\"βœ“ Saved best model with F1: {val_f1:.4f}\")\n",
764
+ " else:\n",
765
+ " patience_counter += 1\n",
766
+ " \n",
767
+ " # Early stopping\n",
768
+ " if patience_counter >= patience and epoch > 10:\n",
769
+ " print(f\"\\nEarly stopping triggered after {patience} epochs without improvement\")\n",
770
+ " break\n",
771
+ " \n",
772
+ " # Plot training curves\n",
773
+ " plt.figure(figsize=(12, 5))\n",
774
+ " \n",
775
+ " plt.subplot(1, 2, 1)\n",
776
+ " plt.plot(train_losses, label='Train Loss', linewidth=2)\n",
777
+ " plt.plot(val_losses, label='Val Loss', linewidth=2)\n",
778
+ " plt.xlabel('Epoch')\n",
779
+ " plt.ylabel('Loss')\n",
780
+ " plt.title('Training and Validation Loss')\n",
781
+ " plt.legend()\n",
782
+ " plt.grid(True, alpha=0.3)\n",
783
+ " \n",
784
+ " plt.subplot(1, 2, 2)\n",
785
+ " plt.plot(train_f1s, label='Train F1', linewidth=2)\n",
786
+ " plt.plot(val_f1s, label='Val F1', linewidth=2)\n",
787
+ " plt.axhline(y=best_val_f1, color='r', linestyle='--', label=f'Best F1: {best_val_f1:.4f}')\n",
788
+ " plt.xlabel('Epoch')\n",
789
+ " plt.ylabel('F1 Score')\n",
790
+ " plt.title('Training and Validation F1 Score')\n",
791
+ " plt.legend()\n",
792
+ " plt.grid(True, alpha=0.3)\n",
793
+ " \n",
794
+ " plt.tight_layout()\n",
795
+ " plt.savefig('lstm_training_curves.png', dpi=300, bbox_inches='tight')\n",
796
+ " plt.close()\n",
797
+ " \n",
798
+ " print(f\"\\n{'='*60}\")\n",
799
+ " print(f\"Training completed!\")\n",
800
+ " print(f\"Best validation F1: {best_val_f1:.4f}\")\n",
801
+ " print(f\"Training curves saved to: lstm_training_curves.png\")\n",
802
+ " \n",
803
+ " # Save model for deployment\n",
804
+ " save_model(model, text_vocab, label_vocab, model_config, 'saved_lstm_model')\n",
805
+ " \n",
806
+ " return model, text_vocab, label_vocab"
807
+ ]
808
+ },
809
+ {
810
+ "cell_type": "code",
811
+ "execution_count": null,
812
+ "id": "fcb2b401",
813
+ "metadata": {
814
+ "execution": {
815
+ "iopub.execute_input": "2025-08-03T18:03:15.354835Z",
816
+ "iopub.status.busy": "2025-08-03T18:03:15.354423Z",
817
+ "iopub.status.idle": "2025-08-04T04:06:32.402286Z",
818
+ "shell.execute_reply": "2025-08-04T04:06:32.401401Z"
819
+ },
820
+ "papermill": {
821
+ "duration": 36197.052354,
822
+ "end_time": "2025-08-04T04:06:32.403447",
823
+ "exception": false,
824
+ "start_time": "2025-08-03T18:03:15.351093",
825
+ "status": "completed"
826
+ },
827
+ "tags": []
828
+ },
829
+ "outputs": [
830
+ {
831
+ "name": "stdout",
832
+ "output_type": "stream",
833
+ "text": [
834
+ "Using device: cuda\n",
835
+ "Loading augmented data...\n",
836
+ "Total samples: 19694\n",
837
+ "\n",
838
+ "Label Distribution in Augmented Data:\n",
839
+ "--------------------------------------------------\n",
840
+ " O : 5,082,150 (99.33%)\n",
841
+ " I-STREET_ADDRESS : 15,650 ( 0.31%)\n",
842
+ " B-ID_NUM : 2,505 ( 0.05%)\n",
843
+ " B-EMAIL : 2,488 ( 0.05%)\n",
844
+ " B-URL_PERSONAL : 2,478 ( 0.05%)\n",
845
+ " B-STREET_ADDRESS : 2,452 ( 0.05%)\n",
846
+ " B-PHONE_NUM : 2,450 ( 0.05%)\n",
847
+ " B-USERNAME : 2,210 ( 0.04%)\n",
848
+ " B-NAME_STUDENT : 1,968 ( 0.04%)\n",
849
+ " I-NAME_STUDENT : 1,735 ( 0.03%)\n",
850
+ " I-PHONE_NUM : 500 ( 0.01%)\n",
851
+ " I-URL_PERSONAL : 1 ( 0.00%)\n",
852
+ " I-ID_NUM : 1 ( 0.00%)\n",
853
+ "--------------------------------------------------\n",
854
+ " Total : 5,116,588\n",
855
+ "\n",
856
+ "Building vocabularies...\n",
857
+ "\n",
858
+ "Vocabulary sizes:\n",
859
+ " - Text vocabulary: 65,295\n",
860
+ " - Label vocabulary: 17\n",
861
+ "\n",
862
+ "Data split:\n",
863
+ " - Train samples: 15,755\n",
864
+ " - Validation samples: 3,939\n",
865
+ "\n",
866
+ "Creating LSTM model...\n",
867
+ "Model parameters: 10,729,873\n",
868
+ "\n",
869
+ "Model Architecture:\n",
870
+ " - Embedding: 65295 -> 128\n",
871
+ " - Bidirectional LSTM: 2 layers, hidden size: 256\n",
872
+ " - Output: 512 -> 17\n",
873
+ "\n",
874
+ "Using Focal Loss with gamma=2.0\n",
875
+ "\n",
876
+ "Starting training...\n",
877
+ "============================================================\n",
878
+ "\n",
879
+ "Epoch 1/20\n"
880
+ ]
881
+ },
882
+ {
883
+ "name": "stderr",
884
+ "output_type": "stream",
885
+ "text": [
886
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:41<00:00, 3.49s/it, loss=0.0000, f1=0.1535]\n",
887
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:27<00:00, 1.41it/s]\n"
888
+ ]
889
+ },
890
+ {
891
+ "name": "stdout",
892
+ "output_type": "stream",
893
+ "text": [
894
+ "Train Loss: 0.0002, Train F1: 0.1535\n",
895
+ "Val Loss: 0.0001, Val F1: 0.4344\n",
896
+ "Learning rate: 0.000300\n",
897
+ "βœ“ Saved best model with F1: 0.4344\n",
898
+ "\n",
899
+ "Epoch 2/20\n"
900
+ ]
901
+ },
902
+ {
903
+ "name": "stderr",
904
+ "output_type": "stream",
905
+ "text": [
906
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:39<00:00, 3.49s/it, loss=0.0000, f1=0.5546]\n",
907
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:29<00:00, 1.38it/s]\n"
908
+ ]
909
+ },
910
+ {
911
+ "name": "stdout",
912
+ "output_type": "stream",
913
+ "text": [
914
+ "Train Loss: 0.0000, Train F1: 0.5546\n",
915
+ "Val Loss: 0.0000, Val F1: 0.6417\n",
916
+ "Learning rate: 0.000300\n",
917
+ "βœ“ Saved best model with F1: 0.6417\n",
918
+ "\n",
919
+ "Epoch 3/20\n"
920
+ ]
921
+ },
922
+ {
923
+ "name": "stderr",
924
+ "output_type": "stream",
925
+ "text": [
926
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:39<00:00, 3.49s/it, loss=0.0000, f1=0.7183]\n",
927
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:28<00:00, 1.40it/s]\n"
928
+ ]
929
+ },
930
+ {
931
+ "name": "stdout",
932
+ "output_type": "stream",
933
+ "text": [
934
+ "Train Loss: 0.0000, Train F1: 0.7183\n",
935
+ "Val Loss: 0.0000, Val F1: 0.7736\n",
936
+ "Learning rate: 0.000300\n",
937
+ "βœ“ Saved best model with F1: 0.7736\n",
938
+ "\n",
939
+ "Epoch 4/20\n"
940
+ ]
941
+ },
942
+ {
943
+ "name": "stderr",
944
+ "output_type": "stream",
945
+ "text": [
946
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:39<00:00, 3.49s/it, loss=0.0000, f1=0.8117]\n",
947
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:29<00:00, 1.38it/s]\n"
948
+ ]
949
+ },
950
+ {
951
+ "name": "stdout",
952
+ "output_type": "stream",
953
+ "text": [
954
+ "Train Loss: 0.0000, Train F1: 0.8117\n",
955
+ "Val Loss: 0.0000, Val F1: 0.8568\n",
956
+ "Learning rate: 0.000300\n",
957
+ "βœ“ Saved best model with F1: 0.8568\n",
958
+ "\n",
959
+ "Epoch 5/20\n"
960
+ ]
961
+ },
962
+ {
963
+ "name": "stderr",
964
+ "output_type": "stream",
965
+ "text": [
966
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:36<00:00, 3.48s/it, loss=0.0000, f1=0.8686]\n",
967
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:28<00:00, 1.41it/s]\n"
968
+ ]
969
+ },
970
+ {
971
+ "name": "stdout",
972
+ "output_type": "stream",
973
+ "text": [
974
+ "Train Loss: 0.0000, Train F1: 0.8686\n",
975
+ "Val Loss: 0.0000, Val F1: 0.8847\n",
976
+ "Learning rate: 0.000300\n",
977
+ "βœ“ Saved best model with F1: 0.8847\n",
978
+ "\n",
979
+ "Epoch 6/20\n"
980
+ ]
981
+ },
982
+ {
983
+ "name": "stderr",
984
+ "output_type": "stream",
985
+ "text": [
986
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:38<00:00, 3.49s/it, loss=0.0000, f1=0.8942]\n",
987
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:30<00:00, 1.38it/s]\n"
988
+ ]
989
+ },
990
+ {
991
+ "name": "stdout",
992
+ "output_type": "stream",
993
+ "text": [
994
+ "Train Loss: 0.0000, Train F1: 0.8942\n",
995
+ "Val Loss: 0.0000, Val F1: 0.8983\n",
996
+ "Learning rate: 0.000300\n",
997
+ "βœ“ Saved best model with F1: 0.8983\n",
998
+ "\n",
999
+ "Epoch 7/20\n"
1000
+ ]
1001
+ },
1002
+ {
1003
+ "name": "stderr",
1004
+ "output_type": "stream",
1005
+ "text": [
1006
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:36<00:00, 3.48s/it, loss=0.0000, f1=0.9097]\n",
1007
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:28<00:00, 1.40it/s]\n"
1008
+ ]
1009
+ },
1010
+ {
1011
+ "name": "stdout",
1012
+ "output_type": "stream",
1013
+ "text": [
1014
+ "Train Loss: 0.0000, Train F1: 0.9097\n",
1015
+ "Val Loss: 0.0000, Val F1: 0.9147\n",
1016
+ "Learning rate: 0.000300\n",
1017
+ "βœ“ Saved best model with F1: 0.9147\n",
1018
+ "\n",
1019
+ "Epoch 8/20\n"
1020
+ ]
1021
+ },
1022
+ {
1023
+ "name": "stderr",
1024
+ "output_type": "stream",
1025
+ "text": [
1026
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:36<00:00, 3.48s/it, loss=0.0000, f1=0.9271]\n",
1027
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:29<00:00, 1.38it/s]\n"
1028
+ ]
1029
+ },
1030
+ {
1031
+ "name": "stdout",
1032
+ "output_type": "stream",
1033
+ "text": [
1034
+ "Train Loss: 0.0000, Train F1: 0.9271\n",
1035
+ "Val Loss: 0.0000, Val F1: 0.9386\n",
1036
+ "Learning rate: 0.000300\n",
1037
+ "βœ“ Saved best model with F1: 0.9386\n",
1038
+ "\n",
1039
+ "Epoch 9/20\n"
1040
+ ]
1041
+ },
1042
+ {
1043
+ "name": "stderr",
1044
+ "output_type": "stream",
1045
+ "text": [
1046
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:36<00:00, 3.48s/it, loss=0.0000, f1=0.9362]\n",
1047
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:28<00:00, 1.40it/s]\n"
1048
+ ]
1049
+ },
1050
+ {
1051
+ "name": "stdout",
1052
+ "output_type": "stream",
1053
+ "text": [
1054
+ "Train Loss: 0.0000, Train F1: 0.9362\n",
1055
+ "Val Loss: 0.0000, Val F1: 0.9371\n",
1056
+ "Learning rate: 0.000300\n",
1057
+ "\n",
1058
+ "Epoch 10/20\n"
1059
+ ]
1060
+ },
1061
+ {
1062
+ "name": "stderr",
1063
+ "output_type": "stream",
1064
+ "text": [
1065
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:37<00:00, 3.48s/it, loss=0.0000, f1=0.9457]\n",
1066
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:28<00:00, 1.40it/s]\n"
1067
+ ]
1068
+ },
1069
+ {
1070
+ "name": "stdout",
1071
+ "output_type": "stream",
1072
+ "text": [
1073
+ "Train Loss: 0.0000, Train F1: 0.9457\n",
1074
+ "Val Loss: 0.0000, Val F1: 0.9418\n",
1075
+ "Learning rate: 0.000150\n",
1076
+ "βœ“ Saved best model with F1: 0.9418\n",
1077
+ "\n",
1078
+ "Epoch 11/20\n"
1079
+ ]
1080
+ },
1081
+ {
1082
+ "name": "stderr",
1083
+ "output_type": "stream",
1084
+ "text": [
1085
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:41<00:00, 3.49s/it, loss=0.0000, f1=0.9561]\n",
1086
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:29<00:00, 1.38it/s]\n"
1087
+ ]
1088
+ },
1089
+ {
1090
+ "name": "stdout",
1091
+ "output_type": "stream",
1092
+ "text": [
1093
+ "Train Loss: 0.0000, Train F1: 0.9561\n",
1094
+ "Val Loss: 0.0000, Val F1: 0.9471\n",
1095
+ "Learning rate: 0.000150\n",
1096
+ "βœ“ Saved best model with F1: 0.9471\n",
1097
+ "\n",
1098
+ "Epoch 12/20\n"
1099
+ ]
1100
+ },
1101
+ {
1102
+ "name": "stderr",
1103
+ "output_type": "stream",
1104
+ "text": [
1105
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:38<00:00, 3.49s/it, loss=0.0000, f1=0.9579]\n",
1106
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:28<00:00, 1.40it/s]\n"
1107
+ ]
1108
+ },
1109
+ {
1110
+ "name": "stdout",
1111
+ "output_type": "stream",
1112
+ "text": [
1113
+ "Train Loss: 0.0000, Train F1: 0.9579\n",
1114
+ "Val Loss: 0.0000, Val F1: 0.9463\n",
1115
+ "Learning rate: 0.000150\n",
1116
+ "\n",
1117
+ "Epoch 13/20\n"
1118
+ ]
1119
+ },
1120
+ {
1121
+ "name": "stderr",
1122
+ "output_type": "stream",
1123
+ "text": [
1124
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:37<00:00, 3.48s/it, loss=0.0000, f1=0.9590]\n",
1125
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:28<00:00, 1.40it/s]\n"
1126
+ ]
1127
+ },
1128
+ {
1129
+ "name": "stdout",
1130
+ "output_type": "stream",
1131
+ "text": [
1132
+ "Train Loss: 0.0000, Train F1: 0.9590\n",
1133
+ "Val Loss: 0.0000, Val F1: 0.9526\n",
1134
+ "Learning rate: 0.000150\n",
1135
+ "βœ“ Saved best model with F1: 0.9526\n",
1136
+ "\n",
1137
+ "Epoch 14/20\n"
1138
+ ]
1139
+ },
1140
+ {
1141
+ "name": "stderr",
1142
+ "output_type": "stream",
1143
+ "text": [
1144
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:37<00:00, 3.48s/it, loss=0.0000, f1=0.9665]\n",
1145
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:29<00:00, 1.38it/s]\n"
1146
+ ]
1147
+ },
1148
+ {
1149
+ "name": "stdout",
1150
+ "output_type": "stream",
1151
+ "text": [
1152
+ "Train Loss: 0.0000, Train F1: 0.9665\n",
1153
+ "Val Loss: 0.0000, Val F1: 0.9499\n",
1154
+ "Learning rate: 0.000075\n",
1155
+ "\n",
1156
+ "Epoch 15/20\n"
1157
+ ]
1158
+ },
1159
+ {
1160
+ "name": "stderr",
1161
+ "output_type": "stream",
1162
+ "text": [
1163
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:38<00:00, 3.49s/it, loss=0.0000, f1=0.9674]\n",
1164
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:29<00:00, 1.38it/s]\n"
1165
+ ]
1166
+ },
1167
+ {
1168
+ "name": "stdout",
1169
+ "output_type": "stream",
1170
+ "text": [
1171
+ "Train Loss: 0.0000, Train F1: 0.9674\n",
1172
+ "Val Loss: 0.0000, Val F1: 0.9518\n",
1173
+ "Learning rate: 0.000075\n",
1174
+ "\n",
1175
+ "Epoch 16/20\n"
1176
+ ]
1177
+ },
1178
+ {
1179
+ "name": "stderr",
1180
+ "output_type": "stream",
1181
+ "text": [
1182
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:41<00:00, 3.49s/it, loss=0.0000, f1=0.9679]\n",
1183
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:30<00:00, 1.37it/s]\n"
1184
+ ]
1185
+ },
1186
+ {
1187
+ "name": "stdout",
1188
+ "output_type": "stream",
1189
+ "text": [
1190
+ "Train Loss: 0.0000, Train F1: 0.9679\n",
1191
+ "Val Loss: 0.0000, Val F1: 0.9509\n",
1192
+ "Learning rate: 0.000075\n",
1193
+ "\n",
1194
+ "Epoch 17/20\n"
1195
+ ]
1196
+ },
1197
+ {
1198
+ "name": "stderr",
1199
+ "output_type": "stream",
1200
+ "text": [
1201
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:40<00:00, 3.49s/it, loss=0.0000, f1=0.9706]\n",
1202
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:29<00:00, 1.38it/s]\n"
1203
+ ]
1204
+ },
1205
+ {
1206
+ "name": "stdout",
1207
+ "output_type": "stream",
1208
+ "text": [
1209
+ "Train Loss: 0.0000, Train F1: 0.9706\n",
1210
+ "Val Loss: 0.0000, Val F1: 0.9525\n",
1211
+ "Learning rate: 0.000075\n",
1212
+ "\n",
1213
+ "Epoch 18/20\n"
1214
+ ]
1215
+ },
1216
+ {
1217
+ "name": "stderr",
1218
+ "output_type": "stream",
1219
+ "text": [
1220
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:37<00:00, 3.48s/it, loss=0.0000, f1=0.9738]\n",
1221
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:29<00:00, 1.38it/s]\n"
1222
+ ]
1223
+ },
1224
+ {
1225
+ "name": "stdout",
1226
+ "output_type": "stream",
1227
+ "text": [
1228
+ "Train Loss: 0.0000, Train F1: 0.9738\n",
1229
+ "Val Loss: 0.0000, Val F1: 0.9509\n",
1230
+ "Learning rate: 0.000037\n",
1231
+ "\n",
1232
+ "Epoch 19/20\n"
1233
+ ]
1234
+ },
1235
+ {
1236
+ "name": "stderr",
1237
+ "output_type": "stream",
1238
+ "text": [
1239
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:39<00:00, 3.49s/it, loss=0.0000, f1=0.9722]\n",
1240
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:29<00:00, 1.38it/s]\n"
1241
+ ]
1242
+ },
1243
+ {
1244
+ "name": "stdout",
1245
+ "output_type": "stream",
1246
+ "text": [
1247
+ "Train Loss: 0.0000, Train F1: 0.9722\n",
1248
+ "Val Loss: 0.0000, Val F1: 0.9524\n",
1249
+ "Learning rate: 0.000037\n",
1250
+ "\n",
1251
+ "Epoch 20/20\n"
1252
+ ]
1253
+ },
1254
+ {
1255
+ "name": "stderr",
1256
+ "output_type": "stream",
1257
+ "text": [
1258
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [28:37<00:00, 3.48s/it, loss=0.0000, f1=0.9747]\n",
1259
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [01:30<00:00, 1.38it/s]\n"
1260
+ ]
1261
+ },
1262
+ {
1263
+ "name": "stdout",
1264
+ "output_type": "stream",
1265
+ "text": [
1266
+ "Train Loss: 0.0000, Train F1: 0.9747\n",
1267
+ "Val Loss: 0.0000, Val F1: 0.9535\n",
1268
+ "Learning rate: 0.000037\n",
1269
+ "βœ“ Saved best model with F1: 0.9535\n",
1270
+ "\n",
1271
+ "============================================================\n",
1272
+ "Training completed!\n",
1273
+ "Best validation F1: 0.9535\n",
1274
+ "Training curves saved to: lstm_training_curves.png\n",
1275
+ "\n",
1276
+ "Model saved for deployment in 'saved_lstm_model/' directory\n",
1277
+ "Files saved:\n",
1278
+ " - saved_lstm_model/pii_lstm_model.pt\n",
1279
+ " - saved_lstm_model/vocabularies.pkl\n",
1280
+ " - saved_lstm_model/model_config.pkl\n"
1281
+ ]
1282
+ }
1283
+ ],
1284
+ "source": [
1285
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
1286
+ "print(f\"Using device: {device}\")\n",
1287
+ "\n",
1288
+ "model, text_vocab, label_vocab = train_lstm_pii_model(\n",
1289
+ " data_path='train_augmented.json',\n",
1290
+ " num_epochs=20,\n",
1291
+ " batch_size=32,\n",
1292
+ " learning_rate=3e-4,\n",
1293
+ " use_focal_loss=True,\n",
1294
+ " focal_gamma=2.0,\n",
1295
+ " device=device\n",
1296
+ ")"
1297
+ ]
1298
+ }
1299
+ ],
1300
+ "metadata": {
1301
+ "kaggle": {
1302
+ "accelerator": "nvidiaTeslaT4",
1303
+ "dataSources": [
1304
+ {
1305
+ "isSourceIdPinned": true,
1306
+ "modelId": 419045,
1307
+ "modelInstanceId": 400879,
1308
+ "sourceId": 504813,
1309
+ "sourceType": "modelInstanceVersion"
1310
+ }
1311
+ ],
1312
+ "dockerImageVersionId": 31090,
1313
+ "isGpuEnabled": true,
1314
+ "isInternetEnabled": true,
1315
+ "language": "python",
1316
+ "sourceType": "notebook"
1317
+ },
1318
+ "kernelspec": {
1319
+ "display_name": "py310-torch",
1320
+ "language": "python",
1321
+ "name": "python3"
1322
+ },
1323
+ "language_info": {
1324
+ "codemirror_mode": {
1325
+ "name": "ipython",
1326
+ "version": 3
1327
+ },
1328
+ "file_extension": ".py",
1329
+ "mimetype": "text/x-python",
1330
+ "name": "python",
1331
+ "nbconvert_exporter": "python",
1332
+ "pygments_lexer": "ipython3",
1333
+ "version": "3.10.18"
1334
+ },
1335
+ "papermill": {
1336
+ "default_parameters": {},
1337
+ "duration": 36216.685618,
1338
+ "end_time": "2025-08-04T04:06:35.164363",
1339
+ "environment_variables": {},
1340
+ "exception": null,
1341
+ "input_path": "__notebook__.ipynb",
1342
+ "output_path": "__notebook__.ipynb",
1343
+ "parameters": {},
1344
+ "start_time": "2025-08-03T18:02:58.478745",
1345
+ "version": "2.6.0"
1346
+ }
1347
+ },
1348
+ "nbformat": 4,
1349
+ "nbformat_minor": 5
1350
+ }
requirements.txt ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==24.1.0
2
+ annotated-types==0.7.0
3
+ anyio==4.9.0
4
+ asttokens==3.0.0
5
+ Brotli==1.1.0
6
+ brotlicffi==1.0.9.2
7
+ certifi==2025.7.14
8
+ cffi==1.17.1
9
+ charset-normalizer==3.3.2
10
+ click==8.2.1
11
+ colorama==0.4.6
12
+ comm==0.2.3
13
+ contourpy==1.3.2
14
+ cycler==0.12.1
15
+ debugpy==1.8.15
16
+ decorator==5.2.1
17
+ exceptiongroup==1.3.0
18
+ executing==2.2.0
19
+ Faker==37.5.3
20
+ fastapi==0.116.1
21
+ ffmpy==0.6.1
22
+ filelock==3.17.0
23
+ fonttools==4.59.0
24
+ fsspec==2025.7.0
25
+ gmpy2==2.2.1
26
+ gradio==5.39.0
27
+ gradio_client==1.11.0
28
+ groovy==0.1.2
29
+ h11==0.16.0
30
+ httpcore==1.0.9
31
+ httpx==0.28.1
32
+ huggingface-hub==0.34.3
33
+ idna==3.7
34
+ importlib_metadata==8.7.0
35
+ ipykernel==6.30.0
36
+ ipython==8.37.0
37
+ jedi==0.19.2
38
+ Jinja2==3.1.6
39
+ joblib==1.5.1
40
+ jupyter_client==8.6.3
41
+ jupyter_core==5.8.1
42
+ kiwisolver==1.4.8
43
+ markdown-it-py==3.0.0
44
+ MarkupSafe==3.0.2
45
+ matplotlib==3.10.5
46
+ matplotlib-inline==0.1.7
47
+ mdurl==0.1.2
48
+ mkl_fft==1.3.11
49
+ mkl_random==1.2.8
50
+ mkl-service==2.4.0
51
+ mpmath==1.3.0
52
+ nest_asyncio==1.6.0
53
+ networkx==3.4.2
54
+ numpy==2.0.1
55
+ orjson==3.11.1
56
+ packaging==25.0
57
+ pandas==2.3.1
58
+ parso==0.8.4
59
+ pickleshare==0.7.5
60
+ pillow==11.1.0
61
+ pip==25.1
62
+ platformdirs==4.3.8
63
+ prompt_toolkit==3.0.51
64
+ psutil==7.0.0
65
+ pure_eval==0.2.3
66
+ pycparser==2.21
67
+ pydantic==2.11.7
68
+ pydantic_core==2.33.2
69
+ pydub==0.25.1
70
+ Pygments==2.19.2
71
+ pyparsing==3.2.3
72
+ PySocks==1.7.1
73
+ python-dateutil==2.9.0.post0
74
+ python-multipart==0.0.20
75
+ pytz==2025.2
76
+ pywin32==311
77
+ PyYAML==6.0.2
78
+ pyzmq==27.0.0
79
+ requests==2.32.4
80
+ rich==14.1.0
81
+ ruff==0.12.7
82
+ safehttpx==0.1.6
83
+ scikit-learn==1.7.1
84
+ scipy==1.15.3
85
+ semantic-version==2.10.0
86
+ setuptools==78.1.1
87
+ shellingham==1.5.4
88
+ six==1.17.0
89
+ sniffio==1.3.1
90
+ stack_data==0.6.3
91
+ starlette==0.47.2
92
+ sympy==1.13.1
93
+ threadpoolctl==3.6.0
94
+ tomlkit==0.13.3
95
+ torch==2.5.1
96
+ torchaudio==2.5.1
97
+ torchvision==0.20.1
98
+ tornado==6.5.1
99
+ tqdm==4.67.1
100
+ traitlets==5.14.3
101
+ typer==0.16.0
102
+ typing_extensions==4.12.2
103
+ typing-inspection==0.4.1
104
+ tzdata==2025.2
105
+ urllib3==2.5.0
106
+ uvicorn==0.35.0
107
+ wcwidth==0.2.13
108
+ websockets==15.0.1
109
+ wheel==0.45.1
110
+ win-inet-pton==1.1.0
111
+ zipp==3.23.0
saved_lstm/best_lstm_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:506bc8e4cf77c01844014f0b0f4b2a89235ba256be99cc221bd654722cbe1511
3
+ size 131276970
saved_lstm/lstm_training_curves.png ADDED

Git LFS Details

  • SHA256: 7541582e6aaa9ab04826f862fe4c2eb178a7cc0c428aecc6609ed6a131817339
  • Pointer size: 131 Bytes
  • Size of remote file: 206 kB
saved_lstm/model_config.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ca75bdd6914a57731e2aaa0d62a94e31263e92108d1cc4357f701a2bb92a7e7
3
+ size 132
saved_lstm/pii_lstm_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7abbce5ad1109cfbade6c4d12d2ae3fc4247e187287d8ca270603e4767613ae
3
+ size 42936702
saved_lstm/vocabularies.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9242868bc0aaefd68419706becdee2cf7336799c047886e1e1639af8e1726978
3
+ size 1996397
saved_transformer/best_transformer_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:921e9615e48200b1970f359b8e8343d310f3125d7f070e37b666d38b687e0778
3
+ size 229015868
saved_transformer/model_config.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1caf1f3dd5bcc8ca70bff3443223a0a68636d3f3da5c32897170998e2ca0bc83
3
+ size 132
saved_transformer/pii_transformer_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b31abf1d79f3cd02870dfd47eb3deef0d4456d20023b906bd55c0124cd374b3c
3
+ size 75867152
saved_transformer/transformer_training_curves.png ADDED

Git LFS Details

  • SHA256: 0e4aa3d3f521646f300a70f2b189abc66c3d5c0c2bae7eb920d4ce57f3a24b50
  • Pointer size: 131 Bytes
  • Size of remote file: 191 kB
saved_transformer/vocabularies.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9242868bc0aaefd68419706becdee2cf7336799c047886e1e1639af8e1726978
3
+ size 1996397
train.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8276cd44f3b2eb357dfb405b3c5d8e9f821388e984cbf66e92e7df03f1b13117
3
+ size 109496478
train_augmented.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f195815ab3d4b50ec302f6fc4ab07770c440608054fab9a83136229c0b723e8
3
+ size 59487171
transformer.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import math
5
+
6
def scaled_dot_product_attention(q, k, v, mask=None, dropout=None):
    """Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V.

    Args:
        q: queries, shape (batch, heads, len_q, d_k).
        k: keys, shape (batch, heads, len_k, d_k).
        v: values, shape (batch, heads, len_k, d_v).
        mask: optional tensor broadcastable to the score matrix; positions
            where it equals 0 are excluded from attention.
        dropout: optional nn.Dropout module applied to the attention weights.

    Returns:
        Tuple of (attended values, attention weights), shaped
        (batch, heads, len_q, d_v) and (batch, heads, len_q, len_k).
    """
    scale = math.sqrt(q.size(-1))

    # Similarity of every query against every key, scaled to keep
    # softmax gradients well-behaved for large d_k.
    scores = q.matmul(k.transpose(-2, -1)) / scale

    if mask is not None:
        # -inf scores receive zero probability mass after softmax.
        scores = scores.masked_fill(mask == 0, float('-inf'))

    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return weights.matmul(v), weights
41
+
42
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Projects the inputs into `num_heads` subspaces of size d_model/num_heads,
    attends in each head independently, then concatenates and re-projects.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # per-head feature size

        # Learned projections for queries, keys, values, and output.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected, batch_size, seq_len):
        # (batch, seq, d_model) -> (batch, heads, seq, d_k)
        return projected.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """
        Args:
            query: (batch, len_q, d_model)
            key:   (batch, len_k, d_model)
            value: (batch, len_k, d_model)
            mask:  broadcastable attention mask (0 = ignore) or None

        Returns:
            output: (batch, len_q, d_model)
            attention_weights: (batch, heads, len_q, len_k)
        """
        batch_size = query.size(0)

        q = self._split_heads(self.w_q(query), batch_size, query.size(1))
        k = self._split_heads(self.w_k(key), batch_size, key.size(1))
        v = self._split_heads(self.w_v(value), batch_size, value.size(1))

        # Scaled dot-product attention, computed per head in one batch.
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attention_weights = self.dropout(F.softmax(scores, dim=-1))
        attended = torch.matmul(attention_weights, v)

        # Merge heads back: (batch, heads, len_q, d_k) -> (batch, len_q, d_model)
        attended = attended.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        return self.w_o(attended), attention_weights
99
+
100
class PositionwiseFeedForward(nn.Module):
    """Two-layer MLP applied independently at each sequence position."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)   # expand to inner dimension
        self.w_2 = nn.Linear(d_ff, d_model)   # project back to model dimension
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, x):
        """FFN(x) = W2(dropout(relu(W1 x))).

        Args:
            x: (batch, seq_len, d_model)

        Returns:
            Tensor of the same shape (batch, seq_len, d_model).
        """
        hidden = self.activation(self.w_1(x))
        return self.w_2(self.dropout(hidden))
119
+
120
class EncoderLayer(nn.Module):
    """One post-norm transformer encoder block.

    Self-attention and a position-wise feed-forward network, each wrapped
    in a residual connection followed by LayerNorm.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: (batch, seq_len, d_model)
            mask: broadcastable attention mask (0 = ignore) or None

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        # Sub-layer 1: self-attention + residual + norm.
        attended, _ = self.self_attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: feed-forward + residual + norm.
        x = self.norm2(x + self.dropout(self.feed_forward(x)))
        return x
157
+
158
class TransformerEncoder(nn.Module):
    """A stack of identical EncoderLayers followed by a final LayerNorm."""

    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Run the input through every encoder layer, then normalize.

        Args:
            x: (batch, seq_len, d_model)
            mask: broadcastable attention mask (0 = ignore) or None

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        for encoder_layer in self.layers:
            x = encoder_layer(x, mask)
        return self.norm(x)
184
+
185
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Even feature indices carry sin(pos / 10000^(2i/d_model)); odd indices
    carry the matching cosine. The table is precomputed up to max_len.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        frequencies = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(positions * frequencies)
        # An odd d_model has one fewer cosine column than sine column.
        if d_model % 2 == 1:
            pe[:, 1::2] = torch.cos(positions * frequencies[:-1])
        else:
            pe[:, 1::2] = torch.cos(positions * frequencies)

        # Buffer: moves with .to(device) and is saved in state_dict,
        # but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the first seq_len positional vectors to x, then apply dropout.

        Args:
            x: (batch, seq_len, d_model)

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        return self.dropout(x + self.pe[:, :x.size(1), :])
224
+
225
class TransformerPII(nn.Module):
    """
    Encoder-only transformer for token-level PII classification.

    Every input token receives a logit vector over `num_classes` labels.
    Built from scratch (embedding + sinusoidal positions + encoder stack
    + linear classification head).
    """

    def __init__(self, vocab_size, num_classes, d_model=256, num_heads=8,
                 d_ff=512, num_layers=4, dropout=0.1, max_len=512, pad_idx=0):
        super(TransformerPII, self).__init__()

        self.d_model = d_model
        self.pad_idx = pad_idx

        # NOTE: attribute names/creation order are kept stable so existing
        # state_dict checkpoints keep loading.
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        self.positional_encoding = PositionalEncoding(d_model, max_len, dropout)
        self.encoder = TransformerEncoder(num_layers, d_model, num_heads, d_ff, dropout)
        self.classifier = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(dropout)

        self._init_weights()

    def _init_weights(self):
        """Scaled-normal embeddings (padding row zeroed), Xavier classifier."""
        nn.init.normal_(self.embedding.weight, mean=0, std=self.d_model**-0.5)
        if self.pad_idx is not None:
            nn.init.constant_(self.embedding.weight[self.pad_idx], 0)

        nn.init.xavier_uniform_(self.classifier.weight)
        if self.classifier.bias is not None:
            nn.init.constant_(self.classifier.bias, 0)

    def create_padding_mask(self, x):
        """Build an attention mask from padding positions.

        Args:
            x: (batch, seq_len) token indices.

        Returns:
            Float mask of shape (batch, 1, 1, seq_len): 1 for real tokens,
            0 for padding.
        """
        return (x != self.pad_idx).unsqueeze(1).unsqueeze(2).float()

    def forward(self, x, mask=None):
        """Compute per-token classification logits.

        Args:
            x: (batch, seq_len) token indices.
            mask: optional custom attention mask; derived from padding
                positions when omitted.

        Returns:
            Logits of shape (batch, seq_len, num_classes).

        Raises:
            ValueError: if x is not a 2-D tensor.
        """
        if x.dim() != 2:
            raise ValueError(f"Expected input to have 2 dimensions [batch_size, seq_len], got {x.dim()}")

        if mask is None:
            mask = self.create_padding_mask(x)

        # Scale embeddings by sqrt(d_model), per the original transformer.
        embedded = self.embedding(x) * math.sqrt(self.d_model)
        encoded = self.encoder(self.positional_encoding(embedded), mask)
        return self.classifier(self.dropout(encoded))

    def predict(self, x):
        """Argmax class index per token, in eval mode without gradients.

        Args:
            x: (batch, seq_len) token indices.

        Returns:
            Predicted class indices of shape (batch, seq_len).
        """
        self.eval()
        with torch.no_grad():
            return torch.argmax(self.forward(x), dim=-1)
335
+
336
class TransformerPIIWithCRF(TransformerPII):
    """
    TransformerPII variant reserved for an optional CRF decoding layer.

    The CRF is not wired in (it would require the pytorch-crf package);
    until then this behaves exactly like TransformerPII and returns the
    raw emission logits.
    """

    def __init__(self, vocab_size, num_classes, d_model=256, num_heads=8,
                 d_ff=512, num_layers=4, dropout=0.1, max_len=512, pad_idx=0):
        super(TransformerPIIWithCRF, self).__init__(
            vocab_size, num_classes, d_model, num_heads,
            d_ff, num_layers, dropout, max_len, pad_idx
        )
        # Placeholder for the CRF layer:
        # from torchcrf import CRF
        # self.crf = CRF(num_classes, batch_first=True)

    def forward(self, x, labels=None):
        """Return emission logits; CRF loss/decoding paths are stubs."""
        emissions = super().forward(x)

        if labels is not None:
            # CRF training path (stub):
            # mask = (x != self.pad_idx)
            # return -self.crf(emissions, labels, mask=mask)
            pass
        else:
            # CRF decoding path (stub):
            # mask = (x != self.pad_idx)
            # return self.crf.decode(emissions, mask=mask)
            pass

        return emissions
372
+
373
def create_transformer_pii_model(vocab_size, num_classes, d_model=256, num_heads=8,
                                 d_ff=512, num_layers=4, dropout=0.1, max_len=512):
    """
    Build a TransformerPII configured for PII token classification.

    Args:
        vocab_size: vocabulary size for the embedding table.
        num_classes: number of PII label classes.
        d_model: hidden/model dimension.
        num_heads: attention heads per encoder layer.
        d_ff: feed-forward inner dimension.
        num_layers: number of encoder layers.
        dropout: dropout probability.
        max_len: maximum supported sequence length.

    Returns:
        A freshly initialized TransformerPII instance (padding index
        fixed at 0, matching the vocabulary's <pad> token).
    """
    return TransformerPII(
        vocab_size=vocab_size,
        num_classes=num_classes,
        d_model=d_model,
        num_heads=num_heads,
        d_ff=d_ff,
        num_layers=num_layers,
        dropout=dropout,
        max_len=max_len,
        pad_idx=0,
    )
transformer_training.ipynb ADDED
@@ -0,0 +1,1319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "216181fb",
7
+ "metadata": {
8
+ "execution": {
9
+ "iopub.execute_input": "2025-08-03T16:54:32.135992Z",
10
+ "iopub.status.busy": "2025-08-03T16:54:32.135203Z",
11
+ "iopub.status.idle": "2025-08-03T16:54:44.757081Z",
12
+ "shell.execute_reply": "2025-08-03T16:54:44.756283Z"
13
+ },
14
+ "papermill": {
15
+ "duration": 12.627911,
16
+ "end_time": "2025-08-03T16:54:44.758473",
17
+ "exception": false,
18
+ "start_time": "2025-08-03T16:54:32.130562",
19
+ "status": "completed"
20
+ },
21
+ "tags": []
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "import torch\n",
26
+ "import torch.nn as nn\n",
27
+ "import torch.optim as optim\n",
28
+ "from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler\n",
29
+ "from torch.nn.utils.rnn import pad_sequence\n",
30
+ "import pandas as pd\n",
31
+ "import numpy as np\n",
32
+ "from sklearn.model_selection import train_test_split\n",
33
+ "from collections import Counter\n",
34
+ "import pickle\n",
35
+ "from tqdm import tqdm\n",
36
+ "import matplotlib.pyplot as plt\n",
37
+ "import os\n",
38
+ "from datetime import datetime\n",
39
+ "from transformer import create_transformer_pii_model\n",
40
+ "from data_augmentation import calculate_class_weights"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 4,
46
+ "id": "ff1782dd",
47
+ "metadata": {
48
+ "execution": {
49
+ "iopub.execute_input": "2025-08-03T16:54:44.767637Z",
50
+ "iopub.status.busy": "2025-08-03T16:54:44.766805Z",
51
+ "iopub.status.idle": "2025-08-03T16:54:44.774888Z",
52
+ "shell.execute_reply": "2025-08-03T16:54:44.774045Z"
53
+ },
54
+ "papermill": {
55
+ "duration": 0.013734,
56
+ "end_time": "2025-08-03T16:54:44.776187",
57
+ "exception": false,
58
+ "start_time": "2025-08-03T16:54:44.762453",
59
+ "status": "completed"
60
+ },
61
+ "tags": []
62
+ },
63
+ "outputs": [],
64
+ "source": [
65
+ "class Vocabulary:\n",
66
+ " \"\"\"Vocabulary class for encoding/decoding text and labels\"\"\"\n",
67
+ " def __init__(self, max_size=100000):\n",
68
+ " self.word2idx = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}\n",
69
+ " self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<start>', 3: '<end>'}\n",
70
+ " self.word_count = Counter()\n",
71
+ " self.max_size = max_size\n",
72
+ " \n",
73
+ " def add_sentence(self, sentence):\n",
74
+ " for word in sentence:\n",
75
+ " self.word_count[word.lower()] += 1\n",
76
+ " \n",
77
+ " def build(self):\n",
78
+ " most_common = self.word_count.most_common(self.max_size - len(self.word2idx))\n",
79
+ " for word, _ in most_common:\n",
80
+ " if word not in self.word2idx:\n",
81
+ " idx = len(self.word2idx)\n",
82
+ " self.word2idx[word] = idx\n",
83
+ " self.idx2word[idx] = word\n",
84
+ " \n",
85
+ " def __len__(self):\n",
86
+ " return len(self.word2idx)\n",
87
+ " \n",
88
+ " def encode(self, sentence):\n",
89
+ " return [self.word2idx.get(word.lower(), self.word2idx['<unk>']) for word in sentence]\n",
90
+ " \n",
91
+ " def decode(self, indices):\n",
92
+ " return [self.idx2word.get(idx, '<unk>') for idx in indices]"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 5,
98
+ "id": "5b2b46d6",
99
+ "metadata": {
100
+ "execution": {
101
+ "iopub.execute_input": "2025-08-03T16:54:44.785061Z",
102
+ "iopub.status.busy": "2025-08-03T16:54:44.784479Z",
103
+ "iopub.status.idle": "2025-08-03T16:54:44.790645Z",
104
+ "shell.execute_reply": "2025-08-03T16:54:44.790095Z"
105
+ },
106
+ "papermill": {
107
+ "duration": 0.011749,
108
+ "end_time": "2025-08-03T16:54:44.791761",
109
+ "exception": false,
110
+ "start_time": "2025-08-03T16:54:44.780012",
111
+ "status": "completed"
112
+ },
113
+ "tags": []
114
+ },
115
+ "outputs": [],
116
+ "source": [
117
+ "class PIIDataset(Dataset):\n",
118
+ " \"\"\"PyTorch Dataset for PII detection\"\"\"\n",
119
+ " def __init__(self, tokens, labels, text_vocab, label_vocab, max_len=512):\n",
120
+ " self.tokens = tokens\n",
121
+ " self.labels = labels\n",
122
+ " self.text_vocab = text_vocab\n",
123
+ " self.label_vocab = label_vocab\n",
124
+ " self.max_len = max_len\n",
125
+ " \n",
126
+ " def __len__(self):\n",
127
+ " return len(self.tokens)\n",
128
+ " \n",
129
+ " def __getitem__(self, idx):\n",
130
+ " # Add start and end tokens\n",
131
+ " tokens = ['<start>'] + self.tokens[idx] + ['<end>']\n",
132
+ " labels = ['<start>'] + self.labels[idx] + ['<end>']\n",
133
+ " \n",
134
+ " # Truncate if too long\n",
135
+ " if len(tokens) > self.max_len:\n",
136
+ " tokens = tokens[:self.max_len-1] + ['<end>']\n",
137
+ " labels = labels[:self.max_len-1] + ['<end>']\n",
138
+ " \n",
139
+ " # Encode\n",
140
+ " token_ids = self.text_vocab.encode(tokens)\n",
141
+ " label_ids = self.label_vocab.encode(labels)\n",
142
+ " \n",
143
+ " return torch.tensor(token_ids), torch.tensor(label_ids)"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 6,
149
+ "id": "e7ca8f8f",
150
+ "metadata": {
151
+ "execution": {
152
+ "iopub.execute_input": "2025-08-03T16:54:44.799705Z",
153
+ "iopub.status.busy": "2025-08-03T16:54:44.799475Z",
154
+ "iopub.status.idle": "2025-08-03T16:54:44.803433Z",
155
+ "shell.execute_reply": "2025-08-03T16:54:44.802870Z"
156
+ },
157
+ "papermill": {
158
+ "duration": 0.009288,
159
+ "end_time": "2025-08-03T16:54:44.804692",
160
+ "exception": false,
161
+ "start_time": "2025-08-03T16:54:44.795404",
162
+ "status": "completed"
163
+ },
164
+ "tags": []
165
+ },
166
+ "outputs": [],
167
+ "source": [
168
+ "def collate_fn(batch):\n",
169
+ " \"\"\"Custom collate function for padding sequences\"\"\"\n",
170
+ " tokens, labels = zip(*batch)\n",
171
+ " tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=0)\n",
172
+ " labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)\n",
173
+ " return tokens_padded, labels_padded"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 7,
179
+ "id": "85b32e21",
180
+ "metadata": {
181
+ "execution": {
182
+ "iopub.execute_input": "2025-08-03T16:54:44.813147Z",
183
+ "iopub.status.busy": "2025-08-03T16:54:44.812906Z",
184
+ "iopub.status.idle": "2025-08-03T16:54:44.823227Z",
185
+ "shell.execute_reply": "2025-08-03T16:54:44.822443Z"
186
+ },
187
+ "papermill": {
188
+ "duration": 0.016244,
189
+ "end_time": "2025-08-03T16:54:44.824490",
190
+ "exception": false,
191
+ "start_time": "2025-08-03T16:54:44.808246",
192
+ "status": "completed"
193
+ },
194
+ "tags": []
195
+ },
196
+ "outputs": [],
197
+ "source": [
198
+ "class F1ScoreMetric:\n",
199
+ " \"\"\"Custom F1 score metric with beta parameter\"\"\"\n",
200
+ " def __init__(self, beta=5, num_classes=20, ignore_index=0, label_vocab=None):\n",
201
+ " self.beta = beta\n",
202
+ " self.num_classes = num_classes\n",
203
+ " self.ignore_index = ignore_index\n",
204
+ " self.label_vocab = label_vocab\n",
205
+ " self.reset()\n",
206
+ " \n",
207
+ " def reset(self):\n",
208
+ " self.true_positives = 0\n",
209
+ " self.false_positives = 0\n",
210
+ " self.false_negatives = 0\n",
211
+ " self.class_metrics = {}\n",
212
+ " \n",
213
+ " def update(self, predictions, targets):\n",
214
+ " mask = (targets != self.ignore_index) & (targets != 2) & (targets != 3)\n",
215
+ " o_idx = self.label_vocab.word2idx.get('o', -1) if self.label_vocab else -1\n",
216
+ " \n",
217
+ " for class_id in range(1, self.num_classes):\n",
218
+ " if class_id == o_idx:\n",
219
+ " continue\n",
220
+ " \n",
221
+ " pred_mask = (predictions == class_id) & mask\n",
222
+ " true_mask = (targets == class_id) & mask\n",
223
+ " \n",
224
+ " tp = ((pred_mask) & (true_mask)).sum().item()\n",
225
+ " fp = ((pred_mask) & (~true_mask)).sum().item()\n",
226
+ " fn = ((~pred_mask) & (true_mask)).sum().item()\n",
227
+ " \n",
228
+ " self.true_positives += tp\n",
229
+ " self.false_positives += fp\n",
230
+ " self.false_negatives += fn\n",
231
+ " \n",
232
+ " if class_id not in self.class_metrics:\n",
233
+ " self.class_metrics[class_id] = {'tp': 0, 'fp': 0, 'fn': 0}\n",
234
+ " self.class_metrics[class_id]['tp'] += tp\n",
235
+ " self.class_metrics[class_id]['fp'] += fp\n",
236
+ " self.class_metrics[class_id]['fn'] += fn\n",
237
+ " \n",
238
+ " def compute(self):\n",
239
+ " beta_squared = self.beta ** 2\n",
240
+ " precision = self.true_positives / (self.true_positives + self.false_positives + 1e-8)\n",
241
+ " recall = self.true_positives / (self.true_positives + self.false_negatives + 1e-8)\n",
242
+ " f1 = (1 + beta_squared) * precision * recall / (beta_squared * precision + recall + 1e-8)\n",
243
+ " return f1\n",
244
+ " \n",
245
+ " def get_class_metrics(self):\n",
246
+ " results = {}\n",
247
+ " for class_id, metrics in self.class_metrics.items():\n",
248
+ " if self.label_vocab and class_id in self.label_vocab.idx2word:\n",
249
+ " class_name = self.label_vocab.idx2word[class_id]\n",
250
+ " precision = metrics['tp'] / (metrics['tp'] + metrics['fp'] + 1e-8)\n",
251
+ " recall = metrics['tp'] / (metrics['tp'] + metrics['fn'] + 1e-8)\n",
252
+ " f1 = 2 * precision * recall / (precision + recall + 1e-8)\n",
253
+ " results[class_name] = {\n",
254
+ " 'precision': precision,\n",
255
+ " 'recall': recall,\n",
256
+ " 'f1': f1,\n",
257
+ " 'support': metrics['tp'] + metrics['fn']\n",
258
+ " }\n",
259
+ " return results"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 8,
265
+ "id": "60cf16eb",
266
+ "metadata": {
267
+ "execution": {
268
+ "iopub.execute_input": "2025-08-03T16:54:44.832210Z",
269
+ "iopub.status.busy": "2025-08-03T16:54:44.831970Z",
270
+ "iopub.status.idle": "2025-08-03T16:54:44.837466Z",
271
+ "shell.execute_reply": "2025-08-03T16:54:44.836871Z"
272
+ },
273
+ "papermill": {
274
+ "duration": 0.01072,
275
+ "end_time": "2025-08-03T16:54:44.838672",
276
+ "exception": false,
277
+ "start_time": "2025-08-03T16:54:44.827952",
278
+ "status": "completed"
279
+ },
280
+ "tags": []
281
+ },
282
+ "outputs": [],
283
+ "source": [
284
+ "class FocalLoss(nn.Module):\n",
285
+ " \"\"\"Focal Loss for addressing class imbalance\"\"\"\n",
286
+ " def __init__(self, alpha=None, gamma=2.0, reduction='mean', ignore_index=-100):\n",
287
+ " super(FocalLoss, self).__init__()\n",
288
+ " self.alpha = alpha\n",
289
+ " self.gamma = gamma\n",
290
+ " self.reduction = reduction\n",
291
+ " self.ignore_index = ignore_index\n",
292
+ " \n",
293
+ " def forward(self, inputs, targets):\n",
294
+ " ce_loss = nn.functional.cross_entropy(\n",
295
+ " inputs, targets, \n",
296
+ " weight=self.alpha, \n",
297
+ " reduction='none',\n",
298
+ " ignore_index=self.ignore_index\n",
299
+ " )\n",
300
+ " \n",
301
+ " pt = torch.exp(-ce_loss)\n",
302
+ " focal_loss = (1 - pt) ** self.gamma * ce_loss\n",
303
+ " \n",
304
+ " if self.reduction == 'mean':\n",
305
+ " return focal_loss.mean()\n",
306
+ " elif self.reduction == 'sum':\n",
307
+ " return focal_loss.sum()\n",
308
+ " else:\n",
309
+ " return focal_loss"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": 9,
315
+ "id": "4e56747c",
316
+ "metadata": {
317
+ "execution": {
318
+ "iopub.execute_input": "2025-08-03T16:54:44.846907Z",
319
+ "iopub.status.busy": "2025-08-03T16:54:44.846289Z",
320
+ "iopub.status.idle": "2025-08-03T16:54:44.852363Z",
321
+ "shell.execute_reply": "2025-08-03T16:54:44.851772Z"
322
+ },
323
+ "papermill": {
324
+ "duration": 0.011242,
325
+ "end_time": "2025-08-03T16:54:44.853481",
326
+ "exception": false,
327
+ "start_time": "2025-08-03T16:54:44.842239",
328
+ "status": "completed"
329
+ },
330
+ "tags": []
331
+ },
332
+ "outputs": [],
333
+ "source": [
334
+ "def train_epoch(model, dataloader, optimizer, criterion, device, f1_metric):\n",
335
+ " \"\"\"Train for one epoch\"\"\"\n",
336
+ " model.train()\n",
337
+ " total_loss = 0\n",
338
+ " f1_metric.reset()\n",
339
+ " \n",
340
+ " progress_bar = tqdm(dataloader, desc='Training')\n",
341
+ " for batch_idx, (tokens, labels) in enumerate(progress_bar):\n",
342
+ " tokens = tokens.to(device)\n",
343
+ " labels = labels.to(device)\n",
344
+ " \n",
345
+ " # Forward pass\n",
346
+ " optimizer.zero_grad()\n",
347
+ " outputs = model(tokens)\n",
348
+ " \n",
349
+ " # Reshape for loss calculation\n",
350
+ " outputs_flat = outputs.view(-1, outputs.size(-1))\n",
351
+ " labels_flat = labels.view(-1)\n",
352
+ " \n",
353
+ " # Calculate loss and backward pass\n",
354
+ " loss = criterion(outputs_flat, labels_flat)\n",
355
+ " loss.backward()\n",
356
+ " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)\n",
357
+ " optimizer.step()\n",
358
+ " \n",
359
+ " # Update metrics\n",
360
+ " total_loss += loss.item()\n",
361
+ " predictions = torch.argmax(outputs, dim=-1)\n",
362
+ " f1_metric.update(predictions, labels)\n",
363
+ " \n",
364
+ " # Update progress bar\n",
365
+ " progress_bar.set_postfix({\n",
366
+ " 'loss': f\"{loss.item():.4f}\",\n",
367
+ " 'f1': f\"{f1_metric.compute():.4f}\"\n",
368
+ " })\n",
369
+ " \n",
370
+ " return total_loss / len(dataloader), f1_metric.compute()"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": 10,
376
+ "id": "8a2e8d19",
377
+ "metadata": {
378
+ "execution": {
379
+ "iopub.execute_input": "2025-08-03T16:54:44.860755Z",
380
+ "iopub.status.busy": "2025-08-03T16:54:44.860552Z",
381
+ "iopub.status.idle": "2025-08-03T16:54:44.865987Z",
382
+ "shell.execute_reply": "2025-08-03T16:54:44.865175Z"
383
+ },
384
+ "papermill": {
385
+ "duration": 0.010585,
386
+ "end_time": "2025-08-03T16:54:44.867309",
387
+ "exception": false,
388
+ "start_time": "2025-08-03T16:54:44.856724",
389
+ "status": "completed"
390
+ },
391
+ "tags": []
392
+ },
393
+ "outputs": [],
394
+ "source": [
395
+ "def evaluate(model, dataloader, criterion, device, f1_metric):\n",
396
+ " \"\"\"Evaluate model on validation/test set\"\"\"\n",
397
+ " model.eval()\n",
398
+ " total_loss = 0\n",
399
+ " f1_metric.reset()\n",
400
+ " \n",
401
+ " with torch.no_grad():\n",
402
+ " for tokens, labels in tqdm(dataloader, desc='Evaluating'):\n",
403
+ " tokens = tokens.to(device)\n",
404
+ " labels = labels.to(device)\n",
405
+ " \n",
406
+ " # Forward pass\n",
407
+ " outputs = model(tokens)\n",
408
+ " outputs_flat = outputs.view(-1, outputs.size(-1))\n",
409
+ " labels_flat = labels.view(-1)\n",
410
+ " \n",
411
+ " # Calculate loss\n",
412
+ " loss = criterion(outputs_flat, labels_flat)\n",
413
+ " total_loss += loss.item()\n",
414
+ " \n",
415
+ " # Update metrics\n",
416
+ " predictions = torch.argmax(outputs, dim=-1)\n",
417
+ " f1_metric.update(predictions, labels)\n",
418
+ " \n",
419
+ " return total_loss / len(dataloader), f1_metric.compute()"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": 11,
425
+ "id": "6e292ace",
426
+ "metadata": {
427
+ "execution": {
428
+ "iopub.execute_input": "2025-08-03T16:54:44.876030Z",
429
+ "iopub.status.busy": "2025-08-03T16:54:44.875513Z",
430
+ "iopub.status.idle": "2025-08-03T16:54:44.880655Z",
431
+ "shell.execute_reply": "2025-08-03T16:54:44.879870Z"
432
+ },
433
+ "papermill": {
434
+ "duration": 0.010355,
435
+ "end_time": "2025-08-03T16:54:44.881962",
436
+ "exception": false,
437
+ "start_time": "2025-08-03T16:54:44.871607",
438
+ "status": "completed"
439
+ },
440
+ "tags": []
441
+ },
442
+ "outputs": [],
443
+ "source": [
444
+ "def create_balanced_sampler(dataset, label_vocab):\n",
445
+ " \"\"\"Create a weighted sampler to balance classes during training\"\"\"\n",
446
+ " sample_weights = []\n",
447
+ " \n",
448
+ " for idx in range(len(dataset)):\n",
449
+ " _, labels = dataset[idx]\n",
450
+ " \n",
451
+ " # Give higher weight to samples with rare PII\n",
452
+ " min_weight = 1.0\n",
453
+ " for label_id in labels:\n",
454
+ " if label_id > 3: # Skip special tokens\n",
455
+ " label_name = label_vocab.idx2word.get(label_id.item(), 'O')\n",
456
+ " if label_name != 'o' and 'B-' in label_name:\n",
457
+ " min_weight = 10.0\n",
458
+ " break\n",
459
+ " \n",
460
+ " sample_weights.append(min_weight)\n",
461
+ " \n",
462
+ " sampler = WeightedRandomSampler(\n",
463
+ " weights=sample_weights,\n",
464
+ " num_samples=len(sample_weights),\n",
465
+ " replacement=True\n",
466
+ " )\n",
467
+ " \n",
468
+ " return sampler\n"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": 12,
474
+ "id": "857335cb",
475
+ "metadata": {
476
+ "execution": {
477
+ "iopub.execute_input": "2025-08-03T16:54:44.889690Z",
478
+ "iopub.status.busy": "2025-08-03T16:54:44.889472Z",
479
+ "iopub.status.idle": "2025-08-03T16:54:44.894459Z",
480
+ "shell.execute_reply": "2025-08-03T16:54:44.893888Z"
481
+ },
482
+ "papermill": {
483
+ "duration": 0.010295,
484
+ "end_time": "2025-08-03T16:54:44.895625",
485
+ "exception": false,
486
+ "start_time": "2025-08-03T16:54:44.885330",
487
+ "status": "completed"
488
+ },
489
+ "tags": []
490
+ },
491
+ "outputs": [],
492
+ "source": [
493
+ "def print_label_distribution(data, title=\"Label Distribution\"):\n",
494
+ " \"\"\"Print label distribution statistics\"\"\"\n",
495
+ " label_counts = Counter()\n",
496
+ " for label_seq in data.labels:\n",
497
+ " for label in label_seq:\n",
498
+ " if label not in ['<pad>', '<start>', '<end>']:\n",
499
+ " label_counts[label] += 1\n",
500
+ " \n",
501
+ " print(f\"\\n{title}:\")\n",
502
+ " print(\"-\" * 50)\n",
503
+ " total = sum(label_counts.values())\n",
504
+ " for label, count in label_counts.most_common():\n",
505
+ " percentage = (count / total) * 100\n",
506
+ " print(f\" {label:20} : {count:8,} ({percentage:5.2f}%)\")\n",
507
+ " print(\"-\" * 50)\n",
508
+ " print(f\" {'Total':20} : {total:8,}\")"
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": 13,
514
+ "id": "1738f8a9",
515
+ "metadata": {
516
+ "execution": {
517
+ "iopub.execute_input": "2025-08-03T16:54:44.903649Z",
518
+ "iopub.status.busy": "2025-08-03T16:54:44.903207Z",
519
+ "iopub.status.idle": "2025-08-03T16:54:44.908673Z",
520
+ "shell.execute_reply": "2025-08-03T16:54:44.908076Z"
521
+ },
522
+ "papermill": {
523
+ "duration": 0.010714,
524
+ "end_time": "2025-08-03T16:54:44.909864",
525
+ "exception": false,
526
+ "start_time": "2025-08-03T16:54:44.899150",
527
+ "status": "completed"
528
+ },
529
+ "tags": []
530
+ },
531
+ "outputs": [],
532
+ "source": [
533
+ "def save_model(model, text_vocab, label_vocab, config, save_dir):\n",
534
+ " \"\"\"Save model and all necessary components for Flask deployment\"\"\"\n",
535
+ " os.makedirs(save_dir, exist_ok=True)\n",
536
+ " \n",
537
+ " # Save model state\n",
538
+ " model_path = os.path.join(save_dir, 'pii_transformer_model.pt')\n",
539
+ " torch.save(model.state_dict(), model_path)\n",
540
+ " \n",
541
+ " # Save vocabularies\n",
542
+ " vocab_path = os.path.join(save_dir, 'vocabularies.pkl')\n",
543
+ " with open(vocab_path, 'wb') as f:\n",
544
+ " pickle.dump({\n",
545
+ " 'text_vocab': text_vocab,\n",
546
+ " 'label_vocab': label_vocab\n",
547
+ " }, f)\n",
548
+ " \n",
549
+ " # Save model configuration\n",
550
+ " config_path = os.path.join(save_dir, 'model_config.pkl')\n",
551
+ " with open(config_path, 'wb') as f:\n",
552
+ " pickle.dump(config, f)\n",
553
+ " \n",
554
+ " print(f\"\\nModel saved for deployment in '{save_dir}/' directory\")\n",
555
+ " print(\"Files saved:\")\n",
556
+ " print(f\" - {model_path}\")\n",
557
+ " print(f\" - {vocab_path}\")\n",
558
+ " print(f\" - {config_path}\")"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": 14,
564
+ "id": "d93e7c25",
565
+ "metadata": {
566
+ "execution": {
567
+ "iopub.execute_input": "2025-08-03T16:54:44.917693Z",
568
+ "iopub.status.busy": "2025-08-03T16:54:44.917438Z",
569
+ "iopub.status.idle": "2025-08-03T16:54:44.933820Z",
570
+ "shell.execute_reply": "2025-08-03T16:54:44.933284Z"
571
+ },
572
+ "papermill": {
573
+ "duration": 0.021776,
574
+ "end_time": "2025-08-03T16:54:44.935035",
575
+ "exception": false,
576
+ "start_time": "2025-08-03T16:54:44.913259",
577
+ "status": "completed"
578
+ },
579
+ "tags": []
580
+ },
581
+ "outputs": [],
582
+ "source": [
583
+ "def train_transformer_pii_model(\n",
584
+ " data_path,\n",
585
+ " num_epochs=30,\n",
586
+ " batch_size=32,\n",
587
+ " learning_rate=2e-4,\n",
588
+ " use_focal_loss=True,\n",
589
+ " focal_gamma=2.0,\n",
590
+ " device='cuda',\n",
591
+ "):\n",
592
+ " \"\"\"Main training function\"\"\"\n",
593
+ " \n",
594
+ " # Load data\n",
595
+ " print(\"Loading augmented data...\")\n",
596
+ " data = pd.read_json(data_path, lines=True)\n",
597
+ " print(f\"Total samples: {len(data)}\")\n",
598
+ " \n",
599
+ " # Print initial label distribution\n",
600
+ " print_label_distribution(data, \"Label Distribution in Augmented Data\")\n",
601
+ " \n",
602
+ " # Build vocabularies\n",
603
+ " print(\"\\nBuilding vocabularies...\")\n",
604
+ " text_vocab = Vocabulary(max_size=100000)\n",
605
+ " label_vocab = Vocabulary(max_size=50)\n",
606
+ " \n",
607
+ " for tokens in data.tokens:\n",
608
+ " text_vocab.add_sentence(tokens)\n",
609
+ " for labels in data.labels:\n",
610
+ " label_vocab.add_sentence(labels)\n",
611
+ " \n",
612
+ " text_vocab.build()\n",
613
+ " label_vocab.build()\n",
614
+ " \n",
615
+ " # Calculate class weights\n",
616
+ " class_weights = calculate_class_weights(data, label_vocab)\n",
617
+ " class_weights = class_weights.to(device)\n",
618
+ " \n",
619
+ " # Split data\n",
620
+ " X_train, X_val, y_train, y_val = train_test_split(\n",
621
+ " data.tokens.tolist(),\n",
622
+ " data.labels.tolist(),\n",
623
+ " test_size=0.2,\n",
624
+ " random_state=42\n",
625
+ " )\n",
626
+ " \n",
627
+ " print(f\"\\nData split:\")\n",
628
+ " print(f\" - Train samples: {len(X_train):,}\")\n",
629
+ " print(f\" - Validation samples: {len(X_val):,}\")\n",
630
+ " \n",
631
+ " # Create datasets and dataloaders\n",
632
+ " max_seq_len = 512\n",
633
+ " train_dataset = PIIDataset(X_train, y_train, text_vocab, label_vocab, max_len=max_seq_len)\n",
634
+ " val_dataset = PIIDataset(X_val, y_val, text_vocab, label_vocab, max_len=max_seq_len)\n",
635
+ " \n",
636
+ " train_sampler = create_balanced_sampler(train_dataset, label_vocab)\n",
637
+ " \n",
638
+ " train_loader = DataLoader(\n",
639
+ " train_dataset, \n",
640
+ " batch_size=batch_size,\n",
641
+ " sampler=train_sampler,\n",
642
+ " collate_fn=collate_fn,\n",
643
+ " num_workers=0\n",
644
+ " )\n",
645
+ " \n",
646
+ " val_loader = DataLoader(\n",
647
+ " val_dataset, \n",
648
+ " batch_size=batch_size,\n",
649
+ " shuffle=False, \n",
650
+ " collate_fn=collate_fn,\n",
651
+ " num_workers=0\n",
652
+ " )\n",
653
+ " \n",
654
+ " # Model configuration\n",
655
+ " model_config = {\n",
656
+ " 'vocab_size': len(text_vocab),\n",
657
+ " 'num_classes': len(label_vocab),\n",
658
+ " 'd_model': 256,\n",
659
+ " 'num_heads': 8,\n",
660
+ " 'd_ff': 512,\n",
661
+ " 'num_layers': 4,\n",
662
+ " 'dropout': 0.1,\n",
663
+ " 'max_len': max_seq_len\n",
664
+ " }\n",
665
+ " \n",
666
+ " # Create model\n",
667
+ " print(\"\\nCreating model...\")\n",
668
+ " model = create_transformer_pii_model(**model_config).to(device)\n",
669
+ " print(f\"Model parameters: {sum(p.numel() for p in model.parameters()):,}\")\n",
670
+ " \n",
671
+ " # Setup loss function\n",
672
+ " if use_focal_loss:\n",
673
+ " criterion = FocalLoss(\n",
674
+ " alpha=class_weights,\n",
675
+ " gamma=focal_gamma,\n",
676
+ " ignore_index=0\n",
677
+ " )\n",
678
+ " else:\n",
679
+ " criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=0)\n",
680
+ " \n",
681
+ " # Setup optimizer and scheduler\n",
682
+ " optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)\n",
683
+ " scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)\n",
684
+ " \n",
685
+ " # Metrics\n",
686
+ " f1_metric_train = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
687
+ " f1_metric_val = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
688
+ " \n",
689
+ " # Training loop\n",
690
+ " train_losses, train_f1s, val_losses, val_f1s = [], [], [], []\n",
691
+ " best_val_f1 = 0\n",
692
+ " patience = 5\n",
693
+ " patience_counter = 0\n",
694
+ " \n",
695
+ " print(\"\\nStarting training...\")\n",
696
+ " print(\"=\" * 60)\n",
697
+ " \n",
698
+ " for epoch in range(num_epochs):\n",
699
+ " print(f\"\\nEpoch {epoch+1}/{num_epochs}\")\n",
700
+ " \n",
701
+ " # Train and validate\n",
702
+ " train_loss, train_f1 = train_epoch(\n",
703
+ " model, train_loader, optimizer, criterion, device, f1_metric_train\n",
704
+ " )\n",
705
+ " val_loss, val_f1 = evaluate(\n",
706
+ " model, val_loader, criterion, device, f1_metric_val\n",
707
+ " )\n",
708
+ " \n",
709
+ " # Step scheduler\n",
710
+ " scheduler.step(val_loss)\n",
711
+ " \n",
712
+ " # Store metrics\n",
713
+ " train_losses.append(train_loss)\n",
714
+ " train_f1s.append(train_f1)\n",
715
+ " val_losses.append(val_loss)\n",
716
+ " val_f1s.append(val_f1)\n",
717
+ " \n",
718
+ " # Print epoch results\n",
719
+ " print(f\"Train Loss: {train_loss:.4f}, Train F1: {train_f1:.4f}\")\n",
720
+ " print(f\"Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}\")\n",
721
+ " print(f\"Learning rate: {optimizer.param_groups[0]['lr']:.6f}\")\n",
722
+ " \n",
723
+ " # Save best model\n",
724
+ " if val_f1 > best_val_f1:\n",
725
+ " best_val_f1 = val_f1\n",
726
+ " patience_counter = 0\n",
727
+ " \n",
728
+ " # Save complete checkpoint\n",
729
+ " torch.save({\n",
730
+ " 'epoch': epoch,\n",
731
+ " 'model_state_dict': model.state_dict(),\n",
732
+ " 'optimizer_state_dict': optimizer.state_dict(),\n",
733
+ " 'train_loss': train_loss,\n",
734
+ " 'val_loss': val_loss,\n",
735
+ " 'train_f1': train_f1,\n",
736
+ " 'val_f1': val_f1,\n",
737
+ " 'text_vocab': text_vocab,\n",
738
+ " 'label_vocab': label_vocab,\n",
739
+ " 'model_config': model_config\n",
740
+ " }, 'best_transformer_checkpoint.pt')\n",
741
+ " \n",
742
+ " print(f\"Saved best model with F1: {val_f1:.4f}\")\n",
743
+ " else:\n",
744
+ " patience_counter += 1\n",
745
+ " \n",
746
+ " # Early stopping\n",
747
+ " if patience_counter >= patience and epoch > 10:\n",
748
+ " print(f\"\\nEarly stopping triggered after {patience} epochs without improvement\")\n",
749
+ " break\n",
750
+ " \n",
751
+ " # Plot training curves\n",
752
+ " plt.figure(figsize=(12, 5))\n",
753
+ " \n",
754
+ " plt.subplot(1, 2, 1)\n",
755
+ " plt.plot(train_losses, label='Train Loss', linewidth=2)\n",
756
+ " plt.plot(val_losses, label='Val Loss', linewidth=2)\n",
757
+ " plt.xlabel('Epoch')\n",
758
+ " plt.ylabel('Loss')\n",
759
+ " plt.title('Training and Validation Loss')\n",
760
+ " plt.legend()\n",
761
+ " plt.grid(True, alpha=0.3)\n",
762
+ " \n",
763
+ " plt.subplot(1, 2, 2)\n",
764
+ " plt.plot(train_f1s, label='Train F1', linewidth=2)\n",
765
+ " plt.plot(val_f1s, label='Val F1', linewidth=2)\n",
766
+ " plt.xlabel('Epoch')\n",
767
+ " plt.ylabel('F1 Score')\n",
768
+ " plt.title('Training and Validation F1 Score')\n",
769
+ " plt.legend()\n",
770
+ " plt.grid(True, alpha=0.3)\n",
771
+ " \n",
772
+ " plt.tight_layout()\n",
773
+ " plt.savefig('transformer_training_curves.png', dpi=300, bbox_inches='tight')\n",
774
+ " plt.close()\n",
775
+ " \n",
776
+ " print(f\"\\n{'='*60}\")\n",
777
+ " print(f\"Training completed!\")\n",
778
+ " print(f\"Best validation F1: {best_val_f1:.4f}\")\n",
779
+ " \n",
780
+ " save_model(model, text_vocab, label_vocab, model_config, 'saved_transformer_model')\n",
781
+ " \n",
782
+ " return model, text_vocab, label_vocab"
783
+ ]
784
+ },
785
+ {
786
+ "cell_type": "code",
787
+ "execution_count": null,
788
+ "id": "dbf345da",
789
+ "metadata": {
790
+ "execution": {
791
+ "iopub.execute_input": "2025-08-03T16:54:44.942669Z",
792
+ "iopub.status.busy": "2025-08-03T16:54:44.942460Z",
793
+ "iopub.status.idle": "2025-08-03T17:39:36.443370Z",
794
+ "shell.execute_reply": "2025-08-03T17:39:36.442507Z"
795
+ },
796
+ "papermill": {
797
+ "duration": 2691.506418,
798
+ "end_time": "2025-08-03T17:39:36.444814",
799
+ "exception": false,
800
+ "start_time": "2025-08-03T16:54:44.938396",
801
+ "status": "completed"
802
+ },
803
+ "tags": []
804
+ },
805
+ "outputs": [
806
+ {
807
+ "name": "stdout",
808
+ "output_type": "stream",
809
+ "text": [
810
+ "Using device: cuda\n",
811
+ "Loading augmented data...\n",
812
+ "Total samples: 19694\n",
813
+ "\n",
814
+ "Label Distribution in Augmented Data:\n",
815
+ "--------------------------------------------------\n",
816
+ " O : 5,082,150 (99.33%)\n",
817
+ " I-STREET_ADDRESS : 15,650 ( 0.31%)\n",
818
+ " B-ID_NUM : 2,505 ( 0.05%)\n",
819
+ " B-EMAIL : 2,488 ( 0.05%)\n",
820
+ " B-URL_PERSONAL : 2,478 ( 0.05%)\n",
821
+ " B-STREET_ADDRESS : 2,452 ( 0.05%)\n",
822
+ " B-PHONE_NUM : 2,450 ( 0.05%)\n",
823
+ " B-USERNAME : 2,210 ( 0.04%)\n",
824
+ " B-NAME_STUDENT : 1,968 ( 0.04%)\n",
825
+ " I-NAME_STUDENT : 1,735 ( 0.03%)\n",
826
+ " I-PHONE_NUM : 500 ( 0.01%)\n",
827
+ " I-URL_PERSONAL : 1 ( 0.00%)\n",
828
+ " I-ID_NUM : 1 ( 0.00%)\n",
829
+ "--------------------------------------------------\n",
830
+ " Total : 5,116,588\n",
831
+ "\n",
832
+ "Building vocabularies...\n",
833
+ "\n",
834
+ "Data split:\n",
835
+ " - Train samples: 15,755\n",
836
+ " - Validation samples: 3,939\n",
837
+ "\n",
838
+ "Creating model...\n",
839
+ "Model parameters: 18,828,817\n",
840
+ "\n",
841
+ "Starting training...\n",
842
+ "============================================================\n",
843
+ "\n",
844
+ "Epoch 1/20\n"
845
+ ]
846
+ },
847
+ {
848
+ "name": "stderr",
849
+ "output_type": "stream",
850
+ "text": [
851
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.06it/s, loss=0.0000, f1=0.2908]\n",
852
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.63it/s]\n"
853
+ ]
854
+ },
855
+ {
856
+ "name": "stdout",
857
+ "output_type": "stream",
858
+ "text": [
859
+ "Train Loss: 0.0001, Train F1: 0.2908\n",
860
+ "Val Loss: 0.0001, Val F1: 0.5855\n",
861
+ "Learning rate: 0.000200\n",
862
+ "Saved best model with F1: 0.5855\n",
863
+ "\n",
864
+ "Epoch 2/20\n"
865
+ ]
866
+ },
867
+ {
868
+ "name": "stderr",
869
+ "output_type": "stream",
870
+ "text": [
871
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0000, f1=0.6256]\n",
872
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.56it/s]\n"
873
+ ]
874
+ },
875
+ {
876
+ "name": "stdout",
877
+ "output_type": "stream",
878
+ "text": [
879
+ "Train Loss: 0.0000, Train F1: 0.6256\n",
880
+ "Val Loss: 0.0000, Val F1: 0.7335\n",
881
+ "Learning rate: 0.000200\n",
882
+ "Saved best model with F1: 0.7335\n",
883
+ "\n",
884
+ "Epoch 3/20\n"
885
+ ]
886
+ },
887
+ {
888
+ "name": "stderr",
889
+ "output_type": "stream",
890
+ "text": [
891
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.06it/s, loss=0.0000, f1=0.7573]\n",
892
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.55it/s]\n"
893
+ ]
894
+ },
895
+ {
896
+ "name": "stdout",
897
+ "output_type": "stream",
898
+ "text": [
899
+ "Train Loss: 0.0000, Train F1: 0.7573\n",
900
+ "Val Loss: 0.0000, Val F1: 0.7576\n",
901
+ "Learning rate: 0.000200\n",
902
+ "Saved best model with F1: 0.7576\n",
903
+ "\n",
904
+ "Epoch 4/20\n"
905
+ ]
906
+ },
907
+ {
908
+ "name": "stderr",
909
+ "output_type": "stream",
910
+ "text": [
911
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.06it/s, loss=0.0000, f1=0.8054]\n",
912
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.58it/s]\n"
913
+ ]
914
+ },
915
+ {
916
+ "name": "stdout",
917
+ "output_type": "stream",
918
+ "text": [
919
+ "Train Loss: 0.0000, Train F1: 0.8054\n",
920
+ "Val Loss: 0.0000, Val F1: 0.7756\n",
921
+ "Learning rate: 0.000200\n",
922
+ "Saved best model with F1: 0.7756\n",
923
+ "\n",
924
+ "Epoch 5/20\n"
925
+ ]
926
+ },
927
+ {
928
+ "name": "stderr",
929
+ "output_type": "stream",
930
+ "text": [
931
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0000, f1=0.8403]\n",
932
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.59it/s]\n"
933
+ ]
934
+ },
935
+ {
936
+ "name": "stdout",
937
+ "output_type": "stream",
938
+ "text": [
939
+ "Train Loss: 0.0000, Train F1: 0.8403\n",
940
+ "Val Loss: 0.0000, Val F1: 0.7872\n",
941
+ "Learning rate: 0.000200\n",
942
+ "Saved best model with F1: 0.7872\n",
943
+ "\n",
944
+ "Epoch 6/20\n"
945
+ ]
946
+ },
947
+ {
948
+ "name": "stderr",
949
+ "output_type": "stream",
950
+ "text": [
951
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0001, f1=0.8743]\n",
952
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.61it/s]\n"
953
+ ]
954
+ },
955
+ {
956
+ "name": "stdout",
957
+ "output_type": "stream",
958
+ "text": [
959
+ "Train Loss: 0.0000, Train F1: 0.8743\n",
960
+ "Val Loss: 0.0000, Val F1: 0.7695\n",
961
+ "Learning rate: 0.000200\n",
962
+ "\n",
963
+ "Epoch 7/20\n"
964
+ ]
965
+ },
966
+ {
967
+ "name": "stderr",
968
+ "output_type": "stream",
969
+ "text": [
970
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0000, f1=0.8976]\n",
971
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.61it/s]\n"
972
+ ]
973
+ },
974
+ {
975
+ "name": "stdout",
976
+ "output_type": "stream",
977
+ "text": [
978
+ "Train Loss: 0.0000, Train F1: 0.8976\n",
979
+ "Val Loss: 0.0000, Val F1: 0.8148\n",
980
+ "Learning rate: 0.000200\n",
981
+ "Saved best model with F1: 0.8148\n",
982
+ "\n",
983
+ "Epoch 8/20\n"
984
+ ]
985
+ },
986
+ {
987
+ "name": "stderr",
988
+ "output_type": "stream",
989
+ "text": [
990
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0000, f1=0.9231]\n",
991
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.62it/s]\n"
992
+ ]
993
+ },
994
+ {
995
+ "name": "stdout",
996
+ "output_type": "stream",
997
+ "text": [
998
+ "Train Loss: 0.0000, Train F1: 0.9231\n",
999
+ "Val Loss: 0.0000, Val F1: 0.8247\n",
1000
+ "Learning rate: 0.000100\n",
1001
+ "Saved best model with F1: 0.8247\n",
1002
+ "\n",
1003
+ "Epoch 9/20\n"
1004
+ ]
1005
+ },
1006
+ {
1007
+ "name": "stderr",
1008
+ "output_type": "stream",
1009
+ "text": [
1010
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0000, f1=0.9384]\n",
1011
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.60it/s]\n"
1012
+ ]
1013
+ },
1014
+ {
1015
+ "name": "stdout",
1016
+ "output_type": "stream",
1017
+ "text": [
1018
+ "Train Loss: 0.0000, Train F1: 0.9384\n",
1019
+ "Val Loss: 0.0000, Val F1: 0.8289\n",
1020
+ "Learning rate: 0.000100\n",
1021
+ "Saved best model with F1: 0.8289\n",
1022
+ "\n",
1023
+ "Epoch 10/20\n"
1024
+ ]
1025
+ },
1026
+ {
1027
+ "name": "stderr",
1028
+ "output_type": "stream",
1029
+ "text": [
1030
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0000, f1=0.9508]\n",
1031
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.61it/s]\n"
1032
+ ]
1033
+ },
1034
+ {
1035
+ "name": "stdout",
1036
+ "output_type": "stream",
1037
+ "text": [
1038
+ "Train Loss: 0.0000, Train F1: 0.9508\n",
1039
+ "Val Loss: 0.0000, Val F1: 0.8402\n",
1040
+ "Learning rate: 0.000100\n",
1041
+ "Saved best model with F1: 0.8402\n",
1042
+ "\n",
1043
+ "Epoch 11/20\n"
1044
+ ]
1045
+ },
1046
+ {
1047
+ "name": "stderr",
1048
+ "output_type": "stream",
1049
+ "text": [
1050
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.06it/s, loss=0.0000, f1=0.9544]\n",
1051
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.60it/s]\n"
1052
+ ]
1053
+ },
1054
+ {
1055
+ "name": "stdout",
1056
+ "output_type": "stream",
1057
+ "text": [
1058
+ "Train Loss: 0.0000, Train F1: 0.9544\n",
1059
+ "Val Loss: 0.0000, Val F1: 0.8414\n",
1060
+ "Learning rate: 0.000100\n",
1061
+ "Saved best model with F1: 0.8414\n",
1062
+ "\n",
1063
+ "Epoch 12/20\n"
1064
+ ]
1065
+ },
1066
+ {
1067
+ "name": "stderr",
1068
+ "output_type": "stream",
1069
+ "text": [
1070
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.06it/s, loss=0.0000, f1=0.9617]\n",
1071
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.63it/s]\n"
1072
+ ]
1073
+ },
1074
+ {
1075
+ "name": "stdout",
1076
+ "output_type": "stream",
1077
+ "text": [
1078
+ "Train Loss: 0.0000, Train F1: 0.9617\n",
1079
+ "Val Loss: 0.0001, Val F1: 0.8420\n",
1080
+ "Learning rate: 0.000050\n",
1081
+ "Saved best model with F1: 0.8420\n",
1082
+ "\n",
1083
+ "Epoch 13/20\n"
1084
+ ]
1085
+ },
1086
+ {
1087
+ "name": "stderr",
1088
+ "output_type": "stream",
1089
+ "text": [
1090
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.06it/s, loss=0.0000, f1=0.9672]\n",
1091
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.61it/s]\n"
1092
+ ]
1093
+ },
1094
+ {
1095
+ "name": "stdout",
1096
+ "output_type": "stream",
1097
+ "text": [
1098
+ "Train Loss: 0.0000, Train F1: 0.9672\n",
1099
+ "Val Loss: 0.0000, Val F1: 0.8435\n",
1100
+ "Learning rate: 0.000050\n",
1101
+ "Saved best model with F1: 0.8435\n",
1102
+ "\n",
1103
+ "Epoch 14/20\n"
1104
+ ]
1105
+ },
1106
+ {
1107
+ "name": "stderr",
1108
+ "output_type": "stream",
1109
+ "text": [
1110
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.06it/s, loss=0.0000, f1=0.9656]\n",
1111
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.59it/s]\n"
1112
+ ]
1113
+ },
1114
+ {
1115
+ "name": "stdout",
1116
+ "output_type": "stream",
1117
+ "text": [
1118
+ "Train Loss: 0.0000, Train F1: 0.9656\n",
1119
+ "Val Loss: 0.0000, Val F1: 0.8481\n",
1120
+ "Learning rate: 0.000050\n",
1121
+ "Saved best model with F1: 0.8481\n",
1122
+ "\n",
1123
+ "Epoch 15/20\n"
1124
+ ]
1125
+ },
1126
+ {
1127
+ "name": "stderr",
1128
+ "output_type": "stream",
1129
+ "text": [
1130
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.06it/s, loss=0.0000, f1=0.9683]\n",
1131
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.59it/s]\n"
1132
+ ]
1133
+ },
1134
+ {
1135
+ "name": "stdout",
1136
+ "output_type": "stream",
1137
+ "text": [
1138
+ "Train Loss: 0.0000, Train F1: 0.9683\n",
1139
+ "Val Loss: 0.0001, Val F1: 0.8483\n",
1140
+ "Learning rate: 0.000050\n",
1141
+ "Saved best model with F1: 0.8483\n",
1142
+ "\n",
1143
+ "Epoch 16/20\n"
1144
+ ]
1145
+ },
1146
+ {
1147
+ "name": "stderr",
1148
+ "output_type": "stream",
1149
+ "text": [
1150
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0000, f1=0.9719]\n",
1151
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.59it/s]\n"
1152
+ ]
1153
+ },
1154
+ {
1155
+ "name": "stdout",
1156
+ "output_type": "stream",
1157
+ "text": [
1158
+ "Train Loss: 0.0000, Train F1: 0.9719\n",
1159
+ "Val Loss: 0.0001, Val F1: 0.8503\n",
1160
+ "Learning rate: 0.000025\n",
1161
+ "Saved best model with F1: 0.8503\n",
1162
+ "\n",
1163
+ "Epoch 17/20\n"
1164
+ ]
1165
+ },
1166
+ {
1167
+ "name": "stderr",
1168
+ "output_type": "stream",
1169
+ "text": [
1170
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0000, f1=0.9745]\n",
1171
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.61it/s]\n"
1172
+ ]
1173
+ },
1174
+ {
1175
+ "name": "stdout",
1176
+ "output_type": "stream",
1177
+ "text": [
1178
+ "Train Loss: 0.0000, Train F1: 0.9745\n",
1179
+ "Val Loss: 0.0001, Val F1: 0.8525\n",
1180
+ "Learning rate: 0.000025\n",
1181
+ "Saved best model with F1: 0.8525\n",
1182
+ "\n",
1183
+ "Epoch 18/20\n"
1184
+ ]
1185
+ },
1186
+ {
1187
+ "name": "stderr",
1188
+ "output_type": "stream",
1189
+ "text": [
1190
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0000, f1=0.9757]\n",
1191
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.61it/s]\n"
1192
+ ]
1193
+ },
1194
+ {
1195
+ "name": "stdout",
1196
+ "output_type": "stream",
1197
+ "text": [
1198
+ "Train Loss: 0.0000, Train F1: 0.9757\n",
1199
+ "Val Loss: 0.0001, Val F1: 0.8500\n",
1200
+ "Learning rate: 0.000025\n",
1201
+ "\n",
1202
+ "Epoch 19/20\n"
1203
+ ]
1204
+ },
1205
+ {
1206
+ "name": "stderr",
1207
+ "output_type": "stream",
1208
+ "text": [
1209
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.07it/s, loss=0.0000, f1=0.9780]\n",
1210
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.59it/s]\n"
1211
+ ]
1212
+ },
1213
+ {
1214
+ "name": "stdout",
1215
+ "output_type": "stream",
1216
+ "text": [
1217
+ "Train Loss: 0.0000, Train F1: 0.9780\n",
1218
+ "Val Loss: 0.0001, Val F1: 0.8508\n",
1219
+ "Learning rate: 0.000025\n",
1220
+ "\n",
1221
+ "Epoch 20/20\n"
1222
+ ]
1223
+ },
1224
+ {
1225
+ "name": "stderr",
1226
+ "output_type": "stream",
1227
+ "text": [
1228
+ "Training: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 493/493 [02:01<00:00, 4.06it/s, loss=0.0000, f1=0.9770]\n",
1229
+ "Evaluating: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:10<00:00, 11.58it/s]\n"
1230
+ ]
1231
+ },
1232
+ {
1233
+ "name": "stdout",
1234
+ "output_type": "stream",
1235
+ "text": [
1236
+ "Train Loss: 0.0000, Train F1: 0.9770\n",
1237
+ "Val Loss: 0.0001, Val F1: 0.8538\n",
1238
+ "Learning rate: 0.000013\n",
1239
+ "Saved best model with F1: 0.8538\n",
1240
+ "\n",
1241
+ "============================================================\n",
1242
+ "Training completed!\n",
1243
+ "Best validation F1: 0.8538\n",
1244
+ "\n",
1245
+ "Model saved for deployment in 'saved_transformer_model/' directory\n",
1246
+ "Files saved:\n",
1247
+ " - saved_transformer_model/pii_transformer_model.pt\n",
1248
+ " - saved_transformer_model/vocabularies.pkl\n",
1249
+ " - saved_transformer_model/model_config.pkl\n"
1250
+ ]
1251
+ }
1252
+ ],
1253
+ "source": [
1254
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
1255
+ "print(f\"Using device: {device}\")\n",
1256
+ "\n",
1257
+ "model, text_vocab, label_vocab = train_transformer_pii_model(\n",
1258
+ " data_path='train_augmented.json',\n",
1259
+ " num_epochs=20,\n",
1260
+ " batch_size=32,\n",
1261
+ " learning_rate=2e-4,\n",
1262
+ " use_focal_loss=True,\n",
1263
+ " focal_gamma=2.0,\n",
1264
+ " device=device\n",
1265
+ ")"
1266
+ ]
1267
+ }
1268
+ ],
1269
+ "metadata": {
1270
+ "kaggle": {
1271
+ "accelerator": "nvidiaTeslaT4",
1272
+ "dataSources": [
1273
+ {
1274
+ "isSourceIdPinned": true,
1275
+ "modelId": 419045,
1276
+ "modelInstanceId": 400879,
1277
+ "sourceId": 504813,
1278
+ "sourceType": "modelInstanceVersion"
1279
+ }
1280
+ ],
1281
+ "dockerImageVersionId": 31090,
1282
+ "isGpuEnabled": true,
1283
+ "isInternetEnabled": true,
1284
+ "language": "python",
1285
+ "sourceType": "notebook"
1286
+ },
1287
+ "kernelspec": {
1288
+ "display_name": "Python 3",
1289
+ "language": "python",
1290
+ "name": "python3"
1291
+ },
1292
+ "language_info": {
1293
+ "codemirror_mode": {
1294
+ "name": "ipython",
1295
+ "version": 3
1296
+ },
1297
+ "file_extension": ".py",
1298
+ "mimetype": "text/x-python",
1299
+ "name": "python",
1300
+ "nbconvert_exporter": "python",
1301
+ "pygments_lexer": "ipython3",
1302
+ "version": "3.11.13"
1303
+ },
1304
+ "papermill": {
1305
+ "default_parameters": {},
1306
+ "duration": 2723.9142,
1307
+ "end_time": "2025-08-03T17:39:40.959986",
1308
+ "environment_variables": {},
1309
+ "exception": null,
1310
+ "input_path": "__notebook__.ipynb",
1311
+ "output_path": "__notebook__.ipynb",
1312
+ "parameters": {},
1313
+ "start_time": "2025-08-03T16:54:17.045786",
1314
+ "version": "2.6.0"
1315
+ }
1316
+ },
1317
+ "nbformat": 4,
1318
+ "nbformat_minor": 5
1319
+ }