AmitHirpara committed on
Commit
f53fac9
·
1 Parent(s): 3dea7de

add comments

Browse files
Files changed (6) hide show
  1. app.py +67 -47
  2. data_augmentation.py +64 -95
  3. lstm.py +48 -61
  4. lstm_training.ipynb +65 -26
  5. transformer.py +56 -172
  6. transformer_training.ipynb +76 -29
app.py CHANGED
@@ -10,20 +10,23 @@ from collections import Counter
10
  import warnings
11
  warnings.filterwarnings('ignore')
12
 
13
- # Define the Vocabulary class (needed for unpickling)
14
  class Vocabulary:
15
  """Vocabulary class for encoding/decoding text and labels"""
16
  def __init__(self, max_size=100000):
 
17
  self.word2idx = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}
18
  self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<start>', 3: '<end>'}
19
  self.word_count = Counter()
20
  self.max_size = max_size
21
 
22
  def add_sentence(self, sentence):
 
23
  for word in sentence:
24
  self.word_count[word.lower()] += 1
25
 
26
  def build(self):
 
27
  most_common = self.word_count.most_common(self.max_size - len(self.word2idx))
28
  for word, _ in most_common:
29
  if word not in self.word2idx:
@@ -35,12 +38,14 @@ class Vocabulary:
35
  return len(self.word2idx)
36
 
37
  def encode(self, sentence):
 
38
  return [self.word2idx.get(word.lower(), self.word2idx['<unk>']) for word in sentence]
39
 
40
  def decode(self, indices):
 
41
  return [self.idx2word.get(idx, '<unk>') for idx in indices]
42
 
43
- # Custom Transformer components to match the saved model
44
  class MultiHeadAttention(nn.Module):
45
  def __init__(self, d_model, num_heads, dropout=0.1):
46
  super().__init__()
@@ -49,6 +54,7 @@ class MultiHeadAttention(nn.Module):
49
  self.num_heads = num_heads
50
  self.d_k = d_model // num_heads
51
 
 
52
  self.w_q = nn.Linear(d_model, d_model)
53
  self.w_k = nn.Linear(d_model, d_model)
54
  self.w_v = nn.Linear(d_model, d_model)
@@ -59,24 +65,27 @@ class MultiHeadAttention(nn.Module):
59
  def forward(self, query, key, value, mask=None):
60
  batch_size = query.size(0)
61
 
62
- # Linear transformations and split into heads
63
  Q = self.w_q(query).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
64
  K = self.w_k(key).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
65
  V = self.w_v(value).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
66
 
67
- # Attention
68
  scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
69
 
 
70
  if mask is not None:
71
  mask = mask.unsqueeze(1).unsqueeze(1)
72
  scores = scores.masked_fill(mask, -1e9)
73
 
 
74
  attention = F.softmax(scores, dim=-1)
75
  attention = self.dropout(attention)
76
 
 
77
  context = torch.matmul(attention, V)
78
 
79
- # Concatenate heads
80
  context = context.transpose(1, 2).contiguous().view(
81
  batch_size, -1, self.d_model
82
  )
@@ -84,6 +93,7 @@ class MultiHeadAttention(nn.Module):
84
  output = self.w_o(context)
85
  return output
86
 
 
87
  class FeedForward(nn.Module):
88
  def __init__(self, d_model, d_ff, dropout=0.1):
89
  super().__init__()
@@ -92,8 +102,10 @@ class FeedForward(nn.Module):
92
  self.dropout = nn.Dropout(dropout)
93
 
94
  def forward(self, x):
 
95
  return self.w_2(self.dropout(F.gelu(self.w_1(x))))
96
 
 
97
  class EncoderLayer(nn.Module):
98
  def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
99
  super().__init__()
@@ -104,19 +116,21 @@ class EncoderLayer(nn.Module):
104
  self.dropout = nn.Dropout(dropout)
105
 
106
  def forward(self, x, mask=None):
107
- # Self-attention with residual connection and layer norm
108
  attn_output = self.self_attention(x, x, x, mask)
109
  x = self.norm1(x + self.dropout(attn_output))
110
 
111
- # Feed forward with residual connection and layer norm
112
  ff_output = self.feed_forward(x)
113
  x = self.norm2(x + self.dropout(ff_output))
114
 
115
  return x
116
 
 
117
  class TransformerEncoder(nn.Module):
118
  def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
119
  super().__init__()
 
120
  self.layers = nn.ModuleList([
121
  EncoderLayer(d_model, num_heads, d_ff, dropout)
122
  for _ in range(num_layers)
@@ -124,49 +138,63 @@ class TransformerEncoder(nn.Module):
124
  self.norm = nn.LayerNorm(d_model)
125
 
126
  def forward(self, x, mask=None):
 
127
  for layer in self.layers:
128
  x = layer(x, mask)
129
  return self.norm(x)
130
 
 
131
  class PositionalEncoding(nn.Module):
132
  def __init__(self, d_model, max_len=5000):
133
  super().__init__()
134
  self.d_model = d_model
 
 
135
  pe = torch.zeros(max_len, d_model)
136
  position = torch.arange(0, max_len).unsqueeze(1).float()
137
  div_term = torch.exp(torch.arange(0, d_model, 2).float() *
138
  -(torch.log(torch.tensor(10000.0)) / d_model))
 
 
139
  pe[:, 0::2] = torch.sin(position * div_term)
140
  pe[:, 1::2] = torch.cos(position * div_term)
141
  self.register_buffer('pe', pe.unsqueeze(0))
142
 
143
  def forward(self, x):
 
144
  return x * torch.sqrt(torch.tensor(self.d_model, dtype=x.dtype)) + self.pe[:, :x.size(1)]
145
 
 
146
  class TransformerPIIDetector(nn.Module):
147
  def __init__(self, vocab_size, num_classes, d_model=256, num_heads=8,
148
  d_ff=512, num_layers=4, dropout=0.1, max_len=512):
149
  super().__init__()
150
 
 
151
  self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
152
- self.positional_encoding = PositionalEncoding(d_model, max_len) # Changed name to match saved model
153
  self.dropout = nn.Dropout(dropout)
154
-
155
- # Custom encoder to match saved model structure
156
  self.encoder = TransformerEncoder(num_layers, d_model, num_heads, d_ff, dropout)
157
  self.classifier = nn.Linear(d_model, num_classes)
158
 
159
  def forward(self, x):
 
160
  padding_mask = (x == 0)
 
 
161
  x = self.embedding(x)
162
  x = self.positional_encoding(x)
163
  x = self.dropout(x)
 
 
164
  x = self.encoder(x, padding_mask)
165
  return self.classifier(x)
166
 
167
  def create_transformer_pii_model(**kwargs):
 
168
  return TransformerPIIDetector(**kwargs)
169
 
 
170
  class PIIDetector:
171
  def __init__(self, model_dir='saved_transformer'):
172
  self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -176,13 +204,13 @@ class PIIDetector:
176
  self.label_vocab = None
177
  self.load_model()
178
 
179
- # Single color for all PII highlighting
180
  self.highlight_color = '#FF6B6B'
181
 
182
  def load_model(self):
183
  """Load the trained model and vocabularies"""
184
  try:
185
- # Load vocabularies
186
  vocab_path = os.path.join(self.model_dir, 'vocabularies.pkl')
187
  with open(vocab_path, 'rb') as f:
188
  vocabs = pickle.load(f)
@@ -194,7 +222,7 @@ class PIIDetector:
194
  with open(config_path, 'rb') as f:
195
  model_config = pickle.load(f)
196
 
197
- # Create and load model
198
  self.model = create_transformer_pii_model(**model_config)
199
  model_path = os.path.join(self.model_dir, 'pii_transformer_model.pt')
200
  self.model.load_state_dict(torch.load(model_path, map_location=self.device))
@@ -211,7 +239,7 @@ class PIIDetector:
211
  def tokenize(self, text: str) -> List[str]:
212
  """Simple tokenization by splitting on spaces and punctuation"""
213
  import re
214
- # Split on whitespace and keep punctuation as separate tokens
215
  tokens = re.findall(r'\w+|[^\w\s]', text)
216
  return tokens
217
 
@@ -220,30 +248,30 @@ class PIIDetector:
220
  if not text.strip():
221
  return []
222
 
223
- # Tokenize
224
  tokens = self.tokenize(text)
225
 
226
- # Add start and end tokens
227
  tokens_with_special = ['<start>'] + tokens + ['<end>']
228
 
229
- # Encode tokens
230
  token_ids = self.text_vocab.encode(tokens_with_special)
231
 
232
- # Convert to tensor and add batch dimension
233
  input_tensor = torch.tensor([token_ids]).to(self.device)
234
 
235
- # Predict
236
  with torch.no_grad():
237
  outputs = self.model(input_tensor)
238
  predictions = torch.argmax(outputs, dim=-1)
239
 
240
- # Decode predictions (skip start and end tokens)
241
  predicted_labels = []
242
- for idx in predictions[0][1:-1]: # Skip <start> and <end>
243
  label = self.label_vocab.idx2word.get(idx.item(), 'O')
244
  predicted_labels.append(label.upper())
245
 
246
- # Pair tokens with their labels
247
  return list(zip(tokens, predicted_labels))
248
 
249
  def create_highlighted_html(self, token_label_pairs: List[Tuple[str, str]]) -> str:
@@ -254,14 +282,14 @@ class PIIDetector:
254
  while i < len(token_label_pairs):
255
  token, label = token_label_pairs[i]
256
 
257
- # Check if this is the start of a PII entity
258
  if label != 'O':
259
  # Collect all tokens for this entity
260
  entity_tokens = [token]
261
  entity_label = label
262
  j = i + 1
263
 
264
- # Look for continuation tokens (I- tags)
265
  while j < len(token_label_pairs):
266
  next_token, next_label = token_label_pairs[j]
267
  if next_label.startswith('I-') and next_label.replace('I-', 'B-') == entity_label:
@@ -270,14 +298,14 @@ class PIIDetector:
270
  else:
271
  break
272
 
273
- # Join tokens with appropriate spacing
274
  entity_text = ''
275
  for k, tok in enumerate(entity_tokens):
276
  if k > 0 and tok not in '.,!?;:':
277
  entity_text += ' '
278
  entity_text += tok
279
 
280
- # Add highlighted entity
281
  label_display = entity_label.replace('B-', '').replace('I-', '').replace('_', ' ')
282
  html_parts.append(
283
  f'<mark style="background-color: {self.highlight_color}; padding: 2px 4px; '
@@ -287,7 +315,7 @@ class PIIDetector:
287
 
288
  i = j
289
  else:
290
- # Add space before token if needed
291
  if i > 0 and token not in '.,!?;:' and len(token_label_pairs) > i-1:
292
  prev_token, _ = token_label_pairs[i-1]
293
  if prev_token not in '(':
@@ -306,14 +334,14 @@ class PIIDetector:
306
  total_tokens = len(token_label_pairs)
307
  pii_tokens = 0
308
 
 
309
  for _, label in token_label_pairs:
310
  if label != 'O':
311
  pii_tokens += 1
312
- # Clean up label for display
313
  label_clean = label.replace('B-', '').replace('I-', '').replace('_', ' ')
314
  stats[label_clean] = stats.get(label_clean, 0) + 1
315
 
316
- # Create statistics text
317
  stats_text = f"### Detection Summary\n\n"
318
  stats_text += f"**Total tokens:** {total_tokens}\n\n"
319
  stats_text += f"**PII tokens:** {pii_tokens} ({pii_tokens/total_tokens*100:.1f}%)\n\n"
@@ -323,7 +351,7 @@ class PIIDetector:
323
 
324
  return stats_text
325
 
326
- # Initialize the detector
327
  print("Initializing PII Detector...")
328
  detector = PIIDetector()
329
 
@@ -333,13 +361,13 @@ def detect_pii(text):
333
  return "<p style='color: #6c757d; padding: 20px;'>Please enter some text to analyze.</p>", "No text provided."
334
 
335
  try:
336
- # Get predictions
337
  token_label_pairs = detector.predict(text)
338
 
339
- # Create highlighted HTML
340
  highlighted_html = detector.create_highlighted_html(token_label_pairs)
341
 
342
- # Get statistics
343
  stats = detector.get_statistics(token_label_pairs)
344
 
345
  return highlighted_html, stats
@@ -349,18 +377,7 @@ def detect_pii(text):
349
  error_stats = f"Error occurred: {str(e)}"
350
  return error_html, error_stats
351
 
352
- # Example texts
353
- examples = [
354
- "My name is John Smith and my email is john.smith@email.com. You can reach me at 555-123-4567.",
355
- "Student ID: 12345678. Please send the documents to 123 Main Street, Anytown, USA 12345.",
356
- "Contact Sarah Johnson at sarah_j_2023@gmail.com or visit her profile at linkedin.com/in/sarahjohnson",
357
- "The project was completed by student A1B2C3D4 who lives at 456 Oak Avenue.",
358
- "For verification, my phone number is (555) 987-6543 and my username is cool_user_99.",
359
- "Hi, I'm Emily Chen. My student number is STU-2023-98765 and I live at 789 Pine Street, Apt 4B.",
360
- "You can reach me at my personal website: www.johndoe.com or call me at +1-555-0123.",
361
- ]
362
-
363
- # Create Gradio interface
364
  with gr.Blocks(title="PII Detection System", theme=gr.themes.Soft()) as demo:
365
  gr.Markdown(
366
  """
@@ -371,6 +388,7 @@ with gr.Blocks(title="PII Detection System", theme=gr.themes.Soft()) as demo:
371
  )
372
 
373
  with gr.Column():
 
374
  input_text = gr.Textbox(
375
  label="Input Text",
376
  placeholder="Enter text to analyze for PII...",
@@ -378,10 +396,12 @@ with gr.Blocks(title="PII Detection System", theme=gr.themes.Soft()) as demo:
378
  max_lines=20
379
  )
380
 
 
381
  with gr.Row():
382
  analyze_btn = gr.Button("🔍 Detect PII", variant="primary", scale=2)
383
  clear_btn = gr.Button("🗑️ Clear", scale=1)
384
 
 
385
  highlighted_output = gr.HTML(
386
  label="Highlighted Text",
387
  value="<p style='color: #6c757d; padding: 20px;'>Results will appear here after analysis...</p>"
@@ -392,7 +412,7 @@ with gr.Blocks(title="PII Detection System", theme=gr.themes.Soft()) as demo:
392
  value="*Statistics will appear here...*"
393
  )
394
 
395
- # Set up event handlers
396
  analyze_btn.click(
397
  fn=detect_pii,
398
  inputs=[input_text],
@@ -404,7 +424,7 @@ with gr.Blocks(title="PII Detection System", theme=gr.themes.Soft()) as demo:
404
  outputs=[input_text, highlighted_output, stats_output]
405
  )
406
 
407
- # Launch the app
408
  if __name__ == "__main__":
409
  print("\nLaunching Gradio interface...")
410
  demo.launch()
 
10
  import warnings
11
  warnings.filterwarnings('ignore')
12
 
13
+ # Vocabulary class for handling text encoding and decoding
14
  class Vocabulary:
15
  """Vocabulary class for encoding/decoding text and labels"""
16
  def __init__(self, max_size=100000):
17
+ # Initialize special tokens
18
  self.word2idx = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}
19
  self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<start>', 3: '<end>'}
20
  self.word_count = Counter()
21
  self.max_size = max_size
22
 
23
  def add_sentence(self, sentence):
24
+ # Count word frequencies in the sentence
25
  for word in sentence:
26
  self.word_count[word.lower()] += 1
27
 
28
  def build(self):
29
+ # Build vocabulary from most common words
30
  most_common = self.word_count.most_common(self.max_size - len(self.word2idx))
31
  for word, _ in most_common:
32
  if word not in self.word2idx:
 
38
  return len(self.word2idx)
39
 
40
  def encode(self, sentence):
41
+ # Convert words to indices
42
  return [self.word2idx.get(word.lower(), self.word2idx['<unk>']) for word in sentence]
43
 
44
  def decode(self, indices):
45
+ # Convert indices back to words
46
  return [self.idx2word.get(idx, '<unk>') for idx in indices]
47
 
48
+ # Multi-head attention mechanism for the transformer
49
  class MultiHeadAttention(nn.Module):
50
  def __init__(self, d_model, num_heads, dropout=0.1):
51
  super().__init__()
 
54
  self.num_heads = num_heads
55
  self.d_k = d_model // num_heads
56
 
57
+ # Linear layers for query, key, value, and output
58
  self.w_q = nn.Linear(d_model, d_model)
59
  self.w_k = nn.Linear(d_model, d_model)
60
  self.w_v = nn.Linear(d_model, d_model)
 
65
  def forward(self, query, key, value, mask=None):
66
  batch_size = query.size(0)
67
 
68
+ # Transform and reshape for multi-head attention
69
  Q = self.w_q(query).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
70
  K = self.w_k(key).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
71
  V = self.w_v(value).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
72
 
73
+ # Calculate attention scores
74
  scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
75
 
76
+ # Apply mask if provided
77
  if mask is not None:
78
  mask = mask.unsqueeze(1).unsqueeze(1)
79
  scores = scores.masked_fill(mask, -1e9)
80
 
81
+ # Apply softmax and dropout
82
  attention = F.softmax(scores, dim=-1)
83
  attention = self.dropout(attention)
84
 
85
+ # Apply attention to values
86
  context = torch.matmul(attention, V)
87
 
88
+ # Reshape back to original dimensions
89
  context = context.transpose(1, 2).contiguous().view(
90
  batch_size, -1, self.d_model
91
  )
 
93
  output = self.w_o(context)
94
  return output
95
 
96
+ # Feed-forward network component
97
  class FeedForward(nn.Module):
98
  def __init__(self, d_model, d_ff, dropout=0.1):
99
  super().__init__()
 
102
  self.dropout = nn.Dropout(dropout)
103
 
104
  def forward(self, x):
105
+ # Two linear layers with GELU activation
106
  return self.w_2(self.dropout(F.gelu(self.w_1(x))))
107
 
108
+ # Single encoder layer combining attention and feed-forward
109
  class EncoderLayer(nn.Module):
110
  def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
111
  super().__init__()
 
116
  self.dropout = nn.Dropout(dropout)
117
 
118
  def forward(self, x, mask=None):
119
+ # Apply self-attention with residual connection
120
  attn_output = self.self_attention(x, x, x, mask)
121
  x = self.norm1(x + self.dropout(attn_output))
122
 
123
+ # Apply feed-forward with residual connection
124
  ff_output = self.feed_forward(x)
125
  x = self.norm2(x + self.dropout(ff_output))
126
 
127
  return x
128
 
129
+ # Stack of encoder layers
130
  class TransformerEncoder(nn.Module):
131
  def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
132
  super().__init__()
133
+ # Create multiple encoder layers
134
  self.layers = nn.ModuleList([
135
  EncoderLayer(d_model, num_heads, d_ff, dropout)
136
  for _ in range(num_layers)
 
138
  self.norm = nn.LayerNorm(d_model)
139
 
140
  def forward(self, x, mask=None):
141
+ # Pass through each encoder layer
142
  for layer in self.layers:
143
  x = layer(x, mask)
144
  return self.norm(x)
145
 
146
+ # Positional encoding to add position information to embeddings
147
  class PositionalEncoding(nn.Module):
148
  def __init__(self, d_model, max_len=5000):
149
  super().__init__()
150
  self.d_model = d_model
151
+
152
+ # Create positional encoding matrix
153
  pe = torch.zeros(max_len, d_model)
154
  position = torch.arange(0, max_len).unsqueeze(1).float()
155
  div_term = torch.exp(torch.arange(0, d_model, 2).float() *
156
  -(torch.log(torch.tensor(10000.0)) / d_model))
157
+
158
+ # Apply sine and cosine functions
159
  pe[:, 0::2] = torch.sin(position * div_term)
160
  pe[:, 1::2] = torch.cos(position * div_term)
161
  self.register_buffer('pe', pe.unsqueeze(0))
162
 
163
  def forward(self, x):
164
+ # Scale embeddings and add positional encoding
165
  return x * torch.sqrt(torch.tensor(self.d_model, dtype=x.dtype)) + self.pe[:, :x.size(1)]
166
 
167
+ # Main transformer model for PII detection
168
  class TransformerPIIDetector(nn.Module):
169
  def __init__(self, vocab_size, num_classes, d_model=256, num_heads=8,
170
  d_ff=512, num_layers=4, dropout=0.1, max_len=512):
171
  super().__init__()
172
 
173
+ # Model components
174
  self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
175
+ self.positional_encoding = PositionalEncoding(d_model, max_len)
176
  self.dropout = nn.Dropout(dropout)
 
 
177
  self.encoder = TransformerEncoder(num_layers, d_model, num_heads, d_ff, dropout)
178
  self.classifier = nn.Linear(d_model, num_classes)
179
 
180
  def forward(self, x):
181
+ # Create padding mask
182
  padding_mask = (x == 0)
183
+
184
+ # Pass through embedding and positional encoding
185
  x = self.embedding(x)
186
  x = self.positional_encoding(x)
187
  x = self.dropout(x)
188
+
189
+ # Encode and classify
190
  x = self.encoder(x, padding_mask)
191
  return self.classifier(x)
192
 
193
  def create_transformer_pii_model(**kwargs):
194
+ # Factory function to create the model
195
  return TransformerPIIDetector(**kwargs)
196
 
197
+ # Main PII detection class
198
  class PIIDetector:
199
  def __init__(self, model_dir='saved_transformer'):
200
  self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
204
  self.label_vocab = None
205
  self.load_model()
206
 
207
+ # Color for highlighting PII entities
208
  self.highlight_color = '#FF6B6B'
209
 
210
  def load_model(self):
211
  """Load the trained model and vocabularies"""
212
  try:
213
+ # Load saved vocabularies
214
  vocab_path = os.path.join(self.model_dir, 'vocabularies.pkl')
215
  with open(vocab_path, 'rb') as f:
216
  vocabs = pickle.load(f)
 
222
  with open(config_path, 'rb') as f:
223
  model_config = pickle.load(f)
224
 
225
+ # Initialize and load model weights
226
  self.model = create_transformer_pii_model(**model_config)
227
  model_path = os.path.join(self.model_dir, 'pii_transformer_model.pt')
228
  self.model.load_state_dict(torch.load(model_path, map_location=self.device))
 
239
  def tokenize(self, text: str) -> List[str]:
240
  """Simple tokenization by splitting on spaces and punctuation"""
241
  import re
242
+ # Split text into words and punctuation marks
243
  tokens = re.findall(r'\w+|[^\w\s]', text)
244
  return tokens
245
 
 
248
  if not text.strip():
249
  return []
250
 
251
+ # Tokenize input text
252
  tokens = self.tokenize(text)
253
 
254
+ # Add special tokens
255
  tokens_with_special = ['<start>'] + tokens + ['<end>']
256
 
257
+ # Convert tokens to indices
258
  token_ids = self.text_vocab.encode(tokens_with_special)
259
 
260
+ # Prepare tensor for model
261
  input_tensor = torch.tensor([token_ids]).to(self.device)
262
 
263
+ # Get predictions
264
  with torch.no_grad():
265
  outputs = self.model(input_tensor)
266
  predictions = torch.argmax(outputs, dim=-1)
267
 
268
+ # Convert predictions to labels
269
  predicted_labels = []
270
+ for idx in predictions[0][1:-1]: # Skip special tokens
271
  label = self.label_vocab.idx2word.get(idx.item(), 'O')
272
  predicted_labels.append(label.upper())
273
 
274
+ # Return token-label pairs
275
  return list(zip(tokens, predicted_labels))
276
 
277
  def create_highlighted_html(self, token_label_pairs: List[Tuple[str, str]]) -> str:
 
282
  while i < len(token_label_pairs):
283
  token, label = token_label_pairs[i]
284
 
285
+ # Check if token is part of PII entity
286
  if label != 'O':
287
  # Collect all tokens for this entity
288
  entity_tokens = [token]
289
  entity_label = label
290
  j = i + 1
291
 
292
+ # Find continuation tokens
293
  while j < len(token_label_pairs):
294
  next_token, next_label = token_label_pairs[j]
295
  if next_label.startswith('I-') and next_label.replace('I-', 'B-') == entity_label:
 
298
  else:
299
  break
300
 
301
+ # Join entity tokens with proper spacing
302
  entity_text = ''
303
  for k, tok in enumerate(entity_tokens):
304
  if k > 0 and tok not in '.,!?;:':
305
  entity_text += ' '
306
  entity_text += tok
307
 
308
+ # Create highlighted HTML for entity
309
  label_display = entity_label.replace('B-', '').replace('I-', '').replace('_', ' ')
310
  html_parts.append(
311
  f'<mark style="background-color: {self.highlight_color}; padding: 2px 4px; '
 
315
 
316
  i = j
317
  else:
318
+ # Add non-PII token with proper spacing
319
  if i > 0 and token not in '.,!?;:' and len(token_label_pairs) > i-1:
320
  prev_token, _ = token_label_pairs[i-1]
321
  if prev_token not in '(':
 
334
  total_tokens = len(token_label_pairs)
335
  pii_tokens = 0
336
 
337
+ # Count PII tokens by type
338
  for _, label in token_label_pairs:
339
  if label != 'O':
340
  pii_tokens += 1
 
341
  label_clean = label.replace('B-', '').replace('I-', '').replace('_', ' ')
342
  stats[label_clean] = stats.get(label_clean, 0) + 1
343
 
344
+ # Format statistics text
345
  stats_text = f"### Detection Summary\n\n"
346
  stats_text += f"**Total tokens:** {total_tokens}\n\n"
347
  stats_text += f"**PII tokens:** {pii_tokens} ({pii_tokens/total_tokens*100:.1f}%)\n\n"
 
351
 
352
  return stats_text
353
 
354
+ # Initialize the detector when the script runs
355
  print("Initializing PII Detector...")
356
  detector = PIIDetector()
357
 
 
361
  return "<p style='color: #6c757d; padding: 20px;'>Please enter some text to analyze.</p>", "No text provided."
362
 
363
  try:
364
+ # Run PII detection
365
  token_label_pairs = detector.predict(text)
366
 
367
+ # Generate highlighted output
368
  highlighted_html = detector.create_highlighted_html(token_label_pairs)
369
 
370
+ # Generate statistics
371
  stats = detector.get_statistics(token_label_pairs)
372
 
373
  return highlighted_html, stats
 
377
  error_stats = f"Error occurred: {str(e)}"
378
  return error_html, error_stats
379
 
380
+ # Create the Gradio interface
 
 
 
 
 
 
 
 
 
 
 
381
  with gr.Blocks(title="PII Detection System", theme=gr.themes.Soft()) as demo:
382
  gr.Markdown(
383
  """
 
388
  )
389
 
390
  with gr.Column():
391
+ # Input text area
392
  input_text = gr.Textbox(
393
  label="Input Text",
394
  placeholder="Enter text to analyze for PII...",
 
396
  max_lines=20
397
  )
398
 
399
+ # Control buttons
400
  with gr.Row():
401
  analyze_btn = gr.Button("🔍 Detect PII", variant="primary", scale=2)
402
  clear_btn = gr.Button("🗑️ Clear", scale=1)
403
 
404
+ # Output areas
405
  highlighted_output = gr.HTML(
406
  label="Highlighted Text",
407
  value="<p style='color: #6c757d; padding: 20px;'>Results will appear here after analysis...</p>"
 
412
  value="*Statistics will appear here...*"
413
  )
414
 
415
+ # Connect buttons to functions
416
  analyze_btn.click(
417
  fn=detect_pii,
418
  inputs=[input_text],
 
424
  outputs=[input_text, highlighted_output, stats_output]
425
  )
426
 
427
+ # Launch the application
428
  if __name__ == "__main__":
429
  print("\nLaunching Gradio interface...")
430
  demo.launch()
data_augmentation.py CHANGED
@@ -15,17 +15,20 @@ class PIIDataAugmenter:
15
 
16
  def __init__(self, seed=42):
17
  """Initialize the augmenter with random seeds for reproducibility."""
 
18
  random.seed(seed)
19
  np.random.seed(seed)
20
  self.fake = Faker()
21
  Faker.seed(seed)
22
 
 
23
  self._init_templates()
24
  self._init_context_phrases()
25
  self._init_generators()
26
 
27
  def _init_templates(self):
28
  """Initialize templates for different PII types."""
 
29
  self.templates = {
30
  'NAME_STUDENT': [
31
  "My name is {name}",
@@ -115,6 +118,7 @@ class PIIDataAugmenter:
115
 
116
  def _init_context_phrases(self):
117
  """Initialize context phrases for more natural text generation."""
 
118
  self.context_prefix = [
119
  "Hello everyone,",
120
  "Dear Sir/Madam,",
@@ -128,6 +132,7 @@ class PIIDataAugmenter:
128
  "I am writing to tell you that"
129
  ]
130
 
 
131
  self.context_suffix = [
132
  "Thank you.",
133
  "Best regards.",
@@ -141,12 +146,14 @@ class PIIDataAugmenter:
141
  "Let me know if you have questions."
142
  ]
143
 
 
144
  self.connectors = [
145
  " and ", " or ", ", ", ". Also, ", ". Additionally, "
146
  ]
147
 
148
  def _init_generators(self):
149
  """Initialize PII generators mapping."""
 
150
  self.generators = {
151
  'NAME_STUDENT': self.generate_name,
152
  'EMAIL': self.generate_email,
@@ -157,6 +164,7 @@ class PIIDataAugmenter:
157
  'USERNAME': self.generate_username
158
  }
159
 
 
160
  self.format_keys = {
161
  'NAME_STUDENT': 'name',
162
  'EMAIL': 'email',
@@ -167,8 +175,6 @@ class PIIDataAugmenter:
167
  'USERNAME': 'username'
168
  }
169
 
170
- # ========== PII Generators ==========
171
-
172
  def generate_name(self):
173
  """Generate realistic person names."""
174
  return self.fake.name()
@@ -179,6 +185,7 @@ class PIIDataAugmenter:
179
 
180
  def generate_phone(self):
181
  """Generate realistic phone numbers in various formats."""
 
182
  formats = [
183
  "555-{:03d}-{:04d}",
184
  "(555) {:03d}-{:04d}",
@@ -186,6 +193,7 @@ class PIIDataAugmenter:
186
  "+1-555-{:03d}-{:04d}",
187
  "555{:03d}{:04d}"
188
  ]
 
189
  format_choice = random.choice(formats)
190
  area = random.randint(100, 999)
191
  number = random.randint(1000, 9999)
@@ -193,10 +201,12 @@ class PIIDataAugmenter:
193
 
194
  def generate_address(self):
195
  """Generate realistic street addresses."""
 
196
  return self.fake.address().replace('\n', ', ')
197
 
198
  def generate_id_num(self):
199
  """Generate various ID number formats."""
 
200
  formats = [
201
  "{:06d}", # 6-digit ID
202
  "{:08d}", # 8-digit ID
@@ -207,6 +217,7 @@ class PIIDataAugmenter:
207
  ]
208
  format_choice = random.choice(formats)
209
 
 
210
  if '-' in format_choice:
211
  return format_choice.format(
212
  random.randint(1000, 9999),
@@ -217,6 +228,7 @@ class PIIDataAugmenter:
217
 
218
  def generate_url(self):
219
  """Generate personal website URLs."""
 
220
  domains = ['github.com', 'linkedin.com', 'portfolio.com',
221
  'personal.com', 'website.com']
222
  username = self.fake.user_name()
@@ -227,72 +239,53 @@ class PIIDataAugmenter:
227
  """Generate usernames."""
228
  return self.fake.user_name()
229
 
230
- # ========== Synthetic Example Creation ==========
231
-
232
  def create_synthetic_example(self, pii_type, add_context=True):
233
- """
234
- Create a synthetic example with proper BIO labeling.
235
-
236
- Args:
237
- pii_type: Type of PII to generate
238
- add_context: Whether to add context phrases
239
-
240
- Returns:
241
- Tuple of (tokens, labels)
242
- """
243
- # Generate PII value
244
  pii_value = self.generators[pii_type]()
245
 
246
- # Select and fill template
247
  template = random.choice(self.templates[pii_type])
248
  format_key = self.format_keys[pii_type]
249
  sentence = template.format(**{format_key: pii_value})
250
 
251
- # Add context if requested
252
  if add_context and random.random() > 0.3:
253
  sentence = self._add_context(sentence)
254
 
255
- # Tokenize and label
256
  tokens, labels = self._tokenize_and_label(sentence, pii_value, pii_type)
257
 
258
  return tokens, labels
259
 
260
  def create_mixed_example(self, pii_types, num_pii=2):
261
- """
262
- Create examples with multiple PII types.
263
-
264
- Args:
265
- pii_types: List of PII types to include
266
- num_pii: Number of PII entities to include
267
-
268
- Returns:
269
- Tuple of (tokens, labels)
270
- """
271
  selected_types = random.sample(pii_types, min(num_pii, len(pii_types)))
272
 
273
  all_tokens = []
274
  all_labels = []
275
 
276
- # Add context prefix
277
  if random.random() > 0.3:
278
  prefix = random.choice(self.context_prefix)
279
  all_tokens.extend(prefix.split())
280
  all_labels.extend(['O'] * len(prefix.split()))
281
 
282
- # Add each PII with connectors
283
  for i, pii_type in enumerate(selected_types):
284
- # Add connector between PIIs
285
  if i > 0 and random.random() > 0.5:
286
  connector = random.choice(self.connectors)
287
  all_tokens.extend(connector.strip().split())
288
  all_labels.extend(['O'] * len(connector.strip().split()))
289
 
290
- # Create PII example without additional context
291
  tokens, labels = self.create_synthetic_example(pii_type, add_context=False)
292
  all_tokens.extend(tokens)
293
  all_labels.extend(labels)
294
 
295
- # Add context suffix
296
  if random.random() > 0.3:
297
  suffix = random.choice(self.context_suffix)
298
  all_tokens.extend(suffix.split())
@@ -302,79 +295,60 @@ class PIIDataAugmenter:
302
 
303
  def _add_context(self, sentence):
304
  """Add context phrases to make text more natural."""
 
305
  if random.random() > 0.5:
306
  sentence = random.choice(self.context_prefix) + " " + sentence
 
307
  if random.random() > 0.5:
308
  sentence = sentence + " " + random.choice(self.context_suffix)
309
  return sentence
310
 
311
  def _tokenize_and_label(self, sentence, pii_value, pii_type):
312
- """
313
- Tokenize sentence and apply BIO labels for PII.
314
-
315
- Args:
316
- sentence: The sentence containing PII
317
- pii_value: The PII value to find and label
318
- pii_type: The type of PII for labeling
319
-
320
- Returns:
321
- Tuple of (tokens, labels)
322
- """
323
  tokens = sentence.split()
324
  labels = ['O'] * len(tokens)
325
 
326
- # Tokenize PII value
327
  pii_tokens = pii_value.split()
328
 
329
- # Find and label PII in the sentence
330
  for i in range(len(tokens) - len(pii_tokens) + 1):
331
- # Check if tokens match PII value
332
  if (tokens[i:i+len(pii_tokens)] == pii_tokens or
333
  ' '.join(tokens[i:i+len(pii_tokens)]).lower() == pii_value.lower()):
334
 
335
- # Apply BIO labels
336
- labels[i] = f'B-{pii_type}'
337
  for j in range(1, len(pii_tokens)):
338
- labels[i+j] = f'I-{pii_type}'
339
  break
340
 
341
  return tokens, labels
342
 
343
- # ========== Dataset Augmentation ==========
344
-
345
  def augment_dataset(self, original_data, target_samples_per_class=1000, mix_ratio=0.3):
346
- """
347
- Augment dataset with synthetic examples to balance PII classes.
348
-
349
- Args:
350
- original_data: Original dataset DataFrame
351
- target_samples_per_class: Target number of samples per PII class
352
- mix_ratio: Ratio of mixed (multi-PII) examples
353
-
354
- Returns:
355
- Augmented dataset DataFrame
356
- """
357
- # Analyze original distribution
358
  label_counts = self._analyze_label_distribution(original_data)
359
  print("\nOriginal label distribution:")
360
  self._print_distribution(label_counts)
361
 
362
- # Generate synthetic examples
363
  synthetic_tokens, synthetic_labels = self._generate_synthetic_data(
364
  label_counts, target_samples_per_class, mix_ratio
365
  )
366
 
367
- # Add non-PII examples
368
  synthetic_tokens, synthetic_labels = self._add_non_pii_examples(
369
  synthetic_tokens, synthetic_labels
370
  )
371
 
372
- # Combine and shuffle data
373
  augmented_df = self._combine_and_shuffle(
374
  original_data, synthetic_tokens, synthetic_labels
375
  )
376
 
377
- # Analyze new distribution
378
  new_label_counts = self._analyze_label_distribution(augmented_df)
379
  print("\nAugmented label distribution:")
380
  self._print_distribution(new_label_counts)
@@ -385,10 +359,11 @@ class PIIDataAugmenter:
385
  """Analyze the distribution of PII labels in the dataset."""
386
  label_counts = Counter()
387
 
 
388
  for labels in data['labels']:
389
  for label in labels:
390
  if label != 'O':
391
- # Extract base label (remove B- or I- prefix)
392
  base_label = label.split('-')[1] if '-' in label else label
393
  label_counts[base_label] += 1
394
 
@@ -397,6 +372,7 @@ class PIIDataAugmenter:
397
  def _print_distribution(self, label_counts):
398
  """Print label distribution statistics."""
399
  total = sum(label_counts.values())
 
400
  for label, count in label_counts.most_common():
401
  percentage = (count / total * 100) if total > 0 else 0
402
  print(f" {label:15} : {count:6,} ({percentage:5.2f}%)")
@@ -406,6 +382,7 @@ class PIIDataAugmenter:
406
  synthetic_tokens = []
407
  synthetic_labels = []
408
 
 
409
  for pii_type in self.templates.keys():
410
  current_count = label_counts.get(pii_type, 0)
411
  needed = max(0, target_samples - current_count)
@@ -415,17 +392,17 @@ class PIIDataAugmenter:
415
 
416
  print(f"\nGenerating {needed} synthetic examples for {pii_type}")
417
 
418
- # Single PII examples
419
  single_count = int(needed * (1 - mix_ratio))
420
  for _ in range(single_count):
421
  tokens, labels = self.create_synthetic_example(pii_type)
422
  synthetic_tokens.append(tokens)
423
  synthetic_labels.append(labels)
424
 
425
- # Mixed PII examples
426
  mixed_count = int(needed * mix_ratio)
427
  for _ in range(mixed_count):
428
- # Ensure current PII type is included
429
  other_types = [t for t in self.templates.keys() if t != pii_type]
430
  selected_types = [pii_type] + random.sample(
431
  other_types, min(1, len(other_types))
@@ -439,6 +416,7 @@ class PIIDataAugmenter:
439
 
440
  def _add_non_pii_examples(self, synthetic_tokens, synthetic_labels):
441
  """Add examples without PII (all 'O' labels) for balance."""
 
442
  num_non_pii = int(len(synthetic_tokens) * 0.1)
443
 
444
  for _ in range(num_non_pii):
@@ -454,17 +432,17 @@ class PIIDataAugmenter:
454
 
455
  def _combine_and_shuffle(self, original_data, synthetic_tokens, synthetic_labels):
456
  """Combine original and synthetic data, then shuffle."""
457
- # Combine data
458
  all_tokens = original_data['tokens'].tolist() + synthetic_tokens
459
  all_labels = original_data['labels'].tolist() + synthetic_labels
460
 
461
- # Create DataFrame
462
  augmented_data = pd.DataFrame({
463
  'tokens': all_tokens,
464
  'labels': all_labels
465
  })
466
 
467
- # Shuffle
468
  augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
469
 
470
  print(f"\nTotal augmented samples: {len(augmented_data):,}")
@@ -472,17 +450,8 @@ class PIIDataAugmenter:
472
  return augmented_data
473
 
474
  def calculate_class_weights(data, label_vocab):
475
- """
476
- Calculate class weights for balanced loss function.
477
-
478
- Args:
479
- data: Dataset DataFrame with 'labels' column
480
- label_vocab: Vocabulary object with word2idx mapping
481
-
482
- Returns:
483
- Tensor of class weights
484
- """
485
- # Count label occurrences
486
  label_counts = Counter()
487
 
488
  for labels in data['labels']:
@@ -490,7 +459,7 @@ def calculate_class_weights(data, label_vocab):
490
  label_id = label_vocab.word2idx.get(label.lower(), 0)
491
  label_counts[label_id] += 1
492
 
493
- # Calculate inverse frequency weights
494
  total_samples = sum(label_counts.values())
495
  num_classes = len(label_vocab)
496
 
@@ -498,31 +467,31 @@ def calculate_class_weights(data, label_vocab):
498
 
499
  for class_id, count in label_counts.items():
500
  if count > 0:
501
- # Inverse frequency with smoothing
502
  weights[class_id] = total_samples / (num_classes * count)
503
 
504
- # Normalize weights
505
  weights = weights / weights.sum() * num_classes
506
 
507
- # Cap extreme weights to prevent instability
508
  weights = torch.clamp(weights, min=0.1, max=10.0)
509
 
510
- # Set padding weight to 0
511
  weights[0] = 0.0
512
 
513
  return weights
514
 
515
  if __name__ == '__main__':
516
  """Example usage of the augmentation module."""
517
- # Load original data
518
  print("Loading original training data...")
519
  original_data = pd.read_json('train.json')
520
  print(f"Original dataset size: {len(original_data):,}")
521
 
522
- # Initialize augmenter
523
  augmenter = PIIDataAugmenter(seed=42)
524
 
525
- # Augment dataset
526
  print("\n" + "="*60)
527
  print("Starting data augmentation...")
528
  print("="*60)
@@ -533,7 +502,7 @@ if __name__ == '__main__':
533
  mix_ratio=0.3
534
  )
535
 
536
- # Save augmented data
537
  output_path = './train_augmented.json'
538
  augmented_data.to_json(output_path, orient='records', lines=True)
539
  print(f"\nSaved augmented data to {output_path}")
 
15
 
16
  def __init__(self, seed=42):
17
  """Initialize the augmenter with random seeds for reproducibility."""
18
+ # Set random seeds for consistent results
19
  random.seed(seed)
20
  np.random.seed(seed)
21
  self.fake = Faker()
22
  Faker.seed(seed)
23
 
24
+ # Initialize data structures
25
  self._init_templates()
26
  self._init_context_phrases()
27
  self._init_generators()
28
 
29
  def _init_templates(self):
30
  """Initialize templates for different PII types."""
31
+ # Templates for generating sentences with PII
32
  self.templates = {
33
  'NAME_STUDENT': [
34
  "My name is {name}",
 
118
 
119
  def _init_context_phrases(self):
120
  """Initialize context phrases for more natural text generation."""
121
+ # Opening phrases for generated text
122
  self.context_prefix = [
123
  "Hello everyone,",
124
  "Dear Sir/Madam,",
 
132
  "I am writing to tell you that"
133
  ]
134
 
135
+ # Closing phrases for generated text
136
  self.context_suffix = [
137
  "Thank you.",
138
  "Best regards.",
 
146
  "Let me know if you have questions."
147
  ]
148
 
149
+ # Words to connect multiple PII elements
150
  self.connectors = [
151
  " and ", " or ", ", ", ". Also, ", ". Additionally, "
152
  ]
153
 
154
  def _init_generators(self):
155
  """Initialize PII generators mapping."""
156
+ # Map PII types to their generator functions
157
  self.generators = {
158
  'NAME_STUDENT': self.generate_name,
159
  'EMAIL': self.generate_email,
 
164
  'USERNAME': self.generate_username
165
  }
166
 
167
+ # Map PII types to template placeholder keys
168
  self.format_keys = {
169
  'NAME_STUDENT': 'name',
170
  'EMAIL': 'email',
 
175
  'USERNAME': 'username'
176
  }
177
 
 
 
178
  def generate_name(self):
179
  """Generate realistic person names."""
180
  return self.fake.name()
 
185
 
186
  def generate_phone(self):
187
  """Generate realistic phone numbers in various formats."""
188
+ # Different phone number formats
189
  formats = [
190
  "555-{:03d}-{:04d}",
191
  "(555) {:03d}-{:04d}",
 
193
  "+1-555-{:03d}-{:04d}",
194
  "555{:03d}{:04d}"
195
  ]
196
+ # Pick a random format and fill with random numbers
197
  format_choice = random.choice(formats)
198
  area = random.randint(100, 999)
199
  number = random.randint(1000, 9999)
 
201
 
202
  def generate_address(self):
203
  """Generate realistic street addresses."""
204
+ # Get address and replace newlines with commas
205
  return self.fake.address().replace('\n', ', ')
206
 
207
  def generate_id_num(self):
208
  """Generate various ID number formats."""
209
+ # Different ID number patterns
210
  formats = [
211
  "{:06d}", # 6-digit ID
212
  "{:08d}", # 8-digit ID
 
217
  ]
218
  format_choice = random.choice(formats)
219
 
220
+ # Handle hyphenated format differently
221
  if '-' in format_choice:
222
  return format_choice.format(
223
  random.randint(1000, 9999),
 
228
 
229
  def generate_url(self):
230
  """Generate personal website URLs."""
231
+ # Common personal website domains
232
  domains = ['github.com', 'linkedin.com', 'portfolio.com',
233
  'personal.com', 'website.com']
234
  username = self.fake.user_name()
 
239
  """Generate usernames."""
240
  return self.fake.user_name()
241
 
 
 
242
  def create_synthetic_example(self, pii_type, add_context=True):
243
+ """Create a synthetic example with proper BIO labeling."""
244
+ # Generate the PII value
 
 
 
 
 
 
 
 
 
245
  pii_value = self.generators[pii_type]()
246
 
247
+ # Choose a template and insert the PII
248
  template = random.choice(self.templates[pii_type])
249
  format_key = self.format_keys[pii_type]
250
  sentence = template.format(**{format_key: pii_value})
251
 
252
+ # Optionally add context for more natural text
253
  if add_context and random.random() > 0.3:
254
  sentence = self._add_context(sentence)
255
 
256
+ # Create tokens and labels
257
  tokens, labels = self._tokenize_and_label(sentence, pii_value, pii_type)
258
 
259
  return tokens, labels
260
 
261
  def create_mixed_example(self, pii_types, num_pii=2):
262
+ """Create examples with multiple PII types."""
263
+ # Select which PII types to include
 
 
 
 
 
 
 
 
264
  selected_types = random.sample(pii_types, min(num_pii, len(pii_types)))
265
 
266
  all_tokens = []
267
  all_labels = []
268
 
269
+ # Add opening context
270
  if random.random() > 0.3:
271
  prefix = random.choice(self.context_prefix)
272
  all_tokens.extend(prefix.split())
273
  all_labels.extend(['O'] * len(prefix.split()))
274
 
275
+ # Add each PII entity
276
  for i, pii_type in enumerate(selected_types):
277
+ # Add connector between PII entities
278
  if i > 0 and random.random() > 0.5:
279
  connector = random.choice(self.connectors)
280
  all_tokens.extend(connector.strip().split())
281
  all_labels.extend(['O'] * len(connector.strip().split()))
282
 
283
+ # Generate PII example
284
  tokens, labels = self.create_synthetic_example(pii_type, add_context=False)
285
  all_tokens.extend(tokens)
286
  all_labels.extend(labels)
287
 
288
+ # Add closing context
289
  if random.random() > 0.3:
290
  suffix = random.choice(self.context_suffix)
291
  all_tokens.extend(suffix.split())
 
295
 
296
  def _add_context(self, sentence):
297
  """Add context phrases to make text more natural."""
298
+ # Randomly add prefix
299
  if random.random() > 0.5:
300
  sentence = random.choice(self.context_prefix) + " " + sentence
301
+ # Randomly add suffix
302
  if random.random() > 0.5:
303
  sentence = sentence + " " + random.choice(self.context_suffix)
304
  return sentence
305
 
306
  def _tokenize_and_label(self, sentence, pii_value, pii_type):
307
+ """Tokenize sentence and apply BIO labels for PII."""
308
+ # Split sentence into tokens
 
 
 
 
 
 
 
 
 
309
  tokens = sentence.split()
310
  labels = ['O'] * len(tokens)
311
 
312
+ # Split PII value into tokens
313
  pii_tokens = pii_value.split()
314
 
315
+ # Find where PII appears in the sentence
316
  for i in range(len(tokens) - len(pii_tokens) + 1):
317
+ # Check if tokens match the PII value
318
  if (tokens[i:i+len(pii_tokens)] == pii_tokens or
319
  ' '.join(tokens[i:i+len(pii_tokens)]).lower() == pii_value.lower()):
320
 
321
+ # Apply BIO tagging
322
+ labels[i] = f'B-{pii_type}' # Beginning
323
  for j in range(1, len(pii_tokens)):
324
+ labels[i+j] = f'I-{pii_type}' # Inside
325
  break
326
 
327
  return tokens, labels
328
 
 
 
329
  def augment_dataset(self, original_data, target_samples_per_class=1000, mix_ratio=0.3):
330
+ """Augment dataset with synthetic examples to balance PII classes."""
331
+ # Check current distribution
 
 
 
 
 
 
 
 
 
 
332
  label_counts = self._analyze_label_distribution(original_data)
333
  print("\nOriginal label distribution:")
334
  self._print_distribution(label_counts)
335
 
336
+ # Generate synthetic data
337
  synthetic_tokens, synthetic_labels = self._generate_synthetic_data(
338
  label_counts, target_samples_per_class, mix_ratio
339
  )
340
 
341
+ # Add some non-PII examples for balance
342
  synthetic_tokens, synthetic_labels = self._add_non_pii_examples(
343
  synthetic_tokens, synthetic_labels
344
  )
345
 
346
+ # Combine original and synthetic data
347
  augmented_df = self._combine_and_shuffle(
348
  original_data, synthetic_tokens, synthetic_labels
349
  )
350
 
351
+ # Check new distribution
352
  new_label_counts = self._analyze_label_distribution(augmented_df)
353
  print("\nAugmented label distribution:")
354
  self._print_distribution(new_label_counts)
 
359
  """Analyze the distribution of PII labels in the dataset."""
360
  label_counts = Counter()
361
 
362
+ # Count each PII type
363
  for labels in data['labels']:
364
  for label in labels:
365
  if label != 'O':
366
+ # Remove B- or I- prefix to get base label
367
  base_label = label.split('-')[1] if '-' in label else label
368
  label_counts[base_label] += 1
369
 
 
372
  def _print_distribution(self, label_counts):
373
  """Print label distribution statistics."""
374
  total = sum(label_counts.values())
375
+ # Print each label count and percentage
376
  for label, count in label_counts.most_common():
377
  percentage = (count / total * 100) if total > 0 else 0
378
  print(f" {label:15} : {count:6,} ({percentage:5.2f}%)")
 
382
  synthetic_tokens = []
383
  synthetic_labels = []
384
 
385
+ # Generate examples for each PII type
386
  for pii_type in self.templates.keys():
387
  current_count = label_counts.get(pii_type, 0)
388
  needed = max(0, target_samples - current_count)
 
392
 
393
  print(f"\nGenerating {needed} synthetic examples for {pii_type}")
394
 
395
+ # Generate single PII examples
396
  single_count = int(needed * (1 - mix_ratio))
397
  for _ in range(single_count):
398
  tokens, labels = self.create_synthetic_example(pii_type)
399
  synthetic_tokens.append(tokens)
400
  synthetic_labels.append(labels)
401
 
402
+ # Generate mixed PII examples
403
  mixed_count = int(needed * mix_ratio)
404
  for _ in range(mixed_count):
405
+ # Make sure current PII type is included
406
  other_types = [t for t in self.templates.keys() if t != pii_type]
407
  selected_types = [pii_type] + random.sample(
408
  other_types, min(1, len(other_types))
 
416
 
417
  def _add_non_pii_examples(self, synthetic_tokens, synthetic_labels):
418
  """Add examples without PII (all 'O' labels) for balance."""
419
+ # Add 10% non-PII examples
420
  num_non_pii = int(len(synthetic_tokens) * 0.1)
421
 
422
  for _ in range(num_non_pii):
 
432
 
433
  def _combine_and_shuffle(self, original_data, synthetic_tokens, synthetic_labels):
434
  """Combine original and synthetic data, then shuffle."""
435
+ # Merge all data
436
  all_tokens = original_data['tokens'].tolist() + synthetic_tokens
437
  all_labels = original_data['labels'].tolist() + synthetic_labels
438
 
439
+ # Create new dataframe
440
  augmented_data = pd.DataFrame({
441
  'tokens': all_tokens,
442
  'labels': all_labels
443
  })
444
 
445
+ # Shuffle the data
446
  augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
447
 
448
  print(f"\nTotal augmented samples: {len(augmented_data):,}")
 
450
  return augmented_data
451
 
452
  def calculate_class_weights(data, label_vocab):
453
+ """Calculate class weights for balanced loss function."""
454
+ # Count occurrences of each label
 
 
 
 
 
 
 
 
 
455
  label_counts = Counter()
456
 
457
  for labels in data['labels']:
 
459
  label_id = label_vocab.word2idx.get(label.lower(), 0)
460
  label_counts[label_id] += 1
461
 
462
+ # Calculate weights based on inverse frequency
463
  total_samples = sum(label_counts.values())
464
  num_classes = len(label_vocab)
465
 
 
467
 
468
  for class_id, count in label_counts.items():
469
  if count > 0:
470
+ # Inverse frequency weighting
471
  weights[class_id] = total_samples / (num_classes * count)
472
 
473
+ # Normalize the weights
474
  weights = weights / weights.sum() * num_classes
475
 
476
+ # Prevent extreme weights
477
  weights = torch.clamp(weights, min=0.1, max=10.0)
478
 
479
+ # Don't weight padding tokens
480
  weights[0] = 0.0
481
 
482
  return weights
483
 
484
  if __name__ == '__main__':
485
  """Example usage of the augmentation module."""
486
+ # Load original training data
487
  print("Loading original training data...")
488
  original_data = pd.read_json('train.json')
489
  print(f"Original dataset size: {len(original_data):,}")
490
 
491
+ # Create augmenter instance
492
  augmenter = PIIDataAugmenter(seed=42)
493
 
494
+ # Run augmentation
495
  print("\n" + "="*60)
496
  print("Starting data augmentation...")
497
  print("="*60)
 
502
  mix_ratio=0.3
503
  )
504
 
505
+ # Save the augmented dataset
506
  output_path = './train_augmented.json'
507
  augmented_data.to_json(output_path, orient='records', lines=True)
508
  print(f"\nSaved augmented data to {output_path}")
lstm.py CHANGED
@@ -12,28 +12,28 @@ class LSTMCell(nn.Module):
12
  self.input_size = input_size
13
  self.hidden_size = hidden_size
14
 
15
- # Initialize weight matrices and bias vectors for LSTM gates
16
- # Input gate
17
  self.W_ii = nn.Parameter(torch.Tensor(input_size, hidden_size))
18
  self.W_hi = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
19
  self.b_i = nn.Parameter(torch.Tensor(hidden_size))
20
 
21
- # Forget gate
22
  self.W_if = nn.Parameter(torch.Tensor(input_size, hidden_size))
23
  self.W_hf = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
24
  self.b_f = nn.Parameter(torch.Tensor(hidden_size))
25
 
26
- # Input node (candidate)
27
  self.W_in = nn.Parameter(torch.Tensor(input_size, hidden_size))
28
  self.W_hn = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
29
  self.b_n = nn.Parameter(torch.Tensor(hidden_size))
30
 
31
- # Output gate
32
  self.W_io = nn.Parameter(torch.Tensor(input_size, hidden_size))
33
  self.W_ho = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
34
  self.b_o = nn.Parameter(torch.Tensor(hidden_size))
35
 
36
- # Initialize all weights with xavier_uniform and biases with zeros
37
  for name, param in self.named_parameters():
38
  if 'W_' in name:
39
  nn.init.xavier_uniform_(param)
@@ -41,35 +41,26 @@ class LSTMCell(nn.Module):
41
  nn.init.zeros_(param)
42
 
43
  def forward(self, input: torch.Tensor, states: tuple[torch.Tensor, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
44
- """
45
- Forward pass for one time step
46
- Args:
47
- input: input at current time step [batch_size, input_size]
48
- states: tuple of (hidden_state, cell_state) from previous time step
49
- both with shape [batch_size, hidden_size]
50
- Returns:
51
- new_hidden: updated hidden state [batch_size, hidden_size]
52
- new_cell: updated cell state [batch_size, hidden_size]
53
- """
54
  hidden, cell = states
55
 
56
- # Implement LSTM cell forward pass
57
- # Forget gate: f_t = sigmoid(W_if @ x_t + W_hf @ h_{t-1} + b_f)
58
  forget_gate = torch.sigmoid(torch.mm(input, self.W_if) + torch.mm(hidden, self.W_hf) + self.b_f)
59
 
60
- # Input gate: i_t = sigmoid(W_ii @ x_t + W_hi @ h_{t-1} + b_i)
61
  input_gate = torch.sigmoid(torch.mm(input, self.W_ii) + torch.mm(hidden, self.W_hi) + self.b_i)
62
 
63
- # Input node values: n_t = tanh(W_in @ x_t + W_hn @ h_{t-1} + b_n)
64
  candidate = torch.tanh(torch.mm(input, self.W_in) + torch.mm(hidden, self.W_hn) + self.b_n)
65
 
66
- # Output gate: o_t = sigmoid(W_io @ x_t + W_ho @ h_{t-1} + b_o)
67
  output_gate = torch.sigmoid(torch.mm(input, self.W_io) + torch.mm(hidden, self.W_ho) + self.b_o)
68
 
69
- # Update cell state: c_t = f_t * c_{t-1} + i_t * n_t
70
  new_cell = forget_gate * cell + input_gate * candidate
71
 
72
- # Update hidden state: h_t = o_t * tanh(c_t)
73
  new_hidden = output_gate * torch.tanh(new_cell)
74
 
75
  return new_hidden, new_cell
@@ -87,27 +78,31 @@ class BidirectionalLSTM(nn.Module):
87
  self.batch_first = batch_first
88
  self.dropout = dropout if num_layers > 1 else 0.0
89
 
90
- # Create forward and backward cells for each layer
91
  self.forward_cells = nn.ModuleList()
92
  self.backward_cells = nn.ModuleList()
93
  self.dropout_layers = nn.ModuleList() if self.dropout > 0 else None
94
 
95
  for layer in range(num_layers):
96
- # Input size is input_size for first layer, hidden_size * 2 for others (bidirectional)
97
  layer_input_size = input_size if layer == 0 else hidden_size * 2
98
 
 
99
  self.forward_cells.append(LSTMCell(layer_input_size, hidden_size))
100
  self.backward_cells.append(LSTMCell(layer_input_size, hidden_size))
101
 
 
102
  if self.dropout > 0 and layer < num_layers - 1:
103
  self.dropout_layers.append(nn.Dropout(dropout))
104
 
105
  def forward(self, input, states=None, lengths=None):
106
- # Handle PackedSequence input
107
  is_packed = isinstance(input, PackedSequence)
108
  if is_packed:
 
109
  padded, lengths = pad_packed_sequence(input, batch_first=self.batch_first)
110
  outputs, (h_n, c_n) = self._forward_unpacked(padded, states, lengths)
 
111
  packed_out = pack_padded_sequence(
112
  outputs, lengths,
113
  batch_first=self.batch_first,
@@ -118,13 +113,15 @@ class BidirectionalLSTM(nn.Module):
118
  return self._forward_unpacked(input, states, lengths)
119
 
120
  def _forward_unpacked(self, input: torch.Tensor, states, lengths=None):
 
121
  if not self.batch_first:
122
  input = input.transpose(0, 1)
123
 
124
  batch_size, seq_len, _ = input.size()
125
 
126
- # Initialize states if not provided
127
  if states is None:
 
128
  h_t_forward = [input.new_zeros(batch_size, self.hidden_size)
129
  for _ in range(self.num_layers)]
130
  c_t_forward = [input.new_zeros(batch_size, self.hidden_size)
@@ -134,23 +131,24 @@ class BidirectionalLSTM(nn.Module):
134
  c_t_backward = [input.new_zeros(batch_size, self.hidden_size)
135
  for _ in range(self.num_layers)]
136
  else:
 
137
  h0, c0 = states
138
- # h0 and c0 are [num_layers * 2, batch_size, hidden_size]
139
  h_t_forward = []
140
  c_t_forward = []
141
  h_t_backward = []
142
  c_t_backward = []
143
 
 
144
  for layer in range(self.num_layers):
145
  h_t_forward.append(h0[layer * 2])
146
  c_t_forward.append(c0[layer * 2])
147
  h_t_backward.append(h0[layer * 2 + 1])
148
  c_t_backward.append(c0[layer * 2 + 1])
149
 
150
- # Process through layers
151
  layer_input = input
152
  for layer_idx in range(self.num_layers):
153
- # Forward direction
154
  forward_output = input.new_zeros(batch_size, seq_len, self.hidden_size)
155
  for t in range(seq_len):
156
  x = layer_input[:, t, :]
@@ -159,7 +157,7 @@ class BidirectionalLSTM(nn.Module):
159
  c_t_forward[layer_idx] = c
160
  forward_output[:, t, :] = h
161
 
162
- # Backward direction
163
  backward_output = input.new_zeros(batch_size, seq_len, self.hidden_size)
164
  for t in reversed(range(seq_len)):
165
  x = layer_input[:, t, :]
@@ -168,19 +166,20 @@ class BidirectionalLSTM(nn.Module):
168
  c_t_backward[layer_idx] = c
169
  backward_output[:, t, :] = h
170
 
171
- # Concatenate forward and backward
172
  layer_output = torch.cat([forward_output, backward_output], dim=2)
173
 
174
- # Apply dropout between layers (except last layer)
175
  if self.dropout > 0 and layer_idx < self.num_layers - 1:
176
  layer_output = self.dropout_layers[layer_idx](layer_output)
177
 
 
178
  layer_input = layer_output
179
 
180
- # Final output
181
  outputs = layer_output
182
 
183
- # Stack hidden and cell states
184
  h_n = []
185
  c_n = []
186
  for layer in range(self.num_layers):
@@ -189,6 +188,7 @@ class BidirectionalLSTM(nn.Module):
189
  h_n = torch.stack(h_n, dim=0)
190
  c_n = torch.stack(c_n, dim=0)
191
 
 
192
  if not self.batch_first:
193
  outputs = outputs.transpose(0, 1)
194
 
@@ -209,7 +209,7 @@ class LSTM(nn.Module):
209
  self.hidden_size = hidden_size
210
  self.num_layers = num_layers
211
 
212
- # Embedding layer
213
  self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
214
  self.embed_dropout = nn.Dropout(dropout)
215
 
@@ -222,8 +222,8 @@ class LSTM(nn.Module):
222
  dropout=dropout if num_layers > 1 else 0.0
223
  )
224
 
225
- # Output projection layer
226
- lstm_output_size = hidden_size * 2 # bidirectional
227
  self.fc = nn.Linear(lstm_output_size, num_classes)
228
  self.output_dropout = nn.Dropout(dropout)
229
 
@@ -236,11 +236,11 @@ class LSTM(nn.Module):
236
  Returns:
237
  logits: class predictions [batch_size, seq_len, num_classes]
238
  """
239
- # Embedding
240
- embedded = self.embedding(input_ids) # [batch_size, seq_len, embed_size]
241
  embedded = self.embed_dropout(embedded)
242
 
243
- # Pack if lengths provided for efficiency
244
  if lengths is not None:
245
  packed_embedded = pack_padded_sequence(
246
  embedded, lengths.cpu(),
@@ -248,40 +248,27 @@ class LSTM(nn.Module):
248
  enforce_sorted=False
249
  )
250
  lstm_out, _ = self.lstm(packed_embedded)
 
251
  lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
252
  else:
 
253
  lstm_out, _ = self.lstm(embedded)
254
 
255
- # Apply dropout and project to output
256
  lstm_out = self.output_dropout(lstm_out)
257
- logits = self.fc(lstm_out) # [batch_size, seq_len, num_classes]
258
 
259
  return logits
260
 
261
  def create_lstm_pii_model(vocab_size: int, num_classes: int, d_model: int = 256,
262
  num_heads: int = 8, d_ff: int = 512, num_layers: int = 4,
263
  dropout: float = 0.1, max_len: int = 512):
264
- """
265
- Create Bidirectional LSTM model for PII detection
266
- Note: num_heads and d_ff are ignored (kept for compatibility with transformer interface)
267
-
268
- Args:
269
- vocab_size: size of vocabulary
270
- num_classes: number of output classes (PII tags)
271
- d_model: hidden dimension size
272
- num_heads: ignored (for compatibility)
273
- d_ff: ignored (for compatibility)
274
- num_layers: number of LSTM layers
275
- dropout: dropout rate
276
- max_len: maximum sequence length
277
-
278
- Returns:
279
- LSTM
280
- """
281
  return LSTM(
282
  vocab_size=vocab_size,
283
  num_classes=num_classes,
284
- embed_size=d_model // 2, # Use half of d_model as embedding size
285
  hidden_size=d_model,
286
  num_layers=num_layers,
287
  dropout=dropout,
 
12
  self.input_size = input_size
13
  self.hidden_size = hidden_size
14
 
15
+ # Weight matrices and biases for each gate
16
+ # Input gate parameters
17
  self.W_ii = nn.Parameter(torch.Tensor(input_size, hidden_size))
18
  self.W_hi = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
19
  self.b_i = nn.Parameter(torch.Tensor(hidden_size))
20
 
21
+ # Forget gate parameters
22
  self.W_if = nn.Parameter(torch.Tensor(input_size, hidden_size))
23
  self.W_hf = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
24
  self.b_f = nn.Parameter(torch.Tensor(hidden_size))
25
 
26
+ # Candidate values parameters
27
  self.W_in = nn.Parameter(torch.Tensor(input_size, hidden_size))
28
  self.W_hn = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
29
  self.b_n = nn.Parameter(torch.Tensor(hidden_size))
30
 
31
+ # Output gate parameters
32
  self.W_io = nn.Parameter(torch.Tensor(input_size, hidden_size))
33
  self.W_ho = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
34
  self.b_o = nn.Parameter(torch.Tensor(hidden_size))
35
 
36
+ # Initialize weights using Xavier initialization
37
  for name, param in self.named_parameters():
38
  if 'W_' in name:
39
  nn.init.xavier_uniform_(param)
 
41
  nn.init.zeros_(param)
42
 
43
  def forward(self, input: torch.Tensor, states: tuple[torch.Tensor, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
44
+ """Forward pass for one time step"""
45
+ # Unpack previous states
 
 
 
 
 
 
 
 
46
  hidden, cell = states
47
 
48
+ # Calculate forget gate - decides what to forget from previous cell state
 
49
  forget_gate = torch.sigmoid(torch.mm(input, self.W_if) + torch.mm(hidden, self.W_hf) + self.b_f)
50
 
51
+ # Calculate input gate - decides what new information to store
52
  input_gate = torch.sigmoid(torch.mm(input, self.W_ii) + torch.mm(hidden, self.W_hi) + self.b_i)
53
 
54
+ # Calculate candidate values - new information that could be added
55
  candidate = torch.tanh(torch.mm(input, self.W_in) + torch.mm(hidden, self.W_hn) + self.b_n)
56
 
57
+ # Calculate output gate - decides what parts of cell state to output
58
  output_gate = torch.sigmoid(torch.mm(input, self.W_io) + torch.mm(hidden, self.W_ho) + self.b_o)
59
 
60
+ # Update cell state by forgetting old info and adding new info
61
  new_cell = forget_gate * cell + input_gate * candidate
62
 
63
+ # Generate new hidden state based on filtered cell state
64
  new_hidden = output_gate * torch.tanh(new_cell)
65
 
66
  return new_hidden, new_cell
 
78
  self.batch_first = batch_first
79
  self.dropout = dropout if num_layers > 1 else 0.0
80
 
81
+ # Create forward and backward LSTM cells for each layer
82
  self.forward_cells = nn.ModuleList()
83
  self.backward_cells = nn.ModuleList()
84
  self.dropout_layers = nn.ModuleList() if self.dropout > 0 else None
85
 
86
  for layer in range(num_layers):
87
+ # First layer takes input_size, others take concatenated bidirectional output
88
  layer_input_size = input_size if layer == 0 else hidden_size * 2
89
 
90
+ # Add forward and backward cells for this layer
91
  self.forward_cells.append(LSTMCell(layer_input_size, hidden_size))
92
  self.backward_cells.append(LSTMCell(layer_input_size, hidden_size))
93
 
94
+ # Add dropout between layers (except after last layer)
95
  if self.dropout > 0 and layer < num_layers - 1:
96
  self.dropout_layers.append(nn.Dropout(dropout))
97
 
98
  def forward(self, input, states=None, lengths=None):
99
+ # Check if input is packed sequence
100
  is_packed = isinstance(input, PackedSequence)
101
  if is_packed:
102
+ # Unpack for processing
103
  padded, lengths = pad_packed_sequence(input, batch_first=self.batch_first)
104
  outputs, (h_n, c_n) = self._forward_unpacked(padded, states, lengths)
105
+ # Pack output back
106
  packed_out = pack_padded_sequence(
107
  outputs, lengths,
108
  batch_first=self.batch_first,
 
113
  return self._forward_unpacked(input, states, lengths)
114
 
115
  def _forward_unpacked(self, input: torch.Tensor, states, lengths=None):
116
+ # Convert to batch-first if needed
117
  if not self.batch_first:
118
  input = input.transpose(0, 1)
119
 
120
  batch_size, seq_len, _ = input.size()
121
 
122
+ # Initialize hidden and cell states if not provided
123
  if states is None:
124
+ # Create zero states for each layer and direction
125
  h_t_forward = [input.new_zeros(batch_size, self.hidden_size)
126
  for _ in range(self.num_layers)]
127
  c_t_forward = [input.new_zeros(batch_size, self.hidden_size)
 
131
  c_t_backward = [input.new_zeros(batch_size, self.hidden_size)
132
  for _ in range(self.num_layers)]
133
  else:
134
+ # Unpack provided states
135
  h0, c0 = states
 
136
  h_t_forward = []
137
  c_t_forward = []
138
  h_t_backward = []
139
  c_t_backward = []
140
 
141
+ # Separate forward and backward states for each layer
142
  for layer in range(self.num_layers):
143
  h_t_forward.append(h0[layer * 2])
144
  c_t_forward.append(c0[layer * 2])
145
  h_t_backward.append(h0[layer * 2 + 1])
146
  c_t_backward.append(c0[layer * 2 + 1])
147
 
148
+ # Process through each layer
149
  layer_input = input
150
  for layer_idx in range(self.num_layers):
151
+ # Process forward direction
152
  forward_output = input.new_zeros(batch_size, seq_len, self.hidden_size)
153
  for t in range(seq_len):
154
  x = layer_input[:, t, :]
 
157
  c_t_forward[layer_idx] = c
158
  forward_output[:, t, :] = h
159
 
160
+ # Process backward direction
161
  backward_output = input.new_zeros(batch_size, seq_len, self.hidden_size)
162
  for t in reversed(range(seq_len)):
163
  x = layer_input[:, t, :]
 
166
  c_t_backward[layer_idx] = c
167
  backward_output[:, t, :] = h
168
 
169
+ # Concatenate forward and backward outputs
170
  layer_output = torch.cat([forward_output, backward_output], dim=2)
171
 
172
+ # Apply dropout between layers
173
  if self.dropout > 0 and layer_idx < self.num_layers - 1:
174
  layer_output = self.dropout_layers[layer_idx](layer_output)
175
 
176
+ # Use this layer's output as next layer's input
177
  layer_input = layer_output
178
 
179
+ # Final output is the last layer's output
180
  outputs = layer_output
181
 
182
+ # Stack final hidden and cell states for all layers
183
  h_n = []
184
  c_n = []
185
  for layer in range(self.num_layers):
 
188
  h_n = torch.stack(h_n, dim=0)
189
  c_n = torch.stack(c_n, dim=0)
190
 
191
+ # Convert back if not batch-first
192
  if not self.batch_first:
193
  outputs = outputs.transpose(0, 1)
194
 
 
209
  self.hidden_size = hidden_size
210
  self.num_layers = num_layers
211
 
212
+ # Embedding layer to convert tokens to vectors
213
  self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
214
  self.embed_dropout = nn.Dropout(dropout)
215
 
 
222
  dropout=dropout if num_layers > 1 else 0.0
223
  )
224
 
225
+ # Output layer to predict PII labels
226
+ lstm_output_size = hidden_size * 2 # doubled for bidirectional
227
  self.fc = nn.Linear(lstm_output_size, num_classes)
228
  self.output_dropout = nn.Dropout(dropout)
229
 
 
236
  Returns:
237
  logits: class predictions [batch_size, seq_len, num_classes]
238
  """
239
+ # Convert token ids to embeddings
240
+ embedded = self.embedding(input_ids)
241
  embedded = self.embed_dropout(embedded)
242
 
243
+ # Pack sequences for efficient processing if lengths provided
244
  if lengths is not None:
245
  packed_embedded = pack_padded_sequence(
246
  embedded, lengths.cpu(),
 
248
  enforce_sorted=False
249
  )
250
  lstm_out, _ = self.lstm(packed_embedded)
251
+ # Unpack the output
252
  lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
253
  else:
254
+ # Process without packing
255
  lstm_out, _ = self.lstm(embedded)
256
 
257
+ # Apply dropout and get final predictions
258
  lstm_out = self.output_dropout(lstm_out)
259
+ logits = self.fc(lstm_out)
260
 
261
  return logits
262
 
263
  def create_lstm_pii_model(vocab_size: int, num_classes: int, d_model: int = 256,
264
  num_heads: int = 8, d_ff: int = 512, num_layers: int = 4,
265
  dropout: float = 0.1, max_len: int = 512):
266
+ """Create Bidirectional LSTM model for PII detection"""
267
+ # Create LSTM with appropriate dimensions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  return LSTM(
269
  vocab_size=vocab_size,
270
  num_classes=num_classes,
271
+ embed_size=d_model // 2,
272
  hidden_size=d_model,
273
  num_layers=num_layers,
274
  dropout=dropout,
lstm_training.ipynb CHANGED
@@ -42,7 +42,7 @@
42
  },
43
  {
44
  "cell_type": "code",
45
- "execution_count": 4,
46
  "id": "1207cd93",
47
  "metadata": {
48
  "execution": {
@@ -62,19 +62,23 @@
62
  },
63
  "outputs": [],
64
  "source": [
 
65
  "class Vocabulary:\n",
66
  " \"\"\"Vocabulary class for encoding/decoding text and labels\"\"\"\n",
67
  " def __init__(self, max_size=100000):\n",
 
68
  " self.word2idx = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}\n",
69
  " self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<start>', 3: '<end>'}\n",
70
  " self.word_count = Counter()\n",
71
  " self.max_size = max_size\n",
72
  " \n",
73
  " def add_sentence(self, sentence):\n",
 
74
  " for word in sentence:\n",
75
  " self.word_count[word.lower()] += 1\n",
76
  " \n",
77
  " def build(self):\n",
 
78
  " most_common = self.word_count.most_common(self.max_size - len(self.word2idx))\n",
79
  " for word, _ in most_common:\n",
80
  " if word not in self.word2idx:\n",
@@ -86,15 +90,17 @@
86
  " return len(self.word2idx)\n",
87
  " \n",
88
  " def encode(self, sentence):\n",
 
89
  " return [self.word2idx.get(word.lower(), self.word2idx['<unk>']) for word in sentence]\n",
90
  " \n",
91
  " def decode(self, indices):\n",
 
92
  " return [self.idx2word.get(idx, '<unk>') for idx in indices]"
93
  ]
94
  },
95
  {
96
  "cell_type": "code",
97
- "execution_count": 5,
98
  "id": "f4056292",
99
  "metadata": {
100
  "execution": {
@@ -114,6 +120,7 @@
114
  },
115
  "outputs": [],
116
  "source": [
 
117
  "class PIIDataset(Dataset):\n",
118
  " \"\"\"PyTorch Dataset for PII detection\"\"\"\n",
119
  " def __init__(self, tokens, labels, text_vocab, label_vocab, max_len=512):\n",
@@ -127,16 +134,16 @@
127
  " return len(self.tokens)\n",
128
  " \n",
129
  " def __getitem__(self, idx):\n",
130
- " # Add start and end tokens\n",
131
  " tokens = ['<start>'] + self.tokens[idx] + ['<end>']\n",
132
  " labels = ['<start>'] + self.labels[idx] + ['<end>']\n",
133
  " \n",
134
- " # Truncate if too long\n",
135
  " if len(tokens) > self.max_len:\n",
136
  " tokens = tokens[:self.max_len-1] + ['<end>']\n",
137
  " labels = labels[:self.max_len-1] + ['<end>']\n",
138
  " \n",
139
- " # Encode\n",
140
  " token_ids = self.text_vocab.encode(tokens)\n",
141
  " label_ids = self.label_vocab.encode(labels)\n",
142
  " \n",
@@ -145,7 +152,7 @@
145
  },
146
  {
147
  "cell_type": "code",
148
- "execution_count": 6,
149
  "id": "499deba2",
150
  "metadata": {
151
  "execution": {
@@ -167,7 +174,9 @@
167
  "source": [
168
  "def collate_fn(batch):\n",
169
  " \"\"\"Custom collate function for padding sequences\"\"\"\n",
 
170
  " tokens, labels = zip(*batch)\n",
 
171
  " tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=0)\n",
172
  " labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)\n",
173
  " return tokens_padded, labels_padded"
@@ -175,7 +184,7 @@
175
  },
176
  {
177
  "cell_type": "code",
178
- "execution_count": 7,
179
  "id": "7ade0505",
180
  "metadata": {
181
  "execution": {
@@ -195,6 +204,7 @@
195
  },
196
  "outputs": [],
197
  "source": [
 
198
  "class F1ScoreMetric:\n",
199
  " \"\"\"Custom F1 score metric with beta parameter\"\"\"\n",
200
  " def __init__(self, beta=5, num_classes=20, ignore_index=0, label_vocab=None):\n",
@@ -205,26 +215,32 @@
205
  " self.reset()\n",
206
  " \n",
207
  " def reset(self):\n",
 
208
  " self.true_positives = 0\n",
209
  " self.false_positives = 0\n",
210
  " self.false_negatives = 0\n",
211
  " self.class_metrics = {}\n",
212
  " \n",
213
  " def update(self, predictions, targets):\n",
 
214
  " mask = (targets != self.ignore_index) & (targets != 2) & (targets != 3)\n",
215
  " o_idx = self.label_vocab.word2idx.get('o', -1) if self.label_vocab else -1\n",
216
  " \n",
 
217
  " for class_id in range(1, self.num_classes):\n",
218
  " if class_id == o_idx:\n",
219
  " continue\n",
220
- " \n",
 
221
  " pred_mask = (predictions == class_id) & mask\n",
222
  " true_mask = (targets == class_id) & mask\n",
223
  " \n",
 
224
  " tp = ((pred_mask) & (true_mask)).sum().item()\n",
225
  " fp = ((pred_mask) & (~true_mask)).sum().item()\n",
226
  " fn = ((~pred_mask) & (true_mask)).sum().item()\n",
227
  " \n",
 
228
  " self.true_positives += tp\n",
229
  " self.false_positives += fp\n",
230
  " self.false_negatives += fn\n",
@@ -236,6 +252,7 @@
236
  " self.class_metrics[class_id]['fn'] += fn\n",
237
  " \n",
238
  " def compute(self):\n",
 
239
  " beta_squared = self.beta ** 2\n",
240
  " precision = self.true_positives / (self.true_positives + self.false_positives + 1e-8)\n",
241
  " recall = self.true_positives / (self.true_positives + self.false_negatives + 1e-8)\n",
@@ -243,6 +260,7 @@
243
  " return f1\n",
244
  " \n",
245
  " def get_class_metrics(self):\n",
 
246
  " results = {}\n",
247
  " for class_id, metrics in self.class_metrics.items():\n",
248
  " if self.label_vocab and class_id in self.label_vocab.idx2word:\n",
@@ -261,7 +279,7 @@
261
  },
262
  {
263
  "cell_type": "code",
264
- "execution_count": 8,
265
  "id": "361b5505",
266
  "metadata": {
267
  "execution": {
@@ -281,6 +299,7 @@
281
  },
282
  "outputs": [],
283
  "source": [
 
284
  "class FocalLoss(nn.Module):\n",
285
  " \"\"\"Focal Loss for addressing class imbalance\"\"\"\n",
286
  " def __init__(self, alpha=None, gamma=2.0, reduction='mean', ignore_index=-100):\n",
@@ -291,6 +310,7 @@
291
  " self.ignore_index = ignore_index\n",
292
  " \n",
293
  " def forward(self, inputs, targets):\n",
 
294
  " ce_loss = nn.functional.cross_entropy(\n",
295
  " inputs, targets, \n",
296
  " weight=self.alpha, \n",
@@ -298,9 +318,11 @@
298
  " ignore_index=self.ignore_index\n",
299
  " )\n",
300
  " \n",
 
301
  " pt = torch.exp(-ce_loss)\n",
302
  " focal_loss = (1 - pt) ** self.gamma * ce_loss\n",
303
  " \n",
 
304
  " if self.reduction == 'mean':\n",
305
  " return focal_loss.mean()\n",
306
  " elif self.reduction == 'sum':\n",
@@ -311,7 +333,7 @@
311
  },
312
  {
313
  "cell_type": "code",
314
- "execution_count": 9,
315
  "id": "1de646e9",
316
  "metadata": {
317
  "execution": {
@@ -337,8 +359,10 @@
337
  " total_loss = 0\n",
338
  " f1_metric.reset()\n",
339
  " \n",
 
340
  " progress_bar = tqdm(dataloader, desc='Training')\n",
341
  " for batch_idx, (tokens, labels) in enumerate(progress_bar):\n",
 
342
  " tokens = tokens.to(device)\n",
343
  " labels = labels.to(device)\n",
344
  " \n",
@@ -353,6 +377,7 @@
353
  " # Calculate loss and backward pass\n",
354
  " loss = criterion(outputs_flat, labels_flat)\n",
355
  " loss.backward()\n",
 
356
  " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)\n",
357
  " optimizer.step()\n",
358
  " \n",
@@ -372,7 +397,7 @@
372
  },
373
  {
374
  "cell_type": "code",
375
- "execution_count": 10,
376
  "id": "d1ce3b0f",
377
  "metadata": {
378
  "execution": {
@@ -398,8 +423,10 @@
398
  " total_loss = 0\n",
399
  " f1_metric.reset()\n",
400
  " \n",
 
401
  " with torch.no_grad():\n",
402
  " for tokens, labels in tqdm(dataloader, desc='Evaluating'):\n",
 
403
  " tokens = tokens.to(device)\n",
404
  " labels = labels.to(device)\n",
405
  " \n",
@@ -421,7 +448,7 @@
421
  },
422
  {
423
  "cell_type": "code",
424
- "execution_count": 11,
425
  "id": "da3ff80c",
426
  "metadata": {
427
  "execution": {
@@ -445,6 +472,7 @@
445
  " \"\"\"Create a weighted sampler to balance classes during training\"\"\"\n",
446
  " sample_weights = []\n",
447
  " \n",
 
448
  " for idx in range(len(dataset)):\n",
449
  " _, labels = dataset[idx]\n",
450
  " \n",
@@ -453,12 +481,14 @@
453
  " for label_id in labels:\n",
454
  " if label_id > 3: # Skip special tokens\n",
455
  " label_name = label_vocab.idx2word.get(label_id.item(), 'O')\n",
 
456
  " if label_name != 'o' and 'B-' in label_name:\n",
457
  " min_weight = 10.0\n",
458
  " break\n",
459
  " \n",
460
  " sample_weights.append(min_weight)\n",
461
  " \n",
 
462
  " sampler = WeightedRandomSampler(\n",
463
  " weights=sample_weights,\n",
464
  " num_samples=len(sample_weights),\n",
@@ -470,7 +500,7 @@
470
  },
471
  {
472
  "cell_type": "code",
473
- "execution_count": 12,
474
  "id": "69b37e68",
475
  "metadata": {
476
  "execution": {
@@ -493,11 +523,14 @@
493
  "def print_label_distribution(data, title=\"Label Distribution\"):\n",
494
  " \"\"\"Print label distribution statistics\"\"\"\n",
495
  " label_counts = Counter()\n",
 
 
496
  " for label_seq in data.labels:\n",
497
  " for label in label_seq:\n",
498
  " if label not in ['<pad>', '<start>', '<end>']:\n",
499
  " label_counts[label] += 1\n",
500
  " \n",
 
501
  " print(f\"\\n{title}:\")\n",
502
  " print(\"-\" * 50)\n",
503
  " total = sum(label_counts.values())\n",
@@ -510,7 +543,7 @@
510
  },
511
  {
512
  "cell_type": "code",
513
- "execution_count": 13,
514
  "id": "4b1b4f86",
515
  "metadata": {
516
  "execution": {
@@ -534,7 +567,7 @@
534
  " \"\"\"Save model and all necessary components for deployment\"\"\"\n",
535
  " os.makedirs(save_dir, exist_ok=True)\n",
536
  " \n",
537
- " # Save model state\n",
538
  " model_path = os.path.join(save_dir, 'pii_lstm_model.pt')\n",
539
  " torch.save(model.state_dict(), model_path)\n",
540
  " \n",
@@ -560,7 +593,7 @@
560
  },
561
  {
562
  "cell_type": "code",
563
- "execution_count": 14,
564
  "id": "31d2f1b1",
565
  "metadata": {
566
  "execution": {
@@ -596,7 +629,7 @@
596
  " data = pd.read_json(data_path, lines=True)\n",
597
  " print(f\"Total samples: {len(data)}\")\n",
598
  " \n",
599
- " # Print initial label distribution\n",
600
  " print_label_distribution(data, \"Label Distribution in Augmented Data\")\n",
601
  " \n",
602
  " # Build vocabularies\n",
@@ -609,6 +642,7 @@
609
  " for labels in data.labels:\n",
610
  " label_vocab.add_sentence(labels)\n",
611
  " \n",
 
612
  " text_vocab.build()\n",
613
  " label_vocab.build()\n",
614
  " \n",
@@ -616,11 +650,11 @@
616
  " print(f\" - Text vocabulary: {len(text_vocab):,}\")\n",
617
  " print(f\" - Label vocabulary: {len(label_vocab)}\")\n",
618
  " \n",
619
- " # Calculate class weights\n",
620
  " class_weights = calculate_class_weights(data, label_vocab)\n",
621
  " class_weights = class_weights.to(device)\n",
622
  " \n",
623
- " # Split data\n",
624
  " X_train, X_val, y_train, y_val = train_test_split(\n",
625
  " data.tokens.tolist(),\n",
626
  " data.labels.tolist(),\n",
@@ -632,14 +666,15 @@
632
  " print(f\" - Train samples: {len(X_train):,}\")\n",
633
  " print(f\" - Validation samples: {len(X_val):,}\")\n",
634
  " \n",
635
- " # Create datasets and dataloaders\n",
636
  " max_seq_len = 512\n",
637
  " train_dataset = PIIDataset(X_train, y_train, text_vocab, label_vocab, max_len=max_seq_len)\n",
638
  " val_dataset = PIIDataset(X_val, y_val, text_vocab, label_vocab, max_len=max_seq_len)\n",
639
  " \n",
640
- " # Use balanced sampler for training\n",
641
  " train_sampler = create_balanced_sampler(train_dataset, label_vocab)\n",
642
  " \n",
 
643
  " train_loader = DataLoader(\n",
644
  " train_dataset, \n",
645
  " batch_size=batch_size,\n",
@@ -668,7 +703,7 @@
668
  " 'max_len': max_seq_len\n",
669
  " }\n",
670
  " \n",
671
- " # Create model\n",
672
  " print(\"\\nCreating LSTM model...\")\n",
673
  " model = create_lstm_pii_model(**model_config).to(device)\n",
674
  " print(f\"Model parameters: {sum(p.numel() for p in model.parameters()):,}\")\n",
@@ -701,11 +736,11 @@
701
  " min_lr=1e-6\n",
702
  " )\n",
703
  " \n",
704
- " # Metrics\n",
705
  " f1_metric_train = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
706
  " f1_metric_val = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
707
  " \n",
708
- " # Training loop\n",
709
  " train_losses, train_f1s, val_losses, val_f1s = [], [], [], []\n",
710
  " best_val_f1 = 0\n",
711
  " patience = 7\n",
@@ -714,18 +749,20 @@
714
  " print(\"\\nStarting training...\")\n",
715
  " print(\"=\" * 60)\n",
716
  " \n",
 
717
  " for epoch in range(num_epochs):\n",
718
  " print(f\"\\nEpoch {epoch+1}/{num_epochs}\")\n",
719
  " \n",
720
- " # Train and validate\n",
721
  " train_loss, train_f1 = train_epoch(\n",
722
  " model, train_loader, optimizer, criterion, device, f1_metric_train\n",
723
  " )\n",
 
724
  " val_loss, val_f1 = evaluate(\n",
725
  " model, val_loader, criterion, device, f1_metric_val\n",
726
  " )\n",
727
  " \n",
728
- " # Step scheduler based on validation loss\n",
729
  " scheduler.step(val_loss)\n",
730
  " \n",
731
  " # Store metrics\n",
@@ -1282,9 +1319,11 @@
1282
  }
1283
  ],
1284
  "source": [
 
1285
  "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
1286
  "print(f\"Using device: {device}\")\n",
1287
  "\n",
 
1288
  "model, text_vocab, label_vocab = train_lstm_pii_model(\n",
1289
  " data_path='train_augmented.json',\n",
1290
  " num_epochs=20,\n",
 
42
  },
43
  {
44
  "cell_type": "code",
45
+ "execution_count": null,
46
  "id": "1207cd93",
47
  "metadata": {
48
  "execution": {
 
62
  },
63
  "outputs": [],
64
  "source": [
65
+ "# Define vocabulary class for text and label encoding\n",
66
  "class Vocabulary:\n",
67
  " \"\"\"Vocabulary class for encoding/decoding text and labels\"\"\"\n",
68
  " def __init__(self, max_size=100000):\n",
69
+ " # Initialize special tokens\n",
70
  " self.word2idx = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}\n",
71
  " self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<start>', 3: '<end>'}\n",
72
  " self.word_count = Counter()\n",
73
  " self.max_size = max_size\n",
74
  " \n",
75
  " def add_sentence(self, sentence):\n",
76
+ " # Count word frequencies\n",
77
  " for word in sentence:\n",
78
  " self.word_count[word.lower()] += 1\n",
79
  " \n",
80
  " def build(self):\n",
81
+ " # Build vocabulary from most common words\n",
82
  " most_common = self.word_count.most_common(self.max_size - len(self.word2idx))\n",
83
  " for word, _ in most_common:\n",
84
  " if word not in self.word2idx:\n",
 
90
  " return len(self.word2idx)\n",
91
  " \n",
92
  " def encode(self, sentence):\n",
93
+ " # Convert words to indices\n",
94
  " return [self.word2idx.get(word.lower(), self.word2idx['<unk>']) for word in sentence]\n",
95
  " \n",
96
  " def decode(self, indices):\n",
97
+ " # Convert indices back to words\n",
98
  " return [self.idx2word.get(idx, '<unk>') for idx in indices]"
99
  ]
100
  },
101
  {
102
  "cell_type": "code",
103
+ "execution_count": null,
104
  "id": "f4056292",
105
  "metadata": {
106
  "execution": {
 
120
  },
121
  "outputs": [],
122
  "source": [
123
+ "# Dataset class for PII detection\n",
124
  "class PIIDataset(Dataset):\n",
125
  " \"\"\"PyTorch Dataset for PII detection\"\"\"\n",
126
  " def __init__(self, tokens, labels, text_vocab, label_vocab, max_len=512):\n",
 
134
  " return len(self.tokens)\n",
135
  " \n",
136
  " def __getitem__(self, idx):\n",
137
+ " # Add special tokens to beginning and end\n",
138
  " tokens = ['<start>'] + self.tokens[idx] + ['<end>']\n",
139
  " labels = ['<start>'] + self.labels[idx] + ['<end>']\n",
140
  " \n",
141
+ " # Truncate if sequence is too long\n",
142
  " if len(tokens) > self.max_len:\n",
143
  " tokens = tokens[:self.max_len-1] + ['<end>']\n",
144
  " labels = labels[:self.max_len-1] + ['<end>']\n",
145
  " \n",
146
+ " # Encode tokens and labels to indices\n",
147
  " token_ids = self.text_vocab.encode(tokens)\n",
148
  " label_ids = self.label_vocab.encode(labels)\n",
149
  " \n",
 
152
  },
153
  {
154
  "cell_type": "code",
155
+ "execution_count": null,
156
  "id": "499deba2",
157
  "metadata": {
158
  "execution": {
 
174
  "source": [
175
  "def collate_fn(batch):\n",
176
  " \"\"\"Custom collate function for padding sequences\"\"\"\n",
177
+ " # Separate tokens and labels\n",
178
  " tokens, labels = zip(*batch)\n",
179
+ " # Pad sequences to same length\n",
180
  " tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=0)\n",
181
  " labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)\n",
182
  " return tokens_padded, labels_padded"
 
184
  },
185
  {
186
  "cell_type": "code",
187
+ "execution_count": null,
188
  "id": "7ade0505",
189
  "metadata": {
190
  "execution": {
 
204
  },
205
  "outputs": [],
206
  "source": [
207
+ "# F1 score metric for evaluation\n",
208
  "class F1ScoreMetric:\n",
209
  " \"\"\"Custom F1 score metric with beta parameter\"\"\"\n",
210
  " def __init__(self, beta=5, num_classes=20, ignore_index=0, label_vocab=None):\n",
 
215
  " self.reset()\n",
216
  " \n",
217
  " def reset(self):\n",
218
+ " # Reset counters\n",
219
  " self.true_positives = 0\n",
220
  " self.false_positives = 0\n",
221
  " self.false_negatives = 0\n",
222
  " self.class_metrics = {}\n",
223
  " \n",
224
  " def update(self, predictions, targets):\n",
225
+ " # Create mask to ignore padding and special tokens\n",
226
  " mask = (targets != self.ignore_index) & (targets != 2) & (targets != 3)\n",
227
  " o_idx = self.label_vocab.word2idx.get('o', -1) if self.label_vocab else -1\n",
228
  " \n",
229
+ " # Calculate metrics for each class\n",
230
  " for class_id in range(1, self.num_classes):\n",
231
  " if class_id == o_idx:\n",
232
  " continue\n",
233
+ " \n",
234
+ " # Find where predictions and targets match this class\n",
235
  " pred_mask = (predictions == class_id) & mask\n",
236
  " true_mask = (targets == class_id) & mask\n",
237
  " \n",
238
+ " # Count true positives, false positives, false negatives\n",
239
  " tp = ((pred_mask) & (true_mask)).sum().item()\n",
240
  " fp = ((pred_mask) & (~true_mask)).sum().item()\n",
241
  " fn = ((~pred_mask) & (true_mask)).sum().item()\n",
242
  " \n",
243
+ " # Update total counts\n",
244
  " self.true_positives += tp\n",
245
  " self.false_positives += fp\n",
246
  " self.false_negatives += fn\n",
 
252
  " self.class_metrics[class_id]['fn'] += fn\n",
253
  " \n",
254
  " def compute(self):\n",
255
+ " # Calculate F-beta score\n",
256
  " beta_squared = self.beta ** 2\n",
257
  " precision = self.true_positives / (self.true_positives + self.false_positives + 1e-8)\n",
258
  " recall = self.true_positives / (self.true_positives + self.false_negatives + 1e-8)\n",
 
260
  " return f1\n",
261
  " \n",
262
  " def get_class_metrics(self):\n",
263
+ " # Get metrics for each class\n",
264
  " results = {}\n",
265
  " for class_id, metrics in self.class_metrics.items():\n",
266
  " if self.label_vocab and class_id in self.label_vocab.idx2word:\n",
 
279
  },
280
  {
281
  "cell_type": "code",
282
+ "execution_count": null,
283
  "id": "361b5505",
284
  "metadata": {
285
  "execution": {
 
299
  },
300
  "outputs": [],
301
  "source": [
302
+ "# Focal loss for handling class imbalance\n",
303
  "class FocalLoss(nn.Module):\n",
304
  " \"\"\"Focal Loss for addressing class imbalance\"\"\"\n",
305
  " def __init__(self, alpha=None, gamma=2.0, reduction='mean', ignore_index=-100):\n",
 
310
  " self.ignore_index = ignore_index\n",
311
  " \n",
312
  " def forward(self, inputs, targets):\n",
313
+ " # Calculate cross entropy loss\n",
314
  " ce_loss = nn.functional.cross_entropy(\n",
315
  " inputs, targets, \n",
316
  " weight=self.alpha, \n",
 
318
  " ignore_index=self.ignore_index\n",
319
  " )\n",
320
  " \n",
321
+ " # Apply focal term to focus on hard examples\n",
322
  " pt = torch.exp(-ce_loss)\n",
323
  " focal_loss = (1 - pt) ** self.gamma * ce_loss\n",
324
  " \n",
325
+ " # Reduce loss based on specified method\n",
326
  " if self.reduction == 'mean':\n",
327
  " return focal_loss.mean()\n",
328
  " elif self.reduction == 'sum':\n",
 
333
  },
334
  {
335
  "cell_type": "code",
336
+ "execution_count": null,
337
  "id": "1de646e9",
338
  "metadata": {
339
  "execution": {
 
359
  " total_loss = 0\n",
360
  " f1_metric.reset()\n",
361
  " \n",
362
+ " # Progress bar for training\n",
363
  " progress_bar = tqdm(dataloader, desc='Training')\n",
364
  " for batch_idx, (tokens, labels) in enumerate(progress_bar):\n",
365
+ " # Move data to device\n",
366
  " tokens = tokens.to(device)\n",
367
  " labels = labels.to(device)\n",
368
  " \n",
 
377
  " # Calculate loss and backward pass\n",
378
  " loss = criterion(outputs_flat, labels_flat)\n",
379
  " loss.backward()\n",
380
+ " # Clip gradients to prevent exploding gradients\n",
381
  " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)\n",
382
  " optimizer.step()\n",
383
  " \n",
 
397
  },
398
  {
399
  "cell_type": "code",
400
+ "execution_count": null,
401
  "id": "d1ce3b0f",
402
  "metadata": {
403
  "execution": {
 
423
  " total_loss = 0\n",
424
  " f1_metric.reset()\n",
425
  " \n",
426
+ " # No gradient computation during evaluation\n",
427
  " with torch.no_grad():\n",
428
  " for tokens, labels in tqdm(dataloader, desc='Evaluating'):\n",
429
+ " # Move data to device\n",
430
  " tokens = tokens.to(device)\n",
431
  " labels = labels.to(device)\n",
432
  " \n",
 
448
  },
449
  {
450
  "cell_type": "code",
451
+ "execution_count": null,
452
  "id": "da3ff80c",
453
  "metadata": {
454
  "execution": {
 
472
  " \"\"\"Create a weighted sampler to balance classes during training\"\"\"\n",
473
  " sample_weights = []\n",
474
  " \n",
475
+ " # Calculate weight for each sample\n",
476
  " for idx in range(len(dataset)):\n",
477
  " _, labels = dataset[idx]\n",
478
  " \n",
 
481
  " for label_id in labels:\n",
482
  " if label_id > 3: # Skip special tokens\n",
483
  " label_name = label_vocab.idx2word.get(label_id.item(), 'O')\n",
484
+ " # If sample contains PII, give it higher weight\n",
485
  " if label_name != 'o' and 'B-' in label_name:\n",
486
  " min_weight = 10.0\n",
487
  " break\n",
488
  " \n",
489
  " sample_weights.append(min_weight)\n",
490
  " \n",
491
+ " # Create weighted sampler\n",
492
  " sampler = WeightedRandomSampler(\n",
493
  " weights=sample_weights,\n",
494
  " num_samples=len(sample_weights),\n",
 
500
  },
501
  {
502
  "cell_type": "code",
503
+ "execution_count": null,
504
  "id": "69b37e68",
505
  "metadata": {
506
  "execution": {
 
523
  "def print_label_distribution(data, title=\"Label Distribution\"):\n",
524
  " \"\"\"Print label distribution statistics\"\"\"\n",
525
  " label_counts = Counter()\n",
526
+ "\n",
527
+ " # Count each label type\n",
528
  " for label_seq in data.labels:\n",
529
  " for label in label_seq:\n",
530
  " if label not in ['<pad>', '<start>', '<end>']:\n",
531
  " label_counts[label] += 1\n",
532
  " \n",
533
+ " # Print distribution\n",
534
  " print(f\"\\n{title}:\")\n",
535
  " print(\"-\" * 50)\n",
536
  " total = sum(label_counts.values())\n",
 
543
  },
544
  {
545
  "cell_type": "code",
546
+ "execution_count": null,
547
  "id": "4b1b4f86",
548
  "metadata": {
549
  "execution": {
 
567
  " \"\"\"Save model and all necessary components for deployment\"\"\"\n",
568
  " os.makedirs(save_dir, exist_ok=True)\n",
569
  " \n",
570
+ " # Save model weights\n",
571
  " model_path = os.path.join(save_dir, 'pii_lstm_model.pt')\n",
572
  " torch.save(model.state_dict(), model_path)\n",
573
  " \n",
 
593
  },
594
  {
595
  "cell_type": "code",
596
+ "execution_count": null,
597
  "id": "31d2f1b1",
598
  "metadata": {
599
  "execution": {
 
629
  " data = pd.read_json(data_path, lines=True)\n",
630
  " print(f\"Total samples: {len(data)}\")\n",
631
  " \n",
632
+ " # Show label distribution\n",
633
  " print_label_distribution(data, \"Label Distribution in Augmented Data\")\n",
634
  " \n",
635
  " # Build vocabularies\n",
 
642
  " for labels in data.labels:\n",
643
  " label_vocab.add_sentence(labels)\n",
644
  " \n",
645
+ " # Build vocabularies from collected words\n",
646
  " text_vocab.build()\n",
647
  " label_vocab.build()\n",
648
  " \n",
 
650
  " print(f\" - Text vocabulary: {len(text_vocab):,}\")\n",
651
  " print(f\" - Label vocabulary: {len(label_vocab)}\")\n",
652
  " \n",
653
+ " # Calculate class weights for balanced loss\n",
654
  " class_weights = calculate_class_weights(data, label_vocab)\n",
655
  " class_weights = class_weights.to(device)\n",
656
  " \n",
657
+ " # Split data into train and validation sets\n",
658
  " X_train, X_val, y_train, y_val = train_test_split(\n",
659
  " data.tokens.tolist(),\n",
660
  " data.labels.tolist(),\n",
 
666
  " print(f\" - Train samples: {len(X_train):,}\")\n",
667
  " print(f\" - Validation samples: {len(X_val):,}\")\n",
668
  " \n",
669
+ " # Create datasets\n",
670
  " max_seq_len = 512\n",
671
  " train_dataset = PIIDataset(X_train, y_train, text_vocab, label_vocab, max_len=max_seq_len)\n",
672
  " val_dataset = PIIDataset(X_val, y_val, text_vocab, label_vocab, max_len=max_seq_len)\n",
673
  " \n",
674
+ " # Create balanced sampler for training\n",
675
  " train_sampler = create_balanced_sampler(train_dataset, label_vocab)\n",
676
  " \n",
677
+ " # Create data loaders\n",
678
  " train_loader = DataLoader(\n",
679
  " train_dataset, \n",
680
  " batch_size=batch_size,\n",
 
703
  " 'max_len': max_seq_len\n",
704
  " }\n",
705
  " \n",
706
+ " # Create LSTM model\n",
707
  " print(\"\\nCreating LSTM model...\")\n",
708
  " model = create_lstm_pii_model(**model_config).to(device)\n",
709
  " print(f\"Model parameters: {sum(p.numel() for p in model.parameters()):,}\")\n",
 
736
  " min_lr=1e-6\n",
737
  " )\n",
738
  " \n",
739
+ " # Initialize metrics\n",
740
  " f1_metric_train = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
741
  " f1_metric_val = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
742
  " \n",
743
+ " # Training history\n",
744
  " train_losses, train_f1s, val_losses, val_f1s = [], [], [], []\n",
745
  " best_val_f1 = 0\n",
746
  " patience = 7\n",
 
749
  " print(\"\\nStarting training...\")\n",
750
  " print(\"=\" * 60)\n",
751
  " \n",
752
+ " # Training loop\n",
753
  " for epoch in range(num_epochs):\n",
754
  " print(f\"\\nEpoch {epoch+1}/{num_epochs}\")\n",
755
  " \n",
756
+ " # Train for one epoch\n",
757
  " train_loss, train_f1 = train_epoch(\n",
758
  " model, train_loader, optimizer, criterion, device, f1_metric_train\n",
759
  " )\n",
760
+ " # Evaluate on validation set\n",
761
  " val_loss, val_f1 = evaluate(\n",
762
  " model, val_loader, criterion, device, f1_metric_val\n",
763
  " )\n",
764
  " \n",
765
+ " # Adjust learning rate based on validation loss\n",
766
  " scheduler.step(val_loss)\n",
767
  " \n",
768
  " # Store metrics\n",
 
1319
  }
1320
  ],
1321
  "source": [
1322
+ "# Set device\n",
1323
  "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
1324
  "print(f\"Using device: {device}\")\n",
1325
  "\n",
1326
+ "# Train the LSTM model\n",
1327
  "model, text_vocab, label_vocab = train_lstm_pii_model(\n",
1328
  " data_path='train_augmented.json',\n",
1329
  " num_epochs=20,\n",
transformer.py CHANGED
@@ -4,37 +4,25 @@ import torch.nn.functional as F
4
  import math
5
 
6
  def scaled_dot_product_attention(q, k, v, mask=None, dropout=None):
7
- """
8
- Compute scaled dot-product attention.
9
-
10
- Args:
11
- q: queries (batch_size, num_heads, seq_len_q, d_k)
12
- k: keys (batch_size, num_heads, seq_len_k, d_k)
13
- v: values (batch_size, num_heads, seq_len_v, d_v)
14
- mask: mask tensor (batch_size, 1, 1, seq_len_k) or (batch_size, 1, seq_len_q, seq_len_k)
15
- dropout: dropout layer
16
-
17
- Returns:
18
- output: attended values (batch_size, num_heads, seq_len_q, d_v)
19
- attention_weights: attention weights (batch_size, num_heads, seq_len_q, seq_len_k)
20
- """
21
  d_k = q.size(-1)
22
 
23
- # Calculate attention scores
24
  scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
25
 
26
- # Apply mask if provided
27
  if mask is not None:
28
  scores = scores.masked_fill(mask == 0, float('-inf'))
29
 
30
- # Apply softmax
31
  attention_weights = F.softmax(scores, dim=-1)
32
 
33
- # Apply dropout if provided
34
  if dropout is not None:
35
  attention_weights = dropout(attention_weights)
36
 
37
- # Apply attention to values
38
  output = torch.matmul(attention_weights, v)
39
 
40
  return output, attention_weights
@@ -48,47 +36,42 @@ class MultiHeadAttention(nn.Module):
48
 
49
  self.d_model = d_model
50
  self.num_heads = num_heads
51
- self.d_k = d_model // num_heads
52
 
53
- # Linear projections for Q, K, V
54
  self.w_q = nn.Linear(d_model, d_model)
55
  self.w_k = nn.Linear(d_model, d_model)
56
  self.w_v = nn.Linear(d_model, d_model)
57
 
58
- # Output projection
59
  self.w_o = nn.Linear(d_model, d_model)
60
 
61
- # Dropout
62
  self.dropout = nn.Dropout(dropout)
63
 
64
  def forward(self, query, key, value, mask=None):
65
  """
66
- Args:
67
- query: (batch_size, seq_len_q, d_model)
68
- key: (batch_size, seq_len_k, d_model)
69
- value: (batch_size, seq_len_v, d_model)
70
- mask: (batch_size, 1, 1, seq_len_k) or None
71
-
72
- Returns:
73
- output: (batch_size, seq_len_q, d_model)
74
- attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
75
  """
76
  batch_size = query.size(0)
77
  seq_len_q = query.size(1)
78
  seq_len_k = key.size(1)
79
  seq_len_v = value.size(1)
80
 
81
- # 1. Linear projections in batch from d_model => h x d_k
82
  Q = self.w_q(query).view(batch_size, seq_len_q, self.num_heads, self.d_k).transpose(1, 2)
83
  K = self.w_k(key).view(batch_size, seq_len_k, self.num_heads, self.d_k).transpose(1, 2)
84
  V = self.w_v(value).view(batch_size, seq_len_v, self.num_heads, self.d_k).transpose(1, 2)
85
 
86
- # 2. Apply attention on all the projected vectors in batch
87
  attention_output, attention_weights = scaled_dot_product_attention(
88
  Q, K, V, mask=mask, dropout=self.dropout
89
  )
90
 
91
- # 3. Concatenate heads and put through final linear layer
92
  attention_output = attention_output.transpose(1, 2).contiguous().view(
93
  batch_size, seq_len_q, self.d_model
94
  )
@@ -102,19 +85,14 @@ class PositionwiseFeedForward(nn.Module):
102
 
103
  def __init__(self, d_model, d_ff, dropout=0.1):
104
  super(PositionwiseFeedForward, self).__init__()
 
105
  self.w_1 = nn.Linear(d_model, d_ff)
106
  self.w_2 = nn.Linear(d_ff, d_model)
107
  self.dropout = nn.Dropout(dropout)
108
  self.activation = nn.ReLU()
109
 
110
  def forward(self, x):
111
- """
112
- Args:
113
- x: (batch_size, seq_len, d_model)
114
-
115
- Returns:
116
- output: (batch_size, seq_len, d_model)
117
- """
118
  return self.w_2(self.dropout(self.activation(self.w_1(x))))
119
 
120
  class EncoderLayer(nn.Module):
@@ -123,33 +101,25 @@ class EncoderLayer(nn.Module):
123
  def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
124
  super(EncoderLayer, self).__init__()
125
 
126
- # Multi-head attention
127
  self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
128
 
129
- # Position-wise feed forward
130
  self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
131
 
132
- # Layer normalization
133
  self.norm1 = nn.LayerNorm(d_model)
134
  self.norm2 = nn.LayerNorm(d_model)
135
 
136
- # Dropout
137
  self.dropout = nn.Dropout(dropout)
138
 
139
  def forward(self, x, mask=None):
140
- """
141
- Args:
142
- x: (batch_size, seq_len, d_model)
143
- mask: (batch_size, 1, 1, seq_len) or None
144
-
145
- Returns:
146
- output: (batch_size, seq_len, d_model)
147
- """
148
- # Self-attention with residual connection and layer norm
149
  attn_output, _ = self.self_attention(x, x, x, mask)
150
  x = self.norm1(x + self.dropout(attn_output))
151
 
152
- # Feed forward with residual connection and layer norm
153
  ff_output = self.feed_forward(x)
154
  x = self.norm2(x + self.dropout(ff_output))
155
 
@@ -161,25 +131,21 @@ class TransformerEncoder(nn.Module):
161
  def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
162
  super(TransformerEncoder, self).__init__()
163
 
 
164
  self.layers = nn.ModuleList([
165
  EncoderLayer(d_model, num_heads, d_ff, dropout)
166
  for _ in range(num_layers)
167
  ])
168
 
 
169
  self.norm = nn.LayerNorm(d_model)
170
 
171
  def forward(self, x, mask=None):
172
- """
173
- Args:
174
- x: (batch_size, seq_len, d_model)
175
- mask: (batch_size, 1, 1, seq_len) or None
176
-
177
- Returns:
178
- output: (batch_size, seq_len, d_model)
179
- """
180
  for layer in self.layers:
181
  x = layer(x, mask)
182
 
 
183
  return self.norm(x)
184
 
185
  class PositionalEncoding(nn.Module):
@@ -189,36 +155,29 @@ class PositionalEncoding(nn.Module):
189
  super(PositionalEncoding, self).__init__()
190
  self.dropout = nn.Dropout(dropout)
191
 
192
- # Create positional encoding matrix
193
  pe = torch.zeros(max_len, d_model)
194
  position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
195
 
196
- # Create div_term for sin/cos frequencies
197
  div_term = torch.exp(torch.arange(0, d_model, 2).float() *
198
  (-math.log(10000.0) / d_model))
199
 
200
- # Apply sin to even indices
201
  pe[:, 0::2] = torch.sin(position * div_term)
202
 
203
- # Apply cos to odd indices
204
  if d_model % 2 == 1:
205
  pe[:, 1::2] = torch.cos(position * div_term[:-1])
206
  else:
207
  pe[:, 1::2] = torch.cos(position * div_term)
208
 
209
- # Add batch dimension and register as buffer
210
  pe = pe.unsqueeze(0)
211
  self.register_buffer('pe', pe)
212
 
213
  def forward(self, x):
214
- """
215
- Args:
216
- x: (batch_size, seq_len, d_model)
217
-
218
- Returns:
219
- output: (batch_size, seq_len, d_model)
220
- """
221
- # Add positional encoding
222
  x = x + self.pe[:, :x.size(1), :]
223
  return self.dropout(x)
224
 
@@ -235,62 +194,46 @@ class TransformerPII(nn.Module):
235
  self.d_model = d_model
236
  self.pad_idx = pad_idx
237
 
238
- # Token embedding layer
239
  self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
240
 
241
- # Positional encoding
242
  self.positional_encoding = PositionalEncoding(d_model, max_len, dropout)
243
 
244
- # Transformer encoder stack
245
  self.encoder = TransformerEncoder(num_layers, d_model, num_heads, d_ff, dropout)
246
 
247
- # Classification head
248
  self.classifier = nn.Linear(d_model, num_classes)
249
 
250
- # Dropout
251
  self.dropout = nn.Dropout(dropout)
252
 
253
- # Initialize weights
254
  self._init_weights()
255
 
256
  def _init_weights(self):
257
  """Initialize model weights"""
258
- # Initialize embeddings
259
  nn.init.normal_(self.embedding.weight, mean=0, std=self.d_model**-0.5)
 
260
  if self.pad_idx is not None:
261
  nn.init.constant_(self.embedding.weight[self.pad_idx], 0)
262
 
263
- # Initialize classifier
264
  nn.init.xavier_uniform_(self.classifier.weight)
265
  if self.classifier.bias is not None:
266
  nn.init.constant_(self.classifier.bias, 0)
267
 
268
  def create_padding_mask(self, x):
269
- """
270
- Create padding mask for attention
271
-
272
- Args:
273
- x: (batch_size, seq_len) - input token indices
274
-
275
- Returns:
276
- mask: (batch_size, 1, 1, seq_len) - attention mask
277
- """
278
- # Create mask where padding tokens are marked as 0
279
  mask = (x != self.pad_idx).unsqueeze(1).unsqueeze(2)
280
  return mask.float()
281
 
282
  def forward(self, x, mask=None):
283
- """
284
- Forward pass for token classification
285
-
286
- Args:
287
- x: (batch_size, seq_len) - input token indices
288
- mask: Optional custom attention mask
289
-
290
- Returns:
291
- logits: (batch_size, seq_len, num_classes) - classification logits
292
- """
293
- # Check input dimensions
294
  if x.dim() != 2:
295
  raise ValueError(f"Expected input to have 2 dimensions [batch_size, seq_len], got {x.dim()}")
296
 
@@ -300,94 +243,35 @@ class TransformerPII(nn.Module):
300
  if mask is None:
301
  mask = self.create_padding_mask(x)
302
 
303
- # Embedding with scaling
304
  x = self.embedding(x) * math.sqrt(self.d_model)
305
 
306
  # Add positional encoding
307
  x = self.positional_encoding(x)
308
 
309
- # Pass through transformer encoder
310
  encoder_output = self.encoder(x, mask)
311
 
312
  # Apply dropout before classification
313
  encoder_output = self.dropout(encoder_output)
314
 
315
- # Classify each token
316
  logits = self.classifier(encoder_output)
317
 
318
  return logits
319
 
320
  def predict(self, x):
321
- """
322
- Get predictions for inference
323
-
324
- Args:
325
- x: (batch_size, seq_len) - input token indices
326
-
327
- Returns:
328
- predictions: (batch_size, seq_len) - predicted class indices
329
- """
330
  self.eval()
331
  with torch.no_grad():
332
  logits = self.forward(x)
333
  predictions = torch.argmax(logits, dim=-1)
334
  return predictions
335
 
336
- class TransformerPIIWithCRF(TransformerPII):
337
- """
338
- Transformer with CRF layer for improved sequence labeling
339
- (Optional enhancement - requires pytorch-crf)
340
- """
341
-
342
- def __init__(self, vocab_size, num_classes, d_model=256, num_heads=8,
343
- d_ff=512, num_layers=4, dropout=0.1, max_len=512, pad_idx=0):
344
- super(TransformerPIIWithCRF, self).__init__(
345
- vocab_size, num_classes, d_model, num_heads,
346
- d_ff, num_layers, dropout, max_len, pad_idx
347
- )
348
-
349
- # CRF layer would be initialized here
350
- # from torchcrf import CRF
351
- # self.crf = CRF(num_classes, batch_first=True)
352
-
353
- def forward(self, x, labels=None):
354
- """Forward pass with optional CRF"""
355
- # Get transformer outputs
356
- emissions = super().forward(x)
357
-
358
- if labels is not None:
359
- # Training mode with CRF
360
- # mask = (x != self.pad_idx)
361
- # loss = -self.crf(emissions, labels, mask=mask)
362
- # return loss
363
- pass
364
- else:
365
- # Inference mode with CRF
366
- # mask = (x != self.pad_idx)
367
- # predictions = self.crf.decode(emissions, mask=mask)
368
- # return predictions
369
- pass
370
-
371
- return emissions
372
-
373
  def create_transformer_pii_model(vocab_size, num_classes, d_model=256, num_heads=8,
374
  d_ff=512, num_layers=4, dropout=0.1, max_len=512):
375
- """
376
- Factory function to create transformer model for PII detection
377
-
378
- Args:
379
- vocab_size: Size of vocabulary
380
- num_classes: Number of PII classes (e.g., 20)
381
- d_model: Dimension of model (hidden size)
382
- num_heads: Number of attention heads
383
- d_ff: Dimension of feedforward network
384
- num_layers: Number of transformer layers
385
- dropout: Dropout rate
386
- max_len: Maximum sequence length
387
-
388
- Returns:
389
- TransformerPII model instance
390
- """
391
  model = TransformerPII(
392
  vocab_size=vocab_size,
393
  num_classes=num_classes,
@@ -397,7 +281,7 @@ def create_transformer_pii_model(vocab_size, num_classes, d_model=256, num_heads
397
  num_layers=num_layers,
398
  dropout=dropout,
399
  max_len=max_len,
400
- pad_idx=0 # Assuming 0 is padding index
401
  )
402
 
403
  return model
 
4
  import math
5
 
6
  def scaled_dot_product_attention(q, k, v, mask=None, dropout=None):
7
+ """Compute scaled dot-product attention."""
8
+ # Get dimension of keys for scaling
 
 
 
 
 
 
 
 
 
 
 
 
9
  d_k = q.size(-1)
10
 
11
+ # Compute attention scores using dot product
12
  scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
13
 
14
+ # Mask out padding positions if mask provided
15
  if mask is not None:
16
  scores = scores.masked_fill(mask == 0, float('-inf'))
17
 
18
+ # Convert scores to probabilities
19
  attention_weights = F.softmax(scores, dim=-1)
20
 
21
+ # Apply dropout to attention weights if specified
22
  if dropout is not None:
23
  attention_weights = dropout(attention_weights)
24
 
25
+ # Apply attention weights to values
26
  output = torch.matmul(attention_weights, v)
27
 
28
  return output, attention_weights
 
36
 
37
  self.d_model = d_model
38
  self.num_heads = num_heads
39
+ self.d_k = d_model // num_heads # Dimension per head
40
 
41
+ # Linear layers for projecting Q, K, V
42
  self.w_q = nn.Linear(d_model, d_model)
43
  self.w_k = nn.Linear(d_model, d_model)
44
  self.w_v = nn.Linear(d_model, d_model)
45
 
46
+ # Final output projection
47
  self.w_o = nn.Linear(d_model, d_model)
48
 
49
+ # Dropout layer
50
  self.dropout = nn.Dropout(dropout)
51
 
52
  def forward(self, query, key, value, mask=None):
53
  """
54
+ query: (batch_size, seq_len_q, d_model)
55
+ key: (batch_size, seq_len_k, d_model)
56
+ value: (batch_size, seq_len_v, d_model)
57
+ mask: (batch_size, 1, 1, seq_len_k) or None
 
 
 
 
 
58
  """
59
  batch_size = query.size(0)
60
  seq_len_q = query.size(1)
61
  seq_len_k = key.size(1)
62
  seq_len_v = value.size(1)
63
 
64
+ # Project and reshape for multiple heads
65
  Q = self.w_q(query).view(batch_size, seq_len_q, self.num_heads, self.d_k).transpose(1, 2)
66
  K = self.w_k(key).view(batch_size, seq_len_k, self.num_heads, self.d_k).transpose(1, 2)
67
  V = self.w_v(value).view(batch_size, seq_len_v, self.num_heads, self.d_k).transpose(1, 2)
68
 
69
+ # Apply scaled dot-product attention
70
  attention_output, attention_weights = scaled_dot_product_attention(
71
  Q, K, V, mask=mask, dropout=self.dropout
72
  )
73
 
74
+ # Concatenate heads and apply output projection
75
  attention_output = attention_output.transpose(1, 2).contiguous().view(
76
  batch_size, seq_len_q, self.d_model
77
  )
 
85
 
86
  def __init__(self, d_model, d_ff, dropout=0.1):
87
  super(PositionwiseFeedForward, self).__init__()
88
+ # Two linear layers with ReLU activation
89
  self.w_1 = nn.Linear(d_model, d_ff)
90
  self.w_2 = nn.Linear(d_ff, d_model)
91
  self.dropout = nn.Dropout(dropout)
92
  self.activation = nn.ReLU()
93
 
94
  def forward(self, x):
95
+ # Apply first linear layer, activation, dropout, then second linear layer
 
 
 
 
 
 
96
  return self.w_2(self.dropout(self.activation(self.w_1(x))))
97
 
98
  class EncoderLayer(nn.Module):
 
101
  def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
102
  super(EncoderLayer, self).__init__()
103
 
104
+ # Multi-head self-attention sublayer
105
  self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
106
 
107
+ # Position-wise feed forward sublayer
108
  self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
109
 
110
+ # Layer normalization for each sublayer
111
  self.norm1 = nn.LayerNorm(d_model)
112
  self.norm2 = nn.LayerNorm(d_model)
113
 
114
+ # Dropout for residual connections
115
  self.dropout = nn.Dropout(dropout)
116
 
117
  def forward(self, x, mask=None):
118
+ # Self-attention sublayer with residual connection and layer norm
 
 
 
 
 
 
 
 
119
  attn_output, _ = self.self_attention(x, x, x, mask)
120
  x = self.norm1(x + self.dropout(attn_output))
121
 
122
+ # Feed forward sublayer with residual connection and layer norm
123
  ff_output = self.feed_forward(x)
124
  x = self.norm2(x + self.dropout(ff_output))
125
 
 
131
  def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
132
  super(TransformerEncoder, self).__init__()
133
 
134
+ # Create stack of encoder layers
135
  self.layers = nn.ModuleList([
136
  EncoderLayer(d_model, num_heads, d_ff, dropout)
137
  for _ in range(num_layers)
138
  ])
139
 
140
+ # Final layer normalization
141
  self.norm = nn.LayerNorm(d_model)
142
 
143
  def forward(self, x, mask=None):
144
+ # Pass through each encoder layer sequentially
 
 
 
 
 
 
 
145
  for layer in self.layers:
146
  x = layer(x, mask)
147
 
148
+ # Apply final normalization
149
  return self.norm(x)
150
 
151
  class PositionalEncoding(nn.Module):
 
155
  super(PositionalEncoding, self).__init__()
156
  self.dropout = nn.Dropout(dropout)
157
 
158
+ # Create matrix to hold positional encodings
159
  pe = torch.zeros(max_len, d_model)
160
  position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
161
 
162
+ # Create frequency terms for sin/cos functions
163
  div_term = torch.exp(torch.arange(0, d_model, 2).float() *
164
  (-math.log(10000.0) / d_model))
165
 
166
+ # Apply sine to even indices
167
  pe[:, 0::2] = torch.sin(position * div_term)
168
 
169
+ # Apply cosine to odd indices
170
  if d_model % 2 == 1:
171
  pe[:, 1::2] = torch.cos(position * div_term[:-1])
172
  else:
173
  pe[:, 1::2] = torch.cos(position * div_term)
174
 
175
+ # Add batch dimension and save as buffer
176
  pe = pe.unsqueeze(0)
177
  self.register_buffer('pe', pe)
178
 
179
  def forward(self, x):
180
+ # Add positional encoding to input embeddings
 
 
 
 
 
 
 
181
  x = x + self.pe[:, :x.size(1), :]
182
  return self.dropout(x)
183
 
 
194
  self.d_model = d_model
195
  self.pad_idx = pad_idx
196
 
197
+ # Embedding layer for input tokens
198
  self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
199
 
200
+ # Add positional information to embeddings
201
  self.positional_encoding = PositionalEncoding(d_model, max_len, dropout)
202
 
203
+ # Stack of transformer encoder layers
204
  self.encoder = TransformerEncoder(num_layers, d_model, num_heads, d_ff, dropout)
205
 
206
+ # Classification head for token-level predictions
207
  self.classifier = nn.Linear(d_model, num_classes)
208
 
209
+ # Dropout layer
210
  self.dropout = nn.Dropout(dropout)
211
 
212
+ # Initialize model weights
213
  self._init_weights()
214
 
215
  def _init_weights(self):
216
  """Initialize model weights"""
217
+ # Initialize embeddings with normal distribution
218
  nn.init.normal_(self.embedding.weight, mean=0, std=self.d_model**-0.5)
219
+ # Set padding token embedding to zero
220
  if self.pad_idx is not None:
221
  nn.init.constant_(self.embedding.weight[self.pad_idx], 0)
222
 
223
+ # Initialize classifier with Xavier uniform
224
  nn.init.xavier_uniform_(self.classifier.weight)
225
  if self.classifier.bias is not None:
226
  nn.init.constant_(self.classifier.bias, 0)
227
 
228
  def create_padding_mask(self, x):
229
+ """Create padding mask for attention"""
230
+ # Create mask where non-padding tokens are marked as 1
 
 
 
 
 
 
 
 
231
  mask = (x != self.pad_idx).unsqueeze(1).unsqueeze(2)
232
  return mask.float()
233
 
234
  def forward(self, x, mask=None):
235
+ """Forward pass for token classification"""
236
+ # Validate input dimensions
 
 
 
 
 
 
 
 
 
237
  if x.dim() != 2:
238
  raise ValueError(f"Expected input to have 2 dimensions [batch_size, seq_len], got {x.dim()}")
239
 
 
243
  if mask is None:
244
  mask = self.create_padding_mask(x)
245
 
246
+ # Embed and scale by sqrt(d_model)
247
  x = self.embedding(x) * math.sqrt(self.d_model)
248
 
249
  # Add positional encoding
250
  x = self.positional_encoding(x)
251
 
252
+ # Pass through transformer encoder stack
253
  encoder_output = self.encoder(x, mask)
254
 
255
  # Apply dropout before classification
256
  encoder_output = self.dropout(encoder_output)
257
 
258
+ # Get class predictions for each token
259
  logits = self.classifier(encoder_output)
260
 
261
  return logits
262
 
263
  def predict(self, x):
264
+ """Get predictions for inference"""
265
+ # Switch to evaluation mode
 
 
 
 
 
 
 
266
  self.eval()
267
  with torch.no_grad():
268
  logits = self.forward(x)
269
  predictions = torch.argmax(logits, dim=-1)
270
  return predictions
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  def create_transformer_pii_model(vocab_size, num_classes, d_model=256, num_heads=8,
273
  d_ff=512, num_layers=4, dropout=0.1, max_len=512):
274
+ """Factory function to create transformer model for PII detection"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  model = TransformerPII(
276
  vocab_size=vocab_size,
277
  num_classes=num_classes,
 
281
  num_layers=num_layers,
282
  dropout=dropout,
283
  max_len=max_len,
284
+ pad_idx=0
285
  )
286
 
287
  return model
transformer_training.ipynb CHANGED
@@ -42,7 +42,7 @@
42
  },
43
  {
44
  "cell_type": "code",
45
- "execution_count": 4,
46
  "id": "ff1782dd",
47
  "metadata": {
48
  "execution": {
@@ -62,19 +62,23 @@
62
  },
63
  "outputs": [],
64
  "source": [
 
65
  "class Vocabulary:\n",
66
  " \"\"\"Vocabulary class for encoding/decoding text and labels\"\"\"\n",
67
  " def __init__(self, max_size=100000):\n",
 
68
  " self.word2idx = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}\n",
69
  " self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<start>', 3: '<end>'}\n",
70
  " self.word_count = Counter()\n",
71
  " self.max_size = max_size\n",
72
  " \n",
73
  " def add_sentence(self, sentence):\n",
 
74
  " for word in sentence:\n",
75
  " self.word_count[word.lower()] += 1\n",
76
  " \n",
77
  " def build(self):\n",
 
78
  " most_common = self.word_count.most_common(self.max_size - len(self.word2idx))\n",
79
  " for word, _ in most_common:\n",
80
  " if word not in self.word2idx:\n",
@@ -86,15 +90,17 @@
86
  " return len(self.word2idx)\n",
87
  " \n",
88
  " def encode(self, sentence):\n",
 
89
  " return [self.word2idx.get(word.lower(), self.word2idx['<unk>']) for word in sentence]\n",
90
  " \n",
91
  " def decode(self, indices):\n",
 
92
  " return [self.idx2word.get(idx, '<unk>') for idx in indices]"
93
  ]
94
  },
95
  {
96
  "cell_type": "code",
97
- "execution_count": 5,
98
  "id": "5b2b46d6",
99
  "metadata": {
100
  "execution": {
@@ -114,6 +120,7 @@
114
  },
115
  "outputs": [],
116
  "source": [
 
117
  "class PIIDataset(Dataset):\n",
118
  " \"\"\"PyTorch Dataset for PII detection\"\"\"\n",
119
  " def __init__(self, tokens, labels, text_vocab, label_vocab, max_len=512):\n",
@@ -127,16 +134,16 @@
127
  " return len(self.tokens)\n",
128
  " \n",
129
  " def __getitem__(self, idx):\n",
130
- " # Add start and end tokens\n",
131
  " tokens = ['<start>'] + self.tokens[idx] + ['<end>']\n",
132
  " labels = ['<start>'] + self.labels[idx] + ['<end>']\n",
133
  " \n",
134
- " # Truncate if too long\n",
135
  " if len(tokens) > self.max_len:\n",
136
  " tokens = tokens[:self.max_len-1] + ['<end>']\n",
137
  " labels = labels[:self.max_len-1] + ['<end>']\n",
138
  " \n",
139
- " # Encode\n",
140
  " token_ids = self.text_vocab.encode(tokens)\n",
141
  " label_ids = self.label_vocab.encode(labels)\n",
142
  " \n",
@@ -145,7 +152,7 @@
145
  },
146
  {
147
  "cell_type": "code",
148
- "execution_count": 6,
149
  "id": "e7ca8f8f",
150
  "metadata": {
151
  "execution": {
@@ -167,7 +174,9 @@
167
  "source": [
168
  "def collate_fn(batch):\n",
169
  " \"\"\"Custom collate function for padding sequences\"\"\"\n",
 
170
  " tokens, labels = zip(*batch)\n",
 
171
  " tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=0)\n",
172
  " labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)\n",
173
  " return tokens_padded, labels_padded"
@@ -175,7 +184,7 @@
175
  },
176
  {
177
  "cell_type": "code",
178
- "execution_count": 7,
179
  "id": "85b32e21",
180
  "metadata": {
181
  "execution": {
@@ -195,6 +204,7 @@
195
  },
196
  "outputs": [],
197
  "source": [
 
198
  "class F1ScoreMetric:\n",
199
  " \"\"\"Custom F1 score metric with beta parameter\"\"\"\n",
200
  " def __init__(self, beta=5, num_classes=20, ignore_index=0, label_vocab=None):\n",
@@ -205,30 +215,37 @@
205
  " self.reset()\n",
206
  " \n",
207
  " def reset(self):\n",
 
208
  " self.true_positives = 0\n",
209
  " self.false_positives = 0\n",
210
  " self.false_negatives = 0\n",
211
  " self.class_metrics = {}\n",
212
  " \n",
213
  " def update(self, predictions, targets):\n",
 
214
  " mask = (targets != self.ignore_index) & (targets != 2) & (targets != 3)\n",
215
  " o_idx = self.label_vocab.word2idx.get('o', -1) if self.label_vocab else -1\n",
216
  " \n",
 
217
  " for class_id in range(1, self.num_classes):\n",
218
  " if class_id == o_idx:\n",
219
  " continue\n",
220
  " \n",
 
221
  " pred_mask = (predictions == class_id) & mask\n",
222
  " true_mask = (targets == class_id) & mask\n",
223
  " \n",
 
224
  " tp = ((pred_mask) & (true_mask)).sum().item()\n",
225
  " fp = ((pred_mask) & (~true_mask)).sum().item()\n",
226
  " fn = ((~pred_mask) & (true_mask)).sum().item()\n",
227
  " \n",
 
228
  " self.true_positives += tp\n",
229
  " self.false_positives += fp\n",
230
  " self.false_negatives += fn\n",
231
  " \n",
 
232
  " if class_id not in self.class_metrics:\n",
233
  " self.class_metrics[class_id] = {'tp': 0, 'fp': 0, 'fn': 0}\n",
234
  " self.class_metrics[class_id]['tp'] += tp\n",
@@ -236,6 +253,7 @@
236
  " self.class_metrics[class_id]['fn'] += fn\n",
237
  " \n",
238
  " def compute(self):\n",
 
239
  " beta_squared = self.beta ** 2\n",
240
  " precision = self.true_positives / (self.true_positives + self.false_positives + 1e-8)\n",
241
  " recall = self.true_positives / (self.true_positives + self.false_negatives + 1e-8)\n",
@@ -243,6 +261,7 @@
243
  " return f1\n",
244
  " \n",
245
  " def get_class_metrics(self):\n",
 
246
  " results = {}\n",
247
  " for class_id, metrics in self.class_metrics.items():\n",
248
  " if self.label_vocab and class_id in self.label_vocab.idx2word:\n",
@@ -261,7 +280,7 @@
261
  },
262
  {
263
  "cell_type": "code",
264
- "execution_count": 8,
265
  "id": "60cf16eb",
266
  "metadata": {
267
  "execution": {
@@ -281,6 +300,7 @@
281
  },
282
  "outputs": [],
283
  "source": [
 
284
  "class FocalLoss(nn.Module):\n",
285
  " \"\"\"Focal Loss for addressing class imbalance\"\"\"\n",
286
  " def __init__(self, alpha=None, gamma=2.0, reduction='mean', ignore_index=-100):\n",
@@ -291,6 +311,7 @@
291
  " self.ignore_index = ignore_index\n",
292
  " \n",
293
  " def forward(self, inputs, targets):\n",
 
294
  " ce_loss = nn.functional.cross_entropy(\n",
295
  " inputs, targets, \n",
296
  " weight=self.alpha, \n",
@@ -298,9 +319,11 @@
298
  " ignore_index=self.ignore_index\n",
299
  " )\n",
300
  " \n",
 
301
  " pt = torch.exp(-ce_loss)\n",
302
  " focal_loss = (1 - pt) ** self.gamma * ce_loss\n",
303
  " \n",
 
304
  " if self.reduction == 'mean':\n",
305
  " return focal_loss.mean()\n",
306
  " elif self.reduction == 'sum':\n",
@@ -311,7 +334,7 @@
311
  },
312
  {
313
  "cell_type": "code",
314
- "execution_count": 9,
315
  "id": "4e56747c",
316
  "metadata": {
317
  "execution": {
@@ -337,8 +360,10 @@
337
  " total_loss = 0\n",
338
  " f1_metric.reset()\n",
339
  " \n",
 
340
  " progress_bar = tqdm(dataloader, desc='Training')\n",
341
  " for batch_idx, (tokens, labels) in enumerate(progress_bar):\n",
 
342
  " tokens = tokens.to(device)\n",
343
  " labels = labels.to(device)\n",
344
  " \n",
@@ -350,9 +375,10 @@
350
  " outputs_flat = outputs.view(-1, outputs.size(-1))\n",
351
  " labels_flat = labels.view(-1)\n",
352
  " \n",
353
- " # Calculate loss and backward pass\n",
354
  " loss = criterion(outputs_flat, labels_flat)\n",
355
  " loss.backward()\n",
 
356
  " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)\n",
357
  " optimizer.step()\n",
358
  " \n",
@@ -372,7 +398,7 @@
372
  },
373
  {
374
  "cell_type": "code",
375
- "execution_count": 10,
376
  "id": "8a2e8d19",
377
  "metadata": {
378
  "execution": {
@@ -398,8 +424,10 @@
398
  " total_loss = 0\n",
399
  " f1_metric.reset()\n",
400
  " \n",
 
401
  " with torch.no_grad():\n",
402
  " for tokens, labels in tqdm(dataloader, desc='Evaluating'):\n",
 
403
  " tokens = tokens.to(device)\n",
404
  " labels = labels.to(device)\n",
405
  " \n",
@@ -421,7 +449,7 @@
421
  },
422
  {
423
  "cell_type": "code",
424
- "execution_count": 11,
425
  "id": "6e292ace",
426
  "metadata": {
427
  "execution": {
@@ -445,6 +473,7 @@
445
  " \"\"\"Create a weighted sampler to balance classes during training\"\"\"\n",
446
  " sample_weights = []\n",
447
  " \n",
 
448
  " for idx in range(len(dataset)):\n",
449
  " _, labels = dataset[idx]\n",
450
  " \n",
@@ -453,24 +482,26 @@
453
  " for label_id in labels:\n",
454
  " if label_id > 3: # Skip special tokens\n",
455
  " label_name = label_vocab.idx2word.get(label_id.item(), 'O')\n",
 
456
  " if label_name != 'o' and 'B-' in label_name:\n",
457
  " min_weight = 10.0\n",
458
  " break\n",
459
  " \n",
460
  " sample_weights.append(min_weight)\n",
461
  " \n",
 
462
  " sampler = WeightedRandomSampler(\n",
463
  " weights=sample_weights,\n",
464
  " num_samples=len(sample_weights),\n",
465
  " replacement=True\n",
466
  " )\n",
467
  " \n",
468
- " return sampler\n"
469
  ]
470
  },
471
  {
472
  "cell_type": "code",
473
- "execution_count": 12,
474
  "id": "857335cb",
475
  "metadata": {
476
  "execution": {
@@ -493,11 +524,14 @@
493
  "def print_label_distribution(data, title=\"Label Distribution\"):\n",
494
  " \"\"\"Print label distribution statistics\"\"\"\n",
495
  " label_counts = Counter()\n",
 
 
496
  " for label_seq in data.labels:\n",
497
  " for label in label_seq:\n",
498
  " if label not in ['<pad>', '<start>', '<end>']:\n",
499
  " label_counts[label] += 1\n",
500
  " \n",
 
501
  " print(f\"\\n{title}:\")\n",
502
  " print(\"-\" * 50)\n",
503
  " total = sum(label_counts.values())\n",
@@ -510,7 +544,7 @@
510
  },
511
  {
512
  "cell_type": "code",
513
- "execution_count": 13,
514
  "id": "1738f8a9",
515
  "metadata": {
516
  "execution": {
@@ -532,9 +566,10 @@
532
  "source": [
533
  "def save_model(model, text_vocab, label_vocab, config, save_dir):\n",
534
  " \"\"\"Save model and all necessary components for Flask deployment\"\"\"\n",
 
535
  " os.makedirs(save_dir, exist_ok=True)\n",
536
  " \n",
537
- " # Save model state\n",
538
  " model_path = os.path.join(save_dir, 'pii_transformer_model.pt')\n",
539
  " torch.save(model.state_dict(), model_path)\n",
540
  " \n",
@@ -560,7 +595,7 @@
560
  },
561
  {
562
  "cell_type": "code",
563
- "execution_count": 14,
564
  "id": "d93e7c25",
565
  "metadata": {
566
  "execution": {
@@ -591,12 +626,12 @@
591
  "):\n",
592
  " \"\"\"Main training function\"\"\"\n",
593
  " \n",
594
- " # Load data\n",
595
  " print(\"Loading augmented data...\")\n",
596
  " data = pd.read_json(data_path, lines=True)\n",
597
  " print(f\"Total samples: {len(data)}\")\n",
598
  " \n",
599
- " # Print initial label distribution\n",
600
  " print_label_distribution(data, \"Label Distribution in Augmented Data\")\n",
601
  " \n",
602
  " # Build vocabularies\n",
@@ -604,19 +639,21 @@
604
  " text_vocab = Vocabulary(max_size=100000)\n",
605
  " label_vocab = Vocabulary(max_size=50)\n",
606
  " \n",
 
607
  " for tokens in data.tokens:\n",
608
  " text_vocab.add_sentence(tokens)\n",
609
  " for labels in data.labels:\n",
610
  " label_vocab.add_sentence(labels)\n",
611
  " \n",
 
612
  " text_vocab.build()\n",
613
  " label_vocab.build()\n",
614
  " \n",
615
- " # Calculate class weights\n",
616
  " class_weights = calculate_class_weights(data, label_vocab)\n",
617
  " class_weights = class_weights.to(device)\n",
618
  " \n",
619
- " # Split data\n",
620
  " X_train, X_val, y_train, y_val = train_test_split(\n",
621
  " data.tokens.tolist(),\n",
622
  " data.labels.tolist(),\n",
@@ -628,13 +665,15 @@
628
  " print(f\" - Train samples: {len(X_train):,}\")\n",
629
  " print(f\" - Validation samples: {len(X_val):,}\")\n",
630
  " \n",
631
- " # Create datasets and dataloaders\n",
632
  " max_seq_len = 512\n",
633
  " train_dataset = PIIDataset(X_train, y_train, text_vocab, label_vocab, max_len=max_seq_len)\n",
634
  " val_dataset = PIIDataset(X_val, y_val, text_vocab, label_vocab, max_len=max_seq_len)\n",
635
  " \n",
 
636
  " train_sampler = create_balanced_sampler(train_dataset, label_vocab)\n",
637
  " \n",
 
638
  " train_loader = DataLoader(\n",
639
  " train_dataset, \n",
640
  " batch_size=batch_size,\n",
@@ -663,7 +702,7 @@
663
  " 'max_len': max_seq_len\n",
664
  " }\n",
665
  " \n",
666
- " # Create model\n",
667
  " print(\"\\nCreating model...\")\n",
668
  " model = create_transformer_pii_model(**model_config).to(device)\n",
669
  " print(f\"Model parameters: {sum(p.numel() for p in model.parameters()):,}\")\n",
@@ -678,15 +717,15 @@
678
  " else:\n",
679
  " criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=0)\n",
680
  " \n",
681
- " # Setup optimizer and scheduler\n",
682
  " optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)\n",
683
  " scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)\n",
684
  " \n",
685
- " # Metrics\n",
686
  " f1_metric_train = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
687
  " f1_metric_val = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
688
  " \n",
689
- " # Training loop\n",
690
  " train_losses, train_f1s, val_losses, val_f1s = [], [], [], []\n",
691
  " best_val_f1 = 0\n",
692
  " patience = 5\n",
@@ -695,18 +734,21 @@
695
  " print(\"\\nStarting training...\")\n",
696
  " print(\"=\" * 60)\n",
697
  " \n",
 
698
  " for epoch in range(num_epochs):\n",
699
  " print(f\"\\nEpoch {epoch+1}/{num_epochs}\")\n",
700
  " \n",
701
- " # Train and validate\n",
702
  " train_loss, train_f1 = train_epoch(\n",
703
  " model, train_loader, optimizer, criterion, device, f1_metric_train\n",
704
  " )\n",
 
 
705
  " val_loss, val_f1 = evaluate(\n",
706
  " model, val_loader, criterion, device, f1_metric_val\n",
707
  " )\n",
708
  " \n",
709
- " # Step scheduler\n",
710
  " scheduler.step(val_loss)\n",
711
  " \n",
712
  " # Store metrics\n",
@@ -743,7 +785,7 @@
743
  " else:\n",
744
  " patience_counter += 1\n",
745
  " \n",
746
- " # Early stopping\n",
747
  " if patience_counter >= patience and epoch > 10:\n",
748
  " print(f\"\\nEarly stopping triggered after {patience} epochs without improvement\")\n",
749
  " break\n",
@@ -751,6 +793,7 @@
751
  " # Plot training curves\n",
752
  " plt.figure(figsize=(12, 5))\n",
753
  " \n",
 
754
  " plt.subplot(1, 2, 1)\n",
755
  " plt.plot(train_losses, label='Train Loss', linewidth=2)\n",
756
  " plt.plot(val_losses, label='Val Loss', linewidth=2)\n",
@@ -760,6 +803,7 @@
760
  " plt.legend()\n",
761
  " plt.grid(True, alpha=0.3)\n",
762
  " \n",
 
763
  " plt.subplot(1, 2, 2)\n",
764
  " plt.plot(train_f1s, label='Train F1', linewidth=2)\n",
765
  " plt.plot(val_f1s, label='Val F1', linewidth=2)\n",
@@ -777,6 +821,7 @@
777
  " print(f\"Training completed!\")\n",
778
  " print(f\"Best validation F1: {best_val_f1:.4f}\")\n",
779
  " \n",
 
780
  " save_model(model, text_vocab, label_vocab, model_config, 'saved_transformer_model')\n",
781
  " \n",
782
  " return model, text_vocab, label_vocab"
@@ -1251,9 +1296,11 @@
1251
  }
1252
  ],
1253
  "source": [
 
1254
  "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
1255
  "print(f\"Using device: {device}\")\n",
1256
  "\n",
 
1257
  "model, text_vocab, label_vocab = train_transformer_pii_model(\n",
1258
  " data_path='train_augmented.json',\n",
1259
  " num_epochs=20,\n",
 
42
  },
43
  {
44
  "cell_type": "code",
45
+ "execution_count": null,
46
  "id": "ff1782dd",
47
  "metadata": {
48
  "execution": {
 
62
  },
63
  "outputs": [],
64
  "source": [
65
+ "# Vocabulary class for text and label encoding\n",
66
  "class Vocabulary:\n",
67
  " \"\"\"Vocabulary class for encoding/decoding text and labels\"\"\"\n",
68
  " def __init__(self, max_size=100000):\n",
69
+ " # Initialize special tokens\n",
70
  " self.word2idx = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}\n",
71
  " self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<start>', 3: '<end>'}\n",
72
  " self.word_count = Counter()\n",
73
  " self.max_size = max_size\n",
74
  " \n",
75
  " def add_sentence(self, sentence):\n",
76
+ " # Count word frequencies\n",
77
  " for word in sentence:\n",
78
  " self.word_count[word.lower()] += 1\n",
79
  " \n",
80
  " def build(self):\n",
81
+ " # Build vocabulary from most common words\n",
82
  " most_common = self.word_count.most_common(self.max_size - len(self.word2idx))\n",
83
  " for word, _ in most_common:\n",
84
  " if word not in self.word2idx:\n",
 
90
  " return len(self.word2idx)\n",
91
  " \n",
92
  " def encode(self, sentence):\n",
93
+ " # Convert words to indices\n",
94
  " return [self.word2idx.get(word.lower(), self.word2idx['<unk>']) for word in sentence]\n",
95
  " \n",
96
  " def decode(self, indices):\n",
97
+ " # Convert indices back to words\n",
98
  " return [self.idx2word.get(idx, '<unk>') for idx in indices]"
99
  ]
100
  },
101
  {
102
  "cell_type": "code",
103
+ "execution_count": null,
104
  "id": "5b2b46d6",
105
  "metadata": {
106
  "execution": {
 
120
  },
121
  "outputs": [],
122
  "source": [
123
+ "# Dataset class for PII detection\n",
124
  "class PIIDataset(Dataset):\n",
125
  " \"\"\"PyTorch Dataset for PII detection\"\"\"\n",
126
  " def __init__(self, tokens, labels, text_vocab, label_vocab, max_len=512):\n",
 
134
  " return len(self.tokens)\n",
135
  " \n",
136
  " def __getitem__(self, idx):\n",
137
+ " # Add special tokens at beginning and end\n",
138
  " tokens = ['<start>'] + self.tokens[idx] + ['<end>']\n",
139
  " labels = ['<start>'] + self.labels[idx] + ['<end>']\n",
140
  " \n",
141
+ " # Truncate if sequence is too long\n",
142
  " if len(tokens) > self.max_len:\n",
143
  " tokens = tokens[:self.max_len-1] + ['<end>']\n",
144
  " labels = labels[:self.max_len-1] + ['<end>']\n",
145
  " \n",
146
+ " # Convert to indices\n",
147
  " token_ids = self.text_vocab.encode(tokens)\n",
148
  " label_ids = self.label_vocab.encode(labels)\n",
149
  " \n",
 
152
  },
153
  {
154
  "cell_type": "code",
155
+ "execution_count": null,
156
  "id": "e7ca8f8f",
157
  "metadata": {
158
  "execution": {
 
174
  "source": [
175
  "def collate_fn(batch):\n",
176
  " \"\"\"Custom collate function for padding sequences\"\"\"\n",
177
+ " # Separate tokens and labels\n",
178
  " tokens, labels = zip(*batch)\n",
179
+ " # Pad sequences to same length in batch\n",
180
  " tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=0)\n",
181
  " labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)\n",
182
  " return tokens_padded, labels_padded"
 
184
  },
185
  {
186
  "cell_type": "code",
187
+ "execution_count": null,
188
  "id": "85b32e21",
189
  "metadata": {
190
  "execution": {
 
204
  },
205
  "outputs": [],
206
  "source": [
207
+ "# F1 score metric for evaluation\n",
208
  "class F1ScoreMetric:\n",
209
  " \"\"\"Custom F1 score metric with beta parameter\"\"\"\n",
210
  " def __init__(self, beta=5, num_classes=20, ignore_index=0, label_vocab=None):\n",
 
215
  " self.reset()\n",
216
  " \n",
217
  " def reset(self):\n",
218
+ " # Reset all counters\n",
219
  " self.true_positives = 0\n",
220
  " self.false_positives = 0\n",
221
  " self.false_negatives = 0\n",
222
  " self.class_metrics = {}\n",
223
  " \n",
224
  " def update(self, predictions, targets):\n",
225
+ " # Create mask to ignore padding and special tokens\n",
226
  " mask = (targets != self.ignore_index) & (targets != 2) & (targets != 3)\n",
227
  " o_idx = self.label_vocab.word2idx.get('o', -1) if self.label_vocab else -1\n",
228
  " \n",
229
+ " # Calculate metrics for each PII class\n",
230
  " for class_id in range(1, self.num_classes):\n",
231
  " if class_id == o_idx:\n",
232
  " continue\n",
233
  " \n",
234
+ " # Find where predictions and targets match this class\n",
235
  " pred_mask = (predictions == class_id) & mask\n",
236
  " true_mask = (targets == class_id) & mask\n",
237
  " \n",
238
+ " # Count true positives, false positives, false negatives\n",
239
  " tp = ((pred_mask) & (true_mask)).sum().item()\n",
240
  " fp = ((pred_mask) & (~true_mask)).sum().item()\n",
241
  " fn = ((~pred_mask) & (true_mask)).sum().item()\n",
242
  " \n",
243
+ " # Update total counts\n",
244
  " self.true_positives += tp\n",
245
  " self.false_positives += fp\n",
246
  " self.false_negatives += fn\n",
247
  " \n",
248
+ " # Store per-class metrics\n",
249
  " if class_id not in self.class_metrics:\n",
250
  " self.class_metrics[class_id] = {'tp': 0, 'fp': 0, 'fn': 0}\n",
251
  " self.class_metrics[class_id]['tp'] += tp\n",
 
253
  " self.class_metrics[class_id]['fn'] += fn\n",
254
  " \n",
255
  " def compute(self):\n",
256
+ " # Calculate F-beta score\n",
257
  " beta_squared = self.beta ** 2\n",
258
  " precision = self.true_positives / (self.true_positives + self.false_positives + 1e-8)\n",
259
  " recall = self.true_positives / (self.true_positives + self.false_negatives + 1e-8)\n",
 
261
  " return f1\n",
262
  " \n",
263
  " def get_class_metrics(self):\n",
264
+ " # Get detailed metrics for each class\n",
265
  " results = {}\n",
266
  " for class_id, metrics in self.class_metrics.items():\n",
267
  " if self.label_vocab and class_id in self.label_vocab.idx2word:\n",
 
280
  },
281
  {
282
  "cell_type": "code",
283
+ "execution_count": null,
284
  "id": "60cf16eb",
285
  "metadata": {
286
  "execution": {
 
300
  },
301
  "outputs": [],
302
  "source": [
303
+ "# Focal loss for handling class imbalance\n",
304
  "class FocalLoss(nn.Module):\n",
305
  " \"\"\"Focal Loss for addressing class imbalance\"\"\"\n",
306
  " def __init__(self, alpha=None, gamma=2.0, reduction='mean', ignore_index=-100):\n",
 
311
  " self.ignore_index = ignore_index\n",
312
  " \n",
313
  " def forward(self, inputs, targets):\n",
314
+ " # Calculate cross entropy loss\n",
315
  " ce_loss = nn.functional.cross_entropy(\n",
316
  " inputs, targets, \n",
317
  " weight=self.alpha, \n",
 
319
  " ignore_index=self.ignore_index\n",
320
  " )\n",
321
  " \n",
322
+ " # Apply focal term to focus on hard examples\n",
323
  " pt = torch.exp(-ce_loss)\n",
324
  " focal_loss = (1 - pt) ** self.gamma * ce_loss\n",
325
  " \n",
326
+ " # Reduce loss based on specified method\n",
327
  " if self.reduction == 'mean':\n",
328
  " return focal_loss.mean()\n",
329
  " elif self.reduction == 'sum':\n",
 
334
  },
335
  {
336
  "cell_type": "code",
337
+ "execution_count": null,
338
  "id": "4e56747c",
339
  "metadata": {
340
  "execution": {
 
360
  " total_loss = 0\n",
361
  " f1_metric.reset()\n",
362
  " \n",
363
+ " # Progress bar for training\n",
364
  " progress_bar = tqdm(dataloader, desc='Training')\n",
365
  " for batch_idx, (tokens, labels) in enumerate(progress_bar):\n",
366
+ " # Move data to device\n",
367
  " tokens = tokens.to(device)\n",
368
  " labels = labels.to(device)\n",
369
  " \n",
 
375
  " outputs_flat = outputs.view(-1, outputs.size(-1))\n",
376
  " labels_flat = labels.view(-1)\n",
377
  " \n",
378
+ " # Calculate loss and backpropagate\n",
379
  " loss = criterion(outputs_flat, labels_flat)\n",
380
  " loss.backward()\n",
381
+ " # Clip gradients to prevent exploding gradients\n",
382
  " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)\n",
383
  " optimizer.step()\n",
384
  " \n",
 
398
  },
399
  {
400
  "cell_type": "code",
401
+ "execution_count": null,
402
  "id": "8a2e8d19",
403
  "metadata": {
404
  "execution": {
 
424
  " total_loss = 0\n",
425
  " f1_metric.reset()\n",
426
  " \n",
427
+ " # No gradient computation during evaluation\n",
428
  " with torch.no_grad():\n",
429
  " for tokens, labels in tqdm(dataloader, desc='Evaluating'):\n",
430
+ " # Move data to device\n",
431
  " tokens = tokens.to(device)\n",
432
  " labels = labels.to(device)\n",
433
  " \n",
 
449
  },
450
  {
451
  "cell_type": "code",
452
+ "execution_count": null,
453
  "id": "6e292ace",
454
  "metadata": {
455
  "execution": {
 
473
  " \"\"\"Create a weighted sampler to balance classes during training\"\"\"\n",
474
  " sample_weights = []\n",
475
  " \n",
476
+ " # Calculate weight for each sample\n",
477
  " for idx in range(len(dataset)):\n",
478
  " _, labels = dataset[idx]\n",
479
  " \n",
 
482
  " for label_id in labels:\n",
483
  " if label_id > 3: # Skip special tokens\n",
484
  " label_name = label_vocab.idx2word.get(label_id.item(), 'O')\n",
485
+ " # If sample contains PII, give it higher weight\n",
486
  " if label_name != 'o' and 'B-' in label_name:\n",
487
  " min_weight = 10.0\n",
488
  " break\n",
489
  " \n",
490
  " sample_weights.append(min_weight)\n",
491
  " \n",
492
+ " # Create weighted sampler\n",
493
  " sampler = WeightedRandomSampler(\n",
494
  " weights=sample_weights,\n",
495
  " num_samples=len(sample_weights),\n",
496
  " replacement=True\n",
497
  " )\n",
498
  " \n",
499
+ " return sampler"
500
  ]
501
  },
502
  {
503
  "cell_type": "code",
504
+ "execution_count": null,
505
  "id": "857335cb",
506
  "metadata": {
507
  "execution": {
 
524
  "def print_label_distribution(data, title=\"Label Distribution\"):\n",
525
  " \"\"\"Print label distribution statistics\"\"\"\n",
526
  " label_counts = Counter()\n",
527
+ " \n",
528
+ " # Count each label type\n",
529
  " for label_seq in data.labels:\n",
530
  " for label in label_seq:\n",
531
  " if label not in ['<pad>', '<start>', '<end>']:\n",
532
  " label_counts[label] += 1\n",
533
  " \n",
534
+ " # Print distribution\n",
535
  " print(f\"\\n{title}:\")\n",
536
  " print(\"-\" * 50)\n",
537
  " total = sum(label_counts.values())\n",
 
544
  },
545
  {
546
  "cell_type": "code",
547
+ "execution_count": null,
548
  "id": "1738f8a9",
549
  "metadata": {
550
  "execution": {
 
566
  "source": [
567
  "def save_model(model, text_vocab, label_vocab, config, save_dir):\n",
568
  " \"\"\"Save model and all necessary components for Flask deployment\"\"\"\n",
569
+ " # Create directory if it doesn't exist\n",
570
  " os.makedirs(save_dir, exist_ok=True)\n",
571
  " \n",
572
+ " # Save model weights\n",
573
  " model_path = os.path.join(save_dir, 'pii_transformer_model.pt')\n",
574
  " torch.save(model.state_dict(), model_path)\n",
575
  " \n",
 
595
  },
596
  {
597
  "cell_type": "code",
598
+ "execution_count": null,
599
  "id": "d93e7c25",
600
  "metadata": {
601
  "execution": {
 
626
  "):\n",
627
  " \"\"\"Main training function\"\"\"\n",
628
  " \n",
629
+ " # Load augmented data\n",
630
  " print(\"Loading augmented data...\")\n",
631
  " data = pd.read_json(data_path, lines=True)\n",
632
  " print(f\"Total samples: {len(data)}\")\n",
633
  " \n",
634
+ " # Show label distribution\n",
635
  " print_label_distribution(data, \"Label Distribution in Augmented Data\")\n",
636
  " \n",
637
  " # Build vocabularies\n",
 
639
  " text_vocab = Vocabulary(max_size=100000)\n",
640
  " label_vocab = Vocabulary(max_size=50)\n",
641
  " \n",
642
+ " # Add all words and labels to vocabularies\n",
643
  " for tokens in data.tokens:\n",
644
  " text_vocab.add_sentence(tokens)\n",
645
  " for labels in data.labels:\n",
646
  " label_vocab.add_sentence(labels)\n",
647
  " \n",
648
+ " # Build vocabularies from collected words\n",
649
  " text_vocab.build()\n",
650
  " label_vocab.build()\n",
651
  " \n",
652
+ " # Calculate class weights for balanced loss\n",
653
  " class_weights = calculate_class_weights(data, label_vocab)\n",
654
  " class_weights = class_weights.to(device)\n",
655
  " \n",
656
+ " # Split data into train and validation sets\n",
657
  " X_train, X_val, y_train, y_val = train_test_split(\n",
658
  " data.tokens.tolist(),\n",
659
  " data.labels.tolist(),\n",
 
665
  " print(f\" - Train samples: {len(X_train):,}\")\n",
666
  " print(f\" - Validation samples: {len(X_val):,}\")\n",
667
  " \n",
668
+ " # Create datasets\n",
669
  " max_seq_len = 512\n",
670
  " train_dataset = PIIDataset(X_train, y_train, text_vocab, label_vocab, max_len=max_seq_len)\n",
671
  " val_dataset = PIIDataset(X_val, y_val, text_vocab, label_vocab, max_len=max_seq_len)\n",
672
  " \n",
673
+ " # Create balanced sampler for training\n",
674
  " train_sampler = create_balanced_sampler(train_dataset, label_vocab)\n",
675
  " \n",
676
+ " # Create data loaders\n",
677
  " train_loader = DataLoader(\n",
678
  " train_dataset, \n",
679
  " batch_size=batch_size,\n",
 
702
  " 'max_len': max_seq_len\n",
703
  " }\n",
704
  " \n",
705
+ " # Create transformer model\n",
706
  " print(\"\\nCreating model...\")\n",
707
  " model = create_transformer_pii_model(**model_config).to(device)\n",
708
  " print(f\"Model parameters: {sum(p.numel() for p in model.parameters()):,}\")\n",
 
717
  " else:\n",
718
  " criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=0)\n",
719
  " \n",
720
+ " # Setup optimizer and learning rate scheduler\n",
721
  " optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)\n",
722
  " scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)\n",
723
  " \n",
724
+ " # Initialize metrics\n",
725
  " f1_metric_train = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
726
  " f1_metric_val = F1ScoreMetric(beta=5, num_classes=len(label_vocab), label_vocab=label_vocab)\n",
727
  " \n",
728
+ " # Training history\n",
729
  " train_losses, train_f1s, val_losses, val_f1s = [], [], [], []\n",
730
  " best_val_f1 = 0\n",
731
  " patience = 5\n",
 
734
  " print(\"\\nStarting training...\")\n",
735
  " print(\"=\" * 60)\n",
736
  " \n",
737
+ " # Training loop\n",
738
  " for epoch in range(num_epochs):\n",
739
  " print(f\"\\nEpoch {epoch+1}/{num_epochs}\")\n",
740
  " \n",
741
+ " # Train for one epoch\n",
742
  " train_loss, train_f1 = train_epoch(\n",
743
  " model, train_loader, optimizer, criterion, device, f1_metric_train\n",
744
  " )\n",
745
+ " \n",
746
+ " # Evaluate on validation set\n",
747
  " val_loss, val_f1 = evaluate(\n",
748
  " model, val_loader, criterion, device, f1_metric_val\n",
749
  " )\n",
750
  " \n",
751
+ " # Adjust learning rate based on validation loss\n",
752
  " scheduler.step(val_loss)\n",
753
  " \n",
754
  " # Store metrics\n",
 
785
  " else:\n",
786
  " patience_counter += 1\n",
787
  " \n",
788
+ " # Early stopping check\n",
789
  " if patience_counter >= patience and epoch > 10:\n",
790
  " print(f\"\\nEarly stopping triggered after {patience} epochs without improvement\")\n",
791
  " break\n",
 
793
  " # Plot training curves\n",
794
  " plt.figure(figsize=(12, 5))\n",
795
  " \n",
796
+ " # Plot loss curves\n",
797
  " plt.subplot(1, 2, 1)\n",
798
  " plt.plot(train_losses, label='Train Loss', linewidth=2)\n",
799
  " plt.plot(val_losses, label='Val Loss', linewidth=2)\n",
 
803
  " plt.legend()\n",
804
  " plt.grid(True, alpha=0.3)\n",
805
  " \n",
806
+ " # Plot F1 score curves\n",
807
  " plt.subplot(1, 2, 2)\n",
808
  " plt.plot(train_f1s, label='Train F1', linewidth=2)\n",
809
  " plt.plot(val_f1s, label='Val F1', linewidth=2)\n",
 
821
  " print(f\"Training completed!\")\n",
822
  " print(f\"Best validation F1: {best_val_f1:.4f}\")\n",
823
  " \n",
824
+ " # Save model for deployment\n",
825
  " save_model(model, text_vocab, label_vocab, model_config, 'saved_transformer_model')\n",
826
  " \n",
827
  " return model, text_vocab, label_vocab"
 
1296
  }
1297
  ],
1298
  "source": [
1299
+ "# Set device\n",
1300
  "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
1301
  "print(f\"Using device: {device}\")\n",
1302
  "\n",
1303
+ "# Train the transformer model\n",
1304
  "model, text_vocab, label_vocab = train_transformer_pii_model(\n",
1305
  " data_path='train_augmented.json',\n",
1306
  " num_epochs=20,\n",