File size: 18,686 Bytes
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
f53fac9
 
de46a17
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
f53fac9
de46a17
 
 
 
 
f53fac9
 
de46a17
 
 
 
 
f53fac9
de46a17
 
 
 
 
f53fac9
de46a17
f53fac9
de46a17
 
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
f53fac9
de46a17
 
 
 
 
f53fac9
 
de46a17
 
 
f53fac9
de46a17
 
f53fac9
de46a17
f53fac9
de46a17
 
 
f53fac9
 
de46a17
f53fac9
de46a17
 
 
 
 
f53fac9
 
de46a17
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
f53fac9
de46a17
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
f53fac9
de46a17
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
f53fac9
 
de46a17
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
f53fac9
de46a17
 
f53fac9
de46a17
 
f53fac9
de46a17
 
f53fac9
de46a17
 
 
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
import random
from faker import Faker
import pandas as pd
import numpy as np
from collections import Counter
import torch

class PIIDataAugmenter:
    """
    Generates synthetic PII examples to augment training data.

    Creates realistic values for several PII types (names, emails, phone
    numbers, street addresses, ID numbers, personal URLs, usernames),
    renders them into natural-language sentence templates, and returns
    whitespace-token / BIO-label pairs suitable for token-classification
    training. All randomness is seeded for reproducibility.
    """

    def __init__(self, seed=42):
        """Initialize the augmenter with random seeds for reproducibility.

        Args:
            seed: Seed applied to ``random``, NumPy, and Faker so every
                run with the same seed produces identical synthetic data.
        """
        # Seed all RNG sources used by generation for consistent results.
        random.seed(seed)
        np.random.seed(seed)
        self.fake = Faker()
        Faker.seed(seed)

        # Build templates, context phrases, and the generator dispatch maps.
        self._init_templates()
        self._init_context_phrases()
        self._init_generators()

    def _init_templates(self):
        """Initialize sentence templates for each supported PII type.

        Each template contains exactly one ``{placeholder}`` whose key is
        given by ``self.format_keys`` for that PII type.
        """
        # Templates for generating sentences with PII
        self.templates = {
            'NAME_STUDENT': [
                "My name is {name}",
                "I am {name}",
                "This is {name} speaking",
                "Student: {name}",
                "{name} here",
                "Submitted by {name}",
                "Author: {name}",
                "Contact {name} for more information",
                "Please call {name}",
                "{name} is my name"
            ],
            'EMAIL': [
                "Email me at {email}",
                "My email is {email}",
                "Contact: {email}",
                "Send to {email}",
                "Reach me at {email}",
                "Email address: {email}",
                "You can email {email}",
                "Write to {email}",
                "My contact email is {email}",
                "Send your response to {email}"
            ],
            'PHONE_NUM': [
                "Call me at {phone}",
                "My phone number is {phone}",
                "Phone: {phone}",
                "Contact number: {phone}",
                "Reach me at {phone}",
                "My number is {phone}",
                "You can call {phone}",
                "Mobile: {phone}",
                "Tel: {phone}",
                "Phone contact: {phone}"
            ],
            'STREET_ADDRESS': [
                "I live at {address}",
                "My address is {address}",
                "Located at {address}",
                "Address: {address}",
                "Find me at {address}",
                "Residence: {address}",
                "Mail to {address}",
                "Home address: {address}",
                "Visit us at {address}",
                "Ship to {address}"
            ],
            'ID_NUM': [
                "ID: {id_num}",
                "Student ID: {id_num}",
                "ID number {id_num}",
                "Reference number: {id_num}",
                "Account: {id_num}",
                "Member ID: {id_num}",
                "Registration: {id_num}",
                "Code: {id_num}",
                "Identification: {id_num}",
                "Number: {id_num}"
            ],
            'URL_PERSONAL': [
                "Visit my website at {url}",
                "Check out {url}",
                "My portfolio: {url}",
                "Website: {url}",
                "Link: {url}",
                "Find me online at {url}",
                "Personal site: {url}",
                "URL: {url}",
                "Web: {url}",
                "Online at {url}"
            ],
            'USERNAME': [
                "Username: {username}",
                "User: {username}",
                "Handle: {username}",
                "My username is {username}",
                "Find me as {username}",
                "Account: {username}",
                "Login: {username}",
                "Profile: {username}",
                "Known as {username}",
                "Tag me @{username}"
            ]
        }

    def _init_context_phrases(self):
        """Initialize context phrases used to make generated text natural."""
        # Opening phrases for generated text
        self.context_prefix = [
            "Hello everyone,",
            "Dear Sir/Madam,",
            "To whom it may concern,",
            "Please note that",
            "For your reference,",
            "As requested,",
            "I would like to inform you that",
            "This is to confirm that",
            "Please be advised that",
            "I am writing to tell you that"
        ]

        # Closing phrases for generated text
        self.context_suffix = [
            "Thank you.",
            "Best regards.",
            "Please let me know if you need anything else.",
            "Looking forward to your response.",
            "Have a great day!",
            "Thanks for your attention.",
            "Feel free to contact me.",
            "I appreciate your help.",
            "Hope this helps.",
            "Let me know if you have questions."
        ]

        # Words to connect multiple PII elements in mixed examples
        self.connectors = [
            " and ", " or ", ", ", ". Also, ", ". Additionally, "
        ]

    def _init_generators(self):
        """Initialize the PII-type -> generator and placeholder-key maps."""
        # Map PII types to their value-generator methods
        self.generators = {
            'NAME_STUDENT': self.generate_name,
            'EMAIL': self.generate_email,
            'PHONE_NUM': self.generate_phone,
            'STREET_ADDRESS': self.generate_address,
            'ID_NUM': self.generate_id_num,
            'URL_PERSONAL': self.generate_url,
            'USERNAME': self.generate_username
        }

        # Map PII types to template placeholder keys (see _init_templates)
        self.format_keys = {
            'NAME_STUDENT': 'name',
            'EMAIL': 'email',
            'PHONE_NUM': 'phone',
            'STREET_ADDRESS': 'address',
            'ID_NUM': 'id_num',
            'URL_PERSONAL': 'url',
            'USERNAME': 'username'
        }

    def generate_name(self):
        """Return a realistic person name via Faker."""
        return self.fake.name()

    def generate_email(self):
        """Return a realistic email address via Faker."""
        return self.fake.email()

    def generate_phone(self):
        """Return a phone number in one of several common formats.

        Uses the fictional 555 prefix so no real number is generated.
        """
        # Different phone number formats
        formats = [
            "555-{:03d}-{:04d}",
            "(555) {:03d}-{:04d}",
            "555.{:03d}.{:04d}",
            "+1-555-{:03d}-{:04d}",
            "555{:03d}{:04d}"
        ]
        # Pick a random format and fill with random numbers
        format_choice = random.choice(formats)
        area = random.randint(100, 999)
        number = random.randint(1000, 9999)
        return format_choice.format(area, number)

    def generate_address(self):
        """Return a realistic street address, flattened onto one line."""
        # Faker separates address lines with '\n'; use ', ' instead
        return self.fake.address().replace('\n', ', ')

    def generate_id_num(self):
        """Return an ID number in one of several formats.

        NOTE: the random range (10000..9999999) can exceed the padded width
        of the shorter formats, so e.g. "{:06d}" may render 7 digits.
        """
        # Different ID number patterns
        formats = [
            "{:06d}",           # 6-digit ID
            "{:08d}",           # 8-digit ID
            "ID{:05d}",         # ID prefix
            "STU{:06d}",        # Student ID
            "{:04d}-{:04d}",    # Hyphenated
            "A{:07d}",          # Letter prefix
        ]
        format_choice = random.choice(formats)

        # The hyphenated pattern takes two 4-digit halves
        if '-' in format_choice:
            return format_choice.format(
                random.randint(1000, 9999),
                random.randint(1000, 9999)
            )
        else:
            return format_choice.format(random.randint(10000, 9999999))

    def generate_url(self):
        """Return a personal website URL built from a Faker username."""
        # Common personal website domains
        domains = ['github.com', 'linkedin.com', 'portfolio.com', 
                  'personal.com', 'website.com']
        username = self.fake.user_name()
        domain = random.choice(domains)
        return f"https://{domain}/{username}"

    def generate_username(self):
        """Return a username via Faker."""
        return self.fake.user_name()

    def create_synthetic_example(self, pii_type, add_context=True):
        """Create one synthetic example with BIO labeling.

        Args:
            pii_type: Key into ``self.templates`` / ``self.generators``.
            add_context: If True, prefix/suffix phrases may be added
                (with probability 0.7) for more natural text.

        Returns:
            (tokens, labels): parallel lists of whitespace tokens and
            BIO tags.
        """
        # Generate the PII value
        pii_value = self.generators[pii_type]()

        # Choose a template and insert the PII
        template = random.choice(self.templates[pii_type])
        format_key = self.format_keys[pii_type]
        sentence = template.format(**{format_key: pii_value})

        # Optionally add context for more natural text
        if add_context and random.random() > 0.3:
            sentence = self._add_context(sentence)

        # Create tokens and labels
        tokens, labels = self._tokenize_and_label(sentence, pii_value, pii_type)

        return tokens, labels

    def create_mixed_example(self, pii_types, num_pii=2):
        """Create one example containing multiple PII entities.

        Args:
            pii_types: Candidate PII types to draw from.
            num_pii: Maximum number of distinct types to include.

        Returns:
            (tokens, labels): parallel lists spanning the whole example.
        """
        # Select which PII types to include
        selected_types = random.sample(pii_types, min(num_pii, len(pii_types)))

        all_tokens = []
        all_labels = []

        # Add opening context (labelled 'O') with probability 0.7
        if random.random() > 0.3:
            prefix = random.choice(self.context_prefix)
            all_tokens.extend(prefix.split())
            all_labels.extend(['O'] * len(prefix.split()))

        # Add each PII entity
        for i, pii_type in enumerate(selected_types):
            # Optionally add a connector between PII entities
            if i > 0 and random.random() > 0.5:
                connector = random.choice(self.connectors)
                all_tokens.extend(connector.strip().split())
                all_labels.extend(['O'] * len(connector.strip().split()))

            # Generate the PII sentence without its own context wrapping
            tokens, labels = self.create_synthetic_example(pii_type, add_context=False)
            all_tokens.extend(tokens)
            all_labels.extend(labels)

        # Add closing context (labelled 'O') with probability 0.7
        if random.random() > 0.3:
            suffix = random.choice(self.context_suffix)
            all_tokens.extend(suffix.split())
            all_labels.extend(['O'] * len(suffix.split()))

        return all_tokens, all_labels

    def _add_context(self, sentence):
        """Randomly wrap a sentence with prefix/suffix context phrases."""
        # Randomly add prefix
        if random.random() > 0.5:
            sentence = random.choice(self.context_prefix) + " " + sentence
        # Randomly add suffix
        if random.random() > 0.5:
            sentence = sentence + " " + random.choice(self.context_suffix)
        return sentence

    def _tokenize_and_label(self, sentence, pii_value, pii_type):
        """Whitespace-tokenize a sentence and apply BIO labels to the PII span.

        Matching is done on whitespace tokens. A punctuation-stripped
        comparison is also attempted so templates that glue symbols onto
        the PII (e.g. "Tag me @{username}" producing the token
        "@jdoe99") are still labeled instead of silently emitting all-'O'
        labels.

        Args:
            sentence: Rendered sentence containing the PII value.
            pii_value: The exact PII string that was inserted.
            pii_type: Base label used to form 'B-' / 'I-' tags.

        Returns:
            (tokens, labels): parallel lists; non-PII tokens are 'O'.
        """
        # Split sentence into tokens
        tokens = sentence.split()
        labels = ['O'] * len(tokens)

        # Split PII value into tokens
        pii_tokens = pii_value.split()
        if not pii_tokens:
            # Degenerate empty PII value: nothing to label
            return tokens, labels

        # Punctuation the templates/context may attach to a PII token
        punct = '@.,:;!?()"\''

        # Find where PII appears in the sentence
        for i in range(len(tokens) - len(pii_tokens) + 1):
            window = tokens[i:i + len(pii_tokens)]
            # Exact token match, case-insensitive joined match, or
            # punctuation-stripped match (fixes '@{username}'-style templates)
            if (window == pii_tokens or
                    ' '.join(window).lower() == pii_value.lower() or
                    [t.strip(punct) for t in window] == pii_tokens):

                # Apply BIO tagging
                labels[i] = f'B-{pii_type}'  # Beginning
                for j in range(1, len(pii_tokens)):
                    labels[i + j] = f'I-{pii_type}'  # Inside
                break

        return tokens, labels

    def augment_dataset(self, original_data, target_samples_per_class=1000, mix_ratio=0.3):
        """Augment a dataset with synthetic examples to balance PII classes.

        Args:
            original_data: DataFrame with 'tokens' and 'labels' columns.
            target_samples_per_class: Desired minimum entity count per
                PII type after augmentation.
            mix_ratio: Fraction of generated examples that mix multiple
                PII types in one example.

        Returns:
            A shuffled DataFrame combining original and synthetic rows.
        """
        # Check current distribution
        label_counts = self._analyze_label_distribution(original_data)
        print("\nOriginal label distribution:")
        self._print_distribution(label_counts)

        # Generate synthetic data
        synthetic_tokens, synthetic_labels = self._generate_synthetic_data(
            label_counts, target_samples_per_class, mix_ratio
        )

        # Add some non-PII examples for balance
        synthetic_tokens, synthetic_labels = self._add_non_pii_examples(
            synthetic_tokens, synthetic_labels
        )

        # Combine original and synthetic data
        augmented_df = self._combine_and_shuffle(
            original_data, synthetic_tokens, synthetic_labels
        )

        # Check new distribution
        new_label_counts = self._analyze_label_distribution(augmented_df)
        print("\nAugmented label distribution:")
        self._print_distribution(new_label_counts)

        return augmented_df

    def _analyze_label_distribution(self, data):
        """Count base PII labels (B-/I- prefix removed) across the dataset."""
        label_counts = Counter()

        # Count each PII type
        for labels in data['labels']:
            for label in labels:
                if label != 'O':
                    # Remove B- or I- prefix to get base label
                    base_label = label.split('-')[1] if '-' in label else label
                    label_counts[base_label] += 1

        return label_counts

    def _print_distribution(self, label_counts):
        """Print label counts and percentages, most common first."""
        total = sum(label_counts.values())
        # Print each label count and percentage
        for label, count in label_counts.most_common():
            percentage = (count / total * 100) if total > 0 else 0
            print(f"  {label:15} : {count:6,} ({percentage:5.2f}%)")

    def _generate_synthetic_data(self, label_counts, target_samples, mix_ratio):
        """Generate synthetic examples to close each class's sample gap.

        Returns:
            (synthetic_tokens, synthetic_labels): lists of token lists
            and matching label lists.
        """
        synthetic_tokens = []
        synthetic_labels = []

        # Generate examples for each PII type
        for pii_type in self.templates.keys():
            current_count = label_counts.get(pii_type, 0)
            needed = max(0, target_samples - current_count)

            if needed == 0:
                continue

            print(f"\nGenerating {needed} synthetic examples for {pii_type}")

            # Generate single-PII examples for (1 - mix_ratio) of the gap
            single_count = int(needed * (1 - mix_ratio))
            for _ in range(single_count):
                tokens, labels = self.create_synthetic_example(pii_type)
                synthetic_tokens.append(tokens)
                synthetic_labels.append(labels)

            # Generate mixed-PII examples for the remaining mix_ratio share
            mixed_count = int(needed * mix_ratio)
            for _ in range(mixed_count):
                # Make sure the current PII type is always included
                other_types = [t for t in self.templates.keys() if t != pii_type]
                selected_types = [pii_type] + random.sample(
                    other_types, min(1, len(other_types))
                )

                tokens, labels = self.create_mixed_example(selected_types, num_pii=2)
                synthetic_tokens.append(tokens)
                synthetic_labels.append(labels)

        return synthetic_tokens, synthetic_labels

    def _add_non_pii_examples(self, synthetic_tokens, synthetic_labels):
        """Append ~10% all-'O' examples (random Faker text) for balance."""
        # Add 10% non-PII examples
        num_non_pii = int(len(synthetic_tokens) * 0.1)

        for _ in range(num_non_pii):
            # Generate random text without PII
            sentence = self.fake.text(max_nb_chars=100)
            tokens = sentence.split()
            labels = ['O'] * len(tokens)

            synthetic_tokens.append(tokens)
            synthetic_labels.append(labels)

        return synthetic_tokens, synthetic_labels

    def _combine_and_shuffle(self, original_data, synthetic_tokens, synthetic_labels):
        """Concatenate original and synthetic rows and shuffle deterministically."""
        # Merge all data
        all_tokens = original_data['tokens'].tolist() + synthetic_tokens
        all_labels = original_data['labels'].tolist() + synthetic_labels

        # Create new dataframe
        augmented_data = pd.DataFrame({
            'tokens': all_tokens,
            'labels': all_labels
        })

        # Shuffle with a fixed seed so the output order is reproducible
        augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)

        print(f"\nTotal augmented samples: {len(augmented_data):,}")

        return augmented_data

def calculate_class_weights(data, label_vocab):
    """Calculate per-class weights for a class-balanced loss function.

    Weights are proportional to inverse class frequency
    (total / (num_classes * count)), normalized to sum to num_classes,
    clamped to [0.1, 10.0], and the padding class (index 0) is zeroed.

    Args:
        data: Mapping/DataFrame whose 'labels' entry yields per-example
            lists of string labels (e.g. ['O', 'B-EMAIL', ...]).
        label_vocab: Vocabulary with a ``word2idx`` dict (lowercase label
            -> class id) and ``__len__`` giving the number of classes.

    Returns:
        torch.Tensor of shape (num_classes,) with one weight per class.
        All zeros when the dataset contains no labels.
    """
    # Count occurrences of each label id
    label_counts = Counter()

    for labels in data['labels']:
        for label in labels:
            # Labels are lowercased to match the vocab; unknown labels
            # fall back to index 0 (the padding class, zeroed below)
            label_id = label_vocab.word2idx.get(label.lower(), 0)
            label_counts[label_id] += 1

    total_samples = sum(label_counts.values())
    num_classes = len(label_vocab)

    weights = torch.zeros(num_classes)

    # Guard: with no labels at all, weights.sum() would be 0 and the
    # normalization below would produce NaNs. Return all-zero weights.
    if total_samples == 0:
        return weights

    for class_id, count in label_counts.items():
        if count > 0:
            # Inverse frequency weighting
            weights[class_id] = total_samples / (num_classes * count)

    # Normalize the weights so they sum to num_classes
    weights = weights / weights.sum() * num_classes

    # Prevent extreme weights
    weights = torch.clamp(weights, min=0.1, max=10.0)

    # Don't weight padding tokens
    weights[0] = 0.0

    return weights

def _run_augmentation_demo():
    """Example usage: load train.json, augment it, and save the result."""
    # Load original training data
    print("Loading original training data...")
    original_data = pd.read_json('train.json')
    print(f"Original dataset size: {len(original_data):,}")

    # Create augmenter instance with a fixed seed for reproducibility
    augmenter = PIIDataAugmenter(seed=42)

    # Run augmentation
    banner = "=" * 60
    print("\n" + banner)
    print("Starting data augmentation...")
    print(banner)

    augmented_data = augmenter.augment_dataset(
        original_data,
        target_samples_per_class=2000,
        mix_ratio=0.3,
    )

    # Save the augmented dataset as JSON lines
    output_path = './train_augmented.json'
    augmented_data.to_json(output_path, orient='records', lines=True)
    print(f"\nSaved augmented data to {output_path}")


if __name__ == '__main__':
    _run_augmentation_demo()