File size: 18,686 Bytes
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
f53fac9
 
de46a17
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
f53fac9
de46a17
 
 
 
 
f53fac9
 
de46a17
 
 
 
 
f53fac9
de46a17
 
 
 
 
f53fac9
de46a17
f53fac9
de46a17
 
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
f53fac9
de46a17
 
 
 
 
f53fac9
 
de46a17
 
 
f53fac9
de46a17
 
f53fac9
de46a17
f53fac9
de46a17
 
 
f53fac9
 
de46a17
f53fac9
de46a17
 
 
 
 
f53fac9
 
de46a17
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
f53fac9
de46a17
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
f53fac9
de46a17
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
f53fac9
 
de46a17
 
 
 
 
 
 
f53fac9
de46a17
 
 
 
 
 
 
f53fac9
de46a17
 
f53fac9
de46a17
 
f53fac9
de46a17
 
f53fac9
de46a17
 
 
 
 
 
f53fac9
de46a17
 
 
 
f53fac9
de46a17
 
f53fac9
de46a17
 
 
 
 
 
 
 
 
 
f53fac9
de46a17
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
import random
from faker import Faker
import pandas as pd
import numpy as np
from collections import Counter
import torch

class PIIDataAugmenter:
    """
    Generates synthetic PII examples to augment training data.

    Creates realistic values for several PII types (names, emails, phone
    numbers, street addresses, ID numbers, personal URLs, usernames),
    renders them into natural-language sentence templates, and returns
    whitespace-token / BIO-label pairs suitable for token-classification
    training. All randomness is seeded for reproducibility.
    """

    def __init__(self, seed=42):
        """Initialize the augmenter with random seeds for reproducibility.

        Args:
            seed: Seed applied to ``random``, NumPy, and Faker so every
                run with the same seed produces identical synthetic data.
        """
        # Seed all RNG sources used by generation for consistent results.
        random.seed(seed)
        np.random.seed(seed)
        self.fake = Faker()
        Faker.seed(seed)

        # Build templates, context phrases, and the generator dispatch maps.
        self._init_templates()
        self._init_context_phrases()
        self._init_generators()

    def _init_templates(self):
        """Initialize sentence templates for each supported PII type.

        Each template contains exactly one ``{placeholder}`` whose key is
        given by ``self.format_keys`` for that PII type.
        """
        # Templates for generating sentences with PII
        self.templates = {
            'NAME_STUDENT': [
                "My name is {name}",
                "I am {name}",
                "This is {name} speaking",
                "Student: {name}",
                "{name} here",
                "Submitted by {name}",
                "Author: {name}",
                "Contact {name} for more information",
                "Please call {name}",
                "{name} is my name"
            ],
            'EMAIL': [
                "Email me at {email}",
                "My email is {email}",
                "Contact: {email}",
                "Send to {email}",
                "Reach me at {email}",
                "Email address: {email}",
                "You can email {email}",
                "Write to {email}",
                "My contact email is {email}",
                "Send your response to {email}"
            ],
            'PHONE_NUM': [
                "Call me at {phone}",
                "My phone number is {phone}",
                "Phone: {phone}",
                "Contact number: {phone}",
                "Reach me at {phone}",
                "My number is {phone}",
                "You can call {phone}",
                "Mobile: {phone}",
                "Tel: {phone}",
                "Phone contact: {phone}"
            ],
            'STREET_ADDRESS': [
                "I live at {address}",
                "My address is {address}",
                "Located at {address}",
                "Address: {address}",
                "Find me at {address}",
                "Residence: {address}",
                "Mail to {address}",
                "Home address: {address}",
                "Visit us at {address}",
                "Ship to {address}"
            ],
            'ID_NUM': [
                "ID: {id_num}",
                "Student ID: {id_num}",
                "ID number {id_num}",
                "Reference number: {id_num}",
                "Account: {id_num}",
                "Member ID: {id_num}",
                "Registration: {id_num}",
                "Code: {id_num}",
                "Identification: {id_num}",
                "Number: {id_num}"
            ],
            'URL_PERSONAL': [
                "Visit my website at {url}",
                "Check out {url}",
                "My portfolio: {url}",
                "Website: {url}",
                "Link: {url}",
                "Find me online at {url}",
                "Personal site: {url}",
                "URL: {url}",
                "Web: {url}",
                "Online at {url}"
            ],
            'USERNAME': [
                "Username: {username}",
                "User: {username}",
                "Handle: {username}",
                "My username is {username}",
                "Find me as {username}",
                "Account: {username}",
                "Login: {username}",
                "Profile: {username}",
                "Known as {username}",
                "Tag me @{username}"
            ]
        }

    def _init_context_phrases(self):
        """Initialize context phrases used to make generated text natural."""
        # Opening phrases for generated text
        self.context_prefix = [
            "Hello everyone,",
            "Dear Sir/Madam,",
            "To whom it may concern,",
            "Please note that",
            "For your reference,",
            "As requested,",
            "I would like to inform you that",
            "This is to confirm that",
            "Please be advised that",
            "I am writing to tell you that"
        ]

        # Closing phrases for generated text
        self.context_suffix = [
            "Thank you.",
            "Best regards.",
            "Please let me know if you need anything else.",
            "Looking forward to your response.",
            "Have a great day!",
            "Thanks for your attention.",
            "Feel free to contact me.",
            "I appreciate your help.",
            "Hope this helps.",
            "Let me know if you have questions."
        ]

        # Words to connect multiple PII elements in mixed examples
        self.connectors = [
            " and ", " or ", ", ", ". Also, ", ". Additionally, "
        ]

    def _init_generators(self):
        """Initialize the PII-type -> generator and placeholder-key maps."""
        # Map PII types to their value-generator methods
        self.generators = {
            'NAME_STUDENT': self.generate_name,
            'EMAIL': self.generate_email,
            'PHONE_NUM': self.generate_phone,
            'STREET_ADDRESS': self.generate_address,
            'ID_NUM': self.generate_id_num,
            'URL_PERSONAL': self.generate_url,
            'USERNAME': self.generate_username
        }

        # Map PII types to template placeholder keys (see _init_templates)
        self.format_keys = {
            'NAME_STUDENT': 'name',
            'EMAIL': 'email',
            'PHONE_NUM': 'phone',
            'STREET_ADDRESS': 'address',
            'ID_NUM': 'id_num',
            'URL_PERSONAL': 'url',
            'USERNAME': 'username'
        }

    def generate_name(self):
        """Return a realistic person name via Faker."""
        return self.fake.name()

    def generate_email(self):
        """Return a realistic email address via Faker."""
        return self.fake.email()

    def generate_phone(self):
        """Return a phone number in one of several common formats.

        Uses the fictional 555 prefix so no real number is generated.
        """
        # Different phone number formats
        formats = [
            "555-{:03d}-{:04d}",
            "(555) {:03d}-{:04d}",
            "555.{:03d}.{:04d}",
            "+1-555-{:03d}-{:04d}",
            "555{:03d}{:04d}"
        ]
        # Pick a random format and fill with random numbers
        format_choice = random.choice(formats)
        area = random.randint(100, 999)
        number = random.randint(1000, 9999)
        return format_choice.format(area, number)

    def generate_address(self):
        """Return a realistic street address, flattened onto one line."""
        # Faker separates address lines with '\n'; use ', ' instead
        return self.fake.address().replace('\n', ', ')

    def generate_id_num(self):
        """Return an ID number in one of several formats.

        NOTE: the random range (10000..9999999) can exceed the padded width
        of the shorter formats, so e.g. "{:06d}" may render 7 digits.
        """
        # Different ID number patterns
        formats = [
            "{:06d}",           # 6-digit ID
            "{:08d}",           # 8-digit ID
            "ID{:05d}",         # ID prefix
            "STU{:06d}",        # Student ID
            "{:04d}-{:04d}",    # Hyphenated
            "A{:07d}",          # Letter prefix
        ]
        format_choice = random.choice(formats)

        # The hyphenated pattern takes two 4-digit halves
        if '-' in format_choice:
            return format_choice.format(
                random.randint(1000, 9999),
                random.randint(1000, 9999)
            )
        else:
            return format_choice.format(random.randint(10000, 9999999))

    def generate_url(self):
        """Return a personal website URL built from a Faker username."""
        # Common personal website domains
        domains = ['github.com', 'linkedin.com', 'portfolio.com', 
                  'personal.com', 'website.com']
        username = self.fake.user_name()
        domain = random.choice(domains)
        return f"https://{domain}/{username}"

    def generate_username(self):
        """Return a username via Faker."""
        return self.fake.user_name()

    def create_synthetic_example(self, pii_type, add_context=True):
        """Create one synthetic example with BIO labeling.

        Args:
            pii_type: Key into ``self.templates`` / ``self.generators``.
            add_context: If True, prefix/suffix phrases may be added
                (with probability 0.7) for more natural text.

        Returns:
            (tokens, labels): parallel lists of whitespace tokens and
            BIO tags.
        """
        # Generate the PII value
        pii_value = self.generators[pii_type]()

        # Choose a template and insert the PII
        template = random.choice(self.templates[pii_type])
        format_key = self.format_keys[pii_type]
        sentence = template.format(**{format_key: pii_value})

        # Optionally add context for more natural text
        if add_context and random.random() > 0.3:
            sentence = self._add_context(sentence)

        # Create tokens and labels
        tokens, labels = self._tokenize_and_label(sentence, pii_value, pii_type)

        return tokens, labels

    def create_mixed_example(self, pii_types, num_pii=2):
        """Create one example containing multiple PII entities.

        Args:
            pii_types: Candidate PII types to draw from.
            num_pii: Maximum number of distinct types to include.

        Returns:
            (tokens, labels): parallel lists spanning the whole example.
        """
        # Select which PII types to include
        selected_types = random.sample(pii_types, min(num_pii, len(pii_types)))

        all_tokens = []
        all_labels = []

        # Add opening context (labelled 'O') with probability 0.7
        if random.random() > 0.3:
            prefix = random.choice(self.context_prefix)
            all_tokens.extend(prefix.split())
            all_labels.extend(['O'] * len(prefix.split()))

        # Add each PII entity
        for i, pii_type in enumerate(selected_types):
            # Optionally add a connector between PII entities
            if i > 0 and random.random() > 0.5:
                connector = random.choice(self.connectors)
                all_tokens.extend(connector.strip().split())
                all_labels.extend(['O'] * len(connector.strip().split()))

            # Generate the PII sentence without its own context wrapping
            tokens, labels = self.create_synthetic_example(pii_type, add_context=False)
            all_tokens.extend(tokens)
            all_labels.extend(labels)

        # Add closing context (labelled 'O') with probability 0.7
        if random.random() > 0.3:
            suffix = random.choice(self.context_suffix)
            all_tokens.extend(suffix.split())
            all_labels.extend(['O'] * len(suffix.split()))

        return all_tokens, all_labels

    def _add_context(self, sentence):
        """Randomly wrap a sentence with prefix/suffix context phrases."""
        # Randomly add prefix
        if random.random() > 0.5:
            sentence = random.choice(self.context_prefix) + " " + sentence
        # Randomly add suffix
        if random.random() > 0.5:
            sentence = sentence + " " + random.choice(self.context_suffix)
        return sentence

    def _tokenize_and_label(self, sentence, pii_value, pii_type):
        """Whitespace-tokenize a sentence and apply BIO labels to the PII span.

        Matching is done on whitespace tokens. A punctuation-stripped
        comparison is also attempted so templates that glue symbols onto
        the PII (e.g. "Tag me @{username}" producing the token
        "@jdoe99") are still labeled instead of silently emitting all-'O'
        labels.

        Args:
            sentence: Rendered sentence containing the PII value.
            pii_value: The exact PII string that was inserted.
            pii_type: Base label used to form 'B-' / 'I-' tags.

        Returns:
            (tokens, labels): parallel lists; non-PII tokens are 'O'.
        """
        # Split sentence into tokens
        tokens = sentence.split()
        labels = ['O'] * len(tokens)

        # Split PII value into tokens
        pii_tokens = pii_value.split()
        if not pii_tokens:
            # Degenerate empty PII value: nothing to label
            return tokens, labels

        # Punctuation the templates/context may attach to a PII token
        punct = '@.,:;!?()"\''

        # Find where PII appears in the sentence
        for i in range(len(tokens) - len(pii_tokens) + 1):
            window = tokens[i:i + len(pii_tokens)]
            # Exact token match, case-insensitive joined match, or
            # punctuation-stripped match (fixes '@{username}'-style templates)
            if (window == pii_tokens or
                    ' '.join(window).lower() == pii_value.lower() or
                    [t.strip(punct) for t in window] == pii_tokens):

                # Apply BIO tagging
                labels[i] = f'B-{pii_type}'  # Beginning
                for j in range(1, len(pii_tokens)):
                    labels[i + j] = f'I-{pii_type}'  # Inside
                break

        return tokens, labels

    def augment_dataset(self, original_data, target_samples_per_class=1000, mix_ratio=0.3):
        """Augment a dataset with synthetic examples to balance PII classes.

        Args:
            original_data: DataFrame with 'tokens' and 'labels' columns.
            target_samples_per_class: Desired minimum entity count per
                PII type after augmentation.
            mix_ratio: Fraction of generated examples that mix multiple
                PII types in one example.

        Returns:
            A shuffled DataFrame combining original and synthetic rows.
        """
        # Check current distribution
        label_counts = self._analyze_label_distribution(original_data)
        print("\nOriginal label distribution:")
        self._print_distribution(label_counts)

        # Generate synthetic data
        synthetic_tokens, synthetic_labels = self._generate_synthetic_data(
            label_counts, target_samples_per_class, mix_ratio
        )

        # Add some non-PII examples for balance
        synthetic_tokens, synthetic_labels = self._add_non_pii_examples(
            synthetic_tokens, synthetic_labels
        )

        # Combine original and synthetic data
        augmented_df = self._combine_and_shuffle(
            original_data, synthetic_tokens, synthetic_labels
        )

        # Check new distribution
        new_label_counts = self._analyze_label_distribution(augmented_df)
        print("\nAugmented label distribution:")
        self._print_distribution(new_label_counts)

        return augmented_df

    def _analyze_label_distribution(self, data):
        """Count base PII labels (B-/I- prefix removed) across the dataset."""
        label_counts = Counter()

        # Count each PII type
        for labels in data['labels']:
            for label in labels:
                if label != 'O':
                    # Remove B- or I- prefix to get base label
                    base_label = label.split('-')[1] if '-' in label else label
                    label_counts[base_label] += 1

        return label_counts

    def _print_distribution(self, label_counts):
        """Print label counts and percentages, most common first."""
        total = sum(label_counts.values())
        # Print each label count and percentage
        for label, count in label_counts.most_common():
            percentage = (count / total * 100) if total > 0 else 0
            print(f"  {label:15} : {count:6,} ({percentage:5.2f}%)")

    def _generate_synthetic_data(self, label_counts, target_samples, mix_ratio):
        """Generate synthetic examples to close each class's sample gap.

        Returns:
            (synthetic_tokens, synthetic_labels): lists of token lists
            and matching label lists.
        """
        synthetic_tokens = []
        synthetic_labels = []

        # Generate examples for each PII type
        for pii_type in self.templates.keys():
            current_count = label_counts.get(pii_type, 0)
            needed = max(0, target_samples - current_count)

            if needed == 0:
                continue

            print(f"\nGenerating {needed} synthetic examples for {pii_type}")

            # Generate single-PII examples for (1 - mix_ratio) of the gap
            single_count = int(needed * (1 - mix_ratio))
            for _ in range(single_count):
                tokens, labels = self.create_synthetic_example(pii_type)
                synthetic_tokens.append(tokens)
                synthetic_labels.append(labels)

            # Generate mixed-PII examples for the remaining mix_ratio share
            mixed_count = int(needed * mix_ratio)
            for _ in range(mixed_count):
                # Make sure the current PII type is always included
                other_types = [t for t in self.templates.keys() if t != pii_type]
                selected_types = [pii_type] + random.sample(
                    other_types, min(1, len(other_types))
                )

                tokens, labels = self.create_mixed_example(selected_types, num_pii=2)
                synthetic_tokens.append(tokens)
                synthetic_labels.append(labels)

        return synthetic_tokens, synthetic_labels

    def _add_non_pii_examples(self, synthetic_tokens, synthetic_labels):
        """Append ~10% all-'O' examples (random Faker text) for balance."""
        # Add 10% non-PII examples
        num_non_pii = int(len(synthetic_tokens) * 0.1)

        for _ in range(num_non_pii):
            # Generate random text without PII
            sentence = self.fake.text(max_nb_chars=100)
            tokens = sentence.split()
            labels = ['O'] * len(tokens)

            synthetic_tokens.append(tokens)
            synthetic_labels.append(labels)

        return synthetic_tokens, synthetic_labels

    def _combine_and_shuffle(self, original_data, synthetic_tokens, synthetic_labels):
        """Concatenate original and synthetic rows and shuffle deterministically."""
        # Merge all data
        all_tokens = original_data['tokens'].tolist() + synthetic_tokens
        all_labels = original_data['labels'].tolist() + synthetic_labels

        # Create new dataframe
        augmented_data = pd.DataFrame({
            'tokens': all_tokens,
            'labels': all_labels
        })

        # Shuffle with a fixed seed so the output order is reproducible
        augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)

        print(f"\nTotal augmented samples: {len(augmented_data):,}")

        return augmented_data

def calculate_class_weights(data, label_vocab):
    """Calculate per-class weights for a class-balanced loss function.

    Weights are proportional to inverse class frequency
    (total / (num_classes * count)), normalized to sum to num_classes,
    clamped to [0.1, 10.0], and the padding class (index 0) is zeroed.

    Args:
        data: Mapping/DataFrame whose 'labels' entry yields per-example
            lists of string labels (e.g. ['O', 'B-EMAIL', ...]).
        label_vocab: Vocabulary with a ``word2idx`` dict (lowercase label
            -> class id) and ``__len__`` giving the number of classes.

    Returns:
        torch.Tensor of shape (num_classes,) with one weight per class.
        All zeros when the dataset contains no labels.
    """
    # Count occurrences of each label id
    label_counts = Counter()

    for labels in data['labels']:
        for label in labels:
            # Labels are lowercased to match the vocab; unknown labels
            # fall back to index 0 (the padding class, zeroed below)
            label_id = label_vocab.word2idx.get(label.lower(), 0)
            label_counts[label_id] += 1

    total_samples = sum(label_counts.values())
    num_classes = len(label_vocab)

    weights = torch.zeros(num_classes)

    # Guard: with no labels at all, weights.sum() would be 0 and the
    # normalization below would produce NaNs. Return all-zero weights.
    if total_samples == 0:
        return weights

    for class_id, count in label_counts.items():
        if count > 0:
            # Inverse frequency weighting
            weights[class_id] = total_samples / (num_classes * count)

    # Normalize the weights so they sum to num_classes
    weights = weights / weights.sum() * num_classes

    # Prevent extreme weights
    weights = torch.clamp(weights, min=0.1, max=10.0)

    # Don't weight padding tokens
    weights[0] = 0.0

    return weights

def _run_augmentation_demo():
    """Example usage: load train.json, augment it, and save the result."""
    # Load original training data
    print("Loading original training data...")
    original_data = pd.read_json('train.json')
    print(f"Original dataset size: {len(original_data):,}")

    # Create augmenter instance with a fixed seed for reproducibility
    augmenter = PIIDataAugmenter(seed=42)

    # Run augmentation
    banner = "=" * 60
    print("\n" + banner)
    print("Starting data augmentation...")
    print(banner)

    augmented_data = augmenter.augment_dataset(
        original_data,
        target_samples_per_class=2000,
        mix_ratio=0.3,
    )

    # Save the augmented dataset as JSON lines
    output_path = './train_augmented.json'
    augmented_data.to_json(output_path, orient='records', lines=True)
    print(f"\nSaved augmented data to {output_path}")


if __name__ == '__main__':
    _run_augmentation_demo()