Twin committed on
Commit
0c2f645
·
1 Parent(s): 9ef957a

Fix PII entity numbering order in reconstruction

Browse files

🔧 Problem: Entity flags were numbered in reverse order (X_4, X_3, X_2, X_1)
✅ Solution: Now correctly numbered in appearance order (X_1, X_2, X_3, X_4)

Changes:
- Modified reconstruct_masked_text() in text_processing.py
- First pass: assign numbers in order of appearance
- Second pass: replace in reverse order to maintain text positions
- Added test script to verify correct numbering

This ensures PII entities are numbered intuitively based on their order in the text.

src/pii_masking/text_processing.py CHANGED
@@ -178,17 +178,28 @@ def reconstruct_masked_text(text: str, pii_dict: Dict[str, List[str]]) -> str:
178
  Text with PII replaced by [ENTITY_TYPE_X] placeholders
179
  """
180
  masked_text = text
181
- entity_counters = defaultdict(int)
182
 
183
- # Get all spans and sort by position (reverse order to maintain positions)
184
  all_spans = json_to_spans(text, pii_dict)
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  all_spans.sort(key=lambda x: x.start, reverse=True)
186
 
187
- # Replace each span with placeholder
188
  for span in all_spans:
189
- entity_counters[span.entity_type] += 1
190
- placeholder = f"[{span.entity_type}_{entity_counters[span.entity_type]}]"
191
-
192
  # Replace the span in the text
193
  masked_text = masked_text[:span.start] + placeholder + masked_text[span.end:]
194
 
 
178
  Text with PII replaced by [ENTITY_TYPE_X] placeholders
179
  """
180
  masked_text = text
 
181
 
182
+ # Get all spans and sort by position (normal order for numbering)
183
  all_spans = json_to_spans(text, pii_dict)
184
+ all_spans.sort(key=lambda x: x.start)
185
+
186
+ # First pass: assign numbers to spans in order of appearance
187
+ entity_counters = defaultdict(int)
188
+ span_numbers = {}
189
+
190
+ for i, span in enumerate(all_spans):
191
+ entity_counters[span.entity_type] += 1
192
+ # Create a unique key for this span
193
+ span_key = (span.start, span.end, span.entity_type)
194
+ span_numbers[span_key] = entity_counters[span.entity_type]
195
+
196
+ # Second pass: replace spans in reverse order to maintain text positions
197
  all_spans.sort(key=lambda x: x.start, reverse=True)
198
 
 
199
  for span in all_spans:
200
+ span_key = (span.start, span.end, span.entity_type)
201
+ number = span_numbers[span_key]
202
+ placeholder = f"[{span.entity_type}_{number}]"
203
  # Replace the span in the text
204
  masked_text = masked_text[:span.start] + placeholder + masked_text[span.end:]
205
 
static/index.html CHANGED
@@ -451,7 +451,7 @@
451
  <div class="container">
452
  <div class="header">
453
  <h1>🔒 PII Masking Demo</h1>
454
- <p>Detect and mask Personal Identifiable Information using AI</p>
455
  </div>
456
 
457
  <div class="content">
@@ -460,10 +460,10 @@
460
  <label>Choose input method:</label>
461
  <div class="input-tabs">
462
  <button type="button" class="input-tab active" onclick="switchInputMethod('text')">
463
- 📝 Text Input
464
  </button>
465
  <button type="button" class="input-tab" onclick="switchInputMethod('pdf')">
466
- 📄 PDF Upload
467
  </button>
468
  </div>
469
 
 
451
  <div class="container">
452
  <div class="header">
453
  <h1>🔒 PII Masking Demo</h1>
454
+ <p>Detect and mask Personal Identifiable Information in your documents.</p>
455
  </div>
456
 
457
  <div class="content">
 
460
  <label>Choose input method:</label>
461
  <div class="input-tabs">
462
  <button type="button" class="input-tab active" onclick="switchInputMethod('text')">
463
+ Text Input
464
  </button>
465
  <button type="button" class="input-tab" onclick="switchInputMethod('pdf')">
466
+ PDF Upload
467
  </button>
468
  </div>
469
 
test_reconstruction.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to verify that PII reconstruction works with correct numbering.
4
+ """
5
+
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add the src directory to Python path
10
+ sys.path.insert(0, str(Path(__file__).parent / "src"))
11
+
12
+ from pii_masking.text_processing import reconstruct_masked_text
13
+
14
+ def test_reconstruction():
15
+ """Test the reconstruction function with multiple entities of the same type."""
16
+
17
+ # Test case with multiple entities of the same type
18
+ text = "John T. Smith lives at 4872 Willow Creek Drive Apartment 14B Springfield, IL 62701. Phone: 555-1234. Email: john@email.com. On January 15, 2024 Ms. Alice A. Johnson from Records Office at 9135 Westfield Parkway, Suite 320 Hartford, CT 06101. Dear Ms. Johnson, I am writing regarding my claim from December 2024. As of February 1, 2024, my address is 4872 Willow Creek Drive, Apartment 14B, Springfield, IL 62701."
19
+
20
+ # Simulate PII entities found in the text
21
+ pii_dict = {
22
+ "FIRSTNAME": ["John", "Alice"],
23
+ "LASTNAME": ["Smith", "Johnson"],
24
+ "CITY": ["Springfield", "Hartford"],
25
+ "STATE": ["IL", "CT"],
26
+ "ZIPCODE": ["62701", "06101"],
27
+ "PHONENUMBER": ["555-1234"],
28
+ "EMAIL": ["john@email.com"],
29
+ "DATE": ["January 15, 2024", "December 2024", "February 1, 2024"]
30
+ }
31
+
32
+ print("🧪 Testing PII reconstruction with correct numbering")
33
+ print("=" * 60)
34
+ print(f"Original text: {text[:100]}...")
35
+ print()
36
+
37
+ masked_text = reconstruct_masked_text(text, pii_dict)
38
+
39
+ print("🎭 Masked text:")
40
+ print(masked_text)
41
+ print()
42
+
43
+ # Check if numbering is correct (first occurrence should be _1, second _2, etc.)
44
+ print("🔍 Checking numbering order:")
45
+
46
+ # Check FIRSTNAMEs
47
+ john_pos = masked_text.find("[FIRSTNAME_1]")
48
+ alice_pos = masked_text.find("[FIRSTNAME_2]")
49
+ print(f" FIRSTNAME_1 position: {john_pos}")
50
+ print(f" FIRSTNAME_2 position: {alice_pos}")
51
+ print(f" ✅ Correct order: {john_pos < alice_pos}")
52
+
53
+ # Check LASTNAMEs
54
+ smith_pos = masked_text.find("[LASTNAME_1]")
55
+ johnson_pos = masked_text.find("[LASTNAME_2]")
56
+ print(f" LASTNAME_1 position: {smith_pos}")
57
+ print(f" LASTNAME_2 position: {johnson_pos}")
58
+ print(f" ✅ Correct order: {smith_pos < johnson_pos}")
59
+
60
+ # Check CITYs
61
+ springfield_pos = masked_text.find("[CITY_1]")
62
+ hartford_pos = masked_text.find("[CITY_2]")
63
+ print(f" CITY_1 position: {springfield_pos}")
64
+ print(f" CITY_2 position: {hartford_pos}")
65
+ print(f" ✅ Correct order: {springfield_pos < hartford_pos}")
66
+
67
+ # Check DATEs
68
+ date1_pos = masked_text.find("[DATE_1]")
69
+ date2_pos = masked_text.find("[DATE_2]")
70
+ date3_pos = masked_text.find("[DATE_3]")
71
+ print(f" DATE_1 position: {date1_pos}")
72
+ print(f" DATE_2 position: {date2_pos}")
73
+ print(f" DATE_3 position: {date3_pos}")
74
+ print(f" ✅ Correct order: {date1_pos < date2_pos < date3_pos}")
75
+
76
+ if __name__ == "__main__":
77
+ test_reconstruction()