Spaces:
Sleeping
Sleeping
Twin committed on
Commit ·
0c2f645
1
Parent(s): 9ef957a
Fix PII entity numbering order in reconstruction
Browse files
🔧 Problem: Entity flags were numbered in reverse order (X_4, X_3, X_2, X_1)
✅ Solution: Now correctly numbered in appearance order (X_1, X_2, X_3, X_4)
Changes:
- Modified reconstruct_masked_text() in text_processing.py
- First pass: assign numbers in order of appearance
- Second pass: replace in reverse order to maintain text positions
- Added test script to verify correct numbering
This ensures PII entities are numbered intuitively based on their order in the text.
- src/pii_masking/text_processing.py +17 -6
- static/index.html +3 -3
- test_reconstruction.py +77 -0
src/pii_masking/text_processing.py
CHANGED
|
@@ -178,17 +178,28 @@ def reconstruct_masked_text(text: str, pii_dict: Dict[str, List[str]]) -> str:
|
|
| 178 |
Text with PII replaced by [ENTITY_TYPE_X] placeholders
|
| 179 |
"""
|
| 180 |
masked_text = text
|
| 181 |
-
entity_counters = defaultdict(int)
|
| 182 |
|
| 183 |
-
# Get all spans and sort by position (
|
| 184 |
all_spans = json_to_spans(text, pii_dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
all_spans.sort(key=lambda x: x.start, reverse=True)
|
| 186 |
|
| 187 |
-
# Replace each span with placeholder
|
| 188 |
for span in all_spans:
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
# Replace the span in the text
|
| 193 |
masked_text = masked_text[:span.start] + placeholder + masked_text[span.end:]
|
| 194 |
|
|
|
|
| 178 |
Text with PII replaced by [ENTITY_TYPE_X] placeholders
|
| 179 |
"""
|
| 180 |
masked_text = text
|
|
|
|
| 181 |
|
| 182 |
+
# Get all spans and sort by position (normal order for numbering)
|
| 183 |
all_spans = json_to_spans(text, pii_dict)
|
| 184 |
+
all_spans.sort(key=lambda x: x.start)
|
| 185 |
+
|
| 186 |
+
# First pass: assign numbers to spans in order of appearance
|
| 187 |
+
entity_counters = defaultdict(int)
|
| 188 |
+
span_numbers = {}
|
| 189 |
+
|
| 190 |
+
for i, span in enumerate(all_spans):
|
| 191 |
+
entity_counters[span.entity_type] += 1
|
| 192 |
+
# Create a unique key for this span
|
| 193 |
+
span_key = (span.start, span.end, span.entity_type)
|
| 194 |
+
span_numbers[span_key] = entity_counters[span.entity_type]
|
| 195 |
+
|
| 196 |
+
# Second pass: replace spans in reverse order to maintain text positions
|
| 197 |
all_spans.sort(key=lambda x: x.start, reverse=True)
|
| 198 |
|
|
|
|
| 199 |
for span in all_spans:
|
| 200 |
+
span_key = (span.start, span.end, span.entity_type)
|
| 201 |
+
number = span_numbers[span_key]
|
| 202 |
+
placeholder = f"[{span.entity_type}_{number}]"
|
| 203 |
# Replace the span in the text
|
| 204 |
masked_text = masked_text[:span.start] + placeholder + masked_text[span.end:]
|
| 205 |
|
static/index.html
CHANGED
|
@@ -451,7 +451,7 @@
|
|
| 451 |
<div class="container">
|
| 452 |
<div class="header">
|
| 453 |
<h1>🔒 PII Masking Demo</h1>
|
| 454 |
-
<p>Detect and mask Personal Identifiable Information
|
| 455 |
</div>
|
| 456 |
|
| 457 |
<div class="content">
|
|
@@ -460,10 +460,10 @@
|
|
| 460 |
<label>Choose input method:</label>
|
| 461 |
<div class="input-tabs">
|
| 462 |
<button type="button" class="input-tab active" onclick="switchInputMethod('text')">
|
| 463 |
-
|
| 464 |
</button>
|
| 465 |
<button type="button" class="input-tab" onclick="switchInputMethod('pdf')">
|
| 466 |
-
|
| 467 |
</button>
|
| 468 |
</div>
|
| 469 |
|
|
|
|
| 451 |
<div class="container">
|
| 452 |
<div class="header">
|
| 453 |
<h1>🔒 PII Masking Demo</h1>
|
| 454 |
+
<p>Detect and mask Personal Identifiable Information in your documents.</p>
|
| 455 |
</div>
|
| 456 |
|
| 457 |
<div class="content">
|
|
|
|
| 460 |
<label>Choose input method:</label>
|
| 461 |
<div class="input-tabs">
|
| 462 |
<button type="button" class="input-tab active" onclick="switchInputMethod('text')">
|
| 463 |
+
Text Input
|
| 464 |
</button>
|
| 465 |
<button type="button" class="input-tab" onclick="switchInputMethod('pdf')">
|
| 466 |
+
PDF Upload
|
| 467 |
</button>
|
| 468 |
</div>
|
| 469 |
|
test_reconstruction.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to verify that PII reconstruction works with correct numbering.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
# Add the src directory to Python path
|
| 10 |
+
sys.path.insert(0, str(Path(__file__).parent / "src"))
|
| 11 |
+
|
| 12 |
+
from pii_masking.text_processing import reconstruct_masked_text
|
| 13 |
+
|
| 14 |
+
def test_reconstruction():
|
| 15 |
+
"""Test the reconstruction function with multiple entities of the same type."""
|
| 16 |
+
|
| 17 |
+
# Test case with multiple entities of the same type
|
| 18 |
+
text = "John T. Smith lives at 4872 Willow Creek Drive Apartment 14B Springfield, IL 62701. Phone: 555-1234. Email: john@email.com. On January 15, 2024 Ms. Alice A. Johnson from Records Office at 9135 Westfield Parkway, Suite 320 Hartford, CT 06101. Dear Ms. Johnson, I am writing regarding my claim from December 2024. As of February 1, 2024, my address is 4872 Willow Creek Drive, Apartment 14B, Springfield, IL 62701."
|
| 19 |
+
|
| 20 |
+
# Simulate PII entities found in the text
|
| 21 |
+
pii_dict = {
|
| 22 |
+
"FIRSTNAME": ["John", "Alice"],
|
| 23 |
+
"LASTNAME": ["Smith", "Johnson"],
|
| 24 |
+
"CITY": ["Springfield", "Hartford"],
|
| 25 |
+
"STATE": ["IL", "CT"],
|
| 26 |
+
"ZIPCODE": ["62701", "06101"],
|
| 27 |
+
"PHONENUMBER": ["555-1234"],
|
| 28 |
+
"EMAIL": ["john@email.com"],
|
| 29 |
+
"DATE": ["January 15, 2024", "December 2024", "February 1, 2024"]
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
print("🧪 Testing PII reconstruction with correct numbering")
|
| 33 |
+
print("=" * 60)
|
| 34 |
+
print(f"Original text: {text[:100]}...")
|
| 35 |
+
print()
|
| 36 |
+
|
| 37 |
+
masked_text = reconstruct_masked_text(text, pii_dict)
|
| 38 |
+
|
| 39 |
+
print("🎭 Masked text:")
|
| 40 |
+
print(masked_text)
|
| 41 |
+
print()
|
| 42 |
+
|
| 43 |
+
# Check if numbering is correct (first occurrence should be _1, second _2, etc.)
|
| 44 |
+
print("🔍 Checking numbering order:")
|
| 45 |
+
|
| 46 |
+
# Check FIRSTNAMEs
|
| 47 |
+
john_pos = masked_text.find("[FIRSTNAME_1]")
|
| 48 |
+
alice_pos = masked_text.find("[FIRSTNAME_2]")
|
| 49 |
+
print(f" FIRSTNAME_1 position: {john_pos}")
|
| 50 |
+
print(f" FIRSTNAME_2 position: {alice_pos}")
|
| 51 |
+
print(f" ✅ Correct order: {john_pos < alice_pos}")
|
| 52 |
+
|
| 53 |
+
# Check LASTNAMEs
|
| 54 |
+
smith_pos = masked_text.find("[LASTNAME_1]")
|
| 55 |
+
johnson_pos = masked_text.find("[LASTNAME_2]")
|
| 56 |
+
print(f" LASTNAME_1 position: {smith_pos}")
|
| 57 |
+
print(f" LASTNAME_2 position: {johnson_pos}")
|
| 58 |
+
print(f" ✅ Correct order: {smith_pos < johnson_pos}")
|
| 59 |
+
|
| 60 |
+
# Check CITYs
|
| 61 |
+
springfield_pos = masked_text.find("[CITY_1]")
|
| 62 |
+
hartford_pos = masked_text.find("[CITY_2]")
|
| 63 |
+
print(f" CITY_1 position: {springfield_pos}")
|
| 64 |
+
print(f" CITY_2 position: {hartford_pos}")
|
| 65 |
+
print(f" ✅ Correct order: {springfield_pos < hartford_pos}")
|
| 66 |
+
|
| 67 |
+
# Check DATEs
|
| 68 |
+
date1_pos = masked_text.find("[DATE_1]")
|
| 69 |
+
date2_pos = masked_text.find("[DATE_2]")
|
| 70 |
+
date3_pos = masked_text.find("[DATE_3]")
|
| 71 |
+
print(f" DATE_1 position: {date1_pos}")
|
| 72 |
+
print(f" DATE_2 position: {date2_pos}")
|
| 73 |
+
print(f" DATE_3 position: {date3_pos}")
|
| 74 |
+
print(f" ✅ Correct order: {date1_pos < date2_pos < date3_pos}")
|
| 75 |
+
|
| 76 |
+
if __name__ == "__main__":
|
| 77 |
+
test_reconstruction()
|