Spaces:

Distopia22
/

icd-cpt-coding-api-backend

Sleeping

App Files Files Community

Distopia22 committed on Nov 23, 2025

Commit

d8473b6

1 Parent(s): d37f590

Add lightweight regex-based PII removal for file uploads

Browse files

Files changed (6) hide show

Dockerfile +1 -1
requirements.txt +1 -5
src/api/routes.py +5 -8
src/services/file_service.py +13 -15
src/services/pii_detector.py +0 -197
src/services/regex_pii_remover.py +229 -0

Dockerfile CHANGED Viewed

@@ -5,7 +5,7 @@ WORKDIR /app
 # Copy requirements
 COPY requirements.txt .
-# Install dependencies (now includes spaCy model)
 RUN pip install --no-cache-dir --upgrade pip && \
   pip install --no-cache-dir -r requirements.txt

 # Copy requirements
 COPY requirements.txt .
+# Install dependencies
 RUN pip install --no-cache-dir --upgrade pip && \
   pip install --no-cache-dir -r requirements.txt

requirements.txt CHANGED Viewed

@@ -3,8 +3,4 @@ uvicorn==0.24.0
 python-dotenv==1.0.0
 groq==0.11.0
 pydantic==2.5.0
-python-multipart==0.0.6
-presidio-analyzer==2.2.354
-presidio-anonymizer==2.2.354
-spacy==3.7.2
-en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

 python-dotenv==1.0.0
 groq==0.11.0
 pydantic==2.5.0
+python-multipart==0.0.6

src/api/routes.py CHANGED Viewed

@@ -20,7 +20,6 @@ async def analyze_provider_notes(request: ProviderNotesRequest):
     try:
         logger.info("Received coding request")
-        # Get provider notes from request
         provider_notes = request.provider_notes
         if not provider_notes or len(provider_notes.strip()) < 10:
@@ -29,12 +28,10 @@ async def analyze_provider_notes(request: ProviderNotesRequest):
                 detail="Provider notes must be at least 10 characters long"
             )
-        # Process through Groq service
         result = await groq_service.analyze_provider_notes(provider_notes)
         logger.info("Successfully processed coding request")
-        # Return response matching CodingResponse model
         return CodingResponse(
             cpt_codes=result.get("CPT", []),
             cpt_explanation=result.get("CPT_explanation", ""),
@@ -52,7 +49,7 @@ async def analyze_provider_notes(request: ProviderNotesRequest):
         )
-# UPDATED ENDPOINT - File Upload with PII Removal
 @router.post("/upload-file", response_model=FileUploadResponse)
 async def upload_provider_notes_file(file: UploadFile = File(...)):
     """
@@ -60,7 +57,7 @@ async def upload_provider_notes_file(file: UploadFile = File(...)):
     This endpoint:
     1. Extracts text from uploaded TXT file
-    2. Automatically detects and removes patient personal information (PII)
     3. Processes sanitized text through LLM
     4. Returns ICD-10 and CPT codes
@@ -73,10 +70,10 @@ async def upload_provider_notes_file(file: UploadFile = File(...)):
     try:
         logger.info(f"📁 Received file upload request: {file.filename}")
-        # Step 1: Extract text from file with automatic PII removal
         extraction_result = await file_service.extract_text_from_file(
             file=file,
-            remove_pii=True  # Always remove PII for safety
         )
         extracted_text = extraction_result["text"]
@@ -87,7 +84,7 @@ async def upload_provider_notes_file(file: UploadFile = File(...)):
         logger.info(f"✅ Extracted {text_length} characters from {filename}")
         if pii_info["pii_removed"]:
-            logger.info(f"🔒 Removed {pii_info['pii_count']} PII entities before processing")
         # Step 2: Process sanitized text through Groq LLM
         coding_result = await groq_service.analyze_provider_notes(extracted_text)

     try:
         logger.info("Received coding request")
         provider_notes = request.provider_notes
         if not provider_notes or len(provider_notes.strip()) < 10:
                 detail="Provider notes must be at least 10 characters long"
             )
         result = await groq_service.analyze_provider_notes(provider_notes)
         logger.info("Successfully processed coding request")
         return CodingResponse(
             cpt_codes=result.get("CPT", []),
             cpt_explanation=result.get("CPT_explanation", ""),
         )
+# FILE UPLOAD ENDPOINT WITH REGEX-BASED PII REMOVAL
 @router.post("/upload-file", response_model=FileUploadResponse)
 async def upload_provider_notes_file(file: UploadFile = File(...)):
     """
     This endpoint:
     1. Extracts text from uploaded TXT file
+    2. Automatically detects and removes patient personal information using regex patterns
     3. Processes sanitized text through LLM
     4. Returns ICD-10 and CPT codes
     try:
         logger.info(f"📁 Received file upload request: {file.filename}")
+        # Step 1: Extract text from file with automatic regex-based PII removal
         extraction_result = await file_service.extract_text_from_file(
             file=file,
+            remove_pii=True  # Always remove PII using regex patterns
         )
         extracted_text = extraction_result["text"]
         logger.info(f"✅ Extracted {text_length} characters from {filename}")
         if pii_info["pii_removed"]:
+            logger.info(f"🔒 Removed {pii_info['pii_count']} PII entities using regex before processing")
         # Step 2: Process sanitized text through Groq LLM
         coding_result = await groq_service.analyze_provider_notes(extracted_text)

src/services/file_service.py CHANGED Viewed

@@ -2,13 +2,13 @@ from fastapi import UploadFile, HTTPException
 import os
 from typing import Dict
 import logging
-from services.pii_detector import pii_detector
 logger = logging.getLogger(__name__)
 class FileService:
-    """Service to handle file uploads and text extraction"""
     ALLOWED_EXTENSIONS = {'.txt'}
     MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
@@ -24,11 +24,9 @@ class FileService:
         Raises:
             HTTPException: If file is invalid
         """
-        # Check if file exists
         if not file:
             raise HTTPException(status_code=400, detail="No file provided")
-        # Check file extension
         file_ext = os.path.splitext(file.filename)[1].lower()
         if file_ext not in FileService.ALLOWED_EXTENSIONS:
             raise HTTPException(
@@ -39,7 +37,7 @@ class FileService:
     @staticmethod
     async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, any]:
         """
-        Extract text content from uploaded file and optionally remove PII
         Args:
             file: Uploaded file object
@@ -88,9 +86,9 @@ class FileService:
                     detail="Extracted text is too short. Please provide more detailed provider notes"
                 )
-            logger.info(f"✅ Successfully extracted {len(text)} characters from {file.filename}")
-            # Remove PII if requested
             pii_info = {
                 "pii_removed": False,
                 "pii_count": 0,
@@ -98,18 +96,18 @@ class FileService:
             }
             if remove_pii:
-                logger.info("🔒 Removing PII from extracted text...")
-                pii_result = pii_detector.remove_pii(text)
-                text = pii_result["sanitized_text"]
                 pii_info = {
-                    "pii_removed": pii_result["was_pii_removed"],
-                    "pii_count": pii_result["pii_count"],
-                    "pii_details": pii_result["pii_detected"]
                 }
-                if pii_result["was_pii_removed"]:
-                    logger.info(f"✅ Removed {pii_result['pii_count']} PII entities")
                 else:
                     logger.info("✅ No PII detected in text")

 import os
 from typing import Dict
 import logging
+from services.regex_pii_remover import regex_pii_remover
 logger = logging.getLogger(__name__)
 class FileService:
+    """Service to handle file uploads and text extraction with PII removal"""
     ALLOWED_EXTENSIONS = {'.txt'}
     MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
         Raises:
             HTTPException: If file is invalid
         """
         if not file:
             raise HTTPException(status_code=400, detail="No file provided")
         file_ext = os.path.splitext(file.filename)[1].lower()
         if file_ext not in FileService.ALLOWED_EXTENSIONS:
             raise HTTPException(
     @staticmethod
     async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, any]:
         """
+        Extract text content from uploaded file and optionally remove PII using regex
         Args:
             file: Uploaded file object
                     detail="Extracted text is too short. Please provide more detailed provider notes"
                 )
+            logger.info(f"📄 Successfully extracted {len(text)} characters from {file.filename}")
+            # Remove PII using regex if requested
             pii_info = {
                 "pii_removed": False,
                 "pii_count": 0,
             }
             if remove_pii:
+                logger.info("🔒 Removing PII from extracted text using regex patterns...")
+                pii_result = regex_pii_remover.sanitize_provider_notes(text)
+                text = pii_result["sanitized_notes"]
                 pii_info = {
+                    "pii_removed": pii_result["was_pii_found"],
+                    "pii_count": pii_result["pii_removed_count"],
+                    "pii_details": pii_result["pii_details"]
                 }
+                if pii_result["was_pii_found"]:
+                    logger.info(f"✅ Removed {pii_result['pii_removed_count']} PII entities using regex")
                 else:
                     logger.info("✅ No PII detected in text")

src/services/pii_detector.py DELETED Viewed

@@ -1,197 +0,0 @@
-from presidio_analyzer import AnalyzerEngine
-from presidio_anonymizer import AnonymizerEngine
-from typing import Dict, List
-import re
-import logging
-logger = logging.getLogger(__name__)
-class PIIDetector:
-    """Service to detect and remove Personal Identifiable Information from medical notes"""
-    def __init__(self):
-        """Initialize PII detection engines"""
-        try:
-            self.analyzer = AnalyzerEngine()
-            self.anonymizer = AnonymizerEngine()
-            # Entities to detect (common in medical notes)
-            self.entities_to_detect = [
-                "PERSON",              # Names
-                "EMAIL_ADDRESS",       # Email
-                "PHONE_NUMBER",        # Phone numbers
-                "US_SSN",             # Social Security Number
-                "CREDIT_CARD",        # Credit card numbers
-                "US_DRIVER_LICENSE",  # Driver's license
-                "LOCATION",           # Addresses, cities
-                "DATE_TIME",          # Birth dates, appointment dates
-                "US_PASSPORT",        # Passport numbers
-                "MEDICAL_LICENSE",    # Medical license numbers
-                "IP_ADDRESS",         # IP addresses
-                "URL"                 # URLs
-            ]
-            logger.info("✅ PII Detector initialized successfully")
-        except Exception as e:
-            logger.error(f"❌ Failed to initialize PII Detector: {str(e)}")
-            raise
-    def detect_pii(self, text: str) -> List[Dict]:
-        """
-        Detect PII entities in text
-        Args:
-            text: Input text to analyze
-        Returns:
-            List of detected PII entities with details
-        """
-        try:
-            results = self.analyzer.analyze(
-                text=text,
-                entities=self.entities_to_detect,
-                language='en'
-            )
-            pii_findings = []
-            for result in results:
-                pii_findings.append({
-                    "entity_type": result.entity_type,
-                    "start": result.start,
-                    "end": result.end,
-                    "score": result.score,
-                    "text": text[result.start:result.end]
-                })
-            logger.info(f"🔍 Detected {len(pii_findings)} PII entities")
-            return pii_findings
-        except Exception as e:
-            logger.error(f"❌ Error detecting PII: {str(e)}")
-            return []
-    def remove_pii(self, text: str) -> Dict[str, any]:
-        """
-        Remove PII from text while preserving medical information
-        Args:
-            text: Input text containing potential PII
-        Returns:
-            Dictionary with sanitized text and PII removal report
-        """
-        try:
-            # Step 1: Detect PII
-            analyzer_results = self.analyzer.analyze(
-                text=text,
-                entities=self.entities_to_detect,
-                language='en'
-            )
-            if not analyzer_results:
-                logger.info("✅ No PII detected in text")
-                return {
-                    "sanitized_text": text,
-                    "pii_detected": [],
-                    "pii_count": 0,
-                    "was_pii_removed": False
-                }
-            # Step 2: Anonymize detected PII
-            anonymized_result = self.anonymizer.anonymize(
-                text=text,
-                analyzer_results=analyzer_results
-            )
-            sanitized_text = anonymized_result.text
-            # Step 3: Additional pattern-based cleaning for medical notes
-            # Replace common medical note PII patterns
-            sanitized_text = self._clean_medical_patterns(sanitized_text)
-            # Step 4: Collect PII detection details
-            pii_detected = []
-            for result in analyzer_results:
-                pii_detected.append({
-                    "entity_type": result.entity_type,
-                    "start": result.start,
-                    "end": result.end,
-                    "score": result.score
-                })
-            logger.info(f"✅ Removed {len(pii_detected)} PII entities from text")
-            return {
-                "sanitized_text": sanitized_text,
-                "pii_detected": pii_detected,
-                "pii_count": len(pii_detected),
-                "was_pii_removed": True
-            }
-        except Exception as e:
-            logger.error(f"❌ Error removing PII: {str(e)}")
-            # Return original text if PII removal fails
-            return {
-                "sanitized_text": text,
-                "pii_detected": [],
-                "pii_count": 0,
-                "was_pii_removed": False,
-                "error": str(e)
-            }
-    def _clean_medical_patterns(self, text: str) -> str:
-        """
-        Clean common medical note PII patterns that might be missed
-        Args:
-            text: Text to clean
-        Returns:
-            Cleaned text
-        """
-        # Pattern 1: "Patient: <NAME>" or "Pt: <NAME>"
-        text = re.sub(
-            r'(Patient|Pt|Patient Name):\s*<[A-Z_]+>',
-            r'\1: [REDACTED]',
-            text,
-            flags=re.IGNORECASE
-        )
-        # Pattern 2: "DOB: <DATE>"
-        text = re.sub(
-            r'(DOB|Date of Birth|Birth Date):\s*<[A-Z_]+>',
-            r'\1: [REDACTED]',
-            text,
-            flags=re.IGNORECASE
-        )
-        # Pattern 3: "Address: <LOCATION>"
-        text = re.sub(
-            r'(Address|Addr|Home Address):\s*<[A-Z_]+>',
-            r'\1: [REDACTED]',
-            text,
-            flags=re.IGNORECASE
-        )
-        # Pattern 4: "Phone: <PHONE_NUMBER>"
-        text = re.sub(
-            r'(Phone|Tel|Telephone|Cell|Mobile):\s*<[A-Z_]+>',
-            r'\1: [REDACTED]',
-            text,
-            flags=re.IGNORECASE
-        )
-        # Pattern 5: "MRN: <NUMBER>" (Medical Record Number)
-        text = re.sub(
-            r'(MRN|Medical Record Number|Record #):\s*<[A-Z_]+>',
-            r'\1: [REDACTED]',
-            text,
-            flags=re.IGNORECASE
-        )
-        return text
-# Singleton instance
-pii_detector = PIIDetector()

src/services/regex_pii_remover.py ADDED Viewed

	@@ -0,0 +1,229 @@

import logging
import re
from typing import Any, Dict, List, Tuple
+logger = logging.getLogger(__name__)
class RegexPIIRemover:
    """
    Lightweight regex-based PII detection and removal service.

    Two pattern tables are kept on the instance:

    * ``medical_patterns`` - label-anchored patterns ("Patient: ...",
      "MRN: ..."). The label is matched case-insensitively via a scoped
      ``(?i:...)`` group; the value part stays case-sensitive so that
      ordinary lowercase clinical text after a label is not swallowed.
    * ``patterns`` - free-standing patterns (phone, email, SSN, ...).

    Each entry maps an entity type to a dict with the keys ``pattern``,
    ``replacement`` and ``description``.
    """

    def __init__(self):
        """Initialize regex patterns for PII detection"""
        # A personal name as it appears after a label: a capitalised word of
        # two or more characters, optionally followed by further capitalised
        # words or initials on the SAME line ("Smith, John", "John Q. Public",
        # "JOHN SMITH", "O'Brien").
        person_name = r"[A-Z][A-Za-z'\-]+(?:,?[ \t]+[A-Z][A-Za-z'\-]*\.?)*"

        # General patterns. Order matters: STREET_ADDRESS must run before
        # ZIP_CODE, otherwise a 5-digit house number is consumed as a ZIP code
        # and the street name is left behind.
        self.patterns = {
            'PHONE': {
                # Accepts "(555) 123-4567", "555.123.4567", "+1-555-123-4567".
                # (?<!\w) instead of \b so a leading "(" or "+" is included.
                'pattern': r'(?<!\w)(?:\+?1[-. ]?)?\(?[0-9]{3}\)?[-. ]?[0-9]{3}[-. ]?[0-9]{4}\b',
                'replacement': '[PHONE_REDACTED]',
                'description': 'Phone numbers'
            },
            'EMAIL': {
                'pattern': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
                'replacement': '[EMAIL_REDACTED]',
                'description': 'Email addresses'
            },
            'SSN': {
                'pattern': r'\b\d{3}-\d{2}-\d{4}\b',
                'replacement': '[SSN_REDACTED]',
                'description': 'Social Security Numbers'
            },
            'DATE_OF_BIRTH': {
                'pattern': r'\b(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:19|20)\d{2}\b',
                'replacement': '[DOB_REDACTED]',
                'description': 'Dates of birth'
            },
            'STREET_ADDRESS': {
                # Words are separated by spaces/tabs only so a match cannot
                # run across a line break.
                'pattern': r'\b\d{1,5}[ \t]+(?:[A-Z][a-z]+[ \t]+){1,3}(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Way)\b',
                'replacement': '[ADDRESS_REDACTED]',
                'description': 'Street addresses'
            },
            'ZIP_CODE': {
                # NOTE(review): this matches ANY standalone 5-digit number,
                # which includes CPT codes written in the notes - confirm
                # that redacting those is intended.
                'pattern': r'\b\d{5}(?:-\d{4})?\b',
                'replacement': '[ZIP_REDACTED]',
                'description': 'ZIP codes'
            },
            'CREDIT_CARD': {
                'pattern': r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
                'replacement': '[CARD_REDACTED]',
                'description': 'Credit card numbers'
            },
            'IP_ADDRESS': {
                'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
                'replacement': '[IP_REDACTED]',
                'description': 'IP addresses'
            }
        }

        # Medical note specific patterns. Every label is anchored with \b so
        # that e.g. "ID:" does not fire inside "Fluid:" or "Thyroid:", and
        # group 1 captures the label so the replacement can keep it as written.
        self.medical_patterns = {
            'PATIENT_NAME_LABEL': {
                'pattern': r'\b((?i:Patient Name|Patient|Pt|Name)):\s*' + person_name,
                'replacement': r'\1: [NAME_REDACTED]',
                'description': 'Patient names after labels'
            },
            'DOB_LABEL': {
                # Value must start and end with a digit so trailing sentence
                # punctuation is left alone.
                'pattern': r'\b((?i:Date of Birth|Birth Date|Birthdate|DOB)):\s*\d(?:[\d/.\-]*\d)?',
                'replacement': r'\1: [DOB_REDACTED]',
                'description': 'DOB after labels'
            },
            'PHONE_LABEL': {
                # Value must start with a digit or "(" and end with a digit:
                # it can never match pure whitespace or an existing
                # placeholder, and it does not eat the following line break.
                'pattern': r'\b((?i:Telephone|Phone|Tel|Cell|Mobile|Contact)):\s*\+?[\d(](?:[\d ().\-]*\d)?',
                'replacement': r'\1: [PHONE_REDACTED]',
                'description': 'Phone numbers after labels'
            },
            'ADDRESS_LABEL': {
                'pattern': r'\b((?i:Home Address|Mailing Address|Address|Addr)):\s*[^\r\n]+',
                'replacement': r'\1: [ADDRESS_REDACTED]',
                'description': 'Addresses after labels'
            },
            'MRN_LABEL': {
                'pattern': r'\b((?i:Medical Record Number|Record #|Patient ID|MRN|ID)):\s*[\w\-]+',
                'replacement': r'\1: [MRN_REDACTED]',
                'description': 'Medical record numbers'
            },
            'GUARDIAN_INFO': {
                'pattern': r'\b((?i:Emergency Contact|Next of Kin|Guardian)):\s*' + person_name,
                'replacement': r'\1: [CONTACT_REDACTED]',
                'description': 'Guardian/emergency contact names'
            }
        }
        logger.info("✅ Regex PII Remover initialized with pattern-based detection")

    def detect_pii(self, text: str) -> List[Dict]:
        """
        Detect PII entities in text using regex patterns

        The text is not modified. A span may be reported twice when both a
        general and a labelled pattern cover it (e.g. "DOB: 01/02/1980").

        Args:
            text: Input text to analyze
        Returns:
            List of detected PII entities with details (entity_type, text,
            start, end, description); general patterns first, then
            medical-specific ones.
        """
        findings = []
        for group in (self.patterns, self.medical_patterns):
            for entity_type, config in group.items():
                findings.extend(
                    {
                        'entity_type': entity_type,
                        'text': match.group(),
                        'start': match.start(),
                        'end': match.end(),
                        'description': config['description']
                    }
                    for match in re.finditer(config['pattern'], text)
                )
        logger.info(f"🔍 Detected {len(findings)} PII entities using regex patterns")
        return findings

    def _apply_group(self, text: str, group: Dict[str, Dict[str, str]]) -> Tuple[str, List[Dict[str, Any]]]:
        """Apply every pattern in *group* to *text*; return (new_text, per-type report)."""
        details = []
        for entity_type, config in group.items():
            # subn replaces and counts in a single pass over the text.
            text, count = re.subn(config['pattern'], config['replacement'], text)
            if count:
                details.append({
                    'type': entity_type,
                    'count': count,
                    'description': config['description']
                })
                logger.info(f"  🔒 Removed {count} {config['description']}")
        return text, details

    def remove_pii(self, text: str) -> Dict[str, Any]:
        """
        Remove PII from text using regex patterns

        Labelled (medical) patterns run first, then the general patterns.
        Running them in this order means a labelled value is replaced once
        with the label-specific placeholder; the placeholders contain no
        digits or "@", so the general pass cannot re-match them.

        Args:
            text: Input text containing potential PII
        Returns:
            Dictionary with the keys sanitized_text, original_text,
            was_pii_removed, pii_count and pii_detected (list of
            {type, count, description}). On failure the original text is
            returned unchanged together with an extra 'error' key.
        """
        try:
            sanitized_text, medical_details = self._apply_group(text, self.medical_patterns)
            sanitized_text, general_details = self._apply_group(sanitized_text, self.patterns)
            replacement_details = medical_details + general_details
            total_replacements = sum(item['count'] for item in replacement_details)

            was_pii_removed = sanitized_text != text
            if was_pii_removed:
                logger.info(f"✅ Total PII removals: {total_replacements} entities")
            else:
                logger.info("✅ No PII detected in text")
            return {
                'sanitized_text': sanitized_text,
                'original_text': text,
                'was_pii_removed': was_pii_removed,
                'pii_count': total_replacements,
                'pii_detected': replacement_details
            }
        except Exception as e:
            # NOTE(review): this is a fail-open fallback - the UNSANITIZED
            # text is handed back. Kept for backward compatibility; callers
            # should inspect the 'error' key before forwarding the text.
            logger.error(f"❌ Error removing PII: {str(e)}")
            return {
                'sanitized_text': text,
                'original_text': text,
                'was_pii_removed': False,
                'pii_count': 0,
                'pii_detected': [],
                'error': str(e)
            }

    def sanitize_provider_notes(self, notes: str) -> Dict[str, Any]:
        """
        Sanitize provider notes by removing all PII
        Main entry point for file processing

        Args:
            notes: Provider notes text
        Returns:
            Dictionary with the keys sanitized_notes, pii_removed_count,
            pii_details and was_pii_found. An 'error' key is added only when
            removal failed and the notes were returned unsanitized.
        """
        logger.info("🔒 Starting PII sanitization of provider notes...")
        result = self.remove_pii(notes)
        report = {
            'sanitized_notes': result['sanitized_text'],
            'pii_removed_count': result['pii_count'],
            'pii_details': result['pii_detected'],
            'was_pii_found': result['was_pii_removed']
        }
        # Surface a removal failure instead of silently dropping it.
        if 'error' in result:
            report['error'] = result['error']
        return report
+# Singleton instance
+regex_pii_remover = RegexPIIRemover()