Spaces:

Distopia22
/

icd-cpt-coding-api-backend

Sleeping

App Files Files Community

Distopia22 committed on Nov 24, 2025

Commit

764e30e

1 Parent(s): 5574ac6

Fix: Add remove_pii method to FileService

Browse files

Files changed (3) hide show

src/api/routes.py +13 -5
src/services/file_service.py +17 -108
src/services/regex_pii_remover.py +46 -204

src/api/routes.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import logging
 from fastapi import APIRouter, HTTPException, UploadFile, File
-from models.request_models import ProviderNotesRequest, FileUploadResponse
-from models.response_models import CodingResponse
 from services.groq_service import groq_service
 from services.file_service import file_service
@@ -74,9 +74,14 @@ async def upload_provider_notes_file(file: UploadFile = File(...)):
         logger.info(f"📄 File read successfully (length: {len(text)})")
         # Remove PII
-        cleaned_text, pii_count = file_service.remove_pii(text)
-        logger.info(f"🔒 PII removal complete: {pii_count} entities removed")
         # Analyze with Groq
         result = groq_service.analyze_provider_notes(cleaned_text)
@@ -99,6 +104,9 @@ async def upload_provider_notes_file(file: UploadFile = File(...)):
     except HTTPException:
         raise
     except Exception as e:
         logger.error(f"❌ Error processing uploaded file: {str(e)}", exc_info=True)
         raise HTTPException(status_code=500, detail=f"Error processing uploaded file: {str(e)}")

 import logging
 from fastapi import APIRouter, HTTPException, UploadFile, File
+from models.request_models import ProviderNotesRequest
+from models.response_models import CodingResponse, FileUploadResponse
 from services.groq_service import groq_service
 from services.file_service import file_service
         logger.info(f"📄 File read successfully (length: {len(text)})")
         # Remove PII
+        try:
+            cleaned_text, pii_count = file_service.remove_pii(text)
+            logger.info(f"🔒 PII removal complete: {pii_count} entities removed")
+        except Exception as pii_error:
+            logger.error(f"⚠️ PII removal failed: {str(pii_error)}")
+            # Continue without PII removal if it fails
+            cleaned_text = text
+            pii_count = 0
         # Analyze with Groq
         result = groq_service.analyze_provider_notes(cleaned_text)
     except HTTPException:
         raise
+    except UnicodeDecodeError:
+        logger.error("❌ File encoding error")
+        raise HTTPException(status_code=400, detail="File must be UTF-8 encoded text")
     except Exception as e:
         logger.error(f"❌ Error processing uploaded file: {str(e)}", exc_info=True)
         raise HTTPException(status_code=500, detail=f"Error processing uploaded file: {str(e)}")

src/services/file_service.py CHANGED Viewed

@@ -2,132 +2,41 @@ from fastapi import UploadFile, HTTPException
 import os
 from typing import Dict
 import logging
-from services.regex_pii_remover import regex_pii_remover
 logger = logging.getLogger(__name__)
 class FileService:
-    """Service to handle file uploads and text extraction with PII removal"""
-    ALLOWED_EXTENSIONS = {'.txt'}
-    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
-    @staticmethod
-    def validate_file(file: UploadFile) -> None:
-        """
-        Validate uploaded file
-        Args:
-            file: Uploaded file object
-        Raises:
-            HTTPException: If file is invalid
-        """
-        if not file:
-            raise HTTPException(status_code=400, detail="No file provided")
-        file_ext = os.path.splitext(file.filename)[1].lower()
-        if file_ext not in FileService.ALLOWED_EXTENSIONS:
-            raise HTTPException(
-                status_code=400,
-                detail=f"Invalid file type. Only {', '.join(FileService.ALLOWED_EXTENSIONS)} files are allowed"
-            )
-    @staticmethod
-    async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, any]:
         """
-        Extract text content from uploaded file and optionally remove PII using regex
         Args:
-            file: Uploaded file object
-            remove_pii: Whether to remove PII from extracted text (default: True)
         Returns:
-            Dictionary containing extracted text, PII removal info, and metadata
         """
         try:
-            # Validate file
-            FileService.validate_file(file)
-            # Read file content
-            content = await file.read()
-            # Check file size
-            file_size = len(content)
-            if file_size > FileService.MAX_FILE_SIZE:
-                raise HTTPException(
-                    status_code=400,
-                    detail=f"File too large. Maximum size is {FileService.MAX_FILE_SIZE / (1024*1024)} MB"
-                )
-            # Decode text
-            try:
-                text = content.decode('utf-8')
-            except UnicodeDecodeError:
-                try:
-                    text = content.decode('latin-1')
-                except Exception as e:
-                    raise HTTPException(
-                        status_code=400,
-                        detail="Unable to decode file. Please ensure it's a valid text file"
-                    )
-            # Validate extracted text
-            if not text.strip():
-                raise HTTPException(
-                    status_code=400,
-                    detail="File is empty or contains no readable text"
-                )
-            if len(text.strip()) < 10:
-                raise HTTPException(
-                    status_code=400,
-                    detail="Extracted text is too short. Please provide more detailed provider notes"
-                )
-            logger.info(f"Successfully extracted {len(text)} characters from {file.filename}")
-            # Remove PII using regex if requested
-            pii_info = {
-                "pii_removed": False,
-                "pii_count": 0,
-                "pii_details": []
-            }
-            if remove_pii:
-                logger.info("Removing PII from extracted text using regex patterns...")
-                pii_result = regex_pii_remover.sanitize_provider_notes(text)
-                text = pii_result["sanitized_notes"]
-                pii_info = {
-                    "pii_removed": pii_result["was_pii_found"],
-                    "pii_count": pii_result["pii_removed_count"],
-                    "pii_details": pii_result["pii_details"]
-                }
-                if pii_result["was_pii_found"]:
-                    logger.info(f"Removed {pii_result['pii_removed_count']} PII entities using regex")
-                else:
-                    logger.info("No PII detected in text")
-            return {
-                "text": text,
-                "filename": file.filename,
-                "file_size": file_size,
-                "text_length": len(text),
-                "pii_info": pii_info
-            }
-        except HTTPException:
-            raise
         except Exception as e:
-            logger.error(f"Error extracting text from file: {str(e)}")
-            raise HTTPException(
-                status_code=500,
-                detail=f"Error processing file: {str(e)}"
-            )
-# Singleton instance
 file_service = FileService()

 import os
 from typing import Dict
 import logging
+from services.regex_pii_remover import RegexPIIRemover
 logger = logging.getLogger(__name__)


 class FileService:
     """Thin service wrapper that redacts PII from provider-note text.

     The actual pattern matching is delegated to ``self.pii_remover``, which
     is expected to expose ``remove_pii(text) -> (cleaned_text, count)``.
     """

     def __init__(self):
         """Initialize file service with PII remover"""
         self.pii_remover = RegexPIIRemover()
         logger.info("✅ FileService initialized")

     def remove_pii(self, text: str, *, fail_open: bool = True) -> tuple[str, int]:
         """
         Remove PII from text using regex patterns

         Args:
             text: Input text containing potential PII
             fail_open: When True (the default, matching the historical
                 behavior) a failure inside the remover is logged and the
                 ORIGINAL, unredacted text is returned with a count of 0.
                 Pass False to re-raise instead, so a caller can refuse to
                 forward unredacted notes to a downstream service.

         Returns:
             tuple: (cleaned_text, pii_count)

         Raises:
             Exception: whatever the remover raised, only when
                 ``fail_open`` is False.
         """
         # Nothing to scan: hand the input back untouched rather than running
         # the remover (this also covers a None passed in by mistake).
         if not text:
             return text, 0

         try:
             logger.info("🔒 Starting PII removal (text length: %d)", len(text))
             cleaned_text, pii_count = self.pii_remover.remove_pii(text)
         except Exception as e:
             # exc_info keeps the traceback; the message alone is rarely
             # enough to diagnose a failure in the remover.
             logger.error("❌ Error during PII removal: %s", e, exc_info=True)
             if not fail_open:
                 raise
             # Return original text if PII removal fails
             return text, 0

         logger.info("✅ PII removal complete: %d entities removed", pii_count)
         return cleaned_text, pii_count
+# Global instance
 file_service = FileService()

src/services/regex_pii_remover.py CHANGED Viewed

@@ -1,229 +1,71 @@
 import re
 import logging
-from typing import Dict, List, Tuple
 logger = logging.getLogger(__name__)
 class RegexPIIRemover:
-    """
-    Lightweight regex-based PII detection and removal service
-    Detects and removes common personal information from medical notes
-    """
     def __init__(self):
-        """Initialize regex patterns for PII detection"""
-        # Pattern definitions with descriptions
         self.patterns = {
-            'PHONE': {
-                'pattern': r'\b(?:\+?1[-.]?)?\(?([0-9]{3})\)?[-.]?([0-9]{3})[-.]?([0-9]{4})\b',
-                'replacement': '[PHONE_REDACTED]',
-                'description': 'Phone numbers'
-            },
-            'EMAIL': {
-                'pattern': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
-                'replacement': '[EMAIL_REDACTED]',
-                'description': 'Email addresses'
-            },
-            'SSN': {
-                'pattern': r'\b\d{3}-\d{2}-\d{4}\b',
-                'replacement': '[SSN_REDACTED]',
-                'description': 'Social Security Numbers'
-            },
-            'DATE_OF_BIRTH': {
-                'pattern': r'\b(0?[1-9]|1[0-2])[/-](0?[1-9]|[12][0-9]|3[01])[/-](19|20)\d{2}\b',
-                'replacement': '[DOB_REDACTED]',
-                'description': 'Dates of birth'
-            },
-            'ZIP_CODE': {
-                'pattern': r'\b\d{5}(?:-\d{4})?\b',
-                'replacement': '[ZIP_REDACTED]',
-                'description': 'ZIP codes'
-            },
-            'CREDIT_CARD': {
-                'pattern': r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
-                'replacement': '[CARD_REDACTED]',
-                'description': 'Credit card numbers'
-            },
-            'IP_ADDRESS': {
-                'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
-                'replacement': '[IP_REDACTED]',
-                'description': 'IP addresses'
-            },
-            'STREET_ADDRESS': {
-                'pattern': r'\b\d{1,5}\s+([A-Z][a-z]+\s*){1,3}(Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Way)\b',
-                'replacement': '[ADDRESS_REDACTED]',
-                'description': 'Street addresses'
-            }
-        }
-        # Medical note specific patterns
-        self.medical_patterns = {
-            'PATIENT_NAME_LABEL': {
-                'pattern': r'(Patient|Pt|Patient Name|Name):\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
-                'replacement': r'\1: [NAME_REDACTED]',
-                'description': 'Patient names after labels'
-            },
-            'DOB_LABEL': {
-                'pattern': r'(DOB|Date of Birth|Birth Date|Birthdate):\s*[\d/\-]+',
-                'replacement': r'\1: [DOB_REDACTED]',
-                'description': 'DOB after labels'
-            },
-            'PHONE_LABEL': {
-                'pattern': r'(Phone|Tel|Telephone|Cell|Mobile|Contact):\s*[\d\s\-\(\)\.]+',
-                'replacement': r'\1: [PHONE_REDACTED]',
-                'description': 'Phone numbers after labels'
-            },
-            'ADDRESS_LABEL': {
-                'pattern': r'(Address|Addr|Home Address|Mailing Address):\s*[^\n]+',
-                'replacement': r'\1: [ADDRESS_REDACTED]',
-                'description': 'Addresses after labels'
-            },
-            'MRN_LABEL': {
-                'pattern': r'(MRN|Medical Record Number|Record #|Patient ID|ID):\s*[\w\d\-]+',
-                'replacement': r'\1: [MRN_REDACTED]',
-                'description': 'Medical record numbers'
-            },
-            'GUARDIAN_INFO': {
-                'pattern': r'(Guardian|Emergency Contact|Next of Kin):\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*',
-                'replacement': r'\1: [CONTACT_REDACTED]',
-                'description': 'Guardian/emergency contact names'
-            }
-        }
-        logger.info("Regex PII Remover initialized with pattern-based detection")
-    def detect_pii(self, text: str) -> List[Dict]:
-        """
-        Detect PII entities in text using regex patterns
-        Args:
-            text: Input text to analyze
-        Returns:
-            List of detected PII entities with details
-        """
-        findings = []
-        # Check general patterns
-        for entity_type, config in self.patterns.items():
-            matches = re.finditer(config['pattern'], text)
-            for match in matches:
-                findings.append({
-                    'entity_type': entity_type,
-                    'text': match.group(),
-                    'start': match.start(),
-                    'end': match.end(),
-                    'description': config['description']
-                })
-        # Check medical-specific patterns
-        for entity_type, config in self.medical_patterns.items():
-            matches = re.finditer(config['pattern'], text, re.IGNORECASE)
-            for match in matches:
-                findings.append({
-                    'entity_type': entity_type,
-                    'text': match.group(),
-                    'start': match.start(),
-                    'end': match.end(),
-                    'description': config['description']
-                })
-        logger.info(f"Detected {len(findings)} PII entities using regex patterns")
-        return findings
-    def remove_pii(self, text: str) -> Dict[str, any]:
-        """
-        Remove PII from text using regex patterns
-        Args:
-            text: Input text containing potential PII
-        Returns:
-            Dictionary with sanitized text and PII removal report
-        """
-        try:
-            original_text = text
-            sanitized_text = text
-            total_replacements = 0
-            replacement_details = []
-            # Apply general PII patterns
-            for entity_type, config in self.patterns.items():
-                matches = list(re.finditer(config['pattern'], sanitized_text))
-                if matches:
-                    count = len(matches)
-                    total_replacements += count
-                    replacement_details.append({
-                        'type': entity_type,
-                        'count': count,
-                        'description': config['description']
-                    })
-                    sanitized_text = re.sub(config['pattern'], config['replacement'], sanitized_text)
-                    logger.info(f"Removed {count} {config['description']}")
-            # Apply medical-specific patterns
-            for entity_type, config in self.medical_patterns.items():
-                matches = list(re.finditer(config['pattern'], sanitized_text, re.IGNORECASE))
-                if matches:
-                    count = len(matches)
-                    total_replacements += count
-                    replacement_details.append({
-                        'type': entity_type,
-                        'count': count,
-                        'description': config['description']
-                    })
-                    sanitized_text = re.sub(config['pattern'], config['replacement'], sanitized_text, flags=re.IGNORECASE)
-                    logger.info(f"Removed {count} {config['description']}")
-            was_pii_removed = sanitized_text != original_text
-            if was_pii_removed:
-                logger.info(f"Total PII removals: {total_replacements} entities")
-            else:
-                logger.info("No PII detected in text")
-            return {
-                'sanitized_text': sanitized_text,
-                'original_text': original_text,
-                'was_pii_removed': was_pii_removed,
-                'pii_count': total_replacements,
-                'pii_detected': replacement_details
-            }
-        except Exception as e:
-            logger.error(f"Error removing PII: {str(e)}")
-            return {
-                'sanitized_text': text,
-                'original_text': text,
-                'was_pii_removed': False,
-                'pii_count': 0,
-                'pii_detected': [],
-                'error': str(e)
-            }
-    def sanitize_provider_notes(self, notes: str) -> Dict[str, any]:
         """
-        Sanitize provider notes by removing all PII
-        Main entry point for file processing
         Args:
-            notes: Provider notes text
         Returns:
-            Dictionary with sanitized notes and PII removal report
         """
-        logger.info("Starting PII sanitization of provider notes...")
-        result = self.remove_pii(notes)
-        return {
-            'sanitized_notes': result['sanitized_text'],
-            'pii_removed_count': result['pii_count'],
-            'pii_details': result['pii_detected'],
-            'was_pii_found': result['was_pii_removed']
-        }
-# Singleton instance
-regex_pii_remover = RegexPIIRemover()

 import re
 import logging
 logger = logging.getLogger(__name__)


 class RegexPIIRemover:
     """Remove PII from free-text notes using regex patterns.

     ``self.patterns`` maps a PII type name to a compiled pattern. Patterns
     are applied in insertion order. Label-anchored patterns (patient name,
     DOB, MRN) are listed first so that the label is preserved and only the
     value after it is replaced; the generic patterns then sweep up anything
     that is left.
     """

     # Types whose first capture group is a label that must survive redaction.
     _LABELLED_TYPES = frozenset({'patient_name', 'dob', 'mrn'})
     _LABELLED_REPLACEMENT = r'\1: [REDACTED]'
     _PLAIN_REPLACEMENT = '[REDACTED]'

     def __init__(self):
         """Initialize PII removal patterns"""
         self.patterns = {
             # Names after a label ("Patient: John Doe", "Name: Jane Smith").
             # Only the LABEL is case-insensitive: with a pattern-wide
             # IGNORECASE, [A-Z][a-z]+ matches any word, which would redact
             # clinical text such as "Patient presents with ...".
             'patient_name': re.compile(
                 r'\b((?i:Patient|Name))[:\s]+([A-Z][a-z]+\s[A-Z][a-z]+)'
             ),
             # Date of Birth
             'dob': re.compile(
                 r'(DOB|Date of Birth)[:\s]+\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',
                 re.IGNORECASE,
             ),
             # Medical Record Numbers (MRN); hyphens are part of the value so
             # an identifier like "AB-12345" is removed as a whole.
             'mrn': re.compile(
                 r'\b(MRN|Medical Record Number)[:\s]+[\w-]+\b',
                 re.IGNORECASE,
             ),
             # Social Security Numbers
             'ssn': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
             # Phone numbers: optional +1 prefix, optional parentheses around
             # the area code, and '-', '.' or a single space as separators.
             'phone': re.compile(
                 r'(?<!\w)(?:\+?1[-. ]?)?\(?\d{3}\)?[-. ]?\d{3}[-. ]?\d{4}\b'
             ),
             # Email addresses (TLD class is letters only, no literal '|')
             'email': re.compile(
                 r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
             ),
             # Dates (MM/DD/YYYY, MM-DD-YYYY, etc.)
             'date': re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'),
             # ZIP codes
             'zip': re.compile(r'\b\d{5}(-\d{4})?\b'),
         }
         logger.info(
             "✅ RegexPIIRemover initialized with %d patterns", len(self.patterns)
         )

     def remove_pii(self, text: str) -> tuple[str, int]:
         """
         Remove PII from text

         Args:
             text: Input text

         Returns:
             tuple: (cleaned_text, count_of_pii_removed)
         """
         if not text:
             return "", 0

         cleaned_text = text
         total_removed = 0
         for pii_type, pattern in self.patterns.items():
             if pii_type in self._LABELLED_TYPES:
                 replacement = self._LABELLED_REPLACEMENT
             else:
                 replacement = self._PLAIN_REPLACEMENT
             # subn replaces and counts in one pass over the text.
             cleaned_text, count = pattern.subn(replacement, cleaned_text)
             if count:
                 logger.debug("Found %d instances of %s", count, pii_type)
                 total_removed += count

         logger.info("🔒 Removed %d PII entities", total_removed)
         return cleaned_text, total_removed