Distopia22 committed on
Commit
fd20bd2
·
1 Parent(s): 3cbb4d9

Add automatic PII removal during file extraction

Browse files
Dockerfile CHANGED
@@ -8,6 +8,9 @@ COPY requirements.txt .
8
  # Install dependencies
9
  RUN pip install --no-cache-dir -r requirements.txt
10
 
 
 
 
11
  # Copy the entire backend
12
  COPY . .
13
 
 
8
  # Install dependencies
9
  RUN pip install --no-cache-dir -r requirements.txt
10
 
11
+ # Download spaCy model for Presidio (required for PII detection)
12
+ RUN python -m spacy download en_core_web_lg
13
+
14
  # Copy the entire backend
15
  COPY . .
16
 
requirements.txt CHANGED
@@ -1,7 +1,9 @@
1
  fastapi==0.104.1
2
  uvicorn==0.24.0
3
  python-dotenv==1.0.0
4
- groq==0.9.0
5
- httpx==0.27.0
6
  pydantic==2.5.0
7
- python-multipart==0.0.6
 
 
 
 
1
  fastapi==0.104.1
2
  uvicorn==0.24.0
3
  python-dotenv==1.0.0
4
+ groq==0.4.1
 
5
  pydantic==2.5.0
6
+ python-multipart==0.0.6
7
+ presidio-analyzer==2.2.354
8
+ presidio-anonymizer==2.2.354
9
+ spacy==3.7.2
src/api/routes.py CHANGED
@@ -29,7 +29,7 @@ async def analyze_provider_notes(request: ProviderNotesRequest):
29
  detail="Provider notes must be at least 10 characters long"
30
  )
31
 
32
- # Process through Groq service - CORRECT METHOD NAME
33
  result = await groq_service.analyze_provider_notes(provider_notes)
34
 
35
  logger.info("Successfully processed coding request")
@@ -52,43 +52,55 @@ async def analyze_provider_notes(request: ProviderNotesRequest):
52
  )
53
 
54
 
55
- # NEW ENDPOINT - File Upload
56
  @router.post("/upload-file", response_model=FileUploadResponse)
57
  async def upload_provider_notes_file(file: UploadFile = File(...)):
58
  """
59
  Upload a TXT file containing provider notes and extract ICD-10 and CPT codes
60
 
61
- This endpoint accepts a TXT file, extracts the text, and processes it through the LLM.
 
 
 
 
62
 
63
  Args:
64
  file: TXT file containing provider notes
65
 
66
  Returns:
67
- FileUploadResponse with extracted codes and explanations
68
  """
69
  try:
70
- logger.info(f"Received file upload request: {file.filename}")
71
 
72
- # Step 1: Extract text from uploaded file
73
- extraction_result = await file_service.extract_text_from_file(file)
 
 
 
74
 
75
  extracted_text = extraction_result["text"]
76
  filename = extraction_result["filename"]
77
  text_length = extraction_result["text_length"]
 
 
 
78
 
79
- logger.info(f"Extracted {text_length} characters from {filename}")
 
80
 
81
- # Step 2: Process extracted text through Groq LLM
82
- # FIXED: Use the correct method name 'analyze_provider_notes'
83
  coding_result = await groq_service.analyze_provider_notes(extracted_text)
84
 
85
- logger.info(f"Successfully processed file: {filename}")
86
 
87
- # Step 3: Return combined response
88
  return FileUploadResponse(
89
  success=True,
90
  filename=filename,
91
  extracted_text_length=text_length,
 
 
92
  cpt_codes=coding_result.get("CPT", []),
93
  cpt_explanation=coding_result.get("CPT_explanation", ""),
94
  icd_codes=coding_result.get("ICD", []),
@@ -98,7 +110,7 @@ async def upload_provider_notes_file(file: UploadFile = File(...)):
98
  except HTTPException:
99
  raise
100
  except Exception as e:
101
- logger.error(f"Error in upload_provider_notes_file: {str(e)}")
102
  raise HTTPException(
103
  status_code=500,
104
  detail=f"Error processing uploaded file: {str(e)}"
 
29
  detail="Provider notes must be at least 10 characters long"
30
  )
31
 
32
+ # Process through Groq service
33
  result = await groq_service.analyze_provider_notes(provider_notes)
34
 
35
  logger.info("Successfully processed coding request")
 
52
  )
53
 
54
 
55
+ # UPDATED ENDPOINT - File Upload with PII Removal
56
  @router.post("/upload-file", response_model=FileUploadResponse)
57
  async def upload_provider_notes_file(file: UploadFile = File(...)):
58
  """
59
  Upload a TXT file containing provider notes and extract ICD-10 and CPT codes
60
 
61
+ This endpoint:
62
+ 1. Extracts text from uploaded TXT file
63
+ 2. Automatically detects and removes patient personal information (PII)
64
+ 3. Processes sanitized text through LLM
65
+ 4. Returns ICD-10 and CPT codes
66
 
67
  Args:
68
  file: TXT file containing provider notes
69
 
70
  Returns:
71
+ FileUploadResponse with codes, explanations, and PII removal info
72
  """
73
  try:
74
+ logger.info(f"📁 Received file upload request: {file.filename}")
75
 
76
+ # Step 1: Extract text from file with automatic PII removal
77
+ extraction_result = await file_service.extract_text_from_file(
78
+ file=file,
79
+ remove_pii=True # Always remove PII for safety
80
+ )
81
 
82
  extracted_text = extraction_result["text"]
83
  filename = extraction_result["filename"]
84
  text_length = extraction_result["text_length"]
85
+ pii_info = extraction_result["pii_info"]
86
+
87
+ logger.info(f"✅ Extracted {text_length} characters from {filename}")
88
 
89
+ if pii_info["pii_removed"]:
90
+ logger.info(f"🔒 Removed {pii_info['pii_count']} PII entities before processing")
91
 
92
+ # Step 2: Process sanitized text through Groq LLM
 
93
  coding_result = await groq_service.analyze_provider_notes(extracted_text)
94
 
95
+ logger.info(f"Successfully processed file: {filename}")
96
 
97
+ # Step 3: Return combined response with PII info
98
  return FileUploadResponse(
99
  success=True,
100
  filename=filename,
101
  extracted_text_length=text_length,
102
+ pii_removed=pii_info["pii_removed"],
103
+ pii_count=pii_info["pii_count"],
104
  cpt_codes=coding_result.get("CPT", []),
105
  cpt_explanation=coding_result.get("CPT_explanation", ""),
106
  icd_codes=coding_result.get("ICD", []),
 
110
  except HTTPException:
111
  raise
112
  except Exception as e:
113
+ logger.error(f"Error in upload_provider_notes_file: {str(e)}")
114
  raise HTTPException(
115
  status_code=500,
116
  detail=f"Error processing uploaded file: {str(e)}"
src/models/response_models.py CHANGED
@@ -1,38 +1,64 @@
1
  from pydantic import BaseModel, Field
2
- from typing import List
3
 
4
- class ICDCode(BaseModel):
5
- code: str = Field(..., description="ICD-10 diagnosis code")
6
- description: str = Field(..., description="Description of the diagnosis")
7
- explanation: str = Field(..., description="Explanation for why this code was selected")
 
 
 
 
 
 
 
 
 
 
8
 
9
- class CPTCode(BaseModel):
10
- code: str = Field(..., description="CPT procedure code")
11
- description: str = Field(..., description="Description of the procedure/service")
12
- explanation: str = Field(..., description="Explanation for why this code was selected")
13
 
14
  class CodingResponse(BaseModel):
15
- icd_codes: List[ICDCode] = Field(default_factory=list, description="List of ICD-10 codes")
16
- cpt_codes: List[CPTCode] = Field(default_factory=list, description="List of CPT codes")
17
- overall_summary: str = Field(..., description="Overall summary of coding decisions")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  class Config:
20
  json_schema_extra = {
21
  "example": {
22
- "icd_codes": [
23
- {
24
- "code": "J20.9",
25
- "description": "Acute bronchitis, unspecified",
26
- "explanation": "Patient presents with acute bronchitis as documented in provider notes"
27
- }
28
- ],
29
- "cpt_codes": [
30
- {
31
- "code": "99213",
32
- "description": "Office visit, established patient",
33
- "explanation": "Comprehensive examination performed as documented"
34
- }
35
- ],
36
- "overall_summary": "Patient encounter for acute bronchitis with examination and treatment"
37
  }
38
  }
 
1
  from pydantic import BaseModel, Field
2
+ from typing import List, Optional
3
 
4
# NOTE: `#` comments are used instead of new class docstrings below because
# pydantic copies a model's docstring into the generated JSON schema as its
# description, which would change the published OpenAPI document.

# Request body for the notes-analysis endpoint.
class ProviderNotesRequest(BaseModel):
    provider_notes: str = Field(
        ...,
        description="The medical provider notes to analyze",
        min_length=10,
        # pydantic v2 spells this `examples` (a list); the singular `example`
        # keyword is a deprecated extra kwarg that emits a warning.
        examples=[
            "Patient presents with acute bronchitis. Performed comprehensive examination and prescribed antibiotics."
        ],
    )

    # `model_config` replaces the class-based `Config`, which pydantic v2
    # deprecates; the schema example itself is unchanged.
    model_config = {
        "json_schema_extra": {
            "example": {
                "provider_notes": "Patient presents with acute bronchitis. Cough for 5 days, productive with yellow sputum. Lung exam reveals diffuse wheezing. Prescribed azithromycin 500mg."
            }
        }
    }


# Minimal wrapper around a single free-text note.
class ProviderNote(BaseModel):
    note: str


# Coding result: CPT and ICD code lists, each with a free-text explanation.
# The element type of the lists is not constrained here.
class CodingResponse(BaseModel):
    cpt_codes: list
    cpt_explanation: str
    icd_codes: list
    icd_explanation: str


# PII Detection Detail Model
class PIIDetail(BaseModel):
    """Details of detected PII entity"""
    entity_type: str = Field(description="Type of PII detected (e.g., PERSON, PHONE_NUMBER)")
    start: int = Field(description="Start position in original text")
    end: int = Field(description="End position in original text")
    score: float = Field(description="Confidence score")


# Updated File Upload Response with PII info
class FileUploadResponse(BaseModel):
    """Response model for file upload endpoint with PII removal info"""
    success: bool = Field(description="Whether file processing was successful")
    filename: str = Field(description="Name of uploaded file")
    extracted_text_length: int = Field(description="Length of extracted text (after PII removal)")
    pii_removed: bool = Field(description="Whether PII was detected and removed")
    pii_count: int = Field(description="Number of PII entities removed")
    cpt_codes: list = Field(description="List of CPT codes")
    cpt_explanation: str = Field(description="Explanation of CPT codes")
    icd_codes: list = Field(description="List of ICD codes")
    icd_explanation: str = Field(description="Explanation of ICD codes")

    # `model_config` replaces the class-based `Config`, which pydantic v2
    # deprecates; the schema example itself is unchanged.
    model_config = {
        "json_schema_extra": {
            "example": {
                "success": True,
                "filename": "provider_notes.txt",
                "extracted_text_length": 450,
                "pii_removed": True,
                "pii_count": 3,
                "cpt_codes": ["99213", "93000"],
                "cpt_explanation": "Office visit and EKG",
                "icd_codes": ["I20.0", "R07.9"],
                "icd_explanation": "Unstable angina and chest pain"
            }
        }
    }
src/services/file_service.py CHANGED
@@ -2,6 +2,7 @@ from fastapi import UploadFile, HTTPException
2
  import os
3
  from typing import Dict
4
  import logging
 
5
 
6
  logger = logging.getLogger(__name__)
7
 
@@ -36,15 +37,16 @@ class FileService:
36
  )
37
 
38
  @staticmethod
39
- async def extract_text_from_file(file: UploadFile) -> Dict[str, any]:
40
  """
41
- Extract text content from uploaded file
42
 
43
  Args:
44
  file: Uploaded file object
 
45
 
46
  Returns:
47
- Dictionary containing extracted text and metadata
48
  """
49
  try:
50
  # Validate file
@@ -86,19 +88,43 @@ class FileService:
86
  detail="Extracted text is too short. Please provide more detailed provider notes"
87
  )
88
 
89
- logger.info(f"Successfully extracted {len(text)} characters from {file.filename}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  return {
92
  "text": text,
93
  "filename": file.filename,
94
  "file_size": file_size,
95
- "text_length": len(text)
 
96
  }
97
 
98
  except HTTPException:
99
  raise
100
  except Exception as e:
101
- logger.error(f"Error extracting text from file: {str(e)}")
102
  raise HTTPException(
103
  status_code=500,
104
  detail=f"Error processing file: {str(e)}"
 
2
  import os
3
  from typing import Dict
4
  import logging
5
+ from services.pii_detector import pii_detector
6
 
7
  logger = logging.getLogger(__name__)
8
 
 
37
  )
38
 
39
  @staticmethod
40
+ async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, any]:
41
  """
42
+ Extract text content from uploaded file and optionally remove PII
43
 
44
  Args:
45
  file: Uploaded file object
46
+ remove_pii: Whether to remove PII from extracted text (default: True)
47
 
48
  Returns:
49
+ Dictionary containing extracted text, PII removal info, and metadata
50
  """
51
  try:
52
  # Validate file
 
88
  detail="Extracted text is too short. Please provide more detailed provider notes"
89
  )
90
 
91
+ logger.info(f"Successfully extracted {len(text)} characters from {file.filename}")
92
+
93
+ # Remove PII if requested
94
+ pii_info = {
95
+ "pii_removed": False,
96
+ "pii_count": 0,
97
+ "pii_details": []
98
+ }
99
+
100
+ if remove_pii:
101
+ logger.info("🔒 Removing PII from extracted text...")
102
+ pii_result = pii_detector.remove_pii(text)
103
+
104
+ text = pii_result["sanitized_text"]
105
+ pii_info = {
106
+ "pii_removed": pii_result["was_pii_removed"],
107
+ "pii_count": pii_result["pii_count"],
108
+ "pii_details": pii_result["pii_detected"]
109
+ }
110
+
111
+ if pii_result["was_pii_removed"]:
112
+ logger.info(f"✅ Removed {pii_result['pii_count']} PII entities")
113
+ else:
114
+ logger.info("✅ No PII detected in text")
115
 
116
  return {
117
  "text": text,
118
  "filename": file.filename,
119
  "file_size": file_size,
120
+ "text_length": len(text),
121
+ "pii_info": pii_info
122
  }
123
 
124
  except HTTPException:
125
  raise
126
  except Exception as e:
127
+ logger.error(f"Error extracting text from file: {str(e)}")
128
  raise HTTPException(
129
  status_code=500,
130
  detail=f"Error processing file: {str(e)}"
src/services/pii_detector.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from presidio_analyzer import AnalyzerEngine
2
+ from presidio_anonymizer import AnonymizerEngine
3
+ from typing import Dict, List
4
+ import re
5
+ import logging
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class PIIDetector:
    """Detect and remove personally identifiable information (PII) from medical notes.

    Detection is delegated to ``self.analyzer`` and replacement to
    ``self.anonymizer``; only the entity types listed in
    ``self.entities_to_detect`` are scanned for.
    """

    # Labelled placeholders such as "Patient: <PERSON>" or "DOB: <DATE_TIME>".
    # These only match text the anonymizer has ALREADY replaced with an
    # upper-case "<ENTITY>" tag; raw values (e.g. "MRN: 12345") do not match.
    _LABELLED_PLACEHOLDER_PATTERNS = tuple(
        re.compile(rf'({labels}):\s*<[A-Z_]+>', re.IGNORECASE)
        for labels in (
            r'Patient|Pt|Patient Name',                 # names
            r'DOB|Date of Birth|Birth Date',            # birth dates
            r'Address|Addr|Home Address',               # addresses
            r'Phone|Tel|Telephone|Cell|Mobile',         # phone numbers
            r'MRN|Medical Record Number|Record #',      # medical record numbers
        )
    )

    def __init__(self):
        """Create the detection/anonymization engines and the entity list.

        Raises:
            Exception: Any engine construction error is logged and re-raised
                unchanged.
        """
        try:
            self.analyzer = AnalyzerEngine()
            self.anonymizer = AnonymizerEngine()

            # Entities to detect (common in medical notes)
            self.entities_to_detect = [
                "PERSON",             # Names
                "EMAIL_ADDRESS",      # Email
                "PHONE_NUMBER",       # Phone numbers
                "US_SSN",             # Social Security Number
                "CREDIT_CARD",        # Credit card numbers
                "US_DRIVER_LICENSE",  # Driver's license
                "LOCATION",           # Addresses, cities
                "DATE_TIME",          # Birth dates, appointment dates
                "US_PASSPORT",        # Passport numbers
                "MEDICAL_LICENSE",    # Medical license numbers
                "IP_ADDRESS",         # IP addresses
                "URL",                # URLs
            ]

            logger.info("✅ PII Detector initialized successfully")
        except Exception as e:
            logger.error(f"❌ Failed to initialize PII Detector: {str(e)}")
            raise

    def _analyze(self, text: str):
        """Run the analyzer over *text* for the configured entity types."""
        return self.analyzer.analyze(
            text=text,
            entities=self.entities_to_detect,
            language='en'
        )

    def detect_pii(self, text: str) -> List[Dict]:
        """
        Detect PII entities in text without modifying it.

        Args:
            text: Input text to analyze

        Returns:
            One dict per finding with ``entity_type``, ``start``, ``end``,
            ``score`` and the matched ``text``. The matched text is the raw
            PII value, so callers must not log or return it to clients.
            An empty list is returned both when nothing is found and when
            analysis fails (the failure is logged).
        """
        try:
            pii_findings = [
                {
                    "entity_type": result.entity_type,
                    "start": result.start,
                    "end": result.end,
                    "score": result.score,
                    "text": text[result.start:result.end],
                }
                for result in self._analyze(text)
            ]

            logger.info(f"🔍 Detected {len(pii_findings)} PII entities")
            return pii_findings

        except Exception as e:
            # Best-effort, informational API: report the failure and return nothing.
            logger.error(f"❌ Error detecting PII: {str(e)}")
            return []

    def remove_pii(self, text: str, *, fail_closed: bool = True) -> Dict[str, object]:
        """
        Remove PII from text while preserving medical information.

        Args:
            text: Input text containing potential PII
            fail_closed: When True (default), a detection/anonymization error
                raises instead of handing the unsanitized text back, so a
                broken engine can never leak PII downstream. Pass False to get
                the legacy best-effort result (original text plus an
                ``"error"`` key).

        Returns:
            Dictionary with ``sanitized_text``, ``pii_detected`` (entity type,
            offsets and score per finding, without the raw value),
            ``pii_count`` and ``was_pii_removed``.

        Raises:
            RuntimeError: If sanitization fails and ``fail_closed`` is True.
        """
        try:
            # Step 1: Detect PII
            analyzer_results = self._analyze(text)

            if not analyzer_results:
                logger.info("✅ No PII detected in text")
                return {
                    "sanitized_text": text,
                    "pii_detected": [],
                    "pii_count": 0,
                    "was_pii_removed": False
                }

            # Step 2: Anonymize detected PII
            anonymized_result = self.anonymizer.anonymize(
                text=text,
                analyzer_results=analyzer_results
            )

            # Step 3: Normalize labelled placeholders ("Patient: <PERSON>")
            sanitized_text = self._clean_medical_patterns(anonymized_result.text)

            # Step 4: Collect PII detection details (never the raw values)
            pii_detected = [
                {
                    "entity_type": result.entity_type,
                    "start": result.start,
                    "end": result.end,
                    "score": result.score,
                }
                for result in analyzer_results
            ]

            logger.info(f"✅ Removed {len(pii_detected)} PII entities from text")

            return {
                "sanitized_text": sanitized_text,
                "pii_detected": pii_detected,
                "pii_count": len(pii_detected),
                "was_pii_removed": True
            }

        except Exception as e:
            logger.error(f"❌ Error removing PII: {str(e)}")
            if fail_closed:
                # Returning the input here would silently forward unredacted
                # patient data to whatever consumes the "sanitized" text.
                raise RuntimeError(
                    "PII removal failed; refusing to return unsanitized text"
                ) from e
            # Legacy opt-in behaviour: return original text if PII removal fails
            return {
                "sanitized_text": text,
                "pii_detected": [],
                "pii_count": 0,
                "was_pii_removed": False,
                "error": str(e)
            }

    def _clean_medical_patterns(self, text: str) -> str:
        """
        Rewrite labelled anonymizer placeholders to a uniform ``[REDACTED]``.

        Turns e.g. ``"Patient: <PERSON>"`` into ``"Patient: [REDACTED]"`` for
        the name, birth-date, address, phone and medical-record labels. The
        label is matched case-insensitively and kept as written. Text without
        a ``<ENTITY>`` placeholder after the label is left untouched, so this
        does not catch PII the analyzer missed.

        Args:
            text: Text to clean

        Returns:
            Cleaned text
        """
        for pattern in self._LABELLED_PLACEHOLDER_PATTERNS:
            text = pattern.sub(r'\1: [REDACTED]', text)
        return text
194
+
195
+
196
+ # Singleton instance
197
+ pii_detector = PIIDetector()
tests/test_api.py CHANGED
@@ -23,7 +23,6 @@ def test_coding_endpoint():
23
 
24
  def test_file_upload_endpoint():
25
  """Test new file upload endpoint"""
26
- # Create a sample TXT file
27
  file_content = b"Patient John Doe presents with acute bronchitis. Cough for 5 days, productive with yellow sputum. Lung exam reveals diffuse wheezing."
28
 
29
  files = {
@@ -38,10 +37,36 @@ def test_file_upload_endpoint():
38
  assert data["success"] is True
39
  assert data["filename"] == "provider_notes.txt"
40
  assert data["extracted_text_length"] > 0
 
 
41
  assert "cpt_codes" in data
42
  assert "icd_codes" in data
43
- assert isinstance(data["cpt_codes"], list)
44
- assert isinstance(data["icd_codes"], list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
 
47
  def test_file_upload_invalid_extension():
 
23
 
24
  def test_file_upload_endpoint():
25
  """Test new file upload endpoint"""
 
26
  file_content = b"Patient John Doe presents with acute bronchitis. Cough for 5 days, productive with yellow sputum. Lung exam reveals diffuse wheezing."
27
 
28
  files = {
 
37
  assert data["success"] is True
38
  assert data["filename"] == "provider_notes.txt"
39
  assert data["extracted_text_length"] > 0
40
+ assert "pii_removed" in data
41
+ assert "pii_count" in data
42
  assert "cpt_codes" in data
43
  assert "icd_codes" in data
44
+
45
+
46
+ def test_file_upload_with_pii():
47
+ """Test file upload with PII - should be removed"""
48
+ file_content = b"""
49
+ Patient: John Doe
50
+ DOB: 01/15/1980
51
+ Phone: 555-123-4567
52
+ Address: 123 Main St, New York, NY
53
+
54
+ Chief Complaint: Chest pain
55
+ History: Patient presents with acute chest pain...
56
+ """
57
+
58
+ files = {
59
+ "file": ("notes_with_pii.txt", BytesIO(file_content), "text/plain")
60
+ }
61
+
62
+ response = client.post("/api/upload-file", files=files)
63
+
64
+ assert response.status_code == 200
65
+ data = response.json()
66
+
67
+ # PII should be detected and removed
68
+ assert data["pii_removed"] is True
69
+ assert data["pii_count"] > 0
70
 
71
 
72
  def test_file_upload_invalid_extension():