Add confidence levels and warnings for subject detection reliability
Browse files
app.py
CHANGED
|
@@ -420,6 +420,11 @@ def extract_subject_id(file_path):
|
|
| 420 |
- Filename prefix: subject_001_slice_01.png -> subject_001
|
| 421 |
- Patient ID in filename: patient_123_slice_5.dcm -> patient_123
|
| 422 |
- Study UID in DICOM: extract from DICOM metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
"""
|
| 424 |
import re
|
| 425 |
|
|
@@ -427,52 +432,60 @@ def extract_subject_id(file_path):
|
|
| 427 |
filename = os.path.basename(file_path)
|
| 428 |
dir_path = os.path.dirname(file_path)
|
| 429 |
|
| 430 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
folder_name = os.path.basename(dir_path.rstrip('/'))
|
| 432 |
if folder_name and folder_name not in ['', '.', '..']:
|
| 433 |
# Check if folder name looks like a subject ID
|
| 434 |
if re.match(r'(subject|patient|sub|pat|case|id)[_-]?\d+', folder_name, re.I):
|
| 435 |
-
return folder_name
|
| 436 |
|
| 437 |
-
#
|
| 438 |
-
# Pattern: subject_001, patient_123, sub_001, case_001, etc.
|
| 439 |
patterns = [
|
| 440 |
-
r'(subject|patient|sub|pat|case|id)[_-]?(\d+)', # subject_001, patient_123
|
| 441 |
-
r'(
|
| 442 |
-
r'([A-Z]{2,}\d+)', # BR001, MR123, etc.
|
| 443 |
]
|
| 444 |
|
| 445 |
-
for pattern in patterns:
|
| 446 |
match = re.search(pattern, filename, re.I)
|
| 447 |
if match:
|
| 448 |
if len(match.groups()) > 1:
|
| 449 |
-
return f"{match.group(1)}_{match.group(2)}"
|
| 450 |
else:
|
| 451 |
-
return match.group(1)
|
| 452 |
|
| 453 |
-
#
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
patient_id = getattr(ds, 'PatientID', None)
|
| 458 |
-
if patient_id:
|
| 459 |
-
return f"patient_{patient_id}"
|
| 460 |
-
study_uid = getattr(ds, 'StudyInstanceUID', None)
|
| 461 |
-
if study_uid:
|
| 462 |
-
# Use last part of UID as identifier
|
| 463 |
-
return f"study_{study_uid.split('.')[-1][:8]}"
|
| 464 |
-
except:
|
| 465 |
-
pass
|
| 466 |
|
| 467 |
-
#
|
| 468 |
base_name = os.path.splitext(filename)[0]
|
| 469 |
if len(base_name) > 0:
|
| 470 |
-
return base_name
|
| 471 |
|
| 472 |
-
return "unknown"
|
| 473 |
|
| 474 |
def group_images_by_subject(image_files):
|
| 475 |
-
"""Group image files by subject/patient ID.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
if not image_files:
|
| 477 |
return {}
|
| 478 |
|
|
@@ -482,17 +495,29 @@ def group_images_by_subject(image_files):
|
|
| 482 |
# Filter out None files
|
| 483 |
image_files = [f for f in image_files if f is not None]
|
| 484 |
|
| 485 |
-
# Group by subject ID
|
| 486 |
subject_groups = {}
|
| 487 |
for file_path in image_files:
|
| 488 |
-
subject_id = extract_subject_id(file_path)
|
|
|
|
| 489 |
if subject_id not in subject_groups:
|
| 490 |
-
subject_groups[subject_id] =
|
| 491 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
|
| 493 |
# Sort files within each group (by filename)
|
| 494 |
for subject_id in subject_groups:
|
| 495 |
-
subject_groups[subject_id].sort()
|
|
|
|
| 496 |
|
| 497 |
return subject_groups
|
| 498 |
|
|
@@ -521,7 +546,16 @@ def process_slices_for_viewer(image_files, selected_subject, prompt_text, modali
|
|
| 521 |
return None, 0, f"⚠️ Subject '{subject_id}' not found.", "No slices loaded", gr.Dropdown(choices=[], value=None), ""
|
| 522 |
|
| 523 |
# Get files for selected subject
|
| 524 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
|
| 526 |
results = []
|
| 527 |
status_messages = []
|
|
@@ -544,7 +578,7 @@ def process_slices_for_viewer(image_files, selected_subject, prompt_text, modali
|
|
| 544 |
processed_results_cache[cache_key] = results
|
| 545 |
|
| 546 |
max_slices = len(results) - 1
|
| 547 |
-
status = f"✅ Processed {len(results)}/{len(subject_files)} slices for {subject_id}!\nUse slider or buttons to navigate."
|
| 548 |
slice_info = f"Slice 1/{len(results)} ({subject_id})"
|
| 549 |
|
| 550 |
# Update subject dropdown choices
|
|
|
|
| 420 |
- Filename prefix: subject_001_slice_01.png -> subject_001
|
| 421 |
- Patient ID in filename: patient_123_slice_5.dcm -> patient_123
|
| 422 |
- Study UID in DICOM: extract from DICOM metadata
|
| 423 |
+
|
| 424 |
+
Returns:
|
| 425 |
+
tuple: (subject_id, confidence_level, source)
|
| 426 |
+
confidence_level: 'high' (DICOM metadata), 'medium' (folder/filename pattern), 'low' (bare numeric filename match or fallback)
|
| 427 |
+
source: 'dicom_patientid', 'dicom_study', 'folder', 'filename', 'filename_numeric', 'fallback', 'unknown'
|
| 428 |
"""
|
| 429 |
import re
|
| 430 |
|
|
|
|
| 432 |
filename = os.path.basename(file_path)
|
| 433 |
dir_path = os.path.dirname(file_path)
|
| 434 |
|
| 435 |
+
# HIGHEST CONFIDENCE: DICOM metadata (most reliable)
|
| 436 |
+
if file_path.lower().endswith('.dcm'):
|
| 437 |
+
try:
|
| 438 |
+
ds = pydicom.dcmread(file_path, stop_before_pixels=True)
|
| 439 |
+
patient_id = getattr(ds, 'PatientID', None)
|
| 440 |
+
if patient_id and patient_id.strip():
|
| 441 |
+
return f"patient_{patient_id}", 'high', 'dicom_patientid'
|
| 442 |
+
|
| 443 |
+
study_uid = getattr(ds, 'StudyInstanceUID', None)
|
| 444 |
+
if study_uid:
|
| 445 |
+
# Use full study UID as identifier (unique per study)
|
| 446 |
+
return f"study_{study_uid}", 'high', 'dicom_study'
|
| 447 |
+
except:
|
| 448 |
+
pass
|
| 449 |
+
|
| 450 |
+
# MEDIUM CONFIDENCE: Folder name (common in medical datasets)
|
| 451 |
folder_name = os.path.basename(dir_path.rstrip('/'))
|
| 452 |
if folder_name and folder_name not in ['', '.', '..']:
|
| 453 |
# Check if folder name looks like a subject ID
|
| 454 |
if re.match(r'(subject|patient|sub|pat|case|id)[_-]?\d+', folder_name, re.I):
|
| 455 |
+
return folder_name, 'medium', 'folder'
|
| 456 |
|
| 457 |
+
# MEDIUM CONFIDENCE: Filename pattern
|
|
|
|
| 458 |
patterns = [
|
| 459 |
+
(r'(subject|patient|sub|pat|case|id)[_-]?(\d+)', 'medium'), # subject_001, patient_123
|
| 460 |
+
(r'([A-Z]{2,}\d+)', 'medium'), # BR001, MR123, etc.
|
|
|
|
| 461 |
]
|
| 462 |
|
| 463 |
+
for pattern, confidence in patterns:
|
| 464 |
match = re.search(pattern, filename, re.I)
|
| 465 |
if match:
|
| 466 |
if len(match.groups()) > 1:
|
| 467 |
+
return f"{match.group(1)}_{match.group(2)}", confidence, 'filename'
|
| 468 |
else:
|
| 469 |
+
return match.group(1), confidence, 'filename'
|
| 470 |
|
| 471 |
+
# LOW CONFIDENCE: Numeric pattern (could be slice number, not patient ID)
|
| 472 |
+
numeric_match = re.search(r'(\d{3,})', filename)
|
| 473 |
+
if numeric_match:
|
| 474 |
+
return numeric_match.group(1), 'low', 'filename_numeric'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
|
| 476 |
+
# LOWEST CONFIDENCE: Fallback to filename
|
| 477 |
base_name = os.path.splitext(filename)[0]
|
| 478 |
if len(base_name) > 0:
|
| 479 |
+
return base_name, 'low', 'fallback'
|
| 480 |
|
| 481 |
+
return "unknown", 'low', 'unknown'
|
| 482 |
|
| 483 |
def group_images_by_subject(image_files):
|
| 484 |
+
"""Group image files by subject/patient ID.
|
| 485 |
+
|
| 486 |
+
Returns:
|
| 487 |
+
dict: {subject_id: {'files': [...], 'confidence': 'high/medium/low', 'sources': [...]}}
|
| 488 |
+
"""
|
| 489 |
if not image_files:
|
| 490 |
return {}
|
| 491 |
|
|
|
|
| 495 |
# Filter out None files
|
| 496 |
image_files = [f for f in image_files if f is not None]
|
| 497 |
|
| 498 |
+
# Group by subject ID and track confidence
|
| 499 |
subject_groups = {}
|
| 500 |
for file_path in image_files:
|
| 501 |
+
subject_id, confidence, source = extract_subject_id(file_path)
|
| 502 |
+
|
| 503 |
if subject_id not in subject_groups:
|
| 504 |
+
subject_groups[subject_id] = {
|
| 505 |
+
'files': [],
|
| 506 |
+
'confidence': confidence,
|
| 507 |
+
'sources': set([source])
|
| 508 |
+
}
|
| 509 |
+
|
| 510 |
+
subject_groups[subject_id]['files'].append(file_path)
|
| 511 |
+
subject_groups[subject_id]['sources'].add(source)
|
| 512 |
+
|
| 513 |
+
# Upgrade confidence if we find a higher-confidence source (any 'high', or 'medium' over 'low')
|
| 514 |
+
if confidence == 'high' or (confidence == 'medium' and subject_groups[subject_id]['confidence'] == 'low'):
|
| 515 |
+
subject_groups[subject_id]['confidence'] = confidence
|
| 516 |
|
| 517 |
# Sort files within each group (by filename)
|
| 518 |
for subject_id in subject_groups:
|
| 519 |
+
subject_groups[subject_id]['files'].sort()
|
| 520 |
+
subject_groups[subject_id]['sources'] = list(subject_groups[subject_id]['sources'])
|
| 521 |
|
| 522 |
return subject_groups
|
| 523 |
|
|
|
|
| 546 |
return None, 0, f"⚠️ Subject '{subject_id}' not found.", "No slices loaded", gr.Dropdown(choices=[], value=None), ""
|
| 547 |
|
| 548 |
# Get files for selected subject
|
| 549 |
+
subject_info = subject_groups[subject_id]
|
| 550 |
+
subject_files = subject_info['files']
|
| 551 |
+
confidence = subject_info['confidence']
|
| 552 |
+
|
| 553 |
+
# Add confidence warning
|
| 554 |
+
confidence_warning = ""
|
| 555 |
+
if confidence == 'low':
|
| 556 |
+
confidence_warning = "\n⚠️ LOW CONFIDENCE: These files may not be from the same patient. Please verify!"
|
| 557 |
+
elif confidence == 'medium':
|
| 558 |
+
confidence_warning = "\n⚠️ MEDIUM CONFIDENCE: Likely same patient, but verify if critical."
|
| 559 |
|
| 560 |
results = []
|
| 561 |
status_messages = []
|
|
|
|
| 578 |
processed_results_cache[cache_key] = results
|
| 579 |
|
| 580 |
max_slices = len(results) - 1
|
| 581 |
+
status = f"✅ Processed {len(results)}/{len(subject_files)} slices for {subject_id}!\nUse slider or buttons to navigate.{confidence_warning}"
|
| 582 |
slice_info = f"Slice 1/{len(results)} ({subject_id})"
|
| 583 |
|
| 584 |
# Update subject dropdown choices
|