mmrech committed on
Commit
d5e45f7
·
1 Parent(s): df56589

Add confidence levels and warnings for subject detection reliability

Browse files
Files changed (1) hide show
  1. app.py +68 -34
app.py CHANGED
@@ -420,6 +420,11 @@ def extract_subject_id(file_path):
420
  - Filename prefix: subject_001_slice_01.png -> subject_001
421
  - Patient ID in filename: patient_123_slice_5.dcm -> patient_123
422
  - Study UID in DICOM: extract from DICOM metadata
 
 
 
 
 
423
  """
424
  import re
425
 
@@ -427,52 +432,60 @@ def extract_subject_id(file_path):
427
  filename = os.path.basename(file_path)
428
  dir_path = os.path.dirname(file_path)
429
 
430
- # Try to extract from folder name (common in medical datasets)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
  folder_name = os.path.basename(dir_path.rstrip('/'))
432
  if folder_name and folder_name not in ['', '.', '..']:
433
  # Check if folder name looks like a subject ID
434
  if re.match(r'(subject|patient|sub|pat|case|id)[_-]?\d+', folder_name, re.I):
435
- return folder_name
436
 
437
- # Try to extract from filename
438
- # Pattern: subject_001, patient_123, sub_001, case_001, etc.
439
  patterns = [
440
- r'(subject|patient|sub|pat|case|id)[_-]?(\d+)', # subject_001, patient_123
441
- r'(\d{3,})', # Just numbers (3+ digits) - might be subject ID
442
- r'([A-Z]{2,}\d+)', # BR001, MR123, etc.
443
  ]
444
 
445
- for pattern in patterns:
446
  match = re.search(pattern, filename, re.I)
447
  if match:
448
  if len(match.groups()) > 1:
449
- return f"{match.group(1)}_{match.group(2)}"
450
  else:
451
- return match.group(1)
452
 
453
- # For DICOM files, try to read PatientID or StudyInstanceUID
454
- if file_path.lower().endswith('.dcm'):
455
- try:
456
- ds = pydicom.dcmread(file_path, stop_before_pixels=True)
457
- patient_id = getattr(ds, 'PatientID', None)
458
- if patient_id:
459
- return f"patient_{patient_id}"
460
- study_uid = getattr(ds, 'StudyInstanceUID', None)
461
- if study_uid:
462
- # Use last part of UID as identifier
463
- return f"study_{study_uid.split('.')[-1][:8]}"
464
- except:
465
- pass
466
 
467
- # Fallback: use filename without extension
468
  base_name = os.path.splitext(filename)[0]
469
  if len(base_name) > 0:
470
- return base_name
471
 
472
- return "unknown"
473
 
474
  def group_images_by_subject(image_files):
475
- """Group image files by subject/patient ID."""
 
 
 
 
476
  if not image_files:
477
  return {}
478
 
@@ -482,17 +495,29 @@ def group_images_by_subject(image_files):
482
  # Filter out None files
483
  image_files = [f for f in image_files if f is not None]
484
 
485
- # Group by subject ID
486
  subject_groups = {}
487
  for file_path in image_files:
488
- subject_id = extract_subject_id(file_path)
 
489
  if subject_id not in subject_groups:
490
- subject_groups[subject_id] = []
491
- subject_groups[subject_id].append(file_path)
 
 
 
 
 
 
 
 
 
 
492
 
493
  # Sort files within each group (by filename)
494
  for subject_id in subject_groups:
495
- subject_groups[subject_id].sort()
 
496
 
497
  return subject_groups
498
 
@@ -521,7 +546,16 @@ def process_slices_for_viewer(image_files, selected_subject, prompt_text, modali
521
  return None, 0, f"⚠️ Subject '{subject_id}' not found.", "No slices loaded", gr.Dropdown(choices=[], value=None), ""
522
 
523
  # Get files for selected subject
524
- subject_files = subject_groups[subject_id]
 
 
 
 
 
 
 
 
 
525
 
526
  results = []
527
  status_messages = []
@@ -544,7 +578,7 @@ def process_slices_for_viewer(image_files, selected_subject, prompt_text, modali
544
  processed_results_cache[cache_key] = results
545
 
546
  max_slices = len(results) - 1
547
- status = f"✅ Processed {len(results)}/{len(subject_files)} slices for {subject_id}!\nUse slider or buttons to navigate."
548
  slice_info = f"Slice 1/{len(results)} ({subject_id})"
549
 
550
  # Update subject dropdown choices
 
420
  - Filename prefix: subject_001_slice_01.png -> subject_001
421
  - Patient ID in filename: patient_123_slice_5.dcm -> patient_123
422
  - Study UID in DICOM: extract from DICOM metadata
423
+
424
+ Returns:
425
+ tuple: (subject_id, confidence_level, source)
426
+ confidence_level: 'high' (DICOM metadata), 'medium' (folder/filename pattern), 'low' (bare numeric filename match, filename fallback, or unknown)
427
+ source: 'dicom_patientid', 'dicom_study', 'folder', 'filename', 'filename_numeric', 'fallback', 'unknown'
428
  """
429
  import re
430
 
 
432
  filename = os.path.basename(file_path)
433
  dir_path = os.path.dirname(file_path)
434
 
435
+ # HIGHEST CONFIDENCE: DICOM metadata (most reliable)
436
+ if file_path.lower().endswith('.dcm'):
437
+ try:
438
+ ds = pydicom.dcmread(file_path, stop_before_pixels=True)
439
+ patient_id = getattr(ds, 'PatientID', None)
440
+ if patient_id and patient_id.strip():
441
+ return f"patient_{patient_id}", 'high', 'dicom_patientid'
442
+
443
+ study_uid = getattr(ds, 'StudyInstanceUID', None)
444
+ if study_uid:
445
+ # Use full study UID as identifier (unique per study)
446
+ return f"study_{study_uid}", 'high', 'dicom_study'
447
+ except:
448
+ pass
449
+
450
+ # MEDIUM CONFIDENCE: Folder name (common in medical datasets)
451
  folder_name = os.path.basename(dir_path.rstrip('/'))
452
  if folder_name and folder_name not in ['', '.', '..']:
453
  # Check if folder name looks like a subject ID
454
  if re.match(r'(subject|patient|sub|pat|case|id)[_-]?\d+', folder_name, re.I):
455
+ return folder_name, 'medium', 'folder'
456
 
457
+ # MEDIUM CONFIDENCE: Filename pattern
 
458
  patterns = [
459
+ (r'(subject|patient|sub|pat|case|id)[_-]?(\d+)', 'medium'), # subject_001, patient_123
460
+ (r'([A-Z]{2,}\d+)', 'medium'), # BR001, MR123, etc.
 
461
  ]
462
 
463
+ for pattern, confidence in patterns:
464
  match = re.search(pattern, filename, re.I)
465
  if match:
466
  if len(match.groups()) > 1:
467
+ return f"{match.group(1)}_{match.group(2)}", confidence, 'filename'
468
  else:
469
+ return match.group(1), confidence, 'filename'
470
 
471
+ # LOW CONFIDENCE: Numeric pattern (could be slice number, not patient ID)
472
+ numeric_match = re.search(r'(\d{3,})', filename)
473
+ if numeric_match:
474
+ return numeric_match.group(1), 'low', 'filename_numeric'
 
 
 
 
 
 
 
 
 
475
 
476
+ # LOWEST CONFIDENCE: Fallback to filename
477
  base_name = os.path.splitext(filename)[0]
478
  if len(base_name) > 0:
479
+ return base_name, 'low', 'fallback'
480
 
481
+ return "unknown", 'low', 'unknown'
482
 
483
  def group_images_by_subject(image_files):
484
+ """Group image files by subject/patient ID.
485
+
486
+ Returns:
487
+ dict: {subject_id: {'files': [...], 'confidence': 'high/medium/low', 'sources': set(...)}}
488
+ """
489
  if not image_files:
490
  return {}
491
 
 
495
  # Filter out None files
496
  image_files = [f for f in image_files if f is not None]
497
 
498
+ # Group by subject ID and track confidence
499
  subject_groups = {}
500
  for file_path in image_files:
501
+ subject_id, confidence, source = extract_subject_id(file_path)
502
+
503
  if subject_id not in subject_groups:
504
+ subject_groups[subject_id] = {
505
+ 'files': [],
506
+ 'confidence': confidence,
507
+ 'sources': set([source])
508
+ }
509
+
510
+ subject_groups[subject_id]['files'].append(file_path)
511
+ subject_groups[subject_id]['sources'].add(source)
512
+
513
+ # Upgrade confidence if a later file yields a higher-confidence source (low -> medium, anything -> high)
514
+ if confidence == 'high' or (confidence == 'medium' and subject_groups[subject_id]['confidence'] == 'low'):
515
+ subject_groups[subject_id]['confidence'] = confidence
516
 
517
  # Sort files within each group (by filename)
518
  for subject_id in subject_groups:
519
+ subject_groups[subject_id]['files'].sort()
520
+ subject_groups[subject_id]['sources'] = list(subject_groups[subject_id]['sources'])
521
 
522
  return subject_groups
523
 
 
546
  return None, 0, f"⚠️ Subject '{subject_id}' not found.", "No slices loaded", gr.Dropdown(choices=[], value=None), ""
547
 
548
  # Get files for selected subject
549
+ subject_info = subject_groups[subject_id]
550
+ subject_files = subject_info['files']
551
+ confidence = subject_info['confidence']
552
+
553
+ # Add confidence warning
554
+ confidence_warning = ""
555
+ if confidence == 'low':
556
+ confidence_warning = "\n⚠️ LOW CONFIDENCE: These files may not be from the same patient. Please verify!"
557
+ elif confidence == 'medium':
558
+ confidence_warning = "\n⚠️ MEDIUM CONFIDENCE: Likely same patient, but verify if critical."
559
 
560
  results = []
561
  status_messages = []
 
578
  processed_results_cache[cache_key] = results
579
 
580
  max_slices = len(results) - 1
581
+ status = f"✅ Processed {len(results)}/{len(subject_files)} slices for {subject_id}!\nUse slider or buttons to navigate.{confidence_warning}"
582
  slice_info = f"Slice 1/{len(results)} ({subject_id})"
583
 
584
  # Update subject dropdown choices