Spaces:

mmrech
/

NeuroSAM3

Running on Zero

App Files Community

mmrech committed 17 days ago

Commit

d5e45f7

1 Parent(s): df56589

Add confidence levels and warnings for subject detection reliability

Browse files

Files changed (1) hide show

app.py +68 -34

app.py CHANGED Viewed

@@ -420,6 +420,11 @@ def extract_subject_id(file_path):
     - Filename prefix: subject_001_slice_01.png -> subject_001
     - Patient ID in filename: patient_123_slice_5.dcm -> patient_123
     - Study UID in DICOM: extract from DICOM metadata
     """
     import re
@@ -427,52 +432,60 @@ def extract_subject_id(file_path):
     filename = os.path.basename(file_path)
     dir_path = os.path.dirname(file_path)
-    # Try to extract from folder name (common in medical datasets)
     folder_name = os.path.basename(dir_path.rstrip('/'))
     if folder_name and folder_name not in ['', '.', '..']:
         # Check if folder name looks like a subject ID
         if re.match(r'(subject|patient|sub|pat|case|id)[_-]?\d+', folder_name, re.I):
-            return folder_name
-    # Try to extract from filename
-    # Pattern: subject_001, patient_123, sub_001, case_001, etc.
     patterns = [
-        r'(subject|patient|sub|pat|case|id)[_-]?(\d+)',  # subject_001, patient_123
-        r'(\d{3,})',  # Just numbers (3+ digits) - might be subject ID
-        r'([A-Z]{2,}\d+)',  # BR001, MR123, etc.
     ]
-    for pattern in patterns:
         match = re.search(pattern, filename, re.I)
         if match:
             if len(match.groups()) > 1:
-                return f"{match.group(1)}_{match.group(2)}"
             else:
-                return match.group(1)
-    # For DICOM files, try to read PatientID or StudyInstanceUID
-    if file_path.lower().endswith('.dcm'):
-        try:
-            ds = pydicom.dcmread(file_path, stop_before_pixels=True)
-            patient_id = getattr(ds, 'PatientID', None)
-            if patient_id:
-                return f"patient_{patient_id}"
-            study_uid = getattr(ds, 'StudyInstanceUID', None)
-            if study_uid:
-                # Use last part of UID as identifier
-                return f"study_{study_uid.split('.')[-1][:8]}"
-        except:
-            pass
-    # Fallback: use filename without extension
     base_name = os.path.splitext(filename)[0]
     if len(base_name) > 0:
-        return base_name
-    return "unknown"
 def group_images_by_subject(image_files):
-    """Group image files by subject/patient ID."""
     if not image_files:
         return {}
@@ -482,17 +495,29 @@ def group_images_by_subject(image_files):
     # Filter out None files
     image_files = [f for f in image_files if f is not None]
-    # Group by subject ID
     subject_groups = {}
     for file_path in image_files:
-        subject_id = extract_subject_id(file_path)
         if subject_id not in subject_groups:
-            subject_groups[subject_id] = []
-        subject_groups[subject_id].append(file_path)
     # Sort files within each group (by filename)
     for subject_id in subject_groups:
-        subject_groups[subject_id].sort()
     return subject_groups
@@ -521,7 +546,16 @@ def process_slices_for_viewer(image_files, selected_subject, prompt_text, modali
         return None, 0, f"⚠️ Subject '{subject_id}' not found.", "No slices loaded", gr.Dropdown(choices=[], value=None), ""
     # Get files for selected subject
-    subject_files = subject_groups[subject_id]
     results = []
     status_messages = []
@@ -544,7 +578,7 @@ def process_slices_for_viewer(image_files, selected_subject, prompt_text, modali
         processed_results_cache[cache_key] = results
         max_slices = len(results) - 1
-        status = f"✅ Processed {len(results)}/{len(subject_files)} slices for {subject_id}!\nUse slider or buttons to navigate."
         slice_info = f"Slice 1/{len(results)} ({subject_id})"
         # Update subject dropdown choices

     - Filename prefix: subject_001_slice_01.png -> subject_001
     - Patient ID in filename: patient_123_slice_5.dcm -> patient_123
     - Study UID in DICOM: extract from DICOM metadata
+    Returns:
+        tuple: (subject_id, confidence_level, source)
+        confidence_level: 'high' (DICOM metadata), 'medium' (folder/filename pattern), 'low' (fallback)
+        source: 'dicom_patientid', 'dicom_study', 'folder', 'filename', 'fallback'
     """
     import re
     filename = os.path.basename(file_path)
     dir_path = os.path.dirname(file_path)
+    # HIGHEST CONFIDENCE: DICOM metadata (most reliable)
+    if file_path.lower().endswith('.dcm'):
+        try:
+            ds = pydicom.dcmread(file_path, stop_before_pixels=True)
+            patient_id = getattr(ds, 'PatientID', None)
+            if patient_id and patient_id.strip():
+                return f"patient_{patient_id}", 'high', 'dicom_patientid'
+            study_uid = getattr(ds, 'StudyInstanceUID', None)
+            if study_uid:
+                # Use full study UID as identifier (unique per study)
+                return f"study_{study_uid}", 'high', 'dicom_study'
+        except:
+            pass
+    # MEDIUM CONFIDENCE: Folder name (common in medical datasets)
     folder_name = os.path.basename(dir_path.rstrip('/'))
     if folder_name and folder_name not in ['', '.', '..']:
         # Check if folder name looks like a subject ID
         if re.match(r'(subject|patient|sub|pat|case|id)[_-]?\d+', folder_name, re.I):
+            return folder_name, 'medium', 'folder'
+    # MEDIUM CONFIDENCE: Filename pattern
     patterns = [
+        (r'(subject|patient|sub|pat|case|id)[_-]?(\d+)', 'medium'),  # subject_001, patient_123
+        (r'([A-Z]{2,}\d+)', 'medium'),  # BR001, MR123, etc.
     ]
+    for pattern, confidence in patterns:
         match = re.search(pattern, filename, re.I)
         if match:
             if len(match.groups()) > 1:
+                return f"{match.group(1)}_{match.group(2)}", confidence, 'filename'
             else:
+                return match.group(1), confidence, 'filename'
+    # LOW CONFIDENCE: Numeric pattern (could be slice number, not patient ID)
+    numeric_match = re.search(r'(\d{3,})', filename)
+    if numeric_match:
+        return numeric_match.group(1), 'low', 'filename_numeric'
+    # LOWEST CONFIDENCE: Fallback to filename
     base_name = os.path.splitext(filename)[0]
     if len(base_name) > 0:
+        return base_name, 'low', 'fallback'
+    return "unknown", 'low', 'unknown'
 def group_images_by_subject(image_files):
+    """Group image files by subject/patient ID.
+    Returns:
+        dict: {subject_id: {'files': [...], 'confidence': 'high/medium/low', 'sources': set(...)}}
+    """
     if not image_files:
         return {}
     # Filter out None files
     image_files = [f for f in image_files if f is not None]
+    # Group by subject ID and track confidence
     subject_groups = {}
     for file_path in image_files:
+        subject_id, confidence, source = extract_subject_id(file_path)
         if subject_id not in subject_groups:
+            subject_groups[subject_id] = {
+                'files': [],
+                'confidence': confidence,
+                'sources': set([source])
+            }
+        subject_groups[subject_id]['files'].append(file_path)
+        subject_groups[subject_id]['sources'].add(source)
+        # Upgrade confidence if we find high-confidence source
+        if confidence == 'high' or (confidence == 'medium' and subject_groups[subject_id]['confidence'] == 'low'):
+            subject_groups[subject_id]['confidence'] = confidence
     # Sort files within each group (by filename)
     for subject_id in subject_groups:
+        subject_groups[subject_id]['files'].sort()
+        subject_groups[subject_id]['sources'] = list(subject_groups[subject_id]['sources'])
     return subject_groups
         return None, 0, f"⚠️ Subject '{subject_id}' not found.", "No slices loaded", gr.Dropdown(choices=[], value=None), ""
     # Get files for selected subject
+    subject_info = subject_groups[subject_id]
+    subject_files = subject_info['files']
+    confidence = subject_info['confidence']
+    # Add confidence warning
+    confidence_warning = ""
+    if confidence == 'low':
+        confidence_warning = "\n⚠️ LOW CONFIDENCE: These files may not be from the same patient. Please verify!"
+    elif confidence == 'medium':
+        confidence_warning = "\n⚠️ MEDIUM CONFIDENCE: Likely same patient, but verify if critical."
     results = []
     status_messages = []
         processed_results_cache[cache_key] = results
         max_slices = len(results) - 1
+        status = f"✅ Processed {len(results)}/{len(subject_files)} slices for {subject_id}!\nUse slider or buttons to navigate.{confidence_warning}"
         slice_info = f"Slice 1/{len(results)} ({subject_id})"
         # Update subject dropdown choices