elephmind-api / dicom_processor.py
zousko-stark's picture
Upload folder using huggingface_hub
a29fdb5 verified
raw
history blame
4.93 kB
import pydicom
import logging
import hashlib
from typing import Tuple, Dict, Any, Optional
from pathlib import Path
import os
import io
logger = logging.getLogger(__name__)

# DICOM keywords a file must carry to be accepted by validate_dicom().
REQUIRED_TAGS = [
    "PatientID",
    "StudyInstanceUID",
    "SeriesInstanceUID",
    "Modality",
    "PixelSpacing",  # Crucial for measurements
    # "ImageOrientationPatient" is often missing in simple CR/DX,
    # but critical for CT/MRI, so it is not enforced here.
]

# DICOM keywords holding PHI whose values are overwritten by anonymize_dicom().
# NOTE(review): this is a short list -- confirm it covers every identifying
# attribute your deployment must scrub.
PHI_TAGS = [
    "PatientName",
    "PatientBirthDate",
    "PatientAddress",
    "InstitutionName",
    "ReferringPhysicianName",
]
def validate_dicom(file_bytes: bytes) -> pydicom.dataset.FileDataset:
    """
    Strictly validate an in-memory DICOM file and return the parsed dataset.

    Args:
        file_bytes: Raw contents of the file to check.

    Returns:
        The dataset produced by ``pydicom.dcmread``, pixel data included.

    Raises:
        ValueError: If the bytes cannot be parsed as DICOM, if any keyword in
            ``REQUIRED_TAGS`` is absent, or if ``PixelData`` is absent.
    """
    try:
        # 1. Parse the complete file. Pixel data is read on purpose
        #    (stop_before_pixels=False): step 3 below requires it.
        ds = pydicom.dcmread(io.BytesIO(file_bytes), stop_before_pixels=False)
    except Exception as e:
        # Chain the parser error so the root cause stays in the traceback.
        raise ValueError(f"Invalid DICOM format: {e}") from e

    # 2. Check mandatory tags
    missing_tags = [tag for tag in REQUIRED_TAGS if tag not in ds]
    if missing_tags:
        # Modality-specific relaxation could go here, but strict for now
        raise ValueError(f"Missing critical DICOM tags: {missing_tags}")

    # 3. Check pixel data presence
    if 'PixelData' not in ds:
        raise ValueError("DICOM file has no image data (PixelData missing).")

    return ds
def anonymize_dicom(ds: pydicom.dataset.FileDataset) -> pydicom.dataset.FileDataset:
    """
    Remove PHI from the dataset.

    The dataset is modified in place and the same object is returned.

    * ``PatientID`` is replaced by ``ANON-`` plus the first 16 upper-case hex
      digits of the SHA-256 of the original ID, so the same patient keeps
      the same anonymous ID.
    * Every keyword in ``PHI_TAGS`` that is present is overwritten: date/time
      elements are blanked, all others are set to ``"ANONYMIZED"``.

    Args:
        ds: Parsed DICOM dataset.

    Returns:
        The same dataset, anonymized.
    """
    # Value representations that only accept formatted date/time strings;
    # writing free text such as "ANONYMIZED" into them yields an invalid
    # element, so they are emptied instead.
    temporal_vrs = ("DA", "DT", "TM")

    # Hash PatientID to keep a linkable anonymous ID.
    # NOTE(review): an unsalted, truncated hash of a low-entropy ID can be
    # reversed by enumeration -- consider a keyed hash if that matters here.
    original_id = str(ds.get('PatientID', 'Unknown'))
    hashed_id = hashlib.sha256(original_id.encode()).hexdigest()[:16].upper()
    ds.PatientID = f"ANON-{hashed_id}"

    # Wipe other fields
    for tag in PHI_TAGS:
        if tag not in ds:
            continue
        element = ds.data_element(tag)
        element.value = "" if element.VR in temporal_vrs else "ANONYMIZED"

    return ds
def process_dicom_upload(file_bytes: bytes, username: str) -> Tuple[bytes, Dict[str, Any]]:
    """
    Main gateway function: validate -> anonymize -> return bytes & metadata.

    Args:
        file_bytes: Raw contents of the uploaded file.
        username: Name of the uploading user. Currently not used by this
            function; kept so existing callers keep working.

    Returns:
        A ``(safe_bytes, metadata)`` tuple: the anonymized dataset
        re-serialized to bytes, and a dict with the keys ``modality``,
        ``body_part``, ``study_uid``, ``series_uid``, ``pixel_spacing`` and
        ``original_filename_hint``.

    Raises:
        ValueError: If validation fails; the message is prefixed with
            ``DICOM Rejected:`` and the original error is chained.
    """
    # 1. Validate
    try:
        ds = validate_dicom(file_bytes)
    except Exception as e:
        logger.error("DICOM Validation Failed: %s", e)
        raise ValueError(f"DICOM Rejected: {e}") from e

    # 2. Anonymize
    ds = anonymize_dicom(ds)

    # 3. Extract safe metadata for indexing
    raw_spacing = ds.get("PixelSpacing", [1.0, 1.0])
    try:
        # Plain floats, so the metadata does not carry pydicom value objects.
        pixel_spacing = [float(v) for v in raw_spacing]
    except (TypeError, ValueError):
        # Empty or malformed spacing: pass the raw value through unchanged.
        pixel_spacing = raw_spacing

    metadata = {
        "modality": ds.get("Modality", "Unknown"),
        "body_part": ds.get("BodyPartExamined", "Unknown"),
        "study_uid": str(ds.get("StudyInstanceUID", "")),
        "series_uid": str(ds.get("SeriesInstanceUID", "")),
        "pixel_spacing": pixel_spacing,
        "original_filename_hint": "dicom_file.dcm",  # We generally lose original filename in API
    }

    # 4. Convert back to bytes for storage -- the ANONYMIZED version is saved
    with io.BytesIO() as buffer:
        ds.save_as(buffer)
        safe_bytes = buffer.getvalue()

    return safe_bytes, metadata
def convert_dicom_to_image(ds: pydicom.dataset.FileDataset) -> Any:
    """
    Convert a DICOM dataset to a PIL image for inference.

    The pixel array is rescaled with ``RescaleSlope`` / ``RescaleIntercept``
    (defaults 1 and 0), min-max normalized to 0-255, and converted to 8-bit.
    ``MONOCHROME1`` images are inverted so that low values render dark.
    Window center/width tags are NOT applied; the full value range is used.

    Args:
        ds: Parsed DICOM dataset that includes pixel data.

    Returns:
        A ``PIL.Image.Image``. 2-D arrays are converted to ``"RGB"``; arrays
        of any other rank are passed to ``Image.fromarray`` unchanged.

    Raises:
        ValueError: If decoding or conversion fails for any reason; the
            underlying error is chained.
    """
    import numpy as np
    from PIL import Image

    try:
        # Start with raw pixel array
        pixel_array = ds.pixel_array.astype(float)

        # Rescale slope/intercept (Hounsfield Units). Coerce to float so an
        # empty or decimal-typed element does not break the array maths.
        slope = getattr(ds, 'RescaleSlope', 1)
        intercept = getattr(ds, 'RescaleIntercept', 0)
        slope = 1.0 if slope is None else float(slope)
        intercept = 0.0 if intercept is None else float(intercept)
        pixel_array = (pixel_array * slope) + intercept

        # Normalize to 0-255 for standard vision models (unless the model
        # expects HU). For models trained on PNGs, 0-255 is safe.
        # TODO: use WindowCenter / WindowWidth when available.
        pixel_min = np.min(pixel_array)
        pixel_max = np.max(pixel_array)
        if pixel_max - pixel_min != 0:
            pixel_array = ((pixel_array - pixel_min) / (pixel_max - pixel_min)) * 255.0
            # MONOCHROME1 stores "minimum value = white"; flip it so the
            # output follows the usual dark-is-low convention.
            if getattr(ds, 'PhotometricInterpretation', '') == 'MONOCHROME1':
                pixel_array = 255.0 - pixel_array
        else:
            # Flat image: avoid dividing by zero.
            pixel_array = np.zeros_like(pixel_array)

        pixel_array = pixel_array.astype(np.uint8)

        # Handle color space (monochrome usually)
        if len(pixel_array.shape) == 2:
            image = Image.fromarray(pixel_array).convert("RGB")
        else:
            image = Image.fromarray(pixel_array)  # RGB already?

        return image
    except Exception as e:
        logger.error("DICOM Conversion Error: %s", e)
        raise ValueError(f"Could not convert DICOM to image: {e}") from e