# raylim's picture
# Handle legacy 'Unknown' sex values gracefully
# 6bb43ff unverified
"""UI utility functions for the Mosaic Gradio interface.
This module provides helper functions for:
- OncoTree code lookup and caching
- User session directory management
- Settings CSV loading and validation
- Data export functionality
"""
import tempfile
from pathlib import Path
import pandas as pd
import gradio as gr
import requests
from mosaic.data_directory import get_data_directory
# Root folder under the system temp directory that holds one sub-folder per
# user session. This path should be outside your project directory if running
# locally.
TEMP_USER_DATA_DIR = Path(tempfile.gettempdir()).joinpath("mosaic_user_data")

# Accepted IHC subtype values; the empty string is the default used when no
# IHC subtype is set.
IHC_SUBTYPES = [
    "",
    "HR+/HER2+",
    "HR+/HER2-",
    "HR-/HER2+",
    "HR-/HER2-",
]

# Accepted values for the "Sex" settings column.
SEX_OPTIONS = [
    "Male",
    "Female",
]

# Column names, in order, of the slide settings table.
SETTINGS_COLUMNS = [
    "Slide",
    "Site Type",
    "Sex",
    "Tissue Site",
    "Cancer Subtype",
    "IHC Subtype",
    "Segmentation Config",
]

# Cache of OncoTree code -> name lookups, filled by get_oncotree_code_name().
oncotree_code_map = {}

# Cached tissue site list; stays None until get_tissue_sites() first loads it.
tissue_site_list = None
def get_tissue_sites():
    """Return the selectable tissue sites, loading them on first use.

    The list is read once from ``tissue_site_original_to_idx.csv`` inside the
    directory returned by ``get_data_directory()`` and then cached in the
    module-level ``tissue_site_list``; later calls return the cached list.

    Returns:
        ``["Unknown"]`` followed by the sorted unique values of the CSV's
        ``TISSUE_SITE`` column. If a ``FileNotFoundError`` occurs while
        loading, a Gradio warning is shown and ``["Unknown"]`` is returned
        (and cached).
    """
    global tissue_site_list
    if tissue_site_list is not None:
        return tissue_site_list

    # Bind the name before the try block: the except handler formats it, and
    # it would otherwise be unbound if FileNotFoundError is raised before the
    # path has been built (e.g. by get_data_directory() itself).
    tissue_site_map_path = None
    try:
        tissue_site_map_path = (
            get_data_directory() / "tissue_site_original_to_idx.csv"
        )
        df = pd.read_csv(tissue_site_map_path)
        # "Unknown" always comes first, followed by the sorted unique sites.
        tissue_site_list = ["Unknown"] + sorted(
            df["TISSUE_SITE"].unique().tolist()
        )
    except FileNotFoundError:
        location = (
            tissue_site_map_path
            if tissue_site_map_path is not None
            else "<data directory unavailable>"
        )
        gr.Warning(
            f"Tissue site mapping file not found at {location}. "
            "Only 'Unknown' option will be available for tissue site selection. "
            "Please ensure the data files are downloaded from the model repository."
        )
        tissue_site_list = ["Unknown"]
    return tissue_site_list
def get_oncotree_code_name(code, timeout=10):
    """Retrieve the human-readable name for an OncoTree code.

    Queries the OncoTree API for the cancer subtype name corresponding to the
    given code. Every answer obtained from an HTTP response is cached in the
    module-level ``oncotree_code_map`` to avoid repeated API calls.

    Args:
        code: OncoTree code (e.g., "LUAD", "BRCA")
        timeout: Seconds to wait for the OncoTree API before giving up.

    Returns:
        Human-readable cancer subtype name, or "Unknown" if the code is not
        found, the API answers with a non-200 status, or the request fails.
    """
    if code in oncotree_code_map:
        return oncotree_code_map[code]

    url = (
        "https://oncotree.mskcc.org/api/tumorTypes/search/code/"
        f"{code}?exactMatch=true&version=oncotree_2025_04_08"
    )
    try:
        # A timeout keeps a stalled API call from blocking the caller forever.
        response = requests.get(url, timeout=timeout)
        data = response.json() if response.status_code == 200 else None
    except (requests.RequestException, ValueError):
        # Connection error, timeout, or a body that is not valid JSON. The
        # result is deliberately not cached so a later call can retry.
        return "Unknown"

    code_name = "Unknown"
    if data:
        code_name = data[0]["name"]
    oncotree_code_map[code] = code_name
    return code_name
def create_user_directory(state, request: gr.Request):
    """Ensure a per-session working directory exists and return it.

    The directory is ``TEMP_USER_DATA_DIR/<session hash>``; it is created
    together with any missing parents, and an existing directory is reused.

    Args:
        state: Gradio state object (unused)
        request: Gradio request object containing session hash

    Returns:
        Path to user's session directory, or None if no session hash available
    """
    session_id = request.session_hash
    if session_id is not None:
        session_dir = TEMP_USER_DATA_DIR / session_id
        session_dir.mkdir(parents=True, exist_ok=True)
        return session_dir
    # Without a session hash there is nothing to key the directory on.
    return None
def load_settings(slide_csv_path):
    """Load slide analysis settings from CSV file.

    Reads the CSV with NA filtering disabled, so empty cells stay as empty
    strings. Columns that have a default are added when absent; every other
    column listed in ``SETTINGS_COLUMNS`` must already be present in the file.

    Args:
        slide_csv_path: Path to the CSV file containing slide settings

    Returns:
        DataFrame restricted to the columns in ``SETTINGS_COLUMNS``, in that
        order. Any extra columns in the CSV are dropped.

    Raises:
        ValueError: If required columns are missing from the CSV. The message
            lists the missing column names.
    """
    settings_df = pd.read_csv(slide_csv_path, na_filter=False)

    # Columns that may be omitted from the CSV, with the value used to fill them.
    optional_defaults = {
        "Segmentation Config": "Biopsy",
        "Cancer Subtype": "Unknown",
        "IHC Subtype": "",
        "Tissue Site": "Unknown",
    }
    for column, default in optional_defaults.items():
        if column not in settings_df.columns:
            settings_df[column] = default

    missing = [
        column for column in SETTINGS_COLUMNS if column not in settings_df.columns
    ]
    if missing:
        # Name the offending columns so the user can fix the file directly.
        raise ValueError(
            f"Missing required column in CSV file: {', '.join(missing)}"
        )
    return settings_df[SETTINGS_COLUMNS]
def validate_settings(
    settings_df,
    cancer_subtype_name_map,
    cancer_subtypes,
    reversed_cancer_subtype_name_map,
):
    """Validate and normalize slide analysis settings.

    Renames the DataFrame's columns to ``SETTINGS_COLUMNS``, then checks each
    row for valid values. Invalid entries are replaced with defaults and one
    warning per problem is collected; all warnings are shown together in a
    single Gradio warning at the end.

    Args:
        settings_df: DataFrame with slide settings to validate
        cancer_subtype_name_map: Dict mapping subtype display names to codes
        cancer_subtypes: List of valid cancer subtype codes
        reversed_cancer_subtype_name_map: Dict mapping codes to display names

    Returns:
        The same DataFrame object, modified in place, with normalized values

    Note:
        A "Sex" value of "Unknown" is converted to an empty string without a
        warning. Empty sex values are left untouched here.
    """
    valid_site_types = ["Metastatic", "Primary"]
    valid_segmentation_configs = ["Biopsy", "Resection", "TCGA"]

    settings_df.columns = SETTINGS_COLUMNS
    warnings = []
    tissue_sites = get_tissue_sites()

    for idx, row in settings_df.iterrows():
        slide_name = row["Slide"]

        # A subtype given as a code is stored under its display name.
        subtype_name = row["Cancer Subtype"]
        if subtype_name in cancer_subtypes:
            subtype_name = reversed_cancer_subtype_name_map[subtype_name]
            settings_df.at[idx, "Cancer Subtype"] = subtype_name
        if subtype_name not in cancer_subtype_name_map:
            warnings.append(
                f"Slide {slide_name}: Unknown cancer subtype. Valid subtypes are: {', '.join(cancer_subtype_name_map.keys())}. "
            )
            subtype_name = "Unknown"
            settings_df.at[idx, "Cancer Subtype"] = subtype_name

        if row["Site Type"] not in valid_site_types:
            warnings.append(
                f"Slide {slide_name}: Unknown site type. Valid types are: Metastatic, Primary. "
            )
            settings_df.at[idx, "Site Type"] = "Primary"

        # Only warn about invalid sex values that are not empty/None
        # Empty/None will be validated at analysis time
        # Convert old "Unknown" values to empty string silently
        sex_value = row["Sex"]
        if sex_value == "Unknown":
            settings_df.at[idx, "Sex"] = ""
        elif sex_value and sex_value not in SEX_OPTIONS:
            warnings.append(
                f"Slide {slide_name}: Invalid sex value '{sex_value}'. Valid options are: {', '.join(SEX_OPTIONS)}. "
            )
            settings_df.at[idx, "Sex"] = ""

        if row["Tissue Site"] not in tissue_sites:
            warnings.append(
                f"Slide {slide_name}: Unknown tissue site. Valid tissue sites are: {', '.join(tissue_sites)}. "
            )
            settings_df.at[idx, "Tissue Site"] = "Unknown"

        # The two IHC checks are mutually exclusive: once a non-breast row has
        # had its IHC value cleared, checking the stale original value again
        # would only add a second, redundant warning for the same field.
        ihc_subtype = row["IHC Subtype"]
        if "Breast" not in subtype_name and ihc_subtype != "":
            warnings.append(
                f"Slide {slide_name}: IHC subtype should be empty for non-breast cancer subtypes. "
            )
            settings_df.at[idx, "IHC Subtype"] = ""
        elif ihc_subtype not in IHC_SUBTYPES:
            warnings.append(
                f"Slide {slide_name}: Unknown IHC subtype. Valid subtypes are: {', '.join(IHC_SUBTYPES)}. "
            )
            settings_df.at[idx, "IHC Subtype"] = ""

        if row["Segmentation Config"] not in valid_segmentation_configs:
            warnings.append(
                f"Slide {slide_name}: Unknown segmentation config. Valid configs are: Biopsy, Resection, TCGA. "
            )
            settings_df.at[idx, "Segmentation Config"] = "Biopsy"

    if warnings:
        gr.Warning("\n".join(warnings))
    return settings_df
def export_to_csv(df):
    """Export a DataFrame to CSV file for download.

    Each call writes ``paladin_results.csv`` into a newly created temporary
    directory, so exports from different sessions (or repeated exports) never
    overwrite one another and the process working directory is not written to.

    Args:
        df: DataFrame to export

    Returns:
        Path (as a string) to the exported CSV file

    Raises:
        gr.Error: If the DataFrame is None or empty
    """
    if df is None or df.empty:
        raise gr.Error("No data to export.")
    # A unique directory per export keeps the file name stable while avoiding
    # collisions between concurrent exports.
    export_dir = Path(tempfile.mkdtemp(prefix="mosaic_export_"))
    csv_path = str(export_dir / "paladin_results.csv")
    df.to_csv(csv_path, index=False)
    return csv_path