"""UI utility functions for the Mosaic Gradio interface. This module provides helper functions for: - OncoTree code lookup and caching - User session directory management - Settings CSV loading and validation - Data export functionality """ import tempfile from pathlib import Path import pandas as pd import gradio as gr import requests from mosaic.data_directory import get_data_directory # This path should be outside your project directory if running locally TEMP_USER_DATA_DIR = Path(tempfile.gettempdir()) / "mosaic_user_data" IHC_SUBTYPES = ["", "HR+/HER2+", "HR+/HER2-", "HR-/HER2+", "HR-/HER2-"] SEX_OPTIONS = ["Male", "Female"] SETTINGS_COLUMNS = [ "Slide", "Site Type", "Sex", "Tissue Site", "Cancer Subtype", "IHC Subtype", "Segmentation Config", ] oncotree_code_map = {} tissue_site_list = None def get_tissue_sites(): """Get the list of tissue sites from the tissue site map file. Returns: List of tissue site names. Returns ["Unknown"] if the CSV file is not found. """ global tissue_site_list if tissue_site_list is None: try: data_dir = get_data_directory() tissue_site_map_path = data_dir / "tissue_site_original_to_idx.csv" df = pd.read_csv(tissue_site_map_path) # Get unique tissue sites and sort them tissue_site_list = ["Unknown"] + sorted(df["TISSUE_SITE"].unique().tolist()) except FileNotFoundError: gr.Warning( f"Tissue site mapping file not found at {tissue_site_map_path}. " "Only 'Unknown' option will be available for tissue site selection. " "Please ensure the data files are downloaded from the model repository." ) tissue_site_list = ["Unknown"] return tissue_site_list def get_oncotree_code_name(code): """Retrieve the human-readable name for an OncoTree code. Queries the OncoTree API to get the cancer subtype name corresponding to the given code. Results are cached to avoid repeated API calls. Args: code: OncoTree code (e.g., "LUAD", "BRCA") Returns: Human-readable cancer subtype name, or "Unknown" if not found """ global oncotree_code_map if code in oncotree_code_map.keys(): return oncotree_code_map[code] url = f"https://oncotree.mskcc.org/api/tumorTypes/search/code/{code}?exactMatch=true&version=oncotree_2025_04_08" response = requests.get(url) code_name = "Unknown" if response.status_code == 200: data = response.json() if data: code_name = data[0]["name"] oncotree_code_map[code] = code_name return code_name def create_user_directory(state, request: gr.Request): """Create a unique directory for each user session. Args: state: Gradio state object (unused) request: Gradio request object containing session hash Returns: Path to user's session directory, or None if no session hash available """ session_hash = request.session_hash if session_hash is None: return None user_dir = TEMP_USER_DATA_DIR / session_hash user_dir.mkdir(parents=True, exist_ok=True) return user_dir def load_settings(slide_csv_path): """Load slide analysis settings from CSV file. Loads the CSV and ensures all required columns are present, adding defaults for optional columns if they are missing. Args: slide_csv_path: Path to the CSV file containing slide settings Returns: DataFrame with columns: Slide, Site Type, Cancer Subtype, IHC Subtype, Segmentation Config Raises: ValueError: If required columns are missing from the CSV """ settings_df = pd.read_csv(slide_csv_path, na_filter=False) if "Segmentation Config" not in settings_df.columns: settings_df["Segmentation Config"] = "Biopsy" if "Cancer Subtype" not in settings_df.columns: settings_df["Cancer Subtype"] = "Unknown" if "IHC Subtype" not in settings_df.columns: settings_df["IHC Subtype"] = "" if "Tissue Site" not in settings_df.columns: settings_df["Tissue Site"] = "Unknown" if not set(SETTINGS_COLUMNS).issubset(settings_df.columns): raise ValueError("Missing required column in CSV file") settings_df = settings_df[SETTINGS_COLUMNS] return settings_df def validate_settings( settings_df, cancer_subtype_name_map, cancer_subtypes, reversed_cancer_subtype_name_map, ): """Validate and normalize slide analysis settings. Checks each row for valid values and normalizes cancer subtype names. Generates warnings for invalid entries and replaces them with defaults. Args: settings_df: DataFrame with slide settings to validate cancer_subtype_name_map: Dict mapping subtype display names to codes cancer_subtypes: List of valid cancer subtype codes reversed_cancer_subtype_name_map: Dict mapping codes to display names Returns: Validated DataFrame with normalized values Note: Invalid entries are replaced with defaults and warnings are displayed to the user via Gradio warnings. """ settings_df.columns = SETTINGS_COLUMNS warnings = [] tissue_sites = get_tissue_sites() for idx, row in settings_df.iterrows(): slide_name = row["Slide"] subtype = row["Cancer Subtype"] if subtype in cancer_subtypes: settings_df.at[idx, "Cancer Subtype"] = reversed_cancer_subtype_name_map[ subtype ] if settings_df.at[idx, "Cancer Subtype"] not in cancer_subtype_name_map.keys(): warnings.append( f"Slide {slide_name}: Unknown cancer subtype. Valid subtypes are: {', '.join(cancer_subtype_name_map.keys())}. " ) settings_df.at[idx, "Cancer Subtype"] = "Unknown" if row["Site Type"] not in ["Metastatic", "Primary"]: warnings.append( f"Slide {slide_name}: Unknown site type. Valid types are: Metastatic, Primary. " ) settings_df.at[idx, "Site Type"] = "Primary" # Only warn about invalid sex values that are not empty/None # Empty/None will be validated at analysis time # Convert old "Unknown" values to empty string silently sex_value = row["Sex"] if sex_value == "Unknown": settings_df.at[idx, "Sex"] = "" elif sex_value and sex_value not in SEX_OPTIONS: warnings.append( f"Slide {slide_name}: Invalid sex value '{sex_value}'. Valid options are: {', '.join(SEX_OPTIONS)}. " ) settings_df.at[idx, "Sex"] = "" if row["Tissue Site"] not in tissue_sites: warnings.append( f"Slide {slide_name}: Unknown tissue site. Valid tissue sites are: {', '.join(tissue_sites)}. " ) settings_df.at[idx, "Tissue Site"] = "Unknown" if ( "Breast" not in settings_df.at[idx, "Cancer Subtype"] and row["IHC Subtype"] != "" ): warnings.append( f"Slide {slide_name}: IHC subtype should be empty for non-breast cancer subtypes. " ) settings_df.at[idx, "IHC Subtype"] = "" if row["IHC Subtype"] not in IHC_SUBTYPES: warnings.append( f"Slide {slide_name}: Unknown IHC subtype. Valid subtypes are: {', '.join(IHC_SUBTYPES)}. " ) settings_df.at[idx, "IHC Subtype"] = "" if row["Segmentation Config"] not in ["Biopsy", "Resection", "TCGA"]: warnings.append( f"Slide {slide_name}: Unknown segmentation config. Valid configs are: Biopsy, Resection, TCGA. " ) settings_df.at[idx, "Segmentation Config"] = "Biopsy" if warnings: gr.Warning("\n".join(warnings)) return settings_df def export_to_csv(df): """Export a DataFrame to CSV file for download. Args: df: DataFrame to export Returns: Path to the exported CSV file Raises: gr.Error: If the DataFrame is None or empty """ if df is None or df.empty: raise gr.Error("No data to export.") csv_path = "paladin_results.csv" df.to_csv(csv_path, index=False) return csv_path