vkt1414
committed on
Commit
·
6330aeb
1
Parent(s):
c6d0240
add violin plots, several other enhancements
Browse files- filter_data_app.py +128 -57
filter_data_app.py
CHANGED
|
@@ -5,12 +5,12 @@ import pandas as pd
|
|
| 5 |
from upsetplot import UpSet
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
import polars as pl
|
|
|
|
| 8 |
|
| 9 |
# Set page configuration
|
| 10 |
st.set_page_config(layout="wide")
|
| 11 |
|
| 12 |
-
#
|
| 13 |
-
PARQUET_URL = 'https://github.com/vkt1414/idc-index-data/releases/download/0.1/qualitative_checks.parquet'
|
| 14 |
LOCAL_PARQUET_FILE = 'qual-checks-and-quant-values.parquet'
|
| 15 |
|
| 16 |
@st.cache_data
|
|
@@ -27,13 +27,23 @@ def load_data():
|
|
| 27 |
'connected_volumes',
|
| 28 |
'Volume from Voxel Summation'
|
| 29 |
]
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# Function to filter data based on user input
|
| 33 |
def filter_data(df, filters):
|
| 34 |
for col, value in filters.items():
|
| 35 |
-
if value:
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
| 37 |
return df
|
| 38 |
|
| 39 |
# Function to create an UpSet plot for failed checks
|
|
@@ -43,7 +53,7 @@ def create_upset_plot_failures(df):
|
|
| 43 |
# Treat 'pass' and null values as passing
|
| 44 |
df = df.set_index(~((df['segmentation_completeness'] == 'pass') | df['segmentation_completeness'].isnull())).set_index(~((df['laterality_check'] == 'pass') | df['laterality_check'].isnull()), append=True)
|
| 45 |
df = df.set_index(~((df['series_with_vertabra_on_every_slice'] == 'pass') | df['series_with_vertabra_on_every_slice'].isnull()), append=True)
|
| 46 |
-
df = df.set_index(~((df['connected_volumes'] == '
|
| 47 |
df = df[df.index.to_frame().any(axis=1)] # Ignore the case when all conditions are false
|
| 48 |
|
| 49 |
fig = plt.figure()
|
|
@@ -64,6 +74,13 @@ def create_upset_plot_passes(df):
|
|
| 64 |
upset.plot(fig=fig)
|
| 65 |
st.pyplot(fig)
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
# Main function to run the Streamlit app
|
| 68 |
def main():
|
| 69 |
st.title("Qualitative Checks of TotalSegmentator Segmentations on NLST")
|
|
@@ -115,13 +132,22 @@ def main():
|
|
| 115 |
# Apply the current filters to update options for other filters
|
| 116 |
filtered_df = filter_data(df, filters)
|
| 117 |
|
|
|
|
| 118 |
# Update options for other filters based on the current selection
|
| 119 |
segmentation_completeness_options = [""] + filtered_df['segmentation_completeness'].unique().to_list()
|
| 120 |
laterality_check_options = [""] + filtered_df['laterality_check'].unique().to_list()
|
| 121 |
series_with_vertabra_on_every_slice_options = [""] + filtered_df['series_with_vertabra_on_every_slice'].unique().to_list()
|
| 122 |
-
connected_volumes_options =
|
| 123 |
laterality_options = [""] + filtered_df['laterality'].unique().to_list()
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
# Add remaining filters with default values from session state
|
| 126 |
segmentation_completeness = st.selectbox(
|
| 127 |
"Segmentation Completeness",
|
|
@@ -147,62 +173,29 @@ def main():
|
|
| 147 |
on_change=lambda: apply_filter('series_with_vertabra_on_every_slice', st.session_state.series_with_vertabra_on_every_slice)
|
| 148 |
)
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
connected_volumes = st.selectbox(
|
| 151 |
-
"Connected Volumes",
|
| 152 |
-
options=connected_volumes_options,
|
| 153 |
-
index=connected_volumes_options.index(filters['connected_volumes']) if filters['connected_volumes'] else 0,
|
| 154 |
key='connected_volumes',
|
| 155 |
on_change=lambda: apply_filter('connected_volumes', st.session_state.connected_volumes)
|
| 156 |
)
|
| 157 |
|
| 158 |
-
laterality = st.selectbox(
|
| 159 |
-
"Laterality",
|
| 160 |
-
options=laterality_options,
|
| 161 |
-
index=laterality_options.index(filters['laterality']) if filters['laterality'] else 0,
|
| 162 |
-
key='laterality',
|
| 163 |
-
on_change=lambda: apply_filter('laterality', st.session_state.laterality)
|
| 164 |
-
)
|
| 165 |
-
|
| 166 |
st.session_state.filters = filters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
-
|
| 169 |
-
if page == "Summary":
|
| 170 |
-
st.header("Summary of Qualitative Checks")
|
| 171 |
-
# Execute the SQL to get summary statistics
|
| 172 |
-
summary_df = duckdb.query("""
|
| 173 |
-
WITH Checks AS (
|
| 174 |
-
SELECT
|
| 175 |
-
bodyPart,
|
| 176 |
-
laterality,
|
| 177 |
-
COUNT(*) AS total_count,
|
| 178 |
-
SUM(CASE WHEN segmentation_completeness = 'pass' THEN 1 ELSE 0 END) AS pass_count,
|
| 179 |
-
SUM(CASE WHEN laterality_check = 'pass' THEN 1 ELSE 0 END) AS laterality_pass_count,
|
| 180 |
-
SUM(CASE WHEN series_with_vertabra_on_every_slice = 'pass' THEN 1 ELSE 0 END) AS vertabra_pass_count,
|
| 181 |
-
SUM(CASE WHEN connected_volumes = 'pass' THEN 1 ELSE 0 END) AS volumes_pass_count
|
| 182 |
-
FROM
|
| 183 |
-
'qual-checks-and-quant-values.parquet'
|
| 184 |
-
GROUP BY
|
| 185 |
-
bodyPart, laterality
|
| 186 |
-
)
|
| 187 |
-
|
| 188 |
-
SELECT
|
| 189 |
-
bodyPart,
|
| 190 |
-
laterality,
|
| 191 |
-
ROUND((pass_count * 100.0) / total_count, 2) || '% (' || pass_count || '/' || total_count || ')' AS segmentation_completeness,
|
| 192 |
-
CASE WHEN laterality IS NOT NULL
|
| 193 |
-
THEN ROUND((laterality_pass_count * 100.0) / NULLIF(total_count, 0), 2) || '% (' || laterality_pass_count || '/' || total_count || ')'
|
| 194 |
-
ELSE 'N/A' END AS laterality_check,
|
| 195 |
-
ROUND((vertabra_pass_count * 100.0) / total_count, 2) || '% (' || vertabra_pass_count || '/' || total_count || ')' AS vertabra_check,
|
| 196 |
-
ROUND((volumes_pass_count * 100.0) / total_count, 2) || '% (' || volumes_pass_count || '/' || total_count || ')' AS volumes_check
|
| 197 |
-
FROM
|
| 198 |
-
Checks
|
| 199 |
-
ORDER BY
|
| 200 |
-
bodyPart, laterality;
|
| 201 |
-
""").pl()
|
| 202 |
-
summary_df = summary_df.to_pandas()
|
| 203 |
-
st.data_editor(summary_df, hide_index=True,use_container_width=True,height=1500)
|
| 204 |
-
|
| 205 |
-
elif page == "UpSet Plots":
|
| 206 |
st.header("UpSet Plots of Qualitative Checks")
|
| 207 |
|
| 208 |
# Pagination for the filtered dataframe
|
|
@@ -223,6 +216,7 @@ def main():
|
|
| 223 |
start_idx = (page_number - 1) * page_size
|
| 224 |
end_idx = min(start_idx + page_size, len(filtered_df)) # Ensure end_idx does not go beyond the dataframe length
|
| 225 |
paginated_df = filtered_df[start_idx:end_idx].to_pandas() # Convert to Pandas DataFrame
|
|
|
|
| 226 |
|
| 227 |
# Display the paginated dataframe
|
| 228 |
st.header("Filtered Data")
|
|
@@ -230,7 +224,16 @@ def main():
|
|
| 230 |
|
| 231 |
st.data_editor(
|
| 232 |
paginated_df,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
hide_index=True,
|
|
|
|
| 234 |
)
|
| 235 |
|
| 236 |
# Explanation about the UpSet plot
|
|
@@ -251,5 +254,73 @@ def main():
|
|
| 251 |
if not filtered_df.is_empty():
|
| 252 |
create_upset_plot_passes(filtered_df)
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
if __name__ == "__main__":
|
| 255 |
main()
|
|
|
|
| 5 |
from upsetplot import UpSet
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
import polars as pl
|
| 8 |
+
from polars import col, lit
|
| 9 |
|
| 10 |
# Set page configuration
|
| 11 |
st.set_page_config(layout="wide")
|
| 12 |
|
| 13 |
+
# Local path to the Parquet file
|
|
|
|
| 14 |
LOCAL_PARQUET_FILE = 'qual-checks-and-quant-values.parquet'
|
| 15 |
|
| 16 |
@st.cache_data
|
|
|
|
| 27 |
'connected_volumes',
|
| 28 |
'Volume from Voxel Summation'
|
| 29 |
]
|
| 30 |
+
df = pl.read_parquet(LOCAL_PARQUET_FILE, columns=cols)
|
| 31 |
+
df = df.with_columns([
|
| 32 |
+
pl.when(pl.col('connected_volumes') == 'pass').then(pl.lit(1)).otherwise(
|
| 33 |
+
pl.col('connected_volumes').cast(pl.Int32, strict=False)
|
| 34 |
+
).alias('connected_volumes')
|
| 35 |
+
])
|
| 36 |
+
|
| 37 |
+
return df
|
| 38 |
|
| 39 |
# Function to filter data based on user input
def filter_data(df, filters):
    """Apply the active UI filters to a polars DataFrame.

    `filters` maps a column name to the value chosen in its selectbox.
    The selectboxes use "" (string columns) or None (connected_volumes)
    as their "no filter" sentinel, so both are skipped here.

    'connected_volumes' holds numeric values and is filtered as a
    threshold (rows with connected_volumes <= value); every other
    column is matched by equality.
    """
    # NOTE: the loop variable is deliberately NOT named `col` — the file
    # does `from polars import col`, and reusing that name here would
    # shadow the import.
    for col_name, value in filters.items():
        # None and "" both mean "filter not set" in the UI; previously
        # "" slipped through `is not None` and emptied the frame via an
        # equality match against the empty string.
        if value is None or value == '':
            continue
        if col_name == 'connected_volumes':
            # Numeric threshold; the explicit not-null guard keeps null
            # counts out of the result.
            df = df.filter((pl.col(col_name) <= value) & (pl.col(col_name).is_not_null()))
        else:
            df = df.filter(pl.col(col_name) == value)
    return df
|
| 48 |
|
| 49 |
# Function to create an UpSet plot for failed checks
|
|
|
|
| 53 |
# Treat 'pass' and null values as passing
|
| 54 |
df = df.set_index(~((df['segmentation_completeness'] == 'pass') | df['segmentation_completeness'].isnull())).set_index(~((df['laterality_check'] == 'pass') | df['laterality_check'].isnull()), append=True)
|
| 55 |
df = df.set_index(~((df['series_with_vertabra_on_every_slice'] == 'pass') | df['series_with_vertabra_on_every_slice'].isnull()), append=True)
|
| 56 |
+
df = df.set_index(~((df['connected_volumes'] == '1') | df['connected_volumes'].isnull()), append=True)
|
| 57 |
df = df[df.index.to_frame().any(axis=1)] # Ignore the case when all conditions are false
|
| 58 |
|
| 59 |
fig = plt.figure()
|
|
|
|
| 74 |
upset.plot(fig=fig)
|
| 75 |
st.pyplot(fig)
|
| 76 |
|
| 77 |
+
# Function to calculate standard deviation of volumes within a patient
def calculate_std_dev(df):
    """Return the std dev of 'Volume from Voxel Summation' per patient/body part.

    `df` is a polars DataFrame; it is converted to pandas and the result
    is a pandas Series indexed by (PatientID, bodyPart).
    """
    pandas_df = df.to_pandas()
    grouped = pandas_df.groupby(['PatientID', 'bodyPart'])
    return grouped['Volume from Voxel Summation'].std()
|
| 83 |
+
|
| 84 |
# Main function to run the Streamlit app
|
| 85 |
def main():
|
| 86 |
st.title("Qualitative Checks of TotalSegmentator Segmentations on NLST")
|
|
|
|
| 132 |
# Apply the current filters to update options for other filters
|
| 133 |
filtered_df = filter_data(df, filters)
|
| 134 |
|
| 135 |
+
|
| 136 |
# Update options for other filters based on the current selection
|
| 137 |
segmentation_completeness_options = [""] + filtered_df['segmentation_completeness'].unique().to_list()
|
| 138 |
laterality_check_options = [""] + filtered_df['laterality_check'].unique().to_list()
|
| 139 |
series_with_vertabra_on_every_slice_options = [""] + filtered_df['series_with_vertabra_on_every_slice'].unique().to_list()
|
| 140 |
+
connected_volumes_options = filtered_df['connected_volumes'].unique().to_list()
|
| 141 |
laterality_options = [""] + filtered_df['laterality'].unique().to_list()
|
| 142 |
+
|
| 143 |
+
laterality = st.selectbox(
|
| 144 |
+
"Laterality",
|
| 145 |
+
options=laterality_options,
|
| 146 |
+
index=laterality_options.index(filters['laterality']) if filters['laterality'] else 0,
|
| 147 |
+
key='laterality',
|
| 148 |
+
on_change=lambda: apply_filter('laterality', st.session_state.laterality)
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
# Add remaining filters with default values from session state
|
| 152 |
segmentation_completeness = st.selectbox(
|
| 153 |
"Segmentation Completeness",
|
|
|
|
| 173 |
on_change=lambda: apply_filter('series_with_vertabra_on_every_slice', st.session_state.series_with_vertabra_on_every_slice)
|
| 174 |
)
|
| 175 |
|
| 176 |
+
# connected_volumes = st.selectbox(
|
| 177 |
+
# "Connected Volumes (<= value)",
|
| 178 |
+
# options=connected_volumes_options,
|
| 179 |
+
# index=connected_volumes_options.index(filters['connected_volumes']) if filters['connected_volumes'] else 0,
|
| 180 |
+
# key='connected_volumes',
|
| 181 |
+
# on_change=lambda: apply_filter('connected_volumes', st.session_state.connected_volumes)
|
| 182 |
+
# )
|
| 183 |
connected_volumes = st.selectbox(
|
| 184 |
+
"Connected Volumes (<= value)",
|
| 185 |
+
options=[None] + connected_volumes_options,
|
| 186 |
+
index=connected_volumes_options.index(filters['connected_volumes'])+1 if filters['connected_volumes'] else 0,
|
| 187 |
key='connected_volumes',
|
| 188 |
on_change=lambda: apply_filter('connected_volumes', st.session_state.connected_volumes)
|
| 189 |
)
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
st.session_state.filters = filters
|
| 192 |
+
|
| 193 |
+
if laterality:
|
| 194 |
+
body_part_df = df.filter((col('bodyPart') == lit(body_part)) & (col('laterality') == lit(laterality)))
|
| 195 |
+
else:
|
| 196 |
+
body_part_df = df.filter(col('bodyPart') == lit(body_part))
|
| 197 |
|
| 198 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
st.header("UpSet Plots of Qualitative Checks")
|
| 200 |
|
| 201 |
# Pagination for the filtered dataframe
|
|
|
|
| 216 |
start_idx = (page_number - 1) * page_size
|
| 217 |
end_idx = min(start_idx + page_size, len(filtered_df)) # Ensure end_idx does not go beyond the dataframe length
|
| 218 |
paginated_df = filtered_df[start_idx:end_idx].to_pandas() # Convert to Pandas DataFrame
|
| 219 |
+
paginated_df['Viewer Url'] = 'https://viewer.imaging.datacommons.cancer.gov/viewer/'+paginated_df['StudyInstanceUID']
|
| 220 |
|
| 221 |
# Display the paginated dataframe
|
| 222 |
st.header("Filtered Data")
|
|
|
|
| 224 |
|
| 225 |
st.data_editor(
|
| 226 |
paginated_df,
|
| 227 |
+
column_config={
|
| 228 |
+
"Viewer Url":st.column_config.LinkColumn("StudyInstanceUID",
|
| 229 |
+
display_text="https:\/\/viewer\.imaging\.datacommons\.cancer\.gov\/viewer\/(.*)"
|
| 230 |
+
|
| 231 |
+
),
|
| 232 |
+
|
| 233 |
+
},
|
| 234 |
+
column_order=("PatientID", "Viewer Url", "seriesNumber","bodyPart","laterality", "segmentation_completeness","laterality_check", "series_with_vertabra_on_every_slice","connected_volumes"),
|
| 235 |
hide_index=True,
|
| 236 |
+
use_container_width=True
|
| 237 |
)
|
| 238 |
|
| 239 |
# Explanation about the UpSet plot
|
|
|
|
| 254 |
if not filtered_df.is_empty():
|
| 255 |
create_upset_plot_passes(filtered_df)
|
| 256 |
|
| 257 |
+
import seaborn as sns
|
| 258 |
+
import pandas as pd
|
| 259 |
+
|
| 260 |
+
# Assuming calculate_std_dev returns a Series
|
| 261 |
+
std_dev_before_filtering = calculate_std_dev(body_part_df)
|
| 262 |
+
std_dev_after_filtering = calculate_std_dev(filtered_df)
|
| 263 |
+
|
| 264 |
+
# Convert Series to DataFrame and add 'Filtering' column
|
| 265 |
+
std_dev_before_filtering = std_dev_before_filtering.reset_index().rename(columns={0: 'Volume from Voxel Summation'})
|
| 266 |
+
std_dev_before_filtering['Filtering'] = 'Before Filtering'
|
| 267 |
+
|
| 268 |
+
std_dev_after_filtering = std_dev_after_filtering.reset_index().rename(columns={0: 'Volume from Voxel Summation'})
|
| 269 |
+
std_dev_after_filtering['Filtering'] = 'After Filtering'
|
| 270 |
+
|
| 271 |
+
# Combine the dataframes for easier plotting
|
| 272 |
+
combined_df = pd.concat([std_dev_before_filtering, std_dev_after_filtering])
|
| 273 |
+
|
| 274 |
+
# Reset the index of the DataFrame
|
| 275 |
+
combined_df = combined_df.reset_index(drop=True)
|
| 276 |
+
|
| 277 |
+
# Display violin plots for the distribution of standard deviation of volumes
|
| 278 |
+
st.header("Violin Plots for Standard Deviation of Volumes")
|
| 279 |
+
st.write("This plot shows the distribution of standard deviation of volumes within a patient.")
|
| 280 |
+
fig2, ax = plt.subplots()
|
| 281 |
+
sns.violinplot(x='Filtering', y='Volume from Voxel Summation', data=combined_df, ax=ax)
|
| 282 |
+
ax.set_ylabel("Standard Deviation of Volumes")
|
| 283 |
+
st.pyplot(fig2)
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
# Define the pages
|
| 287 |
+
if page == "Summary":
|
| 288 |
+
st.header("Summary of Qualitative Checks")
|
| 289 |
+
# Execute the SQL to get summary statistics
|
| 290 |
+
summary_df = duckdb.query("""
|
| 291 |
+
WITH Checks AS (
|
| 292 |
+
SELECT
|
| 293 |
+
bodyPart,
|
| 294 |
+
laterality,
|
| 295 |
+
COUNT(*) AS total_count,
|
| 296 |
+
SUM(CASE WHEN segmentation_completeness = 'pass' THEN 1 ELSE 0 END) AS pass_count,
|
| 297 |
+
SUM(CASE WHEN laterality_check = 'pass' THEN 1 ELSE 0 END) AS laterality_pass_count,
|
| 298 |
+
SUM(CASE WHEN series_with_vertabra_on_every_slice = 'pass' THEN 1 ELSE 0 END) AS vertabra_pass_count,
|
| 299 |
+
SUM(CASE WHEN connected_volumes = 'pass' THEN 1 ELSE 0 END) AS volumes_pass_count
|
| 300 |
+
FROM
|
| 301 |
+
'qual-checks-and-quant-values.parquet'
|
| 302 |
+
GROUP BY
|
| 303 |
+
bodyPart, laterality
|
| 304 |
+
)
|
| 305 |
+
|
| 306 |
+
SELECT
|
| 307 |
+
bodyPart,
|
| 308 |
+
laterality,
|
| 309 |
+
ROUND((pass_count * 100.0) / total_count, 2) || '% (' || pass_count || '/' || total_count || ')' AS segmentation_completeness,
|
| 310 |
+
CASE WHEN laterality IS NOT NULL
|
| 311 |
+
THEN ROUND((laterality_pass_count * 100.0) / NULLIF(total_count, 0), 2) || '% (' || laterality_pass_count || '/' || total_count || ')'
|
| 312 |
+
ELSE 'N/A' END AS laterality_check,
|
| 313 |
+
ROUND((vertabra_pass_count * 100.0) / total_count, 2) || '% (' || vertabra_pass_count || '/' || total_count || ')' AS vertabra_check,
|
| 314 |
+
ROUND((volumes_pass_count * 100.0) / total_count, 2) || '% (' || volumes_pass_count || '/' || total_count || ')' AS volumes_check
|
| 315 |
+
FROM
|
| 316 |
+
Checks
|
| 317 |
+
ORDER BY
|
| 318 |
+
bodyPart, laterality;
|
| 319 |
+
""").pl()
|
| 320 |
+
summary_df = summary_df.to_pandas()
|
| 321 |
+
st.data_editor(summary_df, hide_index=True,use_container_width=True,height=1500)
|
| 322 |
+
|
| 323 |
+
# elif page == "UpSet Plots":
|
| 324 |
+
|
| 325 |
# Script entry point: launch the Streamlit app only when this file is
# executed directly (not when imported as a module).
if __name__ == "__main__":
    main()
|