import marimo

__generated_with = "0.14.16"
app = marimo.App(
    width="medium",
    app_title="Open Syndrome Definition - Data Browser",
)


@app.cell
def _():
    # Shared imports; every other cell receives these names as arguments.
    import marimo as mo
    import json
    from pathlib import Path

    import polars as pl
    import plotly.graph_objects as go
    import yaml

    from opensyndrome.filter import OSDEngine, load_profile
    from opensyndrome.artifacts import get_definition_dir

    return (
        OSDEngine,
        Path,
        get_definition_dir,
        go,
        json,
        load_profile,
        mo,
        pl,
        yaml,
    )


@app.cell
def _(go, pl):
    def plot_cases(
        _df_filtered, definitions, date_column="date", date_format="%Y-%m-%d %H:%M:%S"
    ):
        """Return a line chart with one monthly series per definition.

        Args:
            _df_filtered: polars DataFrame holding ``date_column`` plus one
                summable column per entry of ``definitions`` (presumably the
                per-row labels added by the OSD engine -- confirm with caller).
            definitions: names of the columns to sum and plot.
            date_column: name of the string column holding the timestamps.
            date_format: strptime-style format used to parse ``date_column``.

        Returns:
            A plotly ``Figure`` with one "lines+markers" trace per definition.
        """
        _monthly_sums = [pl.col(name).sum().alias(name) for name in definitions]
        # strict=False: values that do not match the format become null instead
        # of raising, so they end up together in a null "_month" group.
        _month = (
            pl.col(date_column)
            .str.to_datetime(format=date_format, strict=False)
            .cast(pl.Date)
            .dt.truncate("1mo")
            .alias("_month")
        )
        _agg_df = (
            _df_filtered.with_columns(_month)
            .group_by("_month")
            .agg(_monthly_sums)
            .sort("_month")
        )

        _fig = go.Figure()
        for name in definitions:
            _fig.add_trace(
                go.Scatter(
                    x=_agg_df["_month"],
                    y=_agg_df[name],
                    mode="lines+markers",
                    name=name,
                )
            )
        # Integer tick labels and a y axis anchored at zero.
        _fig.update_layout(yaxis=dict(tickformat="d", rangemode="tozero"))
        return _fig

    return (plot_cases,)


@app.cell
def _(go, pl):
    def groupped_bar(_df_filtered, definitions, group_by_column="code", top_n=10):
        """Return a grouped bar chart of the ``top_n`` most frequent groups.

        Args:
            _df_filtered: polars DataFrame holding ``group_by_column`` plus one
                summable column per entry of ``definitions``.
            definitions: names of the columns to sum and plot.
            group_by_column: column whose distinct values become the x axis.
            top_n: number of groups to keep, ranked by the total over all
                definitions.

        Returns:
            A plotly ``Figure`` with one bar series per definition.
        """
        _agg_df = (
            _df_filtered.group_by(group_by_column)
            .agg([pl.col(name).sum() for name in definitions])
            .sort(group_by_column)
        )
        # Rank groups by their combined count across every selected definition.
        _agg_df = _agg_df.with_columns(
            (sum([pl.col(name) for name in definitions])).alias("total")
        )
        _agg_df = _agg_df.sort("total", descending=True).head(top_n)

        _fig = go.Figure(
            data=[
                go.Bar(name=name, x=_agg_df[group_by_column], y=_agg_df[name])
                for name in definitions
            ]
        )
        _fig.update_layout(barmode="group")
        return _fig

    return (groupped_bar,)


@app.cell
def _(mo):
    mo.md(r"""# Open Syndrome Definition 👩🏽‍🔬""")
    return


@app.cell
def _(mo):
    mo.callout(
        mo.md(
            "This is a prototype of how to filter your "
            "data using definitions from the Open Syndrome Initiative.\n\n"
            "We do not store any data."
        ),
        kind="neutral",
    )
    return


@app.cell
def _(Path):
    # Bundled examples: CSV file, its OSD mapping and the default date column.
    # The paths are relative, so they resolve against the working directory.
    EXAMPLE_DATASETS = {
        "Toy dataset": {
            "csv": Path("toy_dataset.csv"),
            "mapping": Path("mapping.yaml"),
            "date_column": "recording_ts",
        },
    }
    return (EXAMPLE_DATASETS,)


@app.cell
def _(EXAMPLE_DATASETS, mo):
    # Input widgets: choose between a bundled example and a CSV upload.
    data_source = mo.ui.radio(
        options=["Load example", "Upload your own"],
        value="Load example",
    )
    example_picker = mo.ui.dropdown(
        options=list(EXAMPLE_DATASETS.keys()),
        label="Example dataset",
    )
    sample_file = mo.ui.file(kind="area", filetypes=[".csv"])
    return data_source, example_picker, sample_file


@app.cell
def _(data_source, example_picker, mo, sample_file):
    mo.vstack(
        [
            mo.md("## Load your data"),
            data_source,
            example_picker if data_source.value == "Load example" else sample_file,
        ]
    )
    return


@app.cell
def _(EXAMPLE_DATASETS, data_source, example_picker, pl, sample_file):
    # df_selected stays None until an example is picked or a file is uploaded;
    # downstream cells check for that before using it.
    if data_source.value == "Load example":
        df_selected = (
            pl.read_csv(EXAMPLE_DATASETS[example_picker.value]["csv"])
            if example_picker.value
            else None
        )
    else:
        df_selected = pl.read_csv(sample_file.contents()) if sample_file.value else None
    return (df_selected,)


@app.cell
def _(EXAMPLE_DATASETS, data_source, example_picker):
    # Mapping template shown in the editor when the user uploads their own data.
    _default_yaml = """\
profiles:
  - name: my_dataset
    # value_encodings:   # optional — map OSD canonical values to dataset-specific ones
    #   sex:
    #     male: "M"
    #     female: "F"
    columns:
      # Rename the keys below to match your dataset column names.
      # Available concepts: diagnosis, demographic_criteria, symptom,
      #   diagnostic_test, epidemiological_history
      my_diagnosis_column:
        concept: diagnosis
        dtype: string
      # my_age_column:
      #   concept: demographic_criteria
      #   attribute: age
      #   dtype: integer
"""

    if data_source.value == "Load example" and example_picker.value:
        _example = EXAMPLE_DATASETS[example_picker.value]
        initial_yaml = _example["mapping"].read_text()
        initial_date_column = _example["date_column"]
    else:
        initial_yaml = _default_yaml
        initial_date_column = None
    return initial_date_column, initial_yaml


@app.cell
def _(df_selected, initial_date_column, initial_yaml, mo):
    mo.stop(df_selected is None)

    yaml_editor = mo.ui.code_editor(
        value=initial_yaml,
        language="yaml",
    ).form(label="Data to OSD mapping", show_clear_button=True, bordered=True)

    date_column_picker = mo.ui.dropdown(
        options=df_selected.columns,
        label="Date column",
        # Only pre-select the suggested column when the dataset really has it;
        # otherwise leave the picker empty rather than pass an unknown option.
        value=(
            initial_date_column
            if initial_date_column in df_selected.columns
            else None
        ),
    )

    date_format_input = mo.ui.text(
        value="%Y-%m-%d %H:%M:%S",
        label="Date format",
    )

    _cols_hint = "`, `".join(df_selected.columns)

    mo.vstack(
        [
            mo.md("### Mapping your data to the format"),
            mo.md(
                "Edit the YAML below to map your dataset columns to OSD concepts, "
                "then click **Submit**. "
                "Select the date column separately for the time-series view.\n\n"
                f"Your dataset columns: `{_cols_hint}`"
            ),
            mo.hstack(
                [yaml_editor, mo.vstack([date_column_picker, date_format_input])],
                widths=[3, 1],
                align="start",
            ),
        ]
    )
    return date_column_picker, date_format_input, yaml_editor


@app.cell
def _(df_selected, load_profile, mo, yaml, yaml_editor):
    # Validate the submitted mapping step by step; each failure halts the cell
    # with a danger callout instead of a traceback.
    mo.stop(yaml_editor.value is None)

    try:
        _parsed = yaml.safe_load(yaml_editor.value)
    except yaml.YAMLError as _e:
        mo.stop(True, mo.callout(mo.md(f"**Invalid YAML:** {_e}"), kind="danger"))

    # safe_load returns None for an empty document and may return a scalar or
    # a list, so the first profile is looked up defensively.
    try:
        _first_profile = _parsed["profiles"][0]
        _columns = _first_profile.get("columns")
    except (AttributeError, IndexError, KeyError, TypeError):
        mo.stop(
            True,
            mo.callout(
                mo.md(
                    "**Profile error:** the mapping needs a `profiles` list "
                    "with at least one profile"
                ),
                kind="danger",
            ),
        )

    if not _columns:
        mo.stop(
            True,
            mo.callout(mo.md("You need to map **at least one column**"), kind="danger"),
        )

    # YAML keys are not necessarily strings, hence the str() when reporting.
    _not_found = [str(name) for name in _columns if name not in df_selected.columns]
    if _not_found:
        mo.stop(
            True,
            mo.callout(
                mo.md(f"**Columns not found:** {', '.join(_not_found)}"), kind="danger"
            ),
        )

    try:
        _profile_name = _first_profile["name"]
        profile = load_profile(_parsed, _profile_name)
    except (KeyError, IndexError, ValueError) as _e:
        mo.stop(True, mo.callout(mo.md(f"**Profile error:** {_e}"), kind="danger"))
    return (profile,)


@app.cell
def _(date_column_picker, mo, profile):
    mo.stop(date_column_picker.value is None)

    date_column = date_column_picker.value
    # The first column mapped to the "diagnosis" concept drives the codes chart;
    # None when the profile maps no such column.
    _diagnosis_cols = [c.col_name for c in profile.columns if c.concept == "diagnosis"]
    code_column = _diagnosis_cols[0] if _diagnosis_cols else None
    return code_column, date_column


@app.cell
def _(get_definition_dir):
    # Definition name (file name without the .json suffix) -> path on disk.
    definition_options = {
        filepath.stem: filepath for filepath in get_definition_dir().glob("**/*.json")
    }
    return (definition_options,)


@app.cell
def _(definition_options, mo):
    definitions_dropdown = mo.ui.multiselect(
        label="Select Syndromic Indicators", options=sorted(definition_options.keys())
    )
    return (definitions_dropdown,)


@app.cell
def _(mo):
    mo.md(r"""### Data sample""")
    return


@app.cell
def _(df_selected):
    # Preview up to 10 random rows. Nothing is shown before a dataset is
    # loaded, and the sample size is capped at the frame height because
    # sampling without replacement cannot return more rows than exist.
    (
        df_selected.sample(min(10, df_selected.height))
        if df_selected is not None
        else None
    )
    return


@app.cell
def _(mo):
    mo.md(r"""---""")
    return


@app.cell
def _(mo):
    mo.md(r"""## Data & Definitions""")
    return


@app.cell
def _(definitions_dropdown, mo):
    mo.hstack([mo.md("**::lucide:filter:: Filters:**"), definitions_dropdown])
    return


@app.cell
def _(
    OSDEngine,
    definition_options,
    definitions_dropdown,
    df_selected,
    json,
    mo,
    profile,
):
    mo.stop(
        df_selected is None or df_selected.is_empty() or not definitions_dropdown.value
    )

    definitions = definitions_dropdown.value

    # skip criteria that can't be evaluated (e.g. professional_judgment)
    engine = OSDEngine(profile, skip_unresolvable=True)
    defs_dict = {
        name: json.loads(definition_options[name].read_text()) for name in definitions
    }
    df_filtered = engine.label(df_selected, defs_dict)
    return definitions, df_filtered


@app.cell
def _(definitions, df_filtered, df_selected, mo):
    mo.stop(definitions is None or df_filtered is None)

    # Summary cards: selected indicators and the size of the loaded dataset.
    _cards = [
        mo.stat(
            label="Syndromic Indicators",
            value=len(definitions),
            caption=", ".join(definitions),
            bordered=True,
        ),
        mo.stat(
            label="Rows",
            value=df_selected.shape[0],
        ),
        mo.stat(
            label="Columns",
            value=df_selected.shape[1],
        ),
    ]

    mo.hstack(_cards, widths="equal", align="center")
    return


@app.cell
def _(definition_options, json):
    def load_definition(name: str) -> dict:
        """Read and parse the JSON definition file registered under ``name``."""
        return json.loads(definition_options[name].read_text())

    return (load_definition,)


@app.cell
def _(mo):
    top_n = mo.ui.number(start=1, stop=10, label="Number of top codes", value=3, step=1)
    return (top_n,)


@app.cell
def _(
    code_column,
    date_column,
    date_format_input,
    definitions,
    df_filtered,
    df_selected,
    groupped_bar,
    mo,
    plot_cases,
    top_n,
):
    mo.stop(definitions is None or df_selected is None)

    # The codes chart needs a diagnosis column; without one only the time
    # series is shown.
    if code_column:
        diagnosis_chart = [
            mo.md("### Codes comparison per syndromic indicator"),
            top_n.left(),
            groupped_bar(
                df_filtered,
                definitions,
                # Fall back to 3 when the number input is cleared.
                top_n=top_n.value or 3,
                group_by_column=code_column,
            ),
        ]
    else:
        diagnosis_chart = []

    timeseries = [
        mo.md("### Time series"),
        plot_cases(
            df_filtered,
            definitions,
            date_column=date_column,
            date_format=date_format_input.value,
        ),
    ]

    mo.vstack(
        [
            mo.md("## Findings from the data 📊"),
            *timeseries,
            *diagnosis_chart,
        ]
    )
    return


@app.cell
def _(definitions, load_definition, mo):
    mo.stop(definitions is None)

    mo.vstack(
        [
            mo.md("### Definitions details"),
            mo.md(
                "Here the definitions used to filter the data. See here what criteria were applied. 🔎"
            ),
            mo.accordion(
                {
                    "JSONs": mo.accordion(
                        {
                            definition: mo.json(load_definition(definition))
                            for definition in definitions
                        }
                    ),
                },
            ),
        ]
    )
    return


if __name__ == "__main__":
    app.run()