"""Tests for the experiment-log append helpers and the CLIs' ``--experiment-log-path`` option."""

from __future__ import annotations

import json
from pathlib import Path

import pandas as pd

from sepsis_mcp import gossis_experiment, gossis_model_swap_analysis, mimic4_experiment, score_design_ablation
from sepsis_mcp.experiment_log import (
    DEFAULT_EXPERIMENT_LOG_PATH,
    append_gossis_output_entry,
    append_mimic_output_entry,
    append_non_clinical_output_entry,
    append_run_entry,
    append_summary_block,
    should_auto_append_experiment_log,
)

# Column layouts of the summary CSVs that the tests fabricate as fixtures.
_GOSSIS_SUMMARY_COLUMNS = (
    "method",
    "empirical_coverage",
    "max_group_coverage_gap",
    "average_set_size",
)
_REPEATED_SUMMARY_COLUMNS = (
    "method",
    "empirical_coverage_mean",
    "max_group_coverage_gap_mean",
    "average_set_size_mean",
)
_NON_CLINICAL_SUMMARY_COLUMNS = (
    "method",
    "coverage_mean",
    "max_gap_mean",
    "set_size_mean",
)
_SWAP_AGGREGATE_COLUMNS = (
    "calibrate_model",
    "deploy_model",
    "grouping",
    "mean_gap",
)


def _dump_table(columns, records, target: Path) -> Path:
    """Write ``records`` (tuples aligned with ``columns``) to ``target`` as an index-free CSV.

    Returns ``target`` so callers can reuse the path in a fake runner's result.
    """
    frame = pd.DataFrame([dict(zip(columns, record)) for record in records])
    frame.to_csv(target, index=False)
    return target


def _dump_json(payload, target: Path) -> None:
    """Serialize ``payload`` with ``json.dumps`` and write it to ``target`` as UTF-8."""
    target.write_text(json.dumps(payload), encoding="utf-8")


def _read_log(log_path: Path) -> str:
    """Return the UTF-8 text of the experiment log at ``log_path``."""
    return log_path.read_text(encoding="utf-8")


def test_append_run_entry_and_summary_are_idempotent(tmp_path: Path) -> None:
    """Two run entries for the same result and two identical summaries each land once."""
    log_path = tmp_path / "EXPERIMENT_LOG.md"
    result_file = tmp_path / "result.csv"
    result_file.write_text("ok\n", encoding="utf-8")

    # Only the title differs between the two run entries; everything else is
    # rebuilt fresh on each iteration.
    for run_title in ("Smoke run", "Smoke run duplicate"):
        append_run_entry(
            log_path,
            title=run_title,
            config_command="cmd",
            result_file=result_file,
            setup_note="note",
            core_result="core",
            table_headers=["Method", "Coverage"],
            table_rows=[["standard", "0.900"]],
            interpretation="interp",
            method_takeaway="takeaway",
        )

    for _ in range(2):
        append_summary_block(
            log_path,
            title="Smoke Summary",
            claim_status="validated",
            bullets=["a"],
        )

    logged = _read_log(log_path)
    assert logged.count("## Run #") == 1
    assert logged.count("## Smoke Summary") == 1
    assert "src/sepsis_mcp" in logged


def test_default_root_log_is_skipped_under_pytest(monkeypatch) -> None:
    """With ``PYTEST_CURRENT_TEST`` set, only the default log path is refused."""
    monkeypatch.setenv("PYTEST_CURRENT_TEST", "tests/test_experiment_log.py::fake")

    default_allowed = should_auto_append_experiment_log(DEFAULT_EXPERIMENT_LOG_PATH)
    assert default_allowed is False

    custom_allowed = should_auto_append_experiment_log(Path("/tmp/custom-log.md"))
    assert custom_allowed is True


def test_append_gossis_output_entry_records_classification_run(tmp_path: Path) -> None:
    """A GOSSIS output directory is rendered as a classification-run log entry."""
    output_dir = tmp_path / "gossis"
    output_dir.mkdir()
    _dump_table(
        _GOSSIS_SUMMARY_COLUMNS,
        [
            ("standard", 0.89, 0.05, 0.95),
            ("missingness_aware", 0.90, 0.03, 0.96),
        ],
        output_dir / "overall_summary.csv",
    )
    run_config = {
        "random_state": 3,
        "model_type": "xgboost",
        "target_column": None,
        "enable_external_baselines": False,
        "enable_learned_partition": False,
        "enable_within_group_weighted_missingness": True,
    }
    _dump_json({"config": run_config}, output_dir / "metrics.json")

    log_path = tmp_path / "EXPERIMENT_LOG.md"
    append_gossis_output_entry(log_path, output_dir)

    logged = _read_log(log_path)
    assert "GOSSIS classification run" in logged
    assert "within-group weighted CP" in logged
    assert "missingness_aware" in logged


def test_append_mimic_output_entry_records_leave_one_out_run(tmp_path: Path) -> None:
    """A MIMIC output directory is rendered as a leave-one-unit-out log entry."""
    output_dir = tmp_path / "mimic"
    output_dir.mkdir()
    _dump_table(
        _REPEATED_SUMMARY_COLUMNS,
        [
            ("standard", 0.89, 0.06, 0.94),
            ("missingness_aware", 0.91, 0.03, 0.95),
        ],
        output_dir / "repeated_summary.csv",
    )
    _dump_json({"model_type": "xgboost", "rotations": 3}, output_dir / "config.json")

    log_path = tmp_path / "EXPERIMENT_LOG.md"
    append_mimic_output_entry(log_path, output_dir)

    logged = _read_log(log_path)
    assert "MIMIC-IV leave-one-unit-out run" in logged
    assert "missingness_aware" in logged


def test_append_non_clinical_output_entry_records_dataset_run(tmp_path: Path) -> None:
    """A non-clinical output directory is logged together with its dataset name."""
    output_dir = tmp_path / "airquality"
    output_dir.mkdir()
    _dump_table(
        _NON_CLINICAL_SUMMARY_COLUMNS,
        [
            ("standard", 0.89, 0.06, 0.94),
            ("missingness_aware", 0.91, 0.03, 0.95),
        ],
        output_dir / "summary.csv",
    )
    _dump_json({"dataset": "airquality", "seeds": 2}, output_dir / "config.json")

    log_path = tmp_path / "EXPERIMENT_LOG.md"
    append_non_clinical_output_entry(log_path, output_dir)

    logged = _read_log(log_path)
    assert "Non-clinical experiment" in logged
    assert "airquality" in logged
    assert "missingness_aware" in logged


def test_gossis_main_appends_experiment_log(tmp_path: Path, monkeypatch) -> None:
    """``gossis_experiment.main`` writes a log entry to the path given on the CLI."""
    output_dir = tmp_path / "gossis-auto"
    log_path = tmp_path / "EXPERIMENT_LOG.md"

    def fake_run(config):
        # Stand-in for the real experiment: emit only the artefacts that
        # this test places in the output directory.
        output_dir.mkdir(parents=True, exist_ok=True)
        summary_path = _dump_table(
            _GOSSIS_SUMMARY_COLUMNS,
            [("standard", 0.88, 0.06, 0.94)],
            output_dir / "overall_summary.csv",
        )
        _dump_json(
            {"config": {"random_state": config.random_state, "model_type": config.model_type}},
            output_dir / "metrics.json",
        )
        return {"overall_summary_path": summary_path}

    monkeypatch.setattr(gossis_experiment, "run_gossis_experiment", fake_run)

    cli_args = [
        "--data-root",
        str(tmp_path),
        "--output-dir",
        str(output_dir),
        "--experiment-log-path",
        str(log_path),
    ]
    gossis_experiment.main(cli_args)

    assert "GOSSIS classification run" in _read_log(log_path)


def test_mimic_main_appends_experiment_log(tmp_path: Path, monkeypatch) -> None:
    """``mimic4_experiment.main`` writes a log entry to the path given on the CLI."""
    output_dir = tmp_path / "mimic-auto"
    log_path = tmp_path / "EXPERIMENT_LOG.md"

    def fake_run(**kwargs):
        # Stand-in for the real experiment: only the repeated summary is
        # produced (no config.json).
        output_dir.mkdir(parents=True, exist_ok=True)
        summary_path = _dump_table(
            _REPEATED_SUMMARY_COLUMNS,
            [("standard", 0.89, 0.06, 0.94)],
            output_dir / "repeated_summary.csv",
        )
        return {"repeated_summary": summary_path}

    monkeypatch.setattr(mimic4_experiment, "run_mimic4_leave_one_out", fake_run)

    cli_args = [
        "--csv-path",
        str(tmp_path / "mimic.csv"),
        "--output-dir",
        str(output_dir),
        "--experiment-log-path",
        str(log_path),
    ]
    mimic4_experiment.main(cli_args)

    assert "MIMIC-IV leave-one-unit-out run" in _read_log(log_path)


def test_model_swap_and_score_design_mains_append_experiment_log(tmp_path: Path, monkeypatch) -> None:
    """The model-swap and score-design CLIs both append to one shared log file."""
    swap_output = tmp_path / "swap"
    score_output = tmp_path / "score"
    log_path = tmp_path / "EXPERIMENT_LOG.md"

    def fake_swap_run(config):
        # Stand-in for the model-swap experiment: emit the aggregate CSV only.
        swap_output.mkdir(parents=True, exist_ok=True)
        aggregate_path = _dump_table(
            _SWAP_AGGREGATE_COLUMNS,
            [
                ("xgboost", "logistic_regression", "coverage_gap_variable", 0.05),
                (
                    "xgboost",
                    "logistic_regression",
                    "coverage_gap_variable_within_group_weighted",
                    0.04,
                ),
            ],
            swap_output / "swap_aggregate.csv",
        )
        return {"swap_aggregate_path": aggregate_path}

    def fake_score_run(**kwargs):
        # Stand-in for the score-design ablation: emit summary CSV and config.
        score_output.mkdir(parents=True, exist_ok=True)
        summary_path = _dump_table(
            _REPEATED_SUMMARY_COLUMNS,
            [
                ("original_standard", 0.89, 0.05, 0.94),
                ("original_mondrian", 0.90, 0.03, 0.95),
                ("scaled_global_missing_rate_mondrian", 0.91, 0.02, 0.96),
                ("scaled_selected_variable_standard", 0.90, 0.03, 0.95),
                ("znorm_quantile_mondrian", 0.90, 0.03, 0.95),
                ("recalibrated_top5_ks_mondrian", 0.89, 0.04, 0.95),
            ],
            score_output / "repeated_summary.csv",
        )
        _dump_json(
            {"model_type": "xgboost", "random_state_grid": [0, 1]},
            score_output / "config.json",
        )
        return {"repeated_summary_path": summary_path}

    monkeypatch.setattr(gossis_model_swap_analysis, "run_model_swap_experiment", fake_swap_run)
    monkeypatch.setattr(score_design_ablation, "run_gossis_score_design_ablation", fake_score_run)

    swap_args = [
        "--data-root",
        str(tmp_path),
        "--output-dir",
        str(swap_output),
        "--experiment-log-path",
        str(log_path),
    ]
    gossis_model_swap_analysis.main(swap_args)

    score_args = [
        "gossis",
        "--data-root",
        str(tmp_path),
        "--output-dir",
        str(score_output),
        "--experiment-log-path",
        str(log_path),
    ]
    score_design_ablation.main(score_args)

    logged = _read_log(log_path)
    assert "GOSSIS model-swap portability run" in logged
    assert "GOSSIS score-design ablation" in logged