"""
Verify Aeon test results against expected ground truth.

This script reads the test results and compares them against the ground truth
values in test_samples.json to validate the Aeon model predictions.

Usage:
    python verify_aeon_results.py \
        --test-samples test_slides/test_samples.json \
        --results-dir test_slides/results
"""

import argparse
import json
import sys
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd


def load_test_samples(test_samples_file: Path) -> List[Dict]:
    """Load test samples from JSON file.

    Args:
        test_samples_file: Path to test_samples.json

    Returns:
        List of test sample dictionaries
    """
    with open(test_samples_file) as f:
        return json.load(f)


def load_aeon_results(slide_id: str, results_dir: Path) -> Tuple[str, float]:
    """Load Aeon prediction results for a slide.

    Args:
        slide_id: Slide identifier
        results_dir: Directory containing results

    Returns:
        Tuple of (predicted_subtype, confidence)
    """
    results_file = results_dir / slide_id / f"{slide_id}_aeon_results.csv"

    if not results_file.exists():
        raise FileNotFoundError(f"Results file not found: {results_file}")

    df = pd.read_csv(results_file)

    if df.empty:
        raise ValueError(f"Empty results file: {results_file}")

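    # Assumes the results CSV is sorted by confidence in descending order,
    # so the first row holds the model's top prediction.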
    top_prediction = df.iloc[0]
    return str(top_prediction["Cancer Subtype"]), float(top_prediction["Confidence"])


def verify_results(test_samples: List[Dict], results_dir: Path) -> Dict:
    """Verify all test results against ground truth.

    Args:
        test_samples: List of test sample dictionaries
        results_dir: Directory containing results

    Returns:
        Dictionary with verification statistics
    """
    total = len(test_samples)
    passed = 0
    failed = 0
    results = []

    print("=" * 80)
    print("Aeon Model Verification Report")
    print("=" * 80)
    print()

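    # Sample manifests may name fields either slide_id/cancer_subtype or
    # image_id/cancer_type; accept both spellings.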
    for sample in test_samples:
        slide_id = sample.get("slide_id") or sample.get("image_id")
        ground_truth = sample.get("cancer_subtype") or sample.get("cancer_type")
        site_type = sample.get("site_type")
        sex = sample.get("sex")
        tissue_site = sample.get("tissue_site")

        print(f"Slide: {slide_id}")
        print(f"  Ground Truth: {ground_truth}")
        print(f"  Site Type: {site_type}")
        print(f"  Sex: {sex}")
        print(f"  Tissue Site: {tissue_site}")

        try:
            predicted, confidence = load_aeon_results(slide_id, results_dir)

            print(f"  Predicted: {predicted}")
            print(f"  Confidence: {confidence:.4f} ({confidence * 100:.2f}%)")

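            # A slide passes only on an exact string match between the
            # predicted and ground-truth subtype labels.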
            if predicted == ground_truth:
                print("  Status: ✓ PASS")
                passed += 1
                status = "PASS"
            else:
                print(f"  Status: ✗ FAIL (expected {ground_truth}, got {predicted})")
                failed += 1
                status = "FAIL"

            results.append(
                {
                    "slide_id": slide_id,
                    "ground_truth": ground_truth,
                    "predicted": predicted,
                    "confidence": confidence,
                    "site_type": site_type,
                    "sex": sex,
                    "tissue_site": tissue_site,
                    "status": status,
                }
            )

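        # Missing or unreadable results are recorded as errors and counted
        # toward the failure total rather than aborting the run.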
        except Exception as e:
            print(f"  Status: ✗ ERROR - {e}")
            failed += 1
            results.append(
                {
                    "slide_id": slide_id,
                    "ground_truth": ground_truth,
                    "predicted": None,
                    "confidence": None,
                    "site_type": site_type,
                    "sex": sex,
                    "tissue_site": tissue_site,
                    "status": "ERROR",
                    "error": str(e),
                }
            )

        print()

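    # Aggregate summary across all slides.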
    print("=" * 80)
    print("Summary")
    print("=" * 80)
    # Guard against division by zero on an empty manifest.
    pct = 100 / total if total else 0.0
    print(f"Total slides: {total}")
    print(f"Passed: {passed} ({passed * pct:.1f}%)")
    print(f"Failed: {failed} ({failed * pct:.1f}%)")
    print()

    if passed == total:
        print("✓ All tests passed!")
    else:
        print(f"✗ {failed} test(s) failed")

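    # Summarize confidence over the correctly classified slides only.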
    if passed > 0:
        confidences = [r["confidence"] for r in results if r["status"] == "PASS"]
        avg_confidence = sum(confidences) / len(confidences)
        min_confidence = min(confidences)
        max_confidence = max(confidences)

        print()
        print("Confidence Statistics (for passed tests):")
        print(f"  Average: {avg_confidence:.4f} ({avg_confidence * 100:.2f}%)")
        print(f"  Minimum: {min_confidence:.4f} ({min_confidence * 100:.2f}%)")
        print(f"  Maximum: {max_confidence:.4f} ({max_confidence * 100:.2f}%)")

    return {
        "total": total,
        "passed": passed,
        "failed": failed,
        "accuracy": passed / total if total > 0 else 0,
        "results": results,
    }


def main():
    parser = argparse.ArgumentParser(
        description="Verify Aeon test results against ground truth"
    )
    parser.add_argument(
        "--test-samples",
        type=Path,
        default=Path("test_slides/test_samples.json"),
        help="Path to test_samples.json (default: test_slides/test_samples.json)",
    )
    parser.add_argument(
        "--results-dir",
        type=Path,
        default=Path("test_slides/results"),
        help="Directory containing results (default: test_slides/results)",
    )
    parser.add_argument(
        "--output", type=Path, help="Optional path to save verification report as JSON"
    )

    args = parser.parse_args()

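    # Fail fast with a clear error if the inputs are missing.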
    if not args.test_samples.exists():
        raise FileNotFoundError(f"Test samples file not found: {args.test_samples}")

    if not args.results_dir.exists():
        raise FileNotFoundError(f"Results directory not found: {args.results_dir}")

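    # Load the ground-truth test manifest.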
    test_samples = load_test_samples(args.test_samples)

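    # Run verification and collect per-slide results.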
    verification_report = verify_results(test_samples, args.results_dir)

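    # Optionally persist the full report as JSON.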
    if args.output:
        with open(args.output, "w") as f:
            json.dump(verification_report, f, indent=2)
        print()
        print(f"Verification report saved to: {args.output}")

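    # A nonzero exit status signals failure to callers (e.g. CI jobs).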
    if verification_report["failed"] > 0:
        sys.exit(1)
    else:
        sys.exit(0)


if __name__ == "__main__":
    main()