fix: ablation.py — wire all disable flags, add missing experiments (dinov2, mse/cosine loss, no_sigreg, vicreg_only), add run()/load_results() methods, generate CLI commands

Browse files

Files changed (1) hide show

mr_jepa/utils/ablation.py +317 -83

mr_jepa/utils/ablation.py CHANGED Viewed

@@ -3,22 +3,32 @@ Ablation Study Runner for MR-JEPA.
 Supports systematic ablation experiments to validate the paper's contributions:
-1. Full MR-JEPA vs. No JEPA (remove JEPA loss, train with task loss only)
-2. Full MR-JEPA vs. No Rollout (use z₀ directly, K=0)
-3. Full MR-JEPA vs. No Evidence Gate (remove gating, always use full evidence)
-4. K=1 vs. K=3 vs. K=5 (rollout depth ablation)
-5. With vs. Without enriched evidence (Phase 3 ablation)
-6. Hybrid vs. Purist branch comparison
 """
 import copy
 import json
 import logging
 from typing import Dict, List, Any, Optional
 from dataclasses import dataclass, field
 from pathlib import Path
-from ..configs.model_config import MRJEPAConfig, get_hybrid_config, get_purist_config
 logger = logging.getLogger(__name__)
@@ -28,155 +38,379 @@ class AblationConfig:
     """Configuration for a single ablation experiment."""
     name: str
     description: str
-    modifications: Dict[str, Any] = field(default_factory=dict)
-    # What to change from the base config
     disable_jepa: bool = False
     disable_rollout: bool = False
     disable_evidence_gate: bool = False
     override_K: Optional[int] = None
-# Predefined ablation experiments
 ABLATION_EXPERIMENTS = {
-    "full_model": AblationConfig(
-        name="full_model",
-        description="Complete MR-JEPA (baseline)",
     ),
     "no_jepa": AblationConfig(
         name="no_jepa",
-        description="Without JEPA objective (task loss only)",
         disable_jepa=True,
     ),
     "no_rollout": AblationConfig(
         name="no_rollout",
-        description="Without latent rollout (z₀ only, K=0)",
         disable_rollout=True,
     ),
-    "no_evidence_gate": AblationConfig(
-        name="no_evidence_gate",
-        description="Without evidence gating",
         disable_evidence_gate=True,
     ),
     "K1": AblationConfig(
         name="K1",
-        description="Rollout depth K=1",
         override_K=1,
     ),
     "K3": AblationConfig(
         name="K3",
-        description="Rollout depth K=3 (default)",
         override_K=3,
     ),
     "K5": AblationConfig(
         name="K5",
-        description="Rollout depth K=5",
         override_K=5,
     ),
     "K7": AblationConfig(
         name="K7",
-        description="Rollout depth K=7 (deep rollout)",
         override_K=7,
     ),
 }
 class AblationRunner:
     """
     Systematically run ablation experiments.
     Usage:
-        runner = AblationRunner(base_config, experiments=['full_model', 'no_jepa', 'no_rollout'])
-        results = runner.run(train_data, eval_data)
-        runner.report()
     """
     def __init__(
         self,
-        base_config: Optional[MRJEPAConfig] = None,
         experiments: Optional[List[str]] = None,
-        output_dir: str = "./ablations",
     ):
-        self.base_config = base_config or get_hybrid_config()
-        self.experiments = experiments or list(ABLATION_EXPERIMENTS.keys())
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
-        self.results = {}
     def _apply_ablation(self, config: MRJEPAConfig, ablation: AblationConfig) -> MRJEPAConfig:
         """Apply ablation modifications to a config."""
         modified = copy.deepcopy(config)
         if ablation.override_K is not None:
             modified.rollout.K = ablation.override_K
         return modified
     def generate_configs(self) -> Dict[str, MRJEPAConfig]:
-        """Generate configs for all ablation experiments."""
         configs = {}
         for exp_name in self.experiments:
             if exp_name not in ABLATION_EXPERIMENTS:
-                logger.warning(f"Unknown ablation: {exp_name}")
                 continue
             ablation = ABLATION_EXPERIMENTS[exp_name]
-            config = self._apply_ablation(self.base_config, ablation)
-            configs[exp_name] = config
         return configs
     def report(self) -> str:
         """Generate a formatted ablation report."""
         if not self.results:
-            return "No results yet."
         lines = [
-            "=" * 80,
             "MR-JEPA Ablation Study Results",
-            "=" * 80,
             "",
         ]
-        # Header
-        benchmarks = set()
         for exp_results in self.results.values():
-            benchmarks.update(exp_results.keys())
-        benchmarks = sorted(benchmarks)
-        header = f"{'Experiment':<25}"
-        for b in benchmarks:
-            header += f" | {b:<12}"
         lines.append(header)
         lines.append("-" * len(header))
-        # Results rows
-        for exp_name, exp_results in self.results.items():
             ablation = ABLATION_EXPERIMENTS.get(exp_name)
-            row = f"{exp_name:<25}"
-            for b in benchmarks:
-                if b in exp_results:
-                    val = exp_results[b].get('accuracy',
-                          exp_results[b].get('anls',
-                          exp_results[b].get('vqa_accuracy',
-                          exp_results[b].get('relaxed_accuracy', 0))))
-                    row += f" | {val:>10.1f}%"
                 else:
                     row += f" | {'N/A':>10}"
             lines.append(row)
         lines.append("")
-        lines.append("Key findings:")
-        # Auto-detect key findings
-        if 'full_model' in self.results and 'no_jepa' in self.results:
-            lines.append("- JEPA vs No-JEPA: Compare 'full_model' and 'no_jepa' rows")
-        if 'full_model' in self.results and 'no_rollout' in self.results:
-            lines.append("- Rollout vs No-Rollout: Compare 'full_model' and 'no_rollout' rows")
-        report = "\n".join(lines)
-        # Save to file
-        with open(self.output_dir / "ablation_report.txt", "w") as f:
-            f.write(report)
-        return report

 Supports systematic ablation experiments to validate the paper's contributions:
+1.  hybrid_main     — Full MR-JEPA baseline (DINOv3-L, K=3, SmoothL1+VICReg)
+2.  no_jepa         — Remove JEPA loss, train with task loss only
+3.  no_rollout      — Use z₀ directly (K=0), keep task loss only
+4.  no_gate         — Remove evidence gating, always use full evidence
+5.  K1 / K3 / K5 / K7 — Rollout depth ablation (K3 is the same depth as hybrid_main)
+6.  dinov2_ablation  — DINOv2-L/14 backbone instead of DINOv3-L/16
+7.  purist           — DINOv3-B, K=5, Cosine+SIGReg, no enriched evidence
+8.  mse_loss         — MSE (L2) JEPA loss instead of SmoothL1
+9.  cosine_loss      — Cosine similarity JEPA loss
+10. no_sigreg        — Disable SIGReg anti-collapse regularization
+11. vicreg_only      — VICReg regularization without SIGReg
+Each AblationConfig maps 1:1 to CLI flags in train_mrjepa.py.
 """
 import copy
 import json
 import logging
+import shlex
+import subprocess
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Dict, List, Any, Optional
+from ..configs.model_config import (
+    MRJEPAConfig, get_hybrid_config, get_purist_config, get_dinov2_ablation_config,
+)
 logger = logging.getLogger(__name__)
     """Configuration for a single ablation experiment."""
     name: str
     description: str
+    # CLI flags that map to train_mrjepa.py arguments
+    cli_flags: Dict[str, Any] = field(default_factory=dict)
+    # Config modifications for the library-based runner
     disable_jepa: bool = False
     disable_rollout: bool = False
     disable_evidence_gate: bool = False
+    disable_sigreg: bool = False
+    enable_vicreg: bool = False
     override_K: Optional[int] = None
+    override_loss_fn: Optional[str] = None
+    override_backbone: Optional[str] = None
+    use_purist: bool = False
+# ──────────────────────────────────────────────────────────
+# Complete ablation experiment registry
+# ──────────────────────────────────────────────────────────
 ABLATION_EXPERIMENTS = {
+    # ── Baseline ──
+    "hybrid_main": AblationConfig(
+        name="hybrid_main",
+        description="Complete MR-JEPA (DINOv3-L, K=3, SmoothL1+VICReg)",
+        cli_flags={"--run_name": "hybrid_main"},
     ),
+    # ── Core contribution ablations ──
     "no_jepa": AblationConfig(
         name="no_jepa",
+        description="Without JEPA objective — task loss only. Tests whether JEPA trajectory supervision adds value.",
+        cli_flags={"--run_name": "no_jepa", "--no_jepa": True},
         disable_jepa=True,
     ),
     "no_rollout": AblationConfig(
         name="no_rollout",
+        description="Without latent rollout — z₀ directly to answer head (K=0). Tests whether iterative refinement adds value.",
+        cli_flags={"--run_name": "no_rollout", "--no_rollout": True},
         disable_rollout=True,
+        # NOTE: no_rollout also disables JEPA (can't supervise a trajectory that doesn't exist)
+        disable_jepa=True,
     ),
+    "no_gate": AblationConfig(
+        name="no_gate",
+        description="Without evidence gating — full evidence at every step. Tests whether adaptive evidence flow matters.",
+        cli_flags={"--run_name": "no_gate", "--no_evidence_gate": True},
         disable_evidence_gate=True,
     ),
+    # ── Rollout depth ablations ──
     "K1": AblationConfig(
         name="K1",
+        description="Rollout depth K=1 (shallow reasoning)",
+        cli_flags={"--run_name": "K1", "--K": 1},
         override_K=1,
     ),
     "K3": AblationConfig(
         name="K3",
+        description="Rollout depth K=3 (default, same as hybrid_main)",
+        cli_flags={"--run_name": "K3", "--K": 3},
         override_K=3,
     ),
     "K5": AblationConfig(
         name="K5",
+        description="Rollout depth K=5 (deeper reasoning)",
+        cli_flags={"--run_name": "K5", "--K": 5},
         override_K=5,
     ),
     "K7": AblationConfig(
         name="K7",
+        description="Rollout depth K=7 (deep rollout — diminishing returns expected)",
+        cli_flags={"--run_name": "K7", "--K": 7},
         override_K=7,
     ),
+    # ── Backbone ablation ──
+    "dinov2_ablation": AblationConfig(
+        name="dinov2_ablation",
+        description="DINOv2-L/14 backbone instead of DINOv3-L/16. Isolates DINOv3 contribution.",
+        cli_flags={"--run_name": "dinov2_ablation", "--backbone": "dinov2"},
+        override_backbone="dinov2",
+    ),
+    # ── Loss function ablations ──
+    "mse_loss": AblationConfig(
+        name="mse_loss",
+        description="MSE (L2) JEPA loss instead of SmoothL1. Original I-JEPA loss.",
+        cli_flags={"--run_name": "mse_loss", "--loss_fn": "mse"},
+        override_loss_fn="mse",
+    ),
+    "cosine_loss": AblationConfig(
+        name="cosine_loss",
+        description="Cosine similarity JEPA loss. Used in purist branch.",
+        cli_flags={"--run_name": "cosine_loss", "--loss_fn": "cosine"},
+        override_loss_fn="cosine",
+    ),
+    # ── Regularization ablations ──
+    "no_sigreg": AblationConfig(
+        name="no_sigreg",
+        description="Disable SIGReg anti-collapse. Expect training instability / collapse.",
+        cli_flags={"--run_name": "no_sigreg", "--no_sigreg": True},
+        disable_sigreg=True,
+    ),
+    "vicreg_only": AblationConfig(
+        name="vicreg_only",
+        description="VICReg regularization only (no SIGReg). Alternative anti-collapse.",
+        cli_flags={"--run_name": "vicreg_only",  "--no_sigreg": True, "--use_vicreg": True},
+        disable_sigreg=True,
+        enable_vicreg=True,
+    ),
+    # ── Branch comparison ──
+    "purist": AblationConfig(
+        name="purist",
+        description="Purist branch: DINOv3-B, K=5, Cosine+SIGReg, no enriched evidence. Isolates JEPA reasoning from perception quality.",
+        cli_flags={"--run_name": "purist", "--purist": True},
+        use_purist=True,
+    ),
 }
 class AblationRunner:
     """
     Systematically run ablation experiments.
+    Two modes:
+    1. CLI mode: generates shell commands for train_mrjepa.py (for HF Jobs)
+    2. Config mode: generates MRJEPAConfig objects (for library-based runner)
     Usage:
+        runner = AblationRunner(experiments=['hybrid_main', 'no_jepa', 'no_rollout'])
+        # Mode 1: Generate CLI commands
+        commands = runner.generate_commands()
+        for name, cmd in commands.items():
+            print(f"{name}: {cmd}")
+        # Mode 2: Generate configs for programmatic use
+        configs = runner.generate_configs()
+        # After running, load results and report
+        runner.load_results("./outputs/mrjepa")
+        print(runner.report())
     """
     def __init__(
         self,
         experiments: Optional[List[str]] = None,
+        output_dir: str = "./outputs/mrjepa",
+        script_path: str = "train_mrjepa.py",
+        common_flags: Optional[Dict[str, Any]] = None,
     ):
+        self.experiments = experiments or [
+            "hybrid_main", "no_jepa", "no_rollout", "no_gate",
+            "K1", "K5", "K7",
+            "dinov2_ablation", "mse_loss", "cosine_loss",
+            "no_sigreg", "purist",
+        ]
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.script_path = script_path
+        self.common_flags = common_flags or {}
+        self.results: Dict[str, Dict[str, Any]] = {}
     def _apply_ablation(self, config: MRJEPAConfig, ablation: AblationConfig) -> MRJEPAConfig:
         """Apply ablation modifications to a config."""
         modified = copy.deepcopy(config)
+        if ablation.use_purist:
+            return get_purist_config()
+        if ablation.override_backbone == "dinov2":
+            return get_dinov2_ablation_config()
         if ablation.override_K is not None:
             modified.rollout.K = ablation.override_K
+        if ablation.disable_jepa:
+            modified.jepa.use_jepa = False
+        if ablation.disable_rollout:
+            modified.rollout.K = 0
+            modified.jepa.use_jepa = False  # No trajectory to supervise
+        if ablation.disable_evidence_gate:
+            modified.rollout.use_evidence_gate = False
+            modified.rollout.gate_type = "none"
+        if ablation.disable_sigreg:
+            modified.jepa.use_sigreg = False
+            modified.jepa.sigreg_weight = 0.0
+        if ablation.enable_vicreg:
+            modified.jepa.use_vicreg = True
+        if ablation.override_loss_fn is not None:
+            modified.jepa.jepa_loss_fn = ablation.override_loss_fn
         return modified
     def generate_configs(self) -> Dict[str, MRJEPAConfig]:
+        """Generate MRJEPAConfig objects for all ablation experiments."""
+        base_config = get_hybrid_config()
         configs = {}
         for exp_name in self.experiments:
             if exp_name not in ABLATION_EXPERIMENTS:
+                logger.warning(f"Unknown ablation: {exp_name}, skipping")
                 continue
             ablation = ABLATION_EXPERIMENTS[exp_name]
+            configs[exp_name] = self._apply_ablation(base_config, ablation)
         return configs
+    def generate_commands(self) -> Dict[str, str]:
+        """Generate CLI commands for train_mrjepa.py for each ablation."""
+        commands = {}
+        for exp_name in self.experiments:
+            if exp_name not in ABLATION_EXPERIMENTS:
+                logger.warning(f"Unknown ablation: {exp_name}, skipping")
+                continue
+            ablation = ABLATION_EXPERIMENTS[exp_name]
+            parts = ["python", self.script_path]
+            # Merge common flags + experiment-specific flags
+            all_flags = {**self.common_flags, **ablation.cli_flags}
+            for flag, value in all_flags.items():
+                if isinstance(value, bool):
+                    if value:
+                        parts.append(flag)
+                else:
+                    parts.append(flag)
+                    parts.append(str(value))
+            commands[exp_name] = " ".join(parts)
+        return commands
+    def run(
+        self,
+        mode: str = "cli",
+        dry_run: bool = False,
+    ) -> Dict[str, Any]:
+        """
+        Run all ablation experiments.
+        Args:
+            mode: "cli" to run via subprocess, "config" for programmatic (not yet implemented)
+            dry_run: If True, print commands but don't execute
+        Returns:
+            Dict mapping experiment name to run status/result
+        """
+        if mode == "cli":
+            commands = self.generate_commands()
+            results = {}
+            for exp_name, cmd in commands.items():
+                logger.info(f"{'[DRY RUN] ' if dry_run else ''}Running ablation: {exp_name}")
+                logger.info(f"  Command: {cmd}")
+                if dry_run:
+                    results[exp_name] = {"status": "dry_run", "command": cmd}
+                    continue
+                try:
+                    proc = subprocess.run(
+                        cmd, shell=True, capture_output=True, text=True, timeout=7200,
+                    )
+                    results[exp_name] = {
+                        "status": "success" if proc.returncode == 0 else "failed",
+                        "returncode": proc.returncode,
+                        "stdout_tail": proc.stdout[-2000:] if proc.stdout else "",
+                        "stderr_tail": proc.stderr[-2000:] if proc.stderr else "",
+                    }
+                    if proc.returncode != 0:
+                        logger.error(f"  FAILED (rc={proc.returncode}): {proc.stderr[-500:]}")
+                    else:
+                        logger.info(f"  SUCCESS")
+                except subprocess.TimeoutExpired:
+                    results[exp_name] = {"status": "timeout"}
+                    logger.error(f"  TIMEOUT")
+                except Exception as e:
+                    results[exp_name] = {"status": "error", "error": str(e)}
+                    logger.error(f"  ERROR: {e}")
+            return results
+        else:
+            raise NotImplementedError(f"Mode '{mode}' not implemented. Use 'cli'.")
+    def load_results(self, results_dir: Optional[str] = None):
+        """Load results JSON files from a directory."""
+        rdir = Path(results_dir) if results_dir else self.output_dir
+        for exp_name in self.experiments:
+            result_file = rdir / f"results_{exp_name}.json"
+            if result_file.exists():
+                with open(result_file) as f:
+                    self.results[exp_name] = json.load(f)
+                logger.info(f"Loaded results for {exp_name}")
+            else:
+                logger.warning(f"No results file for {exp_name} at {result_file}")
     def report(self) -> str:
         """Generate a formatted ablation report."""
         if not self.results:
+            return "No results loaded. Call load_results() first."
         lines = [
+            "=" * 90,
             "MR-JEPA Ablation Study Results",
+            "=" * 90,
             "",
         ]
+        # Collect all metric keys across experiments
+        metric_keys = set()
         for exp_results in self.results.values():
+            metric_keys.update(k for k in exp_results.keys() if k.startswith("best_") or k.endswith("_accuracy"))
+        metric_keys = sorted(metric_keys)
+        if not metric_keys:
+            metric_keys = ["best_eval_accuracy"]
+        # Header
+        header = f"{'Experiment':<22} | {'K':>2} | {'JEPA':>4} | {'Gate':>4} | {'Loss':>9}"
+        for mk in metric_keys:
+            short = mk.replace("best_eval_", "").replace("best_", "").replace("_accuracy", "_acc")[:12]
+            header += f" | {short:>10}"
         lines.append(header)
         lines.append("-" * len(header))
+        # Rows
+        for exp_name in self.experiments:
+            if exp_name not in self.results:
+                continue
+            r = self.results[exp_name]
             ablation = ABLATION_EXPERIMENTS.get(exp_name)
+            row = f"{exp_name:<22}"
+            row += f" | {r.get('K', '?'):>2}"
+            row += f" | {'Y' if r.get('use_jepa', True) else 'N':>4}"
+            row += f" | {'Y' if r.get('use_evidence_gate', True) else 'N':>4}"
+            row += f" | {r.get('loss_fn', 'smooth_l1'):>9}"
+            for mk in metric_keys:
+                val = r.get(mk)
+                if val is not None:
+                    row += f" | {val:>9.1f}%"
                 else:
                     row += f" | {'N/A':>10}"
             lines.append(row)
+        lines.append("")
+        lines.append("=" * 90)
         lines.append("")
+        # Auto-generate key findings
+        lines.append("Key comparisons:")
+        if "hybrid_main" in self.results:
+            base_acc = self.results["hybrid_main"].get("best_eval_accuracy", 0)
+            for exp_name in ["no_jepa", "no_rollout", "no_gate"]:
+                if exp_name in self.results:
+                    exp_acc = self.results[exp_name].get("best_eval_accuracy", 0)
+                    delta = exp_acc - base_acc
+                    lines.append(
+                        f"  {exp_name:>15} vs hybrid_main: {delta:+.1f}% "
+                        f"({'JEPA helps' if delta < 0 else 'no benefit'})"
+                    )
+        report_text = "\n".join(lines)
+        # Save
+        report_path = self.output_dir / "ablation_report.txt"
+        with open(report_path, "w") as f:
+            f.write(report_text)
+        logger.info(f"Ablation report saved to {report_path}")
+        return report_text