Wave 20: close F4 s3:// gap — live-S3 DiLoCo allreduce smoke (AWS_SMOKE-gated)

ObjectStoreAllReduce's S3 branches (_init_fsspec/_put/_exists/_get over
s3fs) previously had only mock coverage; F4 flagged "s3:// never
exercised against real S3" as the highest-value unproven gap.

Adds test_s3_rendezvous_allreduce_across_replicas to test_serverless_local.py
(torchft-free, so it runs on Apple Silicon CI too): two OS processes call
ObjectStoreAllReduce.allreduce() over an s3:// rendezvous, and every rank
must end with the cross-rank mean. Reuses the existing spawn-importable
_replica_compute_and_sync. Gated on AWS_SMOKE=1; skips cleanly otherwise.

Verified live 2026-06-09 against
s3://amazon-sagemaker-386931836011-us-west-2-7597bf4d9a3d/diloco-rdv/:
both ranks converged to identical weights (max|diff|=0.00e+00), and both
round_000000/rank_000{0,1}.pt objects were present in S3 — demonstrating
that the PUT->poll->GET->mean path and S3 strong read-after-write
consistency hold cross-process. Full file: 10 passed, 1 skipped.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (1) hide show

composer_replication/diloco/serverless/tests/test_serverless_local.py +99 -0

composer_replication/diloco/serverless/tests/test_serverless_local.py CHANGED Viewed

@@ -182,6 +182,105 @@ def test_local_executor_handles_multiple_rounds():
             assert all(abs(v - 100.0) < 1e-4 for v in r["result"]["avg2"])
 def _replica_that_raises(rendezvous_uri: str, world_size: int) -> dict:
     """Simulates a replica that crashes mid-run."""
     rank = int(os.environ["REPLICA_RANK"])

             assert all(abs(v - 100.0) < 1e-4 for v in r["result"]["avg2"])
+# ---------------------------------------------------------------------
+# Live-S3 smoke (F4 step 1): the file:// → s3:// transport gap.
+#
+# ObjectStoreAllReduce's S3 branches (_init_fsspec/_put/_exists/_get over
+# s3fs) only have mock coverage; this exercises them against REAL S3 with
+# concurrent OS processes, relying on S3's strong read-after-write
+# consistency (the poll loop's _exists()→_get() assumption). Gated on
+# AWS_SMOKE=1 so it never runs in ordinary CI / on machines without creds.
+#
+# Run it with:
+#   AWS_SMOKE=1 AWS_REGION=us-west-2 \
+#   DILOCO_S3_RENDEZVOUS=s3://<sagemaker-bucket>/diloco-rdv \
+#   pytest composer_replication/diloco/serverless/tests/test_serverless_local.py \
+#          -k s3_rendezvous -s
+#
+# Use a sagemaker-named bucket: stock AmazonSageMakerFullAccess only grants
+# S3 on buckets whose name contains "sagemaker"/"aws-glue" — a custom-named
+# bucket would 403 the first PUT and hang every peer until timeout_s (F4 §3).
+# Verified PASS 2026-06-09 against
+# s3://amazon-sagemaker-386931836011-us-west-2-7597bf4d9a3d/diloco-rdv/.
+# ---------------------------------------------------------------------
+def _s3_smoke_enabled() -> bool:
+    """Report whether the live-S3 smoke test has been opted into.
+
+    Returns True only when the ``AWS_SMOKE`` environment variable is exactly
+    the string ``"1"``; an unset variable or any other value yields False.
+    """
+    flag = os.getenv("AWS_SMOKE")
+    return flag == "1"
+@pytest.mark.skipif(
+    not _s3_smoke_enabled(),
+    reason="live-S3 smoke; set AWS_SMOKE=1 (+ AWS creds, DILOCO_S3_RENDEZVOUS) to run",
+)
+@pytest.mark.parametrize("n_replicas", [2])
+def test_s3_rendezvous_allreduce_across_replicas(n_replicas):
+    """Real-S3 analogue of test_local_executor_runs_allreduce_across_replicas.
+
+    Same property (N processes call allreduce, every rank ends with the
+    cross-rank mean) but over an ``s3://`` rendezvous instead of a tmp dir,
+    so it actually drives s3fs PUT/poll/GET and depends on S3 strong
+    read-after-write consistency. This is the cheapest (≈$0, no GPU) closure
+    of F4's documented "ObjectStoreAllReduce over s3:// never exercised
+    against real S3" gap.
+
+    The rendezvous base comes from ``DILOCO_S3_RENDEZVOUS`` (falling back to
+    the bucket this smoke was verified against); each run works under its
+    own ``smoke-<8 hex chars>/`` prefix, which is removed afterwards on a
+    best-effort basis. Skipped unless ``AWS_SMOKE=1`` and s3fs is importable.
+    """
+    import logging
+    import uuid
+
+    # importorskip returns the imported module, so no second import is needed.
+    s3fs = pytest.importorskip("s3fs", reason="s3fs required for the live-S3 smoke")
+    base = os.environ.get(
+        "DILOCO_S3_RENDEZVOUS",
+        "s3://amazon-sagemaker-386931836011-us-west-2-7597bf4d9a3d/diloco-rdv",
+    ).rstrip("/")
+    # Unique per-run prefix so repeated or concurrent smokes never collide.
+    rendezvous = f"{base}/smoke-{uuid.uuid4().hex[:8]}/"
+    # Scheme-less bucket/key form used for the direct s3fs calls below.
+    s3_prefix = rendezvous.replace("s3://", "")
+    rank_value = 10.0
+    timeout_s = 300
+    executor = LocalProcessExecutor()
+    handles = executor.launch_replicas(
+        n_replicas=n_replicas,
+        entrypoint=f"{__name__}._replica_compute_and_sync",
+        entrypoint_args={
+            "rendezvous_uri": rendezvous,
+            "world_size": n_replicas,
+            "rank_value": rank_value,
+            "rank_env": "REPLICA_RANK",
+        },
+        timeout=timeout_s,
+    )
+    try:
+        results = executor.collect(handles, timeout=timeout_s)
+        for r in results:
+            assert r["status"] == "succeeded", (
+                f"rank {r['rank']} failed (S3 rendezvous {rendezvous}): "
+                f"{r.get('error')}"
+            )
+        # Every rank must agree on the mean — only possible if each read the
+        # SAME peer objects through S3 (proves the cross-process exchange).
+        # NOTE(review): the formula presumes rank r contributes
+        # rank_value * (r + 1); confirm against _replica_compute_and_sync.
+        N = n_replicas
+        expected_mean = rank_value * (N * (N + 1) / 2) / N
+        for r in results:
+            for v in r["result"]["post"]:
+                assert abs(v - expected_mean) < 1e-4, (
+                    f"rank {r['rank']}: expected S3-averaged mean {expected_mean}, "
+                    f"got {v}"
+                )
+        # Every rank's pseudo-gradient object must be present in S3.
+        fs = s3fs.S3FileSystem()
+        listing = fs.ls(s3_prefix + "round_000000/")
+        got = {os.path.basename(p) for p in listing}
+        expected = {f"rank_{r:04d}.pt" for r in range(n_replicas)}
+        assert expected <= got, f"missing rank objects in S3: {expected - got}"
+    finally:
+        # Best-effort cleanup so repeated smokes don't accrete prefixes. A
+        # cleanup failure must never mask the test outcome, but it is logged
+        # so a leaked prefix can be found and removed by hand.
+        try:
+            s3fs.S3FileSystem().rm(s3_prefix, recursive=True)
+        except Exception:
+            logging.getLogger(__name__).warning(
+                "could not remove S3 smoke prefix %s", rendezvous, exc_info=True
+            )
 def _replica_that_raises(rendezvous_uri: str, world_size: int) -> dict:
     """Simulates a replica that crashes mid-run."""
     rank = int(os.environ["REPLICA_RANK"])