Add files using upload-large-folder tool

a402b9b verified 29 days ago

161 kB

	import subprocess
	import sys
	import textwrap
	from argparse import Namespace
	from pathlib import Path

	import pytest
	import torch

	import sglang.srt.debug_utils.comparator.entrypoint as _entrypoint_module
	import sglang.srt.debug_utils.dumper as _dumper_module
	from sglang.srt.debug_utils.comparator.entrypoint import (
	parse_args,
	run,
	)
	from sglang.srt.debug_utils.comparator.output_types import (
	AnyRecord,
	ComparisonErrorRecord,
	ComparisonNonTensorRecord,
	ComparisonSkipRecord,
	ComparisonTensorRecord,
	ConfigRecord,
	InfoLog,
	LogRecord,
	ReplicatedCheckResult,
	SummaryRecord,
	_OutputRecord,
	parse_record_json,
	)
	from sglang.srt.debug_utils.dumper import DumperConfig, _Dumper, _RecomputeStatus
	from sglang.test.ci.ci_register import register_cpu_ci

	register_cpu_ci(est_time=30, suite="default", nightly=True)

	_FIXED_EXP_NAME = "my_exp_name"

	# Each test has a one-line docstring describing the scenario it covers.


	class TestEntrypointGroupingRaw:
	"""Test `--grouping-skip-keys` empty (raw) scenarios"""

	def test_run_basic(self, tmp_path, capsys):
	"""Two matching tensors produce ConfigRecord, 2 ComparisonTensorRecords, and SummaryRecord."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a", "tensor_b"])
	argv = _make_argv(baseline_path, target_path, preset="raw")

	records, _ = _run_and_parse(argv, capsys)
	assert isinstance(records[0], ConfigRecord)

	assert len(_get_comparisons(records)) == 2

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 2
	assert summary.skipped == 0

	def test_filter(self, tmp_path, capsys):
	"""--filter selects only the matching tensor, producing 1 ComparisonTensorRecord."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a", "tensor_b"])
	argv = _make_argv(baseline_path, target_path, filter="tensor_a", preset="raw")

	records, _ = _run_and_parse(argv, capsys)
	assert len(_get_comparisons(records)) == 1

	def test_no_baseline_skip(self, tmp_path, capsys):
	"""Target tensor missing from baseline emits a ComparisonSkipRecord with reason baseline_load_failed."""
	baseline_path, target_path = _create_dumps(
	tmp_path,
	tensor_names=["tensor_a", "tensor_extra"],
	baseline_names=["tensor_a"],
	)
	argv = _make_argv(baseline_path, target_path, preset="raw")

	records, _ = _run_and_parse(argv, capsys)
	skips = [r for r in records if isinstance(r, ComparisonSkipRecord)]
	assert len(skips) == 1
	assert skips[0].reason == "baseline_load_failed"

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.skipped == 1

	def test_step_range(self, tmp_path, capsys):
	"""--start_step/--end_step restricts comparison to a single step out of three."""
	baseline_path, target_path = _create_dumps(tmp_path, ["t"], num_steps=3)
	argv = _make_argv(
	baseline_path, target_path, start_step=1, end_step=1, preset="raw"
	)

	records, _ = _run_and_parse(argv, capsys)
	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 1

	def test_all_valid_records(self, tmp_path, capsys):
	"""Every emitted JSON record is a valid _OutputRecord subclass."""
	baseline_path, target_path = _create_dumps(tmp_path, ["t"], num_steps=2)
	argv = _make_argv(baseline_path, target_path, preset="raw")

	records, _ = _run_and_parse(argv, capsys)
	assert all(isinstance(r, _OutputRecord) for r in records)

	def test_comparison_failed(self, tmp_path, capsys):
	"""Completely different tensors produce a failed ComparisonTensorRecord."""
	torch.manual_seed(42)
	baseline_path = _create_rank_dump(
	tmp_path / "baseline", rank=0, name="tensor_a", tensor=torch.randn(10, 10)
	)
	target_path = _create_rank_dump(
	tmp_path / "target",
	rank=0,
	name="tensor_a",
	tensor=torch.randn(10, 10) * 100,
	)
	argv = _make_argv(baseline_path, target_path, preset="raw", diff_threshold=1e-3)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].diff is not None
	assert not comparisons[0].diff.passed
	assert comparisons[0].category == "failed"

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.failed == 1

	def test_shape_mismatch(self, tmp_path, capsys):
	"""Different shapes produce shape_mismatch=True and category='failed'."""
	torch.manual_seed(42)
	baseline_path = _create_rank_dump(
	tmp_path / "baseline", rank=0, name="tensor_a", tensor=torch.randn(4, 8)
	)
	target_path = _create_rank_dump(
	tmp_path / "target", rank=0, name="tensor_a", tensor=torch.randn(4, 10)
	)
	argv = _make_argv(baseline_path, target_path, preset="raw")

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].shape_mismatch is True
	assert comparisons[0].diff is None
	assert comparisons[0].category == "failed"

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.failed == 1

	def test_unify_shape_leading_dims(self, tmp_path, capsys):
	"""Leading singleton dims on baseline are squeezed to match target shape."""
	torch.manual_seed(42)
	base_tensor = torch.randn(4, 8)
	baseline_tensor = base_tensor.unsqueeze(0) # (1, 4, 8)
	target_tensor = base_tensor + torch.randn(4, 8) * 0.0001 # (4, 8)

	baseline_path = _create_rank_dump(
	tmp_path / "baseline", rank=0, name="tensor_a", tensor=baseline_tensor
	)
	target_path = _create_rank_dump(
	tmp_path / "target", rank=0, name="tensor_a", tensor=target_tensor
	)
	argv = _make_argv(baseline_path, target_path, preset="raw")

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1

	comp = comparisons[0]
	assert comp.shape_mismatch is False
	assert comp.baseline.shape == [1, 4, 8]
	assert comp.target.shape == [4, 8]
	assert comp.unified_shape == [4, 8]
	assert comp.diff is not None
	assert comp.diff.passed

	def test_dtype_mismatch_downcast(self, tmp_path, capsys):
	"""Baseline float32 vs target bfloat16 produces diff_downcast."""
	torch.manual_seed(42)
	baseline_tensor = torch.randn(4, 8, dtype=torch.float32)
	target_tensor = (baseline_tensor + torch.randn(4, 8) * 0.0001).to(
	torch.bfloat16
	)

	baseline_path = _create_rank_dump(
	tmp_path / "baseline", rank=0, name="tensor_a", tensor=baseline_tensor
	)
	target_path = _create_rank_dump(
	tmp_path / "target", rank=0, name="tensor_a", tensor=target_tensor
	)
	argv = _make_argv(baseline_path, target_path, preset="raw", diff_threshold=0.01)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].diff_downcast is not None
	assert comparisons[0].downcast_dtype is not None

	def test_mixed_summary(self, tmp_path, capsys):
	"""One passed, one failed, one skipped tensor in a single run."""
	torch.manual_seed(42)
	similar_tensor = torch.randn(4, 4)
	different_baseline = torch.randn(4, 4)
	different_target = torch.randn(4, 4) * 100
	extra_tensor = torch.randn(4, 4)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	_create_rank_dump(baseline_dir, rank=0, name="similar", tensor=similar_tensor)
	_create_rank_dump(
	baseline_dir, rank=0, name="different", tensor=different_baseline
	)

	_create_rank_dump(
	target_dir,
	rank=0,
	name="similar",
	tensor=similar_tensor + torch.randn(4, 4) * 0.0001,
	)
	_create_rank_dump(target_dir, rank=0, name="different", tensor=different_target)
	_create_rank_dump(target_dir, rank=0, name="extra", tensor=extra_tensor)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	preset="raw",
	diff_threshold=1e-3,
	)

	records, _ = _run_and_parse(argv, capsys)
	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.passed == 1
	assert summary.failed == 1
	assert summary.skipped == 1
	assert summary.total == 3

	def test_filter_empty_result(self, tmp_path, capsys):
	"""--filter matching nothing produces summary with total=0."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	argv = _make_argv(
	baseline_path,
	target_path,
	filter="nonexistent_pattern",
	preset="raw",
	)

	records, _ = _run_and_parse(argv, capsys)
	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 0

	def test_raw_multi_rank(self, tmp_path, capsys):
	"""Two ranks in raw grouping produce two ComparisonTensorRecords (one per rank)."""
	torch.manual_seed(42)
	tensor = torch.randn(4, 4)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for rank in range(2):
	_create_rank_dump(baseline_dir, rank=rank, name="hidden", tensor=tensor)
	_create_rank_dump(
	target_dir,
	rank=rank,
	name="hidden",
	tensor=tensor + torch.randn(4, 4) * 0.0001,
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	preset="raw",
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 2

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 2
	assert summary.passed == 2

	def test_text_output_smoke(self, tmp_path, capsys):
	"""Text output format renders without errors and contains Config/Summary sections."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	argv = _make_argv(
	baseline_path, target_path, output_format="text", preset="raw"
	)
	capsys.readouterr()

	run(parse_args(argv))

	output = capsys.readouterr().out
	assert "Comparator Config" in output
	assert "SUMMARY" in output

	def test_text_output_with_failure(self, tmp_path, capsys):
	"""Text output with a failed comparison renders failure info."""
	torch.manual_seed(42)
	baseline_path = _create_rank_dump(
	tmp_path / "baseline", rank=0, name="tensor_a", tensor=torch.randn(10, 10)
	)
	target_path = _create_rank_dump(
	tmp_path / "target",
	rank=0,
	name="tensor_a",
	tensor=torch.randn(10, 10) * 100,
	)
	argv = _make_argv(
	baseline_path, target_path, output_format="text", preset="raw"
	)
	capsys.readouterr()

	run(parse_args(argv))

	output = capsys.readouterr().out
	assert "SUMMARY" in output
	assert "failed" in output.lower()

	def test_duplicate_dump_pairing(self, tmp_path, capsys):
	"""Same name dumped twice (different values) pairs by duplicate_index: 0th↔0th, 1st↔1st."""
	torch.manual_seed(42)
	tensor_v0 = torch.randn(4, 4)
	tensor_v1 = torch.randn(4, 4)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir in [baseline_dir, target_dir]:
	with pytest.MonkeyPatch.context() as mp:
	mp.setattr(_dumper_module, "_get_rank", lambda: 0)
	dumper = _Dumper(
	config=DumperConfig(
	enable=True,
	dir=str(side_dir),
	exp_name=_FIXED_EXP_NAME,
	)
	)
	dumper.__dict__["_static_meta"] = {"world_rank": 0, "world_size": 1}

	dumper.dump("tensor_a", tensor_v0)
	dumper.dump("tensor_a", tensor_v1)
	dumper.step()

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	preset="raw",
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 2
	assert all(c.diff is not None and c.diff.passed for c in comparisons)

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 2
	assert summary.passed == 2


	class TestEntrypointGroupingLogical:
	"""Test `--grouping-skip-keys rank` (logical) scenarios"""

	def test_no_dims_single_rank(self, tmp_path, capsys):
	"""Single-rank dumps without dims fall back to raw loading."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a", "tensor_b"])
	argv = _make_argv(baseline_path, target_path)

	records, _ = _run_and_parse(argv, capsys)
	assert len(_get_comparisons(records)) == 2
	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 2
	assert summary.skipped == 0

	def test_tp_unshard_same_size(self, tmp_path, capsys):
	"""Both sides TP=2: shards are concatenated before comparison."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8)
	full_target = full_baseline + torch.randn(4, 8) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	baseline_path = _create_tp_sharded_dumps(
	baseline_dir,
	full_tensor=full_baseline,
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)
	target_path = _create_tp_sharded_dumps(
	target_dir,
	full_tensor=full_target,
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)

	argv = _make_argv(baseline_path, target_path, diff_threshold=0.01)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "hidden"

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 1
	assert summary.passed == 1

	def test_tp_unshard_different_sizes(self, tmp_path, capsys):
	"""Baseline TP=4 vs target TP=2: different shard counts are handled correctly."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8)
	full_target = full_baseline + torch.randn(4, 8) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	baseline_path = _create_tp_sharded_dumps(
	baseline_dir,
	full_tensor=full_baseline,
	name="hidden",
	tp_size=4,
	shard_dim=1,
	dims_str="b h[tp]",
	)
	target_path = _create_tp_sharded_dumps(
	target_dir,
	full_tensor=full_target,
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)

	argv = _make_argv(baseline_path, target_path, diff_threshold=0.01)

	records, _ = _run_and_parse(argv, capsys)
	_assert_single_comparison_passed(records)

	def test_one_side_dims_single_baseline(self, tmp_path, capsys):
	"""Baseline has no dims (single rank), target has TP shards: unshard target only."""
	torch.manual_seed(42)
	full_tensor = torch.randn(4, 8)
	target_full = full_tensor + torch.randn(4, 8) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	baseline_path = _create_rank_dump(
	baseline_dir, rank=0, name="hidden", tensor=full_tensor
	)

	target_path = _create_tp_sharded_dumps(
	target_dir,
	full_tensor=target_full,
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)

	argv = _make_argv(baseline_path, target_path, diff_threshold=0.01)

	records, _ = _run_and_parse(argv, capsys)
	_assert_single_comparison_passed(records)

	@pytest.mark.parametrize(
	"bad_side, expected_reason",
	[
	("baseline", "baseline_load_failed"),
	("target", "target_load_failed"),
	],
	)
	def test_ambiguous_no_dims_skip(self, tmp_path, capsys, bad_side, expected_reason):
	"""Multi-rank without dims on one side produces a ComparisonSkipRecord with the appropriate reason."""
	torch.manual_seed(42)
	tensor = torch.randn(4, 8)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	good_dir = target_dir if bad_side == "baseline" else baseline_dir
	bad_dir = baseline_dir if bad_side == "baseline" else target_dir

	_create_rank_dump(good_dir, rank=0, name="hidden", tensor=tensor)
	for rank, shard in [(0, tensor[:, :4]), (1, tensor[:, 4:])]:
	_create_rank_dump(bad_dir, rank=rank, name="hidden", tensor=shard)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	)

	records, _ = _run_and_parse(argv, capsys)
	skips = [r for r in records if isinstance(r, ComparisonSkipRecord)]
	assert len(skips) == 1
	assert skips[0].reason == expected_reason

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.skipped == 1

	def test_summary_counts_unshard(self, tmp_path, capsys):
	"""Two TP-sharded tensors: summary counts total=2, passed=2, skipped=0."""
	torch.manual_seed(42)
	full_a = torch.randn(4, 8)
	full_b = torch.randn(4, 8)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for tensor_name, tensor in [("t_a", full_a), ("t_b", full_b)]:
	baseline_path = _create_tp_sharded_dumps(
	baseline_dir,
	full_tensor=tensor,
	name=tensor_name,
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)
	target_tensor = tensor + torch.randn_like(tensor) * 0.0001
	target_path = _create_tp_sharded_dumps(
	target_dir,
	full_tensor=target_tensor,
	name=tensor_name,
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)

	argv = _make_argv(baseline_path, target_path, diff_threshold=0.01)

	records, _ = _run_and_parse(argv, capsys)
	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 2
	assert summary.passed == 2
	assert summary.failed == 0
	assert summary.skipped == 0

	def test_multi_step_tp(self, tmp_path, capsys):
	"""Two steps with TP=2 shards: concat mode merges into one comparison."""
	torch.manual_seed(42)
	full_tensor = torch.randn(4, 8)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	baseline_path = _create_tp_sharded_dumps(
	baseline_dir,
	full_tensor=full_tensor,
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	num_steps=2,
	)
	target_path = _create_tp_sharded_dumps(
	target_dir,
	full_tensor=full_tensor + torch.randn(4, 8) * 0.0001,
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	num_steps=2,
	)

	argv = _make_argv(
	baseline_path,
	target_path,
	diff_threshold=0.01,
	preset="sglang_megatron",
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	# concat along dim 0 (fallback, no token dim) → 2 steps × [4, 8] = [8, 8]
	assert comparisons[0].baseline.shape == [8, 8]

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 1
	assert summary.passed == 1

	def test_cp_axis_unshard(self, tmp_path, capsys):
	"""CP-sharded tensors are correctly concatenated along the sequence dim."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 6)
	full_target = full_baseline + torch.randn(4, 8, 6) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir, full_tensor in [
	(baseline_dir, full_baseline),
	(target_dir, full_target),
	]:
	shards = list(full_tensor.chunk(2, dim=1))
	for cp_rank in range(2):
	_create_rank_dump(
	side_dir,
	rank=cp_rank,
	name="attn_out",
	tensor=shards[cp_rank],
	dims="b s[cp] h",
	parallel_info={"cp_rank": cp_rank, "cp_size": 2},
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "attn_out"

	def test_filter_logical(self, tmp_path, capsys):
	"""--filter in logical grouping selects only matching tensor bundles."""
	torch.manual_seed(42)
	full_a = torch.randn(4, 8)
	full_b = torch.randn(4, 8)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for tensor_name, tensor in [("t_a", full_a), ("t_b", full_b)]:
	_create_tp_sharded_dumps(
	baseline_dir,
	full_tensor=tensor,
	name=tensor_name,
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)
	_create_tp_sharded_dumps(
	target_dir,
	full_tensor=tensor + torch.randn_like(tensor) * 0.0001,
	name=tensor_name,
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	filter="t_a",
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].name == "t_a"

	def test_mixed_dims_logical(self, tmp_path, capsys):
	"""TP-sharded and single-rank tensors in the same logical run both compare successfully."""
	torch.manual_seed(42)
	full_tp_tensor = torch.randn(4, 8)
	single_tensor = torch.randn(4, 4)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	_create_tp_sharded_dumps(
	baseline_dir,
	full_tensor=full_tp_tensor,
	name="tensor_a",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)
	_create_tp_sharded_dumps(
	target_dir,
	full_tensor=full_tp_tensor + torch.randn(4, 8) * 0.0001,
	name="tensor_a",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)

	_create_rank_dump(baseline_dir, rank=0, name="tensor_b", tensor=single_tensor)
	_create_rank_dump(
	target_dir,
	rank=0,
	name="tensor_b",
	tensor=single_tensor + torch.randn(4, 4) * 0.0001,
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 2
	assert all(c.diff is not None and c.diff.passed for c in comparisons)
	assert {c.name for c in comparisons} == {"tensor_a", "tensor_b"}

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 2
	assert summary.passed == 2

	def test_cp_tp_unshard(self, tmp_path, capsys):
	"""CP=2 + TP=2: multi-axis shards are unsharded before comparison."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 16)
	full_target = full_baseline + torch.randn(4, 8, 16) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir, full_tensor in [
	(baseline_dir, full_baseline),
	(target_dir, full_target),
	]:
	_create_cp_tp_sharded_dumps(
	side_dir,
	full_tensor=full_tensor,
	name="hidden",
	cp_size=2,
	tp_size=2,
	seq_dim=1,
	head_dim=2,
	dims_str="b s[cp] h[tp]",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "hidden"

	def test_cp_tp_different_sizes(self, tmp_path, capsys):
	"""Baseline CP=2+TP=2 vs target CP=1+TP=4: both sides independently unsharder."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 16)
	full_target = full_baseline + torch.randn(4, 8, 16) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	_create_cp_tp_sharded_dumps(
	baseline_dir,
	full_tensor=full_baseline,
	name="hidden",
	cp_size=2,
	tp_size=2,
	seq_dim=1,
	head_dim=2,
	dims_str="b s[cp] h[tp]",
	)

	_create_tp_sharded_dumps(
	target_dir,
	full_tensor=full_target,
	name="hidden",
	tp_size=4,
	shard_dim=2,
	dims_str="b s h[tp]",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	_assert_single_comparison_passed(records)

	def test_ep_cp_tp_three_axis_unshard(self, tmp_path, capsys):
	"""EP=2 + CP=2 + TP=2: three-axis shards are unsharded before comparison."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 16, 32)
	full_target = full_baseline + torch.randn(4, 8, 16, 32) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir, full_tensor in [
	(baseline_dir, full_baseline),
	(target_dir, full_target),
	]:
	_create_ep_cp_tp_sharded_dumps(
	side_dir,
	full_tensor=full_tensor,
	name="hidden",
	ep_size=2,
	cp_size=2,
	tp_size=2,
	expert_dim=1,
	seq_dim=2,
	head_dim=3,
	dims_str="b e[ep] s[cp] h[tp]",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "hidden"

	def test_cp_zigzag_unshard(self, tmp_path, capsys):
	"""CP=2 zigzag reorder is correctly undone through the full pipeline."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 6)
	full_target = full_baseline + torch.randn(4, 8, 6) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir, full_tensor in [
	(baseline_dir, full_baseline),
	(target_dir, full_target),
	]:
	_create_cp_zigzag_tp_sharded_dumps(
	side_dir,
	full_tensor=full_tensor,
	name="attn_out",
	cp_size=2,
	tp_size=1,
	seq_dim=1,
	head_dim=2,
	dims_str="b s[cp:zigzag] h",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "attn_out"

	def test_cp_zigzag_tp_unshard(self, tmp_path, capsys):
	"""CP=2 zigzag + TP=2: multi-axis unshard with reorder through full pipeline."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 16)
	full_target = full_baseline + torch.randn(4, 8, 16) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir, full_tensor in [
	(baseline_dir, full_baseline),
	(target_dir, full_target),
	]:
	_create_cp_zigzag_tp_sharded_dumps(
	side_dir,
	full_tensor=full_tensor,
	name="hidden",
	cp_size=2,
	tp_size=2,
	seq_dim=1,
	head_dim=2,
	dims_str="b s[cp:zigzag] h[tp]",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "hidden"

	def test_recompute_pseudo_replicated_verification(self, tmp_path, capsys):
	"""Recompute pseudo-axis with identical original/recompute tensors → passed."""
	torch.manual_seed(42)
	tensor = torch.randn(4, 8)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir in [baseline_dir, target_dir]:
	_create_recompute_rank_dump(
	side_dir,
	rank=0,
	name="hidden",
	original_tensor=tensor,
	recompute_tensor=tensor.clone(),
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	grouping_skip_keys=["rank", "recompute_status"],
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "hidden"

	def test_recompute_pseudo_mismatch_warning(self, tmp_path, capsys):
	"""Recompute pseudo-axis with differing original/recompute → failed replicated_checks."""
	torch.manual_seed(42)
	tensor = torch.randn(4, 8)
	mismatched_tensor = tensor + torch.randn(4, 8) * 10.0

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir in [baseline_dir, target_dir]:
	_create_recompute_rank_dump(
	side_dir,
	rank=0,
	name="hidden",
	original_tensor=tensor,
	recompute_tensor=mismatched_tensor,
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	grouping_skip_keys=["rank", "recompute_status"],
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1

	recompute_checks: list[ReplicatedCheckResult] = [
	c for c in comparisons[0].replicated_checks if c.axis == "recompute_pseudo"
	]
	assert len(recompute_checks) > 0
	assert any(not c.passed for c in recompute_checks)

	def test_tp_partial_reduction_unshard(self, tmp_path, capsys):
	"""TP=2 with partial reduction: element-wise sum reconstructs full tensor."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8)
	full_target = full_baseline + torch.randn(4, 8) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	baseline_path = _create_tp_partial_dumps(
	baseline_dir,
	full_tensor=full_baseline,
	name="attn_out",
	tp_size=2,
	dims_str="b h[tp:partial]",
	)
	target_path = _create_tp_partial_dumps(
	target_dir,
	full_tensor=full_target,
	name="attn_out",
	tp_size=2,
	dims_str="b h[tp:partial]",
	)

	argv = _make_argv(baseline_path, target_path, diff_threshold=0.01)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "attn_out"

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 1
	assert summary.passed == 1

	def test_tp_partial_vs_single_rank(self, tmp_path, capsys):
	"""Baseline single rank vs target TP=2 partial: unshard target then compare."""
	torch.manual_seed(42)
	full_tensor = torch.randn(4, 8)
	target_full = full_tensor + torch.randn(4, 8) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	baseline_path = _create_rank_dump(
	baseline_dir, rank=0, name="attn_out", tensor=full_tensor
	)
	target_path = _create_tp_partial_dumps(
	target_dir,
	full_tensor=target_full,
	name="attn_out",
	tp_size=2,
	dims_str="b h[tp:partial]",
	)

	argv = _make_argv(baseline_path, target_path, diff_threshold=0.01)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "attn_out"

	def test_cp_concat_tp_partial_reduction(self, tmp_path, capsys):
	"""CP=2 concat + TP=2 partial reduction: multi-axis unshard."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 16)
	full_target = full_baseline + torch.randn(4, 8, 16) * 0.001

	for side_dir, full_tensor in [
	(tmp_path / "baseline", full_baseline),
	(tmp_path / "target", full_target),
	]:
	side_dir.mkdir()
	cp_chunks = list(full_tensor.chunk(2, dim=1))
	rank = 0
	for cp_rank in range(2):
	for tp_rank in range(2):
	_create_rank_dump(
	side_dir,
	rank=rank,
	name="hidden",
	tensor=cp_chunks[cp_rank] / 2,
	dims="b s[cp] h[tp:partial]",
	parallel_info={
	"cp_rank": cp_rank,
	"cp_size": 2,
	"tp_rank": tp_rank,
	"tp_size": 2,
	},
	)
	rank += 1

	argv = _make_argv(
	tmp_path / "baseline" / _FIXED_EXP_NAME,
	tmp_path / "target" / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "hidden"

	def test_cp_zigzag_sp_same_dim_unshard(self, tmp_path, capsys):
	"""CP=2 zigzag + SP=2 on same seq dim: multi-axis unshard + reorder."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 6)
	full_target = full_baseline + torch.randn(4, 8, 6) * 0.001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir, full_tensor in [
	(baseline_dir, full_baseline),
	(target_dir, full_target),
	]:
	_create_cp_zigzag_sp_sharded_dumps(
	side_dir,
	full_tensor=full_tensor,
	name="hidden",
	cp_size=2,
	sp_size=2,
	dims_str="b s[cp:zigzag,sp] h",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "hidden"


	class TestEntrypointPerStepMode:
	"""Test per-step comparison mode (sglang_dev preset behavior)."""

	def test_multi_step_per_step_comparison(self, tmp_path, capsys):
	"""Multiple steps produce one ComparisonTensorRecord per step with step field set."""
	torch.manual_seed(42)
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"], num_steps=3)
	argv = _make_argv(baseline_path, target_path, diff_threshold=0.1)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 3

	steps: list[int] = sorted(c.location.step for c in comparisons)
	assert steps == [0, 1, 2]
	assert all(c.diff is not None and c.diff.passed for c in comparisons)

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 3
	assert summary.passed == 3

	def test_per_step_with_tp_unshard(self, tmp_path, capsys):
	"""Per-step mode with TP=2: each step independently unsharded and compared."""
	torch.manual_seed(42)
	full_tensor = torch.randn(4, 8)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	baseline_path = _create_tp_sharded_dumps(
	baseline_dir,
	full_tensor=full_tensor,
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	num_steps=2,
	)
	target_path = _create_tp_sharded_dumps(
	target_dir,
	full_tensor=full_tensor + torch.randn(4, 8) * 0.0001,
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	num_steps=2,
	)

	argv = _make_argv(baseline_path, target_path, diff_threshold=0.01)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 2

	steps: list[int] = sorted(c.location.step for c in comparisons)
	assert steps == [0, 1]
	assert all(c.diff is not None and c.diff.passed for c in comparisons)
	assert all(c.baseline.shape == [4, 8] for c in comparisons)

	def test_single_step_has_step_field(self, tmp_path, capsys):
	"""Single step produces ComparisonTensorRecord with location.step=0."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"], num_steps=1)
	argv = _make_argv(baseline_path, target_path)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].location.step == 0


	class TestEntrypointConcatMode:
	"""Test concat token-aligner mode through the full entrypoint pipeline."""

	@staticmethod
	def _make_dirs(tmp_path: Path) -> tuple[Path, Path]:
	baseline_dir: Path = tmp_path / "baseline"
	target_dir: Path = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()
	return baseline_dir, target_dir

	@staticmethod
	def _create_both_sides(
	tmp_path: Path,
	*,
	baseline_steps: list[torch.Tensor],
	target_steps: list[torch.Tensor],
	name: str = "hidden",
	dims: str \| None = None,
	) -> tuple[Path, Path]:
	"""Create multi-step rank-0 dumps for both sides and return exp paths."""
	baseline_dir, target_dir = TestEntrypointConcatMode._make_dirs(tmp_path)

	for side_dir, steps in [
	(baseline_dir, baseline_steps),
	(target_dir, target_steps),
	]:
	_create_multi_step_rank_dump(
	side_dir,
	rank=0,
	name=name,
	tensors_per_step=steps,
	dims=dims,
	)

	return baseline_dir / _FIXED_EXP_NAME, target_dir / _FIXED_EXP_NAME

	@staticmethod
	def _run_concat(
	tmp_path: Path,
	capsys: pytest.CaptureFixture,
	*,
	baseline_steps: list[torch.Tensor],
	target_steps: list[torch.Tensor],
	name: str = "hidden",
	dims: str \| None = None,
	diff_threshold: float = 0.01,
	) -> list[AnyRecord]:
	"""Create both-side dumps, run comparator, return parsed records."""
	baseline_path, target_path = TestEntrypointConcatMode._create_both_sides(
	tmp_path,
	baseline_steps=baseline_steps,
	target_steps=target_steps,
	name=name,
	dims=dims,
	)
	argv: list[str] = _make_argv(
	baseline_path,
	target_path,
	diff_threshold=diff_threshold,
	preset="sglang_megatron",
	)
	records, _ = _run_and_parse(argv, capsys)
	return records

	def test_concat_multi_step_different_data(self, tmp_path, capsys):
	"""Multi-step concat with different data per step + truncation."""
	torch.manual_seed(42)

	# baseline: 2 steps [5,4] + [3,4] → concat → [8,4]
	baseline_step0 = torch.randn(5, 4)
	baseline_step1 = torch.randn(3, 4)
	baseline_concat = torch.cat([baseline_step0, baseline_step1], dim=0)

	# target: 1 step [6,4] — will be truncated to min(8,6)=6
	target_step0 = baseline_concat[:6] + torch.randn(6, 4) * 0.0001

	records = self._run_concat(
	tmp_path,
	capsys,
	baseline_steps=[baseline_step0, baseline_step1],
	target_steps=[target_step0],
	)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	# truncated to min(8,6) = 6 along concat dim
	assert comparisons[0].baseline.shape == [6, 4]
	assert comparisons[0].target.shape == [6, 4]

	def test_concat_multi_step_tp_unshard(self, tmp_path, capsys):
	"""Multi-step different data + TP=2 unshard + concat."""
	torch.manual_seed(42)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	# 2 steps: [4,8] each → concat → [8,8]
	full_step0 = torch.randn(4, 8)
	full_step1 = torch.randn(4, 8)

	_create_multi_step_tp_sharded_dumps(
	baseline_dir,
	full_tensors_per_step=[full_step0, full_step1],
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)
	_create_multi_step_tp_sharded_dumps(
	target_dir,
	full_tensors_per_step=[
	full_step0 + torch.randn(4, 8) * 0.0001,
	full_step1 + torch.randn(4, 8) * 0.0001,
	],
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp]",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	preset="sglang_megatron",
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	# 2 steps × [4, 8] concat along dim 0 (fallback) → [8, 8]
	assert comparisons[0].baseline.shape == [8, 8]
	assert comparisons[0].diff is not None
	assert comparisons[0].diff.passed

	def test_concat_unequal_step_counts(self, tmp_path, capsys):
	"""Baseline 3 steps vs target 2 steps with truncation."""
	torch.manual_seed(42)

	# baseline: 3 steps [3]+[4]+[2] = 9 tokens along dim 0
	b_step0 = torch.randn(3, 4)
	b_step1 = torch.randn(4, 4)
	b_step2 = torch.randn(2, 4)
	b_concat = torch.cat([b_step0, b_step1, b_step2], dim=0)

	# target: 2 steps [5]+[3] = 8 tokens along dim 0
	t_step0 = b_concat[:5] + torch.randn(5, 4) * 0.0001
	t_step1 = b_concat[5:8] + torch.randn(3, 4) * 0.0001

	records = self._run_concat(
	tmp_path,
	capsys,
	baseline_steps=[b_step0, b_step1, b_step2],
	target_steps=[t_step0, t_step1],
	)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	# truncated to min(9,8) = 8
	assert comparisons[0].baseline.shape == [8, 4]
	assert comparisons[0].target.shape == [8, 4]
	assert comparisons[0].diff is not None
	assert comparisons[0].diff.passed

	def test_concat_token_dim_nonzero(self, tmp_path, capsys):
	"""Token dim at dim=1 (dims='b t h') — concat along dim 1."""
	torch.manual_seed(42)

	# 2 steps: [2,5,4] + [2,3,4] → concat along dim 1 → [2,8,4]
	b_step0 = torch.randn(2, 5, 4)
	b_step1 = torch.randn(2, 3, 4)
	b_concat = torch.cat([b_step0, b_step1], dim=1)

	t_step0 = b_concat[:, :5, :] + torch.randn(2, 5, 4) * 0.0001
	t_step1 = b_concat[:, 5:, :] + torch.randn(2, 3, 4) * 0.0001

	records = self._run_concat(
	tmp_path,
	capsys,
	baseline_steps=[b_step0, b_step1],
	target_steps=[t_step0, t_step1],
	dims="b t h",
	)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].baseline.shape == [2, 8, 4]
	assert comparisons[0].diff is not None
	assert comparisons[0].diff.passed

	def test_concat_seq_dim_fallback(self, tmp_path, capsys):
	"""No 't' dim but 's' dim present (dims='b s h') → concat along s."""
	torch.manual_seed(42)

	# 2 steps: [2,5,4] + [2,3,4] → concat along dim 1 (s) → [2,8,4]
	b_step0 = torch.randn(2, 5, 4)
	b_step1 = torch.randn(2, 3, 4)
	b_concat = torch.cat([b_step0, b_step1], dim=1)

	t_step0 = b_concat[:, :5, :] + torch.randn(2, 5, 4) * 0.0001
	t_step1 = b_concat[:, 5:, :] + torch.randn(2, 3, 4) * 0.0001

	records = self._run_concat(
	tmp_path,
	capsys,
	baseline_steps=[b_step0, b_step1],
	target_steps=[t_step0, t_step1],
	dims="b s h",
	)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].baseline.shape == [2, 8, 4]
	assert comparisons[0].diff is not None
	assert comparisons[0].diff.passed

	def test_concat_no_dims_fallback(self, tmp_path, capsys):
	"""No dims annotation → fallback to concat along dim 0."""
	torch.manual_seed(42)

	# 2 steps: [5,4] + [3,4] → concat along dim 0 → [8,4]
	b_step0 = torch.randn(5, 4)
	b_step1 = torch.randn(3, 4)
	b_concat = torch.cat([b_step0, b_step1], dim=0)

	t_step0 = b_concat[:5] + torch.randn(5, 4) * 0.0001
	t_step1 = b_concat[5:] + torch.randn(3, 4) * 0.0001

	records = self._run_concat(
	tmp_path,
	capsys,
	baseline_steps=[b_step0, b_step1],
	target_steps=[t_step0, t_step1],
	)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].baseline.shape == [8, 4]
	assert comparisons[0].diff is not None
	assert comparisons[0].diff.passed

	def test_concat_preserves_step_order(self, tmp_path, capsys):
	"""Verify step0 data precedes step1 data in the concatenated result."""
	# deterministic integer data: step0=[1,2,3], step1=[4,5]
	b_step0 = torch.tensor([[1.0], [2.0], [3.0]])
	b_step1 = torch.tensor([[4.0], [5.0]])

	# target: same data, single step [1,2,3,4,5]
	t_full = torch.tensor([[1.0], [2.0], [3.0], [4.0], [5.0]])

	records = self._run_concat(
	tmp_path,
	capsys,
	baseline_steps=[b_step0, b_step1],
	target_steps=[t_full],
	)
	comp = _assert_single_comparison_passed(records)
	# if order were wrong, diff would not pass with exact integer data
	assert comp.baseline.shape == [5, 1]
	assert comp.diff is not None
	assert comp.diff.max_abs_diff == 0.0

	def test_concat_aux_tensors_not_filtered(self, tmp_path, capsys):
	"""Concat mode does not filter aux tensors — all participate in comparison."""
	torch.manual_seed(42)

	baseline_dir, target_dir = self._make_dirs(tmp_path)

	hidden = torch.randn(4, 8)
	input_ids = torch.randint(0, 100, (4,))
	positions = torch.arange(4)

	_create_rank_dump(
	baseline_dir,
	rank=0,
	name="hidden_states",
	tensor=hidden,
	extra_dumps=[("input_ids", input_ids), ("positions", positions)],
	)
	_create_rank_dump(
	target_dir,
	rank=0,
	name="hidden_states",
	tensor=hidden + torch.randn(4, 8) * 0.0001,
	extra_dumps=[("input_ids", input_ids), ("positions", positions)],
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	# all 3 tensors should be compared (not filtered out)
	names = {c.name for c in comparisons}
	assert "hidden_states" in names
	assert "input_ids" in names
	assert "positions" in names
	assert len(comparisons) == 3

	def test_concat_aligner_plan_fields(self, tmp_path, capsys):
	"""ComparisonTensorRecord.traced_plan reports mode='concat' with plan=None."""
	torch.manual_seed(42)

	records = self._run_concat(
	tmp_path,
	capsys,
	baseline_steps=[torch.randn(3, 4), torch.randn(2, 4)],
	target_steps=[torch.randn(3, 4), torch.randn(2, 4)],
	diff_threshold=100.0,
	)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	traced_plan = comparisons[0].traced_plan
	assert traced_plan is not None
	plan = traced_plan.plan
	assert plan.token_aligner_mode == "concat_steps"
	assert plan.token_aligner_plan is None

	def test_concat_comparison_fails(self, tmp_path, capsys):
	"""Completely different data → comparison fails."""
	torch.manual_seed(42)
	b_step0 = torch.randn(4, 4)
	b_step1 = torch.randn(3, 4)

	# target: completely different random data
	torch.manual_seed(99)
	t_step0 = torch.randn(4, 4) * 100
	t_step1 = torch.randn(3, 4) * 100

	records = self._run_concat(
	tmp_path,
	capsys,
	baseline_steps=[b_step0, b_step1],
	target_steps=[t_step0, t_step1],
	diff_threshold=1e-6,
	)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].diff is not None
	assert not comparisons[0].diff.passed

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.failed == 1
	assert summary.passed == 0

	def test_concat_multi_step_cp_unshard(self, tmp_path, capsys):
	"""Multi-step different data + CP=2 unshard along seq dim + concat."""
	torch.manual_seed(42)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	# 2 steps: [4,8,6] each → concat along seq dim (dim 1) → [4,16,6]
	full_step0 = torch.randn(4, 8, 6)
	full_step1 = torch.randn(4, 8, 6)

	for side_dir, steps in [
	(baseline_dir, [full_step0, full_step1]),
	(
	target_dir,
	[
	full_step0 + torch.randn(4, 8, 6) * 0.0001,
	full_step1 + torch.randn(4, 8, 6) * 0.0001,
	],
	),
	]:
	for cp_rank in range(2):
	per_step_shards: list[torch.Tensor] = [
	t.chunk(2, dim=1)[cp_rank] for t in steps
	]
	_create_multi_step_rank_dump(
	side_dir,
	rank=cp_rank,
	name="attn_out",
	tensors_per_step=per_step_shards,
	dims="b s[cp] h",
	parallel_info={"cp_rank": cp_rank, "cp_size": 2},
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	preset="sglang_megatron",
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	# CP unshard: [4,4,6] × 2 ranks → [4,8,6] per step
	# concat along seq dim (dim 1): 2 steps × [4,8,6] → [4,16,6]
	assert comparisons[0].baseline.shape == [4, 16, 6]
	assert comparisons[0].diff is not None
	assert comparisons[0].diff.passed

	def test_concat_thd_cp_zigzag(self, tmp_path: Path, capsys) -> None:
	"""Concat mode with THD CP=2 zigzag (Megatron format) — unshard + reorder works."""
	torch.manual_seed(42)
	cp_size: int = 2
	seq_lens: list[int] = [100, 64]
	total_tokens: int = sum(seq_lens)
	total_per_rank: int = 128
	num_steps: int = 2

	full_tensor: torch.Tensor = torch.randn(total_tokens + 92)

	baseline_dir: Path = tmp_path / "baseline"
	target_dir: Path = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()

	baseline_path: Path = _create_thd_cp_zigzag_dumps(
	baseline_dir,
	full_tensor=full_tensor,
	name="hidden_states",
	seq_lens=seq_lens,
	cp_size=cp_size,
	total_per_rank=total_per_rank,
	num_steps=num_steps,
	)

	target_tensor: torch.Tensor = full_tensor + torch.randn_like(full_tensor) * 1e-5
	target_path: Path = _create_thd_cp_zigzag_dumps(
	target_dir,
	full_tensor=target_tensor,
	name="hidden_states",
	seq_lens=seq_lens,
	cp_size=cp_size,
	total_per_rank=total_per_rank,
	num_steps=num_steps,
	)

	argv: list[str] = _make_argv(
	baseline_path,
	target_path,
	preset="sglang_megatron",
	diff_threshold=1e-3,
	)
	records, _ = _run_and_parse(argv, capsys)

	comparisons: list[ComparisonTensorRecord] = _get_comparisons(records)
	hidden_comparisons: list[ComparisonTensorRecord] = [
	c for c in comparisons if c.name == "hidden_states"
	]
	assert len(hidden_comparisons) >= 1
	assert all(c.diff is not None and c.diff.passed for c in hidden_comparisons)


	class TestEntrypointAxisAligner:
	"""Test cross-framework dim reordering through the full entrypoint pipeline."""

	def test_axis_swap_different_dim_order(self, tmp_path, capsys):
	"""Baseline dims 'b h d' vs target dims 'b d h': axis swapper rearranges baseline to match."""
	torch.manual_seed(42)
	full_tensor = torch.randn(4, 8, 16)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	_create_rank_dump(
	baseline_dir,
	rank=0,
	name="hidden",
	tensor=full_tensor,
	dims="b h d",
	)
	_create_rank_dump(
	target_dir,
	rank=0,
	name="hidden",
	tensor=full_tensor.permute(0, 2, 1).contiguous(),
	dims="b d h",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "hidden"
	assert comp.baseline.shape == [4, 16, 8]
	assert comp.target.shape == [4, 16, 8]

	def test_axis_swap_with_tp_unshard(self, tmp_path, capsys):
	"""Baseline TP=2 with dims 'b h[tp] d' vs target TP=2 with dims 'b d h[tp]': unshard + axis swap."""
	torch.manual_seed(42)
	full_tensor = torch.randn(4, 8, 16)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	_create_tp_sharded_dumps(
	baseline_dir,
	full_tensor=full_tensor,
	name="hidden",
	tp_size=2,
	shard_dim=1,
	dims_str="b h[tp] d",
	)
	_create_tp_sharded_dumps(
	target_dir,
	full_tensor=full_tensor.permute(0, 2, 1).contiguous(),
	name="hidden",
	tp_size=2,
	shard_dim=2,
	dims_str="b d h[tp]",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "hidden"

	def test_squeeze_dim_one_side(self, tmp_path, capsys):
	"""SGLang dims 't h' vs Megatron dims 't 1 h': axis aligner squeezes the singleton dim."""
	torch.manual_seed(42)
	full_tensor = torch.randn(4, 8)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	_create_rank_dump(
	baseline_dir,
	rank=0,
	name="hidden",
	tensor=full_tensor,
	dims="t h",
	)
	_create_rank_dump(
	target_dir,
	rank=0,
	name="hidden",
	tensor=full_tensor.unsqueeze(1),
	dims="t 1 h",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.name == "hidden"
	assert comp.baseline.shape == [4, 8]
	assert comp.target.shape == [4, 8]


	class TestEntrypointReplicatedAxis:
	"""Test replicated-axis scenarios through the full entrypoint pipeline."""

	def test_replicated_axis_identical_replicas_passed(self, tmp_path, capsys):
	"""CP2 TP2, TP replicated and identical → passed, replicated_checks all passed."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 6)
	full_target = full_baseline + torch.randn(4, 8, 6) * 0.0001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir, full_tensor in [
	(baseline_dir, full_baseline),
	(target_dir, full_target),
	]:
	_create_replicated_tp_sharded_cp_dumps(
	side_dir,
	full_tensor=full_tensor,
	name="attn_out",
	cp_size=2,
	tp_size=2,
	seq_dim=1,
	dims_str="b s[cp] d # tp:replicated",
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comp = _assert_single_comparison_passed(records)
	assert comp.errors == []
	assert comp.infos == []
	assert all(c.passed for c in comp.replicated_checks)

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.passed == 1

	def test_replicated_mismatch_fails(self, tmp_path, capsys):
	"""CP2 TP2, TP replicas differ (> atol) → failed with replicated_checks."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 6)
	full_target = full_baseline + torch.randn(4, 8, 6) * 0.0001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir, full_tensor in [
	(baseline_dir, full_baseline),
	(target_dir, full_target),
	]:
	_create_replicated_tp_sharded_cp_dumps(
	side_dir,
	full_tensor=full_tensor,
	name="attn_out",
	cp_size=2,
	tp_size=2,
	seq_dim=1,
	dims_str="b s[cp] d # tp:replicated",
	tp_noise=0.5,
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].category == "failed"
	assert any(not c.passed for c in comparisons[0].replicated_checks)

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.failed == 1

	def test_summary_counts_failed_from_replicated_checks_only(self, tmp_path, capsys):
	"""Diff itself passes but TP replicas differ → summary.failed=1 from replicated_checks."""
	torch.manual_seed(42)
	full_baseline = torch.randn(4, 8, 6)
	full_target = full_baseline + torch.randn(4, 8, 6) * 0.0001

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	_create_replicated_tp_sharded_cp_dumps(
	baseline_dir,
	full_tensor=full_baseline,
	name="attn_out",
	cp_size=2,
	tp_size=2,
	seq_dim=1,
	dims_str="b s[cp] d # tp:replicated",
	tp_noise=0.5,
	)
	_create_replicated_tp_sharded_cp_dumps(
	target_dir,
	full_tensor=full_target,
	name="attn_out",
	cp_size=2,
	tp_size=2,
	seq_dim=1,
	dims_str="b s[cp] d # tp:replicated",
	tp_noise=0.5,
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.5,
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1

	comp = comparisons[0]
	assert comp.diff is not None
	assert comp.diff.passed
	assert any(not c.passed for c in comp.replicated_checks)
	assert comp.category == "failed"

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.failed == 1
	assert summary.passed == 0

	def test_replicated_shape_mismatch(self, tmp_path, capsys):
	"""TP replicated tensors with different shapes → failed, replicated diff=None."""
	torch.manual_seed(42)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir in [baseline_dir, target_dir]:
	# rank 0 (cp=0, tp=0): shape (4, 4, 6)
	_create_rank_dump(
	side_dir,
	rank=0,
	name="attn_out",
	tensor=torch.randn(4, 4, 6),
	dims="b s[cp] d # tp:replicated",
	parallel_info={
	"cp_rank": 0,
	"cp_size": 2,
	"tp_rank": 0,
	"tp_size": 2,
	},
	)
	# rank 1 (cp=0, tp=1): shape (4, 4, 3) — different last dim
	_create_rank_dump(
	side_dir,
	rank=1,
	name="attn_out",
	tensor=torch.randn(4, 4, 3),
	dims="b s[cp] d # tp:replicated",
	parallel_info={
	"cp_rank": 0,
	"cp_size": 2,
	"tp_rank": 1,
	"tp_size": 2,
	},
	)
	# rank 2 (cp=1, tp=0): shape (4, 4, 6)
	_create_rank_dump(
	side_dir,
	rank=2,
	name="attn_out",
	tensor=torch.randn(4, 4, 6),
	dims="b s[cp] d # tp:replicated",
	parallel_info={
	"cp_rank": 1,
	"cp_size": 2,
	"tp_rank": 0,
	"tp_size": 2,
	},
	)
	# rank 3 (cp=1, tp=1): shape (4, 4, 3) — different last dim
	_create_rank_dump(
	side_dir,
	rank=3,
	name="attn_out",
	tensor=torch.randn(4, 4, 3),
	dims="b s[cp] d # tp:replicated",
	parallel_info={
	"cp_rank": 1,
	"cp_size": 2,
	"tp_rank": 1,
	"tp_size": 2,
	},
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	diff_threshold=0.01,
	)

	records, _ = _run_and_parse(argv, capsys)
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].category == "failed"

	failed_checks = [c for c in comparisons[0].replicated_checks if not c.passed]
	assert len(failed_checks) >= 1
	assert all(c.diff is None for c in failed_checks)

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.failed == 1


	class TestEntrypointAlignment:
	"""Test smart token alignment with aux tensors."""

	def test_sglang_multi_step_alignment(self, tmp_path, capsys):
	"""SGLang multi-step dumps with aux tensors auto-trigger alignment."""
	torch.manual_seed(42)
	hidden_dim = 8

	hidden_step0 = torch.randn(5, hidden_dim)
	hidden_step1 = torch.randn(2, hidden_dim)

	exp_paths: list[Path] = []
	for side_dir in ["baseline", "target"]:
	d = tmp_path / side_dir
	d.mkdir()

	dumper = _Dumper(
	config=DumperConfig(
	enable=True,
	dir=str(d),
	exp_name=_FIXED_EXP_NAME,
	)
	)

	# Step 0: prefill with 2 sequences (3+2 tokens)
	dumper.dump("input_ids", torch.tensor([10, 20, 30, 40, 50]))
	dumper.dump("positions", torch.tensor([0, 1, 2, 0, 1]))
	dumper.dump("seq_lens", torch.tensor([3, 2]))
	dumper.dump("req_pool_indices", torch.tensor([7, 3]))
	dumper.dump("rids", ["A", "B"])
	dumper.dump("hidden_states", hidden_step0)
	dumper.step()

	# Step 1: decode (1 token per sequence)
	dumper.dump("input_ids", torch.tensor([31, 51]))
	dumper.dump("positions", torch.tensor([3, 2]))
	dumper.dump("seq_lens", torch.tensor([1, 1]))
	dumper.dump("req_pool_indices", torch.tensor([7, 3]))
	dumper.dump("rids", ["A", "B"])
	dumper.dump("hidden_states", hidden_step1)
	dumper.step()

	exp_paths.append(d / _FIXED_EXP_NAME)

	argv = _make_argv(
	exp_paths[0],
	exp_paths[1],
	grouping_skip_keys=["rank", "step"],
	token_aligner="smart",
	)
	records, _ = _run_and_parse(argv, capsys)

	comparisons = _get_comparisons(records)
	# AUX_NAMES are filtered out after plan computation → only hidden_states remains
	assert len(comparisons) == 1
	assert comparisons[0].name == "hidden_states"
	assert comparisons[0].diff is not None
	assert comparisons[0].diff.passed

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.passed == 1
	assert summary.failed == 0
	assert summary.skipped == 0

	def test_sglang_vs_megatron_cross_framework(self, tmp_path, capsys):
	"""SGLang 4-step thd baseline vs Megatron 1-step thd target align correctly."""
	torch.manual_seed(42)
	hidden_dim: int = 8

	all_hiddens: torch.Tensor = torch.randn(11, hidden_dim)
	seq_a_hiddens: torch.Tensor = all_hiddens[:6]
	seq_b_hiddens: torch.Tensor = all_hiddens[6:]

	# --- SGLang baseline: 1 prefill + 3 decode ---
	sglang_dir: Path = tmp_path / "baseline"
	sglang_dir.mkdir()
	sglang_dumper = _Dumper(
	config=DumperConfig(
	enable=True,
	dir=str(sglang_dir),
	exp_name=_FIXED_EXP_NAME,
	)
	)

	# Step 0: prefill — seq A (3 tokens) + seq B (2 tokens)
	sglang_dumper.dump("input_ids", torch.tensor([10, 20, 30, 40, 50]))
	sglang_dumper.dump("positions", torch.tensor([0, 1, 2, 0, 1]))
	sglang_dumper.dump("seq_lens", torch.tensor([3, 2]))
	sglang_dumper.dump("req_pool_indices", torch.tensor([7, 3]))
	sglang_dumper.dump("rids", ["A", "B"])
	sglang_dumper.dump(
	"hidden_states",
	torch.stack(
	[
	seq_a_hiddens[0],
	seq_a_hiddens[1],
	seq_a_hiddens[2],
	seq_b_hiddens[0],
	seq_b_hiddens[1],
	]
	),
	)
	sglang_dumper.step()

	# Steps 1-3: decode — 1 token per sequence
	decode_data: list[dict[str, object]] = [
	{
	"input_ids": torch.tensor([31, 51]),
	"positions": torch.tensor([3, 2]),
	"hidden": torch.stack([seq_a_hiddens[3], seq_b_hiddens[2]]),
	},
	{
	"input_ids": torch.tensor([32, 52]),
	"positions": torch.tensor([4, 3]),
	"hidden": torch.stack([seq_a_hiddens[4], seq_b_hiddens[3]]),
	},
	{
	"input_ids": torch.tensor([33, 53]),
	"positions": torch.tensor([5, 4]),
	"hidden": torch.stack([seq_a_hiddens[5], seq_b_hiddens[4]]),
	},
	]
	for step_data in decode_data:
	sglang_dumper.dump("input_ids", step_data["input_ids"])
	sglang_dumper.dump("positions", step_data["positions"])
	sglang_dumper.dump("seq_lens", torch.tensor([1, 1]))
	sglang_dumper.dump("req_pool_indices", torch.tensor([7, 3]))
	sglang_dumper.dump("rids", ["A", "B"])
	sglang_dumper.dump("hidden_states", step_data["hidden"])
	sglang_dumper.step()

	# --- Megatron target: 1 step, thd [T, H] ---
	megatron_dir: Path = tmp_path / "target"
	megatron_dir.mkdir()
	megatron_dumper = _Dumper(
	config=DumperConfig(
	enable=True,
	dir=str(megatron_dir),
	exp_name=_FIXED_EXP_NAME,
	)
	)

	# THD flat: seq A (6 tokens) + seq B (5 tokens) = 11 tokens total
	megatron_input_ids: torch.Tensor = torch.tensor(
	[10, 20, 30, 31, 32, 33, 40, 50, 51, 52, 53]
	)
	megatron_cu_seqlens: torch.Tensor = torch.tensor([0, 6, 11])

	megatron_hidden: torch.Tensor = torch.cat([seq_a_hiddens, seq_b_hiddens], dim=0)

	megatron_dumper.dump("input_ids", megatron_input_ids)
	megatron_dumper.dump("cu_seqlens_q", megatron_cu_seqlens)
	megatron_dumper.dump("hidden_states", megatron_hidden)
	megatron_dumper.step()

	# --- Run comparison ---
	argv = _make_argv(
	sglang_dir / _FIXED_EXP_NAME,
	megatron_dir / _FIXED_EXP_NAME,
	grouping_skip_keys=["rank", "step"],
	token_aligner="smart",
	)

	records, _ = _run_and_parse(argv, capsys)

	log_records = [r for r in records if isinstance(r, LogRecord)]
	layout_infos = [
	i
	for lr in log_records
	for i in lr.infos
	if isinstance(i, InfoLog) and i.category == "layout_detection_fallback"
	]
	assert len(layout_infos) == 1

	comparisons = _get_comparisons(records)
	# AUX_NAMES filtered out → only hidden_states remains
	assert len(comparisons) == 1
	assert comparisons[0].name == "hidden_states"
	assert comparisons[0].diff is not None
	assert comparisons[0].diff.passed

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.passed == 1
	assert summary.failed == 0
	assert summary.skipped == 0

	def test_alignment_fallback_when_no_aux(self, tmp_path, capsys):
	"""Without aux tensors, smart alignment falls back to per-step comparison."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"], num_steps=2)
	argv = _make_argv(
	baseline_path,
	target_path,
	token_aligner="smart",
	diff_threshold=0.1,
	)

	capsys.readouterr()
	run(parse_args(argv))
	captured = capsys.readouterr()
	records = _parse_jsonl(captured.out)
	log_records = [r for r in records if isinstance(r, LogRecord)]
	aux_missing_infos = [
	i
	for lr in log_records
	for i in lr.infos
	if isinstance(i, InfoLog) and i.category == "aux_tensors_missing"
	]
	assert len(aux_missing_infos) == 1

	comparisons = _get_comparisons(records)
	assert len(comparisons) == 2

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.total == 2
	assert summary.passed == 2


	class TestEntrypointNonTensorValues:
	"""Test non-tensor value comparison through the full entrypoint pipeline."""

	def test_non_tensor_float_same_value(self, tmp_path: Path, capsys) -> None:
	"""Two sides dump the same float → ComparisonNonTensorRecord with values_equal=True, category=passed."""
	baseline_path, target_path = _create_non_tensor_dumps(
	tmp_path, name="sm_scale", baseline_value=0.125, target_value=0.125
	)
	argv = _make_argv(baseline_path, target_path, preset="raw")
	records, _ = _run_and_parse(argv, capsys)

	non_tensors = _get_non_tensors(records)
	assert len(non_tensors) == 1
	assert non_tensors[0].name == "sm_scale"
	assert non_tensors[0].values_equal is True
	assert non_tensors[0].category == "passed"

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.passed == 1
	assert summary.failed == 0

	def test_non_tensor_float_different_value(self, tmp_path: Path, capsys) -> None:
	"""Two sides dump different floats → ComparisonNonTensorRecord with values_equal=False, category=failed."""
	baseline_path, target_path = _create_non_tensor_dumps(
	tmp_path, name="sm_scale", baseline_value=0.125, target_value=0.25
	)
	argv = _make_argv(baseline_path, target_path, preset="raw")
	records, _ = _run_and_parse(argv, capsys)

	non_tensors = _get_non_tensors(records)
	assert len(non_tensors) == 1
	assert non_tensors[0].values_equal is False
	assert non_tensors[0].category == "failed"

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.failed == 1

	def test_non_tensor_string_value(self, tmp_path: Path, capsys) -> None:
	"""String non-tensor values are compared and displayed correctly."""
	baseline_path, target_path = _create_non_tensor_dumps(
	tmp_path,
	name="attn_backend",
	baseline_value="flash_attn",
	target_value="flash_attn",
	)
	argv = _make_argv(baseline_path, target_path, preset="raw")
	records, _ = _run_and_parse(argv, capsys)

	non_tensors = _get_non_tensors(records)
	assert len(non_tensors) == 1
	assert non_tensors[0].values_equal is True
	assert non_tensors[0].baseline_type == "str"
	assert non_tensors[0].target_type == "str"

	def test_non_tensor_mixed_with_tensor(self, tmp_path: Path, capsys) -> None:
	"""Tensors and non_tensors in the same dump are each handled correctly."""
	torch.manual_seed(42)
	tensor = torch.randn(4, 4)

	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"

	for side_dir in [baseline_dir, target_dir]:
	_create_non_tensor_rank_dump(
	side_dir,
	rank=0,
	name="sm_scale",
	value=0.125,
	extra_tensor_dumps=[("hidden", tensor)],
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	preset="raw",
	)
	records, _ = _run_and_parse(argv, capsys)

	comparisons = _get_comparisons(records)
	non_tensors = _get_non_tensors(records)
	assert len(comparisons) == 1
	assert comparisons[0].name == "hidden"
	assert len(non_tensors) == 1
	assert non_tensors[0].name == "sm_scale"
	assert non_tensors[0].values_equal is True

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.passed == 2

	def test_non_tensor_complex_object(self, tmp_path: Path, capsys) -> None:
	"""Complex objects (e.g. dict containing a tensor) are displayed via repr, not skipped."""
	value = {"a": 1, "b": "hello", "c": torch.tensor([1.0, 2.0])}
	baseline_path, target_path = _create_non_tensor_dumps(
	tmp_path, name="debug_info", baseline_value=value, target_value=value
	)
	argv = _make_argv(baseline_path, target_path, preset="raw")
	records, _ = _run_and_parse(argv, capsys)

	non_tensors = _get_non_tensors(records)
	assert len(non_tensors) == 1
	assert non_tensors[0].name == "debug_info"
	assert non_tensors[0].baseline_type == "dict"
	assert non_tensors[0].target_type == "dict"

	def test_non_tensor_none_value(self, tmp_path: Path, capsys) -> None:
	"""Dumping None is displayed as ComparisonNonTensorRecord, not skipped as load failure."""
	baseline_path, target_path = _create_non_tensor_dumps(
	tmp_path, name="optional_param", baseline_value=None, target_value=None
	)
	argv = _make_argv(baseline_path, target_path, preset="raw")
	records, _ = _run_and_parse(argv, capsys)

	non_tensors = _get_non_tensors(records)
	assert len(non_tensors) == 1
	assert non_tensors[0].name == "optional_param"
	assert non_tensors[0].values_equal is True
	assert non_tensors[0].baseline_value == "None"
	assert non_tensors[0].baseline_type == "NoneType"
	assert non_tensors[0].category == "passed"

	def test_non_tensor_json_roundtrip(self, tmp_path: Path, capsys) -> None:
	"""ComparisonNonTensorRecord JSON output can be parsed back correctly."""
	baseline_path, target_path = _create_non_tensor_dumps(
	tmp_path, name="sm_scale", baseline_value=0.125, target_value=0.125
	)
	argv = _make_argv(baseline_path, target_path, preset="raw")
	records, _ = _run_and_parse(argv, capsys)

	non_tensors = _get_non_tensors(records)
	assert len(non_tensors) == 1

	json_str: str = non_tensors[0].model_dump_json()
	roundtripped = parse_record_json(json_str)
	assert isinstance(roundtripped, ComparisonNonTensorRecord)
	assert roundtripped.name == "sm_scale"
	assert roundtripped.values_equal is True


	# ───────────────────── Visualization integration tests ─────────────────────


	class TestEntrypointVisualize:
	"""Test --visualize-bundle-details integration."""

	@pytest.fixture(autouse=True)
	def _skip_if_no_matplotlib(self) -> None:
	pytest.importorskip("matplotlib")

	def test_visualize_creates_pngs(self, tmp_path, capsys):
	"""--visualize-bundle-details with --filter produces PNG files."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a", "tensor_b"])
	viz_dir = tmp_path / "viz_out"
	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	filter="tensor_a",
	viz_bundle_details=True,
	viz_output_dir=str(viz_dir),
	)

	records, _ = _run_and_parse(argv, capsys)
	assert len(_get_comparisons(records)) == 1

	png_files = list(viz_dir.glob("*.png"))
	assert len(png_files) == 1
	assert png_files[0].stat().st_size > 0

	def test_no_visualize_no_png(self, tmp_path, capsys):
	"""Without --visualize-bundle-details, no PNGs are created."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	viz_dir = tmp_path / "viz_out"
	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	viz_bundle_details=False,
	viz_output_dir=str(viz_dir),
	)

	_run_and_parse(argv, capsys)
	assert not viz_dir.exists() or len(list(viz_dir.glob("*.png"))) == 0


	# --------------------------- Assertion helpers -------------------


	def _get_comparisons(records: list[AnyRecord]) -> list[ComparisonTensorRecord]:
	return [r for r in records if isinstance(r, ComparisonTensorRecord)]


	def _get_non_tensors(records: list[AnyRecord]) -> list[ComparisonNonTensorRecord]:
	return [r for r in records if isinstance(r, ComparisonNonTensorRecord)]


	def _assert_single_comparison_passed(
	records: list[AnyRecord],
	) -> ComparisonTensorRecord:
	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].diff is not None
	assert comparisons[0].diff.passed
	return comparisons[0]


	# --------------------------- Utils ------------------------------


	def _make_dumper(directory: Path) -> _Dumper:
	return _Dumper(config=DumperConfig(enable=True, dir=str(directory)))


	def _create_dumps(
	tmp_path: Path,
	tensor_names: list[str],
	*,
	baseline_names: list[str] \| None = None,
	num_steps: int = 1,
	) -> tuple[Path, Path]:
	"""Create baseline and target dump directories with given tensor names.

	If baseline_names is None, uses the same names as tensor_names.
	Each step dumps all names with the same tensor (different per baseline/target).
	"""
	if baseline_names is None:
	baseline_names = tensor_names

	d_baseline = tmp_path / "baseline"
	d_target = tmp_path / "target"
	d_baseline.mkdir()
	d_target.mkdir()

	torch.manual_seed(42)
	baseline_tensor = torch.randn(10, 10)
	target_tensor = baseline_tensor + torch.randn(10, 10) * 0.01

	exp_paths: list[Path] = []
	for d, names, tensor in [
	(d_baseline, baseline_names, baseline_tensor),
	(d_target, tensor_names, target_tensor),
	]:
	dumper = _make_dumper(d)
	for _ in range(num_steps):
	for name in names:
	dumper.dump(name, tensor)
	dumper.step()
	exp_paths.append(d / dumper._config.exp_name)

	return exp_paths[0], exp_paths[1]


	def _create_non_tensor_rank_dump(
	directory: Path,
	*,
	rank: int,
	name: str,
	value: object,
	extra_tensor_dumps: list[tuple[str, torch.Tensor]] \| None = None,
	) -> Path:
	with pytest.MonkeyPatch.context() as mp:
	mp.setattr(_dumper_module, "_get_rank", lambda: rank)

	dumper = _Dumper(
	config=DumperConfig(
	enable=True,
	dir=str(directory),
	exp_name=_FIXED_EXP_NAME,
	)
	)
	dumper.__dict__["_static_meta"] = {"world_rank": rank, "world_size": 1}

	dumper.dump(name, value)
	for extra_name, extra_tensor in extra_tensor_dumps or []:
	dumper.dump(extra_name, extra_tensor)
	dumper.step()

	return directory / _FIXED_EXP_NAME


	def _create_non_tensor_dumps(
	tmp_path: Path,
	*,
	name: str,
	baseline_value: object,
	target_value: object,
	) -> tuple[Path, Path]:
	baseline_dir = tmp_path / "baseline"
	target_dir = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()

	baseline_path = _create_non_tensor_rank_dump(
	baseline_dir, rank=0, name=name, value=baseline_value
	)
	target_path = _create_non_tensor_rank_dump(
	target_dir, rank=0, name=name, value=target_value
	)
	return baseline_path, target_path


	def _make_argv(
	baseline_path: Path,
	target_path: Path,
	*,
	preset: str \| None = None,
	grouping_skip_keys: list[str] \| None = None,
	token_aligner: str \| None = None,
	diff_threshold: float = 1e-3,
	output_format: str = "json",
	start_step: int \| None = None,
	end_step: int \| None = None,
	filter: str \| None = None,
	override_dims: list[str] \| None = None,
	override_baseline_dims: list[str] \| None = None,
	override_target_dims: list[str] \| None = None,
	override_config: str \| None = None,
	allow_skipped_pattern: str \| None = None,
	allow_failed_pattern: str \| None = None,
	report_path: str \| None = "",
	viz_bundle_details: bool = False,
	viz_output_dir: str \| None = None,
	visualize_per_token: str \| None = None,
	) -> list[str]:
	argv: list[str] = [
	"--baseline-path",
	str(baseline_path),
	"--target-path",
	str(target_path),
	"--diff-threshold",
	str(diff_threshold),
	"--output-format",
	output_format,
	]

	if preset is not None:
	argv += ["--preset", preset]
	if grouping_skip_keys is not None:
	argv += ["--grouping-skip-keys"] + grouping_skip_keys
	if token_aligner is not None:
	argv += ["--token-aligner", token_aligner]
	if start_step is not None:
	argv += ["--start-step", str(start_step)]
	if end_step is not None:
	argv += ["--end-step", str(end_step)]
	if filter is not None:
	argv += ["--filter", filter]
	for dim in override_dims or []:
	argv += ["--override-dims", dim]
	for dim in override_baseline_dims or []:
	argv += ["--override-baseline-dims", dim]
	for dim in override_target_dims or []:
	argv += ["--override-target-dims", dim]
	if override_config is not None:
	argv += ["--override-config", override_config]
	if allow_skipped_pattern is not None:
	argv += ["--allow-skipped-pattern", allow_skipped_pattern]
	if allow_failed_pattern is not None:
	argv += ["--allow-failed-pattern", allow_failed_pattern]
	if report_path is not None:
	argv += ["--report-path", report_path]
	if viz_bundle_details:
	argv += ["--viz-bundle-details"]
	if viz_output_dir is not None:
	argv += ["--viz-output-dir", viz_output_dir]
	if visualize_per_token is not None:
	argv += ["--visualize-per-token", visualize_per_token]

	return argv


	def _run_and_parse(
	argv: list[str], capsys: pytest.CaptureFixture
	) -> tuple[list[AnyRecord], int]:
	args: Namespace = parse_args(argv)
	capsys.readouterr()
	exit_code: int = run(args)
	return _parse_jsonl(capsys.readouterr().out), exit_code


	def _parse_jsonl(output: str) -> list[AnyRecord]:
	return [parse_record_json(line) for line in output.strip().splitlines()]


	def _create_rank_dump(
	directory: Path,
	*,
	rank: int,
	name: str,
	tensor: torch.Tensor,
	dims: str \| None = None,
	parallel_info: dict \| None = None,
	framework: str = "sglang",
	num_steps: int = 1,
	extra_dumps: list[tuple[str, object]] \| None = None,
	) -> Path:
	"""Create a dump file via the real dumper, as if running on the given rank.

	extra_dumps: additional (name, value) pairs to dump alongside the main tensor each step.
	"""
	with pytest.MonkeyPatch.context() as mp:
	mp.setattr(_dumper_module, "_get_rank", lambda: rank)

	dumper = _Dumper(
	config=DumperConfig(
	enable=True,
	dir=str(directory),
	exp_name=_FIXED_EXP_NAME,
	)
	)

	static_meta: dict = {"world_rank": rank, "world_size": 1}
	if parallel_info is not None:
	static_meta[f"{framework}_parallel_info"] = parallel_info
	dumper.__dict__["_static_meta"] = static_meta

	for _ in range(num_steps):
	dumper.dump(name, tensor, dims=dims)
	for extra_name, extra_value in extra_dumps or []:
	dumper.dump(extra_name, extra_value)
	dumper.step()

	return directory / _FIXED_EXP_NAME


	def _create_multi_step_rank_dump(
	directory: Path,
	*,
	rank: int,
	name: str,
	tensors_per_step: list[torch.Tensor],
	dims: str \| None = None,
	parallel_info: dict \| None = None,
	framework: str = "sglang",
	) -> Path:
	"""Create a dump file with different tensors per step.

	Unlike ``_create_rank_dump`` (which repeats the same tensor),
	this helper accepts a list of tensors — one per step.
	"""
	with pytest.MonkeyPatch.context() as mp:
	mp.setattr(_dumper_module, "_get_rank", lambda: rank)

	dumper = _Dumper(
	config=DumperConfig(
	enable=True,
	dir=str(directory),
	exp_name=_FIXED_EXP_NAME,
	)
	)

	static_meta: dict = {"world_rank": rank, "world_size": 1}
	if parallel_info is not None:
	static_meta[f"{framework}_parallel_info"] = parallel_info
	dumper.__dict__["_static_meta"] = static_meta

	for tensor in tensors_per_step:
	dumper.dump(name, tensor, dims=dims)
	dumper.step()

	return directory / _FIXED_EXP_NAME


	def _create_cp_tp_sharded_dumps(
	directory: Path,
	*,
	full_tensor: torch.Tensor,
	name: str,
	cp_size: int,
	tp_size: int,
	seq_dim: int,
	head_dim: int,
	dims_str: str,
	num_steps: int = 1,
	) -> Path:
	"""Create CP+TP multi-axis sharded dump files from a full tensor."""
	cp_chunks = list(full_tensor.chunk(cp_size, dim=seq_dim))
	rank = 0
	for cp_rank in range(cp_size):
	tp_chunks = list(cp_chunks[cp_rank].chunk(tp_size, dim=head_dim))
	for tp_rank in range(tp_size):
	_create_rank_dump(
	directory,
	rank=rank,
	name=name,
	tensor=tp_chunks[tp_rank],
	dims=dims_str,
	parallel_info={
	"cp_rank": cp_rank,
	"cp_size": cp_size,
	"tp_rank": tp_rank,
	"tp_size": tp_size,
	},
	num_steps=num_steps,
	)
	rank += 1
	return directory / _FIXED_EXP_NAME


	def _create_ep_cp_tp_sharded_dumps(
	directory: Path,
	*,
	full_tensor: torch.Tensor,
	name: str,
	ep_size: int,
	cp_size: int,
	tp_size: int,
	expert_dim: int,
	seq_dim: int,
	head_dim: int,
	dims_str: str,
	num_steps: int = 1,
	) -> Path:
	"""Create EP+CP+TP three-axis sharded dump files from a full tensor."""
	ep_chunks = list(full_tensor.chunk(ep_size, dim=expert_dim))
	rank = 0
	for ep_rank in range(ep_size):
	cp_chunks = list(ep_chunks[ep_rank].chunk(cp_size, dim=seq_dim))
	for cp_rank in range(cp_size):
	tp_chunks = list(cp_chunks[cp_rank].chunk(tp_size, dim=head_dim))
	for tp_rank in range(tp_size):
	_create_rank_dump(
	directory,
	rank=rank,
	name=name,
	tensor=tp_chunks[tp_rank],
	dims=dims_str,
	parallel_info={
	"ep_rank": ep_rank,
	"ep_size": ep_size,
	"cp_rank": cp_rank,
	"cp_size": cp_size,
	"tp_rank": tp_rank,
	"tp_size": tp_size,
	},
	num_steps=num_steps,
	)
	rank += 1
	return directory / _FIXED_EXP_NAME


	def _create_cp_zigzag_tp_sharded_dumps(
	directory: Path,
	*,
	full_tensor: torch.Tensor,
	name: str,
	cp_size: int,
	tp_size: int,
	seq_dim: int,
	head_dim: int,
	dims_str: str,
	num_steps: int = 1,
	) -> Path:
	"""Create CP-zigzag (+optional TP) sharded dump files from a full tensor."""
	num_chunks: int = cp_size * 2
	natural_chunks: list[torch.Tensor] = list(
	full_tensor.chunk(num_chunks, dim=seq_dim)
	)

	zigzag_order: list[int] = []
	for i in range(cp_size):
	zigzag_order.append(i)
	zigzag_order.append(num_chunks - 1 - i)

	zigzagged: torch.Tensor = torch.cat(
	[natural_chunks[idx] for idx in zigzag_order], dim=seq_dim
	)

	cp_chunks: list[torch.Tensor] = list(zigzagged.chunk(cp_size, dim=seq_dim))

	rank: int = 0
	for cp_rank in range(cp_size):
	tp_chunks: list[torch.Tensor] = (
	list(cp_chunks[cp_rank].chunk(tp_size, dim=head_dim))
	if tp_size > 1
	else [cp_chunks[cp_rank]]
	)
	for tp_rank in range(tp_size):
	parallel_info: dict[str, int] = {
	"cp_rank": cp_rank,
	"cp_size": cp_size,
	}
	if tp_size > 1:
	parallel_info["tp_rank"] = tp_rank
	parallel_info["tp_size"] = tp_size

	_create_rank_dump(
	directory,
	rank=rank,
	name=name,
	tensor=tp_chunks[tp_rank],
	dims=dims_str,
	parallel_info=parallel_info,
	num_steps=num_steps,
	)
	rank += 1

	return directory / _FIXED_EXP_NAME


	def _create_cp_zigzag_sp_sharded_dumps(
	directory: Path,
	*,
	full_tensor: torch.Tensor,
	name: str,
	cp_size: int,
	sp_size: int,
	dims_str: str,
	seq_dim: int = 1,
	num_steps: int = 1,
	) -> Path:
	"""Create CP-zigzag + SP sharded dump files for a seq dim (b s h format).

	Shard order (outer to inner, matching left-to-right in dims annotation):
	1. CP zigzag splits seq dim into cp_size chunks (zigzag order)
	2. SP splits each CP chunk into sp_size chunks
	"""
	num_chunks: int = cp_size * 2
	natural_chunks: list[torch.Tensor] = list(
	full_tensor.chunk(num_chunks, dim=seq_dim)
	)

	zigzag_order: list[int] = []
	for i in range(cp_size):
	zigzag_order.append(i)
	zigzag_order.append(num_chunks - 1 - i)

	zigzagged: torch.Tensor = torch.cat(
	[natural_chunks[idx] for idx in zigzag_order], dim=seq_dim
	)
	cp_chunks: list[torch.Tensor] = list(zigzagged.chunk(cp_size, dim=seq_dim))

	rank: int = 0
	for cp_rank in range(cp_size):
	sp_chunks: list[torch.Tensor] = list(
	cp_chunks[cp_rank].chunk(sp_size, dim=seq_dim)
	)
	for sp_rank in range(sp_size):
	_create_rank_dump(
	directory,
	rank=rank,
	name=name,
	tensor=sp_chunks[sp_rank],
	dims=dims_str,
	parallel_info={
	"cp_rank": cp_rank,
	"cp_size": cp_size,
	"sp_rank": sp_rank,
	"sp_size": sp_size,
	},
	num_steps=num_steps,
	)
	rank += 1

	return directory / _FIXED_EXP_NAME


	def _create_replicated_tp_sharded_cp_dumps(
	directory: Path,
	*,
	full_tensor: torch.Tensor,
	name: str,
	cp_size: int,
	tp_size: int,
	seq_dim: int,
	dims_str: str,
	tp_noise: float = 0.0,
	) -> Path:
	"""Create CP-sharded + TP-replicated dump files from a full tensor.

	CP direction: chunks along seq_dim (sharded).
	TP direction: clones (replicated), with optional noise to simulate mismatch.
	"""
	cp_chunks: list[torch.Tensor] = list(full_tensor.chunk(cp_size, dim=seq_dim))

	rank: int = 0
	for cp_rank in range(cp_size):
	for tp_rank in range(tp_size):
	shard = cp_chunks[cp_rank].clone()
	if tp_noise > 0 and tp_rank > 0:
	shard = shard + torch.randn_like(shard) * tp_noise

	_create_rank_dump(
	directory,
	rank=rank,
	name=name,
	tensor=shard,
	dims=dims_str,
	parallel_info={
	"cp_rank": cp_rank,
	"cp_size": cp_size,
	"tp_rank": tp_rank,
	"tp_size": tp_size,
	},
	)
	rank += 1

	return directory / _FIXED_EXP_NAME


	def _create_tp_sharded_dumps(
	directory: Path,
	*,
	full_tensor: torch.Tensor,
	name: str,
	tp_size: int,
	shard_dim: int,
	dims_str: str,
	num_steps: int = 1,
	) -> Path:
	"""Create TP-sharded dump files from a full tensor via the real dumper."""
	shards = list(full_tensor.chunk(tp_size, dim=shard_dim))
	for tp_rank in range(tp_size):
	_create_rank_dump(
	directory,
	rank=tp_rank,
	name=name,
	tensor=shards[tp_rank],
	dims=dims_str,
	parallel_info={"tp_rank": tp_rank, "tp_size": tp_size},
	num_steps=num_steps,
	)
	return directory / _FIXED_EXP_NAME


	def _create_multi_step_tp_sharded_dumps(
	directory: Path,
	*,
	full_tensors_per_step: list[torch.Tensor],
	name: str,
	tp_size: int,
	shard_dim: int,
	dims_str: str,
	) -> Path:
	"""Create TP-sharded dump files with different tensors per step.

	Each step's full tensor is chunked across TP ranks, then
	``_create_multi_step_rank_dump`` writes one file per rank.
	"""
	shards_per_rank: list[list[torch.Tensor]] = [[] for _ in range(tp_size)]
	for full_tensor in full_tensors_per_step:
	shards = list(full_tensor.chunk(tp_size, dim=shard_dim))
	for tp_rank in range(tp_size):
	shards_per_rank[tp_rank].append(shards[tp_rank])

	for tp_rank in range(tp_size):
	_create_multi_step_rank_dump(
	directory,
	rank=tp_rank,
	name=name,
	tensors_per_step=shards_per_rank[tp_rank],
	dims=dims_str,
	parallel_info={"tp_rank": tp_rank, "tp_size": tp_size},
	)
	return directory / _FIXED_EXP_NAME


	def _create_tp_partial_dumps(
	directory: Path,
	*,
	full_tensor: torch.Tensor,
	name: str,
	tp_size: int,
	dims_str: str,
	num_steps: int = 1,
	) -> Path:
	"""Create TP-partial dump files where each rank holds full_tensor / tp_size.

	Each rank stores an equal fraction of the full tensor so that
	element-wise summation across ranks reconstructs the original.
	"""
	for tp_rank in range(tp_size):
	_create_rank_dump(
	directory,
	rank=tp_rank,
	name=name,
	tensor=full_tensor / tp_size,
	dims=dims_str,
	parallel_info={"tp_rank": tp_rank, "tp_size": tp_size},
	num_steps=num_steps,
	)
	return directory / _FIXED_EXP_NAME


	def _create_recompute_rank_dump(
	directory: Path,
	*,
	rank: int,
	name: str,
	original_tensor: torch.Tensor,
	recompute_tensor: torch.Tensor,
	dims: str = "h d",
	) -> Path:
	"""Create a dump with both original and recompute forward passes via monkeypatched dumper.

	The dumper naturally produces recompute_pseudo_rank=0 for original and =1 for recompute,
	plus recompute_pseudo_size=2.
	"""
	with pytest.MonkeyPatch.context() as mp:
	mp.setattr(_dumper_module, "_get_rank", lambda: rank)

	dumper = _Dumper(
	config=DumperConfig(
	enable=True,
	dir=str(directory),
	exp_name=_FIXED_EXP_NAME,
	)
	)
	dumper.__dict__["_static_meta"] = {"world_rank": rank, "world_size": 1}

	# dump original forward
	mp.setattr(
	_dumper_module,
	"_detect_recompute_status",
	lambda: _RecomputeStatus.ORIGINAL,
	)
	dumper.dump(name, original_tensor, dims=dims)

	# dump recompute forward
	mp.setattr(
	_dumper_module,
	"_detect_recompute_status",
	lambda: _RecomputeStatus.RECOMPUTE,
	)
	dumper.dump(name, recompute_tensor, dims=dims)

	dumper.step()

	return directory / _FIXED_EXP_NAME


	def _zigzag_split_seq(seq_natural: torch.Tensor, *, cp_size: int) -> list[torch.Tensor]:
	"""Split a natural-order seq into per-rank zigzag segments."""
	num_chunks: int = cp_size * 2
	chunks: list[torch.Tensor] = list(seq_natural.chunk(num_chunks, dim=0))
	order: list[int] = []
	for i in range(cp_size):
	order.append(i)
	order.append(num_chunks - 1 - i)
	zigzagged: torch.Tensor = torch.cat([chunks[i] for i in order], dim=0)
	return list(zigzagged.chunk(cp_size, dim=0))


	def _create_thd_cp_zigzag_dumps(
	directory: Path,
	*,
	full_tensor: torch.Tensor,
	name: str,
	seq_lens: list[int],
	cp_size: int,
	total_per_rank: int,
	dims_str: str = "t[cp:zigzag]",
	num_steps: int = 1,
	) -> Path:
	"""Create THD CP-zigzag sharded dump files simulating Megatron forward.

	Args:
	full_tensor: 1D tensor of shape [T] in natural order.
	seq_lens: per-seq token counts in natural order (e.g. [100, 64]).
	cp_size: context parallelism size.
	total_per_rank: total tokens per rank (including padding).
	dims_str: dims annotation for the main tensor.
	"""
	# Build per-rank tensors from natural-order full_tensor
	offset: int = 0
	rank_segments: list[list[torch.Tensor]] = [[] for _ in range(cp_size)]

	for seq_len in seq_lens:
	seq_natural: torch.Tensor = full_tensor[offset : offset + seq_len]
	seq_ranks: list[torch.Tensor] = _zigzag_split_seq(seq_natural, cp_size=cp_size)
	for rank_idx in range(cp_size):
	rank_segments[rank_idx].append(seq_ranks[rank_idx])
	offset += seq_len

	# Build cu_seqlens from seq_lens (global, replicated across ranks)
	cu_seqlens_values: list[int] = [0]
	for slen in seq_lens:
	cu_seqlens_values.append(cu_seqlens_values[-1] + slen)

	# Pad to total_per_rank per rank (global pad = last cu_seqlens entry to total_per_rank * cp_size)
	total_global: int = total_per_rank * cp_size
	if cu_seqlens_values[-1] < total_global:
	pad_global: int = total_global - cu_seqlens_values[-1]
	cu_seqlens_values.append(total_global)
	pad_per_rank: int = pad_global // cp_size
	for rank_idx in range(cp_size):
	rank_segments[rank_idx].append(torch.zeros(pad_per_rank))

	cu_seqlens_q: torch.Tensor = torch.tensor(cu_seqlens_values, dtype=torch.int64)

	# Dump each rank
	for cp_rank in range(cp_size):
	rank_tensor: torch.Tensor = torch.cat(rank_segments[cp_rank], dim=0)
	assert (
	rank_tensor.shape[0] == total_per_rank
	), f"rank {cp_rank}: expected {total_per_rank} tokens, got {rank_tensor.shape[0]}"

	_create_rank_dump(
	directory,
	rank=cp_rank,
	name=name,
	tensor=rank_tensor,
	dims=dims_str,
	parallel_info={
	"cp_rank": cp_rank,
	"cp_size": cp_size,
	},
	framework="megatron",
	num_steps=num_steps,
	extra_dumps=[
	("cu_seqlens_q", cu_seqlens_q),
	("input_ids", rank_tensor.to(torch.int64)),
	],
	)

	return directory / _FIXED_EXP_NAME


	class TestEntrypointPerTokenVisualization:
	"""Test --visualize-per-token CLI flag integration."""

	def test_visualize_per_token_creates_png(self, tmp_path: Path, capsys) -> None:
	"""--visualize-per-token with dims metadata produces per-token data in records."""
	pytest.importorskip("matplotlib")

	torch.manual_seed(42)
	baseline_dir: Path = tmp_path / "baseline"
	target_dir: Path = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()

	baseline_tensor: torch.Tensor = torch.randn(10, 10)
	target_tensor: torch.Tensor = baseline_tensor + torch.randn(10, 10) * 0.01

	for name in ["tensor_a", "tensor_b"]:
	_create_rank_dump(
	baseline_dir,
	rank=0,
	name=name,
	tensor=baseline_tensor,
	dims="t h",
	)
	_create_rank_dump(
	target_dir,
	rank=0,
	name=name,
	tensor=target_tensor,
	dims="t h",
	)

	baseline_path: Path = baseline_dir / _FIXED_EXP_NAME
	target_path: Path = target_dir / _FIXED_EXP_NAME

	output_png: Path = tmp_path / "per_token.png"
	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	visualize_per_token=str(output_png),
	)
	records, _ = _run_and_parse(argv, capsys)

	comparisons = _get_comparisons(records)
	assert len(comparisons) == 2

	# per_token_rel_diff should be populated
	for comp in comparisons:
	assert comp.diff is not None
	assert comp.diff.per_token_rel_diff is not None
	assert isinstance(comp.diff.per_token_rel_diff, list)
	assert len(comp.diff.per_token_rel_diff) == 10

	def test_no_visualize_no_per_token(self, tmp_path: Path, capsys) -> None:
	"""Without --visualize-per-token, per_token_rel_diff is None."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	argv = _make_argv(baseline_path, target_path, preset="raw")

	records, _ = _run_and_parse(argv, capsys)

	comparisons = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].diff is not None
	assert comparisons[0].diff.per_token_rel_diff is None


	class TestEntrypointThdCpZigzag:
	"""E2E entrypoint tests for THD CP zigzag format.

	Tests the full pipeline: dump creation → metadata loading → aligner plan →
	unshard + reorder → tensor comparison.
	"""

	def test_sglang_vs_megatron_zigzag_cp(self, tmp_path: Path, capsys) -> None:
	"""SGLang single-rank THD baseline vs Megatron CP=2 zigzag target."""
	torch.manual_seed(42)
	hidden_dim: int = 8
	cp_size: int = 2

	# Two sequences: 8 and 4 tokens (divisible by cp_size*2=4 for clean zigzag)
	seq_a_ids: list[int] = [10, 20, 30, 40, 50, 60, 70, 80]
	seq_b_ids: list[int] = [100, 200, 300, 400]
	all_ids: list[int] = seq_a_ids + seq_b_ids
	total_tokens: int = len(all_ids)
	seq_lens: list[int] = [len(seq_a_ids), len(seq_b_ids)]

	hidden_states: torch.Tensor = torch.randn(total_tokens, hidden_dim)

	# --- SGLang baseline: single rank, 1 step ---
	sglang_dir: Path = tmp_path / "baseline"
	sglang_dir.mkdir()
	sglang_dumper = _Dumper(
	config=DumperConfig(
	enable=True,
	dir=str(sglang_dir),
	exp_name=_FIXED_EXP_NAME,
	)
	)

	positions: list[int] = list(range(seq_lens[0])) + list(range(seq_lens[1]))
	sglang_dumper.dump("input_ids", torch.tensor(all_ids))
	sglang_dumper.dump("positions", torch.tensor(positions))
	sglang_dumper.dump("seq_lens", torch.tensor(seq_lens))
	sglang_dumper.dump("rids", ["A", "B"])
	sglang_dumper.dump("hidden_states", hidden_states)
	sglang_dumper.step()

	# --- Megatron target: CP=2, zigzag, 1 step ---
	megatron_dir: Path = tmp_path / "target"
	megatron_dir.mkdir()

	# Zigzag-split input_ids and hidden_states per sequence, then concat
	ids_tensor: torch.Tensor = torch.tensor(all_ids, dtype=torch.int64)
	offset: int = 0
	rank_id_segments: list[list[torch.Tensor]] = [[] for _ in range(cp_size)]
	rank_hidden_segments: list[list[torch.Tensor]] = [[] for _ in range(cp_size)]
	for slen in seq_lens:
	seq_ids: torch.Tensor = ids_tensor[offset : offset + slen]
	seq_hidden: torch.Tensor = hidden_states[offset : offset + slen]
	zigzag_ids: list[torch.Tensor] = _zigzag_split_seq(seq_ids, cp_size=cp_size)
	zigzag_hidden: list[torch.Tensor] = _zigzag_split_seq(
	seq_hidden, cp_size=cp_size
	)
	for rank_idx in range(cp_size):
	rank_id_segments[rank_idx].append(zigzag_ids[rank_idx])
	rank_hidden_segments[rank_idx].append(zigzag_hidden[rank_idx])
	offset += slen

	cu_seqlens_q: torch.Tensor = torch.tensor(
	[0] + [sum(seq_lens[: i + 1]) for i in range(len(seq_lens))],
	dtype=torch.int64,
	)

	for cp_rank in range(cp_size):
	rank_ids: torch.Tensor = torch.cat(rank_id_segments[cp_rank])
	rank_hidden: torch.Tensor = torch.cat(rank_hidden_segments[cp_rank])
	_create_rank_dump(
	megatron_dir,
	rank=cp_rank,
	name="hidden_states",
	tensor=rank_hidden,
	dims="t[cp:zigzag] h",
	parallel_info={"cp_rank": cp_rank, "cp_size": cp_size},
	framework="megatron",
	extra_dumps=[
	("cu_seqlens_q", cu_seqlens_q),
	("input_ids", rank_ids),
	],
	)

	# --- Run comparison ---
	argv: list[str] = _make_argv(
	sglang_dir / _FIXED_EXP_NAME,
	megatron_dir / _FIXED_EXP_NAME,
	grouping_skip_keys=["rank", "step"],
	token_aligner="smart",
	diff_threshold=1e-3,
	)
	records, _ = _run_and_parse(argv, capsys)

	comparisons: list[ComparisonTensorRecord] = _get_comparisons(records)
	hidden_comparisons: list[ComparisonTensorRecord] = [
	c for c in comparisons if c.name == "hidden_states"
	]
	assert len(hidden_comparisons) >= 1
	assert all(c.diff is not None and c.diff.passed for c in hidden_comparisons)

	def test_thd_cp_zigzag_unshard(self, tmp_path: Path, capsys) -> None:
	"""Both sides THD CP=2 zigzag, comparison should pass."""
	torch.manual_seed(42)
	cp_size: int = 2
	seq_lens: list[int] = [100, 64]
	total_tokens: int = sum(seq_lens)
	total_per_rank: int = 128

	full_tensor: torch.Tensor = torch.randn(total_tokens + 92)

	baseline_dir: Path = tmp_path / "baseline"
	target_dir: Path = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()

	baseline_path: Path = _create_thd_cp_zigzag_dumps(
	baseline_dir,
	full_tensor=full_tensor,
	name="hidden_states",
	seq_lens=seq_lens,
	cp_size=cp_size,
	total_per_rank=total_per_rank,
	)

	# Target: same data with small noise
	target_tensor: torch.Tensor = full_tensor + torch.randn_like(full_tensor) * 1e-5
	target_path: Path = _create_thd_cp_zigzag_dumps(
	target_dir,
	full_tensor=target_tensor,
	name="hidden_states",
	seq_lens=seq_lens,
	cp_size=cp_size,
	total_per_rank=total_per_rank,
	)

	argv: list[str] = _make_argv(
	baseline_path,
	target_path,
	grouping_skip_keys=["rank", "step"],
	token_aligner="smart",
	diff_threshold=1e-3,
	)
	records, _ = _run_and_parse(argv, capsys)

	# hidden_states should pass comparison (after unshard + reorder)
	comparisons: list[ComparisonTensorRecord] = _get_comparisons(records)
	hidden_comparisons: list[ComparisonTensorRecord] = [
	c for c in comparisons if c.name == "hidden_states"
	]
	assert len(hidden_comparisons) >= 1
	assert all(c.diff is not None and c.diff.passed for c in hidden_comparisons)


	class TestEntrypointDpFilter:
	"""E2E tests for DP (data parallel) filtering.

	When DP > 1, only one dp_rank has non-empty tensors; the others
	dump empty (numel=0) tensors. The comparator should filter out the
	empty dp_rank items and produce correct comparison results.
	"""

	def test_dp2_sglang_both_sides(self, tmp_path: Path, capsys) -> None:
	"""DP=2 sglang: both baseline and target have 1 non-empty + 1 empty dp_rank."""
	torch.manual_seed(42)
	tensor_data: torch.Tensor = torch.randn(10, 8)
	target_data: torch.Tensor = tensor_data + torch.randn(10, 8) * 0.001

	for side, side_dir_name, data in [
	("baseline", "baseline", tensor_data),
	("target", "target", target_data),
	]:
	side_dir: Path = tmp_path / side_dir_name
	side_dir.mkdir()

	# dp_rank=0: non-empty tensor
	_create_rank_dump(
	side_dir,
	rank=0,
	name="hidden",
	tensor=data,
	dims="t h",
	parallel_info={
	"tp_rank": 0,
	"tp_size": 1,
	"dp_rank": 0,
	"dp_size": 2,
	},
	framework="sglang",
	)

	# dp_rank=1: empty tensor
	_create_rank_dump(
	side_dir,
	rank=1,
	name="hidden",
	tensor=torch.empty(0, 8),
	dims="t h",
	parallel_info={
	"tp_rank": 0,
	"tp_size": 1,
	"dp_rank": 1,
	"dp_size": 2,
	},
	framework="sglang",
	)

	argv: list[str] = _make_argv(
	tmp_path / "baseline" / _FIXED_EXP_NAME,
	tmp_path / "target" / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	)
	records, _ = _run_and_parse(argv, capsys)

	comparison: ComparisonTensorRecord = _assert_single_comparison_passed(records)
	assert comparison.name == "hidden"

	def test_dp2_megatron_both_sides(self, tmp_path: Path, capsys) -> None:
	"""DP=2 megatron: both baseline and target have 1 non-empty + 1 empty dp_rank."""
	torch.manual_seed(42)
	tensor_data: torch.Tensor = torch.randn(10, 8)
	target_data: torch.Tensor = tensor_data + torch.randn(10, 8) * 0.001

	for side, side_dir_name, data in [
	("baseline", "baseline", tensor_data),
	("target", "target", target_data),
	]:
	side_dir: Path = tmp_path / side_dir_name
	side_dir.mkdir()

	# dp_rank=0: non-empty tensor
	_create_rank_dump(
	side_dir,
	rank=0,
	name="hidden",
	tensor=data,
	dims="t h",
	parallel_info={
	"tp_rank": 0,
	"tp_size": 1,
	"dp_rank": 0,
	"dp_size": 2,
	},
	framework="megatron",
	)

	# dp_rank=1: empty tensor
	_create_rank_dump(
	side_dir,
	rank=1,
	name="hidden",
	tensor=torch.empty(0, 8),
	dims="t h",
	parallel_info={
	"tp_rank": 0,
	"tp_size": 1,
	"dp_rank": 1,
	"dp_size": 2,
	},
	framework="megatron",
	)

	argv: list[str] = _make_argv(
	tmp_path / "baseline" / _FIXED_EXP_NAME,
	tmp_path / "target" / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	)
	records, _ = _run_and_parse(argv, capsys)

	comparison: ComparisonTensorRecord = _assert_single_comparison_passed(records)
	assert comparison.name == "hidden"

	def test_dp2_tp2_sglang(self, tmp_path: Path, capsys) -> None:
	"""DP=2 x TP=2 sglang: 4 ranks, dp_rank=0 has data, dp_rank=1 empty."""
	torch.manual_seed(42)
	full_tensor: torch.Tensor = torch.randn(10, 8)
	tp_chunks: list[torch.Tensor] = list(full_tensor.chunk(2, dim=1))

	target_full: torch.Tensor = full_tensor + torch.randn(10, 8) * 0.001
	target_tp_chunks: list[torch.Tensor] = list(target_full.chunk(2, dim=1))

	for side, side_dir_name, chunks in [
	("baseline", "baseline", tp_chunks),
	("target", "target", target_tp_chunks),
	]:
	side_dir: Path = tmp_path / side_dir_name
	side_dir.mkdir()

	rank: int = 0
	for dp_rank in range(2):
	for tp_rank in range(2):
	tensor: torch.Tensor = (
	chunks[tp_rank] if dp_rank == 0 else torch.empty(0, 4)
	)
	_create_rank_dump(
	side_dir,
	rank=rank,
	name="hidden",
	tensor=tensor,
	dims="t h[tp]",
	parallel_info={
	"tp_rank": tp_rank,
	"tp_size": 2,
	"dp_rank": dp_rank,
	"dp_size": 2,
	},
	framework="sglang",
	)
	rank += 1

	argv: list[str] = _make_argv(
	tmp_path / "baseline" / _FIXED_EXP_NAME,
	tmp_path / "target" / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	)
	records, _ = _run_and_parse(argv, capsys)

	comparison: ComparisonTensorRecord = _assert_single_comparison_passed(records)
	assert comparison.name == "hidden"

	def test_dp2_both_nonempty_raises(self, tmp_path: Path, capsys) -> None:
	"""DP=2 sglang: both dp_rank=0 and dp_rank=1 have non-empty tensors => AssertionError."""
	torch.manual_seed(42)
	tensor_data: torch.Tensor = torch.randn(10, 8)
	target_data: torch.Tensor = tensor_data + torch.randn(10, 8) * 0.001

	for side, side_dir_name, data in [
	("baseline", "baseline", tensor_data),
	("target", "target", target_data),
	]:
	side_dir: Path = tmp_path / side_dir_name
	side_dir.mkdir()

	for dp_rank in range(2):
	_create_rank_dump(
	side_dir,
	rank=dp_rank,
	name="hidden",
	tensor=data,
	dims="t h",
	parallel_info={
	"tp_rank": 0,
	"tp_size": 1,
	"dp_rank": dp_rank,
	"dp_size": 2,
	},
	framework="sglang",
	)

	argv: list[str] = _make_argv(
	tmp_path / "baseline" / _FIXED_EXP_NAME,
	tmp_path / "target" / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	)

	records, exit_code = _run_and_parse(argv, capsys)
	errors = [r for r in records if isinstance(r, ComparisonErrorRecord)]
	assert len(errors) == 1
	assert errors[0].exception_type == "AssertionError"
	assert "Expected exactly 1 non-empty dp_rank" in errors[0].traceback_str
	assert exit_code == 1


	class TestEntrypointDpGroupAlias:
	"""E2E tests for the ``# dp:=<group>`` dp group alias feature.

	In dp_attn mode, dp_size > 1 but MLP tensors after dp_gather have data
	on all ranks. With ``# dp:=moe_dp`` in dims, the dp filter uses
	``moe_dp_rank/moe_dp_size`` instead of ``dp_rank/dp_size``.
	"""

	def test_dp_alias_absent_group_noop(self, tmp_path: Path, capsys) -> None:
	"""Single rank with ``# dp:=moe_dp`` in dims → parse_dims strips ``#``, comparison OK."""
	torch.manual_seed(42)
	tensor_data: torch.Tensor = torch.randn(10, 8)
	target_data: torch.Tensor = tensor_data + torch.randn(10, 8) * 0.001

	for side_dir_name, data in [("baseline", tensor_data), ("target", target_data)]:
	side_dir: Path = tmp_path / side_dir_name
	side_dir.mkdir()

	_create_rank_dump(
	side_dir,
	rank=0,
	name="hidden",
	tensor=data,
	dims="t h # dp:=moe_dp",
	parallel_info={
	"tp_rank": 0,
	"tp_size": 1,
	"dp_rank": 0,
	"dp_size": 1,
	},
	framework="sglang",
	)

	argv: list[str] = _make_argv(
	tmp_path / "baseline" / _FIXED_EXP_NAME,
	tmp_path / "target" / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	)
	records, _ = _run_and_parse(argv, capsys)

	comparison: ComparisonTensorRecord = _assert_single_comparison_passed(records)
	assert comparison.name == "hidden"

	def test_dp_alias_via_override_dims(self, tmp_path: Path, capsys) -> None:
	"""--override-dims adds ``# dp:=moe_dp`` → dp filter uses alias, filters correctly."""
	torch.manual_seed(42)
	tensor_data: torch.Tensor = torch.randn(10, 8)
	target_data: torch.Tensor = tensor_data + torch.randn(10, 8) * 0.001

	for side_dir_name, data in [("baseline", tensor_data), ("target", target_data)]:
	side_dir: Path = tmp_path / side_dir_name
	side_dir.mkdir()

	# moe_dp_rank=0: non-empty
	_create_rank_dump(
	side_dir,
	rank=0,
	name="hidden",
	tensor=data,
	dims="t h",
	parallel_info={
	"tp_rank": 0,
	"tp_size": 1,
	"dp_rank": 0,
	"dp_size": 1,
	"moe_dp_rank": 0,
	"moe_dp_size": 2,
	},
	framework="sglang",
	)

	# moe_dp_rank=1: empty
	_create_rank_dump(
	side_dir,
	rank=1,
	name="hidden",
	tensor=torch.empty(0, 8),
	dims="t h",
	parallel_info={
	"tp_rank": 0,
	"tp_size": 1,
	"dp_rank": 0,
	"dp_size": 1,
	"moe_dp_rank": 1,
	"moe_dp_size": 2,
	},
	framework="sglang",
	)

	argv: list[str] = _make_argv(
	tmp_path / "baseline" / _FIXED_EXP_NAME,
	tmp_path / "target" / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	override_dims=["hidden:t h # dp:=moe_dp"],
	)
	records, _ = _run_and_parse(argv, capsys)

	comparison: ComparisonTensorRecord = _assert_single_comparison_passed(records)
	assert comparison.name == "hidden"

	def test_dp_alias_with_real_alias_group_filters(
	self, tmp_path: Path, capsys
	) -> None:
	"""Alias group present with moe_dp_size=2, one empty rank → filters correctly."""
	torch.manual_seed(42)
	tensor_data: torch.Tensor = torch.randn(10, 8)
	target_data: torch.Tensor = tensor_data + torch.randn(10, 8) * 0.001

	for side_dir_name, data in [("baseline", tensor_data), ("target", target_data)]:
	side_dir: Path = tmp_path / side_dir_name
	side_dir.mkdir()

	for moe_dp_rank in range(2):
	tensor: torch.Tensor = data if moe_dp_rank == 0 else torch.empty(0, 8)
	_create_rank_dump(
	side_dir,
	rank=moe_dp_rank,
	name="hidden",
	tensor=tensor,
	dims="t h # dp:=moe_dp",
	parallel_info={
	"tp_rank": 0,
	"tp_size": 1,
	"dp_rank": 0,
	"dp_size": 1,
	"moe_dp_rank": moe_dp_rank,
	"moe_dp_size": 2,
	},
	framework="sglang",
	)

	argv: list[str] = _make_argv(
	tmp_path / "baseline" / _FIXED_EXP_NAME,
	tmp_path / "target" / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	)
	records, _ = _run_and_parse(argv, capsys)

	comparison: ComparisonTensorRecord = _assert_single_comparison_passed(records)
	assert comparison.name == "hidden"


	class TestEntrypointMetaOverride:
	"""E2E: dump with wrong dims → --override-dims / --override-config corrects at comparison time."""

	@staticmethod
	def _create_single_rank_pair(
	tmp_path: Path,
	*,
	name: str = "hidden",
	baseline_dims: str \| None = "x y",
	target_dims: str \| None = "x y",
	) -> tuple[Path, Path]:
	"""Create single-rank baseline+target dumps with a close tensor pair."""
	torch.manual_seed(42)
	tensor: torch.Tensor = torch.randn(10, 8)
	target: torch.Tensor = tensor + torch.randn(10, 8) * 0.001

	baseline_dir: Path = tmp_path / "baseline"
	target_dir: Path = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()

	_create_rank_dump(
	baseline_dir, rank=0, name=name, tensor=tensor, dims=baseline_dims
	)
	_create_rank_dump(
	target_dir, rank=0, name=name, tensor=target, dims=target_dims
	)

	return baseline_dir / _FIXED_EXP_NAME, target_dir / _FIXED_EXP_NAME

	@staticmethod
	def _assert_all_passed(
	records: list[AnyRecord], *, expected_count: int = 1
	) -> None:
	"""Assert that exactly expected_count comparisons exist and all passed."""
	comparisons: list[ComparisonTensorRecord] = _get_comparisons(records)
	assert len(comparisons) == expected_count
	assert all(c.diff is not None and c.diff.passed for c in comparisons)

	def test_override_dims_fixes_wrong_dims(self, tmp_path: Path, capsys) -> None:
	"""Tensor dumped with wrong dims='h d' is fixed by --override-dims to 't h[tp]'."""
	torch.manual_seed(42)

	full_tensor: torch.Tensor = torch.randn(10, 8)
	tp_chunks: list[torch.Tensor] = list(full_tensor.chunk(2, dim=1))

	target_full: torch.Tensor = full_tensor + torch.randn(10, 8) * 0.001
	target_tp_chunks: list[torch.Tensor] = list(target_full.chunk(2, dim=1))

	baseline_dir: Path = tmp_path / "baseline"
	target_dir: Path = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()

	# Dump with WRONG dims "h d" instead of correct "t h[tp]"
	for tp_rank in range(2):
	_create_rank_dump(
	baseline_dir,
	rank=tp_rank,
	name="hidden",
	tensor=tp_chunks[tp_rank],
	dims="h d",
	parallel_info={"tp_rank": tp_rank, "tp_size": 2},
	)
	_create_rank_dump(
	target_dir,
	rank=tp_rank,
	name="hidden",
	tensor=target_tp_chunks[tp_rank],
	dims="h d",
	parallel_info={"tp_rank": tp_rank, "tp_size": 2},
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	override_dims=["hidden:t h[tp]"],
	)
	self._assert_all_passed(_run_and_parse(argv, capsys)[0])

	@pytest.mark.parametrize(
	"baseline_dims, target_dims, override_kwarg",
	[
	("x y", "t h", {"override_baseline_dims": ["hidden:t h"]}),
	("t h", "x y", {"override_target_dims": ["hidden:t h"]}),
	("x y", "x y", {"override_dims": ["hidden:t h"]}),
	],
	ids=["baseline_only", "target_only", "both_via_override_dims"],
	)
	def test_single_side_override(
	self,
	tmp_path: Path,
	capsys,
	baseline_dims: str,
	target_dims: str,
	override_kwarg: dict,
	) -> None:
	"""Per-side override fixes the wrong dims on one or both sides."""
	baseline_path, target_path = self._create_single_rank_pair(
	tmp_path,
	baseline_dims=baseline_dims,
	target_dims=target_dims,
	)

	argv = _make_argv(baseline_path, target_path, preset="raw", **override_kwarg)
	self._assert_all_passed(_run_and_parse(argv, capsys)[0])

	def test_override_config_yaml(self, tmp_path: Path, capsys) -> None:
	"""--override-config YAML overrides dims."""
	baseline_path, target_path = self._create_single_rank_pair(tmp_path)

	yaml_path: Path = tmp_path / "override.yaml"
	yaml_path.write_text(textwrap.dedent("""\
	overrides:
	- match: "hidden"
	dims: "t h"
	"""))

	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	override_config=str(yaml_path),
	)
	self._assert_all_passed(_run_and_parse(argv, capsys)[0])

	def test_no_match_uses_original_dims(self, tmp_path: Path, capsys) -> None:
	"""When override regex doesn't match, original dims from dump are used."""
	baseline_path, target_path = self._create_single_rank_pair(
	tmp_path,
	baseline_dims="t h",
	target_dims="t h",
	)

	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	override_dims=["no_match_pattern:b s d"],
	)
	self._assert_all_passed(_run_and_parse(argv, capsys)[0])

	def test_selective_match_multi_tensor(self, tmp_path: Path, capsys) -> None:
	"""Override matches only 'logits'; 'hidden' uses original dims."""
	torch.manual_seed(42)

	baseline_dir: Path = tmp_path / "baseline"
	target_dir: Path = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()

	hidden_b: torch.Tensor = torch.randn(10, 8)
	hidden_t: torch.Tensor = hidden_b + torch.randn(10, 8) * 0.001
	logits_b: torch.Tensor = torch.randn(10, 4)
	logits_t: torch.Tensor = logits_b + torch.randn(10, 4) * 0.001

	for name, b_tensor, t_tensor, dims in [
	("hidden", hidden_b, hidden_t, "t h"),
	("logits", logits_b, logits_t, "x y"),
	]:
	_create_rank_dump(
	baseline_dir, rank=0, name=name, tensor=b_tensor, dims=dims
	)
	_create_rank_dump(target_dir, rank=0, name=name, tensor=t_tensor, dims=dims)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	preset="raw",
	override_dims=["logits:t v"],
	)
	self._assert_all_passed(_run_and_parse(argv, capsys)[0], expected_count=2)

	def test_multiple_cli_override_dims(self, tmp_path: Path, capsys) -> None:
	"""Multiple --override-dims for different tensors."""
	torch.manual_seed(42)

	baseline_dir: Path = tmp_path / "baseline"
	target_dir: Path = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()

	hidden_b: torch.Tensor = torch.randn(10, 8)
	hidden_t: torch.Tensor = hidden_b + torch.randn(10, 8) * 0.001
	logits_b: torch.Tensor = torch.randn(10, 4)
	logits_t: torch.Tensor = logits_b + torch.randn(10, 4) * 0.001

	for name, b_tensor, t_tensor in [
	("hidden", hidden_b, hidden_t),
	("logits", logits_b, logits_t),
	]:
	_create_rank_dump(
	baseline_dir, rank=0, name=name, tensor=b_tensor, dims="x y"
	)
	_create_rank_dump(
	target_dir, rank=0, name=name, tensor=t_tensor, dims="x y"
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	preset="raw",
	override_dims=["hidden:t h", "logits:t v"],
	)
	self._assert_all_passed(_run_and_parse(argv, capsys)[0], expected_count=2)

	def test_per_side_dims_different_parallelism(self, tmp_path: Path, capsys) -> None:
	"""baseline TP-sharded, target EP-sharded — per-side override fixes both."""
	torch.manual_seed(42)
	full_tensor: torch.Tensor = torch.randn(10, 8)
	target_full: torch.Tensor = full_tensor + torch.randn(10, 8) * 0.001

	baseline_dir: Path = tmp_path / "baseline"
	target_dir: Path = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()

	b_chunks: list[torch.Tensor] = list(full_tensor.chunk(2, dim=1))
	for tp_rank in range(2):
	_create_rank_dump(
	baseline_dir,
	rank=tp_rank,
	name="hidden",
	tensor=b_chunks[tp_rank],
	dims="x y",
	parallel_info={"tp_rank": tp_rank, "tp_size": 2},
	)

	t_chunks: list[torch.Tensor] = list(target_full.chunk(2, dim=1))
	for ep_rank in range(2):
	_create_rank_dump(
	target_dir,
	rank=ep_rank,
	name="hidden",
	tensor=t_chunks[ep_rank],
	dims="x y",
	parallel_info={"ep_rank": ep_rank, "ep_size": 2},
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	override_baseline_dims=["hidden:t h[tp]"],
	override_target_dims=["hidden:t h[ep]"],
	)
	self._assert_all_passed(_run_and_parse(argv, capsys)[0])

	def test_yaml_first_match_wins_e2e(self, tmp_path: Path, capsys) -> None:
	"""YAML with two matching rules: first rule wins in real pipeline."""
	baseline_path, target_path = self._create_single_rank_pair(tmp_path)

	yaml_path: Path = tmp_path / "override.yaml"
	yaml_path.write_text(textwrap.dedent("""\
	overrides:
	- match: "hidden"
	dims: "t h"
	- match: "hidden"
	dims: "a b"
	"""))

	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	override_config=str(yaml_path),
	)
	self._assert_all_passed(_run_and_parse(argv, capsys)[0])

	def test_cli_overrides_yaml_e2e(self, tmp_path: Path, capsys) -> None:
	"""CLI --override-dims wins over YAML rule for the same tensor."""
	baseline_path, target_path = self._create_single_rank_pair(tmp_path)

	yaml_path: Path = tmp_path / "override.yaml"
	yaml_path.write_text(textwrap.dedent("""\
	overrides:
	- match: "hidden"
	dims: "a b"
	"""))

	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	override_dims=["hidden:t h"],
	override_config=str(yaml_path),
	)
	self._assert_all_passed(_run_and_parse(argv, capsys)[0])

	def test_override_injects_dims_when_absent(self, tmp_path: Path, capsys) -> None:
	"""Override injects dims into meta even when dump had no dims annotation."""
	baseline_path, target_path = self._create_single_rank_pair(
	tmp_path,
	baseline_dims=None,
	target_dims=None,
	)

	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	override_dims=["hidden:t h"],
	)
	self._assert_all_passed(_run_and_parse(argv, capsys)[0])

	def test_non_tensor_unaffected_by_override(self, tmp_path: Path, capsys) -> None:
	"""Non-tensor values pass through without error even with active override."""
	torch.manual_seed(42)
	tensor: torch.Tensor = torch.randn(4, 4)

	baseline_dir: Path = tmp_path / "baseline"
	target_dir: Path = tmp_path / "target"
	baseline_dir.mkdir()
	target_dir.mkdir()

	for side_dir in [baseline_dir, target_dir]:
	_create_non_tensor_rank_dump(
	side_dir,
	rank=0,
	name="sm_scale",
	value=0.125,
	extra_tensor_dumps=[("hidden", tensor)],
	)

	argv = _make_argv(
	baseline_dir / _FIXED_EXP_NAME,
	target_dir / _FIXED_EXP_NAME,
	preset="raw",
	override_dims=["hidden:x y"],
	)
	records, _ = _run_and_parse(argv, capsys)

	non_tensors: list[ComparisonNonTensorRecord] = [
	r for r in records if isinstance(r, ComparisonNonTensorRecord)
	]
	assert len(non_tensors) == 1
	assert non_tensors[0].name == "sm_scale"
	assert non_tensors[0].values_equal

	comparisons: list[ComparisonTensorRecord] = _get_comparisons(records)
	assert len(comparisons) == 1
	assert comparisons[0].name == "hidden"

	summary: SummaryRecord = [r for r in records if isinstance(r, SummaryRecord)][0]
	assert summary.failed == 0


	class TestExitCode:
	"""E2E tests for exit code behavior based on comparison results."""

	def test_e2e_all_passed_exit_zero(self, tmp_path, capsys):
	"""Integration: all comparisons pass → run() returns 0."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a", "tensor_b"])
	argv = _make_argv(baseline_path, target_path, preset="raw")

	records, exit_code = _run_and_parse(argv, capsys)
	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.passed == 2
	assert summary.failed == 0
	assert exit_code == 0

	def test_e2e_has_failed_exit_nonzero(self, tmp_path, capsys):
	"""Integration: a failed comparison → run() returns 1."""
	torch.manual_seed(42)
	baseline_path = _create_rank_dump(
	tmp_path / "baseline", rank=0, name="tensor_a", tensor=torch.randn(10, 10)
	)
	target_path = _create_rank_dump(
	tmp_path / "target",
	rank=0,
	name="tensor_a",
	tensor=torch.randn(10, 10) * 100,
	)
	argv = _make_argv(baseline_path, target_path, preset="raw", diff_threshold=1e-3)

	records, exit_code = _run_and_parse(argv, capsys)
	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.failed == 1
	assert exit_code == 1

	def test_e2e_allow_failed_pattern_exit_zero(self, tmp_path, capsys):
	"""E2E: failed tensor matched by allow_failed_pattern + a passing tensor → exit 0."""
	torch.manual_seed(42)
	shared_tensor = torch.randn(10, 10)

	baseline_path = _create_rank_dump(
	tmp_path / "baseline",
	rank=0,
	name="tensor_bad",
	tensor=torch.randn(10, 10),
	extra_dumps=[("tensor_good", shared_tensor)],
	)
	target_path = _create_rank_dump(
	tmp_path / "target",
	rank=0,
	name="tensor_bad",
	tensor=torch.randn(10, 10) * 100,
	extra_dumps=[("tensor_good", shared_tensor)],
	)
	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	diff_threshold=1e-3,
	allow_failed_pattern="tensor_bad",
	)

	records, exit_code = _run_and_parse(argv, capsys)
	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.passed == 1
	assert summary.failed == 1
	assert exit_code == 0

	def test_e2e_allow_failed_pattern_no_match_exit_one(self, tmp_path, capsys):
	"""E2E: failed tensor NOT matched by allow_failed_pattern → exit 1."""
	torch.manual_seed(42)
	shared_tensor = torch.randn(10, 10)

	baseline_path = _create_rank_dump(
	tmp_path / "baseline",
	rank=0,
	name="tensor_bad",
	tensor=torch.randn(10, 10),
	extra_dumps=[("tensor_good", shared_tensor)],
	)
	target_path = _create_rank_dump(
	tmp_path / "target",
	rank=0,
	name="tensor_bad",
	tensor=torch.randn(10, 10) * 100,
	extra_dumps=[("tensor_good", shared_tensor)],
	)
	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	diff_threshold=1e-3,
	allow_failed_pattern="other_tensor",
	)

	records, exit_code = _run_and_parse(argv, capsys)
	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.passed == 1
	assert summary.failed == 1
	assert exit_code == 1


	class TestExitCodeSubprocess:
	"""E2E subprocess tests: invoke comparator as a child process and verify exit code."""

	@staticmethod
	def _run_comparator(
	baseline_path: Path,
	target_path: Path,
	*,
	preset: str = "raw",
	allow_skipped_pattern: str = ".*",
	) -> subprocess.CompletedProcess[str]:
	cmd: list[str] = [
	sys.executable,
	"-m",
	"sglang.srt.debug_utils.comparator",
	"--baseline-path",
	str(baseline_path),
	"--target-path",
	str(target_path),
	"--preset",
	preset,
	"--output-format",
	"json",
	"--allow-skipped-pattern",
	allow_skipped_pattern,
	]
	return subprocess.run(cmd, capture_output=True, text=True)

	def test_all_passed_exit_zero(self, tmp_path):
	"""Subprocess: all comparisons pass → exit 0."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	result = self._run_comparator(baseline_path, target_path)
	assert result.returncode == 0

	def test_failed_exit_nonzero(self, tmp_path):
	"""Subprocess: failed comparison → exit 1."""
	torch.manual_seed(42)
	baseline_path = _create_rank_dump(
	tmp_path / "baseline", rank=0, name="t", tensor=torch.randn(10, 10)
	)
	target_path = _create_rank_dump(
	tmp_path / "target", rank=0, name="t", tensor=torch.randn(10, 10) * 100
	)
	result = self._run_comparator(baseline_path, target_path)
	assert result.returncode == 1

	def test_skipped_allow_all_exit_zero(self, tmp_path):
	"""Subprocess: skipped comparison with allow_skipped_pattern='.*' → exit 0."""
	baseline_path, target_path = _create_dumps(
	tmp_path,
	tensor_names=["tensor_a", "tensor_extra"],
	baseline_names=["tensor_a"],
	)
	result = self._run_comparator(
	baseline_path, target_path, allow_skipped_pattern=".*"
	)
	assert result.returncode == 0

	def test_skipped_forbid_all_exit_nonzero(self, tmp_path):
	"""Subprocess: skipped comparison with allow_skipped_pattern='^$' → exit 1."""
	baseline_path, target_path = _create_dumps(
	tmp_path,
	tensor_names=["tensor_a", "tensor_extra"],
	baseline_names=["tensor_a"],
	)
	result = self._run_comparator(
	baseline_path, target_path, allow_skipped_pattern="^$"
	)
	assert result.returncode == 1


	class TestReportOutput:
	"""Test JSONL report file output via ReportSink."""

	def test_default_report_path(self, tmp_path, capsys):
	"""Default writes to <target>/comparator_report.jsonl with ConfigRecord + SummaryRecord."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	argv = _make_argv(baseline_path, target_path, preset="raw", report_path=None)

	exit_code: int = run(parse_args(argv))

	report_file: Path = target_path / "comparator_report.jsonl"
	assert report_file.exists()

	report_records: list[AnyRecord] = _parse_jsonl(report_file.read_text())
	assert isinstance(report_records[0], ConfigRecord)
	assert isinstance(report_records[-1], SummaryRecord)
	assert exit_code == 0

	def test_custom_report_path(self, tmp_path, capsys):
	"""--report-path writes to the specified location."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	custom_path: Path = tmp_path / "custom" / "report.jsonl"
	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	report_path=str(custom_path),
	)

	run(parse_args(argv))

	assert custom_path.exists()
	report_records: list[AnyRecord] = _parse_jsonl(custom_path.read_text())
	assert isinstance(report_records[0], ConfigRecord)
	assert isinstance(report_records[-1], SummaryRecord)

	def test_disabled_report(self, tmp_path, capsys):
	"""--report-path '' disables file generation."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	argv = _make_argv(baseline_path, target_path, preset="raw", report_path="")

	run(parse_args(argv))

	report_file: Path = target_path / "comparator_report.jsonl"
	assert not report_file.exists()

	def test_report_matches_stdout_json(self, tmp_path, capsys):
	"""In json mode, report content matches stdout output."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	report_file: Path = tmp_path / "report.jsonl"
	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	output_format="json",
	report_path=str(report_file),
	)

	capsys.readouterr()
	run(parse_args(argv))

	stdout_lines: list[str] = capsys.readouterr().out.strip().splitlines()
	report_lines: list[str] = report_file.read_text().strip().splitlines()
	assert stdout_lines == report_lines

	def test_text_mode_also_writes_report(self, tmp_path, capsys):
	"""Text stdout mode still writes JSONL report."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	report_file: Path = tmp_path / "report.jsonl"
	argv = _make_argv(
	baseline_path,
	target_path,
	preset="raw",
	output_format="text",
	report_path=str(report_file),
	)

	run(parse_args(argv))

	assert report_file.exists()
	report_records: list[AnyRecord] = _parse_jsonl(report_file.read_text())
	assert isinstance(report_records[0], ConfigRecord)
	assert isinstance(report_records[-1], SummaryRecord)

	def test_streaming_flush(self, tmp_path, capsys):
	"""Report file is flushed after each record (readable before close)."""
	from sglang.srt.debug_utils.comparator.report_sink import report_sink

	report_file: Path = tmp_path / "stream_report.jsonl"
	report_sink.configure(
	output_format="json",
	report_path=report_file,
	)

	report_sink.add(ConfigRecord(config={"test": True}))

	content: str = report_file.read_text()
	assert len(content.strip().splitlines()) == 1
	parsed: AnyRecord = parse_record_json(content.strip())
	assert isinstance(parsed, ConfigRecord)


	class TestEntrypointDpAttentionMissingAlias:
	"""Regression: dp-attention without ``# dp:=attn_dp`` → shape mismatch failure.

	In dp-attention mode (tp_size=2, attn_dp_size=2), layer_input is dumped
	after prepare_attn which DP-distributes tokens. One rank gets 0 tokens
	(shape [0, H]), the other gets all tokens (shape [T, H]).

	Without ``# dp:=attn_dp`` in dims, the comparator has no dp_rank/dp_size
	to filter on, so it picks one rank via TP pick — potentially the empty
	one — causing a shape mismatch with the baseline.
	"""

	@staticmethod
	def _sglang_dp_attn_parallel_info(*, tp_rank: int) -> dict:
	return {
	"tp_rank": tp_rank,
	"tp_size": 2,
	"pp_rank": 0,
	"pp_size": 1,
	"moe_ep_rank": 0,
	"moe_ep_size": 1,
	"moe_tp_rank": tp_rank,
	"moe_tp_size": 2,
	"moe_dp_rank": 0,
	"moe_dp_size": 1,
	"enable_dp_attention": True,
	"attn_tp_rank": 0,
	"attn_tp_size": 1,
	"attn_dp_rank": tp_rank,
	"attn_dp_size": 2,
	"local_attn_dp_rank": tp_rank,
	"local_attn_dp_size": 2,
	"attn_cp_rank": 0,
	"attn_cp_size": 1,
	}

	def test_missing_dp_alias_causes_shape_mismatch(
	self, tmp_path: Path, capsys
	) -> None:
	"""dims='t h' (no dp:=attn_dp) → comparator picks empty rank → shape_mismatch failure."""
	torch.manual_seed(42)
	tensor_data: torch.Tensor = torch.randn(5, 8)
	target_data: torch.Tensor = tensor_data + torch.randn(5, 8) * 0.001

	for side_name, data in [("baseline", tensor_data), ("target", target_data)]:
	side_dir: Path = tmp_path / side_name
	side_dir.mkdir()

	# Baseline: single rank, no DP attention
	if side_name == "baseline":
	_create_rank_dump(
	side_dir,
	rank=0,
	name="layer_input",
	tensor=data,
	dims="t h",
	parallel_info={"tp_rank": 0, "tp_size": 1},
	framework="sglang",
	)
	else:
	# Target: dp-attention, tp_rank=0 gets 0 tokens, tp_rank=1 gets all
	_create_rank_dump(
	side_dir,
	rank=0,
	name="layer_input",
	tensor=torch.empty(0, 8),
	dims="t h",
	parallel_info=self._sglang_dp_attn_parallel_info(tp_rank=0),
	framework="sglang",
	)
	_create_rank_dump(
	side_dir,
	rank=1,
	name="layer_input",
	tensor=data,
	dims="t h",
	parallel_info=self._sglang_dp_attn_parallel_info(tp_rank=1),
	framework="sglang",
	)

	argv: list[str] = _make_argv(
	tmp_path / "baseline" / _FIXED_EXP_NAME,
	tmp_path / "target" / _FIXED_EXP_NAME,
	diff_threshold=1e-3,
	)
	records, exit_code = _run_and_parse(argv, capsys)

	assert exit_code == 1

	errors = [r for r in records if isinstance(r, ComparisonErrorRecord)]
	assert len(errors) == 1
	assert errors[0].category == "errored"


	class TestEntrypointAutoDescend:
	"""Test auto-descend: --baseline-path / --target-path pointing to a parent
	directory that contains a single subdirectory with .pt files."""

	def test_auto_descend_single_engine(self, tmp_path: Path, capsys) -> None:
	"""Parent dir wrapping a single engine subdir is auto-descended and comparison succeeds."""
	baseline_exp, target_exp = _create_dumps(tmp_path, ["tensor_a"])

	baseline_wrapper: Path = tmp_path / "baseline_wrap"
	target_wrapper: Path = tmp_path / "target_wrap"
	baseline_wrapper.mkdir()
	target_wrapper.mkdir()
	baseline_exp.rename(baseline_wrapper / "engine_0")
	target_exp.rename(target_wrapper / "engine_0")

	argv = _make_argv(baseline_wrapper, target_wrapper, preset="raw")
	records, exit_code = _run_and_parse(argv, capsys)

	assert exit_code == 0
	_assert_single_comparison_passed(records)

	def test_no_descend_when_pt_at_root(self, tmp_path: Path, capsys) -> None:
	"""Direct .pt files — no descend needed, comparison still works."""
	baseline_exp, target_exp = _create_dumps(tmp_path, ["tensor_a"])

	argv = _make_argv(baseline_exp, target_exp, preset="raw")
	records, exit_code = _run_and_parse(argv, capsys)

	assert exit_code == 0
	_assert_single_comparison_passed(records)

	def test_auto_descend_emits_log_record(self, tmp_path: Path, capsys) -> None:
	"""Auto-descend emits a LogRecord with the info message."""
	baseline_exp, target_exp = _create_dumps(tmp_path, ["tensor_a"])

	wrapper: Path = tmp_path / "target_wrap"
	wrapper.mkdir()
	target_exp.rename(wrapper / "engine_0")

	argv = _make_argv(baseline_exp, wrapper, preset="raw")
	records, _ = _run_and_parse(argv, capsys)

	log_records: list[LogRecord] = [r for r in records if isinstance(r, LogRecord)]
	auto_descend_msgs: list[str] = [
	info.message
	for lr in log_records
	for info in lr.infos
	if "auto-descend" in info.message
	]
	assert any("target_path" in m for m in auto_descend_msgs)

	def test_auto_descend_single_nonempty_among_empty(
	self, tmp_path: Path, capsys
	) -> None:
	"""Two subdirs but only one has .pt — auto-descend picks the non-empty one."""
	baseline_exp, target_exp = _create_dumps(tmp_path, ["tensor_a"])

	wrapper: Path = tmp_path / "target_wrap"
	wrapper.mkdir()
	target_exp.rename(wrapper / "engine_0")
	(wrapper / "empty_subdir").mkdir()

	argv = _make_argv(baseline_exp, wrapper, preset="raw")
	records, exit_code = _run_and_parse(argv, capsys)

	assert exit_code == 0
	_assert_single_comparison_passed(records)

	def test_error_multiple_nonempty_subdirs(self, tmp_path: Path) -> None:
	"""Two subdirs both with .pt — raises ValueError with clear message."""
	baseline_exp, target_exp = _create_dumps(tmp_path, ["tensor_a"])

	wrapper: Path = tmp_path / "target_wrap"
	wrapper.mkdir()
	target_exp.rename(wrapper / "engine_0")
	engine_1: Path = wrapper / "engine_1"
	engine_1.mkdir()
	torch.save(torch.tensor([1.0]), engine_1 / "dummy.pt")

	argv: list[str] = _make_argv(baseline_exp, wrapper, preset="raw")
	with pytest.raises(ValueError, match="multiple subdirectories contain data"):
	run(parse_args(argv))

	def test_error_no_data_found(self, tmp_path: Path) -> None:
	"""No .pt files anywhere — raises ValueError."""
	baseline_exp, _ = _create_dumps(tmp_path, ["tensor_a"])

	empty_dir: Path = tmp_path / "empty_target"
	empty_dir.mkdir()
	(empty_dir / "subdir").mkdir()

	argv: list[str] = _make_argv(baseline_exp, empty_dir, preset="raw")
	with pytest.raises(ValueError, match="no .pt files found"):
	run(parse_args(argv))


	class TestErrorResilience:
	"""Bundle comparison exception → continue with remaining bundles."""

	def test_one_bundle_errors_others_continue(self, tmp_path, capsys, monkeypatch):
	"""One bundle raises exception → other bundles still compared, summary correct."""
	baseline_path, target_path = _create_dumps(
	tmp_path, ["tensor_a", "tensor_b", "tensor_c"]
	)
	argv = _make_argv(baseline_path, target_path, preset="raw")

	original = _entrypoint_module.compare_bundle_pair

	def _patched(**kwargs):
	if kwargs["name"] == "tensor_b":
	raise RuntimeError("intentional test error")
	return original(**kwargs)

	monkeypatch.setattr(_entrypoint_module, "compare_bundle_pair", _patched)

	records, exit_code = _run_and_parse(argv, capsys)

	comparisons = _get_comparisons(records)
	assert len(comparisons) == 2

	errors = [r for r in records if isinstance(r, ComparisonErrorRecord)]
	assert len(errors) == 1
	assert errors[0].name == "tensor_b"
	assert errors[0].exception_type == "RuntimeError"
	assert "intentional test error" in errors[0].traceback_str

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.errored == 1
	assert summary.passed == 2
	assert summary.total == 3

	assert exit_code == 1

	def test_all_bundles_error_exits_one(self, tmp_path, capsys, monkeypatch):
	"""All bundles error → exit 1, summary all errored."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	argv = _make_argv(baseline_path, target_path, preset="raw")

	def _always_raise(**kwargs):
	raise ValueError("always fail")

	monkeypatch.setattr(_entrypoint_module, "compare_bundle_pair", _always_raise)

	records, exit_code = _run_and_parse(argv, capsys)

	summary = records[-1]
	assert isinstance(summary, SummaryRecord)
	assert summary.errored == 1
	assert summary.passed == 0
	assert exit_code == 1

	def test_error_record_json_roundtrip_in_output(self, tmp_path, capsys, monkeypatch):
	"""ComparisonErrorRecord correctly serializes and deserializes in output."""
	baseline_path, target_path = _create_dumps(tmp_path, ["tensor_a"])
	argv = _make_argv(baseline_path, target_path, preset="raw")

	def _raise(**kwargs):
	raise TypeError("bad type")

	monkeypatch.setattr(_entrypoint_module, "compare_bundle_pair", _raise)

	records, _ = _run_and_parse(argv, capsys)
	errors = [r for r in records if isinstance(r, ComparisonErrorRecord)]
	assert len(errors) == 1
	assert errors[0].exception_type == "TypeError"


	if __name__ == "__main__":
	sys.exit(pytest.main([__file__]))