"""Comprehensive tests for the public DiffCompressor API. Tests cover: 1. Context line reduction 2. Hunk selection and limiting 3. Compression ratios 4. Edge cases 5. Bug-fix regressions and routing-gap fixtures Stage 3b note (2026-04-25): the Python `DiffCompressor` implementation was retired in favor of the Rust-backed shim (`headroom._core` via PyO3). Tests that probed Python-only internals — `_parse_diff`, `_score_hunks`, the `DiffHunk` / `DiffFile` parser dataclasses — were removed because the Rust crate has its own parallel coverage in `crates/headroom-core/tests`. Public-API tests (anything calling `compressor.compress(...)`) are preserved unchanged: they exercise the Rust backend through the same import path and assert the same outputs. """ from headroom.transforms.diff_compressor import ( DiffCompressionResult, DiffCompressor, DiffCompressorConfig, ) class TestContextReduction: """Tests for context line reduction.""" def test_reduce_context_lines(self): """Context lines are reduced to configured maximum.""" content = """diff --git a/file.py b/file.py --- a/file.py +++ b/file.py @@ -1,10 +1,11 @@ context1 context2 context3 context4 +added context5 context6 context7 context8 """ # Default max_context_lines is 2 compressor = DiffCompressor( config=DiffCompressorConfig( max_context_lines=2, min_lines_for_ccr=5, enable_ccr=False, ) ) result = compressor.compress(content) # Should keep 2 context before and 2 after the +added line # Plus the added line itself lines = result.compressed.split("\n") context_count = sum(1 for line in lines if line.startswith(" ")) # At most 4 context lines (2 before + 2 after) assert context_count <= 4 def test_preserve_all_changes(self): """All addition and deletion lines are preserved.""" content = """diff --git a/file.py b/file.py --- a/file.py +++ b/file.py @@ -1,10 +1,10 @@ ctx1 ctx2 -removed1 +added1 ctx3 ctx4 -removed2 +added2 ctx5 ctx6 """ compressor = DiffCompressor( config=DiffCompressorConfig( min_lines_for_ccr=5, enable_ccr=False, ) ) result = compressor.compress(content) assert "-removed1" in result.compressed assert "-removed2" in result.compressed assert "+added1" in result.compressed assert "+added2" in result.compressed class TestHunkSelection: """Tests for hunk selection when limiting.""" def test_max_hunks_per_file(self): """Hunks are limited to max_hunks_per_file.""" # Create a diff with many hunks hunks = [] for i in range(20): hunks.append(f"""@@ -{i * 10},3 +{i * 10},4 @@ context +added_{i} more """) content = f"""diff --git a/bigfile.py b/bigfile.py --- a/bigfile.py +++ b/bigfile.py {"".join(hunks)}""" compressor = DiffCompressor( config=DiffCompressorConfig( max_hunks_per_file=5, min_lines_for_ccr=10, enable_ccr=False, ) ) result = compressor.compress(content) # Should have at most 5 hunks hunk_count = result.compressed.count("@@") # Each hunk has one @@ header (we count full hunk headers) assert hunk_count <= 10 # Each hunk header appears twice @@...@@ def test_keeps_first_and_last_hunk(self): """First and last hunks are preserved when limiting.""" hunks = [] for i in range(10): hunks.append(f"""@@ -{i * 10},3 +{i * 10},4 @@ context +added_{i} more """) content = f"""diff --git a/file.py b/file.py --- a/file.py +++ b/file.py {"".join(hunks)}""" compressor = DiffCompressor( config=DiffCompressorConfig( max_hunks_per_file=3, min_lines_for_ccr=10, enable_ccr=False, ) ) result = compressor.compress(content) # First hunk (added_0) should be present assert "+added_0" in result.compressed # Last hunk (added_9) should be present assert "+added_9" in result.compressed class TestFileSelection: """Tests for file selection when limiting.""" def test_max_files(self): """Files are limited to max_files.""" # Create diff with many files files = [] for i in range(30): files.append(f"""diff --git a/file{i}.py b/file{i}.py --- a/file{i}.py +++ b/file{i}.py @@ -1,2 +1,3 @@ ctx +added ctx2 """) content = "\n".join(files) compressor = DiffCompressor( config=DiffCompressorConfig( max_files=10, min_lines_for_ccr=20, enable_ccr=False, ) ) result = compressor.compress(content) # Count diff --git headers file_count = result.compressed.count("diff --git") assert file_count <= 10 class TestCompressionResult: """Tests for DiffCompressionResult properties.""" def test_compression_ratio_calculation(self): """Compression ratio is calculated correctly.""" result = DiffCompressionResult( compressed="a\nb\nc", original_line_count=100, compressed_line_count=10, files_affected=2, additions=5, deletions=3, hunks_kept=2, hunks_removed=5, ) assert result.compression_ratio == 0.1 def test_tokens_saved_estimate(self): """Token savings estimation works correctly.""" result = DiffCompressionResult( compressed="short", original_line_count=100, compressed_line_count=10, files_affected=1, additions=10, deletions=5, hunks_kept=1, hunks_removed=0, ) # 90 lines saved * 40 chars/line / 4 chars/token = 900 tokens assert result.tokens_saved_estimate == 900 class TestSmallDiffPassthrough: """Tests for small diff passthrough behavior.""" def test_small_diff_unchanged(self): """Diffs smaller than threshold pass through unchanged.""" content = """diff --git a/small.py b/small.py --- a/small.py +++ b/small.py @@ -1,2 +1,3 @@ line1 +added line2 """ compressor = DiffCompressor( config=DiffCompressorConfig( min_lines_for_ccr=100, # High threshold ) ) result = compressor.compress(content) # Should be unchanged assert result.compressed == content assert result.compression_ratio == 1.0 class TestOutputFormatting: """Tests for output formatting.""" def test_summary_line_added(self): """Summary line is added at end of compressed diff.""" # Large diff that will be compressed hunks = [] for i in range(15): hunks.append(f"""@@ -{i * 10},5 +{i * 10},6 @@ ctx1 ctx2 +added_{i} ctx3 ctx4 """) content = f"""diff --git a/file.py b/file.py --- a/file.py +++ b/file.py {"".join(hunks)}""" compressor = DiffCompressor( config=DiffCompressorConfig( max_hunks_per_file=5, min_lines_for_ccr=10, enable_ccr=False, ) ) result = compressor.compress(content) # Should have summary at end assert "files changed" in result.compressed assert "hunks omitted" in result.compressed def test_preserves_diff_format(self): """Output preserves valid unified diff format.""" content = """diff --git a/test.py b/test.py --- a/test.py +++ b/test.py @@ -1,3 +1,4 @@ def test(): + # new comment pass return True """ compressor = DiffCompressor( config=DiffCompressorConfig( min_lines_for_ccr=5, enable_ccr=False, ) ) result = compressor.compress(content) # Should have all standard diff markers assert "diff --git" in result.compressed assert "---" in result.compressed assert "+++" in result.compressed assert "@@" in result.compressed class TestEdgeCases: """Tests for edge cases and boundary conditions.""" def test_empty_input(self): """Empty input is handled gracefully.""" compressor = DiffCompressor() result = compressor.compress("") assert result.compressed == "" assert result.compression_ratio == 1.0 def test_non_diff_input(self): """Non-diff input passes through unchanged.""" content = "This is not a diff\nJust regular text" compressor = DiffCompressor() result = compressor.compress(content) # Should pass through (no diff --git found) assert result.compressed == content def test_unicode_content(self): """Unicode characters in diff are handled.""" content = """diff --git a/i18n.py b/i18n.py --- a/i18n.py +++ b/i18n.py @@ -1,2 +1,3 @@ msg = "hello" +msg_ja = "こんにちは" return msg """ compressor = DiffCompressor() result = compressor.compress(content) assert "こんにちは" in result.compressed def test_no_newline_at_eof(self): """Handles 'No newline at end of file' indicator.""" content = """diff --git a/file.py b/file.py --- a/file.py +++ b/file.py @@ -1,2 +1,2 @@ line1 -line2 \\ No newline at end of file +line2_modified \\ No newline at end of file """ compressor = DiffCompressor() result = compressor.compress(content) # Should not crash and preserve the indicator assert "No newline" in result.compressed or "-line2" in result.compressed def test_empty_hunks(self): """Files with no actual hunks are handled.""" content = """diff --git a/file.py b/file.py --- a/file.py +++ b/file.py """ compressor = DiffCompressor() result = compressor.compress(content) # Should not crash assert result.compressed is not None class TestConfigOptions: """Tests for configuration options.""" def test_max_context_lines_config(self): """max_context_lines configuration controls context reduction.""" content = """diff --git a/file.py b/file.py --- a/file.py +++ b/file.py @@ -1,10 +1,11 @@ c1 c2 c3 c4 c5 +added c6 c7 c8 c9 c10 """ # With max_context_lines=1 compressor = DiffCompressor( config=DiffCompressorConfig( max_context_lines=1, min_lines_for_ccr=5, enable_ccr=False, ) ) result = compressor.compress(content) # Count context lines (lines starting with space) context_count = sum(1 for line in result.compressed.split("\n") if line.startswith(" ")) # Should have at most 2 context lines (1 before + 1 after) assert context_count <= 2 def test_always_keep_additions_default(self): """Additions are always kept by default.""" content = """diff --git a/file.py b/file.py --- a/file.py +++ b/file.py @@ -1,3 +1,5 @@ ctx +add1 +add2 ctx """ compressor = DiffCompressor( config=DiffCompressorConfig( always_keep_additions=True, min_lines_for_ccr=2, enable_ccr=False, ) ) result = compressor.compress(content) assert "+add1" in result.compressed assert "+add2" in result.compressed def test_always_keep_deletions_default(self): """Deletions are always kept by default.""" content = """diff --git a/file.py b/file.py --- a/file.py +++ b/file.py @@ -1,5 +1,3 @@ ctx -del1 -del2 ctx """ compressor = DiffCompressor( config=DiffCompressorConfig( always_keep_deletions=True, min_lines_for_ccr=2, enable_ccr=False, ) ) result = compressor.compress(content) assert "-del1" in result.compressed assert "-del2" in result.compressed # ─── Bug-fix tests (2026-04-25): four silent information-loss paths ───────── # # Before the fix, the parser captured these patterns but the emitter dropped # them, or the regex didn't match them at all. Each test exercises one of # the four paths the same way the Rust unit tests do. def _cfg_below_threshold(): """Small config so the parser+emitter actually run on test inputs.""" from headroom.transforms.diff_compressor import DiffCompressorConfig return DiffCompressorConfig(min_lines_for_ccr=5) class TestBugfixRenamePreservation: """rename/similarity/dissimilarity/copy markers were captured into is_renamed=True and then dropped by the emitter. Output looked like a plain modification of the old path.""" def test_rename_with_similarity_index_preserved(self): from headroom.transforms.diff_compressor import DiffCompressor diff = ( "diff --git a/old.py b/new.py\n" "similarity index 92%\n" "rename from old.py\n" "rename to new.py\n" "--- a/old.py\n" "+++ b/new.py\n" "@@ -1,3 +1,3 @@\n" " ctx_a\n" "-old\n" "+new\n" " ctx_b\n" ) result = DiffCompressor(_cfg_below_threshold()).compress(diff) assert "similarity index 92%" in result.compressed assert "rename from old.py" in result.compressed assert "rename to new.py" in result.compressed def test_dissimilarity_index_preserved(self): from headroom.transforms.diff_compressor import DiffCompressor diff = ( "diff --git a/x.py b/y.py\n" "dissimilarity index 60%\n" "rename from x.py\n" "rename to y.py\n" "--- a/x.py\n" "+++ b/y.py\n" "@@ -1 +1 @@\n" "-a\n" "+b\n" ) result = DiffCompressor(_cfg_below_threshold()).compress(diff) assert "dissimilarity index 60%" in result.compressed def test_copy_markers_preserved(self): from headroom.transforms.diff_compressor import DiffCompressor diff = ( "diff --git a/orig.py b/dup.py\n" "similarity index 100%\n" "copy from orig.py\n" "copy to dup.py\n" "--- a/orig.py\n" "+++ b/dup.py\n" "@@ -1 +1 @@\n" "-old\n" "+new\n" ) result = DiffCompressor(_cfg_below_threshold()).compress(diff) assert "copy from orig.py" in result.compressed assert "copy to dup.py" in result.compressed class TestBugfixCombinedDiff: """Combined-diff `@@@` hunks from merge commits had ALL content silently dropped because the regex hardcoded `@@`.""" def test_3way_combined_diff_content_preserved(self): from headroom.transforms.diff_compressor import DiffCompressor diff = ( "diff --git a/merge.py b/merge.py\n" "--- a/merge.py\n" "+++ b/merge.py\n" "@@@ -1,3 -1,3 +1,4 @@@\n" " unchanged_a\n" "- old_branch_1\n" " -old_branch_2\n" "++new_in_merge\n" " +new_added\n" " unchanged_b\n" ) result = DiffCompressor(_cfg_below_threshold()).compress(diff) assert "@@@ -1,3 -1,3 +1,4 @@@" in result.compressed assert "++new_in_merge" in result.compressed assert result.files_affected > 0 class TestBugfixNoNewlineMarker: r"""`\ No newline at end of file` got dropped by context trim whenever it was further than max_context_lines from a +/- change.""" def test_no_newline_marker_survives_distance(self): from headroom.transforms.diff_compressor import DiffCompressor diff = ( "diff --git a/last.txt b/last.txt\n" "--- a/last.txt\n" "+++ b/last.txt\n" "@@ -1,8 +1,8 @@\n" "-old_first\n" "+new_first\n" " ctx_a\n" " ctx_b\n" " ctx_c\n" " ctx_d\n" " ctx_e\n" " ctx_f\n" "\\ No newline at end of file\n" ) result = DiffCompressor(_cfg_below_threshold()).compress(diff) assert "\\ No newline at end of file" in result.compressed class TestBugfixPreDiffContent: """Anything before the first `diff --git` (commit headers, email-style metadata) was silently dropped.""" def test_commit_header_preserved(self): from headroom.transforms.diff_compressor import DiffCompressor diff = ( "commit abc1234567890abcdef\n" "Author: Tester \n" "Date: Mon Apr 25 12:00:00 2026\n" "\n" " Refactor: rename and modify\n" "\n" "diff --git a/x.py b/x.py\n" "--- a/x.py\n" "+++ b/x.py\n" "@@ -1 +1 @@\n" "-a\n" "+b\n" ) result = DiffCompressor(_cfg_below_threshold()).compress(diff) assert result.compressed.startswith("commit abc1234567890abcdef") assert "Author: Tester" in result.compressed assert "Refactor: rename and modify" in result.compressed assert "diff --git a/x.py b/x.py" in result.compressed assert "-a" in result.compressed assert "+b" in result.compressed def test_no_pre_diff_content_does_not_add_blank_line(self): """Edge case: when there's no pre-diff content, output must NOT gain a leading blank line from a stray empty-list prepend.""" from headroom.transforms.diff_compressor import DiffCompressor diff = "diff --git a/x.py b/x.py\n--- a/x.py\n+++ b/x.py\n@@ -1 +1 @@\n-a\n+b\n" result = DiffCompressor(_cfg_below_threshold()).compress(diff) assert result.compressed.startswith("diff --git a/x.py b/x.py") class TestRoutingGapMergeDiffs: """Routing gap (2026-04-25 follow-up): ContentRouter detects diff inputs and routes them to DiffCompressor, but the parser previously only knew the `diff --git` shape. Merge-commit diffs from `git log -p` use `diff --combined ` or `diff --cc ` and were treated as non-diff blobs and passed through unchanged. """ def test_diff_combined_header_starts_a_file_section(self): from headroom.transforms.diff_compressor import DiffCompressor diff = ( "diff --combined merge_target.py\n" "index abc..def..ghi 100644\n" "--- a/merge_target.py\n" "+++ b/merge_target.py\n" "@@@ -1,3 -1,3 +1,4 @@@\n" " unchanged_a\n" "- old_p1\n" " -old_p2\n" "++new_in_merge\n" " unchanged_b\n" ) result = DiffCompressor(_cfg_below_threshold()).compress(diff) assert result.files_affected == 1 assert "diff --combined merge_target.py" in result.compressed assert "@@@ -1,3 -1,3 +1,4 @@@" in result.compressed assert "++new_in_merge" in result.compressed def test_diff_cc_header_starts_a_file_section(self): from headroom.transforms.diff_compressor import DiffCompressor diff = ( "diff --cc cc_target.py\n" "index abc..def..ghi\n" "--- a/cc_target.py\n" "+++ b/cc_target.py\n" "@@@ -1,3 -1,3 +1,4 @@@\n" " ctx\n" "- removed_p1\n" " -removed_p2\n" "++added_in_merge\n" " more_ctx\n" ) result = DiffCompressor(_cfg_below_threshold()).compress(diff) assert result.files_affected == 1 assert "diff --cc cc_target.py" in result.compressed assert "++added_in_merge" in result.compressed class TestRoutingGapDetectorScanWindow: """Routing gap (2026-04-25 follow-up): `_try_detect_diff` only scanned the first 50 lines, so `git log -p` outputs with long commit messages pushed the diff past the detection window — input was misrouted away from DiffCompressor entirely. Window widened to 500 lines. """ def test_detect_picks_up_diff_after_long_commit_message(self): from headroom.transforms.content_detector import ( ContentType, detect_content_type, ) # 60 lines of commit message before the diff. Old 50-line cap # would have missed the `diff --git` header entirely. msg_lines = [ "commit abc123", "Author: Tester ", "Date: Mon Apr 25 12:00:00 2026", "", ] + [f" msg line {i}" for i in range(60)] diff = ( "\n".join(msg_lines) + "\n\n" + "diff --git a/x.py b/x.py\n--- a/x.py\n+++ b/x.py\n@@ -1 +1 @@\n-old\n+new\n" ) result = detect_content_type(diff) assert result.content_type == ContentType.GIT_DIFF assert result.confidence >= 0.7 def test_detect_recognizes_combined_diff_headers(self): """The detector also gained recognition for combined-diff hunk headers (`@@@`+) — useful when the only signal in a snippet is the merge-style hunk.""" from headroom.transforms.content_detector import ( ContentType, detect_content_type, ) # Full merge diff (with `--- a/` shared with regular diffs as a # belt-and-suspenders signal). diff = ( "diff --combined m.py\n--- a/m.py\n+++ b/m.py\n@@@ -1,2 -1,2 +1,3 @@@\n ctx\n++added\n" ) result = detect_content_type(diff) assert result.content_type == ContentType.GIT_DIFF