File size: 4,420 Bytes
d466b7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from __future__ import annotations
import json, tempfile
from pathlib import Path
from dir2md.core import Config, generate_markdown_report


def _make_repo(tmp: Path) -> Path:
    """Populate *tmp* with a tiny fixture repository and return *tmp*.

    Layout written (all UTF-8):
      - ``src/a.py``      -- a module whose ``foo`` body holds 100 print lines
      - ``src/b.py``      -- a short module
      - ``src/b_copy.py`` -- the text of ``src/b.py`` read back and rewritten
      - ``README.md``     -- a three-line markdown file
    """
    src_dir = tmp / "src"
    src_dir.mkdir(parents=True, exist_ok=True)

    # Make a.py long enough to trigger truncation
    print_lines = [f"    print('line {i}')" for i in range(100)]
    a_source = (
        "\nimport os\n\nclass A: pass\n\ndef foo():\n"
        + "\n".join(print_lines)
        + "\n    return 42\n"
    )
    (src_dir / "a.py").write_text(a_source, encoding="utf-8")

    b_path = src_dir / "b.py"
    b_path.write_text("\nimport sys\n\ndef bar():\n    return 43\n", encoding="utf-8")

    # Copy of b.py (for deduplication testing)
    b_text = b_path.read_text(encoding="utf-8")
    (src_dir / "b_copy.py").write_text(b_text, encoding="utf-8")

    readme_path = tmp / "README.md"
    readme_path.write_text("# Title\n\nSome text\n", encoding="utf-8")
    return tmp


def test_budget_and_modes(tmp_path: Path):
    """Summary mode with manifest emission enabled.

    Checks that the report contains the prompt token estimate line, that
    ``OUT.manifest.json`` is written into the repo root, and that the
    manifest's ``files`` entries include paths ending in ``a.py`` and ``b.py``.
    """
    repo = _make_repo(tmp_path)
    config = Config(
        root=repo,
        output=repo / "OUT.md",
        include_globs=[],
        exclude_globs=[],
        omit_globs=[],
        respect_gitignore=False,
        follow_symlinks=False,
        max_bytes=200_000,
        max_lines=2000,
        include_contents=True,
        only_ext=None,
        add_stats=True,
        add_toc=False,
        llm_mode="summary",
        budget_tokens=200,
        max_file_tokens=1200,
        dedup_bits=16,
        sample_head=120,
        sample_tail=40,
        strip_comments=False,
        emit_manifest=True,
        preset="pro",
        explain_capsule=True,
    )
    report = generate_markdown_report(config)
    assert "Estimated tokens (prompt):" in report

    manifest_file = repo / "OUT.manifest.json"
    assert manifest_file.exists()
    manifest = json.loads(manifest_file.read_text(encoding="utf-8"))

    # b_copy.py is likely to be excluded due to deduplication, so only the
    # two original modules are checked.
    listed_paths = {item["path"] for item in manifest["files"]}
    for suffix in ("a.py", "b.py"):
        assert any(path.endswith(suffix) for path in listed_paths)


def test_ref_mode_manifest(tmp_path: Path):
    """Ref mode with manifest emission enabled.

    Checks that ``OUT.manifest.json`` in the repo root is valid JSON with
    top-level ``stats`` and ``files`` keys, and that every entry under
    ``files`` carries a ``sha256`` field.
    """
    root = _make_repo(tmp_path)
    cfg = Config(
        root=root, output=root/"OUT.md", include_globs=[], exclude_globs=[], omit_globs=[],
        respect_gitignore=False, follow_symlinks=False, max_bytes=200_000, max_lines=2000,
        include_contents=True, only_ext=None, add_stats=True, add_toc=False,
        llm_mode="ref", budget_tokens=120, max_file_tokens=1200, dedup_bits=16,
        sample_head=120, sample_tail=40, strip_comments=False, emit_manifest=True,
        preset="pro", explain_capsule=False,
    )
    # Only the manifest is inspected here, so the returned markdown is not
    # bound to a name (the original kept it in an unused local).
    generate_markdown_report(cfg)
    man = json.loads((root/"OUT.manifest.json").read_text(encoding="utf-8"))
    assert "stats" in man
    assert "files" in man
    assert all("sha256" in e for e in man["files"])


def test_inline_sampling(tmp_path: Path):
    """Inline mode under very small token and line limits.

    Checks that the report contains the ``truncated middle`` marker and the
    ``why: inline`` text.
    """
    repo = _make_repo(tmp_path)
    # Drastically reduced budget to trigger sampling
    config = Config(
        root=repo,
        output=repo / "OUT.md",
        include_globs=[],
        exclude_globs=[],
        omit_globs=[],
        respect_gitignore=False,
        follow_symlinks=False,
        max_bytes=200_000,
        max_lines=50,
        include_contents=True,
        only_ext=None,
        add_stats=True,
        add_toc=False,
        llm_mode="inline",
        budget_tokens=50,
        max_file_tokens=30,
        dedup_bits=0,
        sample_head=5,
        sample_tail=3,
        strip_comments=False,
        emit_manifest=False,
        preset="pro",
        explain_capsule=True,
    )
    report = generate_markdown_report(config)
    assert "truncated middle" in report
    assert "why: inline" in report

def test_masking(tmp_path: Path):
    """Secret masking in ``basic`` mode versus ``off``.

    A ``.env`` file containing an AWS-style key is added to the fixture repo.
    With ``masking_mode="basic"`` the report must not contain the secret line
    and must contain the mask marker; after switching the same config to
    ``"off"`` the secret must appear and the marker must not.
    """
    repo = _make_repo(tmp_path)

    # Add a file with a secret
    secret_line = "My AWS key is AKIAIOSFODNN7EXAMPLE"
    env_file = repo / ".env"
    env_file.write_text(secret_line, encoding="utf-8")
    mask_marker = "[*** MASKED_SECRET ***]"

    config = Config(
        root=repo,
        output=repo / "OUT.md",
        include_globs=[],
        exclude_globs=[],
        omit_globs=[],
        respect_gitignore=False,
        follow_symlinks=False,
        max_bytes=200_000,
        max_lines=2000,
        include_contents=True,
        only_ext=None,
        add_stats=True,
        add_toc=False,
        llm_mode="inline",
        budget_tokens=1000,
        max_file_tokens=1000,
        dedup_bits=0,
        sample_head=120,
        sample_tail=40,
        strip_comments=False,
        emit_manifest=False,
        preset="pro",
        explain_capsule=False,
        no_timestamp=True,
        masking_mode="basic",
    )

    masked_report = generate_markdown_report(config)
    assert secret_line not in masked_report
    assert mask_marker in masked_report

    # Test with masking off
    config.masking_mode = "off"
    plain_report = generate_markdown_report(config)
    assert secret_line in plain_report
    assert mask_marker not in plain_report