"""
Repository Structure Scanner
=============================
Scans the entire file structure of a repository and outputs a tree-like
Markdown representation. Handles massive (1GB+) repositories efficiently.
Usage:
python scan_structure.py [path] [--output FILE] [--ignore PATTERN ...]
Output:
A Markdown file containing the full directory tree.
"""
import argparse
import fnmatch
import os
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
# ---------------------------------------------------------------------------
# Default ignore patterns (common non-source dirs / files)
# ---------------------------------------------------------------------------
# Entries are compared against bare file/directory names (never full paths).
# An entry with a leading "*" (e.g. "*.egg-info") is a wildcard, not a literal.
DEFAULT_IGNORE = {
    # Version control metadata
    ".git",
    # Virtual environments
    ".venv",
    "venv",
    "env",
    # Python caches and packaging artefacts
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
    ".tox",
    ".eggs",
    "*.egg-info",
    # Notebook and JavaScript tooling
    ".ipynb_checkpoints",
    "node_modules",
    # Operating-system clutter
    ".DS_Store",
    "Thumbs.db",
    "desktop.ini",
}
def should_ignore(name: str, ignore_set: set) -> bool:
    """Return True if *name* matches any entry in *ignore_set*.

    An entry matches when it equals *name* exactly, or when it is a
    shell-style glob (``*``, ``?``, ``[seq]``) that matches *name*.
    Matching is case-sensitive on every platform.

    Args:
        name: A bare file or directory name (not a path).
        ignore_set: Literal names and/or glob patterns to exclude.

    Returns:
        True if the entry should be skipped, False otherwise.
    """
    # Exact names are the common case, so try the O(1) set lookup first. It
    # also lets a literal name containing glob metacharacters match itself.
    if name in ignore_set:
        return True
    # fnmatchcase (not fnmatch) skips OS-dependent case normalisation, so the
    # result is the same on Windows and POSIX.
    return any(fnmatch.fnmatchcase(name, pattern) for pattern in ignore_set)
def build_tree(root_path: str, ignore_set: set) -> list[str]:
    """Walk *root_path* depth-first and return tree-formatted lines.

    Each directory is listed with ``os.scandir``. Entries matching
    *ignore_set* (via ``should_ignore``) are dropped, and the rest are sorted
    directories first, then case-insensitively by name. Symlinks are never
    followed, so a symlink to a directory is listed as a plain entry and is
    not descended into.

    Args:
        root_path: Directory whose contents are rendered. The root itself is
            not part of the returned lines.
        ignore_set: Names/patterns handed to ``should_ignore``.

    Returns:
        One string per entry, prefixed with box-drawing connectors;
        directories carry a trailing ``/``.

    Raises:
        OSError: If *root_path* cannot be listed for a reason other than
            missing permission (for example, it does not exist).
    """
    lines: list[str] = []

    def _walk(current: str, prefix: str) -> None:
        try:
            # The context manager closes the directory handle even if
            # listing fails part-way through.
            with os.scandir(current) as scanner:
                # Filter before sorting so ignored entries are never sorted.
                entries = [
                    (entry.name, entry.path, entry.is_dir(follow_symlinks=False))
                    for entry in scanner
                    if not should_ignore(entry.name, ignore_set)
                ]
        except PermissionError:
            # Unreadable directories are rendered as empty.
            return
        except OSError:
            # A subdirectory that vanished or became unreadable mid-scan must
            # not abort the whole run; a broken root is still reported.
            if current == root_path:
                raise
            return

        # Directories first (False sorts before True), then by lowercase name.
        entries.sort(key=lambda item: (not item[2], item[0].lower()))

        last_index = len(entries) - 1
        for index, (name, path, is_dir) in enumerate(entries):
            is_last = index == last_index
            connector = "└── " if is_last else "├── "
            lines.append(f"{prefix}{connector}{name}{'/' if is_dir else ''}")
            if is_dir:
                # Continuation prefixes are 4 columns wide, matching the
                # connector width, so nested levels line up.
                _walk(path, prefix + ("    " if is_last else "│   "))

    _walk(root_path, "")
    return lines
def main() -> None:
    """Parse command-line arguments, scan the tree and write the Markdown file.

    The output goes to ``--output`` when given, otherwise to ``STRUCTURE.md``
    inside the scanned directory. Exits with an argparse usage error (status
    2) when the given path is not an existing directory.
    """
    parser = argparse.ArgumentParser(
        description="Scan repository file structure and output a Markdown tree."
    )
    parser.add_argument(
        "path",
        nargs="?",
        default=".",
        help="Root directory to scan (default: current directory).",
    )
    parser.add_argument(
        "--output",
        "-o",
        default=None,
        help="Output Markdown file path (default: STRUCTURE.md in scanned dir).",
    )
    parser.add_argument(
        "--ignore",
        nargs="*",
        default=None,
        help="Extra patterns to ignore (added to built-in defaults).",
    )
    parser.add_argument(
        "--no-default-ignore",
        action="store_true",
        help="Disable the built-in ignore list (scan everything).",
    )
    args = parser.parse_args()

    root = os.path.abspath(args.path)
    # Fail early with a usage message instead of a traceback from the scan.
    if not os.path.isdir(root):
        parser.error(f"not a directory: {root}")
    root_name = os.path.basename(root)

    # Build ignore set
    ignore_set: set = set() if args.no_default_ignore else set(DEFAULT_IGNORE)
    if args.ignore:
        ignore_set.update(args.ignore)

    # Always ignore the output file itself to avoid self-referencing.
    # Only the basename is added, so any entry with that name is skipped.
    out_path = args.output or os.path.join(root, "STRUCTURE.md")
    out_name = os.path.basename(out_path)
    ignore_set.add(out_name)

    print(f"Scanning: {root}")
    print(f"Ignoring: {', '.join(sorted(ignore_set))}")

    tree_lines = build_tree(root, ignore_set)

    # Compose Markdown content: heading, fenced block, root line, then tree.
    md_lines = [
        "## Project Structure\n",
        "```text",
        f"{root_name}/",
    ]
    md_lines.extend(tree_lines)
    md_lines.append("```\n")
    content = "\n".join(md_lines)

    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(content)

    total_entries = len(tree_lines)
    print(f"Done – {total_entries} entries written to {out_path}")
if __name__ == "__main__":
    # Start the CLI only on direct execution, so importing this module
    # does not trigger a scan.
    main()
|