"""
Repository Structure Scanner
=============================
Scans the entire file structure of a repository and outputs a tree-like
Markdown representation. Handles massive (1GB+) repositories efficiently.

Usage:
    python scan_structure.py [path] [--output FILE] [--ignore PATTERN ...]

Output:
    A Markdown file containing the full directory tree.
"""

import argparse
import fnmatch
import os
import subprocess
import sys
from collections import defaultdict
from pathlib import Path

# ---------------------------------------------------------------------------
# Default ignore patterns (common non-source dirs / files)
# ---------------------------------------------------------------------------
DEFAULT_IGNORE = {
    ".git",
    "__pycache__",
    ".venv",
    "venv",
    "env",
    "node_modules",
    ".ipynb_checkpoints",
    ".mypy_cache",
    ".pytest_cache",
    ".tox",
    ".eggs",
    "*.egg-info",
    ".DS_Store",
    "Thumbs.db",
    "desktop.ini",
}


def should_ignore(name: str, ignore_set: set) -> bool:
    """Return True if *name* matches any pattern in the ignore set.

    A pattern matches when it equals *name* exactly, or when it contains a
    ``*`` or ``?`` wildcard and matches *name* as a case-sensitive
    ``fnmatch`` glob (so ``*.egg-info``, ``build*`` and ``*.py[co]`` all
    work). Patterns without ``*`` or ``?`` are only ever compared literally,
    which keeps names such as ``[id]`` from being read as character classes.
    """
    # Fast path: exact names are by far the most common patterns.
    if name in ignore_set:
        return True
    return any(
        fnmatch.fnmatchcase(name, pattern)
        for pattern in ignore_set
        if "*" in pattern or "?" in pattern
    )


def build_tree(root_path: str, ignore_set: set) -> list[str]:
    """
    Walk *root_path* depth-first and return a list of tree-formatted lines.

    Uses ``os.scandir`` for performance on large filesystems and sorts
    entries alphabetically (directories first, case-insensitive). Entries
    whose name matches *ignore_set* (see ``should_ignore``) are omitted.
    Symbolic links are never followed, so a link to a directory is listed
    like a file and is not descended into.

    A root directory that cannot be read because of a ``PermissionError``
    yields an empty list; any other ``OSError`` on the root propagates.
    Subdirectories that fail to list for any ``OSError`` are shown without
    children instead of aborting the scan.
    """
    # Connectors and continuation prefixes are all four characters wide so
    # that every nesting level lines up under its parent.
    branch, last_branch = "├── ", "└── "
    pipe_indent, blank_indent = "│   ", "    "

    lines: list[str] = []

    def _walk(current: str, prefix: str, top: bool = False) -> None:
        try:
            # The context manager releases the directory handle even when
            # reading an entry fails part-way through the listing.
            with os.scandir(current) as scan:
                rows = [
                    (e.name, e.path, e.is_dir(follow_symlinks=False))
                    for e in scan
                    if not should_ignore(e.name, ignore_set)
                ]
        except PermissionError:
            return
        except OSError:
            # A subdirectory may disappear or break while a long scan is
            # running; skip it. Problems with the root itself are reported.
            if top:
                raise
            return

        # Directories first, then case-insensitive name order.
        rows.sort(key=lambda row: (not row[2], row[0].lower()))

        final_index = len(rows) - 1
        for idx, (name, path, is_dir) in enumerate(rows):
            is_last = idx == final_index
            connector = last_branch if is_last else branch
            suffix = "/" if is_dir else ""
            lines.append(f"{prefix}{connector}{name}{suffix}")
            if is_dir:
                _walk(path, prefix + (blank_indent if is_last else pipe_indent))

    _walk(root_path, "", top=True)
    return lines


def main() -> None:
    """Parse command-line arguments, scan the tree, and write the Markdown.

    Exits through ``argparse`` with a usage error when the scan path is not
    an existing directory.
    """
    parser = argparse.ArgumentParser(
        description="Scan repository file structure and output a Markdown tree."
    )
    parser.add_argument(
        "path",
        nargs="?",
        default=".",
        help="Root directory to scan (default: current directory).",
    )
    parser.add_argument(
        "--output",
        "-o",
        default=None,
        help="Output Markdown file path (default: STRUCTURE.md in scanned dir).",
    )
    parser.add_argument(
        "--ignore",
        nargs="*",
        default=None,
        help="Extra patterns to ignore (added to built-in defaults).",
    )
    parser.add_argument(
        "--no-default-ignore",
        action="store_true",
        help="Disable the built-in ignore list (scan everything).",
    )
    args = parser.parse_args()

    root = os.path.abspath(args.path)
    if not os.path.isdir(root):
        parser.error(f"not a directory: {root}")
    root_name = os.path.basename(root)

    # Build ignore set
    ignore_set: set = set() if args.no_default_ignore else set(DEFAULT_IGNORE)
    if args.ignore:
        ignore_set.update(args.ignore)

    # Always ignore the output file itself to avoid self-referencing.
    # NOTE: matching is by file name, so any entry in the tree that shares
    # the output file's name is hidden as well.
    out_path = args.output or os.path.join(root, "STRUCTURE.md")
    out_name = os.path.basename(out_path)
    ignore_set.add(out_name)

    print(f"Scanning: {root}")
    print(f"Ignoring: {', '.join(sorted(ignore_set))}")

    tree_lines = build_tree(root, ignore_set)

    # Compose Markdown content
    md_lines = [
        "## Project Structure\n",
        "```text",
        f"{root_name}/",
    ]
    md_lines.extend(tree_lines)
    md_lines.append("```\n")

    content = "\n".join(md_lines)

    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(content)

    total_entries = len(tree_lines)
    print(f"Done – {total_entries} entries written to {out_path}")


if __name__ == "__main__":
    main()