# QRAT2025 / scan_structure.py
# algorembrant's picture
# Add files using upload-large-folder tool
# 4ce5e6b verified
"""
Repository Structure Scanner
=============================
Scans the entire file structure of a repository and outputs a tree-like
Markdown representation. Handles massive (1GB+) repositories efficiently.

Usage:
    python scan_structure.py [path] [--output FILE] [--ignore PATTERN ...]
                             [--no-default-ignore]

Output:
    A Markdown file containing the full directory tree.
"""
import argparse
import fnmatch
import os
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
# ---------------------------------------------------------------------------
# Names and patterns skipped unless --no-default-ignore is given.
# Entries are either literal file/directory names or "*suffix" patterns.
# ---------------------------------------------------------------------------
DEFAULT_IGNORE = {
    # Version control metadata
    ".git",
    # Python virtual environments
    ".venv",
    "venv",
    "env",
    # Python caches and build/test artefacts
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
    ".tox",
    ".eggs",
    "*.egg-info",
    # Notebook and JavaScript tooling
    ".ipynb_checkpoints",
    "node_modules",
    # Operating-system clutter
    ".DS_Store",
    "Thumbs.db",
    "desktop.ini",
}
def should_ignore(name: str, ignore_set: set) -> bool:
    """Return True if *name* matches any entry in *ignore_set*.

    An entry matches when it equals *name* exactly, or when it is a
    shell-style glob (``*``, ``?``, ``[seq]``) that matches *name*.
    Matching is case-sensitive on every platform.

    Args:
        name: A single file or directory name (no path components).
        ignore_set: Literal names and/or glob patterns.

    Returns:
        True if *name* should be skipped, False otherwise.
    """
    # Fast path: most entries are literal names, and set lookup is O(1).
    if name in ignore_set:
        return True

    for pattern in ignore_set:
        # fnmatchcase (not fnmatch) avoids platform-dependent case folding.
        if fnmatch.fnmatchcase(name, pattern):
            return True
        # Legacy rule kept for backward compatibility: after a leading "*",
        # the rest of the pattern is treated as a literal suffix.
        if pattern.startswith("*") and name.endswith(pattern[1:]):
            return True
    return False
def build_tree(root_path: str, ignore_set: set) -> list[str]:
    """
    Walk *root_path* depth-first and return a list of tree-formatted lines.

    Each directory is listed with ``os.scandir``; entries matching
    *ignore_set* (via ``should_ignore``) are dropped, and the rest are
    ordered directories first, then case-insensitively by name.
    Directories get a trailing ``/``. Symlinks are not followed, so a
    symlink to a directory is listed as a plain entry and not descended
    into. Directories that raise ``PermissionError`` are listed but left
    without children.

    Args:
        root_path: Directory whose contents are rendered (the root itself
            is not included in the output).
        ignore_set: Names/patterns passed to ``should_ignore``.

    Returns:
        One string per entry, prefixed with box-drawing connectors.
    """
    lines: list[str] = []

    def _walk(current: str, prefix: str) -> None:
        try:
            # The context manager releases the directory handle promptly,
            # even if sorting raises part-way through.
            with os.scandir(current) as scan:
                entries = sorted(
                    (e for e in scan if not should_ignore(e.name, ignore_set)),
                    key=lambda e: (
                        not e.is_dir(follow_symlinks=False),
                        e.name.lower(),
                    ),
                )
        except PermissionError:
            # Best effort: an unreadable directory simply has no children.
            return

        last_idx = len(entries) - 1
        for idx, entry in enumerate(entries):
            is_last = idx == last_idx
            is_dir = entry.is_dir(follow_symlinks=False)
            connector = "└── " if is_last else "├── "
            suffix = "/" if is_dir else ""
            lines.append(f"{prefix}{connector}{entry.name}{suffix}")
            if is_dir:
                # Continuation is 4 columns wide to line up with the
                # connectors; a vertical bar is drawn only while siblings
                # remain below this entry.
                extension = "    " if is_last else "│   "
                _walk(entry.path, prefix + extension)

    _walk(root_path, "")
    return lines
def main() -> None:
    """Command-line entry point: scan a directory and write a Markdown tree.

    Parses ``sys.argv``, builds the ignore set (built-in defaults unless
    ``--no-default-ignore`` is given, plus any ``--ignore`` patterns and the
    output file's own basename), renders the tree with ``build_tree`` and
    writes it inside a fenced ``text`` block. The output path defaults to
    ``STRUCTURE.md`` inside the scanned directory.

    Exits through ``parser.error`` if the target is not an existing directory.
    """
    parser = argparse.ArgumentParser(
        description="Scan repository file structure and output a Markdown tree."
    )
    parser.add_argument(
        "path",
        nargs="?",
        default=".",
        help="Root directory to scan (default: current directory).",
    )
    parser.add_argument(
        "--output",
        "-o",
        default=None,
        help="Output Markdown file path (default: STRUCTURE.md in scanned dir).",
    )
    parser.add_argument(
        "--ignore",
        nargs="*",
        default=None,
        help="Extra patterns to ignore (added to built-in defaults).",
    )
    parser.add_argument(
        "--no-default-ignore",
        action="store_true",
        help="Disable the built-in ignore list (scan everything).",
    )
    args = parser.parse_args()

    root = os.path.abspath(args.path)
    # Fail with a usage error up front rather than a traceback from
    # os.scandir after the status lines have already been printed.
    if not os.path.isdir(root):
        parser.error(f"not a directory: {root}")
    root_name = os.path.basename(root)

    # Build ignore set
    ignore_set: set = set() if args.no_default_ignore else set(DEFAULT_IGNORE)
    if args.ignore:
        ignore_set.update(args.ignore)

    # Always ignore the output file itself to avoid self-referencing.
    # Matching is by basename, so same-named entries at any depth are skipped.
    out_path = args.output or os.path.join(root, "STRUCTURE.md")
    out_name = os.path.basename(out_path)
    ignore_set.add(out_name)

    print(f"Scanning: {root}")
    print(f"Ignoring: {', '.join(sorted(ignore_set))}")

    tree_lines = build_tree(root, ignore_set)

    # Compose Markdown content: heading, then the tree in a fenced block.
    md_lines = [
        "## Project Structure\n",
        "```text",
        f"{root_name}/",
        *tree_lines,
        "```\n",
    ]
    content = "\n".join(md_lines)

    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(content)

    total_entries = len(tree_lines)
    print(f"Done – {total_entries} entries written to {out_path}")


# Run the scanner only when executed as a script, not when imported.
if __name__ == "__main__":
    main()