Spaces:

GraziePrego
/

agent

Paused

App Files Files Community

agent / plugins /_promptinclude /helpers /scanner.py

GraziePrego

Upload folder using huggingface_hub

7d4338a verified about 2 months ago

raw

history blame contribute delete

5.65 kB

	"""Scan workdir for promptinclude files. No agent/tool dependencies."""

	import fnmatch
	import os
	from typing import Literal, TypedDict

	from pathspec import PathSpec

	from helpers import tokens


	# ------------------------------------------------------------------
	# Types
	# ------------------------------------------------------------------

	class FileEntry(TypedDict):
	    """One promptinclude file as reported by the scanner."""

	    # Path of the file as produced by the directory walk.
	    path: str
	    # Text of the file; trimmed when status is "cropped", empty when "skipped".
	    content: str
	    # Token count of ``content`` (0 for "skipped" entries).
	    token_count: int
	    # "ok": full text kept; "cropped": text trimmed to fit a token limit;
	    # "skipped": listed without content because too little budget was left.
	    status: Literal["ok", "cropped", "skipped"]


	class ScanResult(TypedDict):
	    """Outcome of one promptinclude scan."""

	    # Entries for the files that were kept, in the order they were processed.
	    files: list[FileEntry]
	    # Number of matched files that did not produce an entry in ``files``.
	    skipped_count: int


	# ------------------------------------------------------------------
	# Public API
	# ------------------------------------------------------------------

	def scan_promptinclude_files(
	    root: str,
	    *,
	    name_pattern: str = "*.promptinclude.md",
	    max_depth: int = 10,
	    max_file_tokens: int = 2000,
	    max_file_count: int = 50,
	    max_total_tokens: int = 8000,
	    gitignore: str = "",
	) -> ScanResult:
	    """Collect promptinclude files under *root* within count and token budgets.

	    Matching files are processed in sorted path order. Every file that is
	    kept is charged its content tokens plus the tokens of its path and a
	    small fixed formatting allowance. Once the total budget is exhausted,
	    all later files are counted as skipped.

	    Args:
	        root: Directory to scan.
	        name_pattern: ``fnmatch`` pattern applied to file names.
	        max_depth: Directories at this depth or deeper (root is depth 0)
	            are not scanned.
	        max_file_tokens: Per-file content cap; longer files are cropped.
	        max_file_count: Maximum number of entries in ``files``.
	        max_total_tokens: Budget shared by all paths and contents.
	        gitignore: Gitignore-style pattern text; matching paths are excluded.

	    Returns:
	        A ``ScanResult`` whose ``files`` holds "ok" / "cropped" entries plus
	        at most one content-less "skipped" entry (the file that hit the
	        budget with too little room left for a useful excerpt).
	        ``skipped_count`` counts every matched file that produced no entry:
	        unreadable files, blank files, and files beyond either limit.
	    """
	    # A partial file is only worth emitting if more than this many tokens remain.
	    min_partial_tokens = 50
	    # Flat allowance added to each path's token count for prompt formatting.
	    path_overhead_tokens = 5

	    ignore_spec = _build_ignore_spec(gitignore)
	    matched = sorted(
	        _find_matching_files(root, name_pattern, max_depth, ignore_spec)
	    )

	    result_files: list[FileEntry] = []
	    total_tokens_used = 0
	    skipped_count = 0
	    budget_exhausted = False

	    for path in matched:
	        if budget_exhausted or len(result_files) >= max_file_count:
	            skipped_count += 1
	            continue

	        try:
	            with open(path, "r", encoding="utf-8", errors="replace") as f:
	                raw = f.read()
	        except OSError:
	            # Best effort: an unreadable file is skipped, not fatal.
	            skipped_count += 1
	            continue

	        if not raw.strip():
	            # Blank files add nothing to the prompt; they get no entry and are
	            # tallied as skipped.
	            # NOTE(review): confirm callers want blank files in skipped_count.
	            skipped_count += 1
	            continue

	        # Check that the path line alone fits before tokenizing the content.
	        path_tokens = tokens.count_tokens(path) + path_overhead_tokens
	        remaining = max_total_tokens - total_tokens_used - path_tokens
	        if remaining < 0:
	            skipped_count += 1
	            budget_exhausted = True
	            continue

	        file_tokens = tokens.count_tokens(raw)
	        capped = min(file_tokens, max_file_tokens)

	        if capped > remaining:
	            # This file is the one that exhausts the total budget.
	            budget_exhausted = True
	            if remaining <= min_partial_tokens:
	                # Too little room for a useful excerpt: list the file without
	                # content and charge nothing against the budget.
	                result_files.append(FileEntry(
	                    path=path, content="",
	                    token_count=0, status="skipped",
	                ))
	                continue
	            crop_to = remaining
	        elif file_tokens > max_file_tokens:
	            crop_to = max_file_tokens
	        else:
	            crop_to = None

	        if crop_to is None:
	            entry = FileEntry(
	                path=path, content=raw,
	                token_count=file_tokens, status="ok",
	            )
	        else:
	            trimmed = tokens.trim_to_tokens(raw, crop_to, direction="start")
	            # Re-count: the trimmed text is what is actually charged.
	            entry = FileEntry(
	                path=path, content=trimmed,
	                token_count=tokens.count_tokens(trimmed), status="cropped",
	            )

	        total_tokens_used += path_tokens + entry["token_count"]
	        result_files.append(entry)

	    return ScanResult(files=result_files, skipped_count=skipped_count)


	# ------------------------------------------------------------------
	# Internal helpers
	# ------------------------------------------------------------------

	def _build_ignore_spec(gitignore: str) -> PathSpec | None:
	    """Compile gitignore-style text into a PathSpec.

	    Blank lines and lines starting with ``#`` are dropped and the remaining
	    lines are stripped of surrounding whitespace. Returns ``None`` when no
	    pattern is left.
	    """
	    patterns = [
	        pattern
	        for text in (gitignore or "").splitlines()
	        if (pattern := text.strip()) and not pattern.startswith("#")
	    ]
	    if patterns:
	        return PathSpec.from_lines("gitwildmatch", patterns)
	    return None


	def _find_matching_files(
	    root: str,
	    name_pattern: str,
	    max_depth: int,
	    ignore_spec: PathSpec | None,
	) -> list[str]:
	    """Walk *root* and return absolute paths of files matching *name_pattern*.

	    Args:
	        root: Directory to walk; made absolute before walking.
	        name_pattern: ``fnmatch`` pattern tested against each file name.
	        max_depth: Directories at this depth or deeper are neither scanned
	            nor descended into (root is depth 0, its children depth 1).
	        ignore_spec: Optional spec matched against root-relative POSIX
	            paths; matching directories are pruned and matching files dropped.

	    Returns:
	        Matching paths in walk order (unsorted); empty if *root* is not a
	        directory.
	    """
	    root = os.path.abspath(root)
	    if not os.path.isdir(root):
	        return []

	    def rel_posix(path: str) -> str:
	        # The ignore spec is matched against root-relative, "/"-separated paths.
	        return os.path.relpath(path, root).replace(os.sep, "/")

	    results: list[str] = []

	    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
	        # Derive depth from the relative path rather than slicing off
	        # len(root): a filesystem root ("/" or "C:\\") keeps its trailing
	        # separator, which would make the slice-based count one too low.
	        rel_dir = os.path.relpath(dirpath, root)
	        depth = 0 if rel_dir == os.curdir else rel_dir.count(os.sep) + 1
	        if depth >= max_depth:
	            dirnames.clear()
	            continue

	        if ignore_spec:
	            # Prune in place so os.walk never descends into ignored directories.
	            # The trailing "/" form lets directory-only patterns match.
	            kept_dirs = []
	            for name in dirnames:
	                rel = rel_posix(os.path.join(dirpath, name))
	                if ignore_spec.match_file(rel) or ignore_spec.match_file(f"{rel}/"):
	                    continue
	                kept_dirs.append(name)
	            dirnames[:] = kept_dirs

	        for fname in filenames:
	            if not fnmatch.fnmatch(fname, name_pattern):
	                continue
	            full = os.path.join(dirpath, fname)
	            if ignore_spec and ignore_spec.match_file(rel_posix(full)):
	                continue
	            results.append(full)

	    return results