# Spaces: GraziePrego / agent (Paused) | File size: 5,654 Bytes | 7d4338a

"""Scan workdir for promptinclude files. No agent/tool dependencies."""

import fnmatch
import os
from typing import Literal, TypedDict

from pathspec import PathSpec

from helpers import tokens


# ------------------------------------------------------------------
# Types
# ------------------------------------------------------------------

class FileEntry(TypedDict):
    """One prompt-include file as reported by ``scan_promptinclude_files``."""

    # Path of the file as produced by the directory walk (rooted at the
    # absolute form of the scan root).
    path: str
    # Text handed to the prompt: the full file for "ok", a trimmed version
    # for "cropped", and the empty string for "skipped".
    content: str
    # Token count of ``content`` (0 for "skipped" entries).
    token_count: int
    # "ok"      - whole file included unchanged
    # "cropped" - trimmed to the per-file cap or to the remaining total budget
    # "skipped" - listed, but too little total budget was left for its content
    status: Literal["ok", "cropped", "skipped"]


class ScanResult(TypedDict):
    """Return value of ``scan_promptinclude_files``."""

    # Entries in sorted-path order; may include one trailing entry whose
    # status is "skipped" (empty content) when the total budget ran out.
    files: list[FileEntry]
    # Number of matched files that do not appear in ``files`` at all.
    skipped_count: int


# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------

def scan_promptinclude_files(
    root: str,
    *,
    name_pattern: str = "*.promptinclude.md",
    max_depth: int = 10,
    max_file_tokens: int = 2000,
    max_file_count: int = 50,
    max_total_tokens: int = 8000,
    gitignore: str = "",
) -> ScanResult:
    """Collect prompt-include files under *root* within token and count budgets.

    Matching files are visited in sorted path order. Each included file is
    charged its content tokens plus a small header cost (tokens of its path
    plus 5 for formatting) against ``max_total_tokens``.

    Args:
        root: Directory to scan.
        name_pattern: ``fnmatch`` pattern applied to file names.
        max_depth: Directory depth limit passed to the file finder.
        max_file_tokens: Per-file cap; longer files are trimmed and reported
            as ``"cropped"``.
        max_file_count: Maximum number of entries placed in ``files``.
        max_total_tokens: Overall budget for headers plus contents.
        gitignore: Gitignore-style text; matching paths are left out.

    Returns:
        ``files`` holds the reported entries. ``skipped_count`` is the number
        of matched files that are absent from ``files``: unreadable files,
        blank files, and everything after the file cap or the token budget
        was hit. The file that exhausts the budget is still listed, either
        ``"cropped"`` to the leftover budget (if more than 50 tokens remain)
        or as a content-less ``"skipped"`` entry.
    """
    candidates = sorted(
        _find_matching_files(
            root, name_pattern, max_depth, _build_ignore_spec(gitignore)
        )
    )

    collected: list[FileEntry] = []
    spent = 0
    stop = False

    for path in candidates:
        # Once a limit has been hit the remaining files are not even opened.
        if stop or len(collected) >= max_file_count:
            continue

        try:
            with open(path, "r", encoding="utf-8", errors="replace") as handle:
                text = handle.read()
        except OSError:
            continue

        # Whitespace-only files contribute nothing to the prompt.
        if not text.strip():
            continue

        content_tokens = tokens.count_tokens(text)
        header_tokens = tokens.count_tokens(path) + 5  # overhead for formatting

        # Budget left for this file's content once its header is paid for.
        room = max_total_tokens - spent - header_tokens
        if room < 0:
            # Not even the path line fits; nothing after this can fit either.
            stop = True
            continue

        limit = min(content_tokens, max_file_tokens)

        if limit > room:
            # The (capped) file overflows the total budget: include as much
            # as is worthwhile, then stop taking files.
            stop = True
            if room > 50:
                body = tokens.trim_to_tokens(text, room, direction="start")
                body_tokens = tokens.count_tokens(body)
                spent += header_tokens + body_tokens
                collected.append(FileEntry(
                    path=path, content=body,
                    token_count=body_tokens, status="cropped",
                ))
            else:
                collected.append(FileEntry(
                    path=path, content="",
                    token_count=0, status="skipped",
                ))
            continue

        status: Literal["ok", "cropped"]
        if limit < content_tokens:
            # Fits the total budget but exceeds the per-file cap.
            body = tokens.trim_to_tokens(text, max_file_tokens, direction="start")
            body_tokens = tokens.count_tokens(body)
            status = "cropped"
        else:
            body, body_tokens, status = text, content_tokens, "ok"

        spent += header_tokens + body_tokens
        collected.append(FileEntry(
            path=path, content=body,
            token_count=body_tokens, status=status,
        ))

    # Every matched file either produced an entry or counts as skipped.
    return ScanResult(
        files=collected,
        skipped_count=len(candidates) - len(collected),
    )


# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------

def _build_ignore_spec(gitignore: str) -> PathSpec | None:
    if not gitignore or not gitignore.strip():
        return None
    lines = [
        line.strip()
        for line in gitignore.splitlines()
        if line.strip() and not line.strip().startswith("#")
    ]
    if not lines:
        return None
    return PathSpec.from_lines("gitwildmatch", lines)


def _find_matching_files(
    root: str,
    name_pattern: str,
    max_depth: int,
    ignore_spec: PathSpec | None,
) -> list[str]:
    root = os.path.abspath(root)
    if not os.path.isdir(root):
        return []

    results: list[str] = []

    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        depth = dirpath[len(root):].count(os.sep)
        if depth >= max_depth:
            dirnames.clear()
            continue

        # filter ignored dirs in-place
        if ignore_spec:
            filtered_dirs = []
            for d in dirnames:
                rel = os.path.relpath(os.path.join(dirpath, d), root)
                rel_posix = rel.replace(os.sep, "/")
                if ignore_spec.match_file(rel_posix) or ignore_spec.match_file(f"{rel_posix}/"):
                    continue
                filtered_dirs.append(d)
            dirnames[:] = filtered_dirs

        for fname in filenames:
            if not fnmatch.fnmatch(fname, name_pattern):
                continue
            full = os.path.join(dirpath, fname)
            if ignore_spec:
                rel = os.path.relpath(full, root)
                rel_posix = rel.replace(os.sep, "/")
                if ignore_spec.match_file(rel_posix):
                    continue
            results.append(full)

    return results