autoscan / docs /api /core.md
Chris4K's picture
Initial commit v5.0.0.
5248e3b verified

API Reference — core/

core/__init__.py — public exports

from core import (
    # helpers.py
    run, jload, write_tmp, relpath, have_binary,
    # models.py
    make_finding, dedup_findings, sort_findings,
    # hf.py
    hf_space_to_git, list_user_spaces, comment_on_space,
    # bootstrap.py
    bootstrap_binaries,
    # baseline.py
    make_fingerprint, save_baseline, load_baseline,
    filter_by_baseline, parse_ignore_file, apply_ignore_rules,
)

core/scanner.py

scan_repo()

def scan_repo(
    repo_url: str,
    hf_token: Optional[str] = None,
    deep_history: bool = False,
    run_security: bool = True,
    run_performance: bool = True,
    run_llm: bool = True,
    max_workers: int = 8,
    progress_cb: Optional[Callable[[float, str], None]] = None,
) -> Tuple[List[dict], List[str]]

Clone or copy the target, run all enabled scanners in parallel, return (findings, log).

Parameters:

| Parameter | Type | Default | Description |
|---|---|---|---|
| repo_url | str | — (required) | HTTPS URL (HF Space or git repo) or local directory path |
| hf_token | str | None | HF Bearer token for private/gated repos |
| deep_history | bool | False | If True, run full git clone (no --depth 1) and include gitleaks |
| run_security | bool | True | Enable security scanners (Semgrep, bandit, pip-audit, …) |
| run_performance | bool | True | Enable performance scanners (Semgrep:Perf, ruff) |
| run_llm | bool | True | Enable LLM/agent scanners (Semgrep:LLM, agent-audit) |
| max_workers | int | 8 | Maximum thread-pool workers |
| progress_cb | Callable | None | Called with (fraction: float, description: str) as each scanner completes |

Returns: (findings, log) where findings is a deduplicated, sorted List[dict] and log is a List[str] of scanner messages (first entry is the summary "OK (N unique findings)").

Error handling: Never raises. Returns ([], [error_message]) on clone failure or invalid target.

Example:

from core.scanner import scan_repo

findings, log = scan_repo(
    "https://huggingface.co/spaces/owner/myspace",
    run_performance=False,
    progress_cb=lambda f, d: print(f"{f:.0%} {d}"),
)
print(log[0])   # "OK (23 unique findings)"

core/baseline.py

make_fingerprint(finding)

def make_fingerprint(finding: dict) -> str

Return a 16-hex-char deterministic fingerprint:

sha256( tool:rule:file:line:message )[:16]

save_baseline(findings, path)

def save_baseline(findings: List[dict], path: Union[str, Path]) -> None

Persist fingerprints to a JSON file at path. Overwrites if it exists. The JSON format:

{
  "created": "2025-01-01T12:00:00Z",
  "scanner_version": "4.0.0",
  "fingerprints": ["abc123...", ...]
}

load_baseline(path)

def load_baseline(path: Union[str, Path]) -> Set[str]

Return the set of fingerprint strings from a saved baseline JSON file.

filter_by_baseline(findings, baseline)

def filter_by_baseline(
    findings: List[dict],
    baseline: Set[str],
) -> Tuple[List[dict], List[dict]]

Return (kept, suppressed) — findings whose fingerprints are not in baseline vs those that are.

parse_ignore_file(path)

def parse_ignore_file(path: Union[str, Path]) -> List[IgnoreRule]

Parse a .hfscanignore file and return a list of IgnoreRule dataclass instances.

.hfscanignore syntax:

# comment
tests/                         # suppress all findings under tests/
* rule:B101                    # suppress rule everywhere
src/legacy/ severity:INFO      # suppress INFO severity under path
src/gen/ rule:B608             # suppress rule under path

apply_ignore_rules(findings, rules)

def apply_ignore_rules(
    findings: List[dict],
    rules: List[IgnoreRule],
) -> Tuple[List[dict], int]

Return (kept_findings, ignored_count). Rules are evaluated in order; first match wins.

IgnoreRule dataclass

@dataclass
class IgnoreRule:
    path_prefix: str        # "" = wildcard (applies everywhere)
    rule_id: str            # "" = no rule filter
    severity: str           # "" = no severity filter

core/models.py

make_finding()

def make_finding(
    tool: str,
    rule: str,
    severity: str,
    file: str,
    line: int,
    message: str,
    owasp: Union[str, List[str]],
    category: str = "security",
    confidence: str = None,
    remediation: str = None,
) -> dict

Build a normalized finding dict. All scanner runners call this to ensure uniform output shape.

  • confidence: if None, looked up from TOOL_DEFAULT_CONFIDENCE; falls back to "possible".
  • remediation: if None, looked up from report.remediation.REMEDIATION by rule; falls back to "".
  • owasp: str is automatically wrapped in a list.

sort_findings(findings)

def sort_findings(findings: List[dict]) -> List[dict]

Sort by severity (ERROR < WARNING < INFO) → confidence (confirmed < likely < possible) → file → line.

dedup_findings(findings)

def dedup_findings(findings: List[dict]) -> List[dict]

Remove duplicates keyed on (tool, file, line, message). Preserves first occurrence order.

Constants

SEVERITY_RANK: dict      # {"ERROR": 0, "HIGH": 0, "WARNING": 1, ...}
CONFIDENCE_RANK: dict    # {"confirmed": 0, "likely": 1, "possible": 2}
TOOL_DEFAULT_CONFIDENCE: dict  # per-tool default confidence levels
FORBIDDEN_FILES: list    # file names that are always flagged

core/helpers.py

run(cmd, cwd=None, timeout=300)

def run(cmd: List[str], cwd: str = None, timeout: int = 300) -> Tuple[str, int]

Run a subprocess. Returns (stdout_stripped, returncode). Never raises.

| Exit code | Meaning |
|---|---|
| Normal | Return code from the process |
| 124 | Timed out (subprocess.TimeoutExpired) |
| 127 | Binary not found (FileNotFoundError) |

jload(txt)

def jload(txt: str) -> Optional[Any]

Parse JSON from a string. Returns None for empty strings or parse errors.

write_tmp(content, suffix=".yaml")

def write_tmp(content: str, suffix: str = ".yaml") -> str

Write content to a temp file and return its absolute path.

relpath(base, p)

def relpath(base: str, p: str) -> str

Return p relative to base. If p is not under base, return str(p) unchanged.

have_binary(name)

def have_binary(name: str) -> bool

Return True if name is on PATH (via shutil.which).


core/hf.py

hf_space_to_git(url, token=None)

def hf_space_to_git(url: str, token: str = None) -> Optional[str]

Convert https://huggingface.co/spaces/<ns>/<name> to a git-cloneable URL. Returns None for non-HF URLs. If token is provided, embeds it as HTTP basic auth (USER:<token>@).

list_user_spaces(username, hf_token=None, limit=500)

def list_user_spaces(
    username: str,
    hf_token: str = None,
    limit: int = 500,
) -> Tuple[List[str], str]

Return (space_urls, status_message). Queries https://huggingface.co/api/spaces?author=<username>. Returns ([], error_message) on HTTP error or network failure.


core/bootstrap.py

bootstrap_binaries()

def bootstrap_binaries() -> dict

Download gitleaks and hadolint binaries for the current platform if not already on PATH. Returns a dict with keys "gitleaks" and "hadolint", values "ok" / "already installed" / "error: ...".

Binaries are placed in:

  • Windows: <venv>\Scripts\ (next to python.exe, so shutil.which finds them)
  • macOS/Linux: ~/.local/bin/

Versions: GITLEAKS_VERSION = "8.18.4", HADOLINT_VERSION = "2.12.0" (defined as module constants).