# autoscan / app.py
# Chris4K's picture
# Upload 384 files
# a2a5bfd verified
"""HF Security & Performance Scanner v4.0 — modular.
Entrypoint: launches the Gradio UI. All scanning logic lives in core/scanner.py,
all rules in rules/, all reporting in report/.
"""
import json
import os
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path
import gradio as gr
import pandas as pd
# Make sibling packages importable when running from any directory.
# This statement must stay ahead of the `core` / `report` imports below: it
# prepends the directory containing this file to sys.path so those imports
# resolve regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))
from core import (
bootstrap_binaries,
comment_on_space,
have_binary,
list_user_spaces,
)
from core.scanner import scan_repo
from report import generate_html_report, generate_sarif
# Bootstrap binaries on import (idempotent, per the original note; the helper
# is defined in `core`, so that property is not verifiable from this file).
# The result is queried with .get("gitleaks") / .get("hadolint") when building
# the run log and the UI header.
BOOTSTRAP_RESULT = bootstrap_binaries()
# =========================================================
# UI orchestration (multi-target, progress, report file)
# =========================================================
# Column order of the findings table shown in the UI.
_FINDING_COLUMNS = [
    "category", "severity", "confidence", "tool", "rule",
    "file", "line", "message", "owasp", "remediation",
]

# External tools whose availability is reported in the run log.
_TOOL_NAMES = (
    "semgrep", "bandit", "detect-secrets", "pip-audit",
    "ruff", "gitleaks", "hadolint", "agent-audit",
)


def _utc_stamp(moment):
    """Format an aware UTC datetime as ISO 8601 with a single ``Z`` designator.

    ``isoformat()`` on an aware UTC datetime already ends in ``+00:00``;
    appending ``"Z"`` to that (as the code used to) yields the malformed
    ``...+00:00Z``.
    """
    return moment.isoformat().replace("+00:00", "Z")


def _scan_progress_cb(progress, idx, total, label):
    """Build a callback mapping one target's local progress onto the overall bar."""
    def _cb(local_frac, desc=""):
        progress((idx + local_frac) / total,
                 desc=f"[{idx+1}/{total}] {label}: {desc}")
    return _cb


def _findings_frame(all_findings):
    """Return the findings as a DataFrame restricted to ``_FINDING_COLUMNS``."""
    if not all_findings:
        return pd.DataFrame(columns=_FINDING_COLUMNS)
    # Passing ``columns=`` selects and orders the keys; a key missing from the
    # findings becomes an empty column instead of raising KeyError.
    frame = pd.DataFrame(all_findings, columns=_FINDING_COLUMNS)
    frame["owasp"] = frame["owasp"].apply(
        lambda refs: ", ".join(map(str, refs)) if isinstance(refs, (list, tuple)) else refs
    )
    return frame


def _tally(all_findings, key):
    """Render ``key`` value counts as a sorted, comma-separated Markdown snippet."""
    counts = {}
    for finding in all_findings:
        label = finding.get(key, "unknown")
        counts[label] = counts.get(label, 0) + 1
    # Sort on the string form so unexpected non-string labels cannot break sorting.
    ordered = sorted(counts.items(), key=lambda item: str(item[0]))
    return ", ".join(f"`{label}`={count}" for label, count in ordered)


def _build_summary(all_findings, n_targets, stamp):
    """Return the Markdown summary block displayed above the result tabs."""
    # Two trailing spaces before "\n" form a Markdown hard line break, so each
    # statistic renders on its own line.
    return (
        f"**Targets:** {n_targets}  \n"
        f"**Total findings:** {len(all_findings)}  \n"
        f"**By category:** {_tally(all_findings, 'category')}  \n"
        f"**By severity:** {_tally(all_findings, 'severity')}  \n"
        f"**By confidence:** {_tally(all_findings, 'confidence')}  \n"
        f"**By tool:** {_tally(all_findings, 'tool')}  \n"
        f"**Timestamp:** {stamp}"
    )


def _write_reports(all_findings, scan_meta, moment):
    """Write the HTML and SARIF reports; return ``(html, html_path, sarif_path)``.

    Each run gets its own directory under the system temp dir so two scans
    finishing within the same second cannot overwrite each other's files.
    """
    file_stamp = moment.strftime("%Y%m%d_%H%M%S")
    out_dir = Path(tempfile.mkdtemp(prefix=f"scan_report_{file_stamp}_"))

    report_html = generate_html_report(all_findings, scan_meta)
    report_path = out_dir / f"scan_report_{file_stamp}.html"
    with open(report_path, "w", encoding="utf-8") as fh:
        fh.write(report_html)

    sarif_doc = generate_sarif(all_findings, scan_meta)
    sarif_path = out_dir / f"scan_report_{file_stamp}.sarif"
    with open(sarif_path, "w", encoding="utf-8") as fh:
        json.dump(sarif_doc, fh, indent=2)

    return report_html, str(report_path), str(sarif_path)


def run_scan(target, hf_token, comment, crawl_user, deep_history, max_targets,
             run_security, run_performance, run_llm, run_supply_chain,
             progress=gr.Progress()):  # noqa: B008
    """Scan one target (or every Space of an HF user) and build the UI outputs.

    Args:
        target: Repo URL, local directory, or bare HF username.
        hf_token: Token forwarded to listing, scanning and commenting helpers.
        comment: When true, post the findings to each scanned HF Space URL.
        crawl_user: When true, treat ``target`` as a username and list its Spaces.
        deep_history: Forwarded to ``scan_repo``.
        max_targets: Upper bound on the number of Spaces scanned when crawling.
        run_security, run_performance, run_llm, run_supply_chain: Scan toggles
            forwarded to ``scan_repo``.
        progress: Gradio progress tracker.

    Returns:
        A 5-tuple ``(findings DataFrame, summary Markdown, log text,
        report HTML string, report file path)``. The last two are ``None``
        when the scan could not start.
    """
    # Normalise first so None, whitespace-only and slash-only input are all
    # rejected here instead of failing later or crawling an empty username.
    target = (target or "").strip().rstrip("/")
    if not target:
        return pd.DataFrame(), "❌ No target provided", "", None, None

    log_lines = [
        f"[bootstrap] gitleaks: {BOOTSTRAP_RESULT.get('gitleaks')}, "
        f"hadolint: {BOOTSTRAP_RESULT.get('hadolint')}"
    ]
    tools_status = {name: have_binary(name) for name in _TOOL_NAMES}
    log_lines.append(
        "[tools] " + ", ".join(f"{k}={'✓' if v else '✗'}"
                               for k, v in tools_status.items())
    )

    # Resolve targets. Bare username (no slash, no http) → treat as HF crawl.
    looks_like_username = (
        not crawl_user
        and not target.startswith("http")
        and not os.path.isdir(target)
        and "/" not in target
        and " " not in target
    )
    if looks_like_username:
        log_lines.append(f"[auto] '{target}' looks like an HF username; crawling its spaces")
        crawl_user = True

    if crawl_user:
        progress(0, desc=f"Listing spaces for {target}...")
        targets, msg = list_user_spaces(target, hf_token=hf_token)
        log_lines.append(f"[list] {msg}")
        if not targets:
            return pd.DataFrame(), "❌ " + msg, "\n".join(log_lines), None, None
        targets = targets[: int(max_targets)]
    else:
        targets = [target]

    # Scan each target. A failure on one target is logged and must not abort
    # the remaining ones, hence the deliberately broad except.
    all_findings = []
    n = len(targets)
    for i, t in enumerate(targets):
        tag = f"[scan {i+1}/{n}] {t}"
        progress(i / n, desc=f"Scanning {i+1}/{n}: {t}")
        try:
            findings, scan_log = scan_repo(
                t, hf_token=hf_token, deep_history=deep_history,
                run_security=run_security, run_performance=run_performance,
                run_llm=run_llm, run_supply_chain=run_supply_chain,
                progress_cb=_scan_progress_cb(progress, i, n, t),
            )
            log_lines.extend(f"{tag} -> {line}" for line in scan_log)
            all_findings.extend(findings)
            if comment and findings and t.startswith("https://huggingface.co/spaces/"):
                cmt = comment_on_space(t, hf_token, findings)
                log_lines.append(f"[comment] {t} -> {cmt}")
        except Exception as e:
            log_lines.append(f"{tag} -> ERROR: {e}")
    progress(1.0, desc="Done")

    df = _findings_frame(all_findings)

    # One clock reading feeds the summary, the report metadata and the file
    # names so all three agree.
    _now = datetime.now(timezone.utc)
    stamp = _utc_stamp(_now)
    summary = _build_summary(all_findings, n, stamp)

    # HTML + SARIF reports
    scan_meta = {
        "target": target,
        "n_targets": n,
        "timestamp": stamp,
        "footer": "HF Security Scanner v4.0",
    }
    report_html, report_path, sarif_path = _write_reports(all_findings, scan_meta, _now)
    log_lines.append(f"[report] HTML → {report_path}")
    log_lines.append(f"[report] SARIF → {sarif_path}")
    return df, summary, "\n".join(log_lines), report_html, report_path
# =========================================================
# Gradio UI
# =========================================================
# Layout: header, inputs (target + token on the left, scan toggles on the
# right), a scan button, then the summary and three result tabs. The emoji in
# the labels are written as real UTF-8 characters; they had been mis-decoded.
with gr.Blocks(title="HF Security & Performance Scanner v4.0") as demo:
    gr.Markdown(
        "# 🔍 Hugging Face & Git Security + Performance + LLM Scanner v4.0\n"
        f"**Bootstrap status:** gitleaks={BOOTSTRAP_RESULT.get('gitleaks')}, "
        f"hadolint={BOOTSTRAP_RESULT.get('hadolint')}"
    )
    with gr.Row():
        with gr.Column(scale=2):
            target = gr.Textbox(
                label="Target",
                placeholder=("https://github.com/org/repo | "
                             "https://huggingface.co/spaces/user/space | hf-username"),
            )
            hf_token = gr.Textbox(
                label="HF Token (gated repos / commenting / private listing)",
                type="password", placeholder="hf_...",
            )
        with gr.Column(scale=1):
            run_security = gr.Checkbox(label="Security scan", value=True)
            run_performance = gr.Checkbox(label="Performance scan", value=True)
            run_llm = gr.Checkbox(
                label="LLM/Agent scan (Semgrep:LLM + Agent Audit)",
                value=True,
            )
            run_supply_chain = gr.Checkbox(
                label="Supply-chain scan (Semgrep:SupplyChain + socket)",
                value=True,
            )
            crawl_user = gr.Checkbox(label="Scan all HF Spaces of this user", value=False)
            comment = gr.Checkbox(label="Post discussion to HF Space", value=False)
            deep_history = gr.Checkbox(label="Deep scan (full git history)", value=False)
            max_targets = gr.Slider(1, 100, value=20, step=1,
                                    label="Max targets when crawling")
    btn = gr.Button("🚀 Scan", variant="primary", size="lg")
    summary = gr.Markdown()
    with gr.Tabs():
        with gr.TabItem("📊 Findings"):
            findings_df = gr.Dataframe(label="Findings", wrap=True, interactive=False)
        with gr.TabItem("📄 Report"):
            report_file = gr.File(label="Download HTML report")
            report_html = gr.HTML(label="Preview")
        with gr.TabItem("📋 Run log"):
            log = gr.Textbox(label="Log", lines=20, max_lines=60)
    # The input order must match run_scan's positional parameters, and the
    # output order must match its 5-tuple return value.
    btn.click(
        run_scan,
        inputs=[target, hf_token, comment, crawl_user, deep_history, max_targets,
                run_security, run_performance, run_llm, run_supply_chain],
        outputs=[findings_df, summary, log, report_html, report_file],
    )
if __name__ == "__main__":
    # Default to 127.0.0.1:7860 so Playwright tests target the expected port.
    # Passing server_name/server_port explicitly would otherwise override the
    # standard GRADIO_SERVER_NAME / GRADIO_SERVER_PORT variables, so honour
    # them here; this lets container hosting bind to another address.
    demo.launch(
        server_name=os.environ.get("GRADIO_SERVER_NAME", "127.0.0.1"),
        server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
        share=False,
    )