Spaces:

Chris4K
/

autoscan

Running

App Files Files Community

autoscan / app.py

Chris4K

Upload 384 files

a2a5bfd verified 16 days ago

raw

history blame contribute delete

9.12 kB

	"""HF Security & Performance Scanner v4.0 — modular.

	Entrypoint: launches the Gradio UI. All scanning logic lives in core/scanner.py,
	all rules in rules/, all reporting in report/.
	"""
	import json
	import os
	import sys
	import tempfile
	from datetime import datetime, timezone
	from pathlib import Path

	import gradio as gr
	import pandas as pd

	# Make sibling packages importable when running from any directory
	sys.path.insert(0, str(Path(__file__).parent))

	from core import (
	bootstrap_binaries,
	comment_on_space,
	have_binary,
	list_user_spaces,
	)
	from core.scanner import scan_repo
	from report import generate_html_report, generate_sarif

	# Bootstrap binaries on import (idempotent).
	# The result is kept at module level; its 'gitleaks' and 'hadolint' entries
	# (read with .get) are shown in the UI header and at the top of every run log.
	BOOTSTRAP_RESULT = bootstrap_binaries()



	# =========================================================
	# UI orchestration (multi-target, progress, report file)
	# =========================================================
	# Column order of the findings table; also the schema of the empty table.
	_FINDING_COLUMNS = [
	    "category", "severity", "confidence", "tool", "rule",
	    "file", "line", "message", "owasp", "remediation",
	]

	# External scanners whose availability is reported in the run log.
	_SCANNER_BINARIES = (
	    "semgrep", "bandit", "detect-secrets", "pip-audit",
	    "ruff", "gitleaks", "hadolint", "agent-audit",
	)


	def _utc_iso(moment):
	    """Format an aware UTC datetime as ISO 8601 with a single ``Z`` suffix.

	    ``isoformat()`` on an aware UTC datetime already ends in ``+00:00``;
	    appending ``Z`` to that yields a string with two zone designators.
	    """
	    return moment.isoformat().replace("+00:00", "Z")


	def _findings_frame(findings):
	    """Build the findings table, tolerating findings that lack some keys."""
	    if not findings:
	        return pd.DataFrame(columns=_FINDING_COLUMNS)
	    frame = pd.DataFrame(findings)
	    # A column no finding supplied is added empty instead of raising KeyError.
	    for column in _FINDING_COLUMNS:
	        if column not in frame.columns:
	            frame[column] = None
	    # List-valued OWASP tags are flattened to one comma-separated cell.
	    frame["owasp"] = frame["owasp"].apply(
	        lambda tags: ", ".join(map(str, tags)) if isinstance(tags, list) else tags
	    )
	    return frame[_FINDING_COLUMNS]


	def _tally(findings, key):
	    """Count findings per value of *key*; a missing key counts as ``unknown``."""
	    counts = {}
	    for item in findings:
	        # str() keeps the later sort well-defined even for mixed value types.
	        label = str(item.get(key, "unknown"))
	        counts[label] = counts.get(label, 0) + 1
	    return counts


	def _summary_part(title, counts):
	    """Render one ``Title: `k`=v, ...`` summary line, sorted by key."""
	    return f"{title}: " + ", ".join(f"`{k}`={v}" for k, v in sorted(counts.items()))


	def _write_reports(report_html, sarif_doc, stamp):
	    """Write the HTML and SARIF reports and return ``(html_path, sarif_path)``.

	    Both files go into a freshly created temp directory so two scans that
	    finish within the same second cannot overwrite each other's reports.
	    """
	    out_dir = tempfile.mkdtemp(prefix="scan_report_")
	    html_path = os.path.join(out_dir, f"scan_report_{stamp}.html")
	    sarif_path = os.path.join(out_dir, f"scan_report_{stamp}.sarif")
	    with open(html_path, "w", encoding="utf-8") as fh:
	        fh.write(report_html)
	    with open(sarif_path, "w", encoding="utf-8") as fh:
	        json.dump(sarif_doc, fh, indent=2)
	    return html_path, sarif_path


	def run_scan(target, hf_token, comment, crawl_user, deep_history, max_targets,
	             run_security, run_performance, run_llm, run_supply_chain,
	             progress=gr.Progress()):  # noqa: B008
	    """Scan one target, or every Space of a user, and build the UI outputs.

	    Args:
	        target: Repository URL, local directory, or bare HF username.
	        hf_token: Token forwarded to space listing, scanning and commenting.
	        comment: When true, ``comment_on_space`` is called for each scanned
	            ``https://huggingface.co/spaces/...`` target that has findings.
	        crawl_user: Treat ``target`` as a username and scan its Spaces. Turned
	            on automatically when ``target`` has no slash, no space, no
	            ``http`` prefix and is not an existing directory.
	        deep_history: Forwarded to ``scan_repo``.
	        max_targets: Upper bound on the number of crawled Spaces.
	        run_security, run_performance, run_llm, run_supply_chain: Scan
	            toggles forwarded to ``scan_repo``.
	        progress: Gradio progress reporter.

	    Returns:
	        A 5-tuple ``(findings_df, summary, log_text, report_html, report_path)``.
	        The last two are ``None`` when the scan stops before any target is
	        scanned (empty target, or no Spaces listed).
	    """
	    # Normalise first so None, whitespace and a lone "/" all count as empty.
	    target = (target or "").strip().rstrip("/")
	    if not target:
	        return pd.DataFrame(), "❌ No target provided", "", None, None

	    log_lines = [
	        f"[bootstrap] gitleaks: {BOOTSTRAP_RESULT.get('gitleaks')}, "
	        f"hadolint: {BOOTSTRAP_RESULT.get('hadolint')}"
	    ]

	    tools_status = {name: have_binary(name) for name in _SCANNER_BINARIES}
	    log_lines.append(
	        "[tools] " + ", ".join(f"{k}={'✓' if v else '✗'}"
	                               for k, v in tools_status.items())
	    )

	    # Resolve targets. Bare username (no slash, no http) → treat as HF crawl.
	    looks_like_username = (
	        not crawl_user
	        and not target.startswith("http")
	        and not os.path.isdir(target)
	        and "/" not in target
	        and " " not in target
	    )
	    if looks_like_username:
	        log_lines.append(f"[auto] '{target}' looks like an HF username; crawling its spaces")
	        crawl_user = True

	    if crawl_user:
	        progress(0, desc=f"Listing spaces for {target}...")
	        targets, msg = list_user_spaces(target, hf_token=hf_token)
	        log_lines.append(f"[list] {msg}")
	        if not targets:
	            return pd.DataFrame(), "❌ " + msg, "\n".join(log_lines), None, None
	        targets = targets[: int(max_targets)]
	    else:
	        targets = [target]

	    def _make_cb(idx, total, label):
	        """Map a per-target fraction onto the overall progress bar."""
	        def _cb(local_frac, desc=""):
	            progress((idx + local_frac) / total,
	                     desc=f"[{idx+1}/{total}] {label}: {desc}")
	        return _cb

	    # Scan each target; a failing target is logged and the rest still run.
	    all_findings = []
	    n = len(targets)
	    for i, t in enumerate(targets):
	        progress(i / n, desc=f"Scanning {i+1}/{n}: {t}")
	        try:
	            findings, scan_log = scan_repo(
	                t, hf_token=hf_token, deep_history=deep_history,
	                run_security=run_security, run_performance=run_performance,
	                run_llm=run_llm, run_supply_chain=run_supply_chain,
	                progress_cb=_make_cb(i, n, t),
	            )
	            log_lines.extend(f"[scan {i+1}/{n}] {t} -> {line}" for line in scan_log)
	            all_findings.extend(findings)

	            if comment and findings and t.startswith("https://huggingface.co/spaces/"):
	                cmt = comment_on_space(t, hf_token, findings)
	                log_lines.append(f"[comment] {t} -> {cmt}")
	        except Exception as e:
	            log_lines.append(f"[scan {i+1}/{n}] {t} -> ERROR: {e}")

	    progress(1.0, desc="Done")

	    df = _findings_frame(all_findings)

	    # One clock reading feeds the summary, the report metadata and file names.
	    _now = datetime.now(timezone.utc)
	    timestamp = _utc_iso(_now)

	    summary = " \n".join([
	        f"Targets: {n}",
	        f"Total findings: {len(all_findings)}",
	        _summary_part("By category", _tally(all_findings, "category")),
	        _summary_part("By severity", _tally(all_findings, "severity")),
	        _summary_part("By confidence", _tally(all_findings, "confidence")),
	        _summary_part("By tool", _tally(all_findings, "tool")),
	        f"Timestamp: {timestamp}",
	    ])

	    # HTML + SARIF reports
	    scan_meta = {
	        "target": target,
	        "n_targets": n,
	        "timestamp": timestamp,
	        "footer": "HF Security Scanner v4.0",
	    }
	    report_html = generate_html_report(all_findings, scan_meta)
	    sarif_doc = generate_sarif(all_findings, scan_meta)
	    report_path, sarif_path = _write_reports(
	        report_html, sarif_doc, _now.strftime('%Y%m%d_%H%M%S')
	    )
	    log_lines.append(f"[report] HTML → {report_path}")
	    log_lines.append(f"[report] SARIF → {sarif_path}")

	    return df, summary, "\n".join(log_lines), report_html, report_path


	# =========================================================
	# Gradio UI
	# =========================================================
	# Page layout: inputs and scan toggles on top, then the scan button, the
	# Markdown summary, and three result tabs (findings / report / run log).
	with gr.Blocks(title="HF Security & Performance Scanner v4.0") as demo:
	    gr.Markdown(
	        "# 🔐 Hugging Face & Git Security + Performance + LLM Scanner v4.0\n"
	        f"Bootstrap status: gitleaks={BOOTSTRAP_RESULT.get('gitleaks')}, "
	        f"hadolint={BOOTSTRAP_RESULT.get('hadolint')}"
	    )

	    with gr.Row():
	        with gr.Column(scale=2):
	            target = gr.Textbox(
	                label="Target",
	                # Plain "|" separators: "\|" is an invalid escape sequence and
	                # would show a literal backslash in the placeholder.
	                placeholder=("https://github.com/org/repo | "
	                             "https://huggingface.co/spaces/user/space | hf-username"),
	            )
	            hf_token = gr.Textbox(
	                label="HF Token (gated repos / commenting / private listing)",
	                type="password", placeholder="hf_...",
	            )
	        with gr.Column(scale=1):
	            run_security = gr.Checkbox(label="Security scan", value=True)
	            run_performance = gr.Checkbox(label="Performance scan", value=True)
	            run_llm = gr.Checkbox(
	                label="LLM/Agent scan (Semgrep:LLM + Agent Audit)",
	                value=True,
	            )
	            run_supply_chain = gr.Checkbox(
	                label="Supply-chain scan (Semgrep:SupplyChain + socket)",
	                value=True,
	            )
	            crawl_user = gr.Checkbox(label="Scan all HF Spaces of this user", value=False)
	            comment = gr.Checkbox(label="Post discussion to HF Space", value=False)
	            deep_history = gr.Checkbox(label="Deep scan (full git history)", value=False)
	            max_targets = gr.Slider(1, 100, value=20, step=1,
	                                    label="Max targets when crawling")

	    btn = gr.Button("🚀 Scan", variant="primary", size="lg")
	    summary = gr.Markdown()

	    with gr.Tabs():
	        with gr.TabItem("📊 Findings"):
	            findings_df = gr.Dataframe(label="Findings", wrap=True, interactive=False)
	        with gr.TabItem("📄 Report"):
	            report_file = gr.File(label="Download HTML report")
	            report_html = gr.HTML(label="Preview")
	        with gr.TabItem("📋 Run log"):
	            log = gr.Textbox(label="Log", lines=20, max_lines=60)

	    # Input order must match run_scan's positional parameters; output order
	    # must match the 5-tuple it returns (table, summary, log, HTML, file path).
	    btn.click(
	        run_scan,
	        inputs=[target, hf_token, comment, crawl_user, deep_history, max_targets,
	                run_security, run_performance, run_llm, run_supply_chain],
	        outputs=[findings_df, summary, log, report_html, report_file],
	    )

	if __name__ == "__main__":
	    # Pin host and port to 127.0.0.1:7860 so the Playwright tests always find
	    # the app at the address they expect; no public share link is created.
	    launch_options = {
	        "server_name": "127.0.0.1",
	        "server_port": 7860,
	        "share": False,
	    }
	    demo.launch(**launch_options)