"""HF Security & Performance Scanner v4.0 — modular. Entrypoint: launches the Gradio UI. All scanning logic lives in core/scanner.py, all rules in rules/, all reporting in report/. """ import json import os import sys import tempfile from datetime import datetime, timezone from pathlib import Path import gradio as gr import pandas as pd # Make sibling packages importable when running from any directory sys.path.insert(0, str(Path(__file__).parent)) from core import ( bootstrap_binaries, comment_on_space, have_binary, list_user_spaces, ) from core.scanner import scan_repo from report import generate_html_report, generate_sarif # Bootstrap binaries on import (idempotent) BOOTSTRAP_RESULT = bootstrap_binaries() # ========================================================= # UI orchestration (multi-target, progress, report file) # ========================================================= def run_scan(target, hf_token, comment, crawl_user, deep_history, max_targets, run_security, run_performance, run_llm, run_supply_chain, progress=gr.Progress()): # noqa: B008 if not target.strip(): return pd.DataFrame(), "❌ No target provided", "", None, None target = target.strip().rstrip("/") log_lines = [] log_lines.append( f"[bootstrap] gitleaks: {BOOTSTRAP_RESULT.get('gitleaks')}, " f"hadolint: {BOOTSTRAP_RESULT.get('hadolint')}" ) tools_status = { "semgrep": have_binary("semgrep"), "bandit": have_binary("bandit"), "detect-secrets": have_binary("detect-secrets"), "pip-audit": have_binary("pip-audit"), "ruff": have_binary("ruff"), "gitleaks": have_binary("gitleaks"), "hadolint": have_binary("hadolint"), "agent-audit": have_binary("agent-audit"), } log_lines.append( "[tools] " + ", ".join(f"{k}={'✓' if v else '✗'}" for k, v in tools_status.items()) ) # Resolve targets. Bare username (no slash, no http) → treat as HF crawl. looks_like_username = ( not crawl_user and not target.startswith("http") and not os.path.isdir(target) and "/" not in target and " " not in target ) if looks_like_username: log_lines.append(f"[auto] '{target}' looks like an HF username; crawling its spaces") crawl_user = True if crawl_user: progress(0, desc=f"Listing spaces for {target}...") targets, msg = list_user_spaces(target, hf_token=hf_token) log_lines.append(f"[list] {msg}") if not targets: return pd.DataFrame(), "❌ " + msg, "\n".join(log_lines), None, None targets = targets[: int(max_targets)] else: targets = [target] # Scan each target all_findings = [] n = len(targets) for i, t in enumerate(targets): progress(i / n, desc=f"Scanning {i+1}/{n}: {t}") def _make_cb(idx, total, label): def _cb(local_frac, desc=""): progress((idx + local_frac) / total, desc=f"[{idx+1}/{total}] {label}: {desc}") return _cb try: findings, scan_log = scan_repo( t, hf_token=hf_token, deep_history=deep_history, run_security=run_security, run_performance=run_performance, run_llm=run_llm, run_supply_chain=run_supply_chain, progress_cb=_make_cb(i, n, t), ) log_lines.extend(f"[scan {i+1}/{n}] {t} -> {line}" for line in scan_log) all_findings.extend(findings) if comment and findings and t.startswith("https://huggingface.co/spaces/"): cmt = comment_on_space(t, hf_token, findings) log_lines.append(f"[comment] {t} -> {cmt}") except Exception as e: log_lines.append(f"[scan {i+1}/{n}] {t} -> ERROR: {e}") progress(1.0, desc="Done") # Findings DataFrame if all_findings: df = pd.DataFrame(all_findings) df["owasp"] = df["owasp"].apply( lambda x: ", ".join(x) if isinstance(x, list) else x ) df = df[["category", "severity", "confidence", "tool", "rule", "file", "line", "message", "owasp", "remediation"]] else: df = pd.DataFrame(columns=[ "category", "severity", "confidence", "tool", "rule", "file", "line", "message", "owasp", "remediation", ]) # Summary by_sev, by_tool, by_conf, by_cat = {}, {}, {}, {} for f in all_findings: by_sev[f["severity"]] = by_sev.get(f["severity"], 0) + 1 by_tool[f["tool"]] = by_tool.get(f["tool"], 0) + 1 by_conf[f["confidence"]] = by_conf.get(f["confidence"], 0) + 1 by_cat[f["category"]] = by_cat.get(f["category"], 0) + 1 summary = ( f"**Targets:** {n} \n" f"**Total findings:** {len(all_findings)} \n" f"**By category:** " + ", ".join(f"`{k}`={v}" for k, v in sorted(by_cat.items())) + " \n" f"**By severity:** " + ", ".join(f"`{k}`={v}" for k, v in sorted(by_sev.items())) + " \n" f"**By confidence:** " + ", ".join(f"`{k}`={v}" for k, v in sorted(by_conf.items())) + " \n" f"**By tool:** " + ", ".join(f"`{k}`={v}" for k, v in sorted(by_tool.items())) + " \n" f"**Timestamp:** {datetime.now(timezone.utc).isoformat()}Z" ) # HTML + SARIF reports _now = datetime.now(timezone.utc) scan_meta = { "target": target, "n_targets": n, "timestamp": _now.isoformat() + "Z", "footer": "HF Security Scanner v4.0", } report_html = generate_html_report(all_findings, scan_meta) _ts = _now.strftime('%Y%m%d_%H%M%S') report_path = os.path.join(tempfile.gettempdir(), f"scan_report_{_ts}.html") with open(report_path, "w", encoding="utf-8") as fh: fh.write(report_html) sarif_doc = generate_sarif(all_findings, scan_meta) sarif_path = os.path.join(tempfile.gettempdir(), f"scan_report_{_ts}.sarif") with open(sarif_path, "w", encoding="utf-8") as fh: json.dump(sarif_doc, fh, indent=2) log_lines.append(f"[report] HTML → {report_path}") log_lines.append(f"[report] SARIF → {sarif_path}") return df, summary, "\n".join(log_lines), report_html, report_path # ========================================================= # Gradio UI # ========================================================= with gr.Blocks(title="HF Security & Performance Scanner v4.0") as demo: gr.Markdown( "# 🔐 Hugging Face & Git Security + Performance + LLM Scanner v4.0\n" f"**Bootstrap status:** gitleaks={BOOTSTRAP_RESULT.get('gitleaks')}, " f"hadolint={BOOTSTRAP_RESULT.get('hadolint')}" ) with gr.Row(): with gr.Column(scale=2): target = gr.Textbox( label="Target", placeholder=("https://github.com/org/repo | " "https://huggingface.co/spaces/user/space | hf-username"), ) hf_token = gr.Textbox( label="HF Token (gated repos / commenting / private listing)", type="password", placeholder="hf_...", ) with gr.Column(scale=1): run_security = gr.Checkbox(label="Security scan", value=True) run_performance = gr.Checkbox(label="Performance scan", value=True) run_llm = gr.Checkbox( label="LLM/Agent scan (Semgrep:LLM + Agent Audit)", value=True, ) run_supply_chain = gr.Checkbox( label="Supply-chain scan (Semgrep:SupplyChain + socket)", value=True, ) crawl_user = gr.Checkbox(label="Scan all HF Spaces of this user", value=False) comment = gr.Checkbox(label="Post discussion to HF Space", value=False) deep_history = gr.Checkbox(label="Deep scan (full git history)", value=False) max_targets = gr.Slider(1, 100, value=20, step=1, label="Max targets when crawling") btn = gr.Button("🚀 Scan", variant="primary", size="lg") summary = gr.Markdown() with gr.Tabs(): with gr.TabItem("📊 Findings"): findings_df = gr.Dataframe(label="Findings", wrap=True, interactive=False) with gr.TabItem("📄 Report"): report_file = gr.File(label="Download HTML report") report_html = gr.HTML(label="Preview") with gr.TabItem("📋 Run log"): log = gr.Textbox(label="Log", lines=20, max_lines=60) btn.click( run_scan, inputs=[target, hf_token, comment, crawl_user, deep_history, max_targets, run_security, run_performance, run_llm, run_supply_chain], outputs=[findings_df, summary, log, report_html, report_file], ) if __name__ == "__main__": # Explicitly bind to 127.0.0.1:7860 so Playwright tests target the expected port demo.launch(server_name="127.0.0.1", server_port=7860, share=False)