"""HF Security & Performance Scanner v4.0 — modular.

Entrypoint: launches the Gradio UI. All scanning logic lives in core/scanner.py,
all rules in rules/, all reporting in report/.
"""
| import json |
| import os |
| import sys |
| import tempfile |
| from datetime import datetime, timezone |
| from pathlib import Path |
|
|
| import gradio as gr |
| import pandas as pd |
|
|
| |
# Put this file's own directory at the front of the import path so the
# project packages imported just below (core, report) resolve no matter
# which working directory the app is started from.
_APP_DIR = Path(__file__).parent
sys.path.insert(0, str(_APP_DIR))

# These imports must come after the sys.path tweak above.
from core import bootstrap_binaries, comment_on_space, have_binary, list_user_spaces  # noqa: E402
from core.scanner import scan_repo  # noqa: E402
from report import generate_html_report, generate_sarif  # noqa: E402

# Executed once at import time. The result is read elsewhere in this module
# via .get('gitleaks') and .get('hadolint'), so it is a mapping keyed by tool
# name (presumably the install/lookup status of each binary - confirm against
# core.bootstrap_binaries).
BOOTSTRAP_RESULT = bootstrap_binaries()
|
|
|
|
|
|
| |
| |
| |
# Column order of the findings table returned to the UI.
_FINDING_COLUMNS = [
    "category", "severity", "confidence", "tool", "rule",
    "file", "line", "message", "owasp", "remediation",
]

# Scanner executables whose availability is reported in the run log.
_TOOL_BINARIES = (
    "semgrep", "bandit", "detect-secrets", "pip-audit",
    "ruff", "gitleaks", "hadolint", "agent-audit",
)


def _utc_stamp(moment):
    """Return the aware datetime *moment* as ISO-8601 UTC ending in a single ``Z``.

    ``isoformat()`` on an aware UTC datetime already ends in ``+00:00``;
    appending ``"Z"`` on top of that yields the malformed ``+00:00Z``.
    """
    return moment.astimezone(timezone.utc).isoformat().replace("+00:00", "Z")


def _tally(findings, key):
    """Count findings per value of *key*; a missing key is counted as ``unknown``."""
    from collections import Counter

    # str() keeps the keys mutually sortable even if a tool emits None/ints.
    return Counter(str(finding.get(key, "unknown")) for finding in findings)


def _format_counts(counts):
    """Render a counter as ```key`=n`` pairs, sorted by key."""
    return ", ".join(f"`{k}`={v}" for k, v in sorted(counts.items()))


def _findings_table(findings):
    """Build the findings DataFrame with the fixed ``_FINDING_COLUMNS`` layout."""
    if not findings:
        return pd.DataFrame(columns=_FINDING_COLUMNS)
    # reindex() tolerates findings that lack a column (filled with NaN)
    # instead of raising KeyError like a plain column selection would.
    table = pd.DataFrame(findings).reindex(columns=_FINDING_COLUMNS)
    table["owasp"] = table["owasp"].apply(
        lambda refs: ", ".join(map(str, refs)) if isinstance(refs, list) else refs
    )
    return table


def _write_reports(report_html, sarif_doc, stamp):
    """Write the HTML and SARIF reports; return ``(html_path, sarif_path)``.

    The files go into a freshly created private directory so that two scans
    finishing within the same second cannot overwrite each other's reports,
    and so the paths are not predictable inside the shared temp directory.
    """
    out_dir = tempfile.mkdtemp(prefix=f"scan_report_{stamp}_")
    html_path = os.path.join(out_dir, f"scan_report_{stamp}.html")
    sarif_path = os.path.join(out_dir, f"scan_report_{stamp}.sarif")
    with open(html_path, "w", encoding="utf-8") as fh:
        fh.write(report_html)
    with open(sarif_path, "w", encoding="utf-8") as fh:
        json.dump(sarif_doc, fh, indent=2)
    return html_path, sarif_path


def run_scan(target, hf_token, comment, crawl_user, deep_history, max_targets,
             run_security, run_performance, run_llm, run_supply_chain,
             progress=gr.Progress()):
    """Scan one target (or every Space of an HF user) and build the UI outputs.

    Args:
        target: Repo URL, local directory, or HF username.
        hf_token: Token forwarded to the listing, scanning and commenting helpers.
        comment: When true, post the findings to each scanned HF Space URL.
        crawl_user: Treat *target* as a username and scan its Spaces.
        deep_history: Forwarded to ``scan_repo``.
        max_targets: Upper bound on the number of Spaces scanned when crawling.
        run_security, run_performance, run_llm, run_supply_chain: Scan
            families forwarded to ``scan_repo``.
        progress: Gradio progress tracker.

    Returns:
        A 5-tuple ``(findings DataFrame, Markdown summary, log text,
        report HTML, HTML report path)``. The last two are ``None`` when the
        scan could not start (no target / no Spaces found).

    NOTE(review): the 'β' glyphs in several user-facing strings below look
    like mis-encoded symbols; they are kept byte-identical until the intended
    characters are confirmed.
    """
    # Normalise first so that None, whitespace-only and "/" all count as empty.
    target = (target or "").strip().rstrip("/")
    if not target:
        return pd.DataFrame(), "β No target provided", "", None, None

    log_lines = [
        f"[bootstrap] gitleaks: {BOOTSTRAP_RESULT.get('gitleaks')}, "
        f"hadolint: {BOOTSTRAP_RESULT.get('hadolint')}",
        # Distinct marks for present/missing; the previous line printed the
        # same glyph in both cases, making the status unreadable.
        "[tools] " + ", ".join(
            f"{name}={'✓' if have_binary(name) else '✗'}" for name in _TOOL_BINARIES
        ),
    ]

    # A bare word that is neither a URL, a local directory nor an
    # "owner/name" pair is treated as an HF username.
    looks_like_username = (
        not crawl_user
        and not target.startswith("http")
        and not os.path.isdir(target)
        and "/" not in target
        and " " not in target
    )
    if looks_like_username:
        log_lines.append(f"[auto] '{target}' looks like an HF username; crawling its spaces")
        crawl_user = True

    if crawl_user:
        progress(0, desc=f"Listing spaces for {target}...")
        targets, msg = list_user_spaces(target, hf_token=hf_token)
        log_lines.append(f"[list] {msg}")
        if not targets:
            return pd.DataFrame(), "β " + msg, "\n".join(log_lines), None, None
        targets = targets[: int(max_targets)]
    else:
        targets = [target]

    all_findings = []
    n = len(targets)

    def _progress_for(idx, label):
        """Map a per-target fraction in [0, 1] onto the overall progress bar."""
        def _cb(local_frac, desc=""):
            progress((idx + local_frac) / n,
                     desc=f"[{idx+1}/{n}] {label}: {desc}")
        return _cb

    for i, t in enumerate(targets):
        progress(i / n, desc=f"Scanning {i+1}/{n}: {t}")
        # Best-effort per target: one failing repo is logged and must not
        # abort the remaining scans.
        try:
            findings, scan_log = scan_repo(
                t, hf_token=hf_token, deep_history=deep_history,
                run_security=run_security, run_performance=run_performance,
                run_llm=run_llm, run_supply_chain=run_supply_chain,
                progress_cb=_progress_for(i, t),
            )
            log_lines.extend(f"[scan {i+1}/{n}] {t} -> {line}" for line in scan_log)
            all_findings.extend(findings)

            if comment and findings and t.startswith("https://huggingface.co/spaces/"):
                cmt = comment_on_space(t, hf_token, findings)
                log_lines.append(f"[comment] {t} -> {cmt}")
        except Exception as e:
            log_lines.append(f"[scan {i+1}/{n}] {t} -> ERROR: {e}")

    progress(1.0, desc="Done")

    df = _findings_table(all_findings)

    # One clock reading shared by the summary, the report metadata and the
    # report file names.
    now = datetime.now(timezone.utc)
    timestamp = _utc_stamp(now)

    # Two trailing spaces before the newline form a Markdown hard line break;
    # with a single space all fields collapse into one paragraph.
    summary = "  \n".join([
        f"**Targets:** {n}",
        f"**Total findings:** {len(all_findings)}",
        f"**By category:** {_format_counts(_tally(all_findings, 'category'))}",
        f"**By severity:** {_format_counts(_tally(all_findings, 'severity'))}",
        f"**By confidence:** {_format_counts(_tally(all_findings, 'confidence'))}",
        f"**By tool:** {_format_counts(_tally(all_findings, 'tool'))}",
        f"**Timestamp:** {timestamp}",
    ])

    scan_meta = {
        "target": target,
        "n_targets": n,
        "timestamp": timestamp,
        "footer": "HF Security Scanner v4.0",
    }
    report_html = generate_html_report(all_findings, scan_meta)
    sarif_doc = generate_sarif(all_findings, scan_meta)
    report_path, sarif_path = _write_reports(
        report_html, sarif_doc, now.strftime("%Y%m%d_%H%M%S")
    )
    log_lines.append(f"[report] HTML β {report_path}")
    log_lines.append(f"[report] SARIF β {sarif_path}")

    return df, summary, "\n".join(log_lines), report_html, report_path
|
|
|
|
| |
| |
| |
# Gradio UI definition. Components are created in the same order as before,
# because creation order inside the layout contexts determines placement.
with gr.Blocks(title="HF Security & Performance Scanner v4.0") as demo:
    # Page header, including the binary bootstrap status captured at import.
    header_md = (
        "# π Hugging Face & Git Security + Performance + LLM Scanner v4.0\n"
        f"**Bootstrap status:** gitleaks={BOOTSTRAP_RESULT.get('gitleaks')}, "
        f"hadolint={BOOTSTRAP_RESULT.get('hadolint')}"
    )
    gr.Markdown(header_md)

    with gr.Row():
        # Left column: what to scan and the credentials to use.
        with gr.Column(scale=2):
            target_hint = (
                "https://github.com/org/repo | "
                "https://huggingface.co/spaces/user/space | hf-username"
            )
            target = gr.Textbox(label="Target", placeholder=target_hint)
            hf_token = gr.Textbox(
                label="HF Token (gated repos / commenting / private listing)",
                type="password",
                placeholder="hf_...",
            )
        # Right column: scan families and crawl/report options.
        with gr.Column(scale=1):
            run_security = gr.Checkbox(label="Security scan", value=True)
            run_performance = gr.Checkbox(label="Performance scan", value=True)
            run_llm = gr.Checkbox(
                label="LLM/Agent scan (Semgrep:LLM + Agent Audit)", value=True
            )
            run_supply_chain = gr.Checkbox(
                label="Supply-chain scan (Semgrep:SupplyChain + socket)", value=True
            )
            crawl_user = gr.Checkbox(label="Scan all HF Spaces of this user", value=False)
            comment = gr.Checkbox(label="Post discussion to HF Space", value=False)
            deep_history = gr.Checkbox(label="Deep scan (full git history)", value=False)
            max_targets = gr.Slider(
                minimum=1, maximum=100, value=20, step=1,
                label="Max targets when crawling",
            )

    btn = gr.Button("π Scan", variant="primary", size="lg")
    summary = gr.Markdown()

    with gr.Tabs():
        with gr.TabItem("π Findings"):
            findings_df = gr.Dataframe(label="Findings", wrap=True, interactive=False)
        with gr.TabItem("π Report"):
            report_file = gr.File(label="Download HTML report")
            report_html = gr.HTML(label="Preview")
        with gr.TabItem("π Run log"):
            log = gr.Textbox(label="Log", lines=20, max_lines=60)

    # Input order must match run_scan's positional parameters; output order
    # must match the 5-tuple it returns.
    scan_inputs = [
        target, hf_token, comment, crawl_user, deep_history, max_targets,
        run_security, run_performance, run_llm, run_supply_chain,
    ]
    scan_outputs = [findings_df, summary, log, report_html, report_file]
    btn.click(fn=run_scan, inputs=scan_inputs, outputs=scan_outputs)
|
|
if __name__ == "__main__":
    # Serve on the loopback interface only, on port 7860, with no public
    # share link.
    launch_options = {
        "server_name": "127.0.0.1",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)
|
|