Hush / app.py
kobinasam's picture
initial-commit
6c40084 verified
Raw
History Blame Contribute Delete
16.7 kB
"""
Hush
====
A local privacy airlock for cloud AI. Paste text, and Hush scrubs out the sensitive
parts (names, emails, phones, cards, IDs, places) into placeholders like [[NAME_1]].
Use the safe version anywhere. When you bring the AI's reply back, Hush pours your
real values back in, from a key that stays in your session and is never logged.
Track 1 style tool for the "Small Models Big Adventures" hackathon.
Model: openbmb/MiniCPM5-1B (1B, on-device, well under the 32B ceiling).
Why a SMALL model is the point: redacting secrets is the one job you must not hand to
a remote service, because that is the leak you are trying to prevent. A 1B model is
small enough to run entirely on your own machine, so nothing has to leave it.
Engineering: a deterministic layer (validated regex, Luhn-checked cards, range-checked
IPs) guarantees the structured PII no matter what. The small model only ADDS contextual
entities, and it can only ever redact strings that literally appear in your text, so it
cannot hallucinate secrets into existence. With no model (keeper mode) the deterministic
layer still does the whole structured job, so the tool is always useful.
"""
import os
import re
import html
import inspect
import gradio as gr
# Feature-detect the installed Gradio API by inspecting signatures, so the same
# file works whether `css` is accepted by gr.Blocks(...) or has to be passed to
# launch() instead, and whether `ssr_mode` / `show_copy_button` exist at all.
_BLOCKS_HAS_CSS = "css" in inspect.signature(gr.Blocks.__init__).parameters
_LAUNCH_HAS_SSR = "ssr_mode" in inspect.signature(gr.Blocks.launch).parameters
_TB_HAS_COPY = "show_copy_button" in inspect.signature(gr.Textbox.__init__).parameters
# Extra Textbox kwargs: request a copy button only when this Gradio supports it.
_COPY = {"show_copy_button": True} if _TB_HAS_COPY else {}
# Runtime configuration, each overridable through an environment variable.
MODEL_ID = os.environ.get("HUSH_MODEL", "openbmb/MiniCPM5-1B")
# HUSH_DEBUG is on for "1", "true" or "yes" (case-insensitive, whitespace ignored).
DEBUG = os.environ.get("HUSH_DEBUG", "").strip().lower() in {"1", "true", "yes"}
# Upper bound on characters processed per request (on_scrub truncates to it).
# NOTE: int() raises ValueError at import time if HUSH_MAX_CHARS is not an integer.
MAX_CHARS = int(os.environ.get("HUSH_MAX_CHARS", "6000"))
MODEL_CHARS = 3000 # cap what we hand the model, for speed
# ----------------------------------------------------------- redaction core ----
# Maps an internal entity type to the label used inside placeholders,
# e.g. PERSON -> [[NAME_1]], LOCATION -> [[PLACE_1]].
TYPE_LABEL = {"URL": "LINK", "EMAIL": "EMAIL", "SSN": "SSN", "IP": "IP",
              "CARD": "CARD", "PHONE": "PHONE", "PERSON": "NAME", "ORG": "ORG",
              "LOCATION": "PLACE", "AGE": "AGE", "ID": "ID"}
# Matches one placeholder such as [[NAME_1]]: uppercase label, underscore, number.
PLACEHOLDER_RE = re.compile(r"\[\[([A-Z]+)_(\d+)\]\]")
# Deterministic detectors as (type, compiled pattern) pairs. find_spans runs
# them in this order, and text claimed by an earlier detector cannot be claimed
# again by a later one.
DETECTORS = [
    # http:// or https:// followed by anything up to whitespace or a bracket.
    ("URL", re.compile(r"\bhttps?://[^\s<>()\[\]]+", re.I)),
    ("EMAIL", re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")),
    # ddd-dd-dddd
    ("SSN", re.compile(r"\b\d{3}-\d{2}-\d{4}\b")),
    # Dotted-quad candidate only; find_spans range-checks it with _valid_ipv4.
    ("IP", re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")),
    # 13-19 digits, each optionally followed by a space or dash; find_spans
    # keeps a match only if it passes _luhn_ok.
    ("CARD", re.compile(r"\b(?:\d[ -]?){13,19}\b")),
    # Optional +country code, a 3-digit group (optionally parenthesised), then
    # 3 and 4 digits, with optional space / dot / dash separators.
    ("PHONE", re.compile(
        r"(?<!\w)(?:\+?\d{1,3}[\s.\-]?)?(?:\(?\d{3}\)?[\s.\-]?)\d{3}[\s.\-]?\d{4}(?!\w)")),
]
# Punctuation that find_spans trims from the end of URL and EMAIL matches.
_TRAIL = ".,;:!?)]}'\""
def _luhn_ok(s):
digits = [int(c) for c in s if c.isdigit()]
if len(digits) < 13:
return False
total, parity = 0, len(digits) % 2
for i, d in enumerate(digits):
if i % 2 == parity:
d *= 2
if d > 9:
d -= 9
total += d
return total % 10 == 0
def _valid_ipv4(s):
parts = s.split(".")
return len(parts) == 4 and all(p.isdigit() and 0 <= int(p) <= 255 for p in parts)
def find_spans(text):
    """Locate structured PII in *text* using the deterministic DETECTORS.

    Detectors run in list order. A candidate is kept only if it passes its
    type-specific validation (IPv4 range check, Luhn check for cards) and
    does not overlap text already claimed by an earlier match.

    Args:
        text: the string to scan.

    Returns:
        A ``(spans, occupied)`` tuple. ``spans`` is a list of
        ``(start, end, type)`` tuples in detection order (not sorted by
        position). ``occupied`` is a list of bool, one per character of
        *text*, True where a span covers that character.
    """
    spans = []
    occupied = [False] * len(text)

    for typ, rx in DETECTORS:
        for mt in rx.finditer(text):
            a, b = mt.start(), mt.end()
            if typ in ("URL", "EMAIL"):
                # Sentence punctuation right after a link or address is not
                # part of it.
                while b > a and text[b - 1] in _TRAIL:
                    b -= 1
            elif typ == "CARD":
                # The card pattern lets every digit carry an optional space or
                # dash, so the match can swallow the separator that follows
                # the last digit. Give it back, otherwise the word after the
                # number gets glued to the placeholder.
                while b > a and text[b - 1] in " -":
                    b -= 1
            val = text[a:b]
            if typ == "IP" and not _valid_ipv4(val):
                continue
            if typ == "CARD" and not _luhn_ok(val):
                continue
            if b <= a or any(occupied[a:b]):
                continue
            spans.append((a, b, typ))
            occupied[a:b] = [True] * (b - a)
    return spans, occupied
def _add_model_spans(text, spans, occupied, entities):
for typ, val in entities:
val = (val or "").strip().strip('"').strip()
if len(val) < 2:
continue
pat = re.escape(val)
if val[0].isalnum():
pat = r"(?<!\w)" + pat
if val[-1].isalnum():
pat = pat + r"(?!\w)"
try:
rx = re.compile(pat, re.I)
except re.error:
continue
for mt in rx.finditer(text):
a, b = mt.start(), mt.end()
if b <= a or any(occupied[i] for i in range(a, b)):
continue
spans.append((a, b, typ))
for i in range(a, b):
occupied[i] = True
def redact(text, entities=None):
    """Replace sensitive spans in *text* with numbered placeholders.

    Spans come from the deterministic detectors and, when *entities* is
    non-empty, from the model-proposed ``(type, value)`` pairs as well.
    Repeated values of the same type share one placeholder; the comparison is
    case-insensitive and the first spelling encountered is the one stored.

    Returns:
        ``(redacted_text, mapping)`` where ``mapping`` maps each placeholder
        (e.g. ``[[NAME_1]]``) to the original text it replaced.
    """
    spans, occupied = find_spans(text)
    if entities:
        _add_model_spans(text, spans, occupied, entities)

    placeholder_for = {}   # (type, lowercased value) -> placeholder
    next_index = {}        # label -> last number handed out
    mapping = {}
    pieces = []
    cursor = 0
    for start, end, typ in sorted(spans):
        original = text[start:end]
        dedupe_key = (typ, original.lower())
        if dedupe_key not in placeholder_for:
            label = TYPE_LABEL.get(typ, typ)
            next_index[label] = next_index.get(label, 0) + 1
            token = f"[[{label}_{next_index[label]}]]"
            placeholder_for[dedupe_key] = token
            mapping[token] = original
        pieces.append(text[cursor:start])
        pieces.append(placeholder_for[dedupe_key])
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces), mapping
def restore(text, mapping):
    """Put original values back in place of the placeholders found in *text*.

    Placeholders missing from *mapping* are left as they are. A falsy *text*
    is treated as the empty string; a falsy *mapping* returns the text as is.
    """
    source = text or ""
    if not mapping:
        return source

    def _swap(match):
        token = match.group(0)
        return mapping.get(token, token)

    return PLACEHOLDER_RE.sub(_swap, source)
# ----------------------------------------------------------------- model -------
def _noop_gpu(*a, **k):
def wrap(fn):
return fn
return wrap(a[0]) if a and callable(a[0]) else wrap
# Pick the decorator applied to the model call: the real `spaces.GPU` only when
# SPACES_ZERO_GPU is "true"/"1" and the `spaces` package is usable, the no-op
# stand-in in every other case.
GPU = _noop_gpu
if os.environ.get("SPACES_ZERO_GPU", "").lower() in {"true", "1"}:
    try:
        import spaces
        GPU = spaces.GPU
    except Exception:  # noqa: BLE001
        GPU = _noop_gpu
# Model handles, assigned by load_model(); they stay None if loading fails.
_tokenizer = None
_model = None
# "keeper" = deterministic rules only; load_model() switches this to "model"
# once the model has been loaded.
MODE = "keeper"
# System prompt for the entity-listing task. The expected reply format
# ("TYPE: exact text", one per line, or NONE) is what parse_entities reads.
SYSTEM = (
    "/no_think\n"
    "You find personal or identifying details in text that someone would want to remove "
    "before sharing it publicly. Look for: people's names, organizations or companies, "
    "street addresses or specific place names, ages, and account or ID numbers. List each "
    "on its own line as TYPE: exact text, where TYPE is one of PERSON, ORG, LOCATION, AGE, "
    "ID. Copy the text EXACTLY as it appears. Do not invent anything. If there is nothing, "
    "write NONE."
)
# One worked example (user turn and assistant turn) sent ahead of the real input.
_EXAMPLE_IN = "Dr. Alice Reyes from Northwind Labs called about the Oslo trial."
_EXAMPLE_OUT = "PERSON: Alice Reyes\nORG: Northwind Labs\nLOCATION: Oslo"
def load_model():
    """Try to load the tokenizer and model named by MODEL_ID.

    On success the module globals ``_tokenizer`` and ``_model`` are set, the
    model is put in eval mode and ``MODE`` becomes "model". Any exception
    (missing packages, download failure, ...) is caught, reported on stdout,
    and leaves ``MODE`` as "keeper" so the app still runs with rules only.
    """
    global _tokenizer, _model, MODE
    try:
        # Imported here so a missing torch/transformers install only disables
        # the model instead of breaking the import of this module.
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer
        # NOTE: trust_remote_code=True executes code shipped with the model repo.
        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        _model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, torch_dtype=torch.float32, trust_remote_code=True)
        _model.eval()
        MODE = "model"
        print(f"[Hush] Loaded {MODEL_ID} -- model mode (deterministic + contextual).")
    except Exception as exc: # noqa: BLE001
        MODE = "keeper"
        print(f"[Hush] Could not load {MODEL_ID} ({exc}). Keeper mode (deterministic only).")
# Load once at import time so MODE is settled before the UI below is built.
load_model()
@GPU(duration=40)
def _model_entities(text):
    """Ask the loaded model for contextual entities in *text*.

    Builds a chat prompt (system prompt, one worked example, then at most
    MODEL_CHARS characters of *text*), generates up to 200 new tokens without
    sampling, and returns the reply parsed by parse_entities as a list of
    ``(type, value)`` tuples.

    Relies on the module globals ``_model`` and ``_tokenizer`` being set.
    """
    import torch
    # If a CUDA device is available and the model is not on it yet, move it
    # there as bfloat16.
    if torch.cuda.is_available() and next(_model.parameters()).device.type != "cuda":
        _model.to("cuda", dtype=torch.bfloat16)
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": _EXAMPLE_IN},
        {"role": "assistant", "content": _EXAMPLE_OUT},
        {"role": "user", "content": text[:MODEL_CHARS]},
    ]
    def _enc(extra):
        # Render the chat template to tensors; `extra` carries optional
        # template kwargs.
        return _tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt",
            return_dict=True, **extra)
    # Prefer disabling "thinking" output; retry without the flag if the call
    # rejects it with a TypeError.
    try:
        enc = _enc({"enable_thinking": False})
    except TypeError:
        enc = _enc({})
    enc = enc.to(_model.device)
    input_len = enc["input_ids"].shape[1]
    with torch.no_grad():
        out = _model.generate(**enc, max_new_tokens=200, do_sample=False,
                              pad_token_id=_tokenizer.eos_token_id)
    # Decode only the newly generated tokens, not the echoed prompt.
    raw = _tokenizer.decode(out[0][input_len:], skip_special_tokens=True)
    return parse_entities(raw)
def parse_entities(raw):
    """Parse the model's reply into a list of ``(TYPE, value)`` tuples.

    ``<think>...</think>`` sections (and an unterminated trailing ``<think>``)
    are dropped first. Each remaining line of the form ``TYPE: value`` or
    ``TYPE - value`` with TYPE in PERSON/ORG/LOCATION/AGE/ID (any case) yields
    one entity. Empty values, the literal NONE, and case-insensitive
    duplicates of an already seen (type, value) pair are skipped.
    """
    cleaned = re.sub(r"<think>.*?</think>", "", raw or "", flags=re.S | re.I)
    cleaned = re.sub(r"<think>.*$", "", cleaned, flags=re.S | re.I)
    line_rx = re.compile(r"\s*(PERSON|ORG|LOCATION|AGE|ID)\s*[:\-]\s*(.+)", re.I)

    found = []
    seen = set()
    for line in cleaned.splitlines():
        hit = line_rx.match(line)
        if hit is None:
            continue
        typ = hit.group(1).upper()
        val = hit.group(2).strip().strip('"').strip()
        if not val or val.upper() == "NONE":
            continue
        key = (typ, val.lower())
        if key in seen:
            continue
        seen.add(key)
        found.append((typ, val))
    return found
# --------------------------------------------------------------- rendering -----
def esc(s):
    """Return *s* converted to ``str`` and HTML-escaped (quotes included)."""
    as_text = str(s)
    return html.escape(as_text, quote=True)
def render_summary(mapping, note=""):
    """Build the HTML summary panel for a redaction result.

    Args:
        mapping: placeholder -> original value, as returned by redact.
        note: pre-built HTML appended inside the panel; inserted as is, not
            escaped. Ignored when *mapping* is empty.

    Returns:
        An HTML string: a "nothing found" panel for an empty mapping,
        otherwise a per-label count of placeholders plus a collapsible table
        of every placeholder and its original value (both HTML-escaped).
    """
    if not mapping:
        return ('<div class="sum empty">No obvious secrets found. The text is unchanged. '
                'Still, glance over it yourself before you share it.</div>')

    # Count placeholders per label: "[[NAME_2]]" -> "NAME".
    per_label = {}
    for placeholder in mapping:
        label = placeholder[2:-2].rsplit("_", 1)[0]
        per_label[label] = per_label.get(label, 0) + 1

    chip_html = "".join(
        f'<span class="chip">{esc(label)}{per_label[label]}</span>'
        for label in sorted(per_label))
    row_html = "".join(
        f'<tr><td><code>{esc(placeholder)}</code></td><td>{esc(original)}</td></tr>'
        for placeholder, original in mapping.items())

    parts = [
        f'<div class="sum"><div class="sum-h">Hushed {len(mapping)} item(s)</div>',
        f'<div class="chips">{chip_html}</div>',
        '<details><summary>show what was hidden (stays on this page only)</summary>',
        '<table class="map"><tr><th>placeholder</th><th>your original</th></tr>',
        f'{row_html}</table></details>{note}</div>',
    ]
    return "".join(parts)
def on_scrub(text, _state):
    """Handle the "Hush it" button: redact *text* and describe what was hidden.

    Args:
        text: the user's input (None is treated as empty).
        _state: the previous session mapping; not used.

    Returns:
        ``(redacted_text, summary_html, mapping)``. Blank input yields an
        empty result with a prompt to paste text. Input longer than MAX_CHARS
        is cut to that length and the summary says so. A failure inside the
        model call is printed and the deterministic result is still returned.
    """
    text = text or ""
    if not text.strip():
        prompt_html = '<div class="sum empty">Paste some text above, then press Hush.</div>'
        return "", prompt_html, {}

    note = ""
    if len(text) > MAX_CHARS:
        note = (f'<div class="sum-note">Only the first {MAX_CHARS} characters were '
                f'processed.</div>')
        text = text[:MAX_CHARS]

    entities = []
    if MODE == "model":
        try:
            entities = _model_entities(text)
        except Exception as exc:  # noqa: BLE001
            print(f"[Hush] model error: {exc}")
            entities = []

    redacted, mapping = redact(text, entities)
    summary_html = render_summary(mapping, note)
    return redacted, summary_html, mapping
def on_restore(ai_text, mapping):
    """Handle the "Restore my details" button.

    Returns *ai_text* with placeholders replaced from *mapping*; a missing
    text or mapping is treated as empty.
    """
    return restore(ai_text or "", mapping or {})
# Stylesheet for the whole page: loads the Fraunces/Spectral web fonts,
# overrides Gradio's theme variables (also under the `.dark` class), and styles
# the custom `hu-*`, `sum*`, `chip*` and `map` classes emitted by the HTML in
# this file. The last rule hides Gradio's own footer.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Fraunces:ital,opsz,wght@0,9..144,400;0,9..144,600;1,9..144,400&family=Spectral:ital,wght@0,400;0,500;1,400&display=swap');
:root{--paper:#eef1ee;--paper-2:#e3e8e4;--ink:#26302c;--ink-soft:#5c6b63;
--teal:#2f5d57;--amber:#9a6b2f;--line:#bcc8c1;--ok:#2f5d57;}
.gradio-container,.gradio-container.dark,.dark{
--body-background-fill:transparent;--background-fill-primary:#fbfdfb;--background-fill-secondary:#eef3ef;
--block-background-fill:#fbfdfb;--block-border-color:var(--line);--border-color-primary:var(--line);
--body-text-color:var(--ink);--body-text-color-subdued:var(--ink-soft);
--block-label-text-color:var(--teal);--block-title-text-color:var(--ink);
--block-label-background-fill:#e3ebe5;--block-title-background-fill:transparent;
--input-background-fill:#fbfdfb;--input-border-color:var(--line);--input-placeholder-color:var(--ink-soft);
--button-primary-background-fill:var(--teal);--button-primary-background-fill-hover:#244641;
--button-primary-text-color:#f4f8f5;--button-primary-border-color:#244641;
--button-secondary-background-fill:#e3ebe5;--button-secondary-background-fill-hover:#d6e0d9;
--button-secondary-text-color:var(--ink);--button-secondary-border-color:var(--line);
--color-accent:var(--amber);--color-accent-soft:#efe3cd;}
.gradio-container{background:radial-gradient(120% 80% at 80% -10%,#f3f6f3,var(--paper) 55%,var(--paper-2));
font-family:'Spectral',Georgia,serif !important;color:var(--ink) !important;max-width:920px !important;}
.gradio-container textarea,.gradio-container input[type="text"],.gradio-container input:not([type]){
background:#fbfdfb !important;color:var(--ink) !important;-webkit-text-fill-color:var(--ink) !important;border-color:var(--line) !important;font-family:ui-monospace,Menlo,Consolas,monospace !important;}
.gradio-container textarea::placeholder{color:var(--ink-soft) !important;-webkit-text-fill-color:var(--ink-soft) !important;opacity:1;}
.hu-title{font-family:'Fraunces',serif;font-weight:600;font-size:2.5rem;line-height:1;margin:.2rem 0 0;}
.hu-title em{font-style:italic;color:var(--teal);}
.hu-sub{font-style:italic;color:var(--ink-soft);margin:.35rem 0 1rem;font-size:1.05rem;}
.hu-mode{display:inline-block;font-size:.72rem;letter-spacing:.12em;text-transform:uppercase;color:var(--teal);border:1px solid var(--line);border-radius:999px;padding:.15rem .6rem;}
.hu-step{font-family:'Fraunces',serif;font-weight:600;color:var(--teal);margin:14px 0 4px;font-size:1.05rem;}
.sum{background:#fbfdfb;border:1px solid var(--line);border-left:3px solid var(--teal);border-radius:10px;padding:12px 14px;margin-top:6px;}
.sum.empty{color:var(--ink-soft);font-style:italic;border-left-color:var(--line);}
.sum-h{font-family:'Fraunces',serif;font-weight:600;color:var(--ink);margin-bottom:6px;}
.chips{display:flex;flex-wrap:wrap;gap:6px;}
.chip{font-size:.78rem;letter-spacing:.04em;background:#e3ebe5;color:var(--teal);border:1px solid var(--line);border-radius:999px;padding:.12rem .6rem;}
.sum details{margin-top:10px;}
.sum summary{cursor:pointer;color:var(--amber);font-size:.9rem;}
.map{width:100%;border-collapse:collapse;margin-top:8px;font-size:.9rem;}
.map th{text-align:left;color:var(--ink-soft);font-weight:400;border-bottom:1px solid var(--line);padding:4px 6px;}
.map td{border-bottom:1px solid var(--line);padding:4px 6px;vertical-align:top;}
.map code{background:#eef3ef;border-radius:4px;padding:.05rem .3rem;color:var(--teal);}
.sum-note{margin-top:8px;color:var(--ink-soft);font-size:.85rem;font-style:italic;}
.hu-foot{color:var(--ink-soft);font-size:.82rem;font-style:italic;text-align:center;margin-top:14px;}
footer{display:none !important;}
"""
# Keyword arguments for gr.Blocks(). css/theme are passed here only when this
# Gradio version's Blocks constructor accepts `css`; otherwise the launcher at
# the bottom of the file hands them to launch() instead.
_bk = {"title": "Hush"}
if _BLOCKS_HAS_CSS:
    _bk["css"] = CSS
    _bk["theme"] = gr.themes.Soft()
with gr.Blocks(**_bk) as demo:
    # Per-session placeholder -> original mapping, written by on_scrub and
    # read back by on_restore.
    state = gr.State({})
    # Badge text under the title. The separator is a middle dot; the file had
    # it mis-decoded as an unrelated CJK character.
    if MODE == "model":
        public = "MiniCPM5-1B · on-device + rules"
    else:
        public = "Hush · rules engine"
    # The raw mode name is only shown when HUSH_DEBUG is enabled.
    mode_label = f"{public} · [{MODE}]" if DEBUG else public
    gr.HTML(f"""
<div><div class="hu-title">Hush</div>
<div class="hu-sub">A local privacy airlock. Scrub secrets out before you paste into cloud AI; pour them back in after.</div>
<span class="hu-mode">{mode_label}</span></div>""")
    # Step 1: input text -> redacted text + summary panel.
    gr.HTML('<div class="hu-step">1 · Scrub your text</div>')
    src = gr.Textbox(placeholder="Paste the text you want to share... (names, emails, phone numbers, addresses, card and ID numbers will be hidden)",
                     lines=7, show_label=False)
    scrub = gr.Button("Hush it", variant="primary")
    out = gr.Textbox(label="Safe to share", lines=7, interactive=True, **_COPY)
    summary = gr.HTML(render_summary({}))
    # Step 2: AI reply containing placeholders -> reply with originals restored.
    gr.HTML('<div class="hu-step">2 · Bring the reply back</div>')
    reply = gr.Textbox(placeholder="Paste the AI's reply here (it will still contain the [[PLACEHOLDERS]])...",
                       lines=5, show_label=False)
    unscrub = gr.Button("Restore my details", variant="primary")
    restored = gr.Textbox(label="Your reply, with real details restored", lines=6,
                          interactive=True, **_COPY)
    gr.HTML('<div class="hu-foot">The key that maps placeholders to your details stays in '
            'this browser session and is never stored. Run Hush locally and nothing leaves your machine.</div>')
    # Event wiring has to happen inside the Blocks context.
    scrub.click(on_scrub, [src, state], [out, summary, state])
    unscrub.click(on_restore, [reply, state], restored)
if __name__ == "__main__":
    # css/theme go to launch() only when gr.Blocks() could not take them.
    _lk = {} if _BLOCKS_HAS_CSS else {"css": CSS, "theme": gr.themes.Soft()}
    # Turn server-side rendering off where this Gradio version has the option.
    if _LAUNCH_HAS_SSR:
        _lk["ssr_mode"] = False
    demo.queue(max_size=24).launch(**_lk)