Spaces:
Sleeping
Sleeping
| """ | |
| Hush | |
| ==== | |
| A local privacy airlock for cloud AI. Paste text, and Hush scrubs out the sensitive | |
| parts (names, emails, phones, cards, IDs, places) into placeholders like [[NAME_1]]. | |
| Use the safe version anywhere. When you bring the AI's reply back, Hush pours your | |
| real values back in, from a key that stays in your session and is never logged. | |
| Track 1 style tool for the "Small Models Big Adventures" hackathon. | |
| Model: openbmb/MiniCPM5-1B (1B, on-device, well under the 32B ceiling). | |
| Why a SMALL model is the point: redacting secrets is the one job you must not hand to | |
| a remote service, because that is the leak you are trying to prevent. A 1B model is | |
| small enough to run entirely on your own machine, so nothing has to leave it. | |
| Engineering: a deterministic layer (validated regex, Luhn-checked cards, range-checked | |
| IPs) guarantees the structured PII no matter what. The small model only ADDS contextual | |
| entities, and it can only ever redact strings that literally appear in your text, so it | |
| cannot hallucinate secrets into existence. With no model (keeper mode) the deterministic | |
| layer still does the whole structured job, so the tool is always useful. | |
| """ | |
| import os | |
| import re | |
| import html | |
| import inspect | |
| import gradio as gr | |
# Feature-detect the installed Gradio API so one script runs across versions.
# `css` (and the theme) go to gr.Blocks(...) when its constructor accepts `css`,
# otherwise they are passed to launch() instead.
_BLOCKS_HAS_CSS = "css" in inspect.signature(gr.Blocks.__init__).parameters
# `ssr_mode` is only passed to launch() when that parameter exists.
_LAUNCH_HAS_SSR = "ssr_mode" in inspect.signature(gr.Blocks.launch).parameters
# Copy buttons are only requested when gr.Textbox supports `show_copy_button`.
_TB_HAS_COPY = "show_copy_button" in inspect.signature(gr.Textbox.__init__).parameters
_COPY = {"show_copy_button": True} if _TB_HAS_COPY else {}
# Runtime configuration, overridable through environment variables.
MODEL_ID = os.environ.get("HUSH_MODEL", "openbmb/MiniCPM5-1B")
# When true, the header badge also shows the internal MODE value.
DEBUG = os.environ.get("HUSH_DEBUG", "").strip().lower() in {"1", "true", "yes"}
# Input longer than this many characters is truncated before processing.
MAX_CHARS = int(os.environ.get("HUSH_MAX_CHARS", "6000"))
MODEL_CHARS = 3000 # cap what we hand the model, for speed
# ----------------------------------------------------------- redaction core ----
# Maps an internal entity type to the label used inside its [[LABEL_n]] placeholder.
TYPE_LABEL = {"URL": "LINK", "EMAIL": "EMAIL", "SSN": "SSN", "IP": "IP",
              "CARD": "CARD", "PHONE": "PHONE", "PERSON": "NAME", "ORG": "ORG",
              "LOCATION": "PLACE", "AGE": "AGE", "ID": "ID"}
# Matches one placeholder; group 1 is the label, group 2 the running number.
PLACEHOLDER_RE = re.compile(r"\[\[([A-Z]+)_(\d+)\]\]")
# Deterministic detectors as (type, compiled pattern) pairs. Order matters:
# find_spans walks this list top to bottom and an earlier detector keeps any
# characters it has already claimed.
DETECTORS = [
    ("URL", re.compile(r"\bhttps?://[^\s<>()\[\]]+", re.I)),
    ("EMAIL", re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")),
    ("SSN", re.compile(r"\b\d{3}-\d{2}-\d{4}\b")),
    ("IP", re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")),
    # 13-19 digits with at most one space/hyphen between neighbours. The match
    # starts AND ends on a digit, so a separator that merely follows the number
    # (e.g. the space before the next word) is never swallowed into the span.
    ("CARD", re.compile(r"\b\d(?:[ -]?\d){12,18}\b")),
    ("PHONE", re.compile(
        r"(?<!\w)(?:\+?\d{1,3}[\s.\-]?)?(?:\(?\d{3}\)?[\s.\-]?)\d{3}[\s.\-]?\d{4}(?!\w)")),
]
# Punctuation trimmed from the end of URL and EMAIL matches.
_TRAIL = ".,;:!?)]}'\""
| def _luhn_ok(s): | |
| digits = [int(c) for c in s if c.isdigit()] | |
| if len(digits) < 13: | |
| return False | |
| total, parity = 0, len(digits) % 2 | |
| for i, d in enumerate(digits): | |
| if i % 2 == parity: | |
| d *= 2 | |
| if d > 9: | |
| d -= 9 | |
| total += d | |
| return total % 10 == 0 | |
| def _valid_ipv4(s): | |
| parts = s.split(".") | |
| return len(parts) == 4 and all(p.isdigit() and 0 <= int(p) <= 255 for p in parts) | |
def find_spans(text):
    """Run the deterministic detectors over *text*.

    Returns ``(spans, occupied)``: ``spans`` is a list of ``(start, end, type)``
    tuples in detection order, and ``occupied`` is a per-character list of
    booleans marking every index covered by an accepted span.
    """
    found = []
    taken = [False] * len(text)
    for kind, pattern in DETECTORS:
        for hit in pattern.finditer(text):
            start = hit.start()
            value = hit.group()
            if kind in ("URL", "EMAIL"):  # trim trailing punctuation
                value = value.rstrip(_TRAIL)
            end = start + len(value)
            # Structured types must also pass their validator.
            if kind == "IP" and not _valid_ipv4(value):
                continue
            if kind == "CARD" and not _luhn_ok(value):
                continue
            # Skip empty matches and anything overlapping an earlier span.
            if end <= start or any(taken[start:end]):
                continue
            found.append((start, end, kind))
            taken[start:end] = [True] * (end - start)
    return found, taken
| def _add_model_spans(text, spans, occupied, entities): | |
| for typ, val in entities: | |
| val = (val or "").strip().strip('"').strip() | |
| if len(val) < 2: | |
| continue | |
| pat = re.escape(val) | |
| if val[0].isalnum(): | |
| pat = r"(?<!\w)" + pat | |
| if val[-1].isalnum(): | |
| pat = pat + r"(?!\w)" | |
| try: | |
| rx = re.compile(pat, re.I) | |
| except re.error: | |
| continue | |
| for mt in rx.finditer(text): | |
| a, b = mt.start(), mt.end() | |
| if b <= a or any(occupied[i] for i in range(a, b)): | |
| continue | |
| spans.append((a, b, typ)) | |
| for i in range(a, b): | |
| occupied[i] = True | |
def redact(text, entities=None):
    """Replace detected sensitive spans in *text* with ``[[LABEL_n]]`` placeholders.

    Deterministic spans are always found; *entities* (``(type, value)`` pairs)
    adds model-suggested spans when given. Repeated values of the same type
    (compared case-insensitively) share one placeholder.

    Returns ``(redacted_text, mapping)`` where ``mapping`` maps each
    placeholder to the first original value it replaced.
    """
    found, taken = find_spans(text)
    if entities:
        _add_model_spans(text, found, taken, entities)

    placeholder_for = {}   # (type, lowercased value) -> placeholder
    next_number = {}       # label -> last number handed out
    mapping = {}           # placeholder -> original text
    pieces = []
    cursor = 0
    for start, end, kind in sorted(found):
        original = text[start:end]
        dedupe_key = (kind, original.lower())
        if dedupe_key not in placeholder_for:
            label = TYPE_LABEL.get(kind, kind)
            next_number[label] = next_number.get(label, 0) + 1
            token = f"[[{label}_{next_number[label]}]]"
            placeholder_for[dedupe_key] = token
            mapping[token] = original
        pieces.append(text[cursor:start])
        pieces.append(placeholder_for[dedupe_key])
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces), mapping
def restore(text, mapping):
    """Put original values back in place of the placeholders found in *text*.

    A placeholder that is not a key of *mapping* is left as it is. ``None``
    text is treated as an empty string.
    """
    source = text or ""
    if not mapping:
        return source

    def _swap(match):
        token = match.group(0)
        return mapping.get(token, token)

    return PLACEHOLDER_RE.sub(_swap, source)
| # ----------------------------------------------------------------- model ------- | |
| def _noop_gpu(*a, **k): | |
| def wrap(fn): | |
| return fn | |
| return wrap(a[0]) if a and callable(a[0]) else wrap | |
# Default to the do-nothing decorator; switch to spaces.GPU only when the
# SPACES_ZERO_GPU environment flag is set and the `spaces` package is usable.
GPU = _noop_gpu
if os.environ.get("SPACES_ZERO_GPU", "").lower() in {"true", "1"}:
    try:
        import spaces
        GPU = spaces.GPU
    except Exception:  # noqa: BLE001
        GPU = _noop_gpu
# Tokenizer and model handles; filled in by load_model() when loading succeeds.
_tokenizer = None
_model = None
# "model" once the model has loaded, otherwise "keeper" (deterministic rules only).
MODE = "keeper"
# System prompt sent to the model. The output format it requests
# ("TYPE: exact text", or NONE) is what parse_entities() reads back.
# NOTE(review): the leading "/no_think" looks like a model-specific switch to
# suppress reasoning output -- confirm against the model's documentation.
SYSTEM = (
    "/no_think\n"
    "You find personal or identifying details in text that someone would want to remove "
    "before sharing it publicly. Look for: people's names, organizations or companies, "
    "street addresses or specific place names, ages, and account or ID numbers. List each "
    "on its own line as TYPE: exact text, where TYPE is one of PERSON, ORG, LOCATION, AGE, "
    "ID. Copy the text EXACTLY as it appears. Do not invent anything. If there is nothing, "
    "write NONE."
)
# One worked example (user turn and assistant turn) placed before the real input.
_EXAMPLE_IN = "Dr. Alice Reyes from Northwind Labs called about the Oslo trial."
_EXAMPLE_OUT = "PERSON: Alice Reyes\nORG: Northwind Labs\nLOCATION: Oslo"
def load_model():
    """Try to load the tokenizer and model named by MODEL_ID.

    On success the module globals ``_tokenizer`` and ``_model`` are set, the
    model is put in eval mode and ``MODE`` becomes "model". On any exception
    (including a missing torch/transformers install) ``MODE`` is set to
    "keeper" and the reason is printed; nothing is raised.
    """
    global _tokenizer, _model, MODE
    try:
        # Imported here so a missing dependency lands in the except below
        # instead of breaking the module import.
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer
        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        _model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, torch_dtype=torch.float32, trust_remote_code=True)
        _model.eval()
        MODE = "model"
        print(f"[Hush] Loaded {MODEL_ID} -- model mode (deterministic + contextual).")
    except Exception as exc:  # noqa: BLE001
        MODE = "keeper"
        print(f"[Hush] Could not load {MODEL_ID} ({exc}). Keeper mode (deterministic only).")


# Runs at import time, so MODE is settled before the UI below is built.
load_model()
def _model_entities(text):
    """Ask the loaded model for contextual entities in *text*.

    Only the first MODEL_CHARS characters are sent. Returns the list of
    ``(type, value)`` pairs produced by parse_entities() from the reply.
    Requires ``_model`` and ``_tokenizer`` to have been loaded.
    """
    # NOTE(review): the module defines a `GPU` decorator, but it is not applied
    # to this function in the visible code -- confirm whether that is intended.
    import torch
    # Move the model to the GPU (as bfloat16) the first time CUDA is available.
    if torch.cuda.is_available() and next(_model.parameters()).device.type != "cuda":
        _model.to("cuda", dtype=torch.bfloat16)
    # System prompt, one worked example, then the (truncated) user text.
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": _EXAMPLE_IN},
        {"role": "assistant", "content": _EXAMPLE_OUT},
        {"role": "user", "content": text[:MODEL_CHARS]},
    ]

    def _enc(extra):
        return _tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt",
            return_dict=True, **extra)

    # Prefer the template with thinking disabled; retry without the flag if
    # the call rejects it with a TypeError.
    try:
        enc = _enc({"enable_thinking": False})
    except TypeError:
        enc = _enc({})
    enc = enc.to(_model.device)
    input_len = enc["input_ids"].shape[1]
    with torch.no_grad():
        # Sampling is off, and at most 200 new tokens are generated.
        out = _model.generate(**enc, max_new_tokens=200, do_sample=False,
                              pad_token_id=_tokenizer.eos_token_id)
    # Decode only what follows the prompt tokens.
    raw = _tokenizer.decode(out[0][input_len:], skip_special_tokens=True)
    return parse_entities(raw)
def parse_entities(raw):
    """Parse model output into a de-duplicated list of ``(TYPE, value)`` pairs.

    ``<think>`` sections (closed or left open) are removed first. Each
    remaining line of the form ``TYPE: value`` or ``TYPE - value`` is kept
    when TYPE is PERSON, ORG, LOCATION, AGE or ID (any case). Values are
    stripped of whitespace and double quotes; empty values, ``NONE`` and
    case-insensitive repeats are dropped.
    """
    cleaned = raw or ""
    # Closed think blocks first, then any unterminated one through to the end.
    for think_pattern in (r"<think>.*?</think>", r"<think>.*$"):
        cleaned = re.sub(think_pattern, "", cleaned, flags=re.S | re.I)

    line_rx = re.compile(r"\s*(PERSON|ORG|LOCATION|AGE|ID)\s*[:\-]\s*(.+)", re.I)
    results = []
    already = set()
    for line in cleaned.splitlines():
        hit = line_rx.match(line)
        if hit is None:
            continue
        kind = hit.group(1).upper()
        value = hit.group(2).strip().strip('"').strip()
        key = (kind, value.lower())
        if not value or value.upper() == "NONE" or key in already:
            continue
        already.add(key)
        results.append((kind, value))
    return results
| # --------------------------------------------------------------- rendering ----- | |
def esc(s):
    """Return ``str(s)`` with HTML special characters (including quotes) escaped."""
    as_text = str(s)
    return html.escape(as_text)
def render_summary(mapping, note=""):
    """Build the HTML summary panel shown under the redacted text.

    Args:
        mapping: ``{placeholder: original value}`` as returned by redact().
        note: Optional pre-built HTML fragment (for example the truncation
            notice) appended inside the panel. It is inserted as-is, so the
            caller must pass trusted markup.

    Returns:
        An HTML string. With an empty mapping this is the "nothing found"
        panel; otherwise it holds per-label counts and a collapsible table of
        placeholders with their escaped original values.
    """
    if not mapping:
        # Keep the note here too: when the input was truncated but nothing was
        # found, the user still has to learn that only part of it was checked.
        return ('<div class="sum empty">No obvious secrets found. The text is unchanged. '
                f'Still, glance over it yourself before you share it.{note}</div>')
    counts = {}
    for ph in mapping:
        # "[[NAME_2]]" -> "NAME": drop the brackets, then the trailing number.
        lab = ph[2:-2].rsplit("_", 1)[0]
        counts[lab] = counts.get(lab, 0) + 1
    # The separator is a middle dot (the mis-encoded character is repaired).
    chips = "".join(f'<span class="chip">{esc(k)} · {v}</span>'
                    for k, v in sorted(counts.items()))
    rows = "".join(f'<tr><td><code>{esc(ph)}</code></td><td>{esc(orig)}</td></tr>'
                   for ph, orig in mapping.items())
    return (f'<div class="sum"><div class="sum-h">Hushed {len(mapping)} item(s)</div>'
            f'<div class="chips">{chips}</div>'
            f'<details><summary>show what was hidden (stays on this page only)</summary>'
            f'<table class="map"><tr><th>placeholder</th><th>your original</th></tr>'
            f'{rows}</table></details>{note}</div>')
def on_scrub(text, _state):
    """Click handler for the "Hush it" button.

    Returns ``(redacted_text, summary_html, mapping)``. Blank input yields an
    empty result with a prompt to paste text. Input longer than MAX_CHARS is
    cut to that length and a notice is added to the summary. The previous
    state value is ignored.
    """
    source = text or ""
    if source.strip() == "":
        prompt = '<div class="sum empty">Paste some text above, then press Hush.</div>'
        return "", prompt, {}

    note = ""
    if len(source) > MAX_CHARS:
        source = source[:MAX_CHARS]
        note = f'<div class="sum-note">Only the first {MAX_CHARS} characters were processed.</div>'

    entities = []
    if MODE == "model":
        # A model failure must not block redaction: fall back to rules only.
        try:
            entities = _model_entities(source)
        except Exception as exc:  # noqa: BLE001
            print(f"[Hush] model error: {exc}")
            entities = []

    redacted, mapping = redact(source, entities)
    summary_html = render_summary(mapping, note)
    return redacted, summary_html, mapping
def on_restore(ai_text, mapping):
    """Click handler for "Restore my details": swap placeholders back in *ai_text*.

    Missing text or a missing mapping is treated as empty.
    """
    return restore(ai_text or "", mapping or {})
# Stylesheet for the whole app. It is handed to gr.Blocks(...) when that
# constructor accepts `css`, and to launch() otherwise.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Fraunces:ital,opsz,wght@0,9..144,400;0,9..144,600;1,9..144,400&family=Spectral:ital,wght@0,400;0,500;1,400&display=swap');
:root{--paper:#eef1ee;--paper-2:#e3e8e4;--ink:#26302c;--ink-soft:#5c6b63;
--teal:#2f5d57;--amber:#9a6b2f;--line:#bcc8c1;--ok:#2f5d57;}
.gradio-container,.gradio-container.dark,.dark{
--body-background-fill:transparent;--background-fill-primary:#fbfdfb;--background-fill-secondary:#eef3ef;
--block-background-fill:#fbfdfb;--block-border-color:var(--line);--border-color-primary:var(--line);
--body-text-color:var(--ink);--body-text-color-subdued:var(--ink-soft);
--block-label-text-color:var(--teal);--block-title-text-color:var(--ink);
--block-label-background-fill:#e3ebe5;--block-title-background-fill:transparent;
--input-background-fill:#fbfdfb;--input-border-color:var(--line);--input-placeholder-color:var(--ink-soft);
--button-primary-background-fill:var(--teal);--button-primary-background-fill-hover:#244641;
--button-primary-text-color:#f4f8f5;--button-primary-border-color:#244641;
--button-secondary-background-fill:#e3ebe5;--button-secondary-background-fill-hover:#d6e0d9;
--button-secondary-text-color:var(--ink);--button-secondary-border-color:var(--line);
--color-accent:var(--amber);--color-accent-soft:#efe3cd;}
.gradio-container{background:radial-gradient(120% 80% at 80% -10%,#f3f6f3,var(--paper) 55%,var(--paper-2));
font-family:'Spectral',Georgia,serif !important;color:var(--ink) !important;max-width:920px !important;}
.gradio-container textarea,.gradio-container input[type="text"],.gradio-container input:not([type]){
background:#fbfdfb !important;color:var(--ink) !important;-webkit-text-fill-color:var(--ink) !important;border-color:var(--line) !important;font-family:ui-monospace,Menlo,Consolas,monospace !important;}
.gradio-container textarea::placeholder{color:var(--ink-soft) !important;-webkit-text-fill-color:var(--ink-soft) !important;opacity:1;}
.hu-title{font-family:'Fraunces',serif;font-weight:600;font-size:2.5rem;line-height:1;margin:.2rem 0 0;}
.hu-title em{font-style:italic;color:var(--teal);}
.hu-sub{font-style:italic;color:var(--ink-soft);margin:.35rem 0 1rem;font-size:1.05rem;}
.hu-mode{display:inline-block;font-size:.72rem;letter-spacing:.12em;text-transform:uppercase;color:var(--teal);border:1px solid var(--line);border-radius:999px;padding:.15rem .6rem;}
.hu-step{font-family:'Fraunces',serif;font-weight:600;color:var(--teal);margin:14px 0 4px;font-size:1.05rem;}
.sum{background:#fbfdfb;border:1px solid var(--line);border-left:3px solid var(--teal);border-radius:10px;padding:12px 14px;margin-top:6px;}
.sum.empty{color:var(--ink-soft);font-style:italic;border-left-color:var(--line);}
.sum-h{font-family:'Fraunces',serif;font-weight:600;color:var(--ink);margin-bottom:6px;}
.chips{display:flex;flex-wrap:wrap;gap:6px;}
.chip{font-size:.78rem;letter-spacing:.04em;background:#e3ebe5;color:var(--teal);border:1px solid var(--line);border-radius:999px;padding:.12rem .6rem;}
.sum details{margin-top:10px;}
.sum summary{cursor:pointer;color:var(--amber);font-size:.9rem;}
.map{width:100%;border-collapse:collapse;margin-top:8px;font-size:.9rem;}
.map th{text-align:left;color:var(--ink-soft);font-weight:400;border-bottom:1px solid var(--line);padding:4px 6px;}
.map td{border-bottom:1px solid var(--line);padding:4px 6px;vertical-align:top;}
.map code{background:#eef3ef;border-radius:4px;padding:.05rem .3rem;color:var(--teal);}
.sum-note{margin-top:8px;color:var(--ink-soft);font-size:.85rem;font-style:italic;}
.hu-foot{color:var(--ink-soft);font-size:.82rem;font-style:italic;text-align:center;margin-top:14px;}
footer{display:none !important;}
"""
# Keyword arguments for gr.Blocks. Styling is passed here only when this
# Gradio version's Blocks constructor accepts `css`; otherwise launch() gets it.
_bk = {"title": "Hush"}
if _BLOCKS_HAS_CSS:
    _bk["css"] = CSS
    _bk["theme"] = gr.themes.Soft()
with gr.Blocks(**_bk) as demo:
    # Holds the placeholder -> original mapping written by on_scrub and read
    # by on_restore.
    state = gr.State({})
    # Header badge text. The separators are middle dots (the mis-encoded
    # characters are repaired here).
    if MODE == "model":
        public = "MiniCPM5-1B · on-device + rules"
    else:
        public = "Hush · rules engine"
    mode_label = f"{public} · [{MODE}]" if DEBUG else public
    gr.HTML(f"""
<div><div class="hu-title">Hush</div>
<div class="hu-sub">A local privacy airlock. Scrub secrets out before you paste into cloud AI; pour them back in after.</div>
<span class="hu-mode">{mode_label}</span></div>""")
    # Step 1: paste text, redact it, show the safe version and a summary.
    gr.HTML('<div class="hu-step">1 · Scrub your text</div>')
    src = gr.Textbox(placeholder="Paste the text you want to share... (names, emails, phone numbers, addresses, card and ID numbers will be hidden)",
                     lines=7, show_label=False)
    scrub = gr.Button("Hush it", variant="primary")
    out = gr.Textbox(label="Safe to share", lines=7, interactive=True, **_COPY)
    summary = gr.HTML(render_summary({}))
    # Step 2: paste the AI's reply and swap the placeholders back.
    gr.HTML('<div class="hu-step">2 · Bring the reply back</div>')
    reply = gr.Textbox(placeholder="Paste the AI's reply here (it will still contain the [[PLACEHOLDERS]])...",
                       lines=5, show_label=False)
    unscrub = gr.Button("Restore my details", variant="primary")
    restored = gr.Textbox(label="Your reply, with real details restored", lines=6,
                          interactive=True, **_COPY)
    gr.HTML('<div class="hu-foot">The key that maps placeholders to your details stays in '
            'this browser session and is never stored. Run Hush locally and nothing leaves your machine.</div>')
    # Event wiring: scrubbing refreshes the output, the summary and the stored
    # mapping; restoring reads the reply together with that mapping.
    scrub.click(on_scrub, [src, state], [out, summary, state])
    unscrub.click(on_restore, [reply, state], restored)
if __name__ == "__main__":
    # Styling goes to launch() only when gr.Blocks did not accept `css`.
    _lk = {} if _BLOCKS_HAS_CSS else {"css": CSS, "theme": gr.themes.Soft()}
    # Turn server-side rendering off wherever launch() exposes the switch.
    if _LAUNCH_HAS_SSR:
        _lk.update(ssr_mode=False)
    demo.queue(max_size=24).launch(**_lk)