Spaces:
Sleeping
Sleeping
| import re | |
| import gradio as gr | |
| # --------------------------- | |
| # 1. REGEX PATTERNS | |
| # --------------------------- | |
# All detectors are compiled once at import time and reused on every call.

# E-mail address: local part, "@", domain, and a TLD of two or more letters.
EMAIL_RE = re.compile(
    r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
)

# URL: either an explicit http(s):// scheme or a bare "www." prefix
# (case-insensitive).
URL_RE = re.compile(
    r"\bhttps?://\S+|\bwww\.\S+\b",
    flags=re.IGNORECASE,
)

# Dotted quad of 1-3 digit groups; the 0-255 range of each group is not checked.
IP_RE = re.compile(
    r"\b\d{1,3}(?:\.\d{1,3}){3}\b"
)

# Date: d/m/y-like forms separated by "/", "-" or ".", or ISO yyyy-mm-dd.
DATE_RE = re.compile(
    r"\b(?:\d{1,2}[\/\-.]\d{1,2}[\/\-.]\d{2,4}|\d{4}-\d{2}-\d{2})\b"
)

# Phone number: deliberately permissive; matches that are too short are
# expected to be filtered out by the caller.
PHONE_RE = re.compile(
    r"(?:\+?\d[\d\s().-]{5,}\d)"
)

# Payment card: 13 to 19 digits, optionally separated by spaces or hyphens.
CARD_RE = re.compile(
    r"\b(?:\d[ -]*?){13,19}\b"
)

# Basic US Social Security Number in the form ddd-dd-dddd.
SSN_RE = re.compile(
    r"\b\d{3}-\d{2}-\d{4}\b"
)
| # --------------------------- | |
| # 2. UTILITAIRES | |
| # --------------------------- | |
def luhn_valid(number: str) -> bool:
    """Return True if *number* passes the Luhn checksum for a payment card.

    Every character that is not a decimal digit (spaces, hyphens, letters,
    ...) is ignored before the check.

    Args:
        number: Candidate card number, with or without separators.

    Returns:
        True when 13 to 19 digits remain after filtering and their Luhn
        checksum is a multiple of 10; False otherwise.
    """
    # str.isdecimal() rather than str.isdigit(): isdigit() also accepts
    # characters such as "²" that int() cannot convert, which would raise
    # ValueError instead of simply being skipped.
    digits = [int(ch) for ch in number if ch.isdecimal()]
    if not 13 <= len(digits) <= 19:
        return False

    checksum = 0
    # Walk from the rightmost digit: every second digit is doubled, and a
    # doubled value above 9 is reduced by 9 (same as summing its two digits).
    for offset, digit in enumerate(reversed(digits)):
        if offset % 2 == 1:
            digit *= 2
            if digit > 9:
                digit -= 9
        checksum += digit
    return checksum % 10 == 0
def mask_value(value: str, style: str = "tag") -> str:
    """Return the replacement text for one detected value.

    Args:
        value: The matched text to hide.
        style: Masking style:
            - "tag": the fixed placeholder "[PII]".
            - "stars": one "*" per character of *value*.
            - "keep_len": one "*" per non-whitespace character; whitespace
              characters are kept as they are.
            Any other value behaves like "tag".

    Returns:
        The masked string.
    """
    if style == "stars":
        return len(value) * "*"
    if style == "keep_len":
        masked_chars = [ch if ch.isspace() else "*" for ch in value]
        return "".join(masked_chars)
    # "tag" and every unrecognised style share the same placeholder.
    return "[PII]"
| # --------------------------- | |
| # 3. ANONYMISATION | |
| # --------------------------- | |
def anonymize(text: str, style: str):
    """Mask PII found in *text* and summarise what was detected.

    Detectors run in a fixed order: e-mail, URL, IP, date, SSN, payment card,
    phone. Each pass works on the output of the previous one, so a value
    already masked by an earlier detector is not seen by a later one.
    Card candidates are masked only when they pass ``luhn_valid``; phone
    candidates only when they contain at least 6 digits.

    Args:
        text: Raw text to scan. ``None``, empty and whitespace-only input are
            all treated as "nothing to do".
        style: Masking style forwarded to ``mask_value``.

    Returns:
        A ``(masked_text, stats_json)`` tuple. ``stats_json`` is a JSON object
        (as a string) mapping each category with at least one hit to its
        count. For blank input the first element is a hint message and the
        second is ``"{}"``.
    """
    import json

    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        return "Paste or type a text to anonymize.", "{}"

    def has_enough_digits(raw):
        # Fewer than 6 digits is too short to be a real phone number.
        return sum(1 for ch in raw if ch.isdigit()) >= 6

    # (category, pattern, validator). The order is significant: specific
    # detectors run before the permissive card and phone patterns.
    rules = (
        ("email", EMAIL_RE, None),
        ("url", URL_RE, None),
        ("ip", IP_RE, None),
        ("date", DATE_RE, None),
        ("ssn", SSN_RE, None),
        ("card", CARD_RE, luhn_valid),  # luhn_valid ignores separators itself
        ("phone", PHONE_RE, has_enough_digits),
    )

    # Key order here determines the key order of the JSON summary.
    counts = dict.fromkeys(
        ("email", "url", "ip", "date", "phone", "card", "ssn"), 0
    )

    def make_replacer(category, accepts):
        def replace(match):
            raw = match.group()
            if accepts is not None and not accepts(raw):
                return raw  # rejected candidate: leave the text untouched
            counts[category] += 1
            return mask_value(raw, style)

        return replace

    for category, pattern, accepts in rules:
        text = pattern.sub(make_replacer(category, accepts), text)

    # Report only the categories that were actually found.
    detected = {category: hits for category, hits in counts.items() if hits > 0}
    return text, json.dumps(detected, indent=2, ensure_ascii=False)
| # --------------------------- | |
| # 4. GRADIO UI | |
| # --------------------------- | |
# Build the Gradio interface: an input box and a masking-style selector, a
# button that runs `anonymize`, and two outputs (masked text + JSON summary).
# NOTE(review): the original indentation was lost in this copy of the file.
# The nesting below (only the input box and the style selector inside the
# Row) is the most plausible reconstruction - confirm against the real file.
with gr.Blocks(title="PII-Shield — Anonymizer") as demo:
    gr.Markdown("# 🛡️ PII-Shield — Text Anonymizer")
    gr.Markdown("Collez un SMS, un e-mail ou un texte administratif. L’outil masque automatiquement emails, téléphones, URLs, IPs, dates, SSN et cartes bancaires (Luhn).")
    with gr.Row():
        # Text to scan.
        inp = gr.Textbox(lines=10, label="Texte à anonymiser")
        # The choices are the `style` values understood by `mask_value`.
        style = gr.Radio(
            ["tag", "stars", "keep_len"],
            value="tag",
            label="Style de masquage",
            info="tag = [PII], stars = ********, keep_len = même longueur"
        )
    btn = gr.Button("Analyser / Masquer")
    out = gr.Textbox(lines=10, label="Texte anonymisé")
    stats = gr.JSON(label="Éléments détectés")
    # Sample inputs covering e-mail/phone/date, card/IP/URL and SSN.
    examples = [
        "Salut Marie, écris-moi à marie.dupont@example.com ou appelle au +33 6 12 34 56 78. RDV le 12/05/2024.",
        "Payment card: 4111 1111 1111 1111, IP: 192.168.0.1, site: https://example.org",
        "US client SSN: 123-45-6789"
    ]
    # NOTE(review): `outputs` is passed without `fn`; presumably the examples
    # only fill the input box - verify against the Gradio version in use.
    gr.Examples(examples=examples, inputs=inp, outputs=[out, stats])
    # Clicking the button calls anonymize(inp, style) and routes its two
    # return values to `out` and `stats`.
    btn.click(anonymize, inputs=[inp, style], outputs=[out, stats])

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()