Spaces:

end-rin
/

unicode-attack-demo

Running

App Files Files Community

end-rin committed on Feb 9

Commit

ecebbb9

1 Parent(s): 14d697e

Add Unicode attack demo app

Browse files

Files changed (3) hide show

README.md +28 -6
app.py +279 -0
requirements.txt +2 -0

README.md CHANGED Viewed

@@ -1,13 +1,35 @@
 ---
-title: Unicode Attack Demo
-emoji: ⚡
-colorFrom: gray
-colorTo: purple
 sdk: gradio
-sdk_version: 6.5.1
 app_file: app.py
 pinned: false
 license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Unicode Adversarial Attack Demo
+emoji: 🔤
+colorFrom: purple
+colorTo: blue
 sdk: gradio
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
 license: mit
 ---
+# Unicode Adversarial Attack Demo
+Interactive demonstration of how Unicode character substitutions can fool Large Language Models.
+## What This Does
+This demo transforms text using special Unicode characters (like Canadian Aboriginal Syllabics or Circled Letters) and tests whether the transformation changes an LLM's prediction.
+## Research Findings
+Tested on 59,376 samples across 3 models and 4 Unicode styles:
+- **Overall Attack Success Rate:** 50.2%
+- **Most Vulnerable Model:** Phi-3-mini (58.8% ASR)
+- **Most Robust Model:** Gemma-2-2b (39.0% ASR)
+- **Most Effective Style:** Canadian Aboriginal (56.5% ASR)
+## Project
+**Title:** Unicode-Based Adversarial Attacks on Large Language Models
+**Author:** Endrin Hoti
+**Institution:** King's College London
+**Supervisor:** Dr. Oana Cocarascu

app.py ADDED Viewed

	@@ -0,0 +1,279 @@

+"""
+Unicode Adversarial Attack Demo - HuggingFace Spaces Version
+Uses Inference API instead of local model loading.
+"""
+import gradio as gr
+import os
+from huggingface_hub import InferenceClient
+# Unicode transformation mappings
+# Each table maps a single ASCII letter to its styled replacement. Characters
+# that have no entry in a table are passed through unchanged by
+# transform_text().
+#
+# Small caps: lowercase letters become small-capital glyphs. 's' and 'x' map
+# to themselves, and every uppercase letter is an identity entry, so those
+# characters are not altered by this style.
+SMALL_CAPS_MAP = {
+    'a': 'ᴀ', 'b': 'ʙ', 'c': 'ᴄ', 'd': 'ᴅ', 'e': 'ᴇ', 'f': 'ꜰ', 'g': 'ɢ',
+    'h': 'ʜ', 'i': 'ɪ', 'j': 'ᴊ', 'k': 'ᴋ', 'l': 'ʟ', 'm': 'ᴍ', 'n': 'ɴ',
+    'o': 'ᴏ', 'p': 'ᴘ', 'q': 'ǫ', 'r': 'ʀ', 's': 's', 't': 'ᴛ', 'u': 'ᴜ',
+    'v': 'ᴠ', 'w': 'ᴡ', 'x': 'x', 'y': 'ʏ', 'z': 'ᴢ',
+    'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F', 'G': 'G',
+    'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L', 'M': 'M', 'N': 'N',
+    'O': 'O', 'P': 'P', 'Q': 'Q', 'R': 'R', 'S': 'S', 'T': 'T', 'U': 'U',
+    'V': 'V', 'W': 'W', 'X': 'X', 'Y': 'Y', 'Z': 'Z',
+}
+# Canadian Aboriginal Syllabics look-alikes for both lowercase and uppercase
+# letters. 'g' and 'q' share the same replacement glyph, so the styled text
+# cannot be mapped back to the original unambiguously.
+CANADIAN_ABORIGINAL_MAP = {
+    'a': 'ᐞ', 'b': 'ᒃ', 'c': 'ᑦ', 'd': 'ᒄ', 'e': 'ᕪ', 'f': 'ᕝ', 'g': 'ᕐ',
+    'h': 'ᑋ', 'i': 'ᑊ', 'j': 'ᒢ', 'k': 'ᐟ', 'l': 'ᒻ', 'm': 'ᔿ', 'n': 'ᐢ',
+    'o': 'ᐤ', 'p': 'ᓐ', 'q': 'ᕐ', 'r': 'ᔇ', 's': 'ᔆ', 't': 'ᐩ', 'u': 'ᐡ',
+    'v': 'ᘁ', 'w': 'ᐜ', 'x': 'ᕽ', 'y': 'ᔉ', 'z': 'ᙆ',
+    'A': 'ᗩ', 'B': 'ᗷ', 'C': 'ᑕ', 'D': 'ᐅ', 'E': 'ᕮ', 'F': 'ᒋ', 'G': 'ᘜ',
+    'H': 'ᕼ', 'I': 'ᓵ', 'J': 'ᒎ', 'K': 'ᐠ', 'L': 'ᖶ', 'M': 'ᘻ', 'N': 'ᘯ',
+    'O': 'ᗜ', 'P': 'ᑭ', 'Q': 'ᕴ', 'R': 'ᖇ', 'S': 'ᔕ', 'T': 'ᘕ', 'U': 'ᑌ',
+    'V': 'ᐯ', 'W': 'ᗐ', 'X': '᙭', 'Y': 'ᖻ', 'Z': 'ᗱ',
+}
+# Mixed enclosed style: lowercase letters become circled letters, uppercase
+# letters become squared letters.
+CIRCLED_SQUARED_MAP = {
+    'a': 'ⓐ', 'b': 'ⓑ', 'c': 'ⓒ', 'd': 'ⓓ', 'e': 'ⓔ', 'f': 'ⓕ', 'g': 'ⓖ',
+    'h': 'ⓗ', 'i': 'ⓘ', 'j': 'ⓙ', 'k': 'ⓚ', 'l': 'ⓛ', 'm': 'ⓜ', 'n': 'ⓝ',
+    'o': 'ⓞ', 'p': 'ⓟ', 'q': 'ⓠ', 'r': 'ⓡ', 's': 'ⓢ', 't': 'ⓣ', 'u': 'ⓤ',
+    'v': 'ⓥ', 'w': 'ⓦ', 'x': 'ⓧ', 'y': 'ⓨ', 'z': 'ⓩ',
+    'A': '🄰', 'B': '🄱', 'C': '🄲', 'D': '🄳', 'E': '🄴', 'F': '🄵', 'G': '🄶',
+    'H': '🄷', 'I': '🄸', 'J': '🄹', 'K': '🄺', 'L': '🄻', 'M': '🄼', 'N': '🄽',
+    'O': '🄾', 'P': '🄿', 'Q': '🅀', 'R': '🅁', 'S': '🅂', 'T': '🅃', 'U': '🅄',
+    'V': '🅅', 'W': '🅆', 'X': '🅇', 'Y': '🅈', 'Z': '🅉',
+}
+# Filled squared letters. The lowercase and uppercase form of each letter map
+# to the same glyph, so letter case is lost after this transformation.
+SQUARED_LETTERS_MAP = {
+    'a': '🅰', 'b': '🅱', 'c': '🅲', 'd': '🅳', 'e': '🅴', 'f': '🅵', 'g': '🅶',
+    'h': '🅷', 'i': '🅸', 'j': '🅹', 'k': '🅺', 'l': '🅻', 'm': '🅼', 'n': '🅽',
+    'o': '🅾', 'p': '🅿', 'q': '🆀', 'r': '🆁', 's': '🆂', 't': '🆃', 'u': '🆄',
+    'v': '🆅', 'w': '🆆', 'x': '🆇', 'y': '🆈', 'z': '🆉',
+    'A': '🅰', 'B': '🅱', 'C': '🅲', 'D': '🅳', 'E': '🅴', 'F': '🅵', 'G': '🅶',
+    'H': '🅷', 'I': '🅸', 'J': '🅹', 'K': '🅺', 'L': '🅻', 'M': '🅼', 'N': '🅽',
+    'O': '🅾', 'P': '🅿', 'Q': '🆀', 'R': '🆁', 'S': '🆂', 'T': '🆃', 'U': '🆄',
+    'V': '🆅', 'W': '🆆', 'X': '🆇', 'Y': '🆈', 'Z': '🆉',
+}
+# Registry of available styles: display name -> character table. The keys are
+# what the "Unicode Style" dropdown offers and what transform_text() accepts.
+STYLES = {
+    'Small Caps': SMALL_CAPS_MAP,
+    'Canadian Aboriginal': CANADIAN_ABORIGINAL_MAP,
+    'Circled/Squared': CIRCLED_SQUARED_MAP,
+    'Squared Letters': SQUARED_LETTERS_MAP,
+}
+# Registry of selectable models: display name -> model id that is passed as
+# the `model=` argument of the inference call.
+MODELS = {
+    'Gemma-2-2B': 'google/gemma-2-2b-it',
+    'Phi-3-mini': 'microsoft/Phi-3-mini-4k-instruct',
+    'Qwen2.5-3B': 'Qwen/Qwen2.5-3B-Instruct',
+}
+# Initialize client
+client = None
+def get_client():
+    global client
+    if client is None:
+        token = os.environ.get("HF_TOKEN")
+        client = InferenceClient(token=token)
+    return client
+def transform_text(text: str, style: str) -> str:
+    """Transform text using the specified Unicode style."""
+    if style not in STYLES:
+        return text
+    char_map = STYLES[style]
+    return ''.join(char_map.get(c, c) for c in text)
+def get_prediction(text: str, model_id: str, task: str) -> str:
+    """Get model prediction using Inference API."""
+    if task == "Fact Verification":
+        prompt = f"""You are a fact-checking assistant. Classify the following claim as exactly one of: SUPPORTS, REFUTES, or NOT_ENOUGH_INFO.
+Claim: {text}
+Respond with only one word (SUPPORTS, REFUTES, or NOT_ENOUGH_INFO):"""
+    else:
+        prompt = f"""You are a text classifier. Determine if the following sentence is an argument or not.
+Sentence: {text}
+Respond with only one word (ARGUMENT or NOT_ARGUMENT):"""
+    try:
+        c = get_client()
+        response = c.text_generation(
+            prompt,
+            model=model_id,
+            max_new_tokens=10,
+            temperature=0.01,
+        )
+        # Extract first word from response
+        result = response.strip().split()[0].upper() if response.strip() else "ERROR"
+        # Clean up common variations
+        if "SUPPORT" in result:
+            return "SUPPORTS"
+        if "REFUTE" in result:
+            return "REFUTES"
+        if "NOT_ENOUGH" in result or "NOT ENOUGH" in result:
+            return "NOT_ENOUGH_INFO"
+        if "ARGUMENT" in result and "NOT" not in result:
+            return "ARGUMENT"
+        if "NOT" in result:
+            return "NOT_ARGUMENT"
+        return result
+    except Exception as e:
+        return f"ERROR: {str(e)[:50]}"
+def run_attack(text: str, style: str, model_name: str, task: str):
+    """Run the Unicode attack and compare predictions."""
+    if not text.strip():
+        return "", "", "", "", "Please enter some text."
+    # Transform text
+    styled_text = transform_text(text, style)
+    # Get model ID
+    model_id = MODELS.get(model_name)
+    if not model_id:
+        return styled_text, "", "", "", f"Unknown model: {model_name}"
+    # Get predictions
+    original_pred = get_prediction(text, model_id, task)
+    styled_pred = get_prediction(styled_text, model_id, task)
+    # Determine result
+    if "ERROR" in original_pred or "ERROR" in styled_pred:
+        status = f"Error getting predictions. Try again or check API access."
+        color = "orange"
+    elif original_pred != styled_pred:
+        status = f"ATTACK SUCCEEDED - Prediction changed!"
+        color = "green"
+    else:
+        status = f"Attack failed - Prediction unchanged"
+        color = "red"
+    return styled_text, original_pred, styled_pred, status
+def preview_all_styles(text: str):
+    """Preview text in all Unicode styles."""
+    if not text.strip():
+        return "Enter text to see previews."
+    output = f"**Original:** {text}\n\n"
+    for style_name in STYLES:
+        transformed = transform_text(text, style_name)
+        output += f"**{style_name}:** {transformed}\n\n"
+    return output
+# Create Gradio interface
+# The app has three tabs: an interactive attack demo, a preview of all
+# styles, and a static table of research results, followed by a footer.
+with gr.Blocks(title="Unicode Attack Demo", theme=gr.themes.Soft()) as demo:
+    # NOTE(review): step 2 below says the style "transforms all characters",
+    # but transform_text() only replaces characters that have an entry in the
+    # selected table; everything else is passed through unchanged.
+    gr.Markdown("""
+    # Unicode Adversarial Attack Demo
+    Test how Unicode-styled text can fool LLMs. This demonstrates research on adversarial robustness.
+    **How it works:**
+    1. Enter a claim or sentence
+    2. Choose a Unicode style (transforms all characters)
+    3. Choose a model and task
+    4. See if the model's prediction changes
+    """)
+    # Tab 1: inputs in the left column, results in the right column.
+    with gr.Tab("Attack Demo"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                text_input = gr.Textbox(
+                    label="Input Text",
+                    placeholder="Enter a claim or sentence...",
+                    value="Climate change is caused by human activities.",
+                    lines=3
+                )
+                # Choices come straight from the STYLES registry keys.
+                style_dropdown = gr.Dropdown(
+                    choices=list(STYLES.keys()),
+                    label="Unicode Style",
+                    value="Canadian Aboriginal",
+                    info="Canadian Aboriginal is most effective (56.5% ASR)"
+                )
+                # Choices come straight from the MODELS registry keys.
+                model_dropdown = gr.Dropdown(
+                    choices=list(MODELS.keys()),
+                    label="Model",
+                    value="Phi-3-mini",
+                    info="Phi-3 is most vulnerable (58.8% ASR)"
+                )
+                # "Fact Verification" selects the fact-checking prompt in
+                # get_prediction(); the other choice uses the argument prompt.
+                task_dropdown = gr.Dropdown(
+                    choices=["Fact Verification", "Argument Mining"],
+                    label="Task",
+                    value="Fact Verification"
+                )
+                run_btn = gr.Button("Run Attack", variant="primary", size="lg")
+            with gr.Column(scale=1):
+                styled_output = gr.Textbox(label="Styled Text", lines=3)
+                with gr.Row():
+                    original_pred_output = gr.Textbox(label="Original Prediction")
+                    styled_pred_output = gr.Textbox(label="Styled Prediction")
+                status_output = gr.Textbox(label="Result", lines=2)
+        # The handler's return values are assigned to `outputs` positionally,
+        # so run_attack must return exactly these four values in this order.
+        run_btn.click(
+            fn=run_attack,
+            inputs=[text_input, style_dropdown, model_dropdown, task_dropdown],
+            outputs=[styled_output, original_pred_output, styled_pred_output, status_output]
+        )
+    # Tab 2: render the entered text in every style; no model call involved.
+    with gr.Tab("Style Preview"):
+        gr.Markdown("### Preview All Unicode Styles")
+        preview_input = gr.Textbox(
+            label="Enter text",
+            value="Climate change is real",
+            lines=2
+        )
+        preview_btn = gr.Button("Preview Styles")
+        preview_output = gr.Markdown()
+        preview_btn.click(
+            fn=preview_all_styles,
+            inputs=[preview_input],
+            outputs=[preview_output]
+        )
+    # Tab 3: static summary tables; the figures are hard-coded text.
+    with gr.Tab("Research Results"):
+        gr.Markdown("""
+        ### Experiment Results (59,376 samples)
+        | Metric | Value |
+        |--------|-------|
+        | Overall ASR | 50.2% |
+        | Most Vulnerable Model | Phi-3-mini (58.8% ASR) |
+        | Most Robust Model | Gemma-2-2b (39.0% ASR) |
+        | Most Effective Style | Canadian Aboriginal (56.5% ASR) |
+        #### By Model
+        | Model | Mean ASR |
+        |-------|----------|
+        | Gemma-2-2b | 39.0% |
+        | Qwen2.5-3B | 52.8% |
+        | Phi-3-mini | 58.8% |
+        #### By Style
+        | Style | Mean ASR |
+        |-------|----------|
+        | Canadian Aboriginal | 56.5% |
+        | Circled/Squared | 53.1% |
+        | Squared Letters | 53.1% |
+        | Small Caps | 38.1% |
+        *ASR = Attack Success Rate (% of predictions that changed)*
+        """)
+    # Footer shown below the tabs.
+    gr.Markdown("""
+    ---
+    **Project:** Unicode-Based Adversarial Attacks on LLMs
+    **Author:** Endrin Hoti | King's College London
+    **Supervisor:** Dr. Oana Cocarascu
+    """)
+# Launch the app only when the file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gradio>=4.0.0
2	+ huggingface_hub>=0.20.0