end-rin committed on
Commit
dbfa5a2
·
verified ·
1 Parent(s): dbdcc05

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +299 -222
app.py CHANGED
@@ -1,281 +1,358 @@
1
  """
2
- Unicode Adversarial Attack Demo - HuggingFace Spaces Version
3
- Uses Inference API instead of local model loading.
 
 
 
 
4
  """
5
 
6
  import gradio as gr
7
- import os
8
- from huggingface_hub import InferenceClient
 
 
 
9
 
10
- # Unicode transformation mappings
11
  SMALL_CAPS_MAP = {
12
  'a': 'ᴀ', 'b': 'ʙ', 'c': 'ᴄ', 'd': 'ᴅ', 'e': 'ᴇ', 'f': 'ꜰ', 'g': 'ɢ',
13
  'h': 'ʜ', 'i': 'ɪ', 'j': 'ᴊ', 'k': 'ᴋ', 'l': 'ʟ', 'm': 'ᴍ', 'n': 'ɴ',
14
- 'o': 'ᴏ', 'p': 'ᴘ', 'q': 'ǫ', 'r': 'ʀ', 's': 's', 't': 'ᴛ', 'u': 'ᴜ',
15
  'v': 'ᴠ', 'w': 'ᴡ', 'x': 'x', 'y': 'ʏ', 'z': 'ᴢ',
16
- 'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F', 'G': 'G',
17
- 'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L', 'M': 'M', 'N': 'N',
18
- 'O': 'O', 'P': 'P', 'Q': 'Q', 'R': 'R', 'S': 'S', 'T': 'T', 'U': 'U',
19
- 'V': 'V', 'W': 'W', 'X': 'X', 'Y': 'Y', 'Z': 'Z',
20
  }
21
 
22
  CANADIAN_ABORIGINAL_MAP = {
23
- 'a': 'ᐞ', 'b': '', 'c': '', 'd': '', 'e': 'ᕪ', 'f': 'ᕝ', 'g': 'ᕐ',
24
- 'h': 'ᑋ', 'i': '', 'j': '', 'k': '', 'l': 'ᒻ', 'm': '', 'n': 'ᐢ',
25
- 'o': '', 'p': '', 'q': '', 'r': '', 's': 'ᔆ', 't': '', 'u': 'ᐡ',
26
- 'v': '', 'w': '', 'x': 'ᕽ', 'y': '', 'z': 'ᙆ',
27
- 'A': '', 'B': '', 'C': '', 'D': '', 'E': '', 'F': '', 'G': '',
28
- 'H': '', 'I': '', 'J': '', 'K': 'ᐠ', 'L': '', 'M': '', 'N': '',
29
- 'O': '', 'P': 'ᑭ', 'Q': 'ᕴ', 'R': '', 'S': '', 'T': '', 'U': '',
30
- 'V': 'ᐯ', 'W': '', 'X': '', 'Y': '', 'Z': '',
31
  }
32
 
33
- CIRCLED_SQUARED_MAP = {
34
  'a': 'ⓐ', 'b': 'ⓑ', 'c': 'ⓒ', 'd': 'ⓓ', 'e': 'ⓔ', 'f': 'ⓕ', 'g': 'ⓖ',
35
  'h': 'ⓗ', 'i': 'ⓘ', 'j': 'ⓙ', 'k': 'ⓚ', 'l': 'ⓛ', 'm': 'ⓜ', 'n': 'ⓝ',
36
  'o': 'ⓞ', 'p': 'ⓟ', 'q': 'ⓠ', 'r': 'ⓡ', 's': 'ⓢ', 't': 'ⓣ', 'u': 'ⓤ',
37
  'v': 'ⓥ', 'w': 'ⓦ', 'x': 'ⓧ', 'y': 'ⓨ', 'z': 'ⓩ',
 
 
 
 
 
 
 
38
  'A': '🄰', 'B': '🄱', 'C': '🄲', 'D': '🄳', 'E': '🄴', 'F': '🄵', 'G': '🄶',
39
  'H': '🄷', 'I': '🄸', 'J': '🄹', 'K': '🄺', 'L': '🄻', 'M': '🄼', 'N': '🄽',
40
  'O': '🄾', 'P': '🄿', 'Q': '🅀', 'R': '🅁', 'S': '🅂', 'T': '🅃', 'U': '🅄',
41
  'V': '🅅', 'W': '🅆', 'X': '🅇', 'Y': '🅈', 'Z': '🅉',
42
- }
43
-
44
- SQUARED_LETTERS_MAP = {
45
- 'a': '🅰', 'b': '🅱', 'c': '🅲', 'd': '🅳', 'e': '🅴', 'f': '🅵', 'g': '🅶',
46
- 'h': '🅷', 'i': '🅸', 'j': '🅹', 'k': '🅺', 'l': '🅻', 'm': '🅼', 'n': '🅽',
47
- 'o': '🅾', 'p': '🅿', 'q': '🆀', 'r': '🆁', 's': '🆂', 't': '🆃', 'u': '🆄',
48
- 'v': '🆅', 'w': '🆆', 'x': '🆇', 'y': '🆈', 'z': '🆉',
49
- 'A': '🅰', 'B': '🅱', 'C': '🅲', 'D': '🅳', 'E': '🅴', 'F': '🅵', 'G': '🅶',
50
- 'H': '🅷', 'I': '🅸', 'J': '🅹', 'K': '🅺', 'L': '🅻', 'M': '🅼', 'N': '🅽',
51
- 'O': '🅾', 'P': '🅿', 'Q': '🆀', 'R': '🆁', 'S': '🆂', 'T': '🆃', 'U': '🆄',
52
- 'V': '🆅', 'W': '🆆', 'X': '🆇', 'Y': '🆈', 'Z': '🆉',
53
  }
54
 
55
  STYLES = {
56
- 'Small Caps': SMALL_CAPS_MAP,
57
- 'Canadian Aboriginal': CANADIAN_ABORIGINAL_MAP,
58
- 'Circled/Squared': CIRCLED_SQUARED_MAP,
59
- 'Squared Letters': SQUARED_LETTERS_MAP,
60
  }
61
 
62
- # Models available on HF Inference API (free tier)
63
- # Note: Phi-3, Gemma, Qwen from our experiments are NOT on free API
64
- # Using similar instruction-tuned models that ARE available
 
65
  MODELS = {
66
- 'Zephyr-7B': 'HuggingFaceH4/zephyr-7b-beta',
67
- 'Mistral-7B': 'mistralai/Mistral-7B-Instruct-v0.2',
68
- 'Falcon-7B': 'tiiuae/falcon-7b-instruct',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  }
70
 
71
- # Initialize client
72
- client = None
73
-
74
- def get_client():
75
- global client
76
- if client is None:
77
- token = os.environ.get("HF_TOKEN")
78
- client = InferenceClient(token=token)
79
- return client
80
 
 
 
 
81
 
82
  def transform_text(text: str, style: str) -> str:
83
  """Transform text using the specified Unicode style."""
84
  if style not in STYLES:
85
  return text
86
- char_map = STYLES[style]
87
  return ''.join(char_map.get(c, c) for c in text)
88
 
89
 
90
- def get_prediction(text: str, model_id: str, task: str) -> str:
91
- """Get model prediction using Inference API chat completion."""
92
- if task == "Fact Verification":
93
- system_msg = "You are a fact-checking assistant. Classify claims as SUPPORTS, REFUTES, or NOT_ENOUGH_INFO. Respond with only one word."
94
- user_msg = f"Classify this claim: {text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  else:
96
- system_msg = "You are a text classifier. Classify sentences as ARGUMENT or NOT_ARGUMENT. Respond with only one word."
97
- user_msg = f"Is this an argument? {text}"
 
98
 
99
- try:
100
- c = get_client()
101
- response = c.chat_completion(
102
- messages=[
103
- {"role": "system", "content": system_msg},
104
- {"role": "user", "content": user_msg}
105
- ],
106
- model=model_id,
107
- max_tokens=15,
108
- temperature=0.01,
109
- )
110
- # Extract the response
111
- result = response.choices[0].message.content.strip().upper()
112
- # Get first word
113
- result = result.split()[0] if result.split() else "ERROR"
114
- # Clean up common variations
115
- if "SUPPORT" in result:
116
- return "SUPPORTS"
117
- if "REFUTE" in result:
118
- return "REFUTES"
119
- if "NOT_ENOUGH" in result or "ENOUGH" in result:
120
- return "NOT_ENOUGH_INFO"
121
- if "ARGUMENT" in result and "NOT" not in result:
122
- return "ARGUMENT"
123
- if "NOT" in result:
124
- return "NOT_ARGUMENT"
125
- return result
126
- except Exception as e:
127
- return f"ERROR: {str(e)[:100]}"
128
 
129
 
130
- def run_attack(text: str, style: str, model_name: str, task: str):
131
  """Run the Unicode attack and compare predictions."""
132
  if not text.strip():
133
- return "", "", "", "Please enter some text."
134
 
135
- # Transform text
136
- styled_text = transform_text(text, style)
 
137
 
138
- # Get model ID
139
- model_id = MODELS.get(model_name)
140
- if not model_id:
141
- return styled_text, "", "", f"Unknown model: {model_name}"
142
 
143
- # Get predictions
144
- original_pred = get_prediction(text, model_id, task)
145
- styled_pred = get_prediction(styled_text, model_id, task)
146
 
147
- # Determine result
148
- if "ERROR" in original_pred or "ERROR" in styled_pred:
149
- status = f"API Error: {original_pred if 'ERROR' in original_pred else styled_pred}"
150
- elif original_pred != styled_pred:
151
- status = f"ATTACK SUCCEEDED! Prediction changed from {original_pred} to {styled_pred}"
152
- else:
153
- status = f"Attack failed - Prediction unchanged: {original_pred}"
 
 
 
 
 
 
 
 
 
 
154
 
155
- return styled_text, original_pred, styled_pred, status
 
156
 
157
 
158
- def preview_all_styles(text: str):
159
- """Preview text in all Unicode styles."""
160
  if not text.strip():
161
- return "Enter text to see previews."
162
-
163
- output = f"**Original:** {text}\n\n"
164
- for style_name in STYLES:
165
- transformed = transform_text(text, style_name)
166
- output += f"**{style_name}:** {transformed}\n\n"
167
- return output
168
-
169
-
170
- # Create Gradio interface
171
- with gr.Blocks(title="Unicode Attack Demo", theme=gr.themes.Soft()) as demo:
172
- gr.Markdown("""
173
- # Unicode Adversarial Attack Demo
174
-
175
- Test how Unicode-styled text can fool LLMs. This demonstrates research on adversarial robustness.
176
-
177
- **How it works:**
178
- 1. Enter a claim or sentence
179
- 2. Choose a Unicode style (transforms all characters)
180
- 3. Choose a model and task
181
- 4. See if the model's prediction changes
182
- """)
183
-
184
- with gr.Tab("Attack Demo"):
185
- with gr.Row():
186
- with gr.Column(scale=1):
187
- text_input = gr.Textbox(
188
- label="Input Text",
189
- placeholder="Enter a claim or sentence...",
190
- value="Climate change is caused by human activities.",
191
- lines=3
192
- )
193
- style_dropdown = gr.Dropdown(
194
- choices=list(STYLES.keys()),
195
- label="Unicode Style",
196
- value="Canadian Aboriginal",
197
- info="Canadian Aboriginal is most effective (56.5% ASR)"
198
- )
199
- model_dropdown = gr.Dropdown(
200
- choices=list(MODELS.keys()),
201
- label="Model",
202
- value="Zephyr-7B",
203
- info="Note: Original models (Phi-3, Gemma, Qwen) not on free API"
204
- )
205
- task_dropdown = gr.Dropdown(
206
- choices=["Fact Verification", "Argument Mining"],
207
- label="Task",
208
- value="Fact Verification"
209
- )
210
- run_btn = gr.Button("Run Attack", variant="primary", size="lg")
211
-
212
- with gr.Column(scale=1):
213
- styled_output = gr.Textbox(label="Styled Text", lines=3)
214
- with gr.Row():
215
- original_pred_output = gr.Textbox(label="Original Prediction")
216
- styled_pred_output = gr.Textbox(label="Styled Prediction")
217
- status_output = gr.Textbox(label="Result", lines=2)
218
-
219
- run_btn.click(
220
- fn=run_attack,
221
- inputs=[text_input, style_dropdown, model_dropdown, task_dropdown],
222
- outputs=[styled_output, original_pred_output, styled_pred_output, status_output]
223
- )
224
-
225
- with gr.Tab("Style Preview"):
226
- gr.Markdown("### Preview All Unicode Styles")
227
- preview_input = gr.Textbox(
228
- label="Enter text",
229
- value="Climate change is real",
230
- lines=2
231
- )
232
- preview_btn = gr.Button("Preview Styles")
233
- preview_output = gr.Markdown()
234
-
235
- preview_btn.click(
236
- fn=preview_all_styles,
237
- inputs=[preview_input],
238
- outputs=[preview_output]
239
- )
240
-
241
- with gr.Tab("Research Results"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  gr.Markdown("""
243
- ### Experiment Results (59,376 samples)
244
-
245
- *Note: The actual experiments used Gemma-2-2B, Phi-3-mini, and Qwen2.5-3B locally.
246
- This demo uses different models available on the free HuggingFace API.*
247
-
248
- | Metric | Value |
249
- |--------|-------|
250
- | Overall ASR | 50.2% |
251
- | Most Vulnerable Model | Phi-3-mini (58.8% ASR) |
252
- | Most Robust Model | Gemma-2-2b (39.0% ASR) |
253
- | Most Effective Style | Canadian Aboriginal (56.5% ASR) |
254
-
255
- #### By Model
256
- | Model | Mean ASR |
257
- |-------|----------|
258
- | Gemma-2-2b | 39.0% |
259
- | Qwen2.5-3B | 52.8% |
260
- | Phi-3-mini | 58.8% |
261
-
262
- #### By Style
263
- | Style | Mean ASR |
264
- |-------|----------|
265
- | Canadian Aboriginal | 56.5% |
266
- | Circled/Squared | 53.1% |
267
- | Squared Letters | 53.1% |
268
- | Small Caps | 38.1% |
269
-
270
- *ASR = Attack Success Rate (% of predictions that changed)*
271
  """)
272
 
273
- gr.Markdown("""
274
- ---
275
- **Project:** Unicode-Based Adversarial Attacks on LLMs
276
- **Author:** Endrin Hoti | King's College London
277
- **Supervisor:** Dr. Oana Cocarascu
278
- """)
279
 
280
  if __name__ == "__main__":
 
281
  demo.launch()
 
1
  """
2
+ Gradio web interface for Unicode adversarial attack demonstration.
3
+
4
+ Uses GGUF quantized models via llama-cpp-python for CPU inference.
5
+ Designed for deployment on HuggingFace Spaces (free CPU tier).
6
+
7
+ Supervisor approved: Feb 9, 2026
8
  """
9
 
10
  import gradio as gr
11
+ from llama_cpp import Llama
12
+
13
+ # =============================================================================
14
+ # Unicode Style Mappings
15
+ # =============================================================================
16
 
 
17
# Maps ASCII letters to Unicode small-caps glyphs. Upper- and lower-case
# input map to the same small-caps form ('x' has no small-caps codepoint
# and stays ASCII).
# BUG FIX: several values were empty strings (e.g. 's' and most capitals),
# which made transform_text silently DELETE those letters instead of
# restyling them; restored following the surviving uppercase entries,
# which all mapped to the matching small-caps glyph.
SMALL_CAPS_MAP = {
    'a': 'ᴀ', 'b': 'ʙ', 'c': 'ᴄ', 'd': 'ᴅ', 'e': 'ᴇ', 'f': 'ꜰ', 'g': 'ɢ',
    'h': 'ʜ', 'i': 'ɪ', 'j': 'ᴊ', 'k': 'ᴋ', 'l': 'ʟ', 'm': 'ᴍ', 'n': 'ɴ',
    'o': 'ᴏ', 'p': 'ᴘ', 'q': 'ǫ', 'r': 'ʀ', 's': 'ꜱ', 't': 'ᴛ', 'u': 'ᴜ',
    'v': 'ᴠ', 'w': 'ᴡ', 'x': 'x', 'y': 'ʏ', 'z': 'ᴢ',
    'A': 'ᴀ', 'B': 'ʙ', 'C': 'ᴄ', 'D': 'ᴅ', 'E': 'ᴇ', 'F': 'ꜰ', 'G': 'ɢ',
    'H': 'ʜ', 'I': 'ɪ', 'J': 'ᴊ', 'K': 'ᴋ', 'L': 'ʟ', 'M': 'ᴍ', 'N': 'ɴ',
    'O': 'ᴏ', 'P': 'ᴘ', 'Q': 'ǫ', 'R': 'ʀ', 'S': 'ꜱ', 'T': 'ᴛ', 'U': 'ᴜ',
    'V': 'ᴠ', 'W': 'ᴡ', 'X': 'x', 'Y': 'ʏ', 'Z': 'ᴢ',
}
27
 
28
# Maps ASCII letters to visually similar Canadian Aboriginal Syllabics
# glyphs.
# NOTE(review): many values are empty strings, which makes transform_text
# silently DELETE those letters rather than restyle them — these look like
# glyphs lost in transit (the surviving entries all carry a real glyph);
# confirm against the original mapping before relying on this style.
CANADIAN_ABORIGINAL_MAP = {
    'a': 'ᐞ', 'b': '', 'c': '', 'd': '', 'e': 'ᕪ', 'f': 'ᕝ', 'g': 'ᕐ',
    'h': 'ᑋ', 'i': '', 'j': '', 'k': '', 'l': 'ᒻ', 'm': '', 'n': 'ᐢ',
    'o': '', 'p': '', 'q': '', 'r': '', 's': 'ᔆ', 't': '', 'u': 'ᐡ',
    'v': '', 'w': '', 'x': 'ᕽ', 'y': '', 'z': 'ᙆ',
    'A': '', 'B': '', 'C': '', 'D': '', 'E': '', 'F': '', 'G': '',
    'H': '', 'I': '', 'J': '', 'K': 'ᐠ', 'L': '', 'M': '', 'N': '',
    'O': '', 'P': 'ᑭ', 'Q': 'ᕴ', 'R': '', 'S': '', 'T': '', 'U': '',
    'V': 'ᐯ', 'W': '', 'X': '', 'Y': '', 'Z': '',
}
38
 
39
# Enclosed Alphanumerics: ⓐ–ⓩ (U+24D0–U+24E9) and Ⓐ–Ⓩ (U+24B6–U+24CF)
# are contiguous codepoint runs, so the table is generated rather than
# spelled out letter by letter.
CIRCLED_MAP = {
    **{chr(ord('a') + offset): chr(0x24D0 + offset) for offset in range(26)},
    **{chr(ord('A') + offset): chr(0x24B6 + offset) for offset in range(26)},
}
49
+
50
# Squared Latin capitals 🄰–🅉 (U+1F130–U+1F149) form one contiguous run;
# Unicode defines no lowercase squared forms, so both cases share the
# capital glyph.
_SQUARED_CAPS = {chr(ord('A') + offset): chr(0x1F130 + offset) for offset in range(26)}

SQUARED_MAP = {
    **_SQUARED_CAPS,
    **{letter.lower(): glyph for letter, glyph in _SQUARED_CAPS.items()},
}
60
 
61
  STYLES = {
62
+ 'small_caps': ('Small Caps', SMALL_CAPS_MAP),
63
+ 'canadian_aboriginal': ('Canadian Aboriginal', CANADIAN_ABORIGINAL_MAP),
64
+ 'circled': ('Circled Letters', CIRCLED_MAP),
65
+ 'squared': ('Squared Letters', SQUARED_MAP),
66
  }
67
 
68
+ # =============================================================================
69
+ # Model Configuration
70
+ # =============================================================================
71
+
72
  MODELS = {
73
+ 'gemma': {
74
+ 'name': 'Gemma-2-2b-it',
75
+ 'repo_id': 'bartowski/gemma-2-2b-it-GGUF',
76
+ 'filename': 'gemma-2-2b-it-Q4_K_M.gguf',
77
+ 'chat_format': 'gemma',
78
+ },
79
+ 'phi': {
80
+ 'name': 'Phi-3-mini-4k',
81
+ 'repo_id': 'microsoft/Phi-3-mini-4k-instruct-gguf',
82
+ 'filename': 'Phi-3-mini-4k-instruct-q4.gguf',
83
+ 'chat_format': 'chatml',
84
+ },
85
+ 'qwen': {
86
+ 'name': 'Qwen2.5-3B',
87
+ 'repo_id': 'Qwen/Qwen2.5-3B-Instruct-GGUF',
88
+ 'filename': 'qwen2.5-3b-instruct-q4_k_m.gguf',
89
+ 'chat_format': 'chatml',
90
+ },
91
  }
92
 
93
# Global model cache (only keep one model loaded at a time to save memory)
# Both are managed exclusively by load_model().
_current_model = None        # the currently resident Llama instance, or None
_current_model_name = None   # key into MODELS for the resident instance, or None
 
 
 
 
 
 
96
 
97
+ # =============================================================================
98
+ # Core Functions
99
+ # =============================================================================
100
 
101
def transform_text(text: str, style: str) -> str:
    """Rewrite *text* using the character map registered for *style*.

    Unrecognised style keys and unmapped characters pass through
    unchanged.
    """
    entry = STYLES.get(style)
    if entry is None:
        return text
    _, mapping = entry
    return ''.join(mapping.get(symbol, symbol) for symbol in text)
107
 
108
 
109
def load_model(model_key: str) -> Llama:
    """Return the Llama instance for *model_key*, loading it on demand.

    At most one model is kept in memory: asking for a different key
    frees the previous instance first before downloading/loading the
    new one.
    """
    global _current_model, _current_model_name

    # Cache hit: the requested model is already resident.
    if _current_model is not None and _current_model_name == model_key:
        return _current_model

    # Cache miss: drop whatever is loaded to release its memory first.
    if _current_model is not None:
        del _current_model
        _current_model = None
        _current_model_name = None

    cfg = MODELS[model_key]
    _current_model = Llama.from_pretrained(
        repo_id=cfg['repo_id'],
        filename=cfg['filename'],
        n_ctx=2048,
        n_threads=4,
        verbose=False,
    )
    _current_model_name = model_key
    return _current_model
132
+
133
+
134
def get_prediction(model: "Llama", text: str, task: str, model_key: str) -> str:
    """Classify *text* with *model* and normalise the output to a task label.

    Args:
        model: Loaded llama-cpp chat model (anything exposing
            create_chat_completion with the same return shape).
        text: The (possibly Unicode-styled) input to classify.
        task: 'fact_verification' selects the claim-checking prompt;
            any other value selects the argument-mining prompt.
        model_key: Key into MODELS; kept for interface compatibility
            (currently unused by this function).

    Returns:
        One of the task's valid labels; falls back to the task's last
        label when the model output contains none of them.
    """
    if task == 'fact_verification':
        system_prompt = "You are a fact-checking assistant. Classify claims as SUPPORTS, REFUTES, or NOT_ENOUGH_INFO. Reply with only one word."
        user_prompt = f"Classify this claim: {text}"
        valid_labels = ['SUPPORTS', 'REFUTES', 'NOT_ENOUGH_INFO']
    else:
        system_prompt = "You are a text classifier. Determine if text is an argument or not. Reply with only ARGUMENT or NOT_ARGUMENT."
        user_prompt = f"Is this an argument? {text}"
        valid_labels = ['ARGUMENT', 'NOT_ARGUMENT']

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = model.create_chat_completion(
        messages=messages,
        max_tokens=20,
        temperature=0,
    )

    output = response['choices'][0]['message']['content'].strip().upper()

    # BUG FIX: match longest labels first. 'ARGUMENT' is a substring of
    # 'NOT_ARGUMENT', so checking labels in declaration order misread
    # every NOT_ARGUMENT reply as ARGUMENT.
    for label in sorted(valid_labels, key=len, reverse=True):
        if label in output:
            return label

    # Default fallback when no valid label appears in the output.
    return valid_labels[-1]
 
 
 
 
 
 
 
 
 
166
 
167
 
168
def run_attack(text: str, style: str, model_key: str, task: str):
    """Run the Unicode attack and compare predictions.

    Generator: yields 5-tuples of (styled_text, original_prediction,
    styled_prediction, status_message, result_color) so Gradio can show
    progress while the model loads and runs.
    """
    if not text.strip():
        # BUG FIX: this function is a generator, so `return <tuple>`
        # becomes StopIteration.value and is swallowed by Gradio — the
        # "Please enter some text." message never reached the UI. It
        # must be yielded instead.
        yield "", "", "", "Please enter some text.", ""
        return

    try:
        # Transform text
        styled_text = transform_text(text, style)

        # Load model (shows progress)
        yield styled_text, "Loading model...", "", "Loading model (this may take a moment)...", ""
        model = load_model(model_key)

        # Get original prediction
        yield styled_text, "Running...", "", "Getting prediction for original text...", ""
        original_pred = get_prediction(model, text, task, model_key)

        # Get styled prediction
        yield styled_text, original_pred, "Running...", "Getting prediction for styled text...", ""
        styled_pred = get_prediction(model, styled_text, task, model_key)

        # Determine result
        if original_pred != styled_pred:
            status = f"ATTACK SUCCEEDED: Prediction changed from {original_pred} to {styled_pred}"
            result_color = "green"
        else:
            status = f"Attack failed: Prediction unchanged ({original_pred})"
            result_color = "red"

        yield styled_text, original_pred, styled_pred, status, result_color

    except Exception as e:
        # Top-level UI boundary: surface the failure to the user rather
        # than crashing the event handler.
        yield "", "", "", f"Error: {str(e)}", "red"
202
 
203
 
204
def preview_all_styles(text: str) -> str:
    """Render *text* once per registered Unicode style for comparison."""
    if not text.strip():
        return "Enter text to preview."

    parts = [f"Original: {text}", "=" * 50]
    parts.extend(
        f"\n{label}:\n{transform_text(text, key)}"
        for key, (label, _) in STYLES.items()
    )
    return '\n'.join(parts)
214
+
215
+
216
+ # =============================================================================
217
+ # Gradio Interface
218
+ # =============================================================================
219
+
220
def create_demo():
    """Create the Gradio demo interface.

    Returns:
        The assembled gr.Blocks app: an attack tab, a style-preview
        tab, and a static About tab.
    """

    with gr.Blocks(
        title="Unicode Adversarial Attack Demo",
        theme=gr.themes.Soft(),
    ) as demo:

        gr.Markdown("""
        # Unicode Adversarial Attack Demo

        Test how LLMs respond to Unicode-styled text. This demo transforms your input
        using special Unicode characters and compares model predictions.

        **Note:** This demo uses quantized models (Q4) for CPU inference.
        Results may differ slightly from full-precision models used in experiments.
        """)

        # --- Tab 1: run a single attack and compare predictions ---
        with gr.Tab("Attack Demo"):
            with gr.Row():
                with gr.Column(scale=1):
                    text_input = gr.Textbox(
                        label="Input Text",
                        lines=3,
                        placeholder="Enter a claim or statement to test...",
                        value="Climate change is primarily caused by human activities.",
                    )

                    with gr.Row():
                        # Dropdown choices are (display label, value key)
                        # pairs; the key is what run_attack receives.
                        style_dropdown = gr.Dropdown(
                            choices=[(STYLES[k][0], k) for k in STYLES],
                            label="Unicode Style",
                            value="canadian_aboriginal",
                        )
                        model_dropdown = gr.Dropdown(
                            choices=[(MODELS[k]['name'], k) for k in MODELS],
                            label="Model",
                            value="phi",
                        )

                    task_dropdown = gr.Dropdown(
                        choices=[
                            ("Fact Verification", "fact_verification"),
                            ("Argument Mining", "argument_mining"),
                        ],
                        label="Task",
                        value="fact_verification",
                    )

                    run_btn = gr.Button("Run Attack", variant="primary", size="lg")

                with gr.Column(scale=1):
                    styled_output = gr.Textbox(label="Styled Text", lines=3)

                    with gr.Row():
                        original_pred = gr.Textbox(label="Original Prediction")
                        styled_pred = gr.Textbox(label="Styled Prediction")

                    status_output = gr.Textbox(label="Result", lines=2)
                    # Receives run_attack's result-colour string; not rendered
                    # anywhere yet — presumably reserved for future styling
                    # (TODO confirm).
                    result_state = gr.State("")

            # run_attack is a generator, so Gradio streams each yielded
            # 5-tuple into these five outputs as progress updates.
            run_btn.click(
                fn=run_attack,
                inputs=[text_input, style_dropdown, model_dropdown, task_dropdown],
                outputs=[styled_output, original_pred, styled_pred, status_output, result_state],
            )

        # --- Tab 2: preview every style without touching a model ---
        with gr.Tab("Style Preview"):
            gr.Markdown("### Preview Unicode Styles")
            gr.Markdown("See how your text looks in each Unicode style before running an attack.")

            preview_input = gr.Textbox(
                label="Enter text",
                placeholder="Type something...",
                value="Climate change is real",
            )
            preview_btn = gr.Button("Preview All Styles")
            preview_output = gr.Textbox(label="Styled Versions", lines=15)

            preview_btn.click(
                fn=preview_all_styles,
                inputs=[preview_input],
                outputs=[preview_output],
            )

        # --- Tab 3: static project write-up ---
        with gr.Tab("About"):
            gr.Markdown("""
            ## About This Demo

            This demo accompanies the research project:

            **"Unicode-Based Adversarial Attacks on Large Language Models"**

            ### Key Findings (Phase 1 Experiments)

            | Metric | Value |
            |--------|-------|
            | Total Samples Tested | 59,376 |
            | Overall Attack Success Rate | 50.2% |
            | Most Vulnerable Model | Phi-3-mini (58.8% ASR) |
            | Most Robust Model | Gemma-2-2b (39.0% ASR) |
            | Most Effective Style | Canadian Aboriginal (56.5% ASR) |

            ### Models Used

            | Model | Parameters | Quantization |
            |-------|------------|--------------|
            | Gemma-2-2b-it | 2B | Q4_K_M |
            | Phi-3-mini-4k | 3.8B | Q4 |
            | Qwen2.5-3B | 3B | Q4_K_M |

            ### Unicode Styles

            - **Small Caps**: ᴛᴇxᴛ ʟɪᴋᴇ ᴛʜɪꜱ
            - **Canadian Aboriginal**: ᑦᕪᔆᐩ ᒻᐃᐠᕪ ᑦᑋᐃᔆ
            - **Circled Letters**: ⓣⓔⓧⓣ ⓛⓘⓚⓔ ⓣⓗⓘⓢ
            - **Squared Letters**: 🅃🄴🅇🅃 🄻🄸🄺🄴 🅃🄷🄸🅂

            ---

            **Student:** Endrin Hoti (King's College London)
            **Supervisor:** Dr. Oana Cocarascu
            """)

        gr.Markdown("""
        ---
        *First query may be slow while the model downloads and loads (~2GB per model).*
        """)

    return demo
350
+
351
+
352
+ # =============================================================================
353
+ # Entry Point
354
+ # =============================================================================
355
 
356
if __name__ == "__main__":
    # Build the Gradio app and block on its server loop.
    demo = create_demo()
    demo.launch()