Spaces:
Running
Running
Add Unicode attack demo app
Browse files- README.md +28 -6
- app.py +279 -0
- requirements.txt +2 -0
README.md
CHANGED
|
@@ -1,13 +1,35 @@
|
|
| 1 |
---
|
| 2 |
-
title: Unicode Attack Demo
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Unicode Adversarial Attack Demo
|
| 3 |
+
emoji: 🔤
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# Unicode Adversarial Attack Demo
|
| 14 |
+
|
| 15 |
+
Interactive demonstration of how Unicode character substitutions can fool Large Language Models.
|
| 16 |
+
|
| 17 |
+
## What This Does
|
| 18 |
+
|
| 19 |
+
This demo transforms text using special Unicode characters (like Canadian Aboriginal Syllabics or Circled Letters) and tests whether the transformation changes an LLM's prediction.
|
| 20 |
+
|
| 21 |
+
## Research Findings
|
| 22 |
+
|
| 23 |
+
Tested on 59,376 samples across 3 models and 4 Unicode styles:
|
| 24 |
+
|
| 25 |
+
- **Overall Attack Success Rate:** 50.2%
|
| 26 |
+
- **Most Vulnerable Model:** Phi-3-mini (58.8% ASR)
|
| 27 |
+
- **Most Robust Model:** Gemma-2-2b (39.0% ASR)
|
| 28 |
+
- **Most Effective Style:** Canadian Aboriginal (56.5% ASR)
|
| 29 |
+
|
| 30 |
+
## Project
|
| 31 |
+
|
| 32 |
+
**Title:** Unicode-Based Adversarial Attacks on Large Language Models
|
| 33 |
+
**Author:** Endrin Hoti
|
| 34 |
+
**Institution:** King's College London
|
| 35 |
+
**Supervisor:** Dr. Oana Cocarascu
|
app.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unicode Adversarial Attack Demo - HuggingFace Spaces Version
|
| 3 |
+
Uses Inference API instead of local model loading.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import os
|
| 8 |
+
from huggingface_hub import InferenceClient
|
| 9 |
+
|
| 10 |
+
# Unicode transformation mappings
|
| 11 |
+
# Latin letter -> small-capital look-alike. 's', 'x' and every uppercase
# letter map to themselves, so those characters pass through unchanged.
# The source and target alphabets are split into the same 7/7/7/5 groups so
# the pairing can be checked by eye.
SMALL_CAPS_MAP = dict(zip(
    'abcdefg' 'hijklmn' 'opqrstu' 'vwxyz'
    'ABCDEFG' 'HIJKLMN' 'OPQRSTU' 'VWXYZ',
    'ᴀʙᴄᴅᴇꜰɢ' 'ʜɪᴊᴋʟᴍɴ' 'ᴏᴘǫʀsᴛᴜ' 'ᴠᴡxʏᴢ'
    'ABCDEFG' 'HIJKLMN' 'OPQRSTU' 'VWXYZ',
))
|
| 21 |
+
|
| 22 |
+
# Latin letter -> Canadian Aboriginal Syllabics glyph. 'g' and 'q' share the
# same target glyph. Alphabets are grouped 7/7/7/5 on both sides so the
# pairing can be checked by eye.
CANADIAN_ABORIGINAL_MAP = dict(zip(
    'abcdefg' 'hijklmn' 'opqrstu' 'vwxyz'
    'ABCDEFG' 'HIJKLMN' 'OPQRSTU' 'VWXYZ',
    'ᐞᒃᑦᒄᕪᕝᕐ' 'ᑋᑊᒢᐟᒻᔿᐢ' 'ᐤᓐᕐᔇᔆᐩᐡ' 'ᘁᐜᕽᔉᙆ'
    'ᗩᗷᑕᐅᕮᒋᘜ' 'ᕼᓵᒎᐠᖶᘻᘯ' 'ᗜᑭᕴᖇᔕᘕᑌ' 'ᐯᗐ᙭ᖻᗱ',
))
|
| 32 |
+
|
| 33 |
+
# Lowercase Latin letter -> circled letter; uppercase Latin letter ->
# squared capital letter. Alphabets are grouped 7/7/7/5 on both sides so the
# pairing can be checked by eye.
CIRCLED_SQUARED_MAP = dict(zip(
    'abcdefg' 'hijklmn' 'opqrstu' 'vwxyz'
    'ABCDEFG' 'HIJKLMN' 'OPQRSTU' 'VWXYZ',
    'ⓐⓑⓒⓓⓔⓕⓖ' 'ⓗⓘⓙⓚⓛⓜⓝ' 'ⓞⓟⓠⓡⓢⓣⓤ' 'ⓥⓦⓧⓨⓩ'
    '🄰🄱🄲🄳🄴🄵🄶' '🄷🄸🄹🄺🄻🄼🄽' '🄾🄿🅀🅁🅂🅃🅄' '🅅🅆🅇🅈🅉',
))
|
| 43 |
+
|
| 44 |
+
# Latin letter -> negative squared capital letter. Lowercase and uppercase
# input share the same 26 targets, so letter case is not preserved by this
# style. Alphabets are grouped 7/7/7/5 on both sides so the pairing can be
# checked by eye.
SQUARED_LETTERS_MAP = dict(zip(
    'abcdefg' 'hijklmn' 'opqrstu' 'vwxyz'
    'ABCDEFG' 'HIJKLMN' 'OPQRSTU' 'VWXYZ',
    '🅰🅱🅲🅳🅴🅵🅶' '🅷🅸🅹🅺🅻🅼🅽' '🅾🅿🆀🆁🆂🆃🆄' '🆅🆆🆇🆈🆉'
    '🅰🅱🅲🅳🅴🅵🅶' '🅷🅸🅹🅺🅻🅼🅽' '🅾🅿🆀🆁🆂🆃🆄' '🆅🆆🆇🆈🆉',
))
|
| 54 |
+
|
| 55 |
+
# Style display name -> character map. The key order is the order in which
# the styles are listed wherever STYLES is iterated.
STYLES = dict([
    ('Small Caps', SMALL_CAPS_MAP),
    ('Canadian Aboriginal', CANADIAN_ABORIGINAL_MAP),
    ('Circled/Squared', CIRCLED_SQUARED_MAP),
    ('Squared Letters', SQUARED_LETTERS_MAP),
])
|
| 61 |
+
|
| 62 |
+
# Model display name -> Hugging Face Hub repository id sent to the
# Inference API.
MODELS = dict([
    ('Gemma-2-2B', 'google/gemma-2-2b-it'),
    ('Phi-3-mini', 'microsoft/Phi-3-mini-4k-instruct'),
    ('Qwen2.5-3B', 'Qwen/Qwen2.5-3B-Instruct'),
])
|
| 67 |
+
|
| 68 |
+
# Module-level slot for the Inference API client; filled in on first use.
client = None


def get_client():
    """Return the module-wide ``InferenceClient``, creating it on first call.

    The client is built with the value of the ``HF_TOKEN`` environment
    variable as its token (``None`` when the variable is not set) and is
    then reused by every later call.
    """
    global client
    if client is not None:
        return client
    client = InferenceClient(token=os.environ.get("HF_TOKEN"))
    return client
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def transform_text(text: str, style: str) -> str:
    """Rewrite ``text`` with the character map registered under ``style``.

    Characters that have no entry in the map are copied through unchanged.
    A ``style`` that is not a key of ``STYLES`` leaves ``text`` untouched.
    """
    substitutions = STYLES.get(style)
    if substitutions is None:
        return text
    # One pass over the string; unmapped code points are kept as they are.
    return text.translate(str.maketrans(substitutions))
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _normalize_label(reply: str, fact_check: bool) -> str:
    """Map a raw model reply onto a label of the requested task.

    Only the labels of the selected task are considered, so a reply can no
    longer be reported with a label that belongs to the other task.
    Falls back to the upper-cased first word of the reply when no label is
    recognised, and to ``"ERROR"`` when the reply is blank.
    """
    if not reply.strip():
        return "ERROR"

    # Keep letters only so that "NOT_ENOUGH_INFO", "Not enough info." and
    # "**Not-enough-info**" all tokenise to NOT / ENOUGH / INFO.
    tokens = "".join(
        ch if ch.isalpha() else " " for ch in reply.upper()
    ).split()
    first = tokens[0] if tokens else ""

    if fact_check:
        if "SUPPORT" in first:
            return "SUPPORTS"
        if "REFUTE" in first:
            return "REFUTES"
        if tokens[:2] == ["NOT", "ENOUGH"]:
            return "NOT_ENOUGH_INFO"
    else:
        # Negative forms are tested first: "NOT_ARGUMENT" / "NON-ARGUMENT"
        # also mention ARGUMENT.
        if "NOT" in first or first.startswith("NON"):
            return "NOT_ARGUMENT"
        if "ARGUMENT" in first:
            return "ARGUMENT"

    return reply.split()[0].upper()


def get_prediction(text: str, model_id: str, task: str) -> str:
    """Classify ``text`` with ``model_id`` through the Inference API.

    Args:
        text: Claim or sentence to classify (plain or Unicode-styled).
        model_id: Model identifier passed as ``model`` to the API call.
        task: ``"Fact Verification"`` selects the fact-checking prompt; any
            other value selects the argument-detection prompt.

    Returns:
        A normalised label for the selected task (``SUPPORTS``, ``REFUTES``
        or ``NOT_ENOUGH_INFO``; ``ARGUMENT`` or ``NOT_ARGUMENT``), the
        upper-cased first word of the reply when no label is recognised,
        ``"ERROR"`` for a blank reply, or ``"ERROR: <message>"`` (message
        cut to 50 characters) when the request raises.
    """
    is_fact_check = task == "Fact Verification"
    if is_fact_check:
        prompt = (
            "You are a fact-checking assistant. Classify the following claim as exactly one of: SUPPORTS, REFUTES, or NOT_ENOUGH_INFO.\n"
            "\n"
            f"Claim: {text}\n"
            "\n"
            "Respond with only one word (SUPPORTS, REFUTES, or NOT_ENOUGH_INFO):"
        )
    else:
        prompt = (
            "You are a text classifier. Determine if the following sentence is an argument or not.\n"
            "\n"
            f"Sentence: {text}\n"
            "\n"
            "Respond with only one word (ARGUMENT or NOT_ARGUMENT):"
        )

    try:
        reply = get_client().text_generation(
            prompt,
            model=model_id,
            max_new_tokens=10,
            # Near-zero temperature; presumably chosen to keep the label
            # stable between the plain and the styled request.
            temperature=0.01,
        ).strip()
    except Exception as e:
        # Deliberate best effort: any failure is surfaced to the caller as
        # an "ERROR: ..." string instead of propagating.
        return f"ERROR: {str(e)[:50]}"

    return _normalize_label(reply, is_fact_check)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def run_attack(text: str, style: str, model_name: str, task: str):
    """Run the Unicode attack and compare the two predictions.

    Args:
        text: Input claim or sentence.
        style: Name of the Unicode style used to transform ``text``.
        model_name: Display name of the model, looked up in ``MODELS``.
        task: Task name forwarded to ``get_prediction``.

    Returns:
        Always a 4-tuple ``(styled_text, original_prediction,
        styled_prediction, status)``. Every return path has the same
        length so the result can be bound to four output components; the
        validation paths fill the unavailable fields with empty strings.
    """
    if not text or not text.strip():
        return "", "", "", "Please enter some text."

    # Transform text
    styled_text = transform_text(text, style)

    # Get model ID
    model_id = MODELS.get(model_name)
    if not model_id:
        return styled_text, "", "", f"Unknown model: {model_name}"

    # Get predictions
    original_pred = get_prediction(text, model_id, task)
    styled_pred = get_prediction(styled_text, model_id, task)

    # Determine result. A failed request is reported by get_prediction as a
    # string containing "ERROR", so it must be ruled out before comparing.
    if "ERROR" in original_pred or "ERROR" in styled_pred:
        status = "Error getting predictions. Try again or check API access."
    elif original_pred != styled_pred:
        status = "ATTACK SUCCEEDED - Prediction changed!"
    else:
        status = "Attack failed - Prediction unchanged"

    return styled_text, original_pred, styled_pred, status
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def preview_all_styles(text: str):
    """Build a Markdown snippet showing ``text`` in every registered style.

    The snippet starts with the original text, followed by one bold-labelled
    paragraph per entry of ``STYLES``. Blank input returns a short hint
    instead.
    """
    if text.strip() == "":
        return "Enter text to see previews."

    sections = [f"**Original:** {text}\n\n"]
    sections.extend(
        f"**{style_name}:** {transform_text(text, style_name)}\n\n"
        for style_name in STYLES
    )
    return "".join(sections)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# Create Gradio interface
# Layout: an intro banner, three tabs (interactive attack, style preview,
# static research results) and a footer with project credits. Components are
# created in the order they should appear.
with gr.Blocks(title="Unicode Attack Demo", theme=gr.themes.Soft()) as demo:
    # Intro banner shown above the tabs.
    gr.Markdown("""
    # Unicode Adversarial Attack Demo

    Test how Unicode-styled text can fool LLMs. This demonstrates research on adversarial robustness.

    **How it works:**
    1. Enter a claim or sentence
    2. Choose a Unicode style (transforms all characters)
    3. Choose a model and task
    4. See if the model's prediction changes
    """)

    # Tab 1: run one attack and compare the two predictions.
    with gr.Tab("Attack Demo"):
        with gr.Row():
            # Left column: attack inputs.
            with gr.Column(scale=1):
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter a claim or sentence...",
                    value="Climate change is caused by human activities.",
                    lines=3
                )
                # Choices come straight from the STYLES keys.
                style_dropdown = gr.Dropdown(
                    choices=list(STYLES.keys()),
                    label="Unicode Style",
                    value="Canadian Aboriginal",
                    info="Canadian Aboriginal is most effective (56.5% ASR)"
                )
                # Choices come straight from the MODELS keys.
                model_dropdown = gr.Dropdown(
                    choices=list(MODELS.keys()),
                    label="Model",
                    value="Phi-3-mini",
                    info="Phi-3 is most vulnerable (58.8% ASR)"
                )
                task_dropdown = gr.Dropdown(
                    choices=["Fact Verification", "Argument Mining"],
                    label="Task",
                    value="Fact Verification"
                )
                run_btn = gr.Button("Run Attack", variant="primary", size="lg")

            # Right column: attack outputs.
            with gr.Column(scale=1):
                styled_output = gr.Textbox(label="Styled Text", lines=3)
                with gr.Row():
                    original_pred_output = gr.Textbox(label="Original Prediction")
                    styled_pred_output = gr.Textbox(label="Styled Prediction")
                status_output = gr.Textbox(label="Result", lines=2)

        # The handler receives the four inputs in this order and its result
        # is bound to the four output components in this order.
        run_btn.click(
            fn=run_attack,
            inputs=[text_input, style_dropdown, model_dropdown, task_dropdown],
            outputs=[styled_output, original_pred_output, styled_pred_output, status_output]
        )

    # Tab 2: show the same text in every style; no model call involved.
    with gr.Tab("Style Preview"):
        gr.Markdown("### Preview All Unicode Styles")
        preview_input = gr.Textbox(
            label="Enter text",
            value="Climate change is real",
            lines=2
        )
        preview_btn = gr.Button("Preview Styles")
        preview_output = gr.Markdown()

        preview_btn.click(
            fn=preview_all_styles,
            inputs=[preview_input],
            outputs=[preview_output]
        )

    # Tab 3: static summary tables; the figures are hard-coded text.
    with gr.Tab("Research Results"):
        gr.Markdown("""
        ### Experiment Results (59,376 samples)

        | Metric | Value |
        |--------|-------|
        | Overall ASR | 50.2% |
        | Most Vulnerable Model | Phi-3-mini (58.8% ASR) |
        | Most Robust Model | Gemma-2-2b (39.0% ASR) |
        | Most Effective Style | Canadian Aboriginal (56.5% ASR) |

        #### By Model
        | Model | Mean ASR |
        |-------|----------|
        | Gemma-2-2b | 39.0% |
        | Qwen2.5-3B | 52.8% |
        | Phi-3-mini | 58.8% |

        #### By Style
        | Style | Mean ASR |
        |-------|----------|
        | Canadian Aboriginal | 56.5% |
        | Circled/Squared | 53.1% |
        | Squared Letters | 53.1% |
        | Small Caps | 38.1% |

        *ASR = Attack Success Rate (% of predictions that changed)*
        """)

    # Footer with project credits, shown below the tabs.
    gr.Markdown("""
    ---
    **Project:** Unicode-Based Adversarial Attacks on LLMs
    **Author:** Endrin Hoti | King's College London
    **Supervisor:** Dr. Oana Cocarascu
    """)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
huggingface_hub>=0.20.0
|