end-rin committed on
Commit
ecebbb9
·
1 Parent(s): 14d697e

Add Unicode attack demo app

Browse files
Files changed (3) hide show
  1. README.md +28 -6
  2. app.py +279 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,13 +1,35 @@
1
  ---
2
- title: Unicode Attack Demo
3
- emoji:
4
- colorFrom: gray
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Unicode Adversarial Attack Demo
3
+ emoji: 🔤
4
+ colorFrom: purple
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
+ # Unicode Adversarial Attack Demo
14
+
15
+ Interactive demonstration of how Unicode character substitutions can fool Large Language Models.
16
+
17
+ ## What This Does
18
+
19
+ This demo transforms text using special Unicode characters (like Canadian Aboriginal Syllabics or Circled Letters) and tests whether the transformation changes an LLM's prediction.
20
+
21
+ ## Research Findings
22
+
23
+ Tested on 59,376 samples across 3 models and 4 Unicode styles:
24
+
25
+ - **Overall Attack Success Rate:** 50.2%
26
+ - **Most Vulnerable Model:** Phi-3-mini (58.8% ASR)
27
+ - **Most Robust Model:** Gemma-2-2b (39.0% ASR)
28
+ - **Most Effective Style:** Canadian Aboriginal (56.5% ASR)
29
+
30
+ ## Project
31
+
32
+ **Title:** Unicode-Based Adversarial Attacks on Large Language Models
33
+ **Author:** Endrin Hoti
34
+ **Institution:** King's College London
35
+ **Supervisor:** Dr. Oana Cocarascu
app.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unicode Adversarial Attack Demo - HuggingFace Spaces Version
3
+ Uses Inference API instead of local model loading.
4
+ """
5
+
6
+ import gradio as gr
7
+ import os
8
+ from huggingface_hub import InferenceClient
9
+
10
+ # Unicode transformation mappings
11
+ SMALL_CAPS_MAP = {
12
+ 'a': 'ᴀ', 'b': 'ʙ', 'c': 'ᴄ', 'd': 'ᴅ', 'e': 'ᴇ', 'f': 'ꜰ', 'g': 'ɢ',
13
+ 'h': 'ʜ', 'i': 'ɪ', 'j': 'ᴊ', 'k': 'ᴋ', 'l': 'ʟ', 'm': 'ᴍ', 'n': 'ɴ',
14
+ 'o': 'ᴏ', 'p': 'ᴘ', 'q': 'ǫ', 'r': 'ʀ', 's': 's', 't': 'ᴛ', 'u': 'ᴜ',
15
+ 'v': 'ᴠ', 'w': 'ᴡ', 'x': 'x', 'y': 'ʏ', 'z': 'ᴢ',
16
+ 'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F', 'G': 'G',
17
+ 'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L', 'M': 'M', 'N': 'N',
18
+ 'O': 'O', 'P': 'P', 'Q': 'Q', 'R': 'R', 'S': 'S', 'T': 'T', 'U': 'U',
19
+ 'V': 'V', 'W': 'W', 'X': 'X', 'Y': 'Y', 'Z': 'Z',
20
+ }
21
+
22
+ CANADIAN_ABORIGINAL_MAP = {
23
+ 'a': 'ᐞ', 'b': 'ᒃ', 'c': 'ᑦ', 'd': 'ᒄ', 'e': 'ᕪ', 'f': 'ᕝ', 'g': 'ᕐ',
24
+ 'h': 'ᑋ', 'i': 'ᑊ', 'j': 'ᒢ', 'k': 'ᐟ', 'l': 'ᒻ', 'm': 'ᔿ', 'n': 'ᐢ',
25
+ 'o': 'ᐤ', 'p': 'ᓐ', 'q': 'ᕐ', 'r': 'ᔇ', 's': 'ᔆ', 't': 'ᐩ', 'u': 'ᐡ',
26
+ 'v': 'ᘁ', 'w': 'ᐜ', 'x': 'ᕽ', 'y': 'ᔉ', 'z': 'ᙆ',
27
+ 'A': 'ᗩ', 'B': 'ᗷ', 'C': 'ᑕ', 'D': 'ᐅ', 'E': 'ᕮ', 'F': 'ᒋ', 'G': 'ᘜ',
28
+ 'H': 'ᕼ', 'I': 'ᓵ', 'J': 'ᒎ', 'K': 'ᐠ', 'L': 'ᖶ', 'M': 'ᘻ', 'N': 'ᘯ',
29
+ 'O': 'ᗜ', 'P': 'ᑭ', 'Q': 'ᕴ', 'R': 'ᖇ', 'S': 'ᔕ', 'T': 'ᘕ', 'U': 'ᑌ',
30
+ 'V': 'ᐯ', 'W': 'ᗐ', 'X': '᙭', 'Y': 'ᖻ', 'Z': 'ᗱ',
31
+ }
32
+
33
+ CIRCLED_SQUARED_MAP = {
34
+ 'a': 'ⓐ', 'b': 'ⓑ', 'c': 'ⓒ', 'd': 'ⓓ', 'e': 'ⓔ', 'f': 'ⓕ', 'g': 'ⓖ',
35
+ 'h': 'ⓗ', 'i': 'ⓘ', 'j': 'ⓙ', 'k': 'ⓚ', 'l': 'ⓛ', 'm': 'ⓜ', 'n': 'ⓝ',
36
+ 'o': 'ⓞ', 'p': 'ⓟ', 'q': 'ⓠ', 'r': 'ⓡ', 's': 'ⓢ', 't': 'ⓣ', 'u': 'ⓤ',
37
+ 'v': 'ⓥ', 'w': 'ⓦ', 'x': 'ⓧ', 'y': 'ⓨ', 'z': 'ⓩ',
38
+ 'A': '🄰', 'B': '🄱', 'C': '🄲', 'D': '🄳', 'E': '🄴', 'F': '🄵', 'G': '🄶',
39
+ 'H': '🄷', 'I': '🄸', 'J': '🄹', 'K': '🄺', 'L': '🄻', 'M': '🄼', 'N': '🄽',
40
+ 'O': '🄾', 'P': '🄿', 'Q': '🅀', 'R': '🅁', 'S': '🅂', 'T': '🅃', 'U': '🅄',
41
+ 'V': '🅅', 'W': '🅆', 'X': '🅇', 'Y': '🅈', 'Z': '🅉',
42
+ }
43
+
44
+ SQUARED_LETTERS_MAP = {
45
+ 'a': '🅰', 'b': '🅱', 'c': '🅲', 'd': '🅳', 'e': '🅴', 'f': '🅵', 'g': '🅶',
46
+ 'h': '🅷', 'i': '🅸', 'j': '🅹', 'k': '🅺', 'l': '🅻', 'm': '🅼', 'n': '🅽',
47
+ 'o': '🅾', 'p': '🅿', 'q': '🆀', 'r': '🆁', 's': '🆂', 't': '🆃', 'u': '🆄',
48
+ 'v': '🆅', 'w': '🆆', 'x': '🆇', 'y': '🆈', 'z': '🆉',
49
+ 'A': '🅰', 'B': '🅱', 'C': '🅲', 'D': '🅳', 'E': '🅴', 'F': '🅵', 'G': '🅶',
50
+ 'H': '🅷', 'I': '🅸', 'J': '🅹', 'K': '🅺', 'L': '🅻', 'M': '🅼', 'N': '🅽',
51
+ 'O': '🅾', 'P': '🅿', 'Q': '🆀', 'R': '🆁', 'S': '🆂', 'T': '🆃', 'U': '🆄',
52
+ 'V': '🆅', 'W': '🆆', 'X': '🆇', 'Y': '🆈', 'Z': '🆉',
53
+ }
54
+
55
+ STYLES = {
56
+ 'Small Caps': SMALL_CAPS_MAP,
57
+ 'Canadian Aboriginal': CANADIAN_ABORIGINAL_MAP,
58
+ 'Circled/Squared': CIRCLED_SQUARED_MAP,
59
+ 'Squared Letters': SQUARED_LETTERS_MAP,
60
+ }
61
+
62
+ MODELS = {
63
+ 'Gemma-2-2B': 'google/gemma-2-2b-it',
64
+ 'Phi-3-mini': 'microsoft/Phi-3-mini-4k-instruct',
65
+ 'Qwen2.5-3B': 'Qwen/Qwen2.5-3B-Instruct',
66
+ }
67
+
68
+ # Initialize client
69
+ client = None
70
+
71
def get_client():
    """Return the shared InferenceClient, creating it on first use.

    The client is stored in the module-level ``client`` variable so that
    later calls reuse the same instance. The access token is read from the
    ``HF_TOKEN`` environment variable; when the variable is unset, ``None``
    is passed as the token.
    """
    global client
    # Fast path: a previous call already built the client.
    if client is not None:
        return client
    client = InferenceClient(token=os.environ.get("HF_TOKEN"))
    return client
77
+
78
+
79
def transform_text(text: str, style: str) -> str:
    """Rewrite ``text`` character by character using a named Unicode style.

    Args:
        text: The string to transform.
        style: A key of ``STYLES`` selecting the character mapping.

    Returns:
        ``text`` with every character that has an entry in the selected
        mapping replaced by its substitute; characters without an entry are
        kept as they are. If ``style`` is not a key of ``STYLES``, ``text``
        is returned unchanged.
    """
    try:
        mapping = STYLES[style]
    except KeyError:
        # Unknown style name: leave the input untouched.
        return text

    pieces = []
    for ch in text:
        pieces.append(mapping.get(ch, ch))
    return ''.join(pieces)
85
+
86
+
87
def _normalize_label(response: str, task: str) -> str:
    """Map a raw model completion onto a canonical label for ``task``.

    Returns "ERROR" for an empty completion, and the upper-cased first word
    when the completion matches none of the task's labels.
    """
    tokens = response.strip().upper().split()
    if not tokens:
        return "ERROR"

    first = tokens[0]
    # Join the leading words so that multi-word spellings such as
    # "NOT ENOUGH INFO" or "NOT-ENOUGH-INFO" are seen as a single label
    # instead of being cut off after the first word.
    head = "_".join(tokens[:3]).replace("-", "_")

    if task == "Fact Verification":
        if "NOT" in first and "NOT_ENOUGH" in head:
            return "NOT_ENOUGH_INFO"
        if "SUPPORT" in first:
            return "SUPPORTS"
        if "REFUTE" in first:
            return "REFUTES"
        return first

    # Any other task uses the argument-mining labels, matching the prompt
    # selection in get_prediction.
    if "NOT" in first:
        return "NOT_ARGUMENT"
    if "ARGUMENT" in first:
        return "ARGUMENT"
    return first


def get_prediction(text: str, model_id: str, task: str) -> str:
    """Get a model prediction for ``text`` using the Inference API.

    Args:
        text: The claim or sentence to classify.
        model_id: Model identifier passed to ``text_generation``.
        task: "Fact Verification" selects the fact-checking prompt; any other
            value selects the argument/not-argument prompt.

    Returns:
        A canonical label for the task (SUPPORTS / REFUTES / NOT_ENOUGH_INFO,
        or ARGUMENT / NOT_ARGUMENT) when the completion can be recognised,
        otherwise the upper-cased first word of the completion. Returns
        "ERROR" for an empty completion and "ERROR: <message>" (message
        truncated to 50 characters) if the request or parsing raises.
    """
    if task == "Fact Verification":
        prompt = f"""You are a fact-checking assistant. Classify the following claim as exactly one of: SUPPORTS, REFUTES, or NOT_ENOUGH_INFO.

Claim: {text}

Respond with only one word (SUPPORTS, REFUTES, or NOT_ENOUGH_INFO):"""
    else:
        prompt = f"""You are a text classifier. Determine if the following sentence is an argument or not.

Sentence: {text}

Respond with only one word (ARGUMENT or NOT_ARGUMENT):"""

    try:
        response = get_client().text_generation(
            prompt,
            model=model_id,
            max_new_tokens=10,
            temperature=0.01,
        )
        # Normalisation is task-aware: previously a fact-verification answer
        # of "NOT ENOUGH INFO" was truncated to "NOT" and reported as
        # NOT_ARGUMENT.
        return _normalize_label(response, task)
    except Exception as e:
        # Deliberate best-effort boundary: the UI shows the failure as text
        # instead of crashing the callback.
        return f"ERROR: {str(e)[:50]}"
126
+
127
+
128
def run_attack(text: str, style: str, model_name: str, task: str):
    """Run the Unicode attack and compare predictions.

    Args:
        text: Original input text.
        style: Name of the Unicode style passed to ``transform_text``.
        model_name: Key of ``MODELS`` selecting the model to query.
        task: Task name forwarded to ``get_prediction``.

    Returns:
        A 4-tuple ``(styled_text, original_pred, styled_pred, status)``.
        Every return path yields exactly four values, one per output
        component wired to this callback. The early exits previously
        returned five values, which did not match the four outputs.
    """
    if not text.strip():
        return "", "", "", "Please enter some text."

    # Transform text
    styled_text = transform_text(text, style)

    # Resolve the display name to a model ID
    model_id = MODELS.get(model_name)
    if not model_id:
        return styled_text, "", "", f"Unknown model: {model_name}"

    # Query the model once for the original and once for the styled text
    original_pred = get_prediction(text, model_id, task)
    styled_pred = get_prediction(styled_text, model_id, task)

    # get_prediction reports failures as strings containing "ERROR"
    if "ERROR" in original_pred or "ERROR" in styled_pred:
        status = "Error getting predictions. Try again or check API access."
    elif original_pred != styled_pred:
        status = "ATTACK SUCCEEDED - Prediction changed!"
    else:
        status = "Attack failed - Prediction unchanged"

    return styled_text, original_pred, styled_pred, status
157
+
158
+
159
def preview_all_styles(text: str):
    """Render ``text`` in every registered Unicode style as Markdown.

    Returns a prompt message when ``text`` is empty or whitespace-only.
    Otherwise returns a Markdown string with the original text followed by
    one bold-labelled entry per key of ``STYLES``, each separated by a blank
    line.
    """
    if text.strip() == "":
        return "Enter text to see previews."

    sections = [f"**Original:** {text}\n\n"]
    sections.extend(
        f"**{name}:** {transform_text(text, name)}\n\n" for name in STYLES
    )
    return "".join(sections)
169
+
170
+
171
+ # Create Gradio interface
172
+ with gr.Blocks(title="Unicode Attack Demo", theme=gr.themes.Soft()) as demo:
173
+ gr.Markdown("""
174
+ # Unicode Adversarial Attack Demo
175
+
176
+ Test how Unicode-styled text can fool LLMs. This demonstrates research on adversarial robustness.
177
+
178
+ **How it works:**
179
+ 1. Enter a claim or sentence
180
+ 2. Choose a Unicode style (transforms all characters)
181
+ 3. Choose a model and task
182
+ 4. See if the model's prediction changes
183
+ """)
184
+
185
+ with gr.Tab("Attack Demo"):
186
+ with gr.Row():
187
+ with gr.Column(scale=1):
188
+ text_input = gr.Textbox(
189
+ label="Input Text",
190
+ placeholder="Enter a claim or sentence...",
191
+ value="Climate change is caused by human activities.",
192
+ lines=3
193
+ )
194
+ style_dropdown = gr.Dropdown(
195
+ choices=list(STYLES.keys()),
196
+ label="Unicode Style",
197
+ value="Canadian Aboriginal",
198
+ info="Canadian Aboriginal is most effective (56.5% ASR)"
199
+ )
200
+ model_dropdown = gr.Dropdown(
201
+ choices=list(MODELS.keys()),
202
+ label="Model",
203
+ value="Phi-3-mini",
204
+ info="Phi-3 is most vulnerable (58.8% ASR)"
205
+ )
206
+ task_dropdown = gr.Dropdown(
207
+ choices=["Fact Verification", "Argument Mining"],
208
+ label="Task",
209
+ value="Fact Verification"
210
+ )
211
+ run_btn = gr.Button("Run Attack", variant="primary", size="lg")
212
+
213
+ with gr.Column(scale=1):
214
+ styled_output = gr.Textbox(label="Styled Text", lines=3)
215
+ with gr.Row():
216
+ original_pred_output = gr.Textbox(label="Original Prediction")
217
+ styled_pred_output = gr.Textbox(label="Styled Prediction")
218
+ status_output = gr.Textbox(label="Result", lines=2)
219
+
220
+ run_btn.click(
221
+ fn=run_attack,
222
+ inputs=[text_input, style_dropdown, model_dropdown, task_dropdown],
223
+ outputs=[styled_output, original_pred_output, styled_pred_output, status_output]
224
+ )
225
+
226
+ with gr.Tab("Style Preview"):
227
+ gr.Markdown("### Preview All Unicode Styles")
228
+ preview_input = gr.Textbox(
229
+ label="Enter text",
230
+ value="Climate change is real",
231
+ lines=2
232
+ )
233
+ preview_btn = gr.Button("Preview Styles")
234
+ preview_output = gr.Markdown()
235
+
236
+ preview_btn.click(
237
+ fn=preview_all_styles,
238
+ inputs=[preview_input],
239
+ outputs=[preview_output]
240
+ )
241
+
242
+ with gr.Tab("Research Results"):
243
+ gr.Markdown("""
244
+ ### Experiment Results (59,376 samples)
245
+
246
+ | Metric | Value |
247
+ |--------|-------|
248
+ | Overall ASR | 50.2% |
249
+ | Most Vulnerable Model | Phi-3-mini (58.8% ASR) |
250
+ | Most Robust Model | Gemma-2-2b (39.0% ASR) |
251
+ | Most Effective Style | Canadian Aboriginal (56.5% ASR) |
252
+
253
+ #### By Model
254
+ | Model | Mean ASR |
255
+ |-------|----------|
256
+ | Gemma-2-2b | 39.0% |
257
+ | Qwen2.5-3B | 52.8% |
258
+ | Phi-3-mini | 58.8% |
259
+
260
+ #### By Style
261
+ | Style | Mean ASR |
262
+ |-------|----------|
263
+ | Canadian Aboriginal | 56.5% |
264
+ | Circled/Squared | 53.1% |
265
+ | Squared Letters | 53.1% |
266
+ | Small Caps | 38.1% |
267
+
268
+ *ASR = Attack Success Rate (% of predictions that changed)*
269
+ """)
270
+
271
+ gr.Markdown("""
272
+ ---
273
+ **Project:** Unicode-Based Adversarial Attacks on LLMs
274
+ **Author:** Endrin Hoti | King's College London
275
+ **Supervisor:** Dr. Oana Cocarascu
276
+ """)
277
+
278
+ if __name__ == "__main__":
279
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=4.0.0
2
+ huggingface_hub>=0.20.0