end-rin committed on
Commit
dbfa5a2
·
verified ·
1 Parent(s): dbdcc05

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +299 -222
app.py CHANGED
@@ -1,281 +1,358 @@
1
  """
2
- Unicode Adversarial Attack Demo - HuggingFace Spaces Version
3
- Uses Inference API instead of local model loading.
 
 
 
 
4
  """
5
 
6
  import gradio as gr
7
- import os
8
- from huggingface_hub import InferenceClient
 
 
 
9
 
10
- # Unicode transformation mappings
11
  SMALL_CAPS_MAP = {
12
  'a': 'ᴀ', 'b': 'ʙ', 'c': 'ᴄ', 'd': 'ᴅ', 'e': 'ᴇ', 'f': 'ꜰ', 'g': 'ɢ',
13
  'h': 'ʜ', 'i': 'ɪ', 'j': 'ᴊ', 'k': 'ᴋ', 'l': 'ʟ', 'm': 'ᴍ', 'n': 'ɴ',
14
- 'o': 'ᴏ', 'p': 'ᴘ', 'q': 'ǫ', 'r': 'ʀ', 's': 's', 't': 'ᴛ', 'u': 'ᴜ',
15
  'v': 'ᴠ', 'w': 'ᴡ', 'x': 'x', 'y': 'ʏ', 'z': 'ᴢ',
16
- 'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F', 'G': 'G',
17
- 'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L', 'M': 'M', 'N': 'N',
18
- 'O': 'O', 'P': 'P', 'Q': 'Q', 'R': 'R', 'S': 'S', 'T': 'T', 'U': 'U',
19
- 'V': 'V', 'W': 'W', 'X': 'X', 'Y': 'Y', 'Z': 'Z',
20
  }
21
 
22
  CANADIAN_ABORIGINAL_MAP = {
23
- 'a': 'ᐞ', 'b': '', 'c': '', 'd': '', 'e': 'ᕪ', 'f': 'ᕝ', 'g': 'ᕐ',
24
- 'h': 'ᑋ', 'i': '', 'j': '', 'k': '', 'l': 'ᒻ', 'm': '', 'n': 'ᐢ',
25
- 'o': '', 'p': '', 'q': '', 'r': '', 's': 'ᔆ', 't': '', 'u': 'ᐡ',
26
- 'v': '', 'w': '', 'x': 'ᕽ', 'y': '', 'z': 'ᙆ',
27
- 'A': '', 'B': '', 'C': '', 'D': '', 'E': '', 'F': '', 'G': '',
28
- 'H': '', 'I': '', 'J': '', 'K': 'ᐠ', 'L': '', 'M': '', 'N': '',
29
- 'O': '', 'P': 'ᑭ', 'Q': 'ᕴ', 'R': '', 'S': '', 'T': '', 'U': '',
30
- 'V': 'ᐯ', 'W': '', 'X': '', 'Y': '', 'Z': '',
31
  }
32
 
33
- CIRCLED_SQUARED_MAP = {
34
  'a': 'ⓐ', 'b': 'ⓑ', 'c': 'ⓒ', 'd': 'ⓓ', 'e': 'ⓔ', 'f': 'ⓕ', 'g': 'ⓖ',
35
  'h': 'ⓗ', 'i': 'ⓘ', 'j': 'ⓙ', 'k': 'ⓚ', 'l': 'ⓛ', 'm': 'ⓜ', 'n': 'ⓝ',
36
  'o': 'ⓞ', 'p': 'ⓟ', 'q': 'ⓠ', 'r': 'ⓡ', 's': 'ⓢ', 't': 'ⓣ', 'u': 'ⓤ',
37
  'v': 'ⓥ', 'w': 'ⓦ', 'x': 'ⓧ', 'y': 'ⓨ', 'z': 'ⓩ',
 
 
 
 
 
 
 
38
  'A': '🄰', 'B': '🄱', 'C': '🄲', 'D': '🄳', 'E': '🄴', 'F': '🄵', 'G': '🄶',
39
  'H': '🄷', 'I': '🄸', 'J': '🄹', 'K': '🄺', 'L': '🄻', 'M': '🄼', 'N': '🄽',
40
  'O': '🄾', 'P': '🄿', 'Q': '🅀', 'R': '🅁', 'S': '🅂', 'T': '🅃', 'U': '🅄',
41
  'V': '🅅', 'W': '🅆', 'X': '🅇', 'Y': '🅈', 'Z': '🅉',
42
- }
43
-
44
- SQUARED_LETTERS_MAP = {
45
- 'a': '🅰', 'b': '🅱', 'c': '🅲', 'd': '🅳', 'e': '🅴', 'f': '🅵', 'g': '🅶',
46
- 'h': '🅷', 'i': '🅸', 'j': '🅹', 'k': '🅺', 'l': '🅻', 'm': '🅼', 'n': '🅽',
47
- 'o': '🅾', 'p': '🅿', 'q': '🆀', 'r': '🆁', 's': '🆂', 't': '🆃', 'u': '🆄',
48
- 'v': '🆅', 'w': '🆆', 'x': '🆇', 'y': '🆈', 'z': '🆉',
49
- 'A': '🅰', 'B': '🅱', 'C': '🅲', 'D': '🅳', 'E': '🅴', 'F': '🅵', 'G': '🅶',
50
- 'H': '🅷', 'I': '🅸', 'J': '🅹', 'K': '🅺', 'L': '🅻', 'M': '🅼', 'N': '🅽',
51
- 'O': '🅾', 'P': '🅿', 'Q': '🆀', 'R': '🆁', 'S': '🆂', 'T': '🆃', 'U': '🆄',
52
- 'V': '🆅', 'W': '🆆', 'X': '🆇', 'Y': '🆈', 'Z': '🆉',
53
  }
54
 
55
  STYLES = {
56
- 'Small Caps': SMALL_CAPS_MAP,
57
- 'Canadian Aboriginal': CANADIAN_ABORIGINAL_MAP,
58
- 'Circled/Squared': CIRCLED_SQUARED_MAP,
59
- 'Squared Letters': SQUARED_LETTERS_MAP,
60
  }
61
 
62
- # Models available on HF Inference API (free tier)
63
- # Note: Phi-3, Gemma, Qwen from our experiments are NOT on free API
64
- # Using similar instruction-tuned models that ARE available
 
65
  MODELS = {
66
- 'Zephyr-7B': 'HuggingFaceH4/zephyr-7b-beta',
67
- 'Mistral-7B': 'mistralai/Mistral-7B-Instruct-v0.2',
68
- 'Falcon-7B': 'tiiuae/falcon-7b-instruct',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  }
70
 
71
- # Initialize client
72
- client = None
73
-
74
- def get_client():
75
- global client
76
- if client is None:
77
- token = os.environ.get("HF_TOKEN")
78
- client = InferenceClient(token=token)
79
- return client
80
 
 
 
 
81
 
82
  def transform_text(text: str, style: str) -> str:
83
  """Transform text using the specified Unicode style."""
84
  if style not in STYLES:
85
  return text
86
- char_map = STYLES[style]
87
  return ''.join(char_map.get(c, c) for c in text)
88
 
89
 
90
- def get_prediction(text: str, model_id: str, task: str) -> str:
91
- """Get model prediction using Inference API chat completion."""
92
- if task == "Fact Verification":
93
- system_msg = "You are a fact-checking assistant. Classify claims as SUPPORTS, REFUTES, or NOT_ENOUGH_INFO. Respond with only one word."
94
- user_msg = f"Classify this claim: {text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  else:
96
- system_msg = "You are a text classifier. Classify sentences as ARGUMENT or NOT_ARGUMENT. Respond with only one word."
97
- user_msg = f"Is this an argument? {text}"
 
98
 
99
- try:
100
- c = get_client()
101
- response = c.chat_completion(
102
- messages=[
103
- {"role": "system", "content": system_msg},
104
- {"role": "user", "content": user_msg}
105
- ],
106
- model=model_id,
107
- max_tokens=15,
108
- temperature=0.01,
109
- )
110
- # Extract the response
111
- result = response.choices[0].message.content.strip().upper()
112
- # Get first word
113
- result = result.split()[0] if result.split() else "ERROR"
114
- # Clean up common variations
115
- if "SUPPORT" in result:
116
- return "SUPPORTS"
117
- if "REFUTE" in result:
118
- return "REFUTES"
119
- if "NOT_ENOUGH" in result or "ENOUGH" in result:
120
- return "NOT_ENOUGH_INFO"
121
- if "ARGUMENT" in result and "NOT" not in result:
122
- return "ARGUMENT"
123
- if "NOT" in result:
124
- return "NOT_ARGUMENT"
125
- return result
126
- except Exception as e:
127
- return f"ERROR: {str(e)[:100]}"
128
 
129
 
130
- def run_attack(text: str, style: str, model_name: str, task: str):
131
  """Run the Unicode attack and compare predictions."""
132
  if not text.strip():
133
- return "", "", "", "Please enter some text."
134
 
135
- # Transform text
136
- styled_text = transform_text(text, style)
 
137
 
138
- # Get model ID
139
- model_id = MODELS.get(model_name)
140
- if not model_id:
141
- return styled_text, "", "", f"Unknown model: {model_name}"
142
 
143
- # Get predictions
144
- original_pred = get_prediction(text, model_id, task)
145
- styled_pred = get_prediction(styled_text, model_id, task)
146
 
147
- # Determine result
148
- if "ERROR" in original_pred or "ERROR" in styled_pred:
149
- status = f"API Error: {original_pred if 'ERROR' in original_pred else styled_pred}"
150
- elif original_pred != styled_pred:
151
- status = f"ATTACK SUCCEEDED! Prediction changed from {original_pred} to {styled_pred}"
152
- else:
153
- status = f"Attack failed - Prediction unchanged: {original_pred}"
 
 
 
 
 
 
 
 
 
 
154
 
155
- return styled_text, original_pred, styled_pred, status
 
156
 
157
 
158
- def preview_all_styles(text: str):
159
- """Preview text in all Unicode styles."""
160
  if not text.strip():
161
- return "Enter text to see previews."
162
-
163
- output = f"**Original:** {text}\n\n"
164
- for style_name in STYLES:
165
- transformed = transform_text(text, style_name)
166
- output += f"**{style_name}:** {transformed}\n\n"
167
- return output
168
-
169
-
170
- # Create Gradio interface
171
- with gr.Blocks(title="Unicode Attack Demo", theme=gr.themes.Soft()) as demo:
172
- gr.Markdown("""
173
- # Unicode Adversarial Attack Demo
174
-
175
- Test how Unicode-styled text can fool LLMs. This demonstrates research on adversarial robustness.
176
-
177
- **How it works:**
178
- 1. Enter a claim or sentence
179
- 2. Choose a Unicode style (transforms all characters)
180
- 3. Choose a model and task
181
- 4. See if the model's prediction changes
182
- """)
183
-
184
- with gr.Tab("Attack Demo"):
185
- with gr.Row():
186
- with gr.Column(scale=1):
187
- text_input = gr.Textbox(
188
- label="Input Text",
189
- placeholder="Enter a claim or sentence...",
190
- value="Climate change is caused by human activities.",
191
- lines=3
192
- )
193
- style_dropdown = gr.Dropdown(
194
- choices=list(STYLES.keys()),
195
- label="Unicode Style",
196
- value="Canadian Aboriginal",
197
- info="Canadian Aboriginal is most effective (56.5% ASR)"
198
- )
199
- model_dropdown = gr.Dropdown(
200
- choices=list(MODELS.keys()),
201
- label="Model",
202
- value="Zephyr-7B",
203
- info="Note: Original models (Phi-3, Gemma, Qwen) not on free API"
204
- )
205
- task_dropdown = gr.Dropdown(
206
- choices=["Fact Verification", "Argument Mining"],
207
- label="Task",
208
- value="Fact Verification"
209
- )
210
- run_btn = gr.Button("Run Attack", variant="primary", size="lg")
211
-
212
- with gr.Column(scale=1):
213
- styled_output = gr.Textbox(label="Styled Text", lines=3)
214
- with gr.Row():
215
- original_pred_output = gr.Textbox(label="Original Prediction")
216
- styled_pred_output = gr.Textbox(label="Styled Prediction")
217
- status_output = gr.Textbox(label="Result", lines=2)
218
-
219
- run_btn.click(
220
- fn=run_attack,
221
- inputs=[text_input, style_dropdown, model_dropdown, task_dropdown],
222
- outputs=[styled_output, original_pred_output, styled_pred_output, status_output]
223
- )
224
-
225
- with gr.Tab("Style Preview"):
226
- gr.Markdown("### Preview All Unicode Styles")
227
- preview_input = gr.Textbox(
228
- label="Enter text",
229
- value="Climate change is real",
230
- lines=2
231
- )
232
- preview_btn = gr.Button("Preview Styles")
233
- preview_output = gr.Markdown()
234
-
235
- preview_btn.click(
236
- fn=preview_all_styles,
237
- inputs=[preview_input],
238
- outputs=[preview_output]
239
- )
240
-
241
- with gr.Tab("Research Results"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  gr.Markdown("""
243
- ### Experiment Results (59,376 samples)
244
-
245
- *Note: The actual experiments used Gemma-2-2B, Phi-3-mini, and Qwen2.5-3B locally.
246
- This demo uses different models available on the free HuggingFace API.*
247
-
248
- | Metric | Value |
249
- |--------|-------|
250
- | Overall ASR | 50.2% |
251
- | Most Vulnerable Model | Phi-3-mini (58.8% ASR) |
252
- | Most Robust Model | Gemma-2-2b (39.0% ASR) |
253
- | Most Effective Style | Canadian Aboriginal (56.5% ASR) |
254
-
255
- #### By Model
256
- | Model | Mean ASR |
257
- |-------|----------|
258
- | Gemma-2-2b | 39.0% |
259
- | Qwen2.5-3B | 52.8% |
260
- | Phi-3-mini | 58.8% |
261
-
262
- #### By Style
263
- | Style | Mean ASR |
264
- |-------|----------|
265
- | Canadian Aboriginal | 56.5% |
266
- | Circled/Squared | 53.1% |
267
- | Squared Letters | 53.1% |
268
- | Small Caps | 38.1% |
269
-
270
- *ASR = Attack Success Rate (% of predictions that changed)*
271
  """)
272
 
273
- gr.Markdown("""
274
- ---
275
- **Project:** Unicode-Based Adversarial Attacks on LLMs
276
- **Author:** Endrin Hoti | King's College London
277
- **Supervisor:** Dr. Oana Cocarascu
278
- """)
279
 
280
  if __name__ == "__main__":
 
281
  demo.launch()
 
1
  """
2
+ Gradio web interface for Unicode adversarial attack demonstration.
3
+
4
+ Uses GGUF quantized models via llama-cpp-python for CPU inference.
5
+ Designed for deployment on HuggingFace Spaces (free CPU tier).
6
+
7
+ Supervisor approved: Feb 9, 2026
8
  """
9
 
10
  import gradio as gr
11
+ from llama_cpp import Llama
12
+
13
+ # =============================================================================
14
+ # Unicode Style Mappings
15
+ # =============================================================================
16
 
 
17
# Maps ASCII letters to Unicode small-caps glyphs. Upper- and lower-case
# input map to the same small-caps form ('x' has no small-caps codepoint
# and stays ASCII).
# BUG FIX: several values were empty strings (e.g. 's' and most capitals),
# which made transform_text silently DELETE those letters instead of
# restyling them; restored following the surviving uppercase entries,
# which all mapped to the matching small-caps glyph.
SMALL_CAPS_MAP = {
    'a': 'ᴀ', 'b': 'ʙ', 'c': 'ᴄ', 'd': 'ᴅ', 'e': 'ᴇ', 'f': 'ꜰ', 'g': 'ɢ',
    'h': 'ʜ', 'i': 'ɪ', 'j': 'ᴊ', 'k': 'ᴋ', 'l': 'ʟ', 'm': 'ᴍ', 'n': 'ɴ',
    'o': 'ᴏ', 'p': 'ᴘ', 'q': 'ǫ', 'r': 'ʀ', 's': 'ꜱ', 't': 'ᴛ', 'u': 'ᴜ',
    'v': 'ᴠ', 'w': 'ᴡ', 'x': 'x', 'y': 'ʏ', 'z': 'ᴢ',
    'A': 'ᴀ', 'B': 'ʙ', 'C': 'ᴄ', 'D': 'ᴅ', 'E': 'ᴇ', 'F': 'ꜰ', 'G': 'ɢ',
    'H': 'ʜ', 'I': 'ɪ', 'J': 'ᴊ', 'K': 'ᴋ', 'L': 'ʟ', 'M': 'ᴍ', 'N': 'ɴ',
    'O': 'ᴏ', 'P': 'ᴘ', 'Q': 'ǫ', 'R': 'ʀ', 'S': 'ꜱ', 'T': 'ᴛ', 'U': 'ᴜ',
    'V': 'ᴠ', 'W': 'ᴡ', 'X': 'x', 'Y': 'ʏ', 'Z': 'ᴢ',
}
27
 
28
# Maps ASCII letters to visually similar Canadian Aboriginal Syllabics
# glyphs.
# NOTE(review): many values are empty strings, which makes transform_text
# silently DELETE those letters rather than restyle them — these look like
# glyphs lost in transit (the surviving entries all carry a real glyph);
# confirm against the original mapping before relying on this style.
CANADIAN_ABORIGINAL_MAP = {
    'a': 'ᐞ', 'b': '', 'c': '', 'd': '', 'e': 'ᕪ', 'f': 'ᕝ', 'g': 'ᕐ',
    'h': 'ᑋ', 'i': '', 'j': '', 'k': '', 'l': 'ᒻ', 'm': '', 'n': 'ᐢ',
    'o': '', 'p': '', 'q': '', 'r': '', 's': 'ᔆ', 't': '', 'u': 'ᐡ',
    'v': '', 'w': '', 'x': 'ᕽ', 'y': '', 'z': 'ᙆ',
    'A': '', 'B': '', 'C': '', 'D': '', 'E': '', 'F': '', 'G': '',
    'H': '', 'I': '', 'J': '', 'K': 'ᐠ', 'L': '', 'M': '', 'N': '',
    'O': '', 'P': 'ᑭ', 'Q': 'ᕴ', 'R': '', 'S': '', 'T': '', 'U': '',
    'V': 'ᐯ', 'W': '', 'X': '', 'Y': '', 'Z': '',
}
38
 
39
# Enclosed Alphanumerics: ⓐ–ⓩ (U+24D0–U+24E9) and Ⓐ–Ⓩ (U+24B6–U+24CF)
# are contiguous codepoint runs, so the table is generated rather than
# spelled out letter by letter.
CIRCLED_MAP = {
    **{chr(ord('a') + offset): chr(0x24D0 + offset) for offset in range(26)},
    **{chr(ord('A') + offset): chr(0x24B6 + offset) for offset in range(26)},
}
49
+
50
# Squared Latin capitals 🄰–🅉 (U+1F130–U+1F149) form one contiguous run;
# Unicode defines no lowercase squared forms, so both cases share the
# capital glyph.
_SQUARED_CAPS = {chr(ord('A') + offset): chr(0x1F130 + offset) for offset in range(26)}

SQUARED_MAP = {
    **_SQUARED_CAPS,
    **{letter.lower(): glyph for letter, glyph in _SQUARED_CAPS.items()},
}
60
 
61
  STYLES = {
62
+ 'small_caps': ('Small Caps', SMALL_CAPS_MAP),
63
+ 'canadian_aboriginal': ('Canadian Aboriginal', CANADIAN_ABORIGINAL_MAP),
64
+ 'circled': ('Circled Letters', CIRCLED_MAP),
65
+ 'squared': ('Squared Letters', SQUARED_MAP),
66
  }
67
 
68
+ # =============================================================================
69
+ # Model Configuration
70
+ # =============================================================================
71
+
72
  MODELS = {
73
+ 'gemma': {
74
+ 'name': 'Gemma-2-2b-it',
75
+ 'repo_id': 'bartowski/gemma-2-2b-it-GGUF',
76
+ 'filename': 'gemma-2-2b-it-Q4_K_M.gguf',
77
+ 'chat_format': 'gemma',
78
+ },
79
+ 'phi': {
80
+ 'name': 'Phi-3-mini-4k',
81
+ 'repo_id': 'microsoft/Phi-3-mini-4k-instruct-gguf',
82
+ 'filename': 'Phi-3-mini-4k-instruct-q4.gguf',
83
+ 'chat_format': 'chatml',
84
+ },
85
+ 'qwen': {
86
+ 'name': 'Qwen2.5-3B',
87
+ 'repo_id': 'Qwen/Qwen2.5-3B-Instruct-GGUF',
88
+ 'filename': 'qwen2.5-3b-instruct-q4_k_m.gguf',
89
+ 'chat_format': 'chatml',
90
+ },
91
  }
92
 
93
# Global model cache (only keep one model loaded at a time to save memory)
# Both are managed exclusively by load_model().
_current_model = None        # the currently resident Llama instance, or None
_current_model_name = None   # key into MODELS for the resident instance, or None
 
 
 
 
 
 
96
 
97
+ # =============================================================================
98
+ # Core Functions
99
+ # =============================================================================
100
 
101
def transform_text(text: str, style: str) -> str:
    """Rewrite *text* using the character map registered for *style*.

    Unrecognised style keys and unmapped characters pass through
    unchanged.
    """
    entry = STYLES.get(style)
    if entry is None:
        return text
    _, mapping = entry
    return ''.join(mapping.get(symbol, symbol) for symbol in text)
107
 
108
 
109
def load_model(model_key: str) -> Llama:
    """Return the Llama instance for *model_key*, loading it on demand.

    At most one model is kept in memory: asking for a different key
    frees the previous instance first before downloading/loading the
    new one.
    """
    global _current_model, _current_model_name

    # Cache hit: the requested model is already resident.
    if _current_model is not None and _current_model_name == model_key:
        return _current_model

    # Cache miss: drop whatever is loaded to release its memory first.
    if _current_model is not None:
        del _current_model
        _current_model = None
        _current_model_name = None

    cfg = MODELS[model_key]
    _current_model = Llama.from_pretrained(
        repo_id=cfg['repo_id'],
        filename=cfg['filename'],
        n_ctx=2048,
        n_threads=4,
        verbose=False,
    )
    _current_model_name = model_key
    return _current_model
132
+
133
+
134
def get_prediction(model: "Llama", text: str, task: str, model_key: str) -> str:
    """Classify *text* with *model* and normalise the output to a task label.

    Args:
        model: Loaded llama-cpp chat model (anything exposing
            create_chat_completion with the same return shape).
        text: The (possibly Unicode-styled) input to classify.
        task: 'fact_verification' selects the claim-checking prompt;
            any other value selects the argument-mining prompt.
        model_key: Key into MODELS; kept for interface compatibility
            (currently unused by this function).

    Returns:
        One of the task's valid labels; falls back to the task's last
        label when the model output contains none of them.
    """
    if task == 'fact_verification':
        system_prompt = "You are a fact-checking assistant. Classify claims as SUPPORTS, REFUTES, or NOT_ENOUGH_INFO. Reply with only one word."
        user_prompt = f"Classify this claim: {text}"
        valid_labels = ['SUPPORTS', 'REFUTES', 'NOT_ENOUGH_INFO']
    else:
        system_prompt = "You are a text classifier. Determine if text is an argument or not. Reply with only ARGUMENT or NOT_ARGUMENT."
        user_prompt = f"Is this an argument? {text}"
        valid_labels = ['ARGUMENT', 'NOT_ARGUMENT']

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = model.create_chat_completion(
        messages=messages,
        max_tokens=20,
        temperature=0,
    )

    output = response['choices'][0]['message']['content'].strip().upper()

    # BUG FIX: match longest labels first. 'ARGUMENT' is a substring of
    # 'NOT_ARGUMENT', so checking labels in declaration order misread
    # every NOT_ARGUMENT reply as ARGUMENT.
    for label in sorted(valid_labels, key=len, reverse=True):
        if label in output:
            return label

    # Default fallback when no valid label appears in the output.
    return valid_labels[-1]
 
 
 
 
 
 
 
 
 
166
 
167
 
168
def run_attack(text: str, style: str, model_key: str, task: str):
    """Run the Unicode attack and compare predictions.

    Generator: yields 5-tuples of (styled_text, original_prediction,
    styled_prediction, status_message, result_color) so Gradio can show
    progress while the model loads and runs.
    """
    if not text.strip():
        # BUG FIX: this function is a generator, so `return <tuple>`
        # becomes StopIteration.value and is swallowed by Gradio — the
        # "Please enter some text." message never reached the UI. It
        # must be yielded instead.
        yield "", "", "", "Please enter some text.", ""
        return

    try:
        # Transform text
        styled_text = transform_text(text, style)

        # Load model (shows progress)
        yield styled_text, "Loading model...", "", "Loading model (this may take a moment)...", ""
        model = load_model(model_key)

        # Get original prediction
        yield styled_text, "Running...", "", "Getting prediction for original text...", ""
        original_pred = get_prediction(model, text, task, model_key)

        # Get styled prediction
        yield styled_text, original_pred, "Running...", "Getting prediction for styled text...", ""
        styled_pred = get_prediction(model, styled_text, task, model_key)

        # Determine result
        if original_pred != styled_pred:
            status = f"ATTACK SUCCEEDED: Prediction changed from {original_pred} to {styled_pred}"
            result_color = "green"
        else:
            status = f"Attack failed: Prediction unchanged ({original_pred})"
            result_color = "red"

        yield styled_text, original_pred, styled_pred, status, result_color

    except Exception as e:
        # Top-level UI boundary: surface the failure to the user rather
        # than crashing the event handler.
        yield "", "", "", f"Error: {str(e)}", "red"
202
 
203
 
204
def preview_all_styles(text: str) -> str:
    """Render *text* once per registered Unicode style for comparison."""
    if not text.strip():
        return "Enter text to preview."

    parts = [f"Original: {text}", "=" * 50]
    parts.extend(
        f"\n{label}:\n{transform_text(text, key)}"
        for key, (label, _) in STYLES.items()
    )
    return '\n'.join(parts)
214
+
215
+
216
+ # =============================================================================
217
+ # Gradio Interface
218
+ # =============================================================================
219
+
220
def create_demo():
    """Create the Gradio demo interface.

    Returns:
        The assembled gr.Blocks app: an attack tab, a style-preview
        tab, and a static About tab.
    """

    with gr.Blocks(
        title="Unicode Adversarial Attack Demo",
        theme=gr.themes.Soft(),
    ) as demo:

        gr.Markdown("""
        # Unicode Adversarial Attack Demo

        Test how LLMs respond to Unicode-styled text. This demo transforms your input
        using special Unicode characters and compares model predictions.

        **Note:** This demo uses quantized models (Q4) for CPU inference.
        Results may differ slightly from full-precision models used in experiments.
        """)

        # --- Tab 1: run a single attack and compare predictions ---
        with gr.Tab("Attack Demo"):
            with gr.Row():
                with gr.Column(scale=1):
                    text_input = gr.Textbox(
                        label="Input Text",
                        lines=3,
                        placeholder="Enter a claim or statement to test...",
                        value="Climate change is primarily caused by human activities.",
                    )

                    with gr.Row():
                        # Dropdown choices are (display label, value key)
                        # pairs; the key is what run_attack receives.
                        style_dropdown = gr.Dropdown(
                            choices=[(STYLES[k][0], k) for k in STYLES],
                            label="Unicode Style",
                            value="canadian_aboriginal",
                        )
                        model_dropdown = gr.Dropdown(
                            choices=[(MODELS[k]['name'], k) for k in MODELS],
                            label="Model",
                            value="phi",
                        )

                    task_dropdown = gr.Dropdown(
                        choices=[
                            ("Fact Verification", "fact_verification"),
                            ("Argument Mining", "argument_mining"),
                        ],
                        label="Task",
                        value="fact_verification",
                    )

                    run_btn = gr.Button("Run Attack", variant="primary", size="lg")

                with gr.Column(scale=1):
                    styled_output = gr.Textbox(label="Styled Text", lines=3)

                    with gr.Row():
                        original_pred = gr.Textbox(label="Original Prediction")
                        styled_pred = gr.Textbox(label="Styled Prediction")

                    status_output = gr.Textbox(label="Result", lines=2)
                    # Receives run_attack's result-colour string; not rendered
                    # anywhere yet — presumably reserved for future styling
                    # (TODO confirm).
                    result_state = gr.State("")

            # run_attack is a generator, so Gradio streams each yielded
            # 5-tuple into these five outputs as progress updates.
            run_btn.click(
                fn=run_attack,
                inputs=[text_input, style_dropdown, model_dropdown, task_dropdown],
                outputs=[styled_output, original_pred, styled_pred, status_output, result_state],
            )

        # --- Tab 2: preview every style without touching a model ---
        with gr.Tab("Style Preview"):
            gr.Markdown("### Preview Unicode Styles")
            gr.Markdown("See how your text looks in each Unicode style before running an attack.")

            preview_input = gr.Textbox(
                label="Enter text",
                placeholder="Type something...",
                value="Climate change is real",
            )
            preview_btn = gr.Button("Preview All Styles")
            preview_output = gr.Textbox(label="Styled Versions", lines=15)

            preview_btn.click(
                fn=preview_all_styles,
                inputs=[preview_input],
                outputs=[preview_output],
            )

        # --- Tab 3: static project write-up ---
        with gr.Tab("About"):
            gr.Markdown("""
            ## About This Demo

            This demo accompanies the research project:

            **"Unicode-Based Adversarial Attacks on Large Language Models"**

            ### Key Findings (Phase 1 Experiments)

            | Metric | Value |
            |--------|-------|
            | Total Samples Tested | 59,376 |
            | Overall Attack Success Rate | 50.2% |
            | Most Vulnerable Model | Phi-3-mini (58.8% ASR) |
            | Most Robust Model | Gemma-2-2b (39.0% ASR) |
            | Most Effective Style | Canadian Aboriginal (56.5% ASR) |

            ### Models Used

            | Model | Parameters | Quantization |
            |-------|------------|--------------|
            | Gemma-2-2b-it | 2B | Q4_K_M |
            | Phi-3-mini-4k | 3.8B | Q4 |
            | Qwen2.5-3B | 3B | Q4_K_M |

            ### Unicode Styles

            - **Small Caps**: ᴛᴇxᴛ ʟɪᴋᴇ ᴛʜɪꜱ
            - **Canadian Aboriginal**: ᑦᕪᔆᐩ ᒻᐃᐠᕪ ᑦᑋᐃᔆ
            - **Circled Letters**: ⓣⓔⓧⓣ ⓛⓘⓚⓔ ⓣⓗⓘⓢ
            - **Squared Letters**: 🅃🄴🅇🅃 🄻🄸🄺🄴 🅃🄷🄸🅂

            ---

            **Student:** Endrin Hoti (King's College London)
            **Supervisor:** Dr. Oana Cocarascu
            """)

        gr.Markdown("""
        ---
        *First query may be slow while the model downloads and loads (~2GB per model).*
        """)

    return demo
350
+
351
+
352
+ # =============================================================================
353
+ # Entry Point
354
+ # =============================================================================
355
 
356
if __name__ == "__main__":
    # Build the Gradio app and block on its server loop.
    demo = create_demo()
    demo.launch()