feifeinoban committed on
Commit
7736878
·
1 Parent(s): 94eb760

use static HTML

Browse files
Files changed (1) hide show
  1. index.html +462 -220
index.html CHANGED
@@ -3,10 +3,10 @@
3
  <head>
4
  <meta charset="utf-8">
5
  <meta name="description"
6
- content="Shell: A Metacognition-Driven Safety Framework for Domain-Specific LLMs">
7
- <meta name="keywords" content="LLM Safety, Metacognition, AI Alignment, Activation Steering">
8
  <meta name="viewport" content="width=device-width, initial-scale=1">
9
- <title>Shell: Metacognition-Driven Safety for Domain-Specific LLMs</title>
10
 
11
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
12
  rel="stylesheet">
@@ -18,28 +18,32 @@
18
 
19
  <style>
20
  :root {
21
- --shell-primary: #3498db;
22
- --shell-secondary: #9b59b6;
23
- --shell-accent: #2ecc71;
 
24
  }
25
 
26
  body {
27
  font-family: 'Google Sans', 'Noto Sans', sans-serif;
28
  line-height: 1.6;
 
29
  }
30
 
31
  .hero {
32
- background: linear-gradient(135deg, var(--shell-primary), var(--shell-secondary));
33
  color: white;
 
34
  }
35
 
36
  .publication-title {
37
  color: white;
38
- margin-bottom: 1rem;
 
39
  }
40
 
41
  .publication-authors {
42
- margin-bottom: 1rem;
43
  }
44
 
45
  .author-block {
@@ -48,134 +52,227 @@
48
  }
49
 
50
  .publication-links {
51
- margin-top: 1.5rem;
52
  }
53
 
54
  .link-block {
55
  display: inline-block;
56
- margin: 0 0.5rem;
57
  }
58
 
59
  .external-link {
60
- transition: transform 0.2s;
61
- background: linear-gradient(135deg, #667eea, #764ba2) !important;
62
- border: none !important;
 
 
63
  }
64
 
65
  .external-link:hover {
66
- transform: translateY(-2px);
67
- background: linear-gradient(135deg, #764ba2, #667eea) !important;
68
- }
69
-
70
- .external-link span {
71
- color: white !important;
72
  }
73
 
74
  .teaser {
75
  padding: 4rem 0;
76
- }
77
-
78
- .teaser video {
79
- border-radius: 10px;
80
- box-shadow: 0 10px 30px rgba(0,0,0,0.3);
81
  }
82
 
83
  .dnerf {
84
  font-weight: bold;
85
- color: var(--shell-primary);
86
- }
87
-
88
- .results-carousel {
89
- margin: 2rem 0;
90
  }
91
 
92
  .section {
93
  padding: 4rem 1.5rem;
94
  }
95
 
 
 
 
 
96
  .content h2, .content h3 {
97
- color: var(--shell-primary);
98
- border-bottom: 2px solid #f5f5f5;
99
  padding-bottom: 0.5rem;
100
  margin-top: 2rem;
101
  }
102
 
103
  .table-container {
104
  margin: 2rem 0;
105
- box-shadow: 0 5px 15px rgba(0,0,0,0.1);
106
  border-radius: 10px;
107
  overflow: hidden;
 
108
  }
109
 
110
  table {
111
  width: 100%;
 
112
  }
113
 
114
  table th {
115
- background: linear-gradient(135deg, var(--shell-primary), var(--shell-secondary));
116
  color: white;
117
  font-weight: 600;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  }
119
 
120
  .badge {
121
  display: inline-block;
122
- padding: 0.5rem 1rem;
123
- background: linear-gradient(135deg, #667eea, #764ba2);
124
  color: white !important;
125
- border-radius: 20px;
126
- margin: 0.25rem;
127
- font-size: 0.9rem;
128
  text-decoration: none;
129
  border: none;
 
 
 
130
  }
131
 
132
  .badge:hover {
133
- background: linear-gradient(135deg, #764ba2, #667eea);
 
134
  color: white !important;
135
- transform: translateY(-2px);
136
  }
137
 
138
  .abstract-box {
139
- background: linear-gradient(135deg, #f8f9fa, #e9ecef);
140
- padding: 2rem;
141
- border-radius: 10px;
142
- border-left: 5px solid var(--shell-primary);
143
  margin: 2rem 0;
 
144
  }
145
 
146
  .methodology-step {
147
- margin: 2rem 0;
148
- padding: 1.5rem;
149
- border-radius: 10px;
150
  background: white;
151
- box-shadow: 0 5px 15px rgba(0,0,0,0.1);
152
- border-left: 4px solid var(--shell-accent);
 
 
 
 
 
153
  }
154
 
155
  .results-highlight {
156
  text-align: center;
157
- padding: 2rem;
158
- background: linear-gradient(135deg, var(--shell-primary), var(--shell-secondary));
159
  color: white;
160
- border-radius: 10px;
161
- margin: 2rem 0;
 
162
  }
163
 
164
  .results-highlight .number {
165
- font-size: 3rem;
166
  font-weight: bold;
167
  display: block;
 
 
 
 
 
 
168
  }
169
 
170
  .architecture-image {
171
  width: 100%;
172
- max-width: 800px;
173
  display: block;
174
- margin: 2rem auto;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  border-radius: 10px;
176
- box-shadow: 0 10px 30px rgba(0,0,0,0.2);
 
 
 
 
 
 
 
177
  }
178
- </style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  </head>
180
  <body>
181
 
@@ -184,17 +281,19 @@
184
  <div class="container is-max-desktop">
185
  <div class="columns is-centered">
186
  <div class="column has-text-centered">
187
- <h1 class="title is-1 publication-title">🐚 Shell: Metacognition-Driven Safety Framework for Domain-Specific LLMs</h1>
 
 
188
  <div class="is-size-5 publication-authors">
189
  <span class="author-block">
190
- <a href="#" target="_blank">Wen Wu</a><sup>1</sup>,</span>
191
  <span class="author-block">
192
- <a href="#" target="_blank">Zhenyu Ying</a><sup>1</sup>,</span>
193
  <span class="author-block">
194
- <a href="#" target="_blank">Liang He</a><sup>1</sup>,
195
  </span>
196
  <span class="author-block">
197
- <a href="#" target="_blank">Shell Team</a><sup>1</sup>
198
  </span>
199
  </div>
200
 
@@ -202,48 +301,43 @@
202
  <span class="author-block"><sup>1</sup>Anonymous Submission</span>
203
  </div>
204
 
205
- <div class="column has-text-centered">
206
- <div class="publication-links">
207
- <!-- PDF Link. -->
208
- <span class="link-block">
209
- <a href="#" target="_blank"
210
- class="external-link button is-normal is-rounded is-dark">
211
- <span class="icon">
212
- <i class="fas fa-file-pdf"></i>
213
- </span>
214
- <span>Paper</span>
215
- </a>
216
- </span>
217
- <span class="link-block">
218
- <a href="#" target="_blank"
219
- class="external-link button is-normal is-rounded is-dark">
220
- <span class="icon">
221
- <i class="ai ai-arxiv"></i>
222
- </span>
223
- <span>arXiv</span>
224
- </a>
225
- </span>
226
- <!-- Code Link. -->
227
- <span class="link-block">
228
- <a href="#" target="_blank"
229
- class="external-link button is-normal is-rounded is-dark">
230
- <span class="icon">
231
- <i class="fab fa-github"></i>
232
- </span>
233
- <span>Code</span>
234
- </a>
235
- </span>
236
- <!-- Dataset Link. -->
237
- <span class="link-block">
238
- <a href="#" target="_blank"
239
- class="external-link button is-normal is-rounded is-dark">
240
- <span class="icon">
241
- <i class="far fa-images"></i>
242
- </span>
243
- <span>Dataset</span>
244
- </a>
245
- </span>
246
- </div>
247
  </div>
248
  </div>
249
  </div>
@@ -251,17 +345,38 @@
251
  </div>
252
  </section>
253
 
254
- <section class="hero teaser">
255
  <div class="container is-max-desktop">
256
  <div class="hero-body has-text-centered">
257
  <h2 class="subtitle is-3">
258
- Uncover and mitigate <span class="dnerf">implicit value risks</span> in education, finance, management—and beyond
259
  </h2>
260
- <div class="content">
261
- <a href="#" class="badge">🔒 Model-agnostic</a>
262
- <a href="#" class="badge">🧠 Self-evolving rules</a>
263
- <a href="#" class="badge">⚡ Activation steering</a>
264
- <a href="#" class="badge">📉 90%+ jailbreak reduction</a>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  </div>
266
  </div>
267
  </div>
@@ -269,81 +384,93 @@
269
 
270
  <section class="section">
271
  <div class="container is-max-desktop">
272
- <!-- Abstract. -->
273
  <div class="columns is-centered has-text-centered">
274
  <div class="column is-four-fifths">
275
  <h2 class="title is-3">Abstract</h2>
276
  <div class="abstract-box">
277
  <div class="content has-text-justified">
278
  <p>
279
- While current LLM safety methods focus on explicit harms (e.g., hate speech, violence), they often miss <strong>domain-specific implicit risks</strong>—such as encouraging academic dishonesty in education, promoting reckless trading in finance, or normalizing toxic workplace culture in management.
 
 
 
280
  </p>
281
  <p>
282
- We introduce <strong>Shell</strong>, a metacognition-driven self-evolution framework that enables LLMs to self-diagnose value misalignments via perspective-taking and consequence simulation, builds a hybrid rule system with expert-defined static trees and self-evolved dynamic graphs, and enforces rules at inference time via activation steering.
 
 
 
283
  </p>
284
  <p>
285
- Evaluated on 9,000 risk queries across <strong>education, finance, and management</strong>, Shell reduces average jailbreak rates by <strong>>90%</strong> on models including GPT-5, Qwen3, and Llama 3.1.
 
 
 
286
  </p>
287
  </div>
288
  </div>
289
  </div>
290
  </div>
291
- <!--/ Abstract. -->
292
  </div>
293
  </section>
294
 
295
- <section class="hero is-light is-small">
296
- <div class="hero-body">
297
- <div class="container">
298
- <div class="columns is-centered">
299
- <div class="column is-8 has-text-centered">
300
- <h2 class="title is-3">Core Challenges: Implicit Risks Are Everywhere</h2>
301
- </div>
302
- </div>
303
- <div class="table-container">
304
- <table class="table is-striped is-fullwidth">
305
- <thead>
306
- <tr>
307
- <th>Domain</th>
308
- <th>Example Implicit Risk</th>
309
- <th>Harmful Consequence</th>
310
- </tr>
311
- </thead>
312
- <tbody>
313
- <tr>
314
- <td><strong>Education</strong></td>
315
- <td>Suggesting clever comebacks that escalate bullying</td>
316
- <td>Deteriorates peer relationships</td>
317
- </tr>
318
- <tr>
319
- <td></td>
320
- <td>Framing "sacrificing sleep for grades" as admirable</td>
321
- <td>Promotes unhealthy competition</td>
322
- </tr>
323
- <tr>
324
- <td></td>
325
- <td>Teaching how to "rephrase copied essays"</td>
326
- <td>Undermines academic integrity</td>
327
- </tr>
328
- <tr>
329
- <td><strong>Finance</strong></td>
330
- <td>Encouraging high-leverage speculation as "smart risk"</td>
331
- <td>Normalizes financial recklessness</td>
332
- </tr>
333
- <tr>
334
- <td><strong>Management</strong></td>
335
- <td>Praising "always-on" culture as "dedication"</td>
336
- <td>Reinforces burnout and poor work-life balance</td>
337
- </tr>
338
- </tbody>
339
- </table>
340
- </div>
341
- <div class="has-text-centered">
342
- <p class="is-italic">
343
- 💡 These risks are <strong>not jailbreaks</strong> in the traditional sense—they appear benign but subtly erode domain-specific values.
344
  </p>
345
  </div>
346
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  </div>
348
  </section>
349
 
@@ -351,113 +478,174 @@
351
  <div class="container is-max-desktop">
352
  <div class="columns is-centered">
353
  <div class="column is-four-fifths">
354
- <h2 class="title is-3">Methodology: The MENTOR Architecture</h2>
 
 
 
 
 
355
 
356
  <div class="methodology-step">
 
357
  <h3 class="title is-4">1. Metacognitive Self-Assessment</h3>
358
  <div class="content">
359
- <p>LLMs evaluate their own outputs using:</p>
360
  <ul>
361
  <li><strong>Perspective-taking</strong>: "How would a teacher/parent/regulator view this?"</li>
362
  <li><strong>Consequential thinking</strong>: "What real-world harm could this cause?"</li>
363
  <li><strong>Normative introspection</strong>: "Does this align with core domain ethics?"</li>
 
364
  </ul>
365
- <p>This replaces labor-intensive human labeling with <strong>autonomous, human-aligned reflection</strong>.</p>
366
  </div>
367
  </div>
368
 
369
  <div class="methodology-step">
 
370
  <h3 class="title is-4">2. Rule Evolution Cycle (REC)</h3>
371
  <div class="content">
 
372
  <ul>
373
- <li><strong>Static Rule Tree</strong>: Expert-curated, hierarchical rules (e.g., <code>Education → Academic Integrity → No Plagiarism</code>).</li>
374
- <li><strong>Dynamic Rule Graph</strong>: Automatically generated from successful self-corrections (e.g., <code>&lt;risk: essay outsourcing&gt; → &lt;rule: teach outlining instead&gt;</code>).</li>
375
- <li>Rules evolve via <strong>dual clustering</strong> (by risk type & mitigation strategy), enabling precise retrieval.</li>
376
  </ul>
 
377
  </div>
378
  </div>
379
 
380
  <div class="methodology-step">
 
381
  <h3 class="title is-4">3. Robust Rule Vectors (RV) via Activation Steering</h3>
382
  <div class="content">
 
383
  <ul>
384
- <li>Generate <strong>steering vectors</strong> from contrasting compliant vs. non-compliant responses.</li>
385
- <li>At inference, <strong>add vectors to internal activations</strong> (e.g., Layer 18 of Llama 3.1) to guide behavior.</li>
386
- <li><strong>No fine-tuning needed</strong>—works on closed-source models like GPT-5.</li>
 
387
  </ul>
 
388
  </div>
389
  </div>
390
-
391
- <!-- Architecture Image -->
392
- <img src="https://huggingface.co/spaces/feifeinoban/shell/resolve/main/assets/mentor_arch.png"
393
- alt="MENTOR Architecture"
394
- class="architecture-image">
395
-
396
- <div class="has-text-centered">
397
- <p class="is-italic">
398
- Figure: The MENTOR framework. Shell implements this full pipeline.
399
- </p>
400
- </div>
401
  </div>
402
  </div>
403
  </div>
404
  </section>
405
 
406
- <section class="section">
407
  <div class="container is-max-desktop">
408
  <div class="columns is-centered">
409
  <div class="column is-four-fifths">
410
- <h2 class="title is-3">Results: Strong, Efficient, Generalizable</h2>
411
 
412
  <div class="results-highlight">
413
  <span class="number">>90%</span>
414
- <span class="subtitle">Average Jailbreak Rate Reduction</span>
415
  </div>
416
 
417
- <h3 class="title is-4">Jailbreak Rate Reduction (3,000 queries per domain)</h3>
418
 
419
  <div class="table-container">
420
  <table class="table is-striped is-fullwidth">
421
  <thead>
422
  <tr>
423
  <th>Model</th>
 
424
  <th>Original</th>
425
- <th>+ Shell (Rules + MetaLoop + RV)</th>
426
- <th>Reduction</th>
 
427
  </tr>
428
  </thead>
429
  <tbody>
430
  <tr>
431
- <td><strong>GPT-5</strong></td>
432
- <td>38.39%</td>
433
- <td><strong>0.77%</strong></td>
434
- <td><strong>98.0%</strong></td>
435
- </tr>
436
- <tr>
437
- <td><strong>Qwen3-235B</strong></td>
438
  <td>56.33%</td>
 
 
439
  <td><strong>3.13%</strong></td>
440
- <td><strong>94.4%</strong></td>
441
  </tr>
442
  <tr>
443
- <td><strong>GPT-4o</strong></td>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  <td>58.81%</td>
 
 
445
  <td><strong>6.43%</strong></td>
446
- <td><strong>89.1%</strong></td>
447
  </tr>
448
  <tr>
449
- <td><strong>Llama 3.1-8B</strong></td>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  <td>67.45%</td>
 
451
  <td><strong>31.39%</strong></td>
452
- <td><strong>53.5%</strong></td>
 
 
 
 
 
 
 
 
 
 
 
453
  </tr>
454
  </tbody>
455
  </table>
456
  </div>
457
 
458
- <div class="has-text-centered">
459
- <p class="is-italic">
460
- ✅ Human evaluators prefer Shell-augmented responses <strong>68% of the time</strong> for safety, appropriateness, and usefulness.
 
461
  </p>
462
  </div>
463
  </div>
@@ -469,14 +657,41 @@
469
  <div class="container is-max-desktop">
470
  <div class="columns is-centered">
471
  <div class="column is-four-fifths">
472
- <h2 class="title is-3">Try It / Use It</h2>
473
 
474
  <div class="content">
475
- <h3 class="title is-4">For Researchers</h3>
476
- <ul>
477
- <li><strong>Dataset</strong>: 9,000 implicit-risk queries across 3 domains → [HF Dataset Link]</li>
478
- <li><strong>Code</strong>: Full implementation of REC + RV [GitHub Link] (coming soon)</li>
479
- </ul>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
  </div>
481
  </div>
482
  </div>
@@ -486,20 +701,25 @@
486
  <section class="section" id="BibTeX">
487
  <div class="container is-max-desktop content">
488
  <h2 class="title">BibTeX</h2>
489
- <pre><code>@article{shell2025,
490
- title={Shell: A Metacognition-Driven Safety Framework for Domain-Specific LLMs},
 
491
  author={Wu, Wen and Ying, Zhenyu and He, Liang and Team, Shell},
492
  journal={Anonymous Submission},
493
  year={2025}
494
- }</code></pre>
 
495
  </div>
496
  </section>
497
 
498
- <footer class="footer">
499
  <div class="container">
500
  <div class="content has-text-centered">
501
  <p>
502
- This website is licensed under a <a rel="license" target="_blank"
 
 
 
503
  href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
504
  Commons Attribution-ShareAlike 4.0 International License</a>.
505
  </p>
@@ -508,7 +728,6 @@
508
  </footer>
509
 
510
  <script>
511
- // Simple JavaScript for interactive elements if needed
512
  document.addEventListener('DOMContentLoaded', function() {
513
  // Add smooth scrolling for anchor links
514
  document.querySelectorAll('a[href^="#"]').forEach(anchor => {
@@ -519,6 +738,29 @@
519
  });
520
  });
521
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
  });
523
  </script>
524
 
 
3
  <head>
4
  <meta charset="utf-8">
5
  <meta name="description"
6
+ content="MENTOR: A Metacognition-Driven Self-Evolution Framework for Uncovering and Mitigating Implicit Risks in Domain-Specific LLMs">
7
+ <meta name="keywords" content="LLM Safety, Metacognition, AI Alignment, Activation Steering, Domain-Specific Risks">
8
  <meta name="viewport" content="width=device-width, initial-scale=1">
9
+ <title>MENTOR: Metacognition-Driven Safety Framework for Domain-Specific LLMs</title>
10
 
11
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
12
  rel="stylesheet">
 
18
 
19
  <style>
20
  :root {
21
+ --mentor-primary: #3498db;
22
+ --mentor-secondary: #9b59b6;
23
+ --mentor-accent: #2ecc71;
24
+ --mentor-dark: #2c3e50;
25
  }
26
 
27
  body {
28
  font-family: 'Google Sans', 'Noto Sans', sans-serif;
29
  line-height: 1.6;
30
+ color: #333;
31
  }
32
 
33
  .hero {
34
+ background: linear-gradient(135deg, var(--mentor-primary), var(--mentor-secondary));
35
  color: white;
36
+ padding: 4rem 1.5rem;
37
  }
38
 
39
  .publication-title {
40
  color: white;
41
+ margin-bottom: 1.5rem;
42
+ text-shadow: 0 2px 4px rgba(0,0,0,0.3);
43
  }
44
 
45
  .publication-authors {
46
+ margin-bottom: 1.5rem;
47
  }
48
 
49
  .author-block {
 
52
  }
53
 
54
  .publication-links {
55
+ margin-top: 2rem;
56
  }
57
 
58
  .link-block {
59
  display: inline-block;
60
+ margin: 0 0.5rem 1rem;
61
  }
62
 
63
  .external-link {
64
+ transition: all 0.3s ease;
65
+ background: rgba(255,255,255,0.2) !important;
66
+ border: 2px solid rgba(255,255,255,0.3) !important;
67
+ color: white !important;
68
+ font-weight: 600;
69
  }
70
 
71
  .external-link:hover {
72
+ transform: translateY(-3px);
73
+ background: rgba(255,255,255,0.3) !important;
74
+ border-color: white !important;
75
+ box-shadow: 0 5px 15px rgba(0,0,0,0.2);
 
 
76
  }
77
 
78
  .teaser {
79
  padding: 4rem 0;
80
+ background: #f8f9fa;
 
 
 
 
81
  }
82
 
83
  .dnerf {
84
  font-weight: bold;
85
+ color: var(--mentor-primary);
 
 
 
 
86
  }
87
 
88
  .section {
89
  padding: 4rem 1.5rem;
90
  }
91
 
92
+ .section-alt {
93
+ background: #f8f9fa;
94
+ }
95
+
96
  .content h2, .content h3 {
97
+ color: var(--mentor-dark);
98
+ border-bottom: 3px solid var(--mentor-primary);
99
  padding-bottom: 0.5rem;
100
  margin-top: 2rem;
101
  }
102
 
103
  .table-container {
104
  margin: 2rem 0;
105
+ box-shadow: 0 5px 25px rgba(0,0,0,0.1);
106
  border-radius: 10px;
107
  overflow: hidden;
108
+ border: 1px solid #e0e0e0;
109
  }
110
 
111
  table {
112
  width: 100%;
113
+ margin: 0;
114
  }
115
 
116
  table th {
117
+ background: linear-gradient(135deg, var(--mentor-primary), var(--mentor-secondary));
118
  color: white;
119
  font-weight: 600;
120
+ padding: 1rem;
121
+ text-align: left;
122
+ }
123
+
124
+ table td {
125
+ padding: 1rem;
126
+ border-bottom: 1px solid #f0f0f0;
127
+ }
128
+
129
+ table tr:hover {
130
+ background-color: #f8f9fa;
131
+ }
132
+
133
+ .badge-container {
134
+ margin: 2rem 0;
135
  }
136
 
137
  .badge {
138
  display: inline-block;
139
+ padding: 0.75rem 1.5rem;
140
+ background: linear-gradient(135deg, var(--mentor-primary), var(--mentor-secondary));
141
  color: white !important;
142
+ border-radius: 25px;
143
+ margin: 0.5rem;
144
+ font-size: 1rem;
145
  text-decoration: none;
146
  border: none;
147
+ font-weight: 600;
148
+ box-shadow: 0 4px 15px rgba(52, 152, 219, 0.3);
149
+ transition: all 0.3s ease;
150
  }
151
 
152
  .badge:hover {
153
+ transform: translateY(-3px);
154
+ box-shadow: 0 8px 25px rgba(52, 152, 219, 0.4);
155
  color: white !important;
 
156
  }
157
 
158
  .abstract-box {
159
+ background: linear-gradient(135deg, #ffffff, #f8f9fa);
160
+ padding: 2.5rem;
161
+ border-radius: 15px;
162
+ border-left: 6px solid var(--mentor-primary);
163
  margin: 2rem 0;
164
+ box-shadow: 0 5px 25px rgba(0,0,0,0.08);
165
  }
166
 
167
  .methodology-step {
168
+ margin: 2.5rem 0;
169
+ padding: 2rem;
170
+ border-radius: 12px;
171
  background: white;
172
+ box-shadow: 0 5px 20px rgba(0,0,0,0.1);
173
+ border-left: 5px solid var(--mentor-accent);
174
+ transition: transform 0.3s ease;
175
+ }
176
+
177
+ .methodology-step:hover {
178
+ transform: translateY(-5px);
179
  }
180
 
181
  .results-highlight {
182
  text-align: center;
183
+ padding: 3rem 2rem;
184
+ background: linear-gradient(135deg, var(--mentor-primary), var(--mentor-secondary));
185
  color: white;
186
+ border-radius: 15px;
187
+ margin: 3rem 0;
188
+ box-shadow: 0 10px 30px rgba(0,0,0,0.2);
189
  }
190
 
191
  .results-highlight .number {
192
+ font-size: 4rem;
193
  font-weight: bold;
194
  display: block;
195
+ text-shadow: 0 2px 4px rgba(0,0,0,0.3);
196
+ }
197
+
198
+ .results-highlight .subtitle {
199
+ font-size: 1.5rem;
200
+ opacity: 0.9;
201
  }
202
 
203
  .architecture-image {
204
  width: 100%;
205
+ max-width: 900px;
206
  display: block;
207
+ margin: 3rem auto;
208
+ border-radius: 12px;
209
+ box-shadow: 0 15px 40px rgba(0,0,0,0.25);
210
+ border: 1px solid #e0e0e0;
211
+ }
212
+
213
+ .feature-icon {
214
+ font-size: 3rem;
215
+ color: var(--mentor-primary);
216
+ margin-bottom: 1rem;
217
+ }
218
+
219
+ .quote-box {
220
+ background: linear-gradient(135deg, #667eea, #764ba2);
221
+ color: white;
222
+ padding: 2rem;
223
  border-radius: 10px;
224
+ margin: 2rem 0;
225
+ font-style: italic;
226
+ box-shadow: 0 5px 20px rgba(0,0,0,0.15);
227
+ }
228
+
229
+ .performance-metric {
230
+ text-align: center;
231
+ padding: 1.5rem;
232
  }
233
+
234
+ .metric-value {
235
+ font-size: 2.5rem;
236
+ font-weight: bold;
237
+ color: var(--mentor-primary);
238
+ display: block;
239
+ }
240
+
241
+ .metric-label {
242
+ font-size: 1rem;
243
+ color: var(--mentor-dark);
244
+ margin-top: 0.5rem;
245
+ }
246
+
247
+ .code-block {
248
+ background: #2c3e50;
249
+ color: #ecf0f1;
250
+ padding: 1.5rem;
251
+ border-radius: 8px;
252
+ overflow-x: auto;
253
+ font-family: 'Courier New', monospace;
254
+ margin: 1.5rem 0;
255
+ }
256
+
257
+ @media (max-width: 768px) {
258
+ .hero {
259
+ padding: 2rem 1rem;
260
+ }
261
+
262
+ .publication-title {
263
+ font-size: 2rem;
264
+ }
265
+
266
+ .methodology-step {
267
+ padding: 1.5rem;
268
+ margin: 1.5rem 0;
269
+ }
270
+
271
+ .results-highlight .number {
272
+ font-size: 3rem;
273
+ }
274
+ }
275
+ </style>
276
  </head>
277
  <body>
278
 
 
281
  <div class="container is-max-desktop">
282
  <div class="columns is-centered">
283
  <div class="column has-text-centered">
284
+ <h1 class="title is-1 publication-title">MENTOR: A Metacognition-Driven Self-Evolution Framework</h1>
285
+ <h2 class="subtitle is-3" style="color: white; opacity: 0.9;">Uncovering and Mitigating Implicit Risks in Domain-Specific LLMs</h2>
286
+
287
  <div class="is-size-5 publication-authors">
288
  <span class="author-block">
289
+ <a href="#" target="_blank" style="color: white;">Wen Wu</a><sup>1</sup>,</span>
290
  <span class="author-block">
291
+ <a href="#" target="_blank" style="color: white;">Zhenyu Ying</a><sup>1</sup>,</span>
292
  <span class="author-block">
293
+ <a href="#" target="_blank" style="color: white;">Liang He</a><sup>1</sup>,
294
  </span>
295
  <span class="author-block">
296
+ <a href="#" target="_blank" style="color: white;">Shell Team</a><sup>1</sup>
297
  </span>
298
  </div>
299
 
 
301
  <span class="author-block"><sup>1</sup>Anonymous Submission</span>
302
  </div>
303
 
304
+ <div class="publication-links">
305
+ <span class="link-block">
306
+ <a href="#" target="_blank"
307
+ class="external-link button is-normal is-rounded">
308
+ <span class="icon">
309
+ <i class="fas fa-file-pdf"></i>
310
+ </span>
311
+ <span>Paper</span>
312
+ </a>
313
+ </span>
314
+ <span class="link-block">
315
+ <a href="#" target="_blank"
316
+ class="external-link button is-normal is-rounded">
317
+ <span class="icon">
318
+ <i class="ai ai-arxiv"></i>
319
+ </span>
320
+ <span>arXiv</span>
321
+ </a>
322
+ </span>
323
+ <span class="link-block">
324
+ <a href="#" target="_blank"
325
+ class="external-link button is-normal is-rounded">
326
+ <span class="icon">
327
+ <i class="fab fa-github"></i>
328
+ </span>
329
+ <span>Code</span>
330
+ </a>
331
+ </span>
332
+ <span class="link-block">
333
+ <a href="#" target="_blank"
334
+ class="external-link button is-normal is-rounded">
335
+ <span class="icon">
336
+ <i class="far fa-images"></i>
337
+ </span>
338
+ <span>Dataset</span>
339
+ </a>
340
+ </span>
 
 
 
 
 
341
  </div>
342
  </div>
343
  </div>
 
345
  </div>
346
  </section>
347
 
348
+ <section class="teaser">
349
  <div class="container is-max-desktop">
350
  <div class="hero-body has-text-centered">
351
  <h2 class="subtitle is-3">
352
+ Tackling <span class="dnerf">Domain-Specific Implicit Risks</span> in Education, Finance, and Management
353
  </h2>
354
+
355
+ <div class="badge-container">
356
+ <a href="#" class="badge">🧠 Metacognitive Self-Assessment</a>
357
+ <a href="#" class="badge">🔄 Rule Evolution Cycle</a>
358
+ <a href="#" class="badge">⚡ Activation Steering</a>
359
+ <a href="#" class="badge">📉 &gt;90% Jailbreak Reduction</a>
360
+ <a href="#" class="badge">🔒 Model-Agnostic Framework</a>
361
+ </div>
362
+
363
+ <div class="columns is-centered" style="margin-top: 3rem;">
364
+ <div class="column is-3 performance-metric">
365
+ <span class="metric-value">79.3%</span>
366
+ <span class="metric-label">Consistency with Human Evaluation</span>
367
+ </div>
368
+ <div class="column is-3 performance-metric">
369
+ <span class="metric-value">9,000+</span>
370
+ <span class="metric-label">Domain-Specific Queries</span>
371
+ </div>
372
+ <div class="column is-3 performance-metric">
373
+ <span class="metric-value">68%</span>
374
+ <span class="metric-label">Human Preference Rate</span>
375
+ </div>
376
+ <div class="column is-3 performance-metric">
377
+ <span class="metric-value">3</span>
378
+ <span class="metric-label">Vertical Domains</span>
379
+ </div>
380
  </div>
381
  </div>
382
  </div>
 
384
 
385
  <section class="section">
386
  <div class="container is-max-desktop">
 
387
  <div class="columns is-centered has-text-centered">
388
  <div class="column is-four-fifths">
389
  <h2 class="title is-3">Abstract</h2>
390
  <div class="abstract-box">
391
  <div class="content has-text-justified">
392
  <p>
393
+ Ensuring the safety and value alignment of large language models (LLMs) is critical for their deployment.
394
+ While current alignment efforts primarily target explicit risks such as bias, hate speech, and violence,
395
+ these approaches often fail to address deeper, <strong>domain-specific implicit risks</strong> and lack a flexible,
396
+ generalizable framework applicable across diverse specialized fields.
397
  </p>
398
  <p>
399
+ We propose <strong>MENTOR</strong>, a metacognition-driven self-evolution framework that enables LLMs to
400
+ self-diagnose value misalignments via perspective-taking and consequential thinking, builds a hybrid rule
401
+ system with expert-defined static trees and self-evolved dynamic graphs, and enforces rules at inference
402
+ time via activation steering.
403
  </p>
404
  <p>
405
+ Evaluated on <strong>9,000 risk queries</strong> across education, finance, and management domains, MENTOR
406
+ reduces average jailbreak rates by <strong>&gt;90%</strong> on models including GPT-4o, Qwen3-235B, and Llama 3.1.
407
+ The metacognitive assessment achieves 79.3% consistency with human evaluators while detecting 20.6%
408
+ additional risks that humans overlooked.
409
  </p>
410
  </div>
411
  </div>
412
  </div>
413
  </div>
 
414
  </div>
415
  </section>
416
 
417
+ <section class="section section-alt">
418
+ <div class="container is-max-desktop">
419
+ <div class="columns is-centered">
420
+ <div class="column is-8 has-text-centered">
421
+ <h2 class="title is-3">Core Challenges: Domain-Specific Implicit Risks</h2>
422
+ <p class="subtitle is-5">
423
+ These risks are <strong>not traditional jailbreaks</strong>—they appear benign but subtly erode domain-specific values
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  </p>
425
  </div>
426
  </div>
427
+
428
+ <div class="table-container">
429
+ <table class="table is-striped is-fullwidth">
430
+ <thead>
431
+ <tr>
432
+ <th>Domain</th>
433
+ <th>Example Implicit Risk</th>
434
+ <th>Harmful Consequence</th>
435
+ </tr>
436
+ </thead>
437
+ <tbody>
438
+ <tr>
439
+ <td><strong>Education</strong></td>
440
+ <td>Suggesting clever comebacks that escalate bullying</td>
441
+ <td>Deteriorates peer relationships</td>
442
+ </tr>
443
+ <tr>
444
+ <td></td>
445
+ <td>Framing "sacrificing sleep for grades" as admirable</td>
446
+ <td>Promotes unhealthy competition</td>
447
+ </tr>
448
+ <tr>
449
+ <td></td>
450
+ <td>Teaching how to "rephrase copied essays"</td>
451
+ <td>Undermines academic integrity</td>
452
+ </tr>
453
+ <tr>
454
+ <td><strong>Finance</strong></td>
455
+ <td>Encouraging high-leverage speculation as "smart risk"</td>
456
+ <td>Normalizes financial recklessness</td>
457
+ </tr>
458
+ <tr>
459
+ <td><strong>Management</strong></td>
460
+ <td>Praising "always-on" culture as "dedication"</td>
461
+ <td>Reinforces burnout and poor work-life balance</td>
462
+ </tr>
463
+ </tbody>
464
+ </table>
465
+ </div>
466
+
467
+ <div class="quote-box">
468
+ <p>
469
+ "When a student's question hinted at self-harm, a standard LLM failed to recognize the danger and even suggested
470
+ specific medications—a response that could have real-world harmful consequences. The MENTOR-enhanced LLM correctly
471
+ identified the risk and redirected the conversation to safe discussions."
472
+ </p>
473
+ </div>
474
  </div>
475
  </section>
476
 
 
478
  <div class="container is-max-desktop">
479
  <div class="columns is-centered">
480
  <div class="column is-four-fifths">
481
+ <h2 class="title is-3">The MENTOR Architecture</h2>
482
+
483
+ <!-- Architecture Image -->
484
+ <img src="https://huggingface.co/spaces/feifeinoban/shell/resolve/main/assets/mentor_arch.png"
485
+ alt="MENTOR Architecture"
486
+ class="architecture-image">
487
 
488
  <div class="methodology-step">
489
+ <div class="feature-icon">🧠</div>
490
  <h3 class="title is-4">1. Metacognitive Self-Assessment</h3>
491
  <div class="content">
492
+ <p>LLMs evaluate their own outputs using metacognitive strategies drawn from psychology:</p>
493
  <ul>
494
  <li><strong>Perspective-taking</strong>: "How would a teacher/parent/regulator view this?"</li>
495
  <li><strong>Consequential thinking</strong>: "What real-world harm could this cause?"</li>
496
  <li><strong>Normative introspection</strong>: "Does this align with core domain ethics?"</li>
497
+ <li><strong>Contextual deconstruction</strong>: Analyzing underlying assumptions and context</li>
498
  </ul>
499
+ <p>This approach achieves <strong>79.3% consistency with human evaluators</strong> while detecting <strong>20.6% additional risks</strong> that humans overlook.</p>
500
  </div>
501
  </div>
502
 
503
  <div class="methodology-step">
504
+ <div class="feature-icon">🔄</div>
505
  <h3 class="title is-4">2. Rule Evolution Cycle (REC)</h3>
506
  <div class="content">
507
+ <p>A hybrid rule system combining expert knowledge with autonomous learning:</p>
508
  <ul>
509
+ <li><strong>Static Rule Tree (Rₜ)</strong>: Expert-curated hierarchical rules (e.g., <code>Education → Academic Integrity → No Plagiarism</code>)</li>
510
+ <li><strong>Dynamic Rule Graph (Rɢ)</strong>: Automatically generated from successful self-corrections via dual-criteria clustering</li>
511
+ <li><strong>MetaLoop</strong>: Iterative feedback-revision mechanism with bounded retry count</li>
512
  </ul>
513
+ <p>Rules evolve through experience summarization and thematic clustering, enabling precise governance of emerging risk patterns.</p>
514
  </div>
515
  </div>
516
 
517
  <div class="methodology-step">
518
+ <div class="feature-icon">⚡</div>
519
  <h3 class="title is-4">3. Robust Rule Vectors (RV) via Activation Steering</h3>
520
  <div class="content">
521
+ <p>Direct intervention at inference time without model retraining:</p>
522
  <ul>
523
+ <li>Generate <strong>steering vectors</strong> from contrasting compliant vs. non-compliant responses</li>
524
+ <li>Apply vectors to internal activations (optimal at Layer 18 for Llama 3.1-8B)</li>
525
+ <li>Modify hidden states: <code>a′<sub>l</sub>(q) = a<sub>l</sub>(q) + α<sub>s</sub>v<sub>s,l</sub> + α<sub>h</sub>v<sub>h,l</sub></code></li>
526
+ <li><strong>No fine-tuning needed</strong>—works on any model whose internal activations are accessible (open-weight models)</li>
527
  </ul>
528
+ <p>This approach reduces computational costs while ensuring robust rule enforcement across diverse contexts.</p>
529
  </div>
530
  </div>
 
 
 
 
 
 
 
 
 
 
 
531
  </div>
532
  </div>
533
  </div>
534
  </section>
535
 
536
+ <section class="section section-alt">
537
  <div class="container is-max-desktop">
538
  <div class="columns is-centered">
539
  <div class="column is-four-fifths">
540
+ <h2 class="title is-3">Experimental Results</h2>
541
 
542
  <div class="results-highlight">
543
  <span class="number">&gt;90%</span>
544
+ <span class="subtitle">Average Jailbreak Rate Reduction Across Domains</span>
545
  </div>
546
 
547
+ <h3 class="title is-4">Jailbreak Rate Reduction with REC (9,000 test queries)</h3>
548
 
549
  <div class="table-container">
550
  <table class="table is-striped is-fullwidth">
551
  <thead>
552
  <tr>
553
  <th>Model</th>
554
+ <th>Domain</th>
555
  <th>Original</th>
556
+ <th>+ Rules</th>
557
+ <th>+ MetaLoop 1-round</th>
558
+ <th>+ MetaLoop 2-round</th>
559
  </tr>
560
  </thead>
561
  <tbody>
562
  <tr>
563
+ <td rowspan="3"><strong>Qwen3-235B</strong></td>
564
+ <td>Education</td>
 
 
 
 
 
565
  <td>56.33%</td>
566
+ <td>13.27%</td>
567
+ <td>6.02%</td>
568
  <td><strong>3.13%</strong></td>
 
569
  </tr>
570
  <tr>
571
+ <td>Management</td>
572
+ <td>72.36%</td>
573
+ <td>18.46%</td>
574
+ <td>7.81%</td>
575
+ <td><strong>4.87%</strong></td>
576
+ </tr>
577
+ <tr>
578
+ <td>Finance</td>
579
+ <td>55.39%</td>
580
+ <td>14.73%</td>
581
+ <td>7.57%</td>
582
+ <td><strong>3.60%</strong></td>
583
+ </tr>
584
+ <tr>
585
+ <td rowspan="3"><strong>GPT-4o</strong></td>
586
+ <td>Education</td>
587
  <td>58.81%</td>
588
+ <td>20.87%</td>
589
+ <td>10.79%</td>
590
  <td><strong>6.43%</strong></td>
 
591
  </tr>
592
  <tr>
593
+ <td>Management</td>
594
+ <td>72.95%</td>
595
+ <td>9.15%</td>
596
+ <td>2.91%</td>
597
+ <td><strong>1.49%</strong></td>
598
+ </tr>
599
+ <tr>
600
+ <td>Finance</td>
601
+ <td>65.15%</td>
602
+ <td>7.91%</td>
603
+ <td>3.08%</td>
604
+ <td><strong>1.67%</strong></td>
605
+ </tr>
606
+ </tbody>
607
+ </table>
608
+ </div>
609
+
610
+ <h3 class="title is-4">Activation Steering Performance (Llama 3.1-8B-Instruct)</h3>
611
+
612
+ <div class="table-container">
613
+ <table class="table is-striped is-fullwidth">
614
+ <thead>
615
+ <tr>
616
+ <th>Domain</th>
617
+ <th>Original</th>
618
+ <th>Rule Prompt</th>
619
+ <th>Steering Vector (RV)</th>
620
+ </tr>
621
+ </thead>
622
+ <tbody>
623
+ <tr>
624
+ <td>Education</td>
625
  <td>67.45%</td>
626
+ <td>43.26%</td>
627
  <td><strong>31.39%</strong></td>
628
+ </tr>
629
+ <tr>
630
+ <td>Management</td>
631
+ <td>75.77%</td>
632
+ <td>37.84%</td>
633
+ <td><strong>36.90%</strong></td>
634
+ </tr>
635
+ <tr>
636
+ <td>Finance</td>
637
+ <td>59.38%</td>
638
+ <td>49.95%</td>
639
+ <td><strong>37.11%</strong></td>
640
  </tr>
641
  </tbody>
642
  </table>
643
  </div>
644
 
645
+ <div class="quote-box">
646
+ <p>
647
+ ✅ Human evaluators prefer MENTOR-augmented responses <strong>68% of the time</strong> for safety,
648
+ appropriateness, and usefulness, and prefer the original responses only 12% of the time.
649
  </p>
650
  </div>
651
  </div>
 
657
  <div class="container is-max-desktop">
658
  <div class="columns is-centered">
659
  <div class="column is-four-fifths">
660
+ <h2 class="title is-3">Key Contributions</h2>
661
 
662
  <div class="content">
663
+ <div class="methodology-step">
664
+ <h3 class="title is-4">Novel Metacognitive Assessment</h3>
665
+ <p>
666
+ We introduce a metacognitive self-assessment tool that enables LLMs to critically evaluate their own reasoning
667
+ and outputs, achieving 79.3% consistency with human evaluators while detecting subtle value misalignments
668
+ that conventional methods miss.
669
+ </p>
670
+ </div>
671
+
672
+ <div class="methodology-step">
673
+ <h3 class="title is-4">Self-Evolving Rule Architecture</h3>
674
+ <p>
675
+ The Rule Evolution Cycle (REC) integrates expert-defined static rule trees with metacognition-driven dynamic
676
+ rule graphs, enabling continuous adaptation to emerging risks without manual intervention.
677
+ </p>
678
+ </div>
679
+
680
+ <div class="methodology-step">
681
+ <h3 class="title is-4">Efficient Activation Steering</h3>
682
+ <p>
683
+ By leveraging activation steering during inference, MENTOR enforces domain-specific rules robustly and
684
+ cost-effectively, significantly reducing computational resources compared to traditional fine-tuning methods.
685
+ </p>
686
+ </div>
687
+
688
+ <div class="methodology-step">
689
+ <h3 class="title is-4">Comprehensive Evaluation</h3>
690
+ <p>
691
+ We release a dataset of 9,000 domain-specific implicit-risk queries across education, finance, and management,
692
+ providing a benchmark for future research in domain-specific LLM safety.
693
+ </p>
694
+ </div>
695
  </div>
696
  </div>
697
  </div>
 
701
  <section class="section" id="BibTeX">
702
  <div class="container is-max-desktop content">
703
  <h2 class="title">BibTeX</h2>
704
+ <div class="code-block">
705
+ <code>@article{mentor2025,
706
+ title={MENTOR: A Metacognition-Driven Self-Evolution Framework for Uncovering and Mitigating Implicit Risks in Domain-Specific LLMs},
707
  author={Wu, Wen and Ying, Zhenyu and He, Liang and {Shell Team}},
708
  journal={Anonymous Submission},
709
  year={2025}
710
+ }</code>
711
+ </div>
712
  </div>
713
  </section>
714
 
715
+ <footer class="footer" style="background: var(--mentor-dark); color: white; padding: 3rem 1.5rem;">
716
  <div class="container">
717
  <div class="content has-text-centered">
718
  <p>
719
+ <strong style="color: white;">MENTOR Framework</strong> - A Metacognition-Driven Approach to LLM Safety
720
+ </p>
721
+ <p>
722
+ This website is licensed under a <a rel="license noopener noreferrer" target="_blank" style="color: #3498db;"
723
  href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
724
  Commons Attribution-ShareAlike 4.0 International License</a>.
725
  </p>
 
728
  </footer>
729
 
730
  <script>
 
731
  document.addEventListener('DOMContentLoaded', function() {
732
  // Add smooth scrolling for anchor links
733
  document.querySelectorAll('a[href^="#"]').forEach(anchor => {
 
738
  });
739
  });
740
  });
741
+
742
+ // Add animation to methodology steps on scroll
743
+ const observerOptions = {
744
+ threshold: 0.1,
745
+ rootMargin: '0px 0px -50px 0px'
746
+ };
747
+
748
+ const observer = new IntersectionObserver(function(entries) {
749
+ entries.forEach(entry => {
750
+ if (entry.isIntersecting) {
751
+ entry.target.style.opacity = '1';
752
+ entry.target.style.transform = 'translateY(0)';
753
+ }
754
+ });
755
+ }, observerOptions);
756
+
757
+ // Observe methodology steps
758
+ document.querySelectorAll('.methodology-step').forEach(step => {
759
+ step.style.opacity = '0';
760
+ step.style.transform = 'translateY(20px)';
761
+ step.style.transition = 'opacity 0.5s ease, transform 0.5s ease';
762
+ observer.observe(step);
763
+ });
764
  });
765
  </script>
766