Spaces:

feifeinoban
/

shell

Running

App Files Files Community

feifeinoban committed on Oct 7, 2025

Commit

7736878

1 Parent(s): 94eb760

use static HTML

Browse files

Files changed (1) hide show

index.html +462 -220

index.html CHANGED Viewed

@@ -3,10 +3,10 @@
 <head>
   <meta charset="utf-8">
   <meta name="description"
-        content="Shell: A Metacognition-Driven Safety Framework for Domain-Specific LLMs">
-  <meta name="keywords" content="LLM Safety, Metacognition, AI Alignment, Activation Steering">
   <meta name="viewport" content="width=device-width, initial-scale=1">
-  <title>Shell: Metacognition-Driven Safety for Domain-Specific LLMs</title>
   <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
         rel="stylesheet">
@@ -18,28 +18,32 @@
   <style>
     :root {
-      --shell-primary: #3498db;
-      --shell-secondary: #9b59b6;
-      --shell-accent: #2ecc71;
     }
     body {
       font-family: 'Google Sans', 'Noto Sans', sans-serif;
       line-height: 1.6;
     }
     .hero {
-      background: linear-gradient(135deg, var(--shell-primary), var(--shell-secondary));
       color: white;
     }
     .publication-title {
       color: white;
-      margin-bottom: 1rem;
     }
     .publication-authors {
-      margin-bottom: 1rem;
     }
     .author-block {
@@ -48,134 +52,227 @@
     }
     .publication-links {
-      margin-top: 1.5rem;
     }
     .link-block {
       display: inline-block;
-      margin: 0 0.5rem;
     }
     .external-link {
-      transition: transform 0.2s;
-      background: linear-gradient(135deg, #667eea, #764ba2) !important;
-      border: none !important;
     }
     .external-link:hover {
-      transform: translateY(-2px);
-      background: linear-gradient(135deg, #764ba2, #667eea) !important;
-    }
-    .external-link span {
-      color: white !important;
     }
     .teaser {
       padding: 4rem 0;
-    }
-    .teaser video {
-      border-radius: 10px;
-      box-shadow: 0 10px 30px rgba(0,0,0,0.3);
     }
     .dnerf {
       font-weight: bold;
-      color: var(--shell-primary);
-    }
-    .results-carousel {
-      margin: 2rem 0;
     }
     .section {
       padding: 4rem 1.5rem;
     }
     .content h2, .content h3 {
-      color: var(--shell-primary);
-      border-bottom: 2px solid #f5f5f5;
       padding-bottom: 0.5rem;
       margin-top: 2rem;
     }
     .table-container {
       margin: 2rem 0;
-      box-shadow: 0 5px 15px rgba(0,0,0,0.1);
       border-radius: 10px;
       overflow: hidden;
     }
     table {
       width: 100%;
     }
     table th {
-      background: linear-gradient(135deg, var(--shell-primary), var(--shell-secondary));
       color: white;
       font-weight: 600;
     }
     .badge {
       display: inline-block;
-      padding: 0.5rem 1rem;
-      background: linear-gradient(135deg, #667eea, #764ba2);
       color: white !important;
-      border-radius: 20px;
-      margin: 0.25rem;
-      font-size: 0.9rem;
       text-decoration: none;
       border: none;
     }
     .badge:hover {
-      background: linear-gradient(135deg, #764ba2, #667eea);
       color: white !important;
-      transform: translateY(-2px);
     }
     .abstract-box {
-      background: linear-gradient(135deg, #f8f9fa, #e9ecef);
-      padding: 2rem;
-      border-radius: 10px;
-      border-left: 5px solid var(--shell-primary);
       margin: 2rem 0;
     }
     .methodology-step {
-      margin: 2rem 0;
-      padding: 1.5rem;
-      border-radius: 10px;
       background: white;
-      box-shadow: 0 5px 15px rgba(0,0,0,0.1);
-      border-left: 4px solid var(--shell-accent);
     }
     .results-highlight {
       text-align: center;
-      padding: 2rem;
-      background: linear-gradient(135deg, var(--shell-primary), var(--shell-secondary));
       color: white;
-      border-radius: 10px;
-      margin: 2rem 0;
     }
     .results-highlight .number {
-      font-size: 3rem;
       font-weight: bold;
       display: block;
     }
     .architecture-image {
       width: 100%;
-      max-width: 800px;
       display: block;
-      margin: 2rem auto;
       border-radius: 10px;
-      box-shadow: 0 10px 30px rgba(0,0,0,0.2);
     }
-</style>
 </head>
 <body>
@@ -184,17 +281,19 @@
     <div class="container is-max-desktop">
       <div class="columns is-centered">
         <div class="column has-text-centered">
-          <h1 class="title is-1 publication-title">🐚 Shell: Metacognition-Driven Safety Framework for Domain-Specific LLMs</h1>
           <div class="is-size-5 publication-authors">
             <span class="author-block">
-              <a href="#" target="_blank">Wen Wu</a><sup>1</sup>,</span>
             <span class="author-block">
-              <a href="#" target="_blank">Zhenyu Ying</a><sup>1</sup>,</span>
             <span class="author-block">
-              <a href="#" target="_blank">Liang He</a><sup>1</sup>,
             </span>
             <span class="author-block">
-              <a href="#" target="_blank">Shell Team</a><sup>1</sup>
             </span>
           </div>
@@ -202,48 +301,43 @@
             <span class="author-block"><sup>1</sup>Anonymous Submission</span>
           </div>
-          <div class="column has-text-centered">
-            <div class="publication-links">
-              <!-- PDF Link. -->
-              <span class="link-block">
-                <a href="#" target="_blank"
-                   class="external-link button is-normal is-rounded is-dark">
-                  <span class="icon">
-                      <i class="fas fa-file-pdf"></i>
-                  </span>
-                  <span>Paper</span>
-                </a>
-              </span>
-              <span class="link-block">
-                <a href="#" target="_blank"
-                   class="external-link button is-normal is-rounded is-dark">
-                  <span class="icon">
-                      <i class="ai ai-arxiv"></i>
-                  </span>
-                  <span>arXiv</span>
-                </a>
-              </span>
-              <!-- Code Link. -->
-              <span class="link-block">
-                <a href="#" target="_blank"
-                   class="external-link button is-normal is-rounded is-dark">
-                  <span class="icon">
-                      <i class="fab fa-github"></i>
-                  </span>
-                  <span>Code</span>
-                  </a>
-              </span>
-              <!-- Dataset Link. -->
-              <span class="link-block">
-                <a href="#" target="_blank"
-                   class="external-link button is-normal is-rounded is-dark">
-                  <span class="icon">
-                      <i class="far fa-images"></i>
-                  </span>
-                  <span>Dataset</span>
-                  </a>
-              </span>
-            </div>
           </div>
         </div>
       </div>
@@ -251,17 +345,38 @@
   </div>
 </section>
-<section class="hero teaser">
   <div class="container is-max-desktop">
     <div class="hero-body has-text-centered">
       <h2 class="subtitle is-3">
-        Uncover and mitigate <span class="dnerf">implicit value risks</span> in education, finance, management—and beyond
       </h2>
-      <div class="content">
-        <a href="#" class="badge">🔒 Model-agnostic</a>
-        <a href="#" class="badge">🧠 Self-evolving rules</a>
-        <a href="#" class="badge">⚡ Activation steering</a>
-        <a href="#" class="badge">📉 90%+ jailbreak reduction</a>
       </div>
     </div>
   </div>
@@ -269,81 +384,93 @@
 <section class="section">
   <div class="container is-max-desktop">
-    <!-- Abstract. -->
     <div class="columns is-centered has-text-centered">
       <div class="column is-four-fifths">
         <h2 class="title is-3">Abstract</h2>
         <div class="abstract-box">
           <div class="content has-text-justified">
             <p>
-              While current LLM safety methods focus on explicit harms (e.g., hate speech, violence), they often miss <strong>domain-specific implicit risks</strong>—such as encouraging academic dishonesty in education, promoting reckless trading in finance, or normalizing toxic workplace culture in management.
             </p>
             <p>
-              We introduce <strong>Shell</strong>, a metacognition-driven self-evolution framework that enables LLMs to self-diagnose value misalignments via perspective-taking and consequence simulation, builds a hybrid rule system with expert-defined static trees and self-evolved dynamic graphs, and enforces rules at inference time via activation steering.
             </p>
             <p>
-              Evaluated on 9,000 risk queries across <strong>education, finance, and management</strong>, Shell reduces average jailbreak rates by <strong>>90%</strong> on models including GPT-5, Qwen3, and Llama 3.1.
             </p>
           </div>
         </div>
       </div>
     </div>
-    <!--/ Abstract. -->
   </div>
 </section>
-<section class="hero is-light is-small">
-  <div class="hero-body">
-    <div class="container">
-      <div class="columns is-centered">
-        <div class="column is-8 has-text-centered">
-          <h2 class="title is-3">Core Challenges: Implicit Risks Are Everywhere</h2>
-        </div>
-      </div>
-      <div class="table-container">
-        <table class="table is-striped is-fullwidth">
-          <thead>
-            <tr>
-              <th>Domain</th>
-              <th>Example Implicit Risk</th>
-              <th>Harmful Consequence</th>
-            </tr>
-          </thead>
-          <tbody>
-            <tr>
-              <td><strong>Education</strong></td>
-              <td>Suggesting clever comebacks that escalate bullying</td>
-              <td>Deteriorates peer relationships</td>
-            </tr>
-            <tr>
-              <td></td>
-              <td>Framing "sacrificing sleep for grades" as admirable</td>
-              <td>Promotes unhealthy competition</td>
-            </tr>
-            <tr>
-              <td></td>
-              <td>Teaching how to "rephrase copied essays"</td>
-              <td>Undermines academic integrity</td>
-            </tr>
-            <tr>
-              <td><strong>Finance</strong></td>
-              <td>Encouraging high-leverage speculation as "smart risk"</td>
-              <td>Normalizes financial recklessness</td>
-            </tr>
-            <tr>
-              <td><strong>Management</strong></td>
-              <td>Praising "always-on" culture as "dedication"</td>
-              <td>Reinforces burnout and poor work-life balance</td>
-            </tr>
-          </tbody>
-        </table>
-      </div>
-      <div class="has-text-centered">
-        <p class="is-italic">
-          💡 These risks are <strong>not jailbreaks</strong> in the traditional sense—they appear benign but subtly erode domain-specific values.
         </p>
       </div>
     </div>
   </div>
 </section>
@@ -351,113 +478,174 @@
   <div class="container is-max-desktop">
     <div class="columns is-centered">
       <div class="column is-four-fifths">
-        <h2 class="title is-3">Methodology: The MENTOR Architecture</h2>
         <div class="methodology-step">
           <h3 class="title is-4">1. Metacognitive Self-Assessment</h3>
           <div class="content">
-            <p>LLMs evaluate their own outputs using:</p>
             <ul>
               <li><strong>Perspective-taking</strong>: "How would a teacher/parent/regulator view this?"</li>
               <li><strong>Consequential thinking</strong>: "What real-world harm could this cause?"</li>
               <li><strong>Normative introspection</strong>: "Does this align with core domain ethics?"</li>
             </ul>
-            <p>This replaces labor-intensive human labeling with <strong>autonomous, human-aligned reflection</strong>.</p>
           </div>
         </div>
         <div class="methodology-step">
           <h3 class="title is-4">2. Rule Evolution Cycle (REC)</h3>
           <div class="content">
             <ul>
-              <li><strong>Static Rule Tree</strong>: Expert-curated, hierarchical rules (e.g., <code>Education → Academic Integrity → No Plagiarism</code>).</li>
-              <li><strong>Dynamic Rule Graph</strong>: Automatically generated from successful self-corrections (e.g., <code>&lt;risk: essay outsourcing&gt; → &lt;rule: teach outlining instead&gt;</code>).</li>
-              <li>Rules evolve via <strong>dual clustering</strong> (by risk type & mitigation strategy), enabling precise retrieval.</li>
             </ul>
           </div>
         </div>
         <div class="methodology-step">
           <h3 class="title is-4">3. Robust Rule Vectors (RV) via Activation Steering</h3>
           <div class="content">
             <ul>
-              <li>Generate <strong>steering vectors</strong> from contrasting compliant vs. non-compliant responses.</li>
-              <li>At inference, <strong>add vectors to internal activations</strong> (e.g., Layer 18 of Llama 3.1) to guide behavior.</li>
-              <li><strong>No fine-tuning needed</strong>—works on closed-source models like GPT-5.</li>
             </ul>
           </div>
         </div>
-        <!-- Architecture Image -->
-        <img src="https://huggingface.co/spaces/feifeinoban/shell/resolve/main/assets/mentor_arch.png"
-             alt="MENTOR Architecture"
-             class="architecture-image">
-        <div class="has-text-centered">
-          <p class="is-italic">
-            Figure: The MENTOR framework. Shell implements this full pipeline.
-          </p>
-        </div>
       </div>
     </div>
   </div>
 </section>
-<section class="section">
   <div class="container is-max-desktop">
     <div class="columns is-centered">
       <div class="column is-four-fifths">
-        <h2 class="title is-3">Results: Strong, Efficient, Generalizable</h2>
         <div class="results-highlight">
           <span class="number">>90%</span>
-          <span class="subtitle">Average Jailbreak Rate Reduction</span>
         </div>
-        <h3 class="title is-4">Jailbreak Rate Reduction (3,000 queries per domain)</h3>
         <div class="table-container">
           <table class="table is-striped is-fullwidth">
             <thead>
               <tr>
                 <th>Model</th>
                 <th>Original</th>
-                <th>+ Shell (Rules + MetaLoop + RV)</th>
-                <th>Reduction</th>
               </tr>
             </thead>
             <tbody>
               <tr>
-                <td><strong>GPT-5</strong></td>
-                <td>38.39%</td>
-                <td><strong>0.77%</strong></td>
-                <td><strong>98.0%</strong></td>
-              </tr>
-              <tr>
-                <td><strong>Qwen3-235B</strong></td>
                 <td>56.33%</td>
                 <td><strong>3.13%</strong></td>
-                <td><strong>94.4%</strong></td>
               </tr>
               <tr>
-                <td><strong>GPT-4o</strong></td>
                 <td>58.81%</td>
                 <td><strong>6.43%</strong></td>
-                <td><strong>89.1%</strong></td>
               </tr>
               <tr>
-                <td><strong>Llama 3.1-8B</strong></td>
                 <td>67.45%</td>
                 <td><strong>31.39%</strong></td>
-                <td><strong>53.5%</strong></td>
               </tr>
             </tbody>
           </table>
         </div>
-        <div class="has-text-centered">
-          <p class="is-italic">
-            ✅ Human evaluators prefer Shell-augmented responses <strong>68% of the time</strong> for safety, appropriateness, and usefulness.
           </p>
         </div>
       </div>
@@ -469,14 +657,41 @@
   <div class="container is-max-desktop">
     <div class="columns is-centered">
       <div class="column is-four-fifths">
-        <h2 class="title is-3">Try It / Use It</h2>
         <div class="content">
-          <h3 class="title is-4">For Researchers</h3>
-          <ul>
-            <li><strong>Dataset</strong>: 9,000 implicit-risk queries across 3 domains → [HF Dataset Link]</li>
-            <li><strong>Code</strong>: Full implementation of REC + RV → [GitHub Link] (coming soon)</li>
-          </ul>
         </div>
       </div>
     </div>
@@ -486,20 +701,25 @@
 <section class="section" id="BibTeX">
   <div class="container is-max-desktop content">
     <h2 class="title">BibTeX</h2>
-    <pre><code>@article{shell2025,
-  title={Shell: A Metacognition-Driven Safety Framework for Domain-Specific LLMs},
   author={Wu, Wen and Ying, Zhenyu and He, Liang and Team, Shell},
   journal={Anonymous Submission},
   year={2025}
-}</code></pre>
   </div>
 </section>
-<footer class="footer">
   <div class="container">
     <div class="content has-text-centered">
       <p>
-        This website is licensed under a <a rel="license" target="_blank"
                                             href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
         Commons Attribution-ShareAlike 4.0 International License</a>.
       </p>
@@ -508,7 +728,6 @@
 </footer>
 <script>
-  // Simple JavaScript for interactive elements if needed
   document.addEventListener('DOMContentLoaded', function() {
     // Add smooth scrolling for anchor links
     document.querySelectorAll('a[href^="#"]').forEach(anchor => {
@@ -519,6 +738,29 @@
         });
       });
     });
   });
 </script>

 <head>
   <meta charset="utf-8">
   <meta name="description"
+        content="MENTOR: A Metacognition-Driven Self-Evolution Framework for Uncovering and Mitigating Implicit Risks in Domain-Specific LLMs">
+  <meta name="keywords" content="LLM Safety, Metacognition, AI Alignment, Activation Steering, Domain-Specific Risks">
   <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>MENTOR: Metacognition-Driven Safety Framework for Domain-Specific LLMs</title>
   <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
         rel="stylesheet">
   <style>
     :root {
+      --mentor-primary: #3498db;
+      --mentor-secondary: #9b59b6;
+      --mentor-accent: #2ecc71;
+      --mentor-dark: #2c3e50;
     }
     body {
       font-family: 'Google Sans', 'Noto Sans', sans-serif;
       line-height: 1.6;
+      color: #333;
     }
     .hero {
+      background: linear-gradient(135deg, var(--mentor-primary), var(--mentor-secondary));
       color: white;
+      padding: 4rem 1.5rem;
     }
     .publication-title {
       color: white;
+      margin-bottom: 1.5rem;
+      text-shadow: 0 2px 4px rgba(0,0,0,0.3);
     }
     .publication-authors {
+      margin-bottom: 1.5rem;
     }
     .author-block {
     }
     .publication-links {
+      margin-top: 2rem;
     }
     .link-block {
       display: inline-block;
+      margin: 0 0.5rem 1rem;
     }
     .external-link {
+      transition: all 0.3s ease;
+      background: rgba(255,255,255,0.2) !important;
+      border: 2px solid rgba(255,255,255,0.3) !important;
+      color: white !important;
+      font-weight: 600;
     }
     .external-link:hover {
+      transform: translateY(-3px);
+      background: rgba(255,255,255,0.3) !important;
+      border-color: white !important;
+      box-shadow: 0 5px 15px rgba(0,0,0,0.2);
     }
     .teaser {
       padding: 4rem 0;
+      background: #f8f9fa;
     }
     .dnerf {
       font-weight: bold;
+      color: var(--mentor-primary);
     }
     .section {
       padding: 4rem 1.5rem;
     }
+    .section-alt {
+      background: #f8f9fa;
+    }
     .content h2, .content h3 {
+      color: var(--mentor-dark);
+      border-bottom: 3px solid var(--mentor-primary);
       padding-bottom: 0.5rem;
       margin-top: 2rem;
     }
     .table-container {
       margin: 2rem 0;
+      box-shadow: 0 5px 25px rgba(0,0,0,0.1);
       border-radius: 10px;
       overflow: hidden;
+      border: 1px solid #e0e0e0;
     }
     table {
       width: 100%;
+      margin: 0;
     }
     table th {
+      background: linear-gradient(135deg, var(--mentor-primary), var(--mentor-secondary));
       color: white;
       font-weight: 600;
+      padding: 1rem;
+      text-align: left;
+    }
+    table td {
+      padding: 1rem;
+      border-bottom: 1px solid #f0f0f0;
+    }
+    table tr:hover {
+      background-color: #f8f9fa;
+    }
+    .badge-container {
+      margin: 2rem 0;
     }
     .badge {
       display: inline-block;
+      padding: 0.75rem 1.5rem;
+      background: linear-gradient(135deg, var(--mentor-primary), var(--mentor-secondary));
       color: white !important;
+      border-radius: 25px;
+      margin: 0.5rem;
+      font-size: 1rem;
       text-decoration: none;
       border: none;
+      font-weight: 600;
+      box-shadow: 0 4px 15px rgba(52, 152, 219, 0.3);
+      transition: all 0.3s ease;
     }
     .badge:hover {
+      transform: translateY(-3px);
+      box-shadow: 0 8px 25px rgba(52, 152, 219, 0.4);
       color: white !important;
     }
     .abstract-box {
+      background: linear-gradient(135deg, #ffffff, #f8f9fa);
+      padding: 2.5rem;
+      border-radius: 15px;
+      border-left: 6px solid var(--mentor-primary);
       margin: 2rem 0;
+      box-shadow: 0 5px 25px rgba(0,0,0,0.08);
     }
     .methodology-step {
+      margin: 2.5rem 0;
+      padding: 2rem;
+      border-radius: 12px;
       background: white;
+      box-shadow: 0 5px 20px rgba(0,0,0,0.1);
+      border-left: 5px solid var(--mentor-accent);
+      transition: transform 0.3s ease;
+    }
+    .methodology-step:hover {
+      transform: translateY(-5px);
     }
     .results-highlight {
       text-align: center;
+      padding: 3rem 2rem;
+      background: linear-gradient(135deg, var(--mentor-primary), var(--mentor-secondary));
       color: white;
+      border-radius: 15px;
+      margin: 3rem 0;
+      box-shadow: 0 10px 30px rgba(0,0,0,0.2);
     }
     .results-highlight .number {
+      font-size: 4rem;
       font-weight: bold;
       display: block;
+      text-shadow: 0 2px 4px rgba(0,0,0,0.3);
+    }
+    .results-highlight .subtitle {
+      font-size: 1.5rem;
+      opacity: 0.9;
     }
     .architecture-image {
       width: 100%;
+      max-width: 900px;
       display: block;
+      margin: 3rem auto;
+      border-radius: 12px;
+      box-shadow: 0 15px 40px rgba(0,0,0,0.25);
+      border: 1px solid #e0e0e0;
+    }
+    .feature-icon {
+      font-size: 3rem;
+      color: var(--mentor-primary);
+      margin-bottom: 1rem;
+    }
+    .quote-box {
+      background: linear-gradient(135deg, #667eea, #764ba2);
+      color: white;
+      padding: 2rem;
       border-radius: 10px;
+      margin: 2rem 0;
+      font-style: italic;
+      box-shadow: 0 5px 20px rgba(0,0,0,0.15);
+    }
+    .performance-metric {
+      text-align: center;
+      padding: 1.5rem;
     }
+    .metric-value {
+      font-size: 2.5rem;
+      font-weight: bold;
+      color: var(--mentor-primary);
+      display: block;
+    }
+    .metric-label {
+      font-size: 1rem;
+      color: var(--mentor-dark);
+      margin-top: 0.5rem;
+    }
+    .code-block {
+      background: #2c3e50;
+      color: #ecf0f1;
+      padding: 1.5rem;
+      border-radius: 8px;
+      overflow-x: auto;
+      font-family: 'Courier New', monospace;
+      margin: 1.5rem 0;
+    }
+    @media (max-width: 768px) {
+      .hero {
+        padding: 2rem 1rem;
+      }
+      .publication-title {
+        font-size: 2rem;
+      }
+      .methodology-step {
+        padding: 1.5rem;
+        margin: 1.5rem 0;
+      }
+      .results-highlight .number {
+        font-size: 3rem;
+      }
+    }
+  </style>
 </head>
 <body>
     <div class="container is-max-desktop">
       <div class="columns is-centered">
         <div class="column has-text-centered">
+          <h1 class="title is-1 publication-title">MENTOR: A Metacognition-Driven Self-Evolution Framework</h1>
+          <h2 class="subtitle is-3" style="color: white; opacity: 0.9;">Uncovering and Mitigating Implicit Risks in Domain-Specific LLMs</h2>
           <div class="is-size-5 publication-authors">
             <span class="author-block">
+              <a href="#" target="_blank" style="color: white;">Wen Wu</a><sup>1</sup>,</span>
             <span class="author-block">
+              <a href="#" target="_blank" style="color: white;">Zhenyu Ying</a><sup>1</sup>,</span>
             <span class="author-block">
+              <a href="#" target="_blank" style="color: white;">Liang He</a><sup>1</sup>,
             </span>
             <span class="author-block">
+              <a href="#" target="_blank" style="color: white;">Shell Team</a><sup>1</sup>
             </span>
           </div>
             <span class="author-block"><sup>1</sup>Anonymous Submission</span>
           </div>
+          <div class="publication-links">
+            <span class="link-block">
+              <a href="#" target="_blank"
+                 class="external-link button is-normal is-rounded">
+                <span class="icon">
+                    <i class="fas fa-file-pdf"></i>
+                </span>
+                <span>Paper</span>
+              </a>
+            </span>
+            <span class="link-block">
+              <a href="#" target="_blank"
+                 class="external-link button is-normal is-rounded">
+                <span class="icon">
+                    <i class="ai ai-arxiv"></i>
+                </span>
+                <span>arXiv</span>
+              </a>
+            </span>
+            <span class="link-block">
+              <a href="#" target="_blank"
+                 class="external-link button is-normal is-rounded">
+                <span class="icon">
+                    <i class="fab fa-github"></i>
+                </span>
+                <span>Code</span>
+              </a>
+            </span>
+            <span class="link-block">
+              <a href="#" target="_blank"
+                 class="external-link button is-normal is-rounded">
+                <span class="icon">
+                    <i class="far fa-images"></i>
+                </span>
+                <span>Dataset</span>
+              </a>
+            </span>
           </div>
         </div>
       </div>
   </div>
 </section>
+<section class="teaser">
   <div class="container is-max-desktop">
     <div class="hero-body has-text-centered">
       <h2 class="subtitle is-3">
+        Tackling <span class="dnerf">Domain-Specific Implicit Risks</span> in Education, Finance, and Management
       </h2>
+      <div class="badge-container">
+        <a href="#" class="badge">🧠 Metacognitive Self-Assessment</a>
+        <a href="#" class="badge">🔄 Rule Evolution Cycle</a>
+        <a href="#" class="badge">⚡ Activation Steering</a>
+        <a href="#" class="badge">📉 >90% Jailbreak Reduction</a>
+        <a href="#" class="badge">🔒 Model-Agnostic Framework</a>
+      </div>
+      <div class="columns is-centered" style="margin-top: 3rem;">
+        <div class="column is-3 performance-metric">
+          <span class="metric-value">79.3%</span>
+          <span class="metric-label">Consistency with Human Evaluation</span>
+        </div>
+        <div class="column is-3 performance-metric">
+          <span class="metric-value">9,000+</span>
+          <span class="metric-label">Domain-Specific Queries</span>
+        </div>
+        <div class="column is-3 performance-metric">
+          <span class="metric-value">68%</span>
+          <span class="metric-label">Human Preference Rate</span>
+        </div>
+        <div class="column is-3 performance-metric">
+          <span class="metric-value">3</span>
+          <span class="metric-label">Vertical Domains</span>
+        </div>
       </div>
     </div>
   </div>
 <section class="section">
   <div class="container is-max-desktop">
     <div class="columns is-centered has-text-centered">
       <div class="column is-four-fifths">
         <h2 class="title is-3">Abstract</h2>
         <div class="abstract-box">
           <div class="content has-text-justified">
             <p>
+              Ensuring the safety and value alignment of large language models (LLMs) is critical for their deployment.
+              While current alignment efforts primarily target explicit risks such as bias, hate speech, and violence,
+              these approaches often fail to address deeper, <strong>domain-specific implicit risks</strong> and lack a flexible,
+              generalizable framework applicable across diverse specialized fields.
             </p>
             <p>
+              We propose <strong>MENTOR</strong>, a metacognition-driven self-evolution framework that enables LLMs to
+              self-diagnose value misalignments via perspective-taking and consequential thinking, builds a hybrid rule
+              system with expert-defined static trees and self-evolved dynamic graphs, and enforces rules at inference
+              time via activation steering.
             </p>
             <p>
+              Evaluated on <strong>9,000 risk queries</strong> across education, finance, and management domains, MENTOR
+              reduces average jailbreak rates by <strong>>90%</strong> on models including GPT-4o, Qwen3-235B, and Llama 3.1.
+              The metacognitive assessment achieves 79.3% consistency with human evaluators while detecting 20.6%
+              additional risks that humans overlooked.
             </p>
           </div>
         </div>
       </div>
     </div>
   </div>
 </section>
+<section class="section section-alt">
+  <div class="container is-max-desktop">
+    <div class="columns is-centered">
+      <div class="column is-8 has-text-centered">
+        <h2 class="title is-3">Core Challenges: Domain-Specific Implicit Risks</h2>
+        <p class="subtitle is-5">
+          These risks are <strong>not traditional jailbreaks</strong>—they appear benign but subtly erode domain-specific values
         </p>
       </div>
     </div>
+    <div class="table-container">
+      <table class="table is-striped is-fullwidth">
+        <thead>
+          <tr>
+            <th>Domain</th>
+            <th>Example Implicit Risk</th>
+            <th>Harmful Consequence</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><strong>Education</strong></td>
+            <td>Suggesting clever comebacks that escalate bullying</td>
+            <td>Deteriorates peer relationships</td>
+          </tr>
+          <tr>
+            <td></td>
+            <td>Framing "sacrificing sleep for grades" as admirable</td>
+            <td>Promotes unhealthy competition</td>
+          </tr>
+          <tr>
+            <td></td>
+            <td>Teaching how to "rephrase copied essays"</td>
+            <td>Undermines academic integrity</td>
+          </tr>
+          <tr>
+            <td><strong>Finance</strong></td>
+            <td>Encouraging high-leverage speculation as "smart risk"</td>
+            <td>Normalizes financial recklessness</td>
+          </tr>
+          <tr>
+            <td><strong>Management</strong></td>
+            <td>Praising "always-on" culture as "dedication"</td>
+            <td>Reinforces burnout and poor work-life balance</td>
+          </tr>
+        </tbody>
+      </table>
+    </div>
+    <div class="quote-box">
+      <p>
+        "When a student's question hinted at self-harm, a standard LLM failed to recognize the danger and even suggested
+        specific medications—a response that could have real-world harmful consequences. The MENTOR-enhanced LLM correctly
+        identified the risk and redirected the conversation to safe discussions."
+      </p>
+    </div>
   </div>
 </section>
   <div class="container is-max-desktop">
     <div class="columns is-centered">
       <div class="column is-four-fifths">
+        <h2 class="title is-3">The MENTOR Architecture</h2>
+        <!-- Architecture image: placed after earlier sections, so it is fetched lazily and decoded asynchronously. -->
+        <img src="https://huggingface.co/spaces/feifeinoban/shell/resolve/main/assets/mentor_arch.png"
+             alt="MENTOR Architecture"
+             class="architecture-image"
+             loading="lazy"
+             decoding="async">
         <div class="methodology-step">
+          <!-- Decorative emoji: hidden from assistive technology so it is not announced ahead of the heading. -->
+          <div class="feature-icon" aria-hidden="true">🧠</div>
           <h3 class="title is-4">1. Metacognitive Self-Assessment</h3>
           <div class="content">
+            <p>LLMs evaluate their own outputs using psychological metacognition strategies:</p>
             <ul>
               <li><strong>Perspective-taking</strong>: "How would a teacher/parent/regulator view this?"</li>
               <li><strong>Consequential thinking</strong>: "What real-world harm could this cause?"</li>
               <li><strong>Normative introspection</strong>: "Does this align with core domain ethics?"</li>
+              <li><strong>Contextual deconstruction</strong>: Analyzing underlying assumptions and context</li>
             </ul>
+            <p>This approach achieves <strong>79.3% consistency with human evaluators</strong> while detecting <strong>20.6% additional risks</strong> that humans overlook.</p>
           </div>
         </div>
         <div class="methodology-step">
+          <!-- Decorative emoji: hidden from assistive technology so it is not announced ahead of the heading. -->
+          <div class="feature-icon" aria-hidden="true">🔄</div>
           <h3 class="title is-4">2. Rule Evolution Cycle (REC)</h3>
           <div class="content">
+            <p>A hybrid rule system combining expert knowledge with autonomous learning:</p>
             <ul>
+              <li><strong>Static Rule Tree (Rₜ)</strong>: Expert-curated hierarchical rules (e.g., <code>Education → Academic Integrity → No Plagiarism</code>)</li>
+              <li><strong>Dynamic Rule Graph (Rɢ)</strong>: Automatically generated from successful self-corrections via dual-criteria clustering</li>
+              <li><strong>MetaLoop</strong>: Iterative feedback-revision mechanism with bounded retry count</li>
             </ul>
+            <p>Rules evolve through experience summarization and thematic clustering, enabling precise governance of emerging risk patterns.</p>
           </div>
         </div>
         <div class="methodology-step">
+          <!-- Decorative emoji: hidden from assistive technology so it is not announced ahead of the heading. -->
+          <div class="feature-icon" aria-hidden="true">⚡</div>
           <h3 class="title is-4">3. Robust Rule Vectors (RV) via Activation Steering</h3>
           <div class="content">
+            <p>Direct intervention at inference time without model retraining:</p>
             <ul>
+              <li>Generate <strong>steering vectors</strong> from contrasting compliant vs. non-compliant responses</li>
+              <li>Apply vectors to internal activations (optimal at Layer 18 for Llama 3.1-8B)</li>
+              <!-- Subscripts use <sub> markup rather than Unicode subscript letters, which many fonts lack
+                   and which cannot express the joint "s,l" index. -->
+              <li>Modify hidden states: <code>a′<sub>l</sub>(q) = a<sub>l</sub>(q) + α<sub>s</sub>v<sub>s,l</sub> + α<sub>h</sub>v<sub>h,l</sub></code></li>
+              <li><strong>No fine-tuning needed</strong>—works on closed-source models</li>
             </ul>
+            <p>This approach reduces computational costs while ensuring robust rule enforcement across diverse contexts.</p>
           </div>
         </div>
       </div>
     </div>
   </div>
 </section>
+<section class="section section-alt">
   <div class="container is-max-desktop">
     <div class="columns is-centered">
       <div class="column is-four-fifths">
+        <h2 class="title is-3">Experimental Results</h2>
         <div class="results-highlight">
           <!-- Headline metric; the greater-than sign is written as an entity so it is never read as markup. -->
           <span class="number">&gt;90%</span>
+          <span class="subtitle">Average Jailbreak Rate Reduction Across Domains</span>
         </div>
+        <h3 class="title is-4">Jailbreak Rate Reduction with REC (9,000 test queries)</h3>
         <div class="table-container">
           <table class="table is-striped is-fullwidth">
             <thead>
               <!-- Column headers carry an explicit scope so each data cell is associated with its column. -->
               <tr>
                 <th scope="col">Model</th>
+                <th scope="col">Domain</th>
                 <th scope="col">Original</th>
+                <th scope="col">+ Rules</th>
+                <th scope="col">+ MetaLoop 1-round</th>
+                <th scope="col">+ MetaLoop 2-round</th>
               </tr>
             </thead>
             <tbody>
               <tr>
+                <td rowspan="3"><strong>Qwen3-235B</strong></td>
+                <td>Education</td>
                 <td>56.33%</td>
+                <td>13.27%</td>
+                <td>6.02%</td>
                 <td><strong>3.13%</strong></td>
               </tr>
               <tr>
+                <td>Management</td>
+                <td>72.36%</td>
+                <td>18.46%</td>
+                <td>7.81%</td>
+                <td><strong>4.87%</strong></td>
+              </tr>
+              <tr>
+                <td>Finance</td>
+                <td>55.39%</td>
+                <td>14.73%</td>
+                <td>7.57%</td>
+                <td><strong>3.60%</strong></td>
+              </tr>
+              <tr>
+                <td rowspan="3"><strong>GPT-4o</strong></td>
+                <td>Education</td>
                 <td>58.81%</td>
+                <td>20.87%</td>
+                <td>10.79%</td>
                 <td><strong>6.43%</strong></td>
               </tr>
               <tr>
+                <td>Management</td>
+                <td>72.95%</td>
+                <td>9.15%</td>
+                <td>2.91%</td>
+                <td><strong>1.49%</strong></td>
+              </tr>
+              <tr>
+                <td>Finance</td>
+                <td>65.15%</td>
+                <td>7.91%</td>
+                <td>3.08%</td>
+                <td><strong>1.67%</strong></td>
+              </tr>
+            </tbody>
+          </table>
+        </div>
+        <h3 class="title is-4">Activation Steering Performance (Llama 3.1-8B-Instruct)</h3>
+        <div class="table-container">
+          <table class="table is-striped is-fullwidth">
+            <thead>
+              <!-- Column headers carry an explicit scope so each data cell is associated with its column. -->
+              <tr>
+                <th scope="col">Domain</th>
+                <th scope="col">Original</th>
+                <th scope="col">Rule Prompt</th>
+                <th scope="col">Steering Vector (RV)</th>
+              </tr>
+            </thead>
+            <tbody>
+              <tr>
+                <td>Education</td>
                 <td>67.45%</td>
+                <td>43.26%</td>
                 <td><strong>31.39%</strong></td>
+              </tr>
+              <tr>
+                <td>Management</td>
+                <td>75.77%</td>
+                <td>37.84%</td>
+                <td><strong>36.90%</strong></td>
+              </tr>
+              <tr>
+                <td>Finance</td>
+                <td>59.38%</td>
+                <td>49.95%</td>
+                <td><strong>37.11%</strong></td>
               </tr>
             </tbody>
           </table>
         </div>
+        <div class="quote-box">
+          <p>
+            ✅ Human evaluators prefer MENTOR-augmented responses <strong>68% of the time</strong> for safety,
+            appropriateness, and usefulness, with only 12% preference for original responses.
           </p>
         </div>
       </div>
   <div class="container is-max-desktop">
     <div class="columns is-centered">
       <div class="column is-four-fifths">
+        <h2 class="title is-3">Key Contributions</h2>
         <div class="content">
+          <div class="methodology-step">
+            <h3 class="title is-4">Novel Metacognitive Assessment</h3>
+            <p>
+              We introduce a metacognitive self-assessment tool that enables LLMs to critically evaluate their own reasoning
+              and outputs, achieving human-level performance (79.3% consistency) while detecting subtle value misalignments
+              that conventional methods miss.
+            </p>
+          </div>
+          <div class="methodology-step">
+            <h3 class="title is-4">Self-Evolving Rule Architecture</h3>
+            <p>
+              The Rule Evolution Cycle (REC) integrates expert-defined static rule trees with metacognition-driven dynamic
+              rule graphs, enabling continuous adaptation to emerging risks without manual intervention.
+            </p>
+          </div>
+          <div class="methodology-step">
+            <h3 class="title is-4">Efficient Activation Steering</h3>
+            <p>
+              By leveraging activation steering during inference, MENTOR enforces domain-specific rules robustly and
+              cost-effectively, significantly reducing computational resources compared to traditional fine-tuning methods.
+            </p>
+          </div>
+          <div class="methodology-step">
+            <h3 class="title is-4">Comprehensive Evaluation</h3>
+            <p>
+              We release a dataset of 9,000 domain-specific implicit-risk queries across education, finance, and management,
+              providing a benchmark for future research in domain-specific LLM safety.
+            </p>
+          </div>
         </div>
       </div>
     </div>
 <section class="section" id="BibTeX">
   <div class="container is-max-desktop content">
     <h2 class="title">BibTeX</h2>
+    <div class="code-block">
+      <code>@article{mentor2025,
+  title={MENTOR: A Metacognition-Driven Self-Evolution Framework for Uncovering and Mitigating Implicit Risks in Domain-Specific LLMs},
   author={Wu, Wen and Ying, Zhenyu and He, Liang and {Shell Team}},
   journal={Anonymous Submission},
   year={2025}
+}</code>
+    </div>
   </div>
 </section>
+<footer class="footer" style="background: var(--mentor-dark); color: white; padding: 3rem 1.5rem;">
   <div class="container">
     <div class="content has-text-centered">
       <p>
+        <strong style="color: white;">MENTOR Framework</strong> - A Metacognition-Driven Approach to LLM Safety
+      </p>
+      <p>
+        <!-- The licence link opens in a new tab: it gives up its opener reference and uses the HTTPS URL. -->
+        This website is licensed under a <a rel="license noopener noreferrer" target="_blank" style="color: #3498db;"
                                              href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
         Commons Attribution-ShareAlike 4.0 International License</a>.
       </p>
 </footer>
 <script>
   document.addEventListener('DOMContentLoaded', function() {
     // Add smooth scrolling for anchor links
     document.querySelectorAll('a[href^="#"]').forEach(anchor => {
         });
       });
     });
+    // Add animation to methodology steps on scroll
+    const observerOptions = {
+      threshold: 0.1,
+      rootMargin: '0px 0px -50px 0px'
+    };
+    const observer = new IntersectionObserver(function(entries) {
+      entries.forEach(entry => {
+        if (entry.isIntersecting) {
+          entry.target.style.opacity = '1';
+          entry.target.style.transform = 'translateY(0)';
+        }
+      });
+    }, observerOptions);
+    // Observe methodology steps
+    document.querySelectorAll('.methodology-step').forEach(step => {
+      step.style.opacity = '0';
+      step.style.transform = 'translateY(20px)';
+      step.style.transition = 'opacity 0.5s ease, transform 0.5s ease';
+      observer.observe(step);
+    });
   });
 </script>