| <!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <!-- Document metadata and web-font loading (Source Serif 4, DM Sans, JetBrains Mono). --> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| <title>Multilingual Refusal Alignment for Safer LLMs</title> |
| <meta name="description" content="RefusEU: a DPO-ready refusal alignment dataset covering 12 European languages, and a study of cross-lingual safety transfer in large language models (Findings of ACL 2026)."> |
| <!-- The stylesheet is served from fonts.googleapis.com, the font files from fonts.gstatic.com (CORS fetch, hence crossorigin). --> |
| <link rel="preconnect" href="https://fonts.googleapis.com"> |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
| <link href="https://fonts.googleapis.com/css2?family=Source+Serif+4:ital,opsz,wght@0,8..60,300;0,8..60,400;0,8..60,600;0,8..60,700;1,8..60,300;1,8..60,400&amp;family=DM+Sans:ital,wght@0,300;0,400;0,500;0,600;1,300&amp;family=JetBrains+Mono:wght@400;500&amp;display=swap" rel="stylesheet"> |
| <style> |
| /* Design tokens: colour palette, shadows, corner radius and font stacks, read through var() by the rules below. */ |
| :root { |
| --bg: #faf9f7; |
| --surface: #ffffff; |
| --border: #e8e4de; |
| --text-primary: #1a1714; |
| --text-secondary: #5c5650; |
| --text-muted: #8c867e; |
| --accent: #c0392b; |
| --accent-light: #fdf0ef; |
| --accent2: #2c5f8a; |
| --accent2-light: #edf3f9; |
| --tag-bg: #f0ede8; |
| --shadow: 0 2px 16px rgba(26,23,20,0.07); |
| --shadow-lg: 0 8px 40px rgba(26,23,20,0.11); |
| --radius: 8px; |
| --serif: 'Source Serif 4', Georgia, serif; |
| --sans: 'DM Sans', system-ui, sans-serif; |
| --mono: 'JetBrains Mono', monospace; |
| } |
| |
| /* Global reset: border-box sizing and zeroed margin/padding on every element and its ::before/::after. */ |
| *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; } |
| |
| /* Animate in-page anchor jumps such as the hero's #bibtex link. */ |
| html { scroll-behavior: smooth; } |
| |
| /* Page defaults: sans-serif stack, --bg background, primary text colour. */ |
| body { |
| font-family: var(--sans); |
| background: var(--bg); |
| color: var(--text-primary); |
| line-height: 1.7; |
| -webkit-font-smoothing: antialiased; |
| } |
| |
| |
| /* Hero banner: centred panel on the --surface colour, separated from the page body by a bottom border. */ |
| .hero { |
| background: var(--surface); |
| border-bottom: 1px solid var(--border); |
| padding: 80px 24px 64px; |
| text-align: center; |
| } |
| |
| /* Pill-shaped, uppercase venue label placed above the title. */ |
| .venue-badge { |
| display: inline-flex; |
| align-items: center; |
| gap: 6px; |
| background: var(--accent-light); |
| color: var(--accent); |
| border: 1px solid rgba(192,57,43,0.2); |
| border-radius: 20px; |
| padding: 5px 14px; |
| font-size: 0.78rem; |
| font-weight: 600; |
| letter-spacing: 0.04em; |
| text-transform: uppercase; |
| margin-bottom: 28px; |
| } |
| |
| |
| /* Paper title: serif, fluid size between 1.7rem and 2.6rem, width-capped and centred. */ |
| .hero h1 { |
| font-family: var(--serif); |
| font-size: clamp(1.7rem, 4vw, 2.6rem); |
| font-weight: 600; |
| line-height: 1.25; |
| max-width: 780px; |
| margin: 0 auto 8px; |
| color: var(--text-primary); |
| letter-spacing: -0.01em; |
| } |
| |
| /* Accent-coloured italics for an emphasised phrase inside the title. */ |
| .hero h1 em { |
| font-style: italic; |
| color: var(--accent); |
| } |
| |
| /* Author block: centred column holding the name list and the affiliation line. */ |
| .authors { |
| margin: 28px auto 0; |
| max-width: 660px; |
| font-size: 0.95rem; |
| color: var(--text-secondary); |
| } |
| |
| /* Author names wrap onto further centred rows when the line is too narrow. */ |
| .authors .name-list { |
| display: flex; |
| flex-wrap: wrap; |
| justify-content: center; |
| gap: 4px 18px; |
| font-weight: 500; |
| color: var(--text-primary); |
| margin-bottom: 6px; |
| } |
| |
| /* Affiliation markers attached to each author name. */ |
| .authors .name-list sup { |
| font-size: 0.65em; |
| color: var(--accent); |
| font-weight: 600; |
| } |
| |
| /* Affiliation line below the names, in smaller muted text. */ |
| .affil { |
| font-size: 0.82rem; |
| color: var(--text-muted); |
| line-height: 1.6; |
| } |
| |
| |
| /* Row of resource links under the author block; wraps and stays centred. */ |
| .links-row { |
| display: flex; |
| flex-wrap: wrap; |
| justify-content: center; |
| gap: 10px; |
| margin-top: 32px; |
| } |
| |
| /* Base button look shared by all variants; the transparent border keeps every variant the same size. */ |
| .btn { |
| display: inline-flex; |
| align-items: center; |
| gap: 7px; |
| padding: 9px 18px; |
| border-radius: 6px; |
| font-family: var(--sans); |
| font-size: 0.84rem; |
| font-weight: 500; |
| text-decoration: none; |
| transition: all 0.18s ease; |
| border: 1px solid transparent; |
| cursor: pointer; |
| } |
| |
| /* Dark filled variant (used for the Paper link). */ |
| .btn-primary { |
| background: var(--text-primary); |
| color: #fff; |
| } |
| .btn-primary:hover { background: #333; transform: translateY(-1px); box-shadow: var(--shadow); } |
| |
| /* Bordered variant on the surface colour (used for the BibTeX link). */ |
| .btn-outline { |
| background: var(--surface); |
| color: var(--text-primary); |
| border-color: var(--border); |
| } |
| .btn-outline:hover { border-color: var(--text-primary); transform: translateY(-1px); box-shadow: var(--shadow); } |
| |
| /* Blue-tinted variant (used for the dataset link). */ |
| .btn-dataset { |
| background: var(--accent2-light); |
| color: var(--accent2); |
| border-color: rgba(44,95,138,0.2); |
| } |
| .btn-dataset:hover { background: #dbe9f5; transform: translateY(-1px); } |
| |
| /* Fixed icon size; flex-shrink: 0 stops the icon from being squeezed by the label. */ |
| .btn svg { width: 15px; height: 15px; flex-shrink: 0; } |
| |
| |
| /* Centred 860px content column. NOTE(review): no element in the markup visible in this file uses .container. */ |
| .container { |
| max-width: 860px; |
| margin: 0 auto; |
| padding: 0 24px; |
| } |
| |
| /* Every <section> is itself a centred 860px column; the padding is inside that width because of the border-box reset. */ |
| section { |
| padding: 64px 24px; |
| max-width: 860px; |
| margin: 0 auto; |
| } |
| |
| /* Small uppercase accent label shown above a section's heading. */ |
| .section-label { |
| font-size: 0.72rem; |
| font-weight: 600; |
| letter-spacing: 0.1em; |
| text-transform: uppercase; |
| color: var(--accent); |
| margin-bottom: 12px; |
| } |
| |
| /* Section headings: serif, fluid size between 1.25rem and 1.6rem. */ |
| h2 { |
| font-family: var(--serif); |
| font-size: clamp(1.25rem, 2.5vw, 1.6rem); |
| font-weight: 600; |
| margin-bottom: 20px; |
| color: var(--text-primary); |
| letter-spacing: -0.01em; |
| } |
| |
| /* Body paragraphs in the secondary text colour. */ |
| p { |
| color: var(--text-secondary); |
| font-size: 0.97rem; |
| line-height: 1.8; |
| margin-bottom: 14px; |
| } |
| |
| /* The last paragraph in a container adds no trailing gap. */ |
| p:last-child { margin-bottom: 0; } |
| |
| |
| /* Abstract section: bottom rule under the section. */ |
| .abstract-section { |
| border-bottom: 1px solid var(--border); |
| } |
| |
| /* Card that frames the abstract paragraphs. */ |
| .abstract-card { |
| background: var(--surface); |
| border: 1px solid var(--border); |
| border-radius: 12px; |
| padding: 32px 36px; |
| box-shadow: var(--shadow); |
| } |
| |
| |
| /* Responsive card grid: as many columns of at least 240px as fit the row. */ |
| .highlights-grid { |
| display: grid; |
| grid-template-columns: repeat(auto-fit, minmax(240px, 1fr)); |
| gap: 16px; |
| margin-top: 8px; |
| } |
| |
| /* Card shell; position: relative anchors the ::before stripe and overflow: hidden clips it to the rounded corners. */ |
| .highlight-card { |
| background: var(--surface); |
| border: 1px solid var(--border); |
| border-radius: 10px; |
| padding: 22px 24px; |
| box-shadow: var(--shadow); |
| transition: box-shadow 0.2s, transform 0.2s; |
| position: relative; |
| overflow: hidden; |
| } |
| |
| /* 3px stripe along the top edge of each card; its colour depends on the card's position in the grid. */ |
| .highlight-card::before { |
| content: ''; |
| position: absolute; |
| top: 0; left: 0; right: 0; |
| height: 3px; |
| } |
| .highlight-card:nth-child(1)::before { background: var(--accent); } |
| .highlight-card:nth-child(2)::before { background: var(--accent2); } |
| .highlight-card:nth-child(3)::before { background: #2a7a4e; } |
| .highlight-card:nth-child(4)::before { background: #7a4a2a; } |
| |
| /* Lift the card slightly on hover. */ |
| .highlight-card:hover { box-shadow: var(--shadow-lg); transform: translateY(-2px); } |
| |
| /* Emoji icon on its own line at the top of the card. */ |
| .highlight-icon { |
| font-size: 1.5rem; |
| margin-bottom: 10px; |
| display: block; |
| } |
| |
| /* Card heading. */ |
| .highlight-title { |
| font-weight: 600; |
| font-size: 0.9rem; |
| color: var(--text-primary); |
| margin-bottom: 6px; |
| } |
| |
| /* Card body text: smaller than page paragraphs and without their bottom margin. */ |
| .highlight-card p { |
| font-size: 0.85rem; |
| line-height: 1.65; |
| margin: 0; |
| } |
| |
| |
| /* Full-width band for the dataset section; .dataset-inner below restores the 860px column. */ |
| .dataset-section { |
| background: var(--surface); |
| border-top: 1px solid var(--border); |
| border-bottom: 1px solid var(--border); |
| padding: 64px 24px; |
| } |
| |
| /* Centred content column inside the band. */ |
| .dataset-inner { |
| max-width: 860px; |
| margin: 0 auto; |
| } |
| |
| /* Responsive grid of statistic boxes, each at least 150px wide. */ |
| .dataset-stats { |
| display: grid; |
| grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); |
| gap: 16px; |
| margin: 28px 0; |
| } |
| |
| /* One statistic: a large number above a small label. */ |
| .stat-box { |
| background: var(--bg); |
| border: 1px solid var(--border); |
| border-radius: 10px; |
| padding: 20px; |
| text-align: center; |
| } |
| |
| /* The figure itself; display: block because the markup uses <span>. */ |
| .stat-number { |
| font-family: var(--serif); |
| font-size: 2rem; |
| font-weight: 600; |
| color: var(--accent); |
| display: block; |
| line-height: 1.1; |
| } |
| |
| /* Caption under the figure. */ |
| .stat-label { |
| font-size: 0.78rem; |
| color: var(--text-muted); |
| margin-top: 4px; |
| display: block; |
| font-weight: 500; |
| } |
| |
| /* Wrapping row of language tags. */ |
| .lang-pills { |
| display: flex; |
| flex-wrap: wrap; |
| gap: 7px; |
| margin-top: 16px; |
| } |
| |
| /* A single language tag in the monospace stack. */ |
| .lang-pill { |
| background: var(--tag-bg); |
| color: var(--text-secondary); |
| border-radius: 5px; |
| padding: 3px 10px; |
| font-size: 0.78rem; |
| font-weight: 500; |
| font-family: var(--mono); |
| } |
| |
| |
| /* Research-question list: unstyled <ul> laid out as a vertical stack of cards. */ |
| .rq-list { |
| list-style: none; |
| display: flex; |
| flex-direction: column; |
| gap: 14px; |
| margin-top: 8px; |
| } |
| |
| /* One question card: the tag on the left, the question text on the right. */ |
| .rq-item { |
| display: flex; |
| gap: 16px; |
| background: var(--surface); |
| border: 1px solid var(--border); |
| border-radius: 10px; |
| padding: 18px 20px; |
| box-shadow: var(--shadow); |
| } |
| |
| /* "RQn" tag. align-self: flex-start keeps the tag at its content height instead of |
| stretching to the height of the card; it replaces height: fit-content, which gives |
| the same layout but is not supported for the height property by older browsers. */ |
| .rq-tag { |
| flex-shrink: 0; |
| background: var(--text-primary); |
| color: #fff; |
| border-radius: 6px; |
| padding: 2px 9px; |
| font-size: 0.72rem; |
| font-weight: 700; |
| letter-spacing: 0.04em; |
| align-self: flex-start; |
| margin-top: 2px; |
| } |
| |
| /* Question text; margin: 0 cancels the bottom margin of the global p rule. */ |
| .rq-text { |
| font-size: 0.92rem; |
| color: var(--text-secondary); |
| line-height: 1.7; |
| margin: 0; |
| } |
| |
| |
| /* Figure frame: dashed border, with overflow: hidden so the image and caption follow the rounded corners. */ |
| .figure-block { |
| background: var(--surface); |
| border: 1.5px dashed var(--border); |
| border-radius: 10px; |
| padding: 0; |
| overflow: hidden; |
| margin: 24px 0; |
| } |
| |
| /* Images fill the frame by default; the inline style attributes on the <img> elements override the width. */ |
| .figure-block img { |
| width: 100%; |
| height: auto; |
| display: block; |
| } |
| |
| /* Caption strip below the image. */ |
| .figure-caption { |
| padding: 10px 16px 12px; |
| font-size: 0.8rem; |
| color: var(--text-muted); |
| border-top: 1px solid var(--border); |
| background: var(--bg); |
| font-style: italic; |
| } |
| |
| |
| /* Key-findings list: unstyled <ul> laid out as a vertical stack of cards. */ |
| .finding-list { |
| list-style: none; |
| display: flex; |
| flex-direction: column; |
| gap: 12px; |
| margin-top: 8px; |
| } |
| |
| /* One finding: a coloured dot followed by the text, both aligned to the top. */ |
| .finding-item { |
| display: flex; |
| gap: 14px; |
| align-items: flex-start; |
| padding: 16px 20px; |
| background: var(--surface); |
| border: 1px solid var(--border); |
| border-radius: 9px; |
| box-shadow: var(--shadow); |
| } |
| |
| /* Bullet dot; margin-top pushes it down from the top edge toward the first line of text. */ |
| .finding-dot { |
| width: 8px; |
| height: 8px; |
| border-radius: 50%; |
| background: var(--accent); |
| flex-shrink: 0; |
| margin-top: 8px; |
| } |
| |
| /* Items 2-5 override the dot colour; the first item keeps the accent default. */ |
| .finding-item:nth-child(2) .finding-dot { background: var(--accent2); } |
| .finding-item:nth-child(3) .finding-dot { background: #2a7a4e; } |
| .finding-item:nth-child(4) .finding-dot { background: #7a5f2a; } |
| .finding-item:nth-child(5) .finding-dot { background: #6a2a7a; } |
| |
| /* Finding text; margin: 0 cancels the bottom margin of the global p rule. */ |
| .finding-text { |
| font-size: 0.9rem; |
| color: var(--text-secondary); |
| line-height: 1.7; |
| margin: 0; |
| } |
| |
| /* The bold lead-in of each finding uses the primary text colour. */ |
| .finding-text strong { color: var(--text-primary); } |
| |
| |
| /* Full-width band for the citation section; .bibtex-inner below restores the 860px column. */ |
| .bibtex-section { |
| background: var(--surface); |
| border-top: 1px solid var(--border); |
| padding: 64px 24px; |
| } |
| |
| /* Centred content column inside the band. */ |
| .bibtex-inner { |
| max-width: 860px; |
| margin: 0 auto; |
| } |
| |
| /* Dark monospace panel for the BibTeX entry; position: relative anchors the copy button, |
| overflow-x: auto lets long lines scroll horizontally. */ |
| .code-block { |
| background: #1d1b19; |
| color: #d4cfc9; |
| border-radius: 10px; |
| padding: 24px 28px; |
| font-family: var(--mono); |
| font-size: 0.78rem; |
| line-height: 1.8; |
| overflow-x: auto; |
| margin-top: 16px; |
| position: relative; |
| } |
| |
| /* Colours for the hand-marked parts of the entry: field names, values, braces and commas. */ |
| .code-key { color: #e09a6a; } |
| .code-val { color: #8bbf7a; } |
| .code-brace { color: #888; } |
| |
| /* Copy button pinned to the top-right corner of the panel. */ |
| .copy-btn { |
| position: absolute; |
| top: 12px; right: 14px; |
| background: rgba(255,255,255,0.08); |
| border: 1px solid rgba(255,255,255,0.12); |
| color: #aaa; |
| padding: 4px 11px; |
| border-radius: 5px; |
| font-size: 0.72rem; |
| cursor: pointer; |
| font-family: var(--sans); |
| transition: all 0.15s; |
| } |
| .copy-btn:hover { background: rgba(255,255,255,0.14); color: #fff; } |
| |
| |
| /* Page footer: centred, small muted text above a top border. */ |
| footer { |
| background: var(--bg); |
| border-top: 1px solid var(--border); |
| text-align: center; |
| padding: 28px 24px; |
| font-size: 0.78rem; |
| color: var(--text-muted); |
| } |
| |
| /* Footer links are underlined only on hover. */ |
| footer a { color: var(--accent2); text-decoration: none; } |
| footer a:hover { text-decoration: underline; } |
| |
| |
| /* 1px horizontal rule placed between sections, limited to the 860px content width. */ |
| .divider { |
| height: 1px; |
| background: var(--border); |
| max-width: 860px; |
| margin: 0 auto; |
| } |
| |
| |
| /* Viewports up to 600px wide: tighter padding and gaps. */ |
| @media (max-width: 600px) { |
| .abstract-card { padding: 22px 20px; } |
| .links-row { gap: 8px; } |
| section { padding: 48px 20px; } |
| .dataset-section { padding: 48px 20px; } |
| .bibtex-section { padding: 48px 20px; } |
| } |
| |
| |
| /* Hero entrance animation: each direct child of .hero fades in while sliding up 18px. */ |
| @keyframes fadeUp { |
| from { opacity: 0; transform: translateY(18px); } |
| to { opacity: 1; transform: translateY(0); } |
| } |
| |
| /* fill-mode "both" holds the children at opacity 0 until their delay has elapsed. */ |
| .hero > * { |
| animation: fadeUp 0.55s ease both; |
| } |
| /* Staggered start times, top to bottom. */ |
| .hero .venue-badge { animation-delay: 0.05s; } |
| .hero h1 { animation-delay: 0.12s; } |
| .hero .authors { animation-delay: 0.2s; } |
| .hero .links-row { animation-delay: 0.28s; } |
| |
| /* Respect the "reduce motion" system setting: show the hero immediately and make |
| in-page anchor jumps instant instead of smooth-scrolled. */ |
| @media (prefers-reduced-motion: reduce) { |
| html { scroll-behavior: auto; } |
| .hero > * { animation: none; } |
| } |
| </style> |
| </head> |
| <body> |
|
|
| |
| <!-- Hero: venue badge, paper title, authors with affiliation markers, and resource links. --> |
| <header class="hero"> |
| <div class="venue-badge">Findings of ACL 2026</div> |


| <h1>Multilingual Refusal Alignment<br>for Safer Large Language Models</h1> |


| <div class="authors"> |
| <div class="name-list"> |
| <span>Aleksandra Krasnodębska<sup>†</sup></span> |
| <span>Wojciech Kusa<sup>†</sup></span> |
| <span>Aldo Lipani<sup>‡</sup></span> |
| </div> |
| <div class="affil"> |
| <sup>†</sup> NASK National Research Institute, Warsaw, Poland · |
| <sup>‡</sup> University College London, London, UK |
| </div> |
| </div> |


| <!-- The icons are decorative; the visible label next to each one names the link. --> |
| <div class="links-row"> |
| <a href="https://wojciechkusa.github.io/papers/RefusEU-2026.pdf" class="btn btn-primary"> |
| <svg aria-hidden="true" focusable="false" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"/><polyline points="14 2 14 8 20 8"/></svg> |
| Paper |
| </a> |
| <a href="https://huggingface.co/datasets/NASK-PIB/RefusEU" class="btn btn-dataset" target="_blank" rel="noopener noreferrer"> |
| <svg aria-hidden="true" focusable="false" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><ellipse cx="12" cy="5" rx="9" ry="3"/><path d="M21 12c0 1.66-4 3-9 3s-9-1.34-9-3"/><path d="M3 5v14c0 1.66 4 3 9 3s9-1.34 9-3V5"/></svg> |
| RefusEU Dataset |
| </a> |
| <a href="#bibtex" class="btn btn-outline"> |
| <svg aria-hidden="true" focusable="false" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M4 7V4h16v3"/><path d="M9 20h6"/><path d="M12 4v16"/></svg> |
| BibTeX |
| </a> |
| </div> |
| </header> |
|
|
|
|
| |
| <section class="abstract-section"> |
| <div class="section-label">Abstract</div> |
| <div class="abstract-card"> |
| <p>As Large Language Models are deployed globally, ensuring consistent safety across languages becomes paramount. Yet safety behaviors vary unpredictably between languages, and most alignment research remains English-centric — creating a critical gap for non-English speakers worldwide.</p> |
| <p>We systematically investigate the dynamics of multilingual alignment: whether single-language alignment transfers cross-lingually, how language consistency is preserved during training, and the resulting trade-offs with general capabilities. We introduce <strong>RefusEU</strong>, a novel refusal alignment dataset covering <strong>12 European languages</strong>, including a held-out test set for evaluating state-of-the-art models.</p> |
| <p>Our controlled Direct Preference Optimization (DPO) experiments reveal two key insights: aligning models exclusively in English is <em>insufficient</em> to ensure cross-lingual safety — even for identical harm categories. Conversely, training on multilingual datasets can improve safety <em>without degrading general performance</em>, as measured by the Global MMLU benchmark.</p> |
| </div> |
| </section> |
|
|
| <div class="divider"></div> |
|
|
| |
| <section> |
| <div class="section-label">Research Questions</div> |
| <h2>What We Set Out to Answer</h2> |
| <ul class="rq-list"> |
| <li class="rq-item"> |
| <span class="rq-tag">RQ1</span> |
| <p class="rq-text">Do we need to perform multilingual alignment for each language on the same groups of prompts, or is training in a single language (English) sufficient to achieve cross-lingual safety?</p> |
| </li> |
| <li class="rq-item"> |
| <span class="rq-tag">RQ2</span> |
| <p class="rq-text">How well is cross-lingual consistency — the ability to respond in the prompt's language — preserved during multilingual training, and how does it interact with safety?</p> |
| </li> |
| <li class="rq-item"> |
| <span class="rq-tag">RQ3</span> |
| <p class="rq-text">How does multilingual safety alignment influence general multilingual capabilities, including factual knowledge, fluency, and linguistic correctness?</p> |
| </li> |
| </ul> |
| </section> |
|
|
| <div class="divider"></div> |
|
|
| |
| <div class="dataset-section"> |
| <div class="dataset-inner"> |
| <div class="section-label">Dataset</div> |
| <h2>Introducing RefusEU</h2> |
| <p>RefusEU is the first European dataset designed for alignment training as DPO-ready triples — (question, chosen, rejected) — and includes a separate, contamination-free test split. Each chosen response is a high-quality refusal; each rejected response was generated by a safety-abliterated model.</p> |
|
|
| <!-- Headline dataset statistics: one .stat-box per figure, number above label. --> |
| <div class="dataset-stats"> |
| <div class="stat-box"> |
| <span class="stat-number">12</span> |
| <span class="stat-label">European languages</span> |
| </div> |
| <div class="stat-box"> |
| <span class="stat-number">4k+</span> |
| <span class="stat-label">pairs per language</span> |
| </div> |
| <div class="stat-box"> |
| <span class="stat-number">14</span> |
| <span class="stat-label">harm categories</span> |
| </div> |
| <div class="stat-box"> |
| <span class="stat-number">1,400</span> |
| <span class="stat-label">test samples / lang.</span> |
| </div> |
| </div> |
|
|
| <!-- The 12 languages covered by the dataset, one .lang-pill each. --> |
| <p style="margin-bottom:10px;"><strong style="color: var(--text-primary);">Languages covered:</strong></p> |
| <div class="lang-pills"> |
| <span class="lang-pill">English</span> |
| <span class="lang-pill">German</span> |
| <span class="lang-pill">French</span> |
| <span class="lang-pill">Italian</span> |
| <span class="lang-pill">Spanish</span> |
| <span class="lang-pill">Portuguese</span> |
| <span class="lang-pill">Polish</span> |
| <span class="lang-pill">Czech</span> |
| <span class="lang-pill">Slovak</span> |
| <span class="lang-pill">Slovenian</span> |
| <span class="lang-pill">Lithuanian</span> |
| <span class="lang-pill">Latvian</span> |
| </div> |
|
|
| <p style="margin-top: 20px;">Questions are generated using an adversarial pipeline based on Rainbow Teaming across 10 attack styles and 14 crime categories (Llama-Guard taxonomy). A multi-model labelling protocol (Llama-Guard-3-8B, PolyGuard-Qwen, GPT-4o-mini) ensures label quality, with a manual audit confirming 100% accuracy across 1,200 sampled pairs.</p> |
|
|
| <!-- Figure 1: dataset construction pipeline. figure/figcaption tie the caption to the image; |
| the classes are unchanged, so the existing .figure-block / .figure-caption rules still apply. --> |
| <figure class="figure-block" id="fig-dataset"> |
| <img src="img/dataset.png" alt="Dataset construction pipeline" class="figure-image" style="width: 50%; margin: 0 auto;" loading="lazy" decoding="async"> |
| <figcaption class="figure-caption">Figure 1 — Dataset construction process: adversarial prompt generation → multilingual translation → dual-model safety labelling → DPO triple curation.</figcaption> |
| </figure> |
| </div> |
| </div> |
|
|
| <div class="divider"></div> |
|
|
| |
| <section> |
| <div class="section-label">Methodology</div> |
| <h2>Experimental Design</h2> |
| <p>To isolate alignment dynamics, we start from <em>abliterated</em> Llama-3.1-8B and 70B models — versions where safety mechanisms have been deliberately removed via refusal direction ablation — then realign them using DPO under four dataset configurations:</p> |
|
|
| <div class="highlights-grid" style="margin: 20px 0 24px;"> |
| <div class="highlight-card"> |
| <span class="highlight-icon">⚖️</span> |
| <div class="highlight-title">Balanced</div> |
| <p>All 12 languages with equal representation (34,668 samples total).</p> |
| </div> |
| <div class="highlight-card"> |
| <span class="highlight-icon">π</span> |
| <div class="highlight-title">High-Resource Only</div> |
| <p>English, German, Italian, French, Spanish, Portuguese (17,334 samples).</p> |
| </div> |
| <div class="highlight-card"> |
| <span class="highlight-icon">🇬🇧</span> |
| <div class="highlight-title">English Only</div> |
| <p>Baseline to test whether English alignment is sufficient (2,889 samples).</p> |
| </div> |
| <div class="highlight-card"> |
| <span class="highlight-icon">π</span> |
| <div class="highlight-title">No English</div> |
| <p>All 11 non-English languages — tests transfer to English from others.</p> |
| </div> |
| </div> |
|
|
| <p>Additionally, 11 individual single-language DPO runs were performed to measure language-specific transfer. Evaluation uses Attack Success Rate (ASR) on RefusEU-test, language consistency, Global MMLU, and an LLM-as-a-Judge fluency/correctness protocol.</p> |
| </section> |
|
|
| <div class="divider"></div> |
|
|
| |
| <section> |
| <div class="section-label">Results</div> |
| <h2>Key Findings</h2> |
|
|
| <!-- Table 2 and Figure 2: ASR results, two images sharing one caption. figure/figcaption tie |
| the caption to the images; the classes are unchanged, so the existing CSS rules still apply. --> |
| <figure class="figure-block" id="fig-asr"> |
| <img src="img/asr-table.png" alt="ASR comparison table results" style="width: 100%; display: block; margin: 0 auto 16px;" loading="lazy" decoding="async"> |
| <img src="img/asr.png" alt="ASR comparison figure results" style="width: 100%; display: block; margin: 0 auto;" loading="lazy" decoding="async"> |
| <figcaption class="figure-caption">Table 2 and Figure 2 — Attack Success Rate (ASR %) on RefusEU-test. Lower is better. Balanced multilingual training achieves the lowest ASR across both model sizes.</figcaption> |
| </figure> |
|
|
| <ul class="finding-list"> |
| <li class="finding-item"> |
| <div class="finding-dot"></div> |
| <p class="finding-text"><strong>English-only alignment is insufficient.</strong> Training exclusively on English safety preferences leads to notably higher ASR for low-resource languages, particularly with Llama-70B — demonstrating that cross-lingual safety transfer from English alone cannot be relied upon.</p> |
| </li> |
| <li class="finding-item"> |
| <div class="finding-dot"></div> |
| <p class="finding-text"><strong>Balanced multilingual training works best.</strong> The lowest average ASR across all languages is consistently achieved by the balanced 12-language configuration for both the 8B and 70B models, with high-resource-only training as a strong second choice.</p> |
| </li> |
| <li class="finding-item"> |
| <div class="finding-dot"></div> |
| <p class="finding-text"><strong>Linguistic proximity enables transfer.</strong> Closely related language pairs — Polish–Czech and Portuguese–Spanish — exhibit strongly correlated ASR values across training configurations, suggesting that structural similarity facilitates cross-lingual safety generalization.</p> |
| </li> |
| <li class="finding-item"> |
| <div class="finding-dot"></div> |
| <p class="finding-text"><strong>Language consistency and safety interact non-trivially.</strong> While high language consistency is generally desirable, explicitly enforcing it can reduce safety in smaller models like Llama-8B. Llama-70B achieves near-100% consistency across all configurations; smaller models degrade under English-only setups.</p> |
| </li> |
| <li class="finding-item"> |
| <div class="finding-dot"></div> |
| <p class="finding-text"><strong>General capabilities are largely preserved.</strong> Performance degradation on Global MMLU stays below 0.006 for both model sizes. For low-resource languages on the 8B model, translation-based pipelines (translate → answer in English → translate back) outperform native-language generation even for the unmodified Instruct baseline.</p> |
| </li> |
| </ul> |
|
|
| <!-- Figure 3: ASR vs. language consistency. figure/figcaption tie the caption to the image; |
| the classes are unchanged, so the existing CSS rules still apply. --> |
| <figure class="figure-block" id="fig-scatter" style="margin-top: 24px;"> |
| <img src="img/language.png" alt="ASR vs. language consistency scatter plot" style="width: 90%; display: block; margin: 0 auto;" loading="lazy" decoding="async"> |
| <figcaption class="figure-caption">Figure 3 — ASR vs. language consistency across training setups. Llama-70B with high-resource training achieves the best combined performance.</figcaption> |
| </figure> |
| </section> |
|
|
| <div class="divider"></div> |
|
|
| |
| <section> |
| <div class="section-label">Contributions</div> |
| <h2>Summary</h2> |
| <div class="highlights-grid"> |
| <div class="highlight-card"> |
| <span class="highlight-icon">ποΈ</span> |
| <div class="highlight-title">RefusEU Dataset</div> |
| <p>The first DPO-ready multilingual refusal dataset covering 12 European languages, with a fixed contamination-free evaluation split and fully audited safety labels.</p> |
| </div> |
| <div class="highlight-card"> |
| <span class="highlight-icon">🔬</span> |
| <div class="highlight-title">Controlled Experiments</div> |
| <p>Systematic ablation across 4 training configurations + 11 single-language runs on deliberately de-safety-aligned base models for clean measurement.</p> |
| </div> |
| <div class="highlight-card"> |
| <span class="highlight-icon">π</span> |
| <div class="highlight-title">Multidimensional Evaluation</div> |
| <p>ASR, language consistency, Global MMLU, and fluency/correctness measured across all 12 languages, revealing trade-offs invisible under single-metric reporting.</p> |
| </div> |
| </div> |
| </section> |
|
|
| |
| <!-- Citation: BibTeX entry marked up by hand for colouring. The same entry is repeated as |
| plain text inside copyBibtex() in the inline script; keep the two in sync. --> |
| <div class="bibtex-section" id="bibtex"> |
| <div class="bibtex-inner"> |
| <div class="section-label">Citation</div> |
| <h2>BibTeX</h2> |
| <div class="code-block" id="bibtex-block"> |
| <!-- type="button": this button only runs a script and must never act as a submit control. --> |
| <button type="button" class="copy-btn" onclick="copyBibtex()">Copy</button> |
| <span class="code-brace">@inproceedings{</span><span class="code-val">krasnodebska2026refuseu</span><span class="code-brace">,</span><br> |
| <span class="code-key">title</span> = <span class="code-val">{Multilingual Refusal Alignment for Safer Large Language Models}</span>,<br> |
| <span class="code-key">author</span> = <span class="code-val">{Krasnodębska, Aleksandra and Kusa, Wojciech and Lipani, Aldo}</span>,<br> |
| <span class="code-key">booktitle</span> = <span class="code-val">{Findings of the Association for Computational Linguistics: ACL 2026}</span>,<br> |
| <span class="code-key">year</span> = <span class="code-val">{2026}</span>,<br> |
| <span class="code-key">address</span> = <span class="code-val">{San Diego, California, United States}</span>,<br> |
| <span class="code-key">publisher</span> = <span class="code-val">{Association for Computational Linguistics}</span><br> |
| <span class="code-brace">}</span> |
| </div> |
| </div> |
| </div> |
|
|
| <!-- Page footer: link to the dataset; rel="noopener noreferrer" because the link opens a new tab. --> |
| <footer> |
| <p style="margin-top:6px;">RefusEU dataset available at <a href="https://huggingface.co/datasets/NASK-PIB/RefusEU" target="_blank" rel="noopener noreferrer">huggingface.co/datasets/NASK-PIB/RefusEU</a></p> |
| </footer> |
|
|
| <script> |
| function copyBibtex() { |
| const raw = `@inproceedings{krasnodebska2026refuseu, |
| title = {Multilingual Refusal Alignment for Safer Large Language Models}, |
| author = {KrasnodΔbska, Aleksandra and Kusa, Wojciech and Lipani, Aldo}, |
| booktitle = {Findings of the Association for Computational Linguistics: ACL 2026}, |
| year = {2026}, |
| address = {San Diego, California, United States}, |
| publisher = {Association for Computational Linguistics} |
| }`; |
| navigator.clipboard.writeText(raw).then(() => { |
| const btn = document.querySelector('.copy-btn'); |
| btn.textContent = 'Copied!'; |
| setTimeout(() => btn.textContent = 'Copy', 2000); |
| }); |
| } |
| </script> |
|
|
| </body> |
| </html> |
|
|