# ParliaBench / app.py — Hugging Face Space source (commit 1c462d3, verified, by argyrotsipi)
"""
ParliaBench Demo β€” Hugging Face Space
Interactive inference demo for LLM-generated UK parliamentary speeches.
Based on:
"ParliaBench: An Evaluation and Benchmarking Framework for
LLM-Generated Parliamentary Speech"
Argyro Tsipi, NTUA Diploma Thesis, October 2025
Repos:
Models β†’ argyro/parliabench-{model}-lora
Dataset β†’ argyro/parliabench-gb-processed
Space β†’ argyro/parliabench-demo
"""
import json
import re
import time
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from utils import (
PARTIES, EUROVOC_TOPICS, HOUSES, MODELS, MODEL_FAMILY, MODEL_CONFIG,
DEFAULT_GEN_PARAMS, get_valid_houses, get_orientation,
build_context_string, count_tokens_approx, validate_speech,
)
from prompt_templates import build_full_prompt
# ─── Model cache ──────────────────────────────────────────────────────────────
# Process-wide cache mapping a model display name -> (model, tokenizer), so
# each checkpoint is loaded from the Hub at most once per Space lifetime.
_model_cache: dict[str, tuple] = {}
def _load_model_and_tokenizer(model_display_name: str):
    """Return the (model, tokenizer) pair for a display name, loading on cache miss.

    Fine-tuned variants are loaded as base weights + LoRA adapter via PEFT;
    baselines are loaded directly.  Loaded pairs are memoised in
    ``_model_cache`` keyed by display name.
    """
    cached = _model_cache.get(model_display_name)
    if cached is not None:
        return cached

    repo_id = MODELS[model_display_name]
    family = MODEL_FAMILY[model_display_name]
    base_repo = MODEL_CONFIG[family]["base_model"]
    is_finetuned = "fine-tuned" in model_display_name

    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Some checkpoints ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token

    has_cuda = torch.cuda.is_available()
    device_map = "auto" if has_cuda else None
    dtype = torch.float16 if has_cuda else torch.float32

    if is_finetuned:
        # The fine-tuned repo holds only a LoRA adapter: load the base
        # model first, then attach the adapter on top of it.
        from peft import PeftModel
        base = AutoModelForCausalLM.from_pretrained(
            base_repo, torch_dtype=dtype, device_map=device_map,
            trust_remote_code=True,
        )
        model = PeftModel.from_pretrained(base, repo_id)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            repo_id, torch_dtype=dtype, device_map=device_map,
            trust_remote_code=True,
        )

    model.eval()
    _model_cache[model_display_name] = (model, tokenizer)
    return model, tokenizer
# ─── Speech extraction (mirrors extract_speech in speech_generator.py) ────────
def _extract_speech(raw_text: str, family: str) -> str:
    """Extract the clean speech from raw decoded model output.

    Mirrors ``extract_speech`` in speech_generator.py.  The cleanup steps are
    order-dependent and must run in this sequence: slice out the assistant
    turn, cut at end markers, strip special/template tokens and
    meta-commentary, then normalise whitespace, markdown and punctuation.
    """
    # Per-family markers/tokens come from the shared MODEL_CONFIG table.
    cfg = MODEL_CONFIG[family]
    # Find start marker: keep only the text after the last occurrence
    # (i.e. the assistant's turn); fall back to the whole output.
    start = cfg["start_marker"]
    if start in raw_text:
        parts = raw_text.split(start)
        speech = parts[-1].lstrip("\n")
    else:
        speech = raw_text
    # Truncate at the first end marker found (first match wins).
    for em in cfg["end_markers"]:
        if em in speech:
            speech = speech.split(em)[0]
            break
    # Remove special tokens the tokenizer did not strip.
    for tok in cfg["special_tokens_to_remove"]:
        speech = speech.replace(tok, "")
    # Remove template artefacts: if a prompt label leaked into the output,
    # everything from that label onwards is discarded.
    for art in ["Context:", "Instruction:", "EUROVOC TOPIC:", "SECTION:",
                "PARTY:", "POLITICAL ORIENTATION:", "HOUSE:",
                "\nuser", "\nassistant", "\nsystem"]:
        if art in speech:
            speech = speech.split(art)[0]
    # Strip meta-commentary prefixes (chatty preambles before the speech).
    _strip_prefixes = [
        "Thank you for providing", "Thank you for your instruction",
        "Here is my speech:", "Here is my response:", "Response:",
        "Based on your specifications", "Based on the context provided",
    ]
    sl = speech.lower()
    for prefix in _strip_prefixes:
        if sl.startswith(prefix.lower()):
            if prefix.endswith(":"):
                # Colon-terminated prefix: drop exactly that prefix.
                speech = speech[len(prefix):].lstrip()
            else:
                # Otherwise drop the whole first paragraph/line, but only if
                # the break appears early enough to plausibly be a preamble.
                cut = speech.find("\n\n")
                if 0 < cut < 200:
                    speech = speech[cut + 2:].strip()
                else:
                    cut = speech.find("\n")
                    if 0 < cut < 150:
                        speech = speech[cut + 1:].strip()
            break
    # Llama reserved tokens, then any remaining <|...|> control tokens.
    speech = re.sub(r"<\|reserved_special_token_\d+\|>", "", speech)
    speech = re.sub(r"<\|[^|]*\|>", "", speech)
    # Whitespace: collapse 3+ newlines to a blank line, squeeze runs of spaces.
    speech = re.sub(r"\n{3,}", "\n\n", speech)
    speech = re.sub(r" {2,}", " ", speech)
    speech = speech.strip()
    # Leading punctuation artefacts (quotes/parens are kept as valid openers).
    speech = re.sub(r"^[^\w\s\"'(]+", "", speech).lstrip()
    speech = re.sub(r"^\.{2,}\s*", "", speech)
    # HTML tags / trailing dash rules left over from generation.
    speech = re.sub(r"</?[a-zA-Z][^>]*>", "", speech)
    speech = re.sub(r"----+\s*\.?\s*$", "", speech)
    # Qwen sometimes emits literal backslash escapes; turn them into real ones.
    if "\\n" in speech or "\\t" in speech:
        speech = speech.replace("\\n", "\n").replace("\\t", " ")
    # Markdown: drop heading markers and stray code fences.
    speech = re.sub(r"^#+\s+", "", speech)
    speech = re.sub(r"\n#+\s+", "\n", speech)
    speech = re.sub(r"\n?```\.?", "", speech)
    speech = speech.strip()
    # Final punctuation: ensure the speech ends on a sentence-final character.
    if speech and not speech.endswith((".", "!", "?", '"', "'")):
        speech = speech.rstrip() + "."
    return speech
# ─── Main generation function ─────────────────────────────────────────────────
def generate_speech(
    model_display_name: str,
    party: str,
    topic: str,
    section: str,
    house: str,
    instruction_input: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
    max_new_tokens: int,
    min_words: int,
    max_words: int,
):
    """Generate a parliamentary speech and return (speech, prompt, stats, params).

    Builds the family-specific prompt from the UI selections, runs the
    (cached) model with nucleus sampling, then cleans and validates the
    decoded output.  On model-load failure it returns an error tuple
    instead of raising so the Gradio UI stays responsive.
    """
    family = MODEL_FAMILY[model_display_name]
    cfg = MODEL_CONFIG[family]
    # Fall back to a generic instruction when the user left the box empty.
    instruction = (instruction_input.strip()
                   if instruction_input and instruction_input.strip()
                   else f"Address the debate on {section} on {topic}.")
    full_prompt = build_full_prompt(
        model_family=family,
        party=party,
        topic=topic,
        section=section,
        house=house,
        instruction=instruction,
        min_words=int(min_words),
        max_words=int(max_words),
    )
    prompt_tokens = count_tokens_approx(full_prompt)
    try:
        model, tokenizer = _load_model_and_tokenizer(model_display_name)
    except Exception as exc:
        # UI boundary: surface the failure as text rather than crashing.
        return (
            f"⚠️ Model loading failed:\n{exc}\n\n"
            "Make sure the model repository exists on Hugging Face "
            "and you have sufficient GPU memory (≥16 GB recommended).",
            full_prompt,
            "*Model loading error — see output above.*",
            "",
        )
    inputs = tokenizer([full_prompt], return_tensors="pt").to(model.device)
    # BUGFIX: `pad_token_id or eos_token_id` silently discarded a legitimate
    # pad token id of 0 (falsy); test for None explicitly instead.
    pad_id = (tokenizer.pad_token_id
              if tokenizer.pad_token_id is not None
              else tokenizer.eos_token_id)
    t0 = time.time()
    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
            repetition_penalty=float(repetition_penalty),
            pad_token_id=pad_id,
            eos_token_id=tokenizer.eos_token_id,
            stop_strings=cfg["stop_strings"],
            tokenizer=tokenizer,
            use_cache=True,
        )
    elapsed = time.time() - t0
    # The decoded text still contains the prompt; _extract_speech slices it
    # off at the family's start marker.
    raw = tokenizer.decode(out_ids[0], skip_special_tokens=False)
    speech = _extract_speech(raw, family)
    is_valid, reason = validate_speech(speech, int(min_words), int(max_words))
    wc = len(speech.split())
    stats = (
        f"**Tokens in prompt:** ~{prompt_tokens} | "
        f"**Words generated:** {wc} | "
        f"**Time:** {elapsed:.1f}s | "
        f"**Validation:** {'✅ ' + reason if is_valid else '⚠️ ' + reason}"
    )
    params_used = (
        f"temperature={temperature}, top_p={top_p}, "
        f"repetition_penalty={repetition_penalty}, max_new_tokens={max_new_tokens}"
    )
    return speech, full_prompt, stats, params_used
# ─── Sample gallery ───────────────────────────────────────────────────────────
with open("sample_data.json") as _f:
SAMPLES = json.load(_f)
def _render_sample(s: dict) -> str:
if s.get("is_finetuned"):
border_color = "#2d9e6b"
badge_bg, badge_color = "#d4f0e4", "#1a6b45"
badge_text = "✦ Fine-tuned"
else:
border_color = "#8a8a8a"
badge_bg, badge_color = "#e8e8e8", "#444444"
badge_text = "β—‡ Baseline"
meta = (
f'<div style="background:#f8f9fa;border-left:4px solid {border_color};'
f'border-radius:0 8px 8px 0;padding:14px 18px;margin-bottom:14px;'
f'color:#222222;font-size:.92em;line-height:1.9;">'
f'<span style="display:inline-block;padding:3px 12px;border-radius:12px;'
f'background:{badge_bg};color:{badge_color};font-weight:700;'
f'font-size:.82em;margin-bottom:8px;">{badge_text}</span><br>'
f'<strong style="color:#222;">Model</strong>&nbsp;&nbsp;{s["model"]}<br>'
f'<strong style="color:#222;">Party</strong>&nbsp;&nbsp;{s["party"]} &nbsp;Β·&nbsp; '
f'<strong style="color:#222;">Orientation</strong>&nbsp;&nbsp;{s["orientation"]}<br>'
f'<strong style="color:#222;">Topic</strong>&nbsp;&nbsp;{s["topic"]} &nbsp;Β·&nbsp; '
f'<strong style="color:#222;">Section</strong>&nbsp;&nbsp;{s["section"]}<br>'
f'<strong style="color:#222;">House</strong>&nbsp;&nbsp;{s["house"]} &nbsp;Β·&nbsp; '
f'<strong style="color:#222;">Words</strong>&nbsp;&nbsp;{s["word_count"]}'
f'</div>'
)
speech = (
f'<div style="background:#ffffff;border:1px solid #e0e0e0;border-radius:8px;'
f'padding:20px 24px;font-size:.95em;line-height:1.8;color:#1a1a1a;">'
f'{s["speech"]}</div>'
)
return meta + speech
# ─── Dynamic UI helpers ───────────────────────────────────────────────────────
def _update_house(party):
    """Restrict the House dropdown to the houses valid for the chosen party."""
    houses = get_valid_houses(party)
    default = houses[0]
    return gr.update(choices=houses, value=default)
def _update_orientation(party):
    """Refresh the read-only orientation field from the selected party."""
    orientation = get_orientation(party)
    return gr.update(value=orientation)
# ─── Gradio app ───────────────────────────────────────────────────────────────
CSS = """
#title { text-align: center; margin-bottom: .4em; }
#sub { text-align: center; color: #666; margin-bottom: 1.4em; font-size: .9em; }
#speech textarea { font-size: .95em; line-height: 1.65; }
#prompt textarea { font-family: monospace; font-size: .78em; }
.tab-nav { justify-content: center !important; }
.tab-nav button { font-size: .95em !important; }
"""
with gr.Blocks(css=CSS, title="ParliaBench Demo") as demo:
gr.HTML("""
<style>
table, table th, table td {
color: black !important;
}
</style>
""")
gr.Markdown("# ParliaBench β€” UK Parliamentary Speech Generation",
elem_id="title")
gr.Markdown(
"Inference demo for five LLMs fine-tuned on **ParlaMint-GB** with QLoRA \n"
"Koniaris, Tsipi & Tsanakas Β· [arXiv:2511.08247](https://arxiv.org/abs/2511.08247) Β· NTUA 2025 \n"
"[🌐 Project Page](https://argyrotsipi.github.io/ParliaBench/)",
elem_id="sub",
)
with gr.Tabs():
# ── Tab 1: About ──────────────────────────────────────────────────────
with gr.Tab("About"):
gr.HTML("""
<style>
/* Main section titles */
.pb-section h2 {
color: #00C389 !important;
}
/* Standalone h2 titles (e.g., About ParliaBench, Models, Dataset, Speech Validation) */
h2 {
color: #00C389 !important;
}
/* Uppercase mini section labels (e.g., QLORA CONFIGURATION, POLITICAL PARTIES IN DATASET, GENERATION PARAMETERS) */
div[style*="text-transform:uppercase"] {
color: #00C389 !important;
}
/* Model-specific small headers (Mistral, Llama 3.1, Gemma, etc.) */
.pb-template + p,
div p[style*="font-weight:600"] {
color: #00C389 !important;
}
.pb-card {
background:#f8f9fc;
border:1px solid #dde3ee;
border-radius:12px;
overflow:hidden;
margin:12px 0 20px;
}
.pb-card table {
border-collapse:collapse;
width:100%;
font-size:.88em;
}
.pb-card thead tr {
background:#e8eef6;
}
.pb-card thead th {
padding:9px 14px;
color:#1a3a5c;
font-weight:600;
text-align:left;
letter-spacing:.01em;
}
.pb-card thead th.r { text-align:right; }
.pb-card thead th.c { text-align:center; }
.pb-card tbody tr:nth-child(odd) { background:#f8f9fc; }
.pb-card tbody tr:nth-child(even) { background:#ffffff; }
.pb-card tbody td {
padding:7px 14px;
color:#222;
vertical-align:middle;
}
.pb-card tbody td.r { text-align:right; font-weight:600; }
.pb-card tbody td.c { text-align:center; color:#555; }
.pb-card tbody td.mono { font-family:monospace; font-size:.9em; color:#555; }
.pb-section {
border-top:1px solid #dde3ee;
margin:24px 0 6px;
padding-top:18px;
}
.pb-pre {
background:#f4f6f9;
border:1px solid #dde3ee;
border-radius:8px;
padding:12px 16px;
font-size:.83em;
color:#222 !important;
line-height:1.6;
overflow-x:auto;
font-family:monospace;
}
.pb-template {
background:#f4f6f9;
border:1px solid #c5d2e8;
border-left:3px solid #2d5282;
border-radius:0 8px 8px 0;
padding:10px 14px;
font-size:.79em;
color:#222 !important;;
line-height:1.6;
font-family:monospace;
overflow-x:auto;
margin:0;
}
.pb-template.purple {
border-left-color:#7b5ea7;
border-color:#ccc5e0;
background:#f6f4fc;
}
.pb-val-step {
padding:7px 14px;
color:#222;
vertical-align:middle;
line-height:1.55;
}
.pb-val-num {
font-weight:700;
color:#2d5282;
text-align:center;
padding:7px 10px;
}
.pb-val-label {
font-weight:600;
padding:7px 14px;
color:#222;
min-width:140px;
}
</style>
<div style="color:#222;font-size:.92em;line-height:1.8;padding:4px 0 8px 0;">
<h2 style="color:#1e2a3a;margin-bottom:6px;">About ParliaBench</h2>
<p><strong>ParliaBench</strong> is a benchmark and evaluation framework for LLM-generated UK parliamentary speeches,
combining a curated dataset, multi-dimensional evaluation metrics, and five domain-specific fine-tuned models.<br>
Paper: <a href="https://arxiv.org/abs/2511.08247" style="color:#4a7fa5;">arXiv:2511.08247</a></p>
<!-- ── Dataset ─────────────────────────────────────────────────────── -->
<div class="pb-section"><h2 style="color:#1e2a3a;margin:0 0 6px;">Dataset</h2></div>
<p>Constructed from the UK subset of the <strong>ParlaMint corpus</strong>, 2015–2022.
Four-step pipeline: XML parsing β†’ metadata alignment β†’ content filtering β†’ EuroVoc thematic classification.</p>
<div style="display:flex;gap:20px;flex-wrap:wrap;margin:14px 0 4px;">
<div style="flex:1;min-width:240px;">
<p style="font-weight:600;color:#2c3a4a;margin-bottom:6px;font-size:.85em;text-transform:uppercase;letter-spacing:.05em;">Corpus Statistics</p>
<div class="pb-card">
<table>
<thead><tr><th>Statistic</th><th class="r">Value</th></tr></thead>
<tbody>
<tr><td>Total speeches</td><td class="r">447,778</td></tr>
<tr><td>Unique speakers</td><td class="r">1,901</td></tr>
<tr><td>Political affiliations</td><td class="r">11</td></tr>
<tr><td>Total words</td><td class="r">~99.94 million</td></tr>
<tr><td>Mean speech length</td><td class="r">223 words</td></tr>
<tr><td>Median speech length</td><td class="r">99 words</td></tr>
<tr><td>P10 β€” min threshold</td><td class="r">43 words</td></tr>
<tr><td>P90 β€” max threshold</td><td class="r">635 words</td></tr>
<tr><td>EuroVoc topic domains</td><td class="r">21</td></tr>
<tr><td>Temporal coverage</td><td class="r">2015–2022</td></tr>
</tbody>
</table>
</div>
</div>
<div style="flex:2;min-width:340px;">
<p style="font-weight:600;color:#2c3a4a;margin-bottom:6px;font-size:.85em;text-transform:uppercase;letter-spacing:.05em;">Political Parties in Dataset</p>
<div class="pb-card">
<table>
<thead><tr><th>Party</th><th class="c">Orientation</th><th class="r">Speeches</th><th class="r">Speakers</th><th class="r">Share</th></tr></thead>
<tbody>
<tr><td style="font-weight:600;">Conservative</td><td class="c">Centre-right</td><td class="r">263,513</td><td class="r">792</td><td class="r">58.9%</td></tr>
<tr><td style="font-weight:600;">Labour</td><td class="c">Centre-left</td><td class="r">108,831</td><td class="r">592</td><td class="r">24.3%</td></tr>
<tr><td style="font-weight:600;">Scottish National Party</td><td class="c">Centre-left</td><td class="r">23,562</td><td class="r">67</td><td class="r">5.3%</td></tr>
<tr><td style="font-weight:600;">Liberal Democrats</td><td class="c">Centre / centre-left</td><td class="r">23,517</td><td class="r">168</td><td class="r">5.3%</td></tr>
<tr><td style="font-weight:600;">Crossbench</td><td class="c">Non-partisan</td><td class="r">11,878</td><td class="r">215</td><td class="r">2.7%</td></tr>
<tr><td style="font-weight:600;">Democratic Unionist Party</td><td class="c">Right</td><td class="r">6,610</td><td class="r">15</td><td class="r">1.5%</td></tr>
<tr><td style="font-weight:600;">Independent</td><td class="c">Non-partisan</td><td class="r">2,783</td><td class="r">45</td><td class="r">0.6%</td></tr>
<tr><td style="font-weight:600;">Plaid Cymru</td><td class="c">Centre-left to left</td><td class="r">2,229</td><td class="r">7</td><td class="r">0.5%</td></tr>
<tr><td style="font-weight:600;">Green Party</td><td class="c">Left</td><td class="r">1,992</td><td class="r">3</td><td class="r">0.4%</td></tr>
<tr><td style="font-weight:600;">Non-Affiliated</td><td class="c">Non-partisan</td><td class="r">1,713</td><td class="r">60</td><td class="r">0.4%</td></tr>
<tr><td style="font-weight:600;">Bishops</td><td class="c">Non-partisan</td><td class="r">1,150</td><td class="r">41</td><td class="r">0.3%</td></tr>
</tbody>
</table>
</div>
<p style="color:#888;font-size:.78em;margin-top:4px;">Bishops, Crossbench, and Non-Affiliated are formal parliamentary affiliations. Minimum threshold: 1,000 speeches.</p>
</div>
</div>
<!-- ── Models ─────────────────────────────────────────────────────────── -->
<div class="pb-section"><h2 style="color:#1e2a3a;margin:0 0 6px;">Models</h2></div>
<p>Five LLMs fine-tuned with <strong>QLoRA</strong> via the Unsloth framework:</p>
<div class="pb-card">
<table>
<thead><tr><th>Model</th><th>Base (Unsloth 4-bit)</th><th>HF Repository</th></tr></thead>
<tbody>
<tr><td style="font-weight:600;border-left:3px solid #4a7fa5;">Llama-3.1-8B</td><td class="mono">unsloth/Meta-Llama-3.1-8B-bnb-4bit</td><td><a href="https://huggingface.co/argyrotsipi/parliabench-unsloth-llama-3.1-8b" style="color:#4a7fa5;">argyrotsipi/parliabench-unsloth-llama-3.1-8b</a></td></tr>
<tr><td style="font-weight:600;border-left:3px solid #7b5ea7;">Gemma-2-9B</td><td class="mono">unsloth/gemma-2-9b-bnb-4bit</td><td><a href="https://huggingface.co/argyrotsipi/parliabench-unsloth-gemma-2-9b" style="color:#4a7fa5;">argyrotsipi/parliabench-unsloth-gemma-2-9b</a></td></tr>
<tr><td style="font-weight:600;border-left:3px solid #2d5282;">Mistral-7B</td><td class="mono">unsloth/mistral-7b-v0.3-bnb-4bit</td><td><a href="https://huggingface.co/argyrotsipi/parliabench-unsloth-mistral-7b-v0.3" style="color:#4a7fa5;">argyrotsipi/parliabench-unsloth-mistral-7b-v0.3</a></td></tr>
<tr><td style="font-weight:600;border-left:3px solid #4a6080;">Qwen2-7B</td><td class="mono">unsloth/Qwen2-7B-bnb-4bit</td><td><a href="https://huggingface.co/argyrotsipi/parliabench-unsloth-qwen-2-7b" style="color:#4a7fa5;">argyrotsipi/parliabench-unsloth-qwen-2-7b</a></td></tr>
<tr><td style="font-weight:600;border-left:3px solid #4a4a6a;">Yi-1.5-6B</td><td class="mono">unsloth/Yi-1.5-6B-bnb-4bit</td><td><a href="https://huggingface.co/argyrotsipi/parliabench-unsloth-yi-1.5-6b" style="color:#4a7fa5;">argyrotsipi/parliabench-unsloth-yi-1.5-6b</a></td></tr>
</tbody>
</table>
</div>
<div style="display:flex;gap:20px;flex-wrap:wrap;">
<div style="flex:1;min-width:240px;">
<p style="font-weight:600;color:#2c3a4a;margin-bottom:6px;font-size:.85em;text-transform:uppercase;letter-spacing:.05em;">QLoRA Configuration</p>
<div class="pb-card">
<table>
<thead><tr><th>Parameter</th><th class="r">Value</th></tr></thead>
<tbody>
<tr><td>LoRA rank (r)</td><td class="r">16</td></tr>
<tr><td>LoRA alpha</td><td class="r">16</td></tr>
<tr><td>Target modules</td><td class="r">q, k, v, o, gate, up, down</td></tr>
<tr><td>Dropout</td><td class="r">0</td></tr>
<tr><td>Batch size</td><td class="r">64</td></tr>
<tr><td>Learning rate</td><td class="r">2e-4</td></tr>
<tr><td>Optimizer</td><td class="r">AdamW (fused)</td></tr>
<tr><td>Max steps</td><td class="r">11,194 (~2 epochs)</td></tr>
<tr><td>Warmup steps</td><td class="r">336</td></tr>
<tr><td>Max sequence length</td><td class="r">1,024</td></tr>
</tbody>
</table>
</div>
</div>
<div style="flex:1;min-width:240px;">
<p style="font-weight:600;color:#2c3a4a;margin-bottom:6px;font-size:.85em;text-transform:uppercase;letter-spacing:.05em;">Generation Parameters</p>
<div class="pb-card">
<table>
<thead><tr><th>Parameter</th><th class="r">Value</th></tr></thead>
<tbody>
<tr><td>Temperature</td><td class="r">0.7</td></tr>
<tr><td>Top-p</td><td class="r">0.85</td></tr>
<tr><td>Repetition penalty</td><td class="r">1.2</td></tr>
<tr><td>Max new tokens</td><td class="r">850</td></tr>
<tr><td>Min words (P10)</td><td class="r">43</td></tr>
<tr><td>Max words (P90)</td><td class="r">635</td></tr>
<tr><td>Sampling</td><td class="r">Nucleus (top-p)</td></tr>
<tr><td>Max regen attempts</td><td class="r">3</td></tr>
</tbody>
</table>
</div>
</div>
</div>
<!-- ── Prompt Architecture ─────────────────────────────────────────────── -->
<div class="pb-section"><h2 style="color:#1e2a3a;margin:0 0 6px;">Prompt Architecture</h2></div>
<p style="margin-bottom:6px;"><strong>System prompt β€” training</strong> (no word count):</p>
<pre class="pb-pre">You are a seasoned UK parliamentary member. Use proper British parliamentary language
appropriate for the specified House. The speech should reflect the political orientation
and typical positions of the specified party on the given topic.</pre>
<p style="margin-bottom:6px;margin-top:12px;"><strong>System prompt β€” generation</strong> (includes word count target):</p>
<pre class="pb-pre">You are a seasoned UK parliamentary member. Generate a coherent speech of
{min_words}-{max_words} words in standard English (no Unicode artifacts, no special
characters). Use proper British parliamentary language appropriate for the specified
House. The speech should reflect the political orientation and typical positions of the
specified party on the given topic.</pre>
<p style="margin-bottom:6px;margin-top:12px;"><strong>Context string</strong> (pipe-separated, injected as user turn):</p>
<pre class="pb-pre">EUROVOC TOPIC: {topic} | SECTION: {section} | PARTY: {party} | POLITICAL ORIENTATION: {orientation} | HOUSE: {house}</pre>
<p style="font-weight:600;margin-top:18px;margin-bottom:10px;color:#1a3a5c;">Model-specific chat templates</p>
<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(300px,1fr));gap:14px;margin-bottom:20px;">
<div>
<p style="font-weight:600;color:#2d5282;margin:0 0 5px;font-size:.85em;">Mistral</p>
<pre class="pb-template">&lt;s&gt;[INST] {SYSTEM_PROMPT}
Context: {context}
Instruction: {instruction} [/INST] {response}&lt;/s&gt;</pre>
</div>
<div>
<p style="font-weight:600;color:#2d5282;margin:0 0 5px;font-size:.85em;">Llama 3.1</p>
<pre class="pb-template">&lt;|begin_of_text|&gt;&lt;|start_header_id|&gt;system&lt;|end_header_id|&gt;
{SYSTEM_PROMPT}&lt;|eot_id|&gt;&lt;|start_header_id|&gt;user&lt;|end_header_id|&gt;
Context: {context}
Instruction: {instruction}&lt;|eot_id|&gt;&lt;|start_header_id|&gt;assistant&lt;|end_header_id|&gt;
{response}&lt;|eot_id|&gt;</pre>
</div>
<div>
<p style="font-weight:600;color:#7b5ea7;margin:0 0 5px;font-size:.85em;">Gemma 2</p>
<pre class="pb-template purple">&lt;bos&gt;&lt;start_of_turn&gt;user
{SYSTEM_PROMPT}
Context: {context}
Instruction: {instruction}&lt;end_of_turn&gt;
&lt;start_of_turn&gt;model
{response}&lt;end_of_turn&gt;</pre>
</div>
<div>
<p style="font-weight:600;color:#7b5ea7;margin:0 0 5px;font-size:.85em;">Qwen2 &amp; Yi-1.5 (ChatML)</p>
<pre class="pb-template purple">&lt;|im_start|&gt;system
{SYSTEM_PROMPT}&lt;|im_end|&gt;
&lt;|im_start|&gt;user
Context: {context}
Instruction: {instruction}&lt;|im_end|&gt;
&lt;|im_start|&gt;assistant
{response}&lt;|im_end|&gt;</pre>
</div>
</div>
<!-- ── Speech Validation ───────────────────────────────────────────────── -->
<div class="pb-section"><h2 style="color:#1e2a3a;margin:0 0 6px;">Speech Validation</h2></div>
<p>Every generated speech passes a <strong>9-step validation pipeline</strong>. Invalid speeches are automatically
regenerated up to 3 times. Baseline models exhibited higher failure rates, suggesting fine-tuning
improved output quality directly.</p>
<div class="pb-card" style="margin-top:12px;">
<table>
<thead><tr>
<th style="text-align:center;width:36px;">#</th>
<th style="min-width:140px;">Check</th>
<th>Detail</th>
</tr></thead>
<tbody>
<tr>
<td style="text-align:center;font-weight:700;color:#2d5282;">1</td>
<td style="font-weight:600;">Template leakage</td>
<td>27 markers: role tokens (<code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">\nuser</code>, <code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">\nassistant</code>), context labels (<code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">Context:</code>, <code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">Instruction:</code>), special tokens (<code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">[INST]</code>, im_start, etc.)</td>
</tr>
<tr>
<td style="text-align:center;font-weight:700;color:#2d5282;">2</td>
<td style="font-weight:600;">Unicode corruption</td>
<td>14 corruption patterns + 11 forbidden script ranges (CJK, Cyrillic, Arabic, Thai, technical symbols)</td>
</tr>
<tr>
<td style="text-align:center;font-weight:700;color:#2d5282;">3</td>
<td style="font-weight:600;">Language detection</td>
<td>spaCy <code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">en_core_web_sm</code> with 85% confidence threshold on texts &ge; 30 characters</td>
</tr>
<tr>
<td style="text-align:center;font-weight:700;color:#2d5282;">4</td>
<td style="font-weight:600;">Repetition</td>
<td>Same word &gt; 3&times; consecutive; sequences of 3–7 words repeated &gt; 3&times;; &gt; 5 ordinal counting words</td>
</tr>
<tr>
<td style="text-align:center;font-weight:700;color:#2d5282;">5</td>
<td style="font-weight:600;">Semantic relevance</td>
<td>Cosine similarity &lt; 0.08 via <code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">all-MiniLM-L6-v2</code> against &ldquo;UK parliamentary debate about {section} on {topic}&rdquo;</td>
</tr>
<tr>
<td style="text-align:center;font-weight:700;color:#2d5282;">6</td>
<td style="font-weight:600;">Length</td>
<td>Valid word count: 43–635 words (P10–P90 of training corpus)</td>
</tr>
<tr>
<td style="text-align:center;font-weight:700;color:#2d5282;">7</td>
<td style="font-weight:600;">Concatenation</td>
<td>Rejects if &ge; 4 parliamentary opening phrases (<code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">My Lords</code>, <code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">Mr Speaker</code> &hellip;) suggesting multiple speeches joined</td>
</tr>
<tr>
<td style="text-align:center;font-weight:700;color:#2d5282;">8</td>
<td style="font-weight:600;">Corrupted endings</td>
<td>Nonsensical suffixes (e.g. <code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">&#9613;&#9613;&#9613;</code>, <code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">});</code>)</td>
</tr>
<tr>
<td style="text-align:center;font-weight:700;color:#2d5282;">9</td>
<td style="font-weight:600;">Refusal patterns</td>
<td>AI refusal phrases (<code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">I cannot generate</code>, <code style="background:#eef2ff;padding:1px 5px;border-radius:3px;">I'm sorry but I cannot</code> &hellip;)</td>
</tr>
</tbody>
</table>
</div>
</div>
""")
# ── Results Table ─────────────────────────────────────────────────
gr.Markdown("## Results\n\n27,560 fully evaluated speeches β€” baseline (B) vs fine-tuned (F) across all 14 metrics.")
gr.HTML("""
<div style="font-size:.87em;overflow-x:auto;padding:4px 0 20px 0;">
<p style="color:#555;margin-bottom:10px;">
<span style="background:#d4edda;color:#155724;padding:2px 9px;border-radius:5px;font-weight:600;">green = significant improvement</span>
&nbsp;
<span style="background:#f8d7da;color:#721c24;padding:2px 9px;border-radius:5px;font-weight:600;">red = significant regression</span>
&nbsp;
<span style="color:#888;">PPL &darr; &middot; Self-BLEU &darr; &middot; all others &uarr;</span>
</p>
<div style="background:#f8f9fc;border:1px solid #dde3ee;border-radius:12px;overflow:hidden;min-width:960px;">
<table style="border-collapse:collapse;width:100%;font-size:.82em;">
<thead>
<tr style="background:#e8eef6;">
<th rowspan="2" style="padding:9px 11px;text-align:left;color:#1a3a5c;border-right:2px solid #c5d2e8;min-width:140px;">Model</th>
<th colspan="5" style="padding:7px 8px;text-align:center;border-right:2px solid #c5d2e8;color:#2d5282;background:#dce8f5;">Linguistic Quality</th>
<th colspan="4" style="padding:7px 8px;text-align:center;border-right:2px solid #c5d2e8;color:#4a3a7a;background:#e8e0f5;">Semantic Coherence</th>
<th colspan="5" style="padding:7px 8px;text-align:center;color:#1a3a5c;background:#dce8f5;">Political Authenticity</th>
</tr>
<tr style="background:#eef2f8;font-size:.9em;">
<th style="padding:5px 6px;text-align:center;color:#2d5282;">PPL&darr;</th>
<th style="padding:5px 6px;text-align:center;color:#2d5282;">Dist-N&uarr;</th>
<th style="padding:5px 6px;text-align:center;color:#2d5282;">Self-BLEU&darr;</th>
<th style="padding:5px 6px;text-align:center;color:#2d5282;">J_Coh&uarr;</th>
<th style="padding:5px 6px;text-align:center;border-right:2px solid #c5d2e8;color:#2d5282;">J_Conc&uarr;</th>
<th style="padding:5px 6px;text-align:center;color:#4a3a7a;">GRUEN&uarr;</th>
<th style="padding:5px 6px;text-align:center;color:#4a3a7a;">BERTScore&uarr;</th>
<th style="padding:5px 6px;text-align:center;color:#4a3a7a;">MoverScore&uarr;</th>
<th style="padding:5px 6px;text-align:center;border-right:2px solid #c5d2e8;color:#4a3a7a;">J_Rel&uarr;</th>
<th style="padding:5px 6px;text-align:center;color:#1a3a5c;">PSA&uarr;</th>
<th style="padding:5px 6px;text-align:center;color:#1a3a5c;">Party Align&uarr;</th>
<th style="padding:5px 6px;text-align:center;color:#1a3a5c;">J_Auth&uarr;</th>
<th style="padding:5px 6px;text-align:center;color:#1a3a5c;">J_PolApp&uarr;</th>
<th style="padding:5px 6px;text-align:center;color:#1a3a5c;">J_Qual&uarr;</th>
</tr>
</thead>
<tbody>
<!-- Llama -->
<tr style="background:#f4f8fc;">
<td style="padding:7px 11px;font-weight:600;color:#1e2a3a;border-left:4px solid #4a7fa5;border-right:2px solid #c5d2e8;">Llama 3.1 8B (B)</td>
<td style="padding:5px 6px;text-align:center;color:#222;">60.854</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.988</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.006</td>
<td style="padding:5px 6px;text-align:center;color:#222;">7.041</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">5.935</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.539</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.803</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.505</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">5.465</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.399</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.504</td>
<td style="padding:5px 6px;text-align:center;color:#222;">4.403</td>
<td style="padding:5px 6px;text-align:center;color:#222;">6.177</td>
<td style="padding:5px 6px;text-align:center;color:#222;">4.791</td>
</tr>
<tr style="background:#e6eef8;">
<td style="padding:7px 11px;font-weight:600;color:#1e2a3a;border-left:4px solid #4a7fa5;border-right:2px solid #c5d2e8;">Llama 3.1 8B (F)</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">31.724 ↓</td>
<td style="padding:5px 6px;text-align:center;background:#f8d7da;color:#721c24;font-weight:600;">0.974 ↓</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.018</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">7.915 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;border-right:2px solid #c5d2e8;">7.129 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#f8d7da;color:#721c24;font-weight:600;">0.508 ↓</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.820 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.511 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;border-right:2px solid #c5d2e8;">6.139 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.487 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.576 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">6.106 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">7.277 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">5.399 ↑</td>
</tr>
<!-- Gemma -->
<tr style="background:#f5f3fc;border-top:2px solid #dde3ee;">
<td style="padding:7px 11px;font-weight:600;color:#1e2a3a;border-left:4px solid #7b5ea7;border-right:2px solid #c5d2e8;">Gemma 2 9B (B)</td>
<td style="padding:5px 6px;text-align:center;color:#222;">89.784</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.992</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.008</td>
<td style="padding:5px 6px;text-align:center;color:#222;">7.788</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">4.784</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.526</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.804</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.508</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">5.782</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.444</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.543</td>
<td style="padding:5px 6px;text-align:center;color:#222;">3.837</td>
<td style="padding:5px 6px;text-align:center;color:#222;">6.498</td>
<td style="padding:5px 6px;text-align:center;color:#222;">4.442</td>
</tr>
<tr style="background:#ece8f8;">
<td style="padding:7px 11px;font-weight:600;color:#1e2a3a;border-left:4px solid #7b5ea7;border-right:2px solid #c5d2e8;">Gemma 2 9B (F)</td>
<td style="padding:5px 6px;text-align:center;background:#f8d7da;color:#721c24;font-weight:600;">101.578 ↑</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.990</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.010</td>
<td style="padding:5px 6px;text-align:center;background:#f8d7da;color:#721c24;font-weight:600;">7.507 ↓</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">5.006</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.501</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.804</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.510 ↑</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">5.529</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.498 ↑</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.590</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">4.209 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">7.293 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">4.950 ↑</td>
</tr>
<!-- Mistral -->
<tr style="background:#f2f6fb;border-top:2px solid #dde3ee;">
<td style="padding:7px 11px;font-weight:600;color:#1e2a3a;border-left:4px solid #2d5282;border-right:2px solid #c5d2e8;">Mistral 7B v0.3 (B)</td>
<td style="padding:5px 6px;text-align:center;color:#222;">31.280</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.966</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.008</td>
<td style="padding:5px 6px;text-align:center;color:#222;">6.598</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">6.899</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.555</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.810</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.505</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">5.418</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.418</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.521</td>
<td style="padding:5px 6px;text-align:center;color:#222;">4.237</td>
<td style="padding:5px 6px;text-align:center;color:#222;">5.617</td>
<td style="padding:5px 6px;text-align:center;color:#222;">4.179</td>
</tr>
<tr style="background:#dde8f5;">
<td style="padding:7px 11px;font-weight:600;color:#1e2a3a;border-left:4px solid #2d5282;border-right:2px solid #c5d2e8;">Mistral 7B v0.3 (F)</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">29.562 ↓</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.972 ↑</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.016</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">7.961 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;border-right:2px solid #c5d2e8;">8.962 ↑</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.552</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.825 ↑</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.508</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;border-right:2px solid #c5d2e8;">5.681 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.437 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#f8d7da;color:#721c24;font-weight:600;">0.507 ↓</td>
<td style="padding:5px 6px;text-align:center;background:#f8d7da;color:#721c24;font-weight:600;">3.983 ↓</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">6.382 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#f8d7da;color:#721c24;font-weight:600;">3.727 ↓</td>
</tr>
<!-- Qwen -->
<tr style="background:#f3f5f9;border-top:2px solid #dde3ee;">
<td style="padding:7px 11px;font-weight:600;color:#1e2a3a;border-left:4px solid #4a6080;border-right:2px solid #c5d2e8;">Qwen2 7B (B)</td>
<td style="padding:5px 6px;text-align:center;color:#222;">44.927</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.981</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.020</td>
<td style="padding:5px 6px;text-align:center;color:#222;">7.911</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">5.928</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.488</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.803</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.508</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">6.904</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.444</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.560</td>
<td style="padding:5px 6px;text-align:center;color:#222;">6.565</td>
<td style="padding:5px 6px;text-align:center;color:#222;">7.291</td>
<td style="padding:5px 6px;text-align:center;color:#222;">6.348</td>
</tr>
<tr style="background:#e4e8f0;">
<td style="padding:7px 11px;font-weight:600;color:#1e2a3a;border-left:4px solid #4a6080;border-right:2px solid #c5d2e8;">Qwen2 7B (F)</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">36.090 ↓</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.982</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.017 ↓</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">8.060 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;border-right:2px solid #c5d2e8;">7.625 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.539 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.821 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.512 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#f8d7da;color:#721c24;font-weight:600;border-right:2px solid #c5d2e8;">6.009 ↓</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.488 ↑</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.572</td>
<td style="padding:5px 6px;text-align:center;background:#f8d7da;color:#721c24;font-weight:600;">5.731 ↓</td>
<td style="padding:5px 6px;text-align:center;color:#222;">7.138</td>
<td style="padding:5px 6px;text-align:center;background:#f8d7da;color:#721c24;font-weight:600;">5.014 ↓</td>
</tr>
<!-- Yi -->
<tr style="background:#f4f4f8;border-top:2px solid #dde3ee;">
<td style="padding:7px 11px;font-weight:600;color:#1e2a3a;border-left:4px solid #4a4a6a;border-right:2px solid #c5d2e8;">Yi 6B (B)</td>
<td style="padding:5px 6px;text-align:center;color:#222;">82.100</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.990</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.006</td>
<td style="padding:5px 6px;text-align:center;color:#222;">6.741</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">4.303</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.563</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.799</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.505</td>
<td style="padding:5px 6px;text-align:center;color:#222;border-right:2px solid #c5d2e8;">4.490</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.343</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.423</td>
<td style="padding:5px 6px;text-align:center;color:#222;">2.981</td>
<td style="padding:5px 6px;text-align:center;color:#222;">5.385</td>
<td style="padding:5px 6px;text-align:center;color:#222;">3.083</td>
</tr>
<tr style="background:#e8e8f0;">
<td style="padding:7px 11px;font-weight:600;color:#1e2a3a;border-left:4px solid #4a4a6a;border-right:2px solid #c5d2e8;">Yi 6B (F)</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">42.893 ↓</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.987</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.016</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">8.043 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;border-right:2px solid #c5d2e8;">6.856 ↑</td>
<td style="padding:5px 6px;text-align:center;color:#222;">0.537</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.817 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.511 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;border-right:2px solid #c5d2e8;">5.984 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.493 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">0.582 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">6.102 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">7.326 ↑</td>
<td style="padding:5px 6px;text-align:center;background:#d4edda;color:#155724;font-weight:600;">5.392 ↑</td>
</tr>
</tbody>
</table>
</div>
<p style="color:#888;font-size:.79em;margin-top:8px;">&#8593;/&#8595; p &lt; 0.05 after Bonferroni correction. PSA and Party Align on 0&ndash;1 scale; J scores on 0&ndash;10 scale; GRUEN/BERT/Mover on 0&ndash;1; PPL raw.</p>
</div>
""")
gr.Markdown("---\n\n## Political Spectrum & Party Alignment")
# ── PSA diagram — static HTML illustrating the two novel metrics.
# NOTE: the HTML payload below is deliberately preserved byte-for-byte
# (it matches the figures reported in the thesis); edit only with care.
gr.HTML("""
<div style="color:#222;padding:4px 0 24px 0;">
<p style="color:#444;margin-bottom:20px;line-height:1.7;">
These are the two <strong>novel embedding-based metrics</strong> introduced by ParliaBench to measure
political authenticity β€” dimensions entirely unavailable to conventional NLP metrics.
</p>
<!-- Two-column layout -->
<div style="display:flex;gap:24px;flex-wrap:wrap;">
<!-- PSA Card -->
<div style="flex:1;min-width:300px;background:#f8f9fc;border:1px solid #dde3ee;border-radius:10px;padding:20px 22px;">
<h3 style="color:#1a3a5c;margin:0 0 12px 0;font-size:1em;">Political Spectrum Alignment (PSA)</h3>
<p style="color:#444;font-size:.88em;line-height:1.7;margin-bottom:16px;">
Measures how well a generated speech's <em>ideological positioning</em> matches the intended
party orientation on the left–right spectrum.
</p>
<!-- Spectrum diagram -->
<div style="background:#fff;border:1px solid #e0e0e0;border-radius:8px;padding:16px 12px 10px;margin-bottom:14px;">
<div style="position:relative;height:28px;margin:0 8px 6px;">
<div style="position:absolute;top:12px;left:0;right:0;height:4px;background:linear-gradient(to right,#c0392b,#e67e22,#f1c40f,#2ecc71,#3498db);border-radius:2px;"></div>
<!-- Party dots -->
<div style="position:absolute;top:2px;left:2%;transform:translateX(-50%);" title="Green Party">
<div style="width:12px;height:12px;background:#27ae60;border-radius:50%;border:2px solid #fff;box-shadow:0 1px 3px rgba(0,0,0,.3);"></div>
</div>
<div style="position:absolute;top:2px;left:18%;transform:translateX(-50%);" title="SNP / Labour / Plaid">
<div style="width:12px;height:12px;background:#e74c3c;border-radius:50%;border:2px solid #fff;box-shadow:0 1px 3px rgba(0,0,0,.3);"></div>
</div>
<div style="position:absolute;top:2px;left:36%;transform:translateX(-50%);" title="LibDems">
<div style="width:12px;height:12px;background:#f39c12;border-radius:50%;border:2px solid #fff;box-shadow:0 1px 3px rgba(0,0,0,.3);"></div>
</div>
<div style="position:absolute;top:2px;left:72%;transform:translateX(-50%);" title="Conservative">
<div style="width:12px;height:12px;background:#3498db;border-radius:50%;border:2px solid #fff;box-shadow:0 1px 3px rgba(0,0,0,.3);"></div>
</div>
<div style="position:absolute;top:2px;left:88%;transform:translateX(-50%);" title="DUP">
<div style="width:12px;height:12px;background:#8e44ad;border-radius:50%;border:2px solid #fff;box-shadow:0 1px 3px rgba(0,0,0,.3);"></div>
</div>
<!-- Generated speech marker -->
<div style="position:absolute;top:-2px;left:65%;transform:translateX(-50%);">
<div style="width:0;height:0;border-left:7px solid transparent;border-right:7px solid transparent;border-top:12px solid #e74c3c;"></div>
</div>
</div>
<div style="display:flex;justify-content:space-between;font-size:.72em;color:#888;margin:0 8px;">
<span>Far-left (βˆ’6)</span><span>Centre (0)</span><span>Far-right (+6)</span>
</div>
<div style="font-size:.74em;color:#555;margin-top:10px;line-height:1.6;">
<span style="display:inline-block;width:10px;height:10px;background:#27ae60;border-radius:50%;vertical-align:middle;margin-right:4px;"></span>Green &nbsp;
<span style="display:inline-block;width:10px;height:10px;background:#e74c3c;border-radius:50%;vertical-align:middle;margin-right:4px;"></span>Labour/SNP &nbsp;
<span style="display:inline-block;width:10px;height:10px;background:#f39c12;border-radius:50%;vertical-align:middle;margin-right:4px;"></span>LibDems &nbsp;
<span style="display:inline-block;width:10px;height:10px;background:#3498db;border-radius:50%;vertical-align:middle;margin-right:4px;"></span>Conservative &nbsp;
<span style="display:inline-block;width:10px;height:10px;background:#8e44ad;border-radius:50%;vertical-align:middle;margin-right:4px;"></span>DUP &nbsp;
<span style="color:#e74c3c;font-weight:700;">&#9660; generated speech</span>
</div>
</div>
<div style="background:#eef4ff;border-left:3px solid #4a90d9;padding:10px 14px;border-radius:0 6px 6px 0;font-size:.83em;color:#222;line-height:1.7;">
<strong>How it's computed:</strong><br>
1. Build <em>centroid embeddings</em> for each orientation (Far-left &rarr; Far-right) from real ParlaMint-GB speeches<br>
2. Embed the generated speech with <code>all-mpnet-base-v2</code><br>
3. Find the closest orientation centroid via cosine similarity<br>
4. Score = <code>sim(speech, closest_centroid) &times; max(0, 1 &minus; &Delta;&phi;/12)</code><br>
&nbsp;&nbsp;&nbsp;where &Delta;&phi; = |expected_orientation &minus; closest_orientation|<br>
5. Range 0&rarr;1; perfect alignment approaches 1
</div>
</div>
<!-- Party Alignment Card -->
<div style="flex:1;min-width:300px;background:#f8fcf8;border:1px solid #ddeedd;border-radius:10px;padding:20px 22px;">
<h3 style="color:#1a4a2a;margin:0 0 12px 0;font-size:1em;">Party Alignment</h3>
<p style="color:#444;font-size:.88em;line-height:1.7;margin-bottom:16px;">
Measures how closely a generated speech matches the <em>linguistic style and rhetoric</em>
of the specified party, independent of spectrum position.
</p>
<!-- Party centroid diagram -->
<div style="background:#fff;border:1px solid #e0e0e0;border-radius:8px;padding:16px 12px;margin-bottom:14px;">
<div style="position:relative;height:130px;">
<!-- Axes hint -->
<div style="position:absolute;top:50%;left:50%;width:90%;height:1px;background:#eee;transform:translate(-50%,-50%);"></div>
<div style="position:absolute;top:10%;left:50%;width:1px;height:80%;background:#eee;transform:translateX(-50%);"></div>
<!-- Party centroids -->
<div style="position:absolute;top:20%;left:68%;">
<div style="width:36px;height:36px;background:#3498db22;border:2px solid #3498db;border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:.68em;color:#1a5c8a;font-weight:700;">Con</div>
</div>
<div style="position:absolute;top:55%;left:20%;">
<div style="width:36px;height:36px;background:#e74c3c22;border:2px solid #e74c3c;border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:.68em;color:#8a1a1a;font-weight:700;">Lab</div>
</div>
<div style="position:absolute;top:15%;left:28%;">
<div style="width:34px;height:34px;background:#f1c40f22;border:2px solid #f1c40f;border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:.65em;color:#7a6000;font-weight:700;">SNP</div>
</div>
<div style="position:absolute;top:65%;left:62%;">
<div style="width:32px;height:32px;background:#27ae6022;border:2px solid #27ae60;border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:.62em;color:#1a5c30;font-weight:700;">LibD</div>
</div>
<!-- Generated speech -->
<div style="position:absolute;top:38%;left:50%;">
<div style="width:14px;height:14px;background:#e74c3c;border-radius:50%;border:3px solid #fff;box-shadow:0 2px 6px rgba(0,0,0,.3);transform:translate(-50%,-50%);"></div>
</div>
<!-- Dotted line to Lab -->
<svg style="position:absolute;top:0;left:0;width:100%;height:100%;pointer-events:none;">
<line x1="50%" y1="38%" x2="23%" y2="73%" stroke="#e74c3c" stroke-width="1.5" stroke-dasharray="4,3" opacity="0.7"/>
<text x="32%" y="52%" font-size="9" fill="#e74c3c" opacity="0.9">sim=0.61</text>
</svg>
</div>
<div style="font-size:.74em;color:#555;line-height:1.6;margin-top:4px;">
<span style="display:inline-block;width:10px;height:10px;background:#e74c3c;border-radius:50%;vertical-align:middle;margin-right:4px;"></span>generated speech (expected: Labour)
&nbsp;&middot;&nbsp; circles = party centroid embeddings
</div>
</div>
<div style="background:#eefaf2;border-left:3px solid #27ae60;padding:10px 14px;border-radius:0 6px 6px 0;font-size:.83em;color:#222;line-height:1.7;">
<strong>How it's computed:</strong><br>
1. Build a <em>centroid embedding</em> per party from all real speeches in that party's training data<br>
2. Embed the generated speech with <code>all-mpnet-base-v2</code><br>
3. Score = <code>cosine_similarity(speech, expected_party_centroid)</code><br>
4. Range 0&rarr;1; captures party-specific vocabulary, rhetorical style, and framing beyond ideological position alone
</div>
</div>
</div>
<div style="background:#f0f4ff;border:1px solid #c5d2e8;border-radius:8px;padding:14px 18px;margin-top:20px;font-size:.85em;color:#222;line-height:1.7;">
<strong>Key finding:</strong> Both metrics successfully discriminate their target dimensions (both p &lt; 0.001).
All five fine-tuned models showed statistically significant improvements in PSA (effect sizes d = 0.14&ndash;1.05),
validating that fine-tuning genuinely improves ideological alignment &mdash; not just surface fluency.
Mistral achieved the highest PSA after fine-tuning (8.94), while Llama led on Party Alignment (6.19).
</div>
</div>
""")
# Evaluation-framework overview and citation.
# Fix: the original string contained mojibake from a cp1252 round-trip
# ("β€”" for "—", "Β·" for "·", "↑"/"↓" for "↑"/"↓", "–" for "–"),
# which rendered garbled to users; the intended characters are restored.
# (The HTML table above this block already used the correct characters.)
gr.Markdown("""
---
## Evaluation Framework
27,560 fully evaluated speeches across three assessment dimensions:
### Linguistic quality
- **Perplexity (PPL)** — text naturalness via GPT-2 (↓ better)
- **Distinct-N** — lexical diversity via unique n-gram ratios (↑ better)
- **Self-BLEU** — intra-model diversity; lower = more varied outputs (↓ better)
- **J_Coh / J_Conc** — LLM-as-a-Judge coherence and conciseness (1–10 scale)
### Semantic coherence
- **GRUEN** — grammaticality and semantic coherence (↑ better)
- **BERTScore** — semantic similarity via RoBERTa-large F1 (↑ better)
- **MoverScore** — Earth Mover's Distance over contextual embeddings (↑ better)
- **J_Rel** — LLM-as-a-Judge relevance to prompt (1–10 scale)
### Political authenticity *(novel metrics)*
- **Political Spectrum Alignment (PSA)** — cosine similarity to orientation centroids weighted by ideological distance on a 13-point left–right scale
- **Party Alignment** — cosine similarity to party-specific embedding centroids
- **J_Auth / J_PolApp / J_Qual** — LLM-as-a-Judge authenticity, political appropriateness, overall quality (1–10 scale)
LLM judge: **FlowJudge-v0.1** (3.8B, Phi-3.5-mini architecture) — architecturally independent from all evaluated models.
---
## Citation
```bibtex
@article{koniaris2025parliabench,
  title   = {ParliaBench: An Evaluation and Benchmarking Framework for
             LLM-Generated Parliamentary Speech},
  author  = {Koniaris, Marios and Tsipi, Argyro and Tsanakas, Panayiotis},
  journal = {arXiv preprint arXiv:2511.08247},
  year    = {2025},
  url     = {https://arxiv.org/abs/2511.08247}
}
```
*National Technical University of Athens · School of Electrical and Computer Engineering*
""")
# ── Tab 2: Sample Gallery ─────────────────────────────
with gr.Tab("Sample Gallery"):
# Intro blurb shown at the top of the gallery tab.
gr.Markdown(
"### Generated Speech Examples\n"
"Representative outputs from the ParliaBench evaluation set β€” "
"one per model, comparing **baseline** and **fine-tuned** performance."
)
with gr.Row():
# Radio filter that narrows the sample dropdown below to
# fine-tuned-only or baseline-only outputs ("All" shows everything).
filter_radio = gr.Radio(
choices=["All", "Fine-tuned", "Baseline"],
value="All",
label="Filter by type",
interactive=True,
)
def _build_choices(filter_val):
    """Return (dropdown labels, matching sample dicts) for a filter value.

    `filter_val` is one of "All", "Fine-tuned", "Baseline"; anything else
    behaves like "All". The second element is the filtered sample list, in
    the same order as the labels.
    """
    if filter_val == "Fine-tuned":
        subset = [s for s in SAMPLES if s.get("is_finetuned")]
    elif filter_val == "Baseline":
        subset = [s for s in SAMPLES if not s.get("is_finetuned")]
    else:
        subset = SAMPLES
    # One label per sample: "<FT|Base> Β· model Β· party Β· topic".
    labels = [
        f"{'FT' if s.get('is_finetuned') else 'Base'} Β· {s['model']} Β· {s['party']} Β· {s['topic']}"
        for s in subset
    ]
    return labels, subset
# Populate the dropdown from the unfiltered ("All") sample list.
initial_choices, initial_filtered = _build_choices("All")
sample_sel = gr.Dropdown(
choices=initial_choices,
value=initial_choices[0],
label="Select a speech",
interactive=True,
)
# NOTE(review): assumes SAMPLES is non-empty — `initial_choices[0]` and
# `SAMPLES[0]` would raise IndexError otherwise; confirm at load time.
sample_html = gr.HTML(_render_sample(SAMPLES[0]))
def _show_sample(choice: str, filter_val: str) -> str:
    """Render the sample whose dropdown label equals `choice`.

    Recomputes the filtered subset for `filter_val`, then scans it for the
    sample whose label matches. Returns rendered HTML, or "" if no sample
    matches (e.g. a stale dropdown value).
    """
    _, subset = _build_choices(filter_val)
    chosen = None
    for s in subset:
        label = f"{'FT' if s.get('is_finetuned') else 'Base'} Β· {s['model']} Β· {s['party']} Β· {s['topic']}"
        if label == choice:
            # Keep scanning so that, on duplicate labels, the last match
            # wins β€” same outcome as the original dict-overwrite lookup.
            chosen = s
    return _render_sample(chosen) if chosen else ""
def _update_filter(filter_val):
    """Repopulate the sample dropdown and preview when the filter changes.

    Fix: the original indexed `choices[0]` unconditionally, raising
    IndexError whenever a filter matched zero samples; an empty result now
    clears the dropdown and the preview pane instead of crashing the UI.
    """
    choices, filtered = _build_choices(filter_val)
    if not filtered:
        return gr.update(choices=[], value=None), ""
    return gr.update(choices=choices, value=choices[0]), _render_sample(filtered[0])
# Wiring: the filter rebuilds dropdown + preview; selecting an entry
# re-renders only the preview pane.
filter_radio.change(fn=_update_filter, inputs=filter_radio, outputs=[sample_sel, sample_html])
sample_sel.change(fn=_show_sample, inputs=[sample_sel, filter_radio], outputs=sample_html)
# ── Tab 3: Generate Speech ─────────────────────────────────────────────
with gr.Tab("Generate Speech"):
with gr.Row():
# Left column: model choice, speech context, and sampling controls.
with gr.Column(scale=1):
gr.Markdown("### Configuration")
model_select = gr.Dropdown(
choices=list(MODELS.keys()),
value="Llama-3.1-8B (fine-tuned)",
label="Model",
info="Fine-tuned = QLoRA adapter on Unsloth base; Baseline = raw 4-bit base model",
)
with gr.Group():
party_select = gr.Dropdown(
choices=PARTIES, value="Conservative", label="Party",
)
# Read-only: derived from the selected party via get_orientation()
# and refreshed by the party_select.change handler below.
orientation_box = gr.Textbox(
value=get_orientation("Conservative"),
label="Political Orientation (auto-filled)",
interactive=False,
)
house_select = gr.Dropdown(
choices=HOUSES, value="House of Commons", label="House",
info="Some parties are restricted to the Lords",
)
topic_select = gr.Dropdown(
choices=EUROVOC_TOPICS, value="POLITICS", label="EuroVoc Topic",
info="21 domains from the EUROVOC thesaurus",
)
section_input = gr.Textbox(
value="National Health Service Funding",
label="Debate Section / Bill Title",
placeholder="e.g. Climate Change Act, Defence Procurement...",
)
instruction_input = gr.Textbox(
label="Custom Instruction (optional)",
placeholder="Leave blank for generic instruction, or enter a specific question/prompt from the debate...",
lines=2,
)
gr.Markdown("### Generation Parameters")
# Sampling knobs; defaults come from DEFAULT_GEN_PARAMS in utils.
temperature = gr.Slider(0.1, 1.5, value=DEFAULT_GEN_PARAMS["temperature"], step=0.05, label="Temperature")
top_p = gr.Slider(0.5, 1.0, value=DEFAULT_GEN_PARAMS["top_p"], step=0.05, label="Top-p (nucleus sampling)")
rep_penalty = gr.Slider(1.0, 2.0, value=DEFAULT_GEN_PARAMS["repetition_penalty"], step=0.05, label="Repetition Penalty")
max_new_toks = gr.Slider(100, 850, value=500, step=50, label="Max New Tokens")
with gr.Row():
min_words = gr.Number(value=DEFAULT_GEN_PARAMS["min_words"], label="Min Words", precision=0)
max_words = gr.Number(value=300, label="Max Words (demo cap)", precision=0)
gen_btn = gr.Button("Generate Speech", variant="primary", size="lg")
# Right column: generated text, run statistics, and the exact prompt.
with gr.Column(scale=2):
gr.Markdown("### Generated Speech")
speech_out = gr.Textbox(label="Output", lines=18, show_copy_button=True, elem_id="speech")
stats_out = gr.Markdown("*Stats will appear here after generation.*")
params_out = gr.Textbox(label="Parameters Used", interactive=False)
with gr.Accordion("Full Prompt Sent to Model", open=False):
prompt_out = gr.Textbox(
label="Prompt (read-only)", lines=14, interactive=False, elem_id="prompt",
)
gr.Markdown(
"---\n"
"The prompt panel shows the **exact input** fed to the model "
"(including chat template tokens) β€” useful for reproducibility."
)
# Party selection drives both the allowed houses and the orientation label.
party_select.change(_update_house, party_select, house_select)
party_select.change(_update_orientation, party_select, orientation_box)
# Main action: input order must match generate_speech's parameter order.
gen_btn.click(
fn=generate_speech,
inputs=[model_select, party_select, topic_select, section_input,
house_select, instruction_input,
temperature, top_p, rep_penalty, max_new_toks, min_words, max_words],
outputs=[speech_out, prompt_out, stats_out, params_out],
)
# Footer links (author profile and datasets).
gr.Markdown(
"---\n"
"<small>ParliaBench Demo Β· NTUA 2025 Β· "
"[argyrotsipi on HF](https://huggingface.co/argyrotsipi) Β· "
"[Train dataset](https://huggingface.co/datasets/argyrotsipi/train-dataset) Β· "
"[Generated dataset](https://huggingface.co/datasets/argyrotsipi/generated-dataset)</small>"
)
# Entry point: launch the Gradio app locally (no public share link).
if __name__ == "__main__":
demo.launch(share=False)