# macronizer / app.py
# al1808th's picture
# Switch UI palette from brown to black theme
# aaf8c40
import html
import re
from typing import Dict, List, Tuple
import gradio as gr
import torch
import torch.nn.functional as F
from transformers import AutoModelForTokenClassification, AutoTokenizer
from grc_utils import lower_grc, normalize_word, heavy, vowel, only_bases
from syllabify import syllabify_joined
from preprocess import process_word
# UI label -> model id. The labels are the choices of the "Model" radio group
# and the ids are what gets passed to `from_pretrained`.
MODEL_OPTIONS: Dict[str, str] = {
"SyllaMoBert (current)": "Ericu950/SyllaMoBert-grc-macronizer-v1",
"Macronizer Mini": "Ericu950/macronizer_mini",
}
# Preselected radio value, and the id used when a label is not in MODEL_OPTIONS.
DEFAULT_MODEL_LABEL = "SyllaMoBert (current)"
DEFAULT_MODEL_ID = MODEL_OPTIONS[DEFAULT_MODEL_LABEL]
# Passed to the tokenizer as `max_length` together with `truncation=True`.
MAX_LENGTH = 512
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Module-level cache of loaded (tokenizer, model, id2label) bundles, keyed by model id.
_MODEL_CACHE: Dict[str, Tuple[AutoTokenizer, AutoModelForTokenClassification, Dict[int, str]]] = {}
def _get_model_bundle(model_id: str) -> Tuple[AutoTokenizer, AutoModelForTokenClassification, Dict[int, str]]:
    """Return the ``(tokenizer, model, id2label)`` bundle for *model_id*.

    A bundle already present in ``_MODEL_CACHE`` is returned as is. Otherwise
    the tokenizer and token-classification model are loaded with
    ``from_pretrained``, the model is moved to ``device`` and switched to
    eval mode, and the new bundle is stored in the cache before returning.
    """
    try:
        return _MODEL_CACHE[model_id]
    except KeyError:
        pass  # first request for this id: load it below

    loaded_tokenizer = AutoTokenizer.from_pretrained(model_id)
    loaded_model = AutoModelForTokenClassification.from_pretrained(model_id)
    loaded_model.to(device)
    loaded_model.eval()

    bundle = (loaded_tokenizer, loaded_model, loaded_model.config.id2label)
    _MODEL_CACHE[model_id] = bundle
    return bundle
def preprocess_greek_line(line: str) -> List[str]:
    """Turn one input line into a flat list of tokens.

    The line goes through ``normalize_word`` and ``lower_grc``, is split on
    whitespace, and each word is handed to ``process_word``; the per-word
    token lists are concatenated in word order.
    """
    flat_tokens: List[str] = []
    for word in lower_grc(normalize_word(line)).split():
        flat_tokens.extend(process_word(word))
    return flat_tokens
def _normalize_label(raw_label: str) -> int:
text = raw_label.lower()
if "long" in text:
return 1
if "short" in text:
return 2
if text.endswith("_1") or text == "1":
return 1
if text.endswith("_2") or text == "2":
return 2
return 0
def preprocess_and_syllabify(line: str):
    """Tokenize *line* with ``preprocess_greek_line`` and return ``syllabify_joined`` of the tokens."""
    return syllabify_joined(preprocess_greek_line(line))
def classify_line(line: str, model_id: str):
    """Predict a quantity code for every syllable of *line*.

    Args:
        line: One line of input text.
        model_id: Model id handed to ``_get_model_bundle``.

    Returns:
        A list of ``(syllable, code)`` tuples in syllable order, where the
        code comes from ``_normalize_label`` (1, 2 or 0). The list is empty
        when the line yields no syllables. Syllables cut off by tokenizer
        truncation at ``MAX_LENGTH`` do not appear in the result.
    """
    syllables = preprocess_and_syllabify(line)
    if not syllables:
        return []

    tokenizer, model, id2label = _get_model_bundle(model_id)
    batch = tokenizer(
        syllables,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # Token -> syllable index map; taken from the encoding object before it
    # is turned into a plain dict of tensors.
    token_to_syllable = batch.word_ids(batch_index=0)
    # NOTE(review): presumably the model's forward() does not take
    # token_type_ids - confirm.
    if "token_type_ids" in batch:
        del batch["token_type_ids"]
    inputs_on_device = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        logits = model(**inputs_on_device).logits
    label_ids = torch.argmax(F.softmax(logits, dim=-1), dim=-1).squeeze(0).cpu().tolist()

    syllable_count = len(syllables)
    handled = set()
    pairs = []
    for position, syllable_idx in enumerate(token_to_syllable):
        # Skip tokens with no syllable (index None) and every token after the
        # first one of a syllable.
        if syllable_idx is None or syllable_idx in handled:
            continue
        if syllable_idx >= syllable_count:
            break
        handled.add(syllable_idx)
        predicted = int(label_ids[position])
        label_name = id2label.get(predicted, str(predicted))
        pairs.append((syllables[syllable_idx], _normalize_label(str(label_name))))
    return pairs
def _syllable_chip(syllable: str, label_id: int) -> str:
escaped = html.escape(syllable)
if label_id == 1:
return f'<span class="chip long">{escaped}<small>long</small></span>'
if label_id == 2:
return f'<span class="chip short">{escaped}<small>short</small></span>'
return f'<span class="chip clear">{escaped}</span>'
def _mark_syllable_plain(syllable: str, label_id: int) -> str:
if label_id not in (1, 2):
return syllable
marker = "_" if label_id == 1 else "^"
chars = list(syllable)
for i in range(len(chars) - 1, -1, -1):
if vowel(chars[i]):
return "".join(chars[: i + 1]) + marker + "".join(chars[i + 1 :])
return syllable + marker
def _to_final_sigma(text: str) -> str:
# Step 3: in rendered output, only word-final sigmas become final-sigma.
def _convert_word(token: str) -> str:
if not token.strip():
return token
chars = list(token)
last_greek_idx = -1
for i, ch in enumerate(chars):
if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
last_greek_idx = i
if last_greek_idx != -1 and chars[last_greek_idx] == "σ":
chars[last_greek_idx] = "ς"
return "".join(chars)
return "".join(_convert_word(tok) for tok in re.findall(r"\S+|\s+", text))
def _restore_expanded_word(marked_word: str, reference_word: str) -> str:
    """Undo the letter expansions in a marked word and fix its final sigma.

    The sequences "δσ", "κσ" and "πσ" are collapsed to "ζ", "ξ" and "ψ" (in
    that order). If the normalized, lower-cased *reference_word* contains
    "ῥ", the first "ρ" of the result is replaced by "ῥ". The outcome is then
    passed through ``_to_final_sigma``.
    """
    collapsed = marked_word
    for expanded, letter in (("δσ", "ζ"), ("κσ", "ξ"), ("πσ", "ψ")):
        collapsed = collapsed.replace(expanded, letter)

    if "ῥ" in lower_grc(normalize_word(reference_word)):
        head, rho, rest = collapsed.partition("ρ")
        if rho:  # empty when the word holds no plain rho at all
            collapsed = head + "ῥ" + rest

    return _to_final_sigma(collapsed)
def _consume_word_alignment(
aligned: List[Tuple[str, int]],
start_idx: int,
expected_syllables: List[str],
) -> Tuple[List[Tuple[str, int]], int]:
if start_idx >= len(aligned):
return [], start_idx
expected_bases = only_bases("".join(expected_syllables))
if expected_bases:
taken: List[Tuple[str, int]] = []
i = start_idx
while i < len(aligned):
taken.append(aligned[i])
current_bases = only_bases("".join(s for s, _ in taken))
if current_bases == expected_bases:
return taken, i + 1
if len(current_bases) > len(expected_bases) and not current_bases.startswith(expected_bases):
break
i += 1
fallback_count = len(expected_syllables)
if fallback_count <= 0:
return [], start_idx
end_idx = min(len(aligned), start_idx + fallback_count)
return aligned[start_idx:end_idx], end_idx
def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -> str:
# Step 1: normalize input final sigma to medial sigma for matching only.
line_for_matching = line.replace("ς", "σ")
parts = re.findall(r"\S+|\s+", line)
parts_for_matching = re.findall(r"\S+|\s+", line_for_matching)
out_parts: List[str] = []
cursor = 0
for part, part_for_matching in zip(parts, parts_for_matching):
if part_for_matching.isspace():
# Step 2: preserve original spacing exactly.
out_parts.append(part_for_matching)
continue
normalized_word = lower_grc(normalize_word(part_for_matching)).replace("ς", "σ")
expected_tokens = process_word(normalized_word)
expected_syllables = syllabify_joined(expected_tokens)
taken, cursor = _consume_word_alignment(aligned, cursor, expected_syllables)
if not taken:
out_parts.append(part_for_matching)
continue
marked = "".join(_mark_syllable_plain(syl, label) for syl, label in taken)
restored = _restore_expanded_word(marked, part)
out_parts.append(restored)
if cursor < len(aligned):
tail = "".join(_mark_syllable_plain(syl, label) for syl, label in aligned[cursor:])
out_parts.append(_to_final_sigma(tail))
return "".join(out_parts)
def render_results(text: str, model_label: str):
    """Classify every non-blank line of *text* and build both output views.

    Args:
        text: Raw textbox content; it is split into lines, each line is
            stripped, and blank lines are dropped.
        model_label: Key into ``MODEL_OPTIONS``; an unknown label falls back
            to ``DEFAULT_MODEL_ID``.

    Returns:
        ``(html_result, plain_text)``. The HTML holds a legend followed by
        one card per line; the plain text holds a model header followed by
        each source line and its marked-up rendering. With no usable lines,
        a placeholder ``<div class='empty'>`` and an empty string are returned.
    """
    lines = [stripped for stripped in map(str.strip, text.splitlines()) if stripped]
    if not lines:
        return "<div class='empty'>Enter one or more Greek lines to classify syllables.</div>", ""

    model_id = MODEL_OPTIONS.get(model_label, DEFAULT_MODEL_ID)
    cards = []
    export_lines = [f"Model: {model_label} ({model_id})", ""]

    for idx, line in enumerate(lines, start=1):
        aligned = classify_line(line, model_id)
        chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
        plain_line = _render_plain_line_with_spacing(line, aligned)

        chip_markup = chips or '<span class="chip clear">(no syllables found)</span>'
        # The leading and trailing empty items give the card its surrounding
        # newlines.
        cards.append(
            "\n".join(
                [
                    "",
                    '<section class="card">',
                    f'<div class="line-number">Line {idx}</div>',
                    f'<div class="source">{html.escape(line)}</div>',
                    f'<div class="chips">{chip_markup}</div>',
                    "</section>",
                    "",
                ]
            )
        )

        export_lines.append(f"Line {idx}: {line}")
        if plain_line:
            export_lines.append(f" {plain_line}")
        else:
            export_lines.append(" (no syllables found)")

    legend = (
        "<div class='legend'><span class='dot long'></span>Long"
        "<span class='dot short'></span>Short"
        "<span class='dot clear'></span>Unmarked</div>"
    )
    return legend + "".join(cards), "\n".join(export_lines)
# Sample inputs passed to `gr.Examples` (shown under "Try examples"). Each
# entry holds two lines joined by "\n"; `render_results` splits the text and
# handles every line separately.
examples = [
"νεανίας ἀάατός ἐστιν καὶ καλός. τὰ παῖδες τὰ καλά\nκαλὰ μὲν ἠέξευ, καλὰ δ᾽ ἔτραφες, οὐράνιε Ζεῦ,",
"Ἆρες, Ἄρες βροτολοιγὲ μιαιφόνε τειχεσιπλῆτα\nἈτρεΐδαι τε καὶ ἄλλοι ἐϋκνήμιδες Ἀχαιοί",
"ἢ τυφλὸς ἤ τις σκνιπὸς ἢ λέγα βλέπων\nψάμμου θαλασσῶν ἢ σκνιπῶν Αἰγυπτίων",
]
# Stylesheet handed to `demo.launch(css=...)`.
#
# Theme colours are CSS custom properties declared on `:root`; the same set is
# re-declared under the `prefers-color-scheme: dark` media query and under the
# `.dark-mode` class that the toggle button puts on <body> and <html>.
#
# The base `.empty` placeholder uses a dark translucent surface (the same one
# as `.card`). It used to keep a 60 %-white background, which left the
# near-white `--ink` text with too little contrast on the black palette.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@500;600;700&family=Space+Grotesk:wght@400;500;700&display=swap');
:root {
--bg-start: #0b0b0d;
--bg-end: #15151b;
--ink: #f0f0f5;
--long: #ff7868;
--short: #66dbd8;
--clear: #a0a0ab;
--paper: rgba(22, 22, 28, 0.9);
--chip-long-color: var(--long);
--chip-short-color: var(--short);
--chip-clear-color: #c9c9d3;
--source-text: var(--ink);
}
@media (prefers-color-scheme: dark) {
:root {
--bg-start: #050506;
--bg-end: #101015;
--ink: #f3f3f8;
--long: #ff7f70;
--short: #69e2de;
--clear: #b5b5c2;
--paper: rgba(16, 16, 22, 0.94);
--chip-long-color: #ff9b8d;
--chip-short-color: #7cebe7;
--chip-clear-color: #d4d4de;
--source-text: #fcfcff;
}
body.dark-mode {
--bg-start: #050506;
--bg-end: #101015;
--ink: #f3f3f8;
--long: #ff7f70;
--short: #69e2de;
--clear: #b5b5c2;
--paper: rgba(16, 16, 22, 0.94);
--chip-long-color: #ff9b8d;
--chip-short-color: #7cebe7;
--chip-clear-color: #d4d4de;
--source-text: #fcfcff;
}
}
body.dark-mode {
--bg-start: #050506;
--bg-end: #101015;
--ink: #f3f3f8;
--long: #ff7f70;
--short: #69e2de;
--clear: #b5b5c2;
--paper: rgba(16, 16, 22, 0.94);
--chip-long-color: #ff9b8d;
--chip-short-color: #7cebe7;
--chip-clear-color: #d4d4de;
--source-text: #fcfcff;
}
html.dark-mode {
--bg-start: #050506;
--bg-end: #101015;
--ink: #f3f3f8;
--long: #ff7f70;
--short: #69e2de;
--clear: #b5b5c2;
--paper: rgba(16, 16, 22, 0.94);
--chip-long-color: #ff9b8d;
--chip-short-color: #7cebe7;
--chip-clear-color: #d4d4de;
--source-text: #fcfcff;
}
.gradio-container {
font-family: 'Space Grotesk', sans-serif;
background: radial-gradient(circle at top left, var(--bg-start), var(--bg-end));
color: var(--ink);
transition: background-color 0.3s, color 0.3s;
}
.dark-mode-toggle {
position: fixed;
top: 20px;
right: 20px;
background: var(--paper);
border: 2px solid var(--ink);
color: var(--ink);
padding: 0.6rem 1.2rem;
border-radius: 999px;
cursor: pointer;
font-weight: 600;
font-family: 'Space Grotesk', sans-serif;
font-size: 0.95rem;
z-index: 1000;
transition: all 0.3s;
}
.dark-mode-toggle:hover {
transform: scale(1.05);
opacity: 0.9;
}
.title h1 {
font-family: 'Cormorant Garamond', serif;
font-size: 3rem;
letter-spacing: 0.02em;
margin-bottom: 0.2rem;
}
.title p {
opacity: 0.82;
}
.panel {
backdrop-filter: blur(8px);
background: var(--paper);
border: 1px solid rgba(255, 255, 255, 0.16);
border-radius: 18px;
padding: 0.9rem;
}
.dark-mode .panel {
border-color: rgba(232, 228, 220, 0.22);
}
.panel label,
.panel .gr-markdown,
.panel .gradio-markdown,
.panel .gr-form label,
.panel .gr-form span {
color: var(--ink) !important;
}
.panel textarea,
.panel input,
.panel .gr-textbox,
.panel .gr-textbox textarea,
.panel .gr-textbox input,
.panel .gr-radio,
.panel .gr-radio label,
.panel .gr-box,
.panel .gr-form {
color: var(--ink) !important;
}
.dark-mode .panel textarea,
.dark-mode .panel input,
.dark-mode .panel .gr-textbox,
.dark-mode .panel .gr-textbox textarea,
.dark-mode .panel .gr-textbox input,
.dark-mode .panel .gr-radio,
.dark-mode .panel .gr-box,
.dark-mode .panel .gr-form {
background: rgba(10, 10, 14, 0.9) !important;
border-color: rgba(232, 228, 220, 0.22) !important;
}
.dark-mode .panel .gr-button,
.dark-mode .panel button {
color: #f6f2e8 !important;
border-color: rgba(232, 228, 220, 0.28) !important;
}
.dark-mode .panel .gr-button.gr-button-primary,
.dark-mode .panel button.primary {
background: #3e74f2 !important;
color: #f7f9ff !important;
}
.legend {
display: flex;
align-items: center;
gap: 0.9rem;
font-weight: 600;
margin-bottom: 0.8rem;
}
.dot {
display: inline-block;
width: 10px;
height: 10px;
border-radius: 999px;
margin-left: 0.7rem;
margin-right: 0.25rem;
}
.dot.long { background: var(--long); }
.dot.short { background: var(--short); }
.dot.clear { background: var(--clear); }
.card {
background: rgba(24, 24, 32, 0.84);
border-radius: 14px;
padding: 0.9rem;
margin: 0.8rem 0;
border: 1px solid rgba(255, 255, 255, 0.14);
animation: rise 420ms ease both;
color: var(--ink);
}
.dark-mode .card {
background: rgba(14, 14, 20, 0.9);
border: 1px solid rgba(232, 228, 220, 0.15);
}
.line-number {
font-size: 0.8rem;
font-weight: 700;
text-transform: uppercase;
letter-spacing: 0.06em;
color: #afb0bc;
}
.dark-mode .line-number {
color: #d3d3df;
}
.source {
font-family: 'Cormorant Garamond', serif;
font-size: 1.45rem;
margin: 0.25rem 0 0.7rem;
color: var(--source-text);
}
.chips {
display: flex;
flex-wrap: wrap;
gap: 0.45rem;
}
.chip {
display: inline-flex;
align-items: baseline;
gap: 0.35rem;
border-radius: 999px;
padding: 0.28rem 0.65rem;
font-family: 'Cormorant Garamond', serif;
font-size: 1.1rem;
border: 1px solid transparent;
}
.chip small {
font-size: 0.75rem;
font-family: 'Space Grotesk', sans-serif;
text-transform: uppercase;
letter-spacing: 0.04em;
}
.chip.long {
color: var(--chip-long-color);
background: rgba(186, 58, 41, 0.15);
border-color: rgba(186, 58, 41, 0.3);
}
.chip.long:before {
content: '';
}
.dark-mode .chip.long {
background: rgba(255, 107, 90, 0.2);
border-color: rgba(255, 107, 90, 0.4);
}
.chip.short {
color: var(--chip-short-color);
background: rgba(31, 111, 109, 0.15);
border-color: rgba(31, 111, 109, 0.3);
}
.dark-mode .chip.short {
background: rgba(77, 217, 213, 0.2);
border-color: rgba(77, 217, 213, 0.4);
}
.chip.clear {
color: var(--chip-clear-color);
background: rgba(116, 108, 95, 0.12);
border-color: rgba(116, 108, 95, 0.25);
}
.dark-mode .chip.clear {
color: #c8c0b0;
background: rgba(170, 160, 144, 0.15);
border-color: rgba(170, 160, 144, 0.3);
}
.empty {
padding: 1rem;
border-radius: 12px;
background: rgba(24, 24, 32, 0.84);
border: 1px dashed rgba(255, 255, 255, 0.2);
color: var(--ink);
}
.dark-mode .empty {
background: rgba(40, 35, 28, 0.7);
border: 1px dashed rgba(232, 228, 220, 0.15);
}
@keyframes rise {
from { transform: translateY(8px); opacity: 0; }
to { transform: translateY(0); opacity: 1; }
}
@media (max-width: 820px) {
.title h1 { font-size: 2.2rem; }
.source { font-size: 1.25rem; }
.dark-mode-toggle {
position: relative;
top: auto;
right: auto;
margin-bottom: 1rem;
}
}
"""
# Markup for the theme toggle: a script that applies the stored or system
# colour-scheme preference by adding/removing the `dark-mode` class on <body>
# and <html>, plus the button whose inline handler flips that class and
# stores the choice in localStorage under "darkMode".
# NOTE(review): confirm that the installed Gradio version executes <script>
# tags placed in a gr.HTML value; if it does not, only the button's inline
# onclick handler is active and the on-load preference logic never runs.
_THEME_TOGGLE_MARKUP = """
<script>
// Detect system dark mode preference and apply on load
function applyDarkModePreference() {
const darkModeToggle = document.getElementById('dark-mode-toggle');
const isDarkMode = localStorage.getItem('darkMode') === 'true' ||
(!localStorage.getItem('darkMode') && window.matchMedia('(prefers-color-scheme: dark)').matches);
if (isDarkMode) {
document.body.classList.add('dark-mode');
document.documentElement.classList.add('dark-mode');
if (darkModeToggle) darkModeToggle.textContent = '☀️ Light Mode';
} else {
document.body.classList.remove('dark-mode');
document.documentElement.classList.remove('dark-mode');
if (darkModeToggle) darkModeToggle.textContent = '🌙 Dark Mode';
}
}
// Apply preference on page load
window.addEventListener('load', applyDarkModePreference);
setTimeout(applyDarkModePreference, 100);
// Listen for system dark mode changes
window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', (e) => {
if (!localStorage.getItem('darkMode')) {
if (e.matches) {
document.body.classList.add('dark-mode');
document.documentElement.classList.add('dark-mode');
document.getElementById('dark-mode-toggle').textContent = '☀️ Light Mode';
} else {
document.body.classList.remove('dark-mode');
document.documentElement.classList.remove('dark-mode');
document.getElementById('dark-mode-toggle').textContent = '🌙 Dark Mode';
}
}
});
</script>
<button id="dark-mode-toggle" class="dark-mode-toggle" onclick="
document.body.classList.toggle('dark-mode');
document.documentElement.classList.toggle('dark-mode');
const isDark = document.body.classList.contains('dark-mode');
localStorage.setItem('darkMode', isDark);
document.getElementById('dark-mode-toggle').textContent = isDark ? '☀️ Light Mode' : '🌙 Dark Mode';
">🌙 Dark Mode</button>
"""

# Page heading rendered through gr.Markdown.
_TITLE_MARKUP = """
<div class="title">
<h1>Ancient Greek Macronizer</h1>
<p>Syllable-level long/short classification with a modern, readable presentation.</p>
</div>
"""

# NOTE(review): the nesting below was reconstructed from statement order
# because the original indentation was not available - confirm the placement
# of the Examples widget and of the second panel.
with gr.Blocks() as demo:
    gr.HTML(_THEME_TOGGLE_MARKUP)
    gr.Markdown(_TITLE_MARKUP)

    with gr.Column():
        # Input panel: model selector, text box, action buttons, examples.
        with gr.Column(elem_classes=["panel"]):
            model_choice = gr.Radio(
                label="Model",
                choices=list(MODEL_OPTIONS.keys()),
                value=DEFAULT_MODEL_LABEL,
            )
            text_input = gr.Textbox(
                label="Greek Lines",
                lines=8,
                placeholder="Paste one or multiple lines; each line is processed separately.",
            )
            with gr.Row():
                classify_btn = gr.Button("Classify", variant="primary")
                clear_btn = gr.Button("Clear")
            gr.Examples(examples=examples, inputs=text_input, label="Try examples")

        # Output panel: styled chips plus the plain-text export.
        with gr.Column(elem_classes=["panel"]):
            html_output = gr.HTML(label="Styled Results")
            text_output = gr.Textbox(label="Plain Output", lines=12)

    # "Classify" fills both outputs; "Clear" blanks the input and both outputs.
    classify_btn.click(
        render_results,
        inputs=[text_input, model_choice],
        outputs=[html_output, text_output],
    )
    clear_btn.click(lambda: ("", "", ""), outputs=[text_input, html_output, text_output])

if __name__ == "__main__":
    # The stylesheet is supplied at launch time.
    demo.launch(css=CSS)