import html import re from typing import Dict, List, Tuple import gradio as gr import torch import torch.nn.functional as F from transformers import AutoModelForTokenClassification, AutoTokenizer from grc_utils import lower_grc, normalize_word, heavy, vowel, only_bases from syllabify import syllabify_joined from preprocess import process_word MODEL_OPTIONS: Dict[str, str] = { "SyllaMoBert (current)": "Ericu950/SyllaMoBert-grc-macronizer-v1", "Macronizer Mini": "Ericu950/macronizer_mini", } DEFAULT_MODEL_LABEL = "SyllaMoBert (current)" DEFAULT_MODEL_ID = MODEL_OPTIONS[DEFAULT_MODEL_LABEL] MAX_LENGTH = 512 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") _MODEL_CACHE: Dict[str, Tuple[AutoTokenizer, AutoModelForTokenClassification, Dict[int, str]]] = {} def _get_model_bundle(model_id: str) -> Tuple[AutoTokenizer, AutoModelForTokenClassification, Dict[int, str]]: if model_id in _MODEL_CACHE: return _MODEL_CACHE[model_id] tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForTokenClassification.from_pretrained(model_id) model.to(device) model.eval() id2label = model.config.id2label _MODEL_CACHE[model_id] = (tokenizer, model, id2label) return _MODEL_CACHE[model_id] def preprocess_greek_line(line: str) -> List[str]: # Normalize accents and keep only Greek-letter word spans. normalized = normalize_word(line) lower = lower_grc(normalized) words = lower.split() token_lists = [process_word(word) for word in words] return [token for tokens in token_lists for token in tokens] def _normalize_label(raw_label: str) -> int: text = raw_label.lower() if "long" in text: return 1 if "short" in text: return 2 if text.endswith("_1") or text == "1": return 1 if text.endswith("_2") or text == "2": return 2 return 0 def preprocess_and_syllabify(line: str): tokens = preprocess_greek_line(line) return syllabify_joined(tokens) def classify_line(line: str, model_id: str): syllables = preprocess_and_syllabify(line) if not syllables: return [] tokenizer, model, id2label = _get_model_bundle(model_id) encoded = tokenizer( syllables, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=MAX_LENGTH, ) word_ids = encoded.word_ids(batch_index=0) if "token_type_ids" in encoded: del encoded["token_type_ids"] model_inputs = {k: v.to(device) for k, v in encoded.items()} with torch.no_grad(): outputs = model(**model_inputs) probs = F.softmax(outputs.logits, dim=-1) predictions = torch.argmax(probs, dim=-1).squeeze(0).cpu().tolist() aligned = [] seen_word_ids = set() for i, word_id in enumerate(word_ids): if word_id is None: continue if word_id in seen_word_ids: continue if word_id >= len(syllables): break seen_word_ids.add(word_id) pred_id = int(predictions[i]) label_name = id2label.get(pred_id, str(pred_id)) normalized = _normalize_label(str(label_name)) aligned.append((syllables[word_id], normalized)) return aligned def _syllable_chip(syllable: str, label_id: int) -> str: escaped = html.escape(syllable) if label_id == 1: return f'{escaped}long' if label_id == 2: return f'{escaped}short' return f'{escaped}' def _mark_syllable_plain(syllable: str, label_id: int) -> str: if label_id not in (1, 2): return syllable marker = "_" if label_id == 1 else "^" chars = list(syllable) for i in range(len(chars) - 1, -1, -1): if vowel(chars[i]): return "".join(chars[: i + 1]) + marker + "".join(chars[i + 1 :]) return syllable + marker def _to_final_sigma(text: str) -> str: # Step 3: in rendered output, only word-final sigmas become final-sigma. def _convert_word(token: str) -> str: if not token.strip(): return token chars = list(token) last_greek_idx = -1 for i, ch in enumerate(chars): if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff": last_greek_idx = i if last_greek_idx != -1 and chars[last_greek_idx] == "σ": chars[last_greek_idx] = "ς" return "".join(chars) return "".join(_convert_word(tok) for tok in re.findall(r"\S+|\s+", text)) def _restore_expanded_word(marked_word: str, reference_word: str) -> str: restored = marked_word.replace("δσ", "ζ").replace("κσ", "ξ").replace("πσ", "ψ") ref_norm = lower_grc(normalize_word(reference_word)) if "ῥ" in ref_norm: rho_idx = restored.find("ρ") if rho_idx != -1: restored = restored[:rho_idx] + "ῥ" + restored[rho_idx + 1 :] return _to_final_sigma(restored) def _consume_word_alignment( aligned: List[Tuple[str, int]], start_idx: int, expected_syllables: List[str], ) -> Tuple[List[Tuple[str, int]], int]: if start_idx >= len(aligned): return [], start_idx expected_bases = only_bases("".join(expected_syllables)) if expected_bases: taken: List[Tuple[str, int]] = [] i = start_idx while i < len(aligned): taken.append(aligned[i]) current_bases = only_bases("".join(s for s, _ in taken)) if current_bases == expected_bases: return taken, i + 1 if len(current_bases) > len(expected_bases) and not current_bases.startswith(expected_bases): break i += 1 fallback_count = len(expected_syllables) if fallback_count <= 0: return [], start_idx end_idx = min(len(aligned), start_idx + fallback_count) return aligned[start_idx:end_idx], end_idx def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -> str: # Step 1: normalize input final sigma to medial sigma for matching only. line_for_matching = line.replace("ς", "σ") parts = re.findall(r"\S+|\s+", line) parts_for_matching = re.findall(r"\S+|\s+", line_for_matching) out_parts: List[str] = [] cursor = 0 for part, part_for_matching in zip(parts, parts_for_matching): if part_for_matching.isspace(): # Step 2: preserve original spacing exactly. out_parts.append(part_for_matching) continue normalized_word = lower_grc(normalize_word(part_for_matching)).replace("ς", "σ") expected_tokens = process_word(normalized_word) expected_syllables = syllabify_joined(expected_tokens) taken, cursor = _consume_word_alignment(aligned, cursor, expected_syllables) if not taken: out_parts.append(part_for_matching) continue marked = "".join(_mark_syllable_plain(syl, label) for syl, label in taken) restored = _restore_expanded_word(marked, part) out_parts.append(restored) if cursor < len(aligned): tail = "".join(_mark_syllable_plain(syl, label) for syl, label in aligned[cursor:]) out_parts.append(_to_final_sigma(tail)) return "".join(out_parts) def render_results(text: str, model_label: str): lines = [line.strip() for line in text.splitlines() if line.strip()] if not lines: return "
Enter one or more Greek lines to classify syllables.
", "" model_id = MODEL_OPTIONS.get(model_label, DEFAULT_MODEL_ID) cards = [] export_lines = [] for idx, line in enumerate(lines, start=1): aligned = classify_line(line, model_id) chips = "".join(_syllable_chip(syl, label) for syl, label in aligned) plain_line = _render_plain_line_with_spacing(line, aligned) cards.append( f"""
Line {idx}
{html.escape(line)}
{chips or '(no syllables found)'}
""" ) export_lines.append(f"Line {idx}: {line}") export_lines.append(f" {plain_line}" if plain_line else " (no syllables found)") html_result = ( "
Long" "Short" "Unmarked
" + "".join(cards) ) export_header = [f"Model: {model_label} ({model_id})", ""] return html_result, "\n".join(export_header + export_lines) examples = [ "νεανίας ἀάατός ἐστιν καὶ καλός. τὰ παῖδες τὰ καλά\nκαλὰ μὲν ἠέξευ, καλὰ δ᾽ ἔτραφες, οὐράνιε Ζεῦ,", "Ἆρες, Ἄρες βροτολοιγὲ μιαιφόνε τειχεσιπλῆτα\nἈτρεΐδαι τε καὶ ἄλλοι ἐϋκνήμιδες Ἀχαιοί", "ἢ τυφλὸς ἤ τις σκνιπὸς ἢ λέγα βλέπων\nψάμμου θαλασσῶν ἢ σκνιπῶν Αἰγυπτίων", ] CSS = """ @import url('https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@500;600;700&family=Space+Grotesk:wght@400;500;700&display=swap'); :root { --bg-start: #0b0b0d; --bg-end: #15151b; --ink: #f0f0f5; --long: #ff7868; --short: #66dbd8; --clear: #a0a0ab; --paper: rgba(22, 22, 28, 0.9); --chip-long-color: var(--long); --chip-short-color: var(--short); --chip-clear-color: #c9c9d3; --source-text: var(--ink); } @media (prefers-color-scheme: dark) { :root { --bg-start: #050506; --bg-end: #101015; --ink: #f3f3f8; --long: #ff7f70; --short: #69e2de; --clear: #b5b5c2; --paper: rgba(16, 16, 22, 0.94); --chip-long-color: #ff9b8d; --chip-short-color: #7cebe7; --chip-clear-color: #d4d4de; --source-text: #fcfcff; } body.dark-mode { --bg-start: #050506; --bg-end: #101015; --ink: #f3f3f8; --long: #ff7f70; --short: #69e2de; --clear: #b5b5c2; --paper: rgba(16, 16, 22, 0.94); --chip-long-color: #ff9b8d; --chip-short-color: #7cebe7; --chip-clear-color: #d4d4de; --source-text: #fcfcff; } } body.dark-mode { --bg-start: #050506; --bg-end: #101015; --ink: #f3f3f8; --long: #ff7f70; --short: #69e2de; --clear: #b5b5c2; --paper: rgba(16, 16, 22, 0.94); --chip-long-color: #ff9b8d; --chip-short-color: #7cebe7; --chip-clear-color: #d4d4de; --source-text: #fcfcff; } html.dark-mode { --bg-start: #050506; --bg-end: #101015; --ink: #f3f3f8; --long: #ff7f70; --short: #69e2de; --clear: #b5b5c2; --paper: rgba(16, 16, 22, 0.94); --chip-long-color: #ff9b8d; --chip-short-color: #7cebe7; --chip-clear-color: #d4d4de; --source-text: #fcfcff; } .gradio-container { font-family: 'Space Grotesk', sans-serif; background: radial-gradient(circle at top left, var(--bg-start), var(--bg-end)); color: var(--ink); transition: background-color 0.3s, color 0.3s; } .dark-mode-toggle { position: fixed; top: 20px; right: 20px; background: var(--paper); border: 2px solid var(--ink); color: var(--ink); padding: 0.6rem 1.2rem; border-radius: 999px; cursor: pointer; font-weight: 600; font-family: 'Space Grotesk', sans-serif; font-size: 0.95rem; z-index: 1000; transition: all 0.3s; } .dark-mode-toggle:hover { transform: scale(1.05); opacity: 0.9; } .title h1 { font-family: 'Cormorant Garamond', serif; font-size: 3rem; letter-spacing: 0.02em; margin-bottom: 0.2rem; } .title p { opacity: 0.82; } .panel { backdrop-filter: blur(8px); background: var(--paper); border: 1px solid rgba(255, 255, 255, 0.16); border-radius: 18px; padding: 0.9rem; } .dark-mode .panel { border-color: rgba(232, 228, 220, 0.22); } .panel label, .panel .gr-markdown, .panel .gradio-markdown, .panel .gr-form label, .panel .gr-form span { color: var(--ink) !important; } .panel textarea, .panel input, .panel .gr-textbox, .panel .gr-textbox textarea, .panel .gr-textbox input, .panel .gr-radio, .panel .gr-radio label, .panel .gr-box, .panel .gr-form { color: var(--ink) !important; } .dark-mode .panel textarea, .dark-mode .panel input, .dark-mode .panel .gr-textbox, .dark-mode .panel .gr-textbox textarea, .dark-mode .panel .gr-textbox input, .dark-mode .panel .gr-radio, .dark-mode .panel .gr-box, .dark-mode .panel .gr-form { background: rgba(10, 10, 14, 0.9) !important; border-color: rgba(232, 228, 220, 0.22) !important; } .dark-mode .panel .gr-button, .dark-mode .panel button { color: #f6f2e8 !important; border-color: rgba(232, 228, 220, 0.28) !important; } .dark-mode .panel .gr-button.gr-button-primary, .dark-mode .panel button.primary { background: #3e74f2 !important; color: #f7f9ff !important; } .legend { display: flex; align-items: center; gap: 0.9rem; font-weight: 600; margin-bottom: 0.8rem; } .dot { display: inline-block; width: 10px; height: 10px; border-radius: 999px; margin-left: 0.7rem; margin-right: 0.25rem; } .dot.long { background: var(--long); } .dot.short { background: var(--short); } .dot.clear { background: var(--clear); } .card { background: rgba(24, 24, 32, 0.84); border-radius: 14px; padding: 0.9rem; margin: 0.8rem 0; border: 1px solid rgba(255, 255, 255, 0.14); animation: rise 420ms ease both; color: var(--ink); } .dark-mode .card { background: rgba(14, 14, 20, 0.9); border: 1px solid rgba(232, 228, 220, 0.15); } .line-number { font-size: 0.8rem; font-weight: 700; text-transform: uppercase; letter-spacing: 0.06em; color: #afb0bc; } .dark-mode .line-number { color: #d3d3df; } .source { font-family: 'Cormorant Garamond', serif; font-size: 1.45rem; margin: 0.25rem 0 0.7rem; color: var(--source-text); } .chips { display: flex; flex-wrap: wrap; gap: 0.45rem; } .chip { display: inline-flex; align-items: baseline; gap: 0.35rem; border-radius: 999px; padding: 0.28rem 0.65rem; font-family: 'Cormorant Garamond', serif; font-size: 1.1rem; border: 1px solid transparent; } .chip small { font-size: 0.75rem; font-family: 'Space Grotesk', sans-serif; text-transform: uppercase; letter-spacing: 0.04em; } .chip.long { color: var(--chip-long-color); background: rgba(186, 58, 41, 0.15); border-color: rgba(186, 58, 41, 0.3); } .chip.long:before { content: ''; } .dark-mode .chip.long { background: rgba(255, 107, 90, 0.2); border-color: rgba(255, 107, 90, 0.4); } .chip.short { color: var(--chip-short-color); background: rgba(31, 111, 109, 0.15); border-color: rgba(31, 111, 109, 0.3); } .dark-mode .chip.short { background: rgba(77, 217, 213, 0.2); border-color: rgba(77, 217, 213, 0.4); } .chip.clear { color: var(--chip-clear-color); background: rgba(116, 108, 95, 0.12); border-color: rgba(116, 108, 95, 0.25); } .dark-mode .chip.clear { color: #c8c0b0; background: rgba(170, 160, 144, 0.15); border-color: rgba(170, 160, 144, 0.3); } .empty { padding: 1rem; border-radius: 12px; background: rgba(255, 255, 255, 0.6); border: 1px dashed rgba(47, 43, 38, 0.2); color: var(--ink); } .dark-mode .empty { background: rgba(40, 35, 28, 0.7); border: 1px dashed rgba(232, 228, 220, 0.15); } @keyframes rise { from { transform: translateY(8px); opacity: 0; } to { transform: translateY(0); opacity: 1; } } @media (max-width: 820px) { .title h1 { font-size: 2.2rem; } .source { font-size: 1.25rem; } .dark-mode-toggle { position: relative; top: auto; right: auto; margin-bottom: 1rem; } } """ with gr.Blocks() as demo: gr.HTML(""" """) gr.Markdown( """

Ancient Greek Macronizer

Syllable-level long/short classification with a modern, readable presentation.

""" ) with gr.Column(): with gr.Column(elem_classes=["panel"]): model_choice = gr.Radio( label="Model", choices=list(MODEL_OPTIONS.keys()), value=DEFAULT_MODEL_LABEL, ) text_input = gr.Textbox( label="Greek Lines", lines=8, placeholder="Paste one or multiple lines; each line is processed separately.", ) with gr.Row(): classify_btn = gr.Button("Classify", variant="primary") clear_btn = gr.Button("Clear") gr.Examples(examples=examples, inputs=text_input, label="Try examples") with gr.Column(elem_classes=["panel"]): html_output = gr.HTML(label="Styled Results") text_output = gr.Textbox(label="Plain Output", lines=12) classify_btn.click(render_results, inputs=[text_input, model_choice], outputs=[html_output, text_output]) clear_btn.click(lambda: ("", "", ""), outputs=[text_input, html_output, text_output]) if __name__ == "__main__": demo.launch(css=CSS)