Spaces:
Running
Running
| import html | |
| import re | |
| from typing import Dict, List, Tuple | |
| import gradio as gr | |
| import torch | |
| import torch.nn.functional as F | |
| from transformers import AutoModelForTokenClassification, AutoTokenizer | |
| from grc_utils import lower_grc, normalize_word, heavy, vowel, only_bases | |
| from syllabify import syllabify_joined | |
| from preprocess import process_word | |
# Display label shown in the UI -> Hugging Face Hub model id.
MODEL_OPTIONS: Dict[str, str] = {
    "SyllaMoBert (current)": "Ericu950/SyllaMoBert-grc-macronizer-v1",
    "Macronizer Mini": "Ericu950/macronizer_mini",
}
# Label preselected in the UI, and the model id used when an unknown label arrives.
DEFAULT_MODEL_LABEL = "SyllaMoBert (current)"
DEFAULT_MODEL_ID = MODEL_OPTIONS[DEFAULT_MODEL_LABEL]
# Tokenized inputs longer than this are truncated by the tokenizer call.
MAX_LENGTH = 512
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Process-wide cache: model id -> (tokenizer, model, id2label mapping).
_MODEL_CACHE: Dict[str, Tuple[AutoTokenizer, AutoModelForTokenClassification, Dict[int, str]]] = {}
def _get_model_bundle(model_id: str) -> Tuple[AutoTokenizer, AutoModelForTokenClassification, Dict[int, str]]:
    """Return the (tokenizer, model, id2label) triple for ``model_id``.

    The first request for a model id loads the tokenizer and the token
    classification model, moves the model to ``device`` and switches it to
    eval mode. The triple is then stored in ``_MODEL_CACHE`` so later
    requests reuse the same objects.
    """
    bundle = _MODEL_CACHE.get(model_id)
    if bundle is None:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        classifier = AutoModelForTokenClassification.from_pretrained(model_id)
        classifier.to(device)
        classifier.eval()
        bundle = (tokenizer, classifier, classifier.config.id2label)
        _MODEL_CACHE[model_id] = bundle
    return bundle
def preprocess_greek_line(line: str) -> List[str]:
    """Turn one raw line into a flat list of word-level tokens.

    The line is passed through ``normalize_word`` and ``lower_grc``, split on
    whitespace, and each word is handed to ``process_word``. The per-word
    token lists are concatenated in order.
    """
    # NOTE(review): what process_word keeps or drops (e.g. punctuation) is
    # defined in the preprocess module, not here -- confirm there.
    lowered = lower_grc(normalize_word(line))
    flat_tokens: List[str] = []
    for word in lowered.split():
        flat_tokens.extend(process_word(word))
    return flat_tokens
| def _normalize_label(raw_label: str) -> int: | |
| text = raw_label.lower() | |
| if "long" in text: | |
| return 1 | |
| if "short" in text: | |
| return 2 | |
| if text.endswith("_1") or text == "1": | |
| return 1 | |
| if text.endswith("_2") or text == "2": | |
| return 2 | |
| return 0 | |
def preprocess_and_syllabify(line: str):
    """Preprocess ``line`` into tokens and return ``syllabify_joined`` of them."""
    return syllabify_joined(preprocess_greek_line(line))
def classify_line(line: str, model_id: str):
    """Predict a quantity label for each syllable of one line.

    Args:
        line: Raw text of a single line.
        model_id: Hub id of the token-classification model to use.

    Returns:
        A list of ``(syllable, code)`` tuples in input order, where ``code``
        comes from ``_normalize_label`` (1 long, 2 short, 0 otherwise). The
        list is empty when the line yields no syllables. Syllables that fall
        beyond the ``MAX_LENGTH`` truncation point get no entry.
    """
    syllables = preprocess_and_syllabify(line)
    if not syllables:
        return []

    tokenizer, model, id2label = _get_model_bundle(model_id)
    encoded = tokenizer(
        syllables,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # Maps every sub-token position to the index of the syllable it came
    # from (None for special tokens).
    word_ids = encoded.word_ids(batch_index=0)

    # token_type_ids is never forwarded to the model.
    # NOTE(review): presumably the model's forward() rejects it -- confirm.
    model_inputs = {
        key: tensor.to(device)
        for key, tensor in encoded.items()
        if key != "token_type_ids"
    }

    with torch.no_grad():
        logits = model(**model_inputs).logits
    # Softmax is monotonic, so the arg-max class can be read straight from
    # the logits; no probabilities are needed here.
    predictions = logits.argmax(dim=-1).squeeze(0).cpu().tolist()

    aligned = []
    seen_word_ids = set()
    for position, word_id in enumerate(word_ids):
        if word_id is None:
            continue
        # Only the first sub-token of each syllable decides its label.
        if word_id in seen_word_ids:
            continue
        # Defensive guard: never index past the syllable list.
        if word_id >= len(syllables):
            break
        seen_word_ids.add(word_id)
        pred_id = int(predictions[position])
        label_name = id2label.get(pred_id, str(pred_id))
        aligned.append((syllables[word_id], _normalize_label(str(label_name))))
    return aligned
| def _syllable_chip(syllable: str, label_id: int) -> str: | |
| escaped = html.escape(syllable) | |
| if label_id == 1: | |
| return f'<span class="chip long">{escaped}<small>long</small></span>' | |
| if label_id == 2: | |
| return f'<span class="chip short">{escaped}<small>short</small></span>' | |
| return f'<span class="chip clear">{escaped}</span>' | |
| def _mark_syllable_plain(syllable: str, label_id: int) -> str: | |
| if label_id not in (1, 2): | |
| return syllable | |
| marker = "_" if label_id == 1 else "^" | |
| chars = list(syllable) | |
| for i in range(len(chars) - 1, -1, -1): | |
| if vowel(chars[i]): | |
| return "".join(chars[: i + 1]) + marker + "".join(chars[i + 1 :]) | |
| return syllable + marker | |
| def _to_final_sigma(text: str) -> str: | |
| # Step 3: in rendered output, only word-final sigmas become final-sigma. | |
| def _convert_word(token: str) -> str: | |
| if not token.strip(): | |
| return token | |
| chars = list(token) | |
| last_greek_idx = -1 | |
| for i, ch in enumerate(chars): | |
| if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff": | |
| last_greek_idx = i | |
| if last_greek_idx != -1 and chars[last_greek_idx] == "σ": | |
| chars[last_greek_idx] = "ς" | |
| return "".join(chars) | |
| return "".join(_convert_word(tok) for tok in re.findall(r"\S+|\s+", text)) | |
def _restore_expanded_word(marked_word: str, reference_word: str) -> str:
    """Convert a marked, expanded word back to conventional spelling.

    Every "δσ", "κσ" and "πσ" in ``marked_word`` is collapsed to "ζ", "ξ"
    and "ψ" respectively. If the normalized, lower-cased ``reference_word``
    contains "ῥ", the first "ρ" of the result is replaced by "ῥ". Finally
    word-final sigma is restored via ``_to_final_sigma``.
    """
    # NOTE(review): the collapse is unconditional, so a genuine "κσ" cluster
    # would also become "ξ" -- verify against what process_word emits.
    collapsed = marked_word
    for cluster, letter in (("δσ", "ζ"), ("κσ", "ξ"), ("πσ", "ψ")):
        collapsed = collapsed.replace(cluster, letter)

    reference = lower_grc(normalize_word(reference_word))
    first_rho = collapsed.find("ρ") if "ῥ" in reference else -1
    if first_rho != -1:
        collapsed = collapsed[:first_rho] + "ῥ" + collapsed[first_rho + 1 :]
    return _to_final_sigma(collapsed)
| def _consume_word_alignment( | |
| aligned: List[Tuple[str, int]], | |
| start_idx: int, | |
| expected_syllables: List[str], | |
| ) -> Tuple[List[Tuple[str, int]], int]: | |
| if start_idx >= len(aligned): | |
| return [], start_idx | |
| expected_bases = only_bases("".join(expected_syllables)) | |
| if expected_bases: | |
| taken: List[Tuple[str, int]] = [] | |
| i = start_idx | |
| while i < len(aligned): | |
| taken.append(aligned[i]) | |
| current_bases = only_bases("".join(s for s, _ in taken)) | |
| if current_bases == expected_bases: | |
| return taken, i + 1 | |
| if len(current_bases) > len(expected_bases) and not current_bases.startswith(expected_bases): | |
| break | |
| i += 1 | |
| fallback_count = len(expected_syllables) | |
| if fallback_count <= 0: | |
| return [], start_idx | |
| end_idx = min(len(aligned), start_idx + fallback_count) | |
| return aligned[start_idx:end_idx], end_idx | |
| def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -> str: | |
| # Step 1: normalize input final sigma to medial sigma for matching only. | |
| line_for_matching = line.replace("ς", "σ") | |
| parts = re.findall(r"\S+|\s+", line) | |
| parts_for_matching = re.findall(r"\S+|\s+", line_for_matching) | |
| out_parts: List[str] = [] | |
| cursor = 0 | |
| for part, part_for_matching in zip(parts, parts_for_matching): | |
| if part_for_matching.isspace(): | |
| # Step 2: preserve original spacing exactly. | |
| out_parts.append(part_for_matching) | |
| continue | |
| normalized_word = lower_grc(normalize_word(part_for_matching)).replace("ς", "σ") | |
| expected_tokens = process_word(normalized_word) | |
| expected_syllables = syllabify_joined(expected_tokens) | |
| taken, cursor = _consume_word_alignment(aligned, cursor, expected_syllables) | |
| if not taken: | |
| out_parts.append(part_for_matching) | |
| continue | |
| marked = "".join(_mark_syllable_plain(syl, label) for syl, label in taken) | |
| restored = _restore_expanded_word(marked, part) | |
| out_parts.append(restored) | |
| if cursor < len(aligned): | |
| tail = "".join(_mark_syllable_plain(syl, label) for syl, label in aligned[cursor:]) | |
| out_parts.append(_to_final_sigma(tail)) | |
| return "".join(out_parts) | |
def render_results(text: str, model_label: str):
    """Classify every non-blank line of ``text`` and build both outputs.

    Args:
        text: Multi-line user input; blank lines are ignored and the rest
            are stripped.
        model_label: UI label of the chosen model. An unknown label falls
            back to ``DEFAULT_MODEL_ID``.

    Returns:
        ``(html_result, plain_export)``: an HTML fragment with a legend and
        one card per line, and a plain-text export with a model header and
        two entries per line (the source and its marked rendering).
    """
    stripped_lines = (raw.strip() for raw in text.splitlines())
    lines = [entry for entry in stripped_lines if entry]
    if not lines:
        return "<div class='empty'>Enter one or more Greek lines to classify syllables.</div>", ""

    model_id = MODEL_OPTIONS.get(model_label, DEFAULT_MODEL_ID)
    cards = []
    export_lines = []
    for idx, line in enumerate(lines, start=1):
        aligned = classify_line(line, model_id)
        chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
        plain_line = _render_plain_line_with_spacing(line, aligned)
        card = f"""
            <section class="card">
                <div class="line-number">Line {idx}</div>
                <div class="source">{html.escape(line)}</div>
                <div class="chips">{chips or '<span class="chip clear">(no syllables found)</span>'}</div>
            </section>
            """
        cards.append(card)
        export_lines.extend(
            [
                f"Line {idx}: {line}",
                f" {plain_line}" if plain_line else " (no syllables found)",
            ]
        )

    legend = (
        "<div class='legend'><span class='dot long'></span>Long"
        "<span class='dot short'></span>Short"
        "<span class='dot clear'></span>Unmarked</div>"
    )
    html_result = legend + "".join(cards)
    export_header = [f"Model: {model_label} ({model_id})", ""]
    return html_result, "\n".join(export_header + export_lines)
# Sample inputs offered through gr.Examples; each entry holds two lines
# separated by "\n", which the app classifies one line at a time.
examples = [
    "νεανίας ἀάατός ἐστιν καὶ καλός. τὰ παῖδες τὰ καλά\nκαλὰ μὲν ἠέξευ, καλὰ δ᾽ ἔτραφες, οὐράνιε Ζεῦ,",
    "Ἆρες, Ἄρες βροτολοιγὲ μιαιφόνε τειχεσιπλῆτα\nἈτρεΐδαι τε καὶ ἄλλοι ἐϋκνήμιδες Ἀχαιοί",
    "ἢ τυφλὸς ἤ τις σκνιπὸς ἢ λέγα βλέπων\nψάμμου θαλασσῶν ἢ σκνιπῶν Αἰγυπτίων",
]
# Stylesheet handed to demo.launch(css=CSS). It declares the color variables
# on :root, overrides them for the system dark preference and for the
# .dark-mode class (toggled by the dark-mode button), and styles the panels,
# legend, cards and syllable chips produced by render_results.
# NOTE(review): the body.dark-mode rule nested inside the prefers-color-scheme
# media query repeats the unconditional body.dark-mode rule that follows it.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@500;600;700&family=Space+Grotesk:wght@400;500;700&display=swap');
:root {
    --bg-start: #0b0b0d;
    --bg-end: #15151b;
    --ink: #f0f0f5;
    --long: #ff7868;
    --short: #66dbd8;
    --clear: #a0a0ab;
    --paper: rgba(22, 22, 28, 0.9);
    --chip-long-color: var(--long);
    --chip-short-color: var(--short);
    --chip-clear-color: #c9c9d3;
    --source-text: var(--ink);
}
@media (prefers-color-scheme: dark) {
    :root {
        --bg-start: #050506;
        --bg-end: #101015;
        --ink: #f3f3f8;
        --long: #ff7f70;
        --short: #69e2de;
        --clear: #b5b5c2;
        --paper: rgba(16, 16, 22, 0.94);
        --chip-long-color: #ff9b8d;
        --chip-short-color: #7cebe7;
        --chip-clear-color: #d4d4de;
        --source-text: #fcfcff;
    }
    body.dark-mode {
        --bg-start: #050506;
        --bg-end: #101015;
        --ink: #f3f3f8;
        --long: #ff7f70;
        --short: #69e2de;
        --clear: #b5b5c2;
        --paper: rgba(16, 16, 22, 0.94);
        --chip-long-color: #ff9b8d;
        --chip-short-color: #7cebe7;
        --chip-clear-color: #d4d4de;
        --source-text: #fcfcff;
    }
}
body.dark-mode {
    --bg-start: #050506;
    --bg-end: #101015;
    --ink: #f3f3f8;
    --long: #ff7f70;
    --short: #69e2de;
    --clear: #b5b5c2;
    --paper: rgba(16, 16, 22, 0.94);
    --chip-long-color: #ff9b8d;
    --chip-short-color: #7cebe7;
    --chip-clear-color: #d4d4de;
    --source-text: #fcfcff;
}
html.dark-mode {
    --bg-start: #050506;
    --bg-end: #101015;
    --ink: #f3f3f8;
    --long: #ff7f70;
    --short: #69e2de;
    --clear: #b5b5c2;
    --paper: rgba(16, 16, 22, 0.94);
    --chip-long-color: #ff9b8d;
    --chip-short-color: #7cebe7;
    --chip-clear-color: #d4d4de;
    --source-text: #fcfcff;
}
.gradio-container {
    font-family: 'Space Grotesk', sans-serif;
    background: radial-gradient(circle at top left, var(--bg-start), var(--bg-end));
    color: var(--ink);
    transition: background-color 0.3s, color 0.3s;
}
.dark-mode-toggle {
    position: fixed;
    top: 20px;
    right: 20px;
    background: var(--paper);
    border: 2px solid var(--ink);
    color: var(--ink);
    padding: 0.6rem 1.2rem;
    border-radius: 999px;
    cursor: pointer;
    font-weight: 600;
    font-family: 'Space Grotesk', sans-serif;
    font-size: 0.95rem;
    z-index: 1000;
    transition: all 0.3s;
}
.dark-mode-toggle:hover {
    transform: scale(1.05);
    opacity: 0.9;
}
.title h1 {
    font-family: 'Cormorant Garamond', serif;
    font-size: 3rem;
    letter-spacing: 0.02em;
    margin-bottom: 0.2rem;
}
.title p {
    opacity: 0.82;
}
.panel {
    backdrop-filter: blur(8px);
    background: var(--paper);
    border: 1px solid rgba(255, 255, 255, 0.16);
    border-radius: 18px;
    padding: 0.9rem;
}
.dark-mode .panel {
    border-color: rgba(232, 228, 220, 0.22);
}
.panel label,
.panel .gr-markdown,
.panel .gradio-markdown,
.panel .gr-form label,
.panel .gr-form span {
    color: var(--ink) !important;
}
.panel textarea,
.panel input,
.panel .gr-textbox,
.panel .gr-textbox textarea,
.panel .gr-textbox input,
.panel .gr-radio,
.panel .gr-radio label,
.panel .gr-box,
.panel .gr-form {
    color: var(--ink) !important;
}
.dark-mode .panel textarea,
.dark-mode .panel input,
.dark-mode .panel .gr-textbox,
.dark-mode .panel .gr-textbox textarea,
.dark-mode .panel .gr-textbox input,
.dark-mode .panel .gr-radio,
.dark-mode .panel .gr-box,
.dark-mode .panel .gr-form {
    background: rgba(10, 10, 14, 0.9) !important;
    border-color: rgba(232, 228, 220, 0.22) !important;
}
.dark-mode .panel .gr-button,
.dark-mode .panel button {
    color: #f6f2e8 !important;
    border-color: rgba(232, 228, 220, 0.28) !important;
}
.dark-mode .panel .gr-button.gr-button-primary,
.dark-mode .panel button.primary {
    background: #3e74f2 !important;
    color: #f7f9ff !important;
}
.legend {
    display: flex;
    align-items: center;
    gap: 0.9rem;
    font-weight: 600;
    margin-bottom: 0.8rem;
}
.dot {
    display: inline-block;
    width: 10px;
    height: 10px;
    border-radius: 999px;
    margin-left: 0.7rem;
    margin-right: 0.25rem;
}
.dot.long { background: var(--long); }
.dot.short { background: var(--short); }
.dot.clear { background: var(--clear); }
.card {
    background: rgba(24, 24, 32, 0.84);
    border-radius: 14px;
    padding: 0.9rem;
    margin: 0.8rem 0;
    border: 1px solid rgba(255, 255, 255, 0.14);
    animation: rise 420ms ease both;
    color: var(--ink);
}
.dark-mode .card {
    background: rgba(14, 14, 20, 0.9);
    border: 1px solid rgba(232, 228, 220, 0.15);
}
.line-number {
    font-size: 0.8rem;
    font-weight: 700;
    text-transform: uppercase;
    letter-spacing: 0.06em;
    color: #afb0bc;
}
.dark-mode .line-number {
    color: #d3d3df;
}
.source {
    font-family: 'Cormorant Garamond', serif;
    font-size: 1.45rem;
    margin: 0.25rem 0 0.7rem;
    color: var(--source-text);
}
.chips {
    display: flex;
    flex-wrap: wrap;
    gap: 0.45rem;
}
.chip {
    display: inline-flex;
    align-items: baseline;
    gap: 0.35rem;
    border-radius: 999px;
    padding: 0.28rem 0.65rem;
    font-family: 'Cormorant Garamond', serif;
    font-size: 1.1rem;
    border: 1px solid transparent;
}
.chip small {
    font-size: 0.75rem;
    font-family: 'Space Grotesk', sans-serif;
    text-transform: uppercase;
    letter-spacing: 0.04em;
}
.chip.long {
    color: var(--chip-long-color);
    background: rgba(186, 58, 41, 0.15);
    border-color: rgba(186, 58, 41, 0.3);
}
.chip.long:before {
    content: '';
}
.dark-mode .chip.long {
    background: rgba(255, 107, 90, 0.2);
    border-color: rgba(255, 107, 90, 0.4);
}
.chip.short {
    color: var(--chip-short-color);
    background: rgba(31, 111, 109, 0.15);
    border-color: rgba(31, 111, 109, 0.3);
}
.dark-mode .chip.short {
    background: rgba(77, 217, 213, 0.2);
    border-color: rgba(77, 217, 213, 0.4);
}
.chip.clear {
    color: var(--chip-clear-color);
    background: rgba(116, 108, 95, 0.12);
    border-color: rgba(116, 108, 95, 0.25);
}
.dark-mode .chip.clear {
    color: #c8c0b0;
    background: rgba(170, 160, 144, 0.15);
    border-color: rgba(170, 160, 144, 0.3);
}
.empty {
    padding: 1rem;
    border-radius: 12px;
    background: rgba(255, 255, 255, 0.6);
    border: 1px dashed rgba(47, 43, 38, 0.2);
    color: var(--ink);
}
.dark-mode .empty {
    background: rgba(40, 35, 28, 0.7);
    border: 1px dashed rgba(232, 228, 220, 0.15);
}
@keyframes rise {
    from { transform: translateY(8px); opacity: 0; }
    to { transform: translateY(0); opacity: 1; }
}
@media (max-width: 820px) {
    .title h1 { font-size: 2.2rem; }
    .source { font-size: 1.25rem; }
    .dark-mode-toggle {
        position: relative;
        top: auto;
        right: auto;
        margin-bottom: 1rem;
    }
}
"""
# Build the UI: a dark-mode toggle, a title, an input panel (model choice,
# text box, buttons, examples) and an output panel (styled HTML + plain text).
with gr.Blocks() as demo:
    # Dark-mode toggle button plus the script that applies the stored or
    # system preference by toggling the "dark-mode" class on <body>/<html>.
    # NOTE(review): <script> tags injected through gr.HTML may not execute in
    # the browser; the inline onclick handler does not depend on them. Confirm
    # against the installed Gradio version.
    gr.HTML("""
    <script>
    // Detect system dark mode preference and apply on load
    function applyDarkModePreference() {
        const darkModeToggle = document.getElementById('dark-mode-toggle');
        const isDarkMode = localStorage.getItem('darkMode') === 'true' ||
            (!localStorage.getItem('darkMode') && window.matchMedia('(prefers-color-scheme: dark)').matches);
        if (isDarkMode) {
            document.body.classList.add('dark-mode');
            document.documentElement.classList.add('dark-mode');
            if (darkModeToggle) darkModeToggle.textContent = '☀️ Light Mode';
        } else {
            document.body.classList.remove('dark-mode');
            document.documentElement.classList.remove('dark-mode');
            if (darkModeToggle) darkModeToggle.textContent = '🌙 Dark Mode';
        }
    }
    // Apply preference on page load
    window.addEventListener('load', applyDarkModePreference);
    setTimeout(applyDarkModePreference, 100);
    // Listen for system dark mode changes
    window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', (e) => {
        if (!localStorage.getItem('darkMode')) {
            if (e.matches) {
                document.body.classList.add('dark-mode');
                document.documentElement.classList.add('dark-mode');
                document.getElementById('dark-mode-toggle').textContent = '☀️ Light Mode';
            } else {
                document.body.classList.remove('dark-mode');
                document.documentElement.classList.remove('dark-mode');
                document.getElementById('dark-mode-toggle').textContent = '🌙 Dark Mode';
            }
        }
    });
    </script>
    <button id="dark-mode-toggle" class="dark-mode-toggle" onclick="
        document.body.classList.toggle('dark-mode');
        document.documentElement.classList.toggle('dark-mode');
        const isDark = document.body.classList.contains('dark-mode');
        localStorage.setItem('darkMode', isDark);
        document.getElementById('dark-mode-toggle').textContent = isDark ? '☀️ Light Mode' : '🌙 Dark Mode';
    ">🌙 Dark Mode</button>
    """)
    # Page title and tagline (styled by the .title rules in CSS).
    gr.Markdown(
        """
        <div class="title">
        <h1>Ancient Greek Macronizer</h1>
        <p>Syllable-level long/short classification with a modern, readable presentation.</p>
        </div>
        """
    )
    with gr.Column():
        # Input panel.
        with gr.Column(elem_classes=["panel"]):
            model_choice = gr.Radio(
                label="Model",
                choices=list(MODEL_OPTIONS.keys()),
                value=DEFAULT_MODEL_LABEL,
            )
            text_input = gr.Textbox(
                label="Greek Lines",
                lines=8,
                placeholder="Paste one or multiple lines; each line is processed separately.",
            )
            with gr.Row():
                classify_btn = gr.Button("Classify", variant="primary")
                clear_btn = gr.Button("Clear")
            gr.Examples(examples=examples, inputs=text_input, label="Try examples")
        # Output panel.
        with gr.Column(elem_classes=["panel"]):
            html_output = gr.HTML(label="Styled Results")
            text_output = gr.Textbox(label="Plain Output", lines=12)
    # "Classify" runs render_results and fills both outputs.
    classify_btn.click(render_results, inputs=[text_input, model_choice], outputs=[html_output, text_output])
    # "Clear" empties the input and both outputs.
    clear_btn.click(lambda: ("", "", ""), outputs=[text_input, html_output, text_output])
# Launch the app with the custom stylesheet when run as a script.
if __name__ == "__main__":
    demo.launch(css=CSS)