al1808th committed on
Commit
89195b1
·
1 Parent(s): 8887a7d

Improve macronizer UI and syllable classification output

Browse files
Files changed (1) hide show
  1. app.py +345 -24
app.py CHANGED
@@ -1,40 +1,361 @@
 
 
 
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForTokenClassification
 
 
 
 
4
 
5
- MODEL_ID = "Ericu950/macronizer_mini"
 
6
 
7
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
8
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
 
 
 
9
 
10
  id2label = model.config.id2label
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- def macronize(text):
14
- inputs = tokenizer(text, return_tensors="pt", truncation=True)
15
-
16
- with torch.no_grad():
17
- outputs = model(**inputs)
18
-
19
- logits = outputs.logits
20
- predictions = torch.argmax(logits, dim=-1)[0]
21
 
22
- tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
 
 
 
 
 
 
 
 
23
 
24
- # Pair tokens with labels
25
- result = []
26
- for token, pred in zip(tokens, predictions):
27
- label = id2label[int(pred)]
28
- result.append(f"{token}:{label}")
 
 
 
 
 
 
29
 
30
- return " ".join(result)
 
 
31
 
 
 
32
 
33
- iface = gr.Interface(
34
- fn=macronize,
35
- inputs="text",
36
- outputs="text",
37
- title="Macronizer (Token Classification)"
38
- )
39
 
40
- iface.launch()
 
 
1
import html
import re

import gradio as gr
import torch
from torch.nn.functional import softmax
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Project-local helpers: syllable segmentation and Greek orthography
# normalization (oxia -> tonos, compound-character expansion).
from syllabify import syllabify_joined
from preprocess import process_word, replace_oxia_with_tonos

# Token-classification checkpoint that labels Ancient Greek syllables
# by vowel length (long / short / unmarked).
MODEL_ID = "Ericu950/SyllaMoBert-grc-macronizer-v1"
# Hard cap on tokenized sequence length; inputs are truncated to this.
MAX_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
# Prefer GPU when available; all inference tensors are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: disables dropout / training-mode layers

# Mapping from predicted class ids to the model's label names.
id2label = model.config.id2label
22
 
23
def preprocess_greek_line(line):
    """Normalize a line of Greek text and flatten it into letter/diphthong tokens.

    Pipeline: oxia -> tonos normalization, extraction of purely Greek
    words (punctuation, digits, and Latin text are dropped), per-word
    expansion via ``process_word``, and a final flattening across words.

    Args:
        line (str): A full Greek sentence or phrase.

    Returns:
        list of str: Tokens (letters or diphthongs) for the whole line, in order.
    """
    normalized = replace_oxia_with_tonos(line)

    # Match maximal runs of Greek characters, covering the accented and
    # breathing-marked polytonic forms; anything else is discarded.
    greek_word_pattern = re.compile(
        r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
        r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
        r"ἐἑἒἓἔἕἘἙἜἝ"
        r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
        r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
        r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
        r"ὐὑὒὓὔὕὖὗὙὛὝ"
        r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
        r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
        r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
        r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
        r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+"
    )
    words = greek_word_pattern.findall(normalized.lower())

    # Expand each word into its tokens and flatten across the line.
    flat_tokens = []
    for word in words:
        flat_tokens.extend(process_word(word))
    return flat_tokens
66
+
67
+ def _normalize_label(raw_label: str) -> int:
68
+ text = raw_label.lower()
69
+ if "long" in text:
70
+ return 1
71
+ if "short" in text:
72
+ return 2
73
+ return 0
74
+
75
+
76
+ def _fallback_preprocess(line: str):
77
+ return re.findall(r"[\wἀ-῾]+|[^\w\s]", line, flags=re.UNICODE)
78
+
79
+
80
+ def _fallback_syllabify(tokens):
81
+ return [t for t in tokens if re.search(r"[\wἀ-῾]", t, flags=re.UNICODE)]
82
+
83
+
84
def preprocess_and_syllabify(line: str):
    """Split *line* into syllables, preferring the project pipeline.

    Uses ``preprocess_greek_line`` + ``syllabify_joined`` when both names
    are truthy; otherwise falls back to the crude regex-based split.
    """
    if not (preprocess_greek_line and syllabify_joined):
        return _fallback_syllabify(_fallback_preprocess(line))
    return syllabify_joined(preprocess_greek_line(line))
90
+
91
+
92
def classify_line(line: str):
    """Classify each syllable of *line* as long / short / unmarked.

    Args:
        line (str): A single line of Greek text.

    Returns:
        list of (syllable, label_id) pairs, in input order, where
        label_id is 1 = long, 2 = short, 0 = unmarked
        (see ``_normalize_label``). Empty list if no syllables are found.
    """
    syllables = preprocess_and_syllabify(line)
    if not syllables:
        return []

    encoding = tokenizer(
        syllables,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
    )

    # word_ids() maps each subword token back to the input syllable it
    # came from (None for special tokens). Capture it before the
    # BatchEncoding is flattened into a plain tensor dict below.
    # NOTE(review): word_ids() requires a fast tokenizer — AutoTokenizer
    # loads one by default when available; confirm for this checkpoint.
    word_ids = encoding.word_ids(batch_index=0)

    # Some models reject token_type_ids; drop them if present.
    if "token_type_ids" in encoding:
        del encoding["token_type_ids"]

    inputs = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = softmax(outputs.logits, dim=-1)
        predictions = torch.argmax(probs, dim=-1).squeeze(0).cpu().tolist()

    # BUGFIX: the previous version advanced the syllable index once per
    # non-special token, so any syllable the tokenizer split into several
    # subword pieces shifted every subsequent label. Align through
    # word_ids instead, taking the prediction of the FIRST subword piece
    # of each syllable.
    aligned = []
    seen_word_ids = set()
    for position, word_id in enumerate(word_ids):
        # Skip special tokens and continuation pieces of an already-seen syllable.
        if word_id is None or word_id in seen_word_ids:
            continue
        seen_word_ids.add(word_id)
        pred_id = int(predictions[position])
        label_name = id2label.get(pred_id, str(pred_id))
        aligned.append((syllables[word_id], _normalize_label(str(label_name))))

    return aligned
132
+
133
+
134
+ def _syllable_chip(syllable: str, label_id: int) -> str:
135
+ escaped = html.escape(syllable)
136
+ if label_id == 1:
137
+ return f'<span class="chip long">{escaped}<small>long</small></span>'
138
+ if label_id == 2:
139
+ return f'<span class="chip short">{escaped}<small>short</small></span>'
140
+ return f'<span class="chip clear">{escaped}</span>'
141
+
142
+
143
def render_results(text: str):
    """Classify every non-empty line of *text* and build both output views.

    Args:
        text (str): Possibly multi-line user input; blank lines are skipped.

    Returns:
        tuple (html_result, plain_text): a styled HTML rendering (legend +
        one card per line of syllable chips) and a plain-text export of
        the same classifications.
    """
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    if not lines:
        # Nothing to classify: show a friendly placeholder, empty export.
        return "<div class='empty'>Enter one or more Greek lines to classify syllables.</div>", ""

    cards = []
    export_lines = []

    for idx, line in enumerate(lines, start=1):
        aligned = classify_line(line)
        chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)

        cards.append(
            f"""
            <section class="card">
                <div class="line-number">Line {idx}</div>
                <div class="source">{html.escape(line)}</div>
                <div class="chips">{chips or '<span class="chip clear">(no syllables found)</span>'}</div>
            </section>
            """
        )

        # Plain-text mirror of the HTML view: header line, then one
        # indented "- syllable: tag" row per syllable.
        export_lines.append(f"Line {idx}: {line}")
        for syl, label in aligned:
            tag = "long" if label == 1 else "short" if label == 2 else "clear"
            export_lines.append(f"  - {syl}: {tag}")

    # Legend first, then the cards in input order.
    html_result = (
        "<div class='legend'><span class='dot long'></span>Long"
        "<span class='dot short'></span>Short"
        "<span class='dot clear'></span>Unmarked</div>"
        + "".join(cards)
    )

    return html_result, "\n".join(export_lines)
178
+
179
+
180
# Sample inputs for the UI examples widget; "\n" separates multiple
# verse lines within a single example (each line is classified separately).
examples = [
    "νεανίας ἀάατός ἐστιν καὶ καλός. τὰ παῖδες τὰ καλά\nκαλὰ μὲν ἠέξευ, καλὰ δ᾽ ἔτραφες, οὐράνιε Ζεῦ,",
    "Ἆρες, Ἄρες βροτολοιγὲ μιαιφόνε τειχεσιπλῆτα\nἈτρεΐδαι τε καὶ ἄλλοι ἐϋκνήμιδες Ἀχαιοί",
    "ἢ τυφλὸς ἤ τις σκνιπὸς ἢ λέγα βλέπων\nψάμμου θαλασσῶν ἢ σκνιπῶν Αἰγυπτίων",
]
185
+
186
+
187
# Custom stylesheet for the Gradio app: parchment-toned palette, serif
# Greek text, and colour-coded syllable chips (long = red, short = teal,
# unmarked = grey). Class names here match the HTML built by
# render_results / _syllable_chip.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@500;600;700&family=Space+Grotesk:wght@400;500;700&display=swap');

:root {
    --bg-start: #f2eee6;
    --bg-end: #ddd5c6;
    --ink: #2f2b26;
    --long: #ba3a29;
    --short: #1f6f6d;
    --clear: #7c7369;
    --paper: rgba(255, 251, 244, 0.88);
}

.gradio-container {
    font-family: 'Space Grotesk', sans-serif;
    background: radial-gradient(circle at top left, var(--bg-start), var(--bg-end));
    color: var(--ink);
}

.title h1 {
    font-family: 'Cormorant Garamond', serif;
    font-size: 3rem;
    letter-spacing: 0.02em;
    margin-bottom: 0.2rem;
}

.title p {
    opacity: 0.82;
}

.panel {
    backdrop-filter: blur(8px);
    background: var(--paper);
    border: 1px solid rgba(47, 43, 38, 0.18);
    border-radius: 18px;
    padding: 0.9rem;
}

.legend {
    display: flex;
    align-items: center;
    gap: 0.9rem;
    font-weight: 600;
    margin-bottom: 0.8rem;
}

.dot {
    display: inline-block;
    width: 10px;
    height: 10px;
    border-radius: 999px;
    margin-left: 0.7rem;
    margin-right: 0.25rem;
}

.dot.long { background: var(--long); }
.dot.short { background: var(--short); }
.dot.clear { background: var(--clear); }

.card {
    background: rgba(255, 255, 255, 0.72);
    border-radius: 14px;
    padding: 0.9rem;
    margin: 0.8rem 0;
    border: 1px solid rgba(47, 43, 38, 0.12);
    animation: rise 420ms ease both;
}

.line-number {
    font-size: 0.8rem;
    font-weight: 700;
    text-transform: uppercase;
    letter-spacing: 0.06em;
    color: #5c544b;
}

.source {
    font-family: 'Cormorant Garamond', serif;
    font-size: 1.45rem;
    margin: 0.25rem 0 0.7rem;
}

.chips {
    display: flex;
    flex-wrap: wrap;
    gap: 0.45rem;
}

.chip {
    display: inline-flex;
    align-items: baseline;
    gap: 0.35rem;
    border-radius: 999px;
    padding: 0.28rem 0.65rem;
    font-family: 'Cormorant Garamond', serif;
    font-size: 1.1rem;
    border: 1px solid transparent;
}

.chip small {
    font-size: 0.75rem;
    font-family: 'Space Grotesk', sans-serif;
    text-transform: uppercase;
    letter-spacing: 0.04em;
}

.chip.long {
    color: var(--long);
    background: rgba(186, 58, 41, 0.09);
    border-color: rgba(186, 58, 41, 0.2);
}

.chip.short {
    color: var(--short);
    background: rgba(31, 111, 109, 0.1);
    border-color: rgba(31, 111, 109, 0.2);
}

.chip.clear {
    color: #544e46;
    background: rgba(116, 108, 95, 0.08);
    border-color: rgba(116, 108, 95, 0.18);
}

.empty {
    padding: 1rem;
    border-radius: 12px;
    background: rgba(255, 255, 255, 0.6);
    border: 1px dashed rgba(47, 43, 38, 0.2);
}

@keyframes rise {
    from { transform: translateY(8px); opacity: 0; }
    to { transform: translateY(0); opacity: 1; }
}

@media (max-width: 820px) {
    .title h1 { font-size: 2.2rem; }
    .source { font-size: 1.25rem; }
}
"""
328
 
 
 
 
 
 
 
 
 
329
 
330
# Build the Gradio UI: input column (textbox, buttons, examples) on the
# left, results column (styled HTML plus plain-text export) on the right.
with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        <div class="title">
            <h1>Ancient Greek Macronizer</h1>
            <p>Syllable-level long/short classification with a modern, readable presentation.</p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=5, elem_classes=["panel"]):
            text_input = gr.Textbox(
                label="Greek Lines",
                lines=8,
                placeholder="Paste one or multiple lines; each line is processed separately.",
            )
            with gr.Row():
                classify_btn = gr.Button("Classify", variant="primary")
                clear_btn = gr.Button("Clear")
            gr.Examples(examples=examples, inputs=text_input, label="Try examples")

        with gr.Column(scale=6, elem_classes=["panel"]):
            html_output = gr.HTML(label="Styled Results")
            text_output = gr.Textbox(label="Plain Output", lines=12)

    # Wire up the buttons: Classify renders both views; Clear resets the
    # input and both outputs in one shot (three components, three values).
    classify_btn.click(render_results, inputs=text_input, outputs=[html_output, text_output])
    clear_btn.click(lambda: ("", "", ""), outputs=[text_input, html_output, text_output])


if __name__ == "__main__":
    demo.launch()