al1808th committed on
Commit
1bd6db7
·
1 Parent(s): 07f1474

Add model toggle for current and mini checkpoints

Browse files
Files changed (1) hide show
  1. app.py +112 -87
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import html
2
  import re
3
- from typing import List
4
 
5
  import gradio as gr
6
  import torch
@@ -11,17 +11,32 @@ from grc_utils import lower_grc, normalize_word, heavy
11
 
12
  from syllabify import syllabify_joined
13
  from preprocess import process_word
14
- +
15
- MODEL_ID = "Ericu950/SyllaMoBert-grc-macronizer-v1"
 
 
 
 
 
16
  MAX_LENGTH = 512
17
 
18
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
19
- model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
20
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
- model.to(device)
22
- model.eval()
23
 
24
- id2label = model.config.id2label
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
 
27
  def preprocess_greek_line(line: str) -> List[str]:
@@ -51,49 +66,51 @@ def preprocess_and_syllabify(line: str):
51
  return syllabify_joined(tokens)
52
 
53
 
54
- def classify_line(line: str):
55
- syllables = preprocess_and_syllabify(line)
56
- if not syllables:
57
- return []
58
 
59
- encoded = tokenizer(
60
- syllables,
61
- is_split_into_words=True,
62
- return_tensors="pt",
63
- truncation=True,
64
- max_length=MAX_LENGTH,
65
- )
 
 
66
 
67
- word_ids = encoded.word_ids(batch_index=0)
68
 
69
- if "token_type_ids" in encoded:
70
- del encoded["token_type_ids"]
71
 
72
- model_inputs = {k: v.to(device) for k, v in encoded.items()}
73
 
74
- with torch.no_grad():
75
- outputs = model(**model_inputs)
76
- probs = F.softmax(outputs.logits, dim=-1)
77
- predictions = torch.argmax(probs, dim=-1).squeeze(0).cpu().tolist()
78
 
79
- aligned = []
80
- seen_word_ids = set()
81
 
82
- for i, word_id in enumerate(word_ids):
83
- if word_id is None:
84
- continue
85
- if word_id in seen_word_ids:
86
- continue
87
- if word_id >= len(syllables):
88
- break
89
 
90
- seen_word_ids.add(word_id)
91
- pred_id = int(predictions[i])
92
- label_name = id2label.get(pred_id, str(pred_id))
93
- normalized = _normalize_label(str(label_name))
94
- aligned.append((syllables[word_id], normalized))
95
 
96
- return aligned
97
 
98
 
99
  def _syllable_chip(syllable: str, label_id: int) -> str:
@@ -105,41 +122,44 @@ def _syllable_chip(syllable: str, label_id: int) -> str:
105
  return f'<span class="chip clear">{escaped}</span>'
106
 
107
 
108
- def render_results(text: str):
109
- lines = [line.strip() for line in text.splitlines() if line.strip()]
110
- if not lines:
111
- return "<div class='empty'>Enter one or more Greek lines to classify syllables.</div>", ""
112
 
113
- cards = []
114
- export_lines = []
115
 
116
- for idx, line in enumerate(lines, start=1):
117
- aligned = classify_line(line)
118
- chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
119
 
120
- cards.append(
121
- f"""
122
- <section class="card">
123
- <div class="line-number">Line {idx}</div>
124
- <div class="source">{html.escape(line)}</div>
125
- <div class="chips">{chips or '<span class="chip clear">(no syllables found)</span>'}</div>
126
- </section>
127
- """
128
- )
129
-
130
- export_lines.append(f"Line {idx}: {line}")
131
- for syl, label in aligned:
132
- tag = "long" if label == 1 else "short" if label == 2 else "clear"
133
- export_lines.append(f" - {syl}: {tag}")
134
 
135
- html_result = (
136
- "<div class='legend'><span class='dot long'></span>Long"
137
- "<span class='dot short'></span>Short"
138
- "<span class='dot clear'></span>Unmarked</div>"
139
- + "".join(cards)
 
 
 
140
  )
141
 
142
- return html_result, "\n".join(export_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
 
145
  examples = [
@@ -454,22 +474,27 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
454
  )
455
 
456
  with gr.Column():
457
- with gr.Column(elem_classes=["panel"]):
458
- text_input = gr.Textbox(
459
- label="Greek Lines",
460
- lines=8,
461
- placeholder="Paste one or multiple lines; each line is processed separately.",
462
- )
463
- with gr.Row():
464
- classify_btn = gr.Button("Classify", variant="primary")
465
- clear_btn = gr.Button("Clear")
466
- gr.Examples(examples=examples, inputs=text_input, label="Try examples")
467
-
468
- with gr.Column(elem_classes=["panel"]):
469
- html_output = gr.HTML(label="Styled Results")
470
- text_output = gr.Textbox(label="Plain Output", lines=12)
471
-
472
- classify_btn.click(render_results, inputs=text_input, outputs=[html_output, text_output])
 
 
 
 
 
473
  clear_btn.click(lambda: ("", "", ""), outputs=[text_input, html_output, text_output])
474
 
475
 
 
1
  import html
2
  import re
3
+ from typing import Dict, List, Tuple
4
 
5
  import gradio as gr
6
  import torch
 
11
 
12
  from syllabify import syllabify_joined
13
  from preprocess import process_word
14
# --- Model configuration -------------------------------------------------
# UI label -> Hugging Face hub id. The label is what the Radio widget shows;
# the id is what is handed to from_pretrained().
MODEL_OPTIONS: Dict[str, str] = {
    "SyllaMoBert (current)": "Ericu950/SyllaMoBert-grc-macronizer-v1",
    "Macronizer Mini": "Ericu950/macronizer_mini",
}
DEFAULT_MODEL_LABEL = "SyllaMoBert (current)"
DEFAULT_MODEL_ID = MODEL_OPTIONS[DEFAULT_MODEL_LABEL]

# Tokenizer truncation limit for one line's syllable sequence.
MAX_LENGTH = 512

# Prefer GPU when available; everything is moved to this device once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Lazy per-checkpoint cache: hub id -> (tokenizer, model, id2label).
# Populated by _get_model_bundle so toggling models in the UI does not
# re-download or re-load weights.
_MODEL_CACHE: Dict[str, Tuple[AutoTokenizer, AutoModelForTokenClassification, Dict[int, str]]] = {}
26
+
27
+
28
def _get_model_bundle(model_id: str) -> Tuple[AutoTokenizer, AutoModelForTokenClassification, Dict[int, str]]:
    """Return (tokenizer, model, id2label) for *model_id*, loading it at most once.

    The first call for a given hub id downloads/loads the checkpoint, moves
    the model to the module-level ``device`` and puts it in eval mode; the
    result is memoized in ``_MODEL_CACHE`` for every later call.
    """
    # EAFP: the cache hit is the common path after the first request.
    try:
        return _MODEL_CACHE[model_id]
    except KeyError:
        pass

    tok = AutoTokenizer.from_pretrained(model_id)
    clf = AutoModelForTokenClassification.from_pretrained(model_id)
    clf.to(device)
    clf.eval()  # inference only — disable dropout etc.

    bundle = (tok, clf, clf.config.id2label)
    _MODEL_CACHE[model_id] = bundle
    return bundle
40
 
41
 
42
  def preprocess_greek_line(line: str) -> List[str]:
 
66
  return syllabify_joined(tokens)
67
 
68
 
69
def classify_line(line: str, model_id: str):
    """Syllabify one line of Greek and predict a label per syllable.

    Parameters
    ----------
    line : raw Greek text for a single line.
    model_id : Hugging Face hub id resolved via ``_get_model_bundle``.

    Returns
    -------
    List of ``(syllable, normalized_label)`` tuples, aligned to the first
    sub-token of each syllable; ``[]`` when the line yields no syllables.
    """
    syllables = preprocess_and_syllabify(line)
    if not syllables:
        return []

    tokenizer, model, id2label = _get_model_bundle(model_id)

    encoded = tokenizer(
        syllables,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
    )

    # Sub-token -> syllable-index map; grab it before mutating `encoded`.
    word_ids = encoded.word_ids(batch_index=0)

    # Some tokenizers emit token_type_ids the classifier does not accept.
    if "token_type_ids" in encoded:
        del encoded["token_type_ids"]

    model_inputs = {k: v.to(device) for k, v in encoded.items()}

    with torch.no_grad():
        logits = model(**model_inputs).logits
        # argmax over raw logits equals argmax over softmax(logits) —
        # softmax is strictly monotonic — so the normalization the old
        # code did here was redundant work and is dropped.
        predictions = torch.argmax(logits, dim=-1).squeeze(0).cpu().tolist()

    aligned = []
    seen_word_ids = set()

    for i, word_id in enumerate(word_ids):
        if word_id is None:  # special tokens ([CLS]/[SEP]/pad)
            continue
        if word_id in seen_word_ids:  # label each syllable from its first sub-token only
            continue
        if word_id >= len(syllables):
            break

        seen_word_ids.add(word_id)
        pred_id = int(predictions[i])
        label_name = id2label.get(pred_id, str(pred_id))
        normalized = _normalize_label(str(label_name))
        aligned.append((syllables[word_id], normalized))

    return aligned
114
 
115
 
116
  def _syllable_chip(syllable: str, label_id: int) -> str:
 
122
  return f'<span class="chip clear">{escaped}</span>'
123
 
124
 
125
def render_results(text: str, model_label: str):
    """Classify every non-blank line of *text* with the selected model.

    Returns a pair: styled HTML (legend + one card per line) and a plain-text
    export that starts with a header naming the model used.
    """
    lines = [candidate.strip() for candidate in text.splitlines() if candidate.strip()]
    if not lines:
        return "<div class='empty'>Enter one or more Greek lines to classify syllables.</div>", ""

    # Fall back to the default checkpoint if the label is unrecognized.
    model_id = MODEL_OPTIONS.get(model_label, DEFAULT_MODEL_ID)

    cards = []
    export_lines = []

    for idx, line in enumerate(lines, start=1):
        aligned = classify_line(line, model_id)
        chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
        chips_html = chips or '<span class="chip clear">(no syllables found)</span>'

        card = f"""
        <section class="card">
          <div class="line-number">Line {idx}</div>
          <div class="source">{html.escape(line)}</div>
          <div class="chips">{chips_html}</div>
        </section>
        """
        cards.append(card)

        export_lines.append(f"Line {idx}: {line}")
        for syl, label in aligned:
            tag = "long" if label == 1 else "short" if label == 2 else "clear"
            export_lines.append(f" - {syl}: {tag}")

    legend = (
        "<div class='legend'><span class='dot long'></span>Long"
        "<span class='dot short'></span>Short"
        "<span class='dot clear'></span>Unmarked</div>"
    )
    html_result = legend + "".join(cards)

    export_header = [f"Model: {model_label} ({model_id})", ""]
    return html_result, "\n".join(export_header + export_lines)
163
 
164
 
165
  examples = [
 
474
  )
475
 
476
  with gr.Column():
477
+ with gr.Column(elem_classes=["panel"]):
478
+ model_choice = gr.Radio(
479
+ label="Model",
480
+ choices=list(MODEL_OPTIONS.keys()),
481
+ value=DEFAULT_MODEL_LABEL,
482
+ )
483
+ text_input = gr.Textbox(
484
+ label="Greek Lines",
485
+ lines=8,
486
+ placeholder="Paste one or multiple lines; each line is processed separately.",
487
+ )
488
+ with gr.Row():
489
+ classify_btn = gr.Button("Classify", variant="primary")
490
+ clear_btn = gr.Button("Clear")
491
+ gr.Examples(examples=examples, inputs=text_input, label="Try examples")
492
+
493
+ with gr.Column(elem_classes=["panel"]):
494
+ html_output = gr.HTML(label="Styled Results")
495
+ text_output = gr.Textbox(label="Plain Output", lines=12)
496
+
497
+ classify_btn.click(render_results, inputs=[text_input, model_choice], outputs=[html_output, text_output])
498
  clear_btn.click(lambda: ("", "", ""), outputs=[text_input, html_output, text_output])
499
 
500