al1808th committed on
Commit
dcbc7bd
·
1 Parent(s): c694732

Simplify sigma normalization and final-sigma rendering

Browse files
Files changed (1) hide show
  1. app.py +98 -3
app.py CHANGED
@@ -7,7 +7,7 @@ import torch
7
  import torch.nn.functional as F
8
  from transformers import AutoModelForTokenClassification, AutoTokenizer
9
 
10
- from grc_utils import lower_grc, normalize_word, heavy, vowel
11
 
12
  from syllabify import syllabify_joined
13
  from preprocess import process_word
@@ -136,6 +136,101 @@ def _mark_syllable_plain(syllable: str, label_id: int) -> str:
136
  return syllable + marker
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  def render_results(text: str, model_label: str):
140
  lines = [line.strip() for line in text.splitlines() if line.strip()]
141
  if not lines:
@@ -149,7 +244,7 @@ def render_results(text: str, model_label: str):
149
  for idx, line in enumerate(lines, start=1):
150
  aligned = classify_line(line, model_id)
151
  chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
152
- plain_marked = [_mark_syllable_plain(syl, label) for syl, label in aligned]
153
 
154
  cards.append(
155
  f"""
@@ -162,7 +257,7 @@ def render_results(text: str, model_label: str):
162
  )
163
 
164
  export_lines.append(f"Line {idx}: {line}")
165
- export_lines.append(" " + " ".join(plain_marked) if plain_marked else " (no syllables found)")
166
 
167
  html_result = (
168
  "<div class='legend'><span class='dot long'></span>Long"
 
7
  import torch.nn.functional as F
8
  from transformers import AutoModelForTokenClassification, AutoTokenizer
9
 
10
+ from grc_utils import lower_grc, normalize_word, heavy, vowel, only_bases
11
 
12
  from syllabify import syllabify_joined
13
  from preprocess import process_word
 
136
  return syllable + marker
137
 
138
 
139
+ def _to_final_sigma(text: str) -> str:
140
+ # Step 3: in rendered output, only word-final sigmas become final-sigma.
141
+ def _convert_word(token: str) -> str:
142
+ if not token.strip():
143
+ return token
144
+
145
+ chars = list(token)
146
+ last_greek_idx = -1
147
+ for i, ch in enumerate(chars):
148
+ if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
149
+ last_greek_idx = i
150
+
151
+ if last_greek_idx != -1 and chars[last_greek_idx] == "σ":
152
+ chars[last_greek_idx] = "ς"
153
+
154
+ return "".join(chars)
155
+
156
+ return "".join(_convert_word(tok) for tok in re.findall(r"\S+|\s+", text))
157
+
158
+
159
def _restore_expanded_word(marked_word: str, reference_word: str) -> str:
    """Undo preprocessing expansions in *marked_word* for display.

    Collapses the expanded digraphs back to their single letters (δσ→ζ,
    κσ→ξ, πσ→ψ), restores a rough-breathing rho (ῥ) when the normalized
    *reference_word* contains one, and finally converts any word-final
    medial sigma to final sigma.
    """
    collapsed = marked_word
    for expanded, single in (("δσ", "ζ"), ("κσ", "ξ"), ("πσ", "ψ")):
        collapsed = collapsed.replace(expanded, single)

    # The expansion step strips the breathing from ῥ; put it back on the
    # first plain rho when the reference word shows one was there.
    if "ῥ" in lower_grc(normalize_word(reference_word)):
        idx = collapsed.find("ρ")
        if idx >= 0:
            collapsed = collapsed[:idx] + "ῥ" + collapsed[idx + 1 :]

    return _to_final_sigma(collapsed)
169
+
170
+
171
def _consume_word_alignment(
    aligned: List[Tuple[str, int]],
    start_idx: int,
    expected_syllables: List[str],
) -> Tuple[List[Tuple[str, int]], int]:
    """Consume the aligned syllables belonging to one word.

    Starting at *start_idx*, takes entries from *aligned* until their base
    characters exactly match those of *expected_syllables*. If base-level
    matching fails (or the word has no base characters), falls back to
    taking one aligned entry per expected syllable.

    Returns the consumed (syllable, label) pairs and the next cursor index.
    """
    if start_idx >= len(aligned):
        return [], start_idx

    target = only_bases("".join(expected_syllables))
    if target:
        consumed: List[Tuple[str, int]] = []
        for pos in range(start_idx, len(aligned)):
            consumed.append(aligned[pos])
            seen = only_bases("".join(syl for syl, _ in consumed))
            if seen == target:
                return consumed, pos + 1
            # Overshot without matching the prefix: base matching cannot
            # succeed, so bail out to the count-based fallback.
            if len(seen) > len(target) and not seen.startswith(target):
                break

    fallback = len(expected_syllables)
    if fallback <= 0:
        return [], start_idx

    stop = min(len(aligned), start_idx + fallback)
    return aligned[start_idx:stop], stop
198
+
199
+
200
def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -> str:
    """Rebuild *line* word by word with scansion marks, preserving spacing.

    Each word is re-syllabified the same way the classifier saw it, the
    matching entries are consumed from *aligned*, marked, and restored to
    their display form. Whitespace runs are copied through untouched; any
    aligned syllables left over after all words are appended at the end.
    """
    # Flatten final sigmas for matching only — display uses the original text.
    matching_line = line.replace("ς", "σ")
    original_tokens = re.findall(r"\S+|\s+", line)
    matching_tokens = re.findall(r"\S+|\s+", matching_line)

    pieces: List[str] = []
    cursor = 0

    for original, candidate in zip(original_tokens, matching_tokens):
        if candidate.isspace():
            # Preserve the user's spacing exactly.
            pieces.append(candidate)
            continue

        normalized = lower_grc(normalize_word(candidate)).replace("ς", "σ")
        syllables = syllabify_joined(process_word(normalized))

        consumed, cursor = _consume_word_alignment(aligned, cursor, syllables)
        if not consumed:
            # No alignment found for this word; emit it unmarked.
            pieces.append(candidate)
            continue

        marked = "".join(_mark_syllable_plain(syl, lbl) for syl, lbl in consumed)
        pieces.append(_restore_expanded_word(marked, original))

    if cursor < len(aligned):
        leftover = "".join(
            _mark_syllable_plain(syl, lbl) for syl, lbl in aligned[cursor:]
        )
        pieces.append(_to_final_sigma(leftover))

    return "".join(pieces)
232
+
233
+
234
  def render_results(text: str, model_label: str):
235
  lines = [line.strip() for line in text.splitlines() if line.strip()]
236
  if not lines:
 
244
  for idx, line in enumerate(lines, start=1):
245
  aligned = classify_line(line, model_id)
246
  chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
247
+ plain_line = _render_plain_line_with_spacing(line, aligned)
248
 
249
  cards.append(
250
  f"""
 
257
  )
258
 
259
  export_lines.append(f"Line {idx}: {line}")
260
+ export_lines.append(f" {plain_line}" if plain_line else " (no syllables found)")
261
 
262
  html_result = (
263
  "<div class='legend'><span class='dot long'></span>Long"