Spaces:

Urdatorn
/

macronizer

Running on Zero

App Files Files Community

Urdatorn committed 7 days ago

Commit

3d9fa99

1 Parent(s): aaf8c40

output formatting

Browse files

Files changed (6) hide show

.gitignore +3 -1
README.md +17 -1
__pycache__/app.cpython-313.pyc +0 -0
app.py +139 -10
pytest.ini +12 -0
test_markup.py +72 -0

.gitignore CHANGED Viewed

	@@ -1 +1,3 @@
1	- ~~.github/~~

+__pycache__
+.pytest_cache
+.vscode

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Macronizer
-emoji: 🦀
 colorFrom: gray
 colorTo: pink
 sdk: gradio
@@ -11,4 +11,20 @@ license: gpl-3.0
 short_description: Markup of Ancient Greek vowel length
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Macronizer
+emoji: 📐
 colorFrom: gray
 colorTo: pink
 sdk: gradio
 short_description: Markup of Ancient Greek vowel length
 ---
+# Ancient Greek Macronizer
+This application uses a fine-tuned transformer model to classify Ancient Greek syllables as long or short and annotates the text with the result (underscores for long vowels, carets for short vowels).
+## Testing
+Run the test suite with:
+```bash
+pytest
+```
+This will run all tests in `test_markup.py`, which verify that the plain text output preserves the input exactly, with only markup additions (^ and _ characters).
+See [pytest.ini](pytest.ini) for configuration details.
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

__pycache__/app.cpython-313.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ

app.py CHANGED Viewed

@@ -66,6 +66,28 @@ def preprocess_and_syllabify(line: str):
     return syllabify_joined(tokens)
 def classify_line(line: str, model_id: str):
     syllables = preprocess_and_syllabify(line)
     if not syllables:
@@ -165,9 +187,92 @@ def _restore_expanded_word(marked_word: str, reference_word: str) -> str:
         if rho_idx != -1:
             restored = restored[:rho_idx] + "ῥ" + restored[rho_idx + 1 :]
     return _to_final_sigma(restored)
 def _consume_word_alignment(
     aligned: List[Tuple[str, int]],
     start_idx: int,
@@ -197,17 +302,22 @@ def _consume_word_alignment(
     return aligned[start_idx:end_idx], end_idx
-def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -> str:
-    # Step 1: normalize input final sigma to medial sigma for matching only.
     line_for_matching = line.replace("ς", "σ")
     parts = re.findall(r"\S+|\s+", line)
     parts_for_matching = re.findall(r"\S+|\s+", line_for_matching)
     out_parts: List[str] = []
-    cursor = 0
     for part, part_for_matching in zip(parts, parts_for_matching):
         if part_for_matching.isspace():
-            # Step 2: preserve original spacing exactly.
             out_parts.append(part_for_matching)
             continue
@@ -215,7 +325,7 @@ def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -
         expected_tokens = process_word(normalized_word)
         expected_syllables = syllabify_joined(expected_tokens)
-        taken, cursor = _consume_word_alignment(aligned, cursor, expected_syllables)
         if not taken:
             out_parts.append(part_for_matching)
             continue
@@ -224,11 +334,30 @@ def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -
         restored = _restore_expanded_word(marked, part)
         out_parts.append(restored)
-    if cursor < len(aligned):
-        tail = "".join(_mark_syllable_plain(syl, label) for syl, label in aligned[cursor:])
-        out_parts.append(_to_final_sigma(tail))
-    return "".join(out_parts)
 def render_results(text: str, model_label: str):
@@ -244,7 +373,7 @@ def render_results(text: str, model_label: str):
     for idx, line in enumerate(lines, start=1):
         aligned = classify_line(line, model_id)
         chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
-        plain_line = _render_plain_line_with_spacing(line, aligned)
         cards.append(
             f"""

     return syllabify_joined(tokens)
def classify_line_per_word(line: str, model_id: str) -> List[Tuple[str, List[Tuple[str, int]]]]:
    """
    Run the classifier on every word of ``line`` in isolation.

    The line is cut into alternating runs of whitespace and non-whitespace.
    Whitespace runs are passed through with an empty syllable list; every
    other run is paired with whatever ``classify_line`` returns for its
    sigma-normalised form (``ς`` replaced by ``σ``).

    Returns the ``(chunk, aligned_syllables)`` pairs in input order, where
    ``chunk`` is the original, un-normalised text of the run.
    """
    pairs = []
    for chunk in re.findall(r"\S+|\s+", line):
        if chunk.isspace():
            # Whitespace carries no syllables; keep it verbatim.
            pairs.append((chunk, []))
        else:
            # ς -> σ is a one-for-one swap of non-space characters, so
            # normalising per chunk matches normalising the whole line first.
            pairs.append((chunk, classify_line(chunk.replace("ς", "σ"), model_id)))
    return pairs
 def classify_line(line: str, model_id: str):
     syllables = preprocess_and_syllabify(line)
     if not syllables:
         if rho_idx != -1:
             restored = restored[:rho_idx] + "ῥ" + restored[rho_idx + 1 :]
+    # Apply case from reference_word to restored
+    restored = _apply_case_from_reference(restored, reference_word)
+    # Preserve original final sigma from reference
+    restored = _preserve_final_sigma_from_reference(restored, reference_word)
     return _to_final_sigma(restored)
+def _apply_case_from_reference(text: str, reference: str) -> str:
+    """Apply case from reference word to text (only for Greek letters)."""
+    result = []
+    ref_idx = 0
+    for char in text:
+        # Skip markup characters
+        if char in "^_":
+            result.append(char)
+            continue
+        # For Greek letters, find corresponding reference character and apply case
+        if "\u0370" <= char <= "\u03ff" or "\u1f00" <= char <= "\u1fff":
+            # Find next Greek letter in reference
+            while ref_idx < len(reference) and not ("\u0370" <= reference[ref_idx] <= "\u03ff" or "\u1f00" <= reference[ref_idx] <= "\u1fff"):
+                ref_idx += 1
+            if ref_idx < len(reference):
+                ref_char = reference[ref_idx]
+                # Check if reference character is uppercase
+                if ref_char.isupper() or ref_char != lower_grc(ref_char):
+                    # Try to apply uppercase version
+                    upper_version = char.upper()
+                    if upper_version != char:  # Character has an uppercase form
+                        result.append(upper_version)
+                    else:
+                        result.append(char)
+                else:
+                    result.append(char)
+                ref_idx += 1
+            else:
+                result.append(char)
+        else:
+            result.append(char)
+    return "".join(result)
+def _preserve_final_sigma_from_reference(text: str, reference: str) -> str:
+    """If reference has final sigma ς at word end, preserve it in text."""
+    # Simply copy final sigmas from reference to text at word boundaries
+    # Split both into tokens
+    text_tokens = re.findall(r"\S+|\s+", text)
+    ref_tokens = re.findall(r"\S+|\s+", reference)
+    result = []
+    for text_token, ref_token in zip(text_tokens, ref_tokens):
+        if text_token.isspace() or ref_token.isspace():
+            result.append(text_token)
+            continue
+        # Find last Greek letter in both tokens
+        text_last_greek_idx = -1
+        ref_last_greek_idx = -1
+        for i in range(len(text_token) - 1, -1, -1):
+            ch = text_token[i]
+            if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
+                text_last_greek_idx = i
+                break
+        for i in range(len(ref_token) - 1, -1, -1):
+            ch = ref_token[i]
+            if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
+                ref_last_greek_idx = i
+                break
+        # If reference ends with final sigma ς, convert text's final sigma to match
+        if ref_last_greek_idx >= 0 and ref_token[ref_last_greek_idx] == "ς":
+            if text_last_greek_idx >= 0 and text_token[text_last_greek_idx] == "σ":
+                text_token = text_token[:text_last_greek_idx] + "ς" + text_token[text_last_greek_idx+1:]
+        result.append(text_token)
+    return "".join(result)
 def _consume_word_alignment(
     aligned: List[Tuple[str, int]],
     start_idx: int,
     return aligned[start_idx:end_idx], end_idx
+def _render_plain_line_per_word(line: str, model_id: str) -> str:
+    """Render plain line by processing each word separately."""
     line_for_matching = line.replace("ς", "σ")
     parts = re.findall(r"\S+|\s+", line)
     parts_for_matching = re.findall(r"\S+|\s+", line_for_matching)
     out_parts: List[str] = []
     for part, part_for_matching in zip(parts, parts_for_matching):
         if part_for_matching.isspace():
+            # Preserve original spacing exactly.
+            out_parts.append(part_for_matching)
+            continue
+        # Classify this word independently
+        aligned = classify_line(part_for_matching, model_id)
+        if not aligned:
             out_parts.append(part_for_matching)
             continue
         expected_tokens = process_word(normalized_word)
         expected_syllables = syllabify_joined(expected_tokens)
+        taken, _ = _consume_word_alignment(aligned, 0, expected_syllables)
         if not taken:
             out_parts.append(part_for_matching)
             continue
         restored = _restore_expanded_word(marked, part)
         out_parts.append(restored)
+    result = "".join(out_parts)
+    # Post-process: convert word-final σ to ς for readability
+    result = _convert_final_sigmas(result)
+    return result
+def _convert_final_sigmas(text: str) -> str:
+    """Convert word-final σ to ς (final sigma) for readability."""
+    # Find all words (sequences of non-space characters that include Greek letters)
+    def convert_word(match):
+        word = match.group(0)
+        # Find the last Greek letter in the word
+        for i in range(len(word) - 1, -1, -1):
+            ch = word[i]
+            if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
+                # Found the last Greek letter
+                if ch == "σ":
+                    # Convert medial sigma to final sigma
+                    return word[:i] + "ς" + word[i+1:]
+                break
+        return word
+    # Replace word-final σ with ς
+    return re.sub(r"\S+", convert_word, text)
 def render_results(text: str, model_label: str):
     for idx, line in enumerate(lines, start=1):
         aligned = classify_line(line, model_id)
         chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
+        plain_line = _render_plain_line_per_word(line, model_id)
         cards.append(
             f"""

pytest.ini ADDED Viewed

	@@ -0,0 +1,12 @@

+[pytest]
+testpaths = test_markup.py
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+# Output options
+addopts = -v --tb=short
+# Markers
+markers =
+    markup: Tests for markup output preservation

test_markup.py ADDED Viewed

	@@ -0,0 +1,72 @@

+"""
+Test that plain output preserves input exactly except for markup.
+Invariant: output.replace("^", "").replace("_", "") == input
+"""
+import pytest
+from app import _render_plain_line_per_word, DEFAULT_MODEL_ID
def strip_markup(text: str) -> str:
    """Return ``text`` with every ``^`` and ``_`` character deleted."""
    return "".join(ch for ch in text if ch not in "^_")
+# Test cases with Greek lines
+TEST_CASES = [
+    # Basic simple word
+    "νεανίας",
+    # Multiple words with spaces
+    "νεανίας ἀάατός ἐστιν",
+    # With final sigma
+    "καὶ καλός",
+    # Multiple spaces
+    "καλὰ   μὲν",
+    # Single letter
+    "ἢ",
+    # Grave-accented word followed by a single-letter word
+    "τυφλὸς ἤ",
+    # Multi-word with accents
+    "Ἀτρεΐδαι τε καὶ ἄλλοι",
+    # Longer passage
+    "νεανίας ἀάατός ἐστιν καὶ καλός",
+]
@pytest.mark.parametrize("input_line", TEST_CASES)
def test_plain_output_preserves_input_without_markup(input_line):
    """
    Check that rendering only ever adds markup to the input line.

    The line is rendered with ``_render_plain_line_per_word`` and the ``^``
    and ``_`` characters are then stripped from the result. What remains must
    equal the input exactly, i.e.
    ``output.replace("^", "").replace("_", "") == input``. Any change to
    letters, spacing or final sigmas therefore fails the test.
    """
    rendered = _render_plain_line_per_word(input_line, DEFAULT_MODEL_ID)
    stripped = strip_markup(rendered)
    # No sigma normalisation here: the input is the expected value as-is.
    expected = input_line

    # Diagnostics, visible when pytest runs with -s.
    print(f"\nInput: {repr(input_line)}")
    print(f"Output: {repr(rendered)}")
    print(f"Output without markup: {repr(stripped)}")
    print(f"Expected: {repr(expected)}")

    assert stripped == expected, (
        f"Output without markup doesn't match input.\n"
        f"Expected: {repr(expected)}\n"
        f"Got: {repr(stripped)}"
    )
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "-s"])