Spaces:
Running on Zero
Running on Zero
output formatting
Browse files- .gitignore +3 -1
- README.md +17 -1
- __pycache__/app.cpython-313.pyc +0 -0
- app.py +139 -10
- pytest.ini +12 -0
- test_markup.py +72 -0
.gitignore
CHANGED
|
@@ -1 +1,3 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
.pytest_cache
|
| 3 |
+
.vscode
|
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
title: Macronizer
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: gray
|
| 5 |
colorTo: pink
|
| 6 |
sdk: gradio
|
|
@@ -11,4 +11,20 @@ license: gpl-3.0
|
|
| 11 |
short_description: Markup of Ancient Greek vowel length
|
| 12 |
---
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
title: Macronizer
|
| 3 |
+
emoji: 📐
|
| 4 |
colorFrom: gray
|
| 5 |
colorTo: pink
|
| 6 |
sdk: gradio
|
|
|
|
| 11 |
short_description: Markup of Ancient Greek vowel length
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# Ancient Greek Macronizer
|
| 15 |
+
|
| 16 |
+
This application uses a fine-tuned transformer model to classify Ancient Greek syllables as long or short and annotates the text accordingly: an underscore (`_`) marks a long vowel and a caret (`^`) marks a short vowel.
|
| 17 |
+
|
| 18 |
+
## Testing
|
| 19 |
+
|
| 20 |
+
Run the test suite with:
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
pytest
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
This will run all tests in `test_markup.py`, which verify that the plain text output preserves the input exactly, with only markup additions (^ and _ characters).
|
| 27 |
+
|
| 28 |
+
See [pytest.ini](pytest.ini) for configuration details.
|
| 29 |
+
|
| 30 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
__pycache__/app.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
|
|
|
app.py
CHANGED
|
@@ -66,6 +66,28 @@ def preprocess_and_syllabify(line: str):
|
|
| 66 |
return syllabify_joined(tokens)
|
| 67 |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
def classify_line(line: str, model_id: str):
|
| 70 |
syllables = preprocess_and_syllabify(line)
|
| 71 |
if not syllables:
|
|
@@ -165,9 +187,92 @@ def _restore_expanded_word(marked_word: str, reference_word: str) -> str:
|
|
| 165 |
if rho_idx != -1:
|
| 166 |
restored = restored[:rho_idx] + "ῥ" + restored[rho_idx + 1 :]
|
| 167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
return _to_final_sigma(restored)
|
| 169 |
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
def _consume_word_alignment(
|
| 172 |
aligned: List[Tuple[str, int]],
|
| 173 |
start_idx: int,
|
|
@@ -197,17 +302,22 @@ def _consume_word_alignment(
|
|
| 197 |
return aligned[start_idx:end_idx], end_idx
|
| 198 |
|
| 199 |
|
| 200 |
-
def
|
| 201 |
-
|
| 202 |
line_for_matching = line.replace("ς", "σ")
|
| 203 |
parts = re.findall(r"\S+|\s+", line)
|
| 204 |
parts_for_matching = re.findall(r"\S+|\s+", line_for_matching)
|
| 205 |
out_parts: List[str] = []
|
| 206 |
-
cursor = 0
|
| 207 |
|
| 208 |
for part, part_for_matching in zip(parts, parts_for_matching):
|
| 209 |
if part_for_matching.isspace():
|
| 210 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
out_parts.append(part_for_matching)
|
| 212 |
continue
|
| 213 |
|
|
@@ -215,7 +325,7 @@ def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -
|
|
| 215 |
expected_tokens = process_word(normalized_word)
|
| 216 |
expected_syllables = syllabify_joined(expected_tokens)
|
| 217 |
|
| 218 |
-
taken,
|
| 219 |
if not taken:
|
| 220 |
out_parts.append(part_for_matching)
|
| 221 |
continue
|
|
@@ -224,11 +334,30 @@ def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -
|
|
| 224 |
restored = _restore_expanded_word(marked, part)
|
| 225 |
out_parts.append(restored)
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
|
|
|
| 230 |
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
|
| 234 |
def render_results(text: str, model_label: str):
|
|
@@ -244,7 +373,7 @@ def render_results(text: str, model_label: str):
|
|
| 244 |
for idx, line in enumerate(lines, start=1):
|
| 245 |
aligned = classify_line(line, model_id)
|
| 246 |
chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
|
| 247 |
-
plain_line =
|
| 248 |
|
| 249 |
cards.append(
|
| 250 |
f"""
|
|
|
|
| 66 |
return syllabify_joined(tokens)
|
| 67 |
|
| 68 |
|
| 69 |
+
def classify_line_per_word(line: str, model_id: str) -> List[Tuple[str, List[Tuple[str, int]]]]:
    """Classify every word of ``line`` on its own, keeping whitespace runs.

    The line is split into alternating runs of non-whitespace and
    whitespace. Whitespace runs are returned paired with an empty list;
    every other run is handed to ``classify_line`` with ς replaced by σ,
    while the returned tuple keeps the word exactly as it was written.

    Returns:
        A list of ``(chunk, aligned_syllables)`` tuples in input order.
    """
    chunks = re.findall(r"\S+|\s+", line)
    # Replacing ς with σ never touches whitespace, so both splits pair up 1:1.
    normalized_chunks = re.findall(r"\S+|\s+", line.replace("ς", "σ"))

    pairs = []
    for original, normalized in zip(chunks, normalized_chunks):
        if normalized.isspace():
            # Whitespace carries no syllables.
            pairs.append((normalized, []))
        else:
            pairs.append((original, classify_line(normalized, model_id)))
    return pairs
|
| 89 |
+
|
| 90 |
+
|
| 91 |
def classify_line(line: str, model_id: str):
|
| 92 |
syllables = preprocess_and_syllabify(line)
|
| 93 |
if not syllables:
|
|
|
|
| 187 |
if rho_idx != -1:
|
| 188 |
restored = restored[:rho_idx] + "ῥ" + restored[rho_idx + 1 :]
|
| 189 |
|
| 190 |
+
# Apply case from reference_word to restored
|
| 191 |
+
restored = _apply_case_from_reference(restored, reference_word)
|
| 192 |
+
|
| 193 |
+
# Preserve original final sigma from reference
|
| 194 |
+
restored = _preserve_final_sigma_from_reference(restored, reference_word)
|
| 195 |
+
|
| 196 |
return _to_final_sigma(restored)
|
| 197 |
|
| 198 |
|
| 199 |
+
def _apply_case_from_reference(text: str, reference: str) -> str:
|
| 200 |
+
"""Apply case from reference word to text (only for Greek letters)."""
|
| 201 |
+
result = []
|
| 202 |
+
ref_idx = 0
|
| 203 |
+
|
| 204 |
+
for char in text:
|
| 205 |
+
# Skip markup characters
|
| 206 |
+
if char in "^_":
|
| 207 |
+
result.append(char)
|
| 208 |
+
continue
|
| 209 |
+
|
| 210 |
+
# For Greek letters, find corresponding reference character and apply case
|
| 211 |
+
if "\u0370" <= char <= "\u03ff" or "\u1f00" <= char <= "\u1fff":
|
| 212 |
+
# Find next Greek letter in reference
|
| 213 |
+
while ref_idx < len(reference) and not ("\u0370" <= reference[ref_idx] <= "\u03ff" or "\u1f00" <= reference[ref_idx] <= "\u1fff"):
|
| 214 |
+
ref_idx += 1
|
| 215 |
+
|
| 216 |
+
if ref_idx < len(reference):
|
| 217 |
+
ref_char = reference[ref_idx]
|
| 218 |
+
# Check if reference character is uppercase
|
| 219 |
+
if ref_char.isupper() or ref_char != lower_grc(ref_char):
|
| 220 |
+
# Try to apply uppercase version
|
| 221 |
+
upper_version = char.upper()
|
| 222 |
+
if upper_version != char: # Character has an uppercase form
|
| 223 |
+
result.append(upper_version)
|
| 224 |
+
else:
|
| 225 |
+
result.append(char)
|
| 226 |
+
else:
|
| 227 |
+
result.append(char)
|
| 228 |
+
ref_idx += 1
|
| 229 |
+
else:
|
| 230 |
+
result.append(char)
|
| 231 |
+
else:
|
| 232 |
+
result.append(char)
|
| 233 |
+
|
| 234 |
+
return "".join(result)
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def _preserve_final_sigma_from_reference(text: str, reference: str) -> str:
|
| 238 |
+
"""If reference has final sigma ς at word end, preserve it in text."""
|
| 239 |
+
# Simply copy final sigmas from reference to text at word boundaries
|
| 240 |
+
# Split both into tokens
|
| 241 |
+
text_tokens = re.findall(r"\S+|\s+", text)
|
| 242 |
+
ref_tokens = re.findall(r"\S+|\s+", reference)
|
| 243 |
+
|
| 244 |
+
result = []
|
| 245 |
+
for text_token, ref_token in zip(text_tokens, ref_tokens):
|
| 246 |
+
if text_token.isspace() or ref_token.isspace():
|
| 247 |
+
result.append(text_token)
|
| 248 |
+
continue
|
| 249 |
+
|
| 250 |
+
# Find last Greek letter in both tokens
|
| 251 |
+
text_last_greek_idx = -1
|
| 252 |
+
ref_last_greek_idx = -1
|
| 253 |
+
|
| 254 |
+
for i in range(len(text_token) - 1, -1, -1):
|
| 255 |
+
ch = text_token[i]
|
| 256 |
+
if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
|
| 257 |
+
text_last_greek_idx = i
|
| 258 |
+
break
|
| 259 |
+
|
| 260 |
+
for i in range(len(ref_token) - 1, -1, -1):
|
| 261 |
+
ch = ref_token[i]
|
| 262 |
+
if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
|
| 263 |
+
ref_last_greek_idx = i
|
| 264 |
+
break
|
| 265 |
+
|
| 266 |
+
# If reference ends with final sigma ς, convert text's final sigma to match
|
| 267 |
+
if ref_last_greek_idx >= 0 and ref_token[ref_last_greek_idx] == "ς":
|
| 268 |
+
if text_last_greek_idx >= 0 and text_token[text_last_greek_idx] == "σ":
|
| 269 |
+
text_token = text_token[:text_last_greek_idx] + "ς" + text_token[text_last_greek_idx+1:]
|
| 270 |
+
|
| 271 |
+
result.append(text_token)
|
| 272 |
+
|
| 273 |
+
return "".join(result)
|
| 274 |
+
|
| 275 |
+
|
| 276 |
def _consume_word_alignment(
|
| 277 |
aligned: List[Tuple[str, int]],
|
| 278 |
start_idx: int,
|
|
|
|
| 302 |
return aligned[start_idx:end_idx], end_idx
|
| 303 |
|
| 304 |
|
| 305 |
+
def _render_plain_line_per_word(line: str, model_id: str) -> str:
|
| 306 |
+
"""Render plain line by processing each word separately."""
|
| 307 |
line_for_matching = line.replace("ς", "σ")
|
| 308 |
parts = re.findall(r"\S+|\s+", line)
|
| 309 |
parts_for_matching = re.findall(r"\S+|\s+", line_for_matching)
|
| 310 |
out_parts: List[str] = []
|
|
|
|
| 311 |
|
| 312 |
for part, part_for_matching in zip(parts, parts_for_matching):
|
| 313 |
if part_for_matching.isspace():
|
| 314 |
+
# Preserve original spacing exactly.
|
| 315 |
+
out_parts.append(part_for_matching)
|
| 316 |
+
continue
|
| 317 |
+
|
| 318 |
+
# Classify this word independently
|
| 319 |
+
aligned = classify_line(part_for_matching, model_id)
|
| 320 |
+
if not aligned:
|
| 321 |
out_parts.append(part_for_matching)
|
| 322 |
continue
|
| 323 |
|
|
|
|
| 325 |
expected_tokens = process_word(normalized_word)
|
| 326 |
expected_syllables = syllabify_joined(expected_tokens)
|
| 327 |
|
| 328 |
+
taken, _ = _consume_word_alignment(aligned, 0, expected_syllables)
|
| 329 |
if not taken:
|
| 330 |
out_parts.append(part_for_matching)
|
| 331 |
continue
|
|
|
|
| 334 |
restored = _restore_expanded_word(marked, part)
|
| 335 |
out_parts.append(restored)
|
| 336 |
|
| 337 |
+
result = "".join(out_parts)
|
| 338 |
+
# Post-process: convert word-final σ to ς for readability
|
| 339 |
+
result = _convert_final_sigmas(result)
|
| 340 |
+
return result
|
| 341 |
|
| 342 |
+
|
| 343 |
+
def _convert_final_sigmas(text: str) -> str:
|
| 344 |
+
"""Convert word-final σ to ς (final sigma) for readability."""
|
| 345 |
+
# Find all words (sequences of non-space characters that include Greek letters)
|
| 346 |
+
def convert_word(match):
|
| 347 |
+
word = match.group(0)
|
| 348 |
+
# Find the last Greek letter in the word
|
| 349 |
+
for i in range(len(word) - 1, -1, -1):
|
| 350 |
+
ch = word[i]
|
| 351 |
+
if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
|
| 352 |
+
# Found the last Greek letter
|
| 353 |
+
if ch == "σ":
|
| 354 |
+
# Convert medial sigma to final sigma
|
| 355 |
+
return word[:i] + "ς" + word[i+1:]
|
| 356 |
+
break
|
| 357 |
+
return word
|
| 358 |
+
|
| 359 |
+
# Replace word-final σ with ς
|
| 360 |
+
return re.sub(r"\S+", convert_word, text)
|
| 361 |
|
| 362 |
|
| 363 |
def render_results(text: str, model_label: str):
|
|
|
|
| 373 |
for idx, line in enumerate(lines, start=1):
|
| 374 |
aligned = classify_line(line, model_id)
|
| 375 |
chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
|
| 376 |
+
plain_line = _render_plain_line_per_word(line, model_id)
|
| 377 |
|
| 378 |
cards.append(
|
| 379 |
f"""
|
pytest.ini
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pytest]
|
| 2 |
+
testpaths = test_markup.py
|
| 3 |
+
python_files = test_*.py
|
| 4 |
+
python_classes = Test*
|
| 5 |
+
python_functions = test_*
|
| 6 |
+
|
| 7 |
+
# Output options
|
| 8 |
+
addopts = -v --tb=short
|
| 9 |
+
|
| 10 |
+
# Markers
|
| 11 |
+
markers =
|
| 12 |
+
markup: Tests for markup output preservation
|
test_markup.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test that plain output preserves input exactly except for markup.
|
| 3 |
+
Stripping the markup from the output should give back the input: output.replace("^", "").replace("_", "") == input
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pytest
|
| 7 |
+
from app import _render_plain_line_per_word, DEFAULT_MODEL_ID
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def strip_markup(text: str) -> str:
    """Return ``text`` with every ``^`` and ``_`` character deleted."""
    return text.translate(str.maketrans("", "", "^_"))
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Test cases with Greek lines
|
| 16 |
+
TEST_CASES = [
|
| 17 |
+
# Basic simple word
|
| 18 |
+
"νεανίας",
|
| 19 |
+
# Multiple words with spaces
|
| 20 |
+
"νεανίας ἀάατός ἐστιν",
|
| 21 |
+
# With final sigma
|
| 22 |
+
"καὶ καλός",
|
| 23 |
+
# Multiple spaces
|
| 24 |
+
"καλὰ μὲν",
|
| 25 |
+
# Single letter
|
| 26 |
+
"ἢ",
|
| 27 |
+
# Word with punctuation preserved
|
| 28 |
+
"τυφλὸς ἤ",
|
| 29 |
+
# Multi-word with accents
|
| 30 |
+
"Ἀτρεΐδαι τε καὶ ἄλλοι",
|
| 31 |
+
# Longer passage
|
| 32 |
+
"νεανίας ἀάατός ἐστιν καὶ καλός",
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@pytest.mark.parametrize("input_line", TEST_CASES)
def test_plain_output_preserves_input_without_markup(input_line):
    """Check that rendering only ever adds ``^`` / ``_`` markup.

    The line is rendered with ``_render_plain_line_per_word``; removing the
    markup from that rendering must give back the input unchanged. In
    particular:
    - all original characters are preserved
    - spaces are preserved exactly
    - final sigmas are preserved
    - only the markup characters (^ and _) are added
    """
    plain_output = _render_plain_line_per_word(input_line, DEFAULT_MODEL_ID)
    output_without_markup = strip_markup(plain_output)

    # Final sigma is deliberately not normalized: the input is the expectation.
    input_expected = input_line

    # Shown with ``pytest -s`` to make mismatches easy to inspect.
    print(f"\nInput: {input_line!r}")
    print(f"Output: {plain_output!r}")
    print(f"Output without markup: {output_without_markup!r}")
    print(f"Expected: {input_expected!r}")

    failure_message = (
        f"Output without markup doesn't match input.\n"
        f"Expected: {input_expected!r}\n"
        f"Got: {output_without_markup!r}"
    )
    assert output_without_markup == input_expected, failure_message
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
if __name__ == "__main__":
|
| 72 |
+
pytest.main([__file__, "-v", "-s"])
|