Urdatorn committed on
Commit
3d9fa99
·
1 Parent(s): aaf8c40

output formatting

Browse files
Files changed (6) hide show
  1. .gitignore +3 -1
  2. README.md +17 -1
  3. __pycache__/app.cpython-313.pyc +0 -0
  4. app.py +139 -10
  5. pytest.ini +12 -0
  6. test_markup.py +72 -0
.gitignore CHANGED
@@ -1 +1,3 @@
1
- .github/
 
 
 
1
+ __pycache__
2
+ .pytest_cache
3
+ .vscode
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Macronizer
3
- emoji: 🦀
4
  colorFrom: gray
5
  colorTo: pink
6
  sdk: gradio
@@ -11,4 +11,20 @@ license: gpl-3.0
11
  short_description: Markup of Ancient Greek vowel length
12
  ---
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Macronizer
3
+ emoji: 📐
4
  colorFrom: gray
5
  colorTo: pink
6
  sdk: gradio
 
11
  short_description: Markup of Ancient Greek vowel length
12
  ---
13
 
14
+ # Ancient Greek Macronizer
15
+
16
+ This application uses a fine-tuned transformer model to classify Ancient Greek syllables as long or short and annotates the text inline: an underscore (`_`) marks a long vowel and a caret (`^`) marks a short vowel.
17
+
18
+ ## Testing
19
+
20
+ Run the test suite with:
21
+
22
+ ```bash
23
+ pytest
24
+ ```
25
+
26
+ This runs all tests in `test_markup.py`, which verify that the plain-text output differs from the input only by the added markup: removing every `^` and `_` from the output must give back the input exactly.
27
+
28
+ See [pytest.ini](pytest.ini) for configuration details.
29
+
30
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/app.cpython-313.pyc CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
 
app.py CHANGED
@@ -66,6 +66,28 @@ def preprocess_and_syllabify(line: str):
66
  return syllabify_joined(tokens)
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def classify_line(line: str, model_id: str):
70
  syllables = preprocess_and_syllabify(line)
71
  if not syllables:
@@ -165,9 +187,92 @@ def _restore_expanded_word(marked_word: str, reference_word: str) -> str:
165
  if rho_idx != -1:
166
  restored = restored[:rho_idx] + "ῥ" + restored[rho_idx + 1 :]
167
 
 
 
 
 
 
 
168
  return _to_final_sigma(restored)
169
 
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  def _consume_word_alignment(
172
  aligned: List[Tuple[str, int]],
173
  start_idx: int,
@@ -197,17 +302,22 @@ def _consume_word_alignment(
197
  return aligned[start_idx:end_idx], end_idx
198
 
199
 
200
- def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -> str:
201
- # Step 1: normalize input final sigma to medial sigma for matching only.
202
  line_for_matching = line.replace("ς", "σ")
203
  parts = re.findall(r"\S+|\s+", line)
204
  parts_for_matching = re.findall(r"\S+|\s+", line_for_matching)
205
  out_parts: List[str] = []
206
- cursor = 0
207
 
208
  for part, part_for_matching in zip(parts, parts_for_matching):
209
  if part_for_matching.isspace():
210
- # Step 2: preserve original spacing exactly.
 
 
 
 
 
 
211
  out_parts.append(part_for_matching)
212
  continue
213
 
@@ -215,7 +325,7 @@ def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -
215
  expected_tokens = process_word(normalized_word)
216
  expected_syllables = syllabify_joined(expected_tokens)
217
 
218
- taken, cursor = _consume_word_alignment(aligned, cursor, expected_syllables)
219
  if not taken:
220
  out_parts.append(part_for_matching)
221
  continue
@@ -224,11 +334,30 @@ def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -
224
  restored = _restore_expanded_word(marked, part)
225
  out_parts.append(restored)
226
 
227
- if cursor < len(aligned):
228
- tail = "".join(_mark_syllable_plain(syl, label) for syl, label in aligned[cursor:])
229
- out_parts.append(_to_final_sigma(tail))
 
230
 
231
- return "".join(out_parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
 
234
  def render_results(text: str, model_label: str):
@@ -244,7 +373,7 @@ def render_results(text: str, model_label: str):
244
  for idx, line in enumerate(lines, start=1):
245
  aligned = classify_line(line, model_id)
246
  chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
247
- plain_line = _render_plain_line_with_spacing(line, aligned)
248
 
249
  cards.append(
250
  f"""
 
66
  return syllabify_joined(tokens)
67
 
68
 
69
+ def classify_line_per_word(line: str, model_id: str) -> List[Tuple[str, List[Tuple[str, int]]]]:
70
+ """
71
+ Classify each word separately to preserve word boundaries.
72
+ Returns list of (word, aligned_syllables) tuples.
73
+ """
74
+ line_for_matching = line.replace("ς", "σ")
75
+ parts = re.findall(r"\S+|\s+", line)
76
+ parts_for_matching = re.findall(r"\S+|\s+", line_for_matching)
77
+
78
+ result = []
79
+ for part, part_for_matching in zip(parts, parts_for_matching):
80
+ if part_for_matching.isspace():
81
+ result.append((part_for_matching, [])) # Spaces have no aligned syllables
82
+ continue
83
+
84
+ # Classify this word independently
85
+ aligned = classify_line(part_for_matching, model_id)
86
+ result.append((part, aligned))
87
+
88
+ return result
89
+
90
+
91
  def classify_line(line: str, model_id: str):
92
  syllables = preprocess_and_syllabify(line)
93
  if not syllables:
 
187
  if rho_idx != -1:
188
  restored = restored[:rho_idx] + "ῥ" + restored[rho_idx + 1 :]
189
 
190
+ # Apply case from reference_word to restored
191
+ restored = _apply_case_from_reference(restored, reference_word)
192
+
193
+ # Preserve original final sigma from reference
194
+ restored = _preserve_final_sigma_from_reference(restored, reference_word)
195
+
196
  return _to_final_sigma(restored)
197
 
198
 
199
+ def _apply_case_from_reference(text: str, reference: str) -> str:
200
+ """Apply case from reference word to text (only for Greek letters)."""
201
+ result = []
202
+ ref_idx = 0
203
+
204
+ for char in text:
205
+ # Skip markup characters
206
+ if char in "^_":
207
+ result.append(char)
208
+ continue
209
+
210
+ # For Greek letters, find corresponding reference character and apply case
211
+ if "\u0370" <= char <= "\u03ff" or "\u1f00" <= char <= "\u1fff":
212
+ # Find next Greek letter in reference
213
+ while ref_idx < len(reference) and not ("\u0370" <= reference[ref_idx] <= "\u03ff" or "\u1f00" <= reference[ref_idx] <= "\u1fff"):
214
+ ref_idx += 1
215
+
216
+ if ref_idx < len(reference):
217
+ ref_char = reference[ref_idx]
218
+ # Check if reference character is uppercase
219
+ if ref_char.isupper() or ref_char != lower_grc(ref_char):
220
+ # Try to apply uppercase version
221
+ upper_version = char.upper()
222
+ if upper_version != char: # Character has an uppercase form
223
+ result.append(upper_version)
224
+ else:
225
+ result.append(char)
226
+ else:
227
+ result.append(char)
228
+ ref_idx += 1
229
+ else:
230
+ result.append(char)
231
+ else:
232
+ result.append(char)
233
+
234
+ return "".join(result)
235
+
236
+
237
+ def _preserve_final_sigma_from_reference(text: str, reference: str) -> str:
238
+ """If reference has final sigma ς at word end, preserve it in text."""
239
+ # Simply copy final sigmas from reference to text at word boundaries
240
+ # Split both into tokens
241
+ text_tokens = re.findall(r"\S+|\s+", text)
242
+ ref_tokens = re.findall(r"\S+|\s+", reference)
243
+
244
+ result = []
245
+ for text_token, ref_token in zip(text_tokens, ref_tokens):
246
+ if text_token.isspace() or ref_token.isspace():
247
+ result.append(text_token)
248
+ continue
249
+
250
+ # Find last Greek letter in both tokens
251
+ text_last_greek_idx = -1
252
+ ref_last_greek_idx = -1
253
+
254
+ for i in range(len(text_token) - 1, -1, -1):
255
+ ch = text_token[i]
256
+ if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
257
+ text_last_greek_idx = i
258
+ break
259
+
260
+ for i in range(len(ref_token) - 1, -1, -1):
261
+ ch = ref_token[i]
262
+ if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
263
+ ref_last_greek_idx = i
264
+ break
265
+
266
+ # If reference ends with final sigma ς, convert text's final sigma to match
267
+ if ref_last_greek_idx >= 0 and ref_token[ref_last_greek_idx] == "ς":
268
+ if text_last_greek_idx >= 0 and text_token[text_last_greek_idx] == "σ":
269
+ text_token = text_token[:text_last_greek_idx] + "ς" + text_token[text_last_greek_idx+1:]
270
+
271
+ result.append(text_token)
272
+
273
+ return "".join(result)
274
+
275
+
276
  def _consume_word_alignment(
277
  aligned: List[Tuple[str, int]],
278
  start_idx: int,
 
302
  return aligned[start_idx:end_idx], end_idx
303
 
304
 
305
+ def _render_plain_line_per_word(line: str, model_id: str) -> str:
306
+ """Render plain line by processing each word separately."""
307
  line_for_matching = line.replace("ς", "σ")
308
  parts = re.findall(r"\S+|\s+", line)
309
  parts_for_matching = re.findall(r"\S+|\s+", line_for_matching)
310
  out_parts: List[str] = []
 
311
 
312
  for part, part_for_matching in zip(parts, parts_for_matching):
313
  if part_for_matching.isspace():
314
+ # Preserve original spacing exactly.
315
+ out_parts.append(part_for_matching)
316
+ continue
317
+
318
+ # Classify this word independently
319
+ aligned = classify_line(part_for_matching, model_id)
320
+ if not aligned:
321
  out_parts.append(part_for_matching)
322
  continue
323
 
 
325
  expected_tokens = process_word(normalized_word)
326
  expected_syllables = syllabify_joined(expected_tokens)
327
 
328
+ taken, _ = _consume_word_alignment(aligned, 0, expected_syllables)
329
  if not taken:
330
  out_parts.append(part_for_matching)
331
  continue
 
334
  restored = _restore_expanded_word(marked, part)
335
  out_parts.append(restored)
336
 
337
+ result = "".join(out_parts)
338
+ # Post-process: convert word-final σ to ς for readability
339
+ result = _convert_final_sigmas(result)
340
+ return result
341
 
342
+
343
+ def _convert_final_sigmas(text: str) -> str:
344
+ """Convert word-final σ to ς (final sigma) for readability."""
345
+ # Find all words (sequences of non-space characters that include Greek letters)
346
+ def convert_word(match):
347
+ word = match.group(0)
348
+ # Find the last Greek letter in the word
349
+ for i in range(len(word) - 1, -1, -1):
350
+ ch = word[i]
351
+ if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
352
+ # Found the last Greek letter
353
+ if ch == "σ":
354
+ # Convert medial sigma to final sigma
355
+ return word[:i] + "ς" + word[i+1:]
356
+ break
357
+ return word
358
+
359
+ # Replace word-final σ with ς
360
+ return re.sub(r"\S+", convert_word, text)
361
 
362
 
363
  def render_results(text: str, model_label: str):
 
373
  for idx, line in enumerate(lines, start=1):
374
  aligned = classify_line(line, model_id)
375
  chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
376
+ plain_line = _render_plain_line_per_word(line, model_id)
377
 
378
  cards.append(
379
  f"""
pytest.ini ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [pytest]
2
+ testpaths = test_markup.py
3
+ python_files = test_*.py
4
+ python_classes = Test*
5
+ python_functions = test_*
6
+
7
+ # Output options
8
+ addopts = -v --tb=short
9
+
10
+ # Markers
11
+ markers =
12
+ markup: Tests for markup output preservation
test_markup.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test that plain output preserves input exactly except for markup.
3
+ Invariant: output.replace("^", "").replace("_", "") == input
4
+ """
5
+
6
+ import pytest
7
+ from app import _render_plain_line_per_word, DEFAULT_MODEL_ID
8
+
9
+
10
+ def strip_markup(text: str) -> str:
11
+ """Remove markup characters from text."""
12
+ return text.replace("^", "").replace("_", "")
13
+
14
+
15
+ # Test cases with Greek lines
16
+ TEST_CASES = [
17
+ # Basic simple word
18
+ "νεανίας",
19
+ # Multiple words with spaces
20
+ "νεανίας ἀάατός ἐστιν",
21
+ # With final sigma
22
+ "καὶ καλός",
23
+ # Multiple spaces
24
+ "καλὰ μὲν",
25
+ # Single letter
26
+ "ἢ",
27
+ # Word with punctuation preserved
28
+ "τυφλὸς ἤ",
29
+ # Multi-word with accents
30
+ "Ἀτρεΐδαι τε καὶ ἄλλοι",
31
+ # Longer passage
32
+ "νεανίας ἀάατός ἐστιν καὶ καλός",
33
+ ]
34
+
35
+
36
+ @pytest.mark.parametrize("input_line", TEST_CASES)
37
+ def test_plain_output_preserves_input_without_markup(input_line):
38
+ """
39
+ Test that the plain output is identical to input after removing markup.
40
+
41
+ The invariant is: output.replace("^", "").replace("_", "") == input
42
+ This ensures:
43
+ - All original characters are preserved
44
+ - Spaces are preserved exactly
45
+ - Final sigmas are preserved
46
+ - Only the markup (^ and _) are added
47
+ """
48
+ # Get the rendered plain line
49
+ plain_output = _render_plain_line_per_word(input_line, DEFAULT_MODEL_ID)
50
+
51
+ # Strip markup from the output to get back the base text
52
+ output_without_markup = strip_markup(plain_output)
53
+
54
+ # The expected result: input with no markup
55
+ # (We don't normalize final sigma - we preserve exactly what was in the input)
56
+ input_expected = input_line
57
+
58
+ print(f"\nInput: {repr(input_line)}")
59
+ print(f"Output: {repr(plain_output)}")
60
+ print(f"Output without markup: {repr(output_without_markup)}")
61
+ print(f"Expected: {repr(input_expected)}")
62
+
63
+ # The core assertion: output without markup should match input
64
+ assert output_without_markup == input_expected, (
65
+ f"Output without markup doesn't match input.\n"
66
+ f"Expected: {repr(input_expected)}\n"
67
+ f"Got: {repr(output_without_markup)}"
68
+ )
69
+
70
+
71
+ if __name__ == "__main__":
72
+ pytest.main([__file__, "-v", "-s"])