Urdatorn committed on
Commit
7caffd3
·
1 Parent(s): 3d9fa99

markup bug fix

Browse files
=0.25.1 ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Requirement already satisfied: transformers in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (4.45.2)
2
+ Requirement already satisfied: huggingface-hub in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.24.7)
3
+ Requirement already satisfied: filelock in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers) (3.16.1)
4
+ Requirement already satisfied: numpy>=1.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers) (1.26.4)
5
+ Requirement already satisfied: packaging>=20.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers) (24.0)
6
+ Requirement already satisfied: pyyaml>=5.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers) (6.0.1)
7
+ Requirement already satisfied: regex!=2019.12.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers) (2026.5.9)
8
+ Requirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers) (2.32.3)
9
+ Requirement already satisfied: safetensors>=0.4.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers) (0.4.5)
10
+ Requirement already satisfied: tokenizers<0.21,>=0.20 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers) (0.20.3)
11
+ Requirement already satisfied: tqdm>=4.27 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers) (4.66.5)
12
+ Requirement already satisfied: fsspec>=2023.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub) (2024.6.1)
13
+ Requirement already satisfied: typing-extensions>=3.7.4.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub) (4.11.0)
14
+ Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->transformers) (3.3.2)
15
+ Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->transformers) (3.6)
16
+ Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->transformers) (2.2.1)
17
+ Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->transformers) (2024.2.2)
18
+
19
+ [notice] A new release of pip is available: 24.3.1 -> 26.1.2
20
+ [notice] To update, run: pip3 install --upgrade pip
=5.0 ADDED
File without changes
__pycache__/app.cpython-313.pyc CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
 
app.py CHANGED
@@ -151,10 +151,32 @@ def _mark_syllable_plain(syllable: str, label_id: int) -> str:
151
  marker = "_" if label_id == 1 else "^"
152
  chars = list(syllable)
153
 
 
 
154
  for i in range(len(chars) - 1, -1, -1):
155
  if vowel(chars[i]):
156
- return "".join(chars[: i + 1]) + marker + "".join(chars[i + 1 :])
157
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  return syllable + marker
159
 
160
 
 
151
  marker = "_" if label_id == 1 else "^"
152
  chars = list(syllable)
153
 
154
+ # Find the last vowel, skipping trailing non-letter characters (like punctuation)
155
+ vowel_idx = -1
156
  for i in range(len(chars) - 1, -1, -1):
157
  if vowel(chars[i]):
158
+ vowel_idx = i
159
+ break
160
+ # Not a vowel: keep scanning leftward past consonants, markup characters, and punctuation
161
+ ch = chars[i]
162
+ if ch not in "^_" and ("\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff"):
163
+ # It's a Greek letter but not a vowel
164
+ continue
165
+ if not ("\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff") and ch not in "^_":
166
+ # It's not a Greek letter or markup - it's punctuation or other, skip
167
+ continue
168
+
169
+ if vowel_idx >= 0:
170
+ # Found a vowel, insert marker after it
171
+ return "".join(chars[:vowel_idx + 1]) + marker + "".join(chars[vowel_idx + 1:])
172
+
173
+ # No vowel found, find the last Greek letter
174
+ for i in range(len(chars) - 1, -1, -1):
175
+ if "\u0370" <= chars[i] <= "\u03ff" or "\u1f00" <= chars[i] <= "\u1fff":
176
+ # Insert marker after the last Greek letter
177
+ return "".join(chars[:i + 1]) + marker + "".join(chars[i + 1:])
178
+
179
+ # No Greek letters found, append marker at the end
180
  return syllable + marker
181
 
182
 
pytest.ini CHANGED
@@ -1,5 +1,5 @@
1
  [pytest]
2
- testpaths = test_markup.py
3
  python_files = test_*.py
4
  python_classes = Test*
5
  python_functions = test_*
 
1
  [pytest]
2
+ testpaths = tests
3
  python_files = test_*.py
4
  python_classes = Test*
5
  python_functions = test_*
tests/test_bug_neaniasfix.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test for the νεανίας bug fix.
3
+
4
+ The bug was that the final alpha in νεανίας, which should be marked as long,
5
+ was left unmarked.
6
+
7
+ Additionally, punctuation handling was fixed so that marks are not placed
8
+ after punctuation characters.
9
+
10
+ Expected: νεα_νί^α_ς
11
+ Previous (buggy): νεα_νί^ας
12
+ """
13
+
14
+ import pytest
15
+ import sys
16
+ from pathlib import Path
17
+
18
+ # Add parent directory to path to import app module
19
+ sys.path.insert(0, str(Path(__file__).parent.parent))
20
+
21
+ from app import _render_plain_line_per_word
22
+
23
+
24
+ def test_neaniaste_final_alpha_marked():
25
+ """Test that the final alpha in νεανίας is correctly marked as long."""
26
+ # Use the mini model since it works correctly
27
+ model_id = "Ericu950/macronizer_mini"
28
+
29
+ input_text = "νεανίας"
30
+ output = _render_plain_line_per_word(input_text, model_id)
31
+
32
+ # The expected output should have the long mark on the final alpha before sigma
33
+ expected = "νεα_νί^α_ς"
34
+
35
+ assert output == expected, (
36
+ f"Output mismatch for '{input_text}'\n"
37
+ f"Expected: {repr(expected)}\n"
38
+ f"Got: {repr(output)}"
39
+ )
40
+
41
+
42
+ def test_neaniaste_in_full_sentence():
43
+ """Test νεανίας in the full sentence provided by user."""
44
+ model_id = "Ericu950/macronizer_mini"
45
+
46
+ input_text = "νεανίας ἀάατός ἐστιν καὶ καλός. τὰ παῖδες τὰ καλά"
47
+ output = _render_plain_line_per_word(input_text, model_id)
48
+
49
+ # The first word should be νεα_νί^α_ς
50
+ first_word = output.split()[0]
51
+ expected_first = "νεα_νί^α_ς"
52
+
53
+ assert first_word == expected_first, (
54
+ f"First word mismatch\n"
55
+ f"Expected: {repr(expected_first)}\n"
56
+ f"Got: {repr(first_word)}"
57
+ )
58
+
59
+
60
+ def test_punctuation_not_marked():
61
+ """Test that punctuation is preserved and marks are placed before punctuation."""
62
+ model_id = "Ericu950/macronizer_mini"
63
+
64
+ # Word with punctuation
65
+ input_text = "καλός."
66
+ output = _render_plain_line_per_word(input_text, model_id)
67
+
68
+ # The period should be preserved
69
+ assert "." in output, f"Period not preserved in output: {repr(output)}"
70
+
71
+ # The mark should be placed before the period, not forming "σ._"
72
+ # The output should have the period at the very end
73
+ assert output.endswith("."), f"Period should be at the end: {repr(output)}"
74
+
75
+
76
+
77
+ if __name__ == "__main__":
78
+ pytest.main([__file__, "-v", "-s"])
79
+
test_markup.py → tests/test_markup.py RENAMED
@@ -4,6 +4,12 @@ Output should equal: input.replace("^", "").replace("_", "")
4
  """
5
 
6
  import pytest
 
 
 
 
 
 
7
  from app import _render_plain_line_per_word, DEFAULT_MODEL_ID
8
 
9
 
 
4
  """
5
 
6
  import pytest
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ # Add parent directory to path to import app module
11
+ sys.path.insert(0, str(Path(__file__).parent.parent))
12
+
13
  from app import _render_plain_line_per_word, DEFAULT_MODEL_ID
14
 
15