al1808th committed on
Commit
dcbc7bd
·
1 Parent(s): c694732

Simplify sigma normalization and final-sigma rendering

Browse files
Files changed (1) hide show
  1. app.py +98 -3
app.py CHANGED
@@ -7,7 +7,7 @@ import torch
7
  import torch.nn.functional as F
8
  from transformers import AutoModelForTokenClassification, AutoTokenizer
9
 
10
- from grc_utils import lower_grc, normalize_word, heavy, vowel
11
 
12
  from syllabify import syllabify_joined
13
  from preprocess import process_word
@@ -136,6 +136,101 @@ def _mark_syllable_plain(syllable: str, label_id: int) -> str:
136
  return syllable + marker
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  def render_results(text: str, model_label: str):
140
  lines = [line.strip() for line in text.splitlines() if line.strip()]
141
  if not lines:
@@ -149,7 +244,7 @@ def render_results(text: str, model_label: str):
149
  for idx, line in enumerate(lines, start=1):
150
  aligned = classify_line(line, model_id)
151
  chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
152
- plain_marked = [_mark_syllable_plain(syl, label) for syl, label in aligned]
153
 
154
  cards.append(
155
  f"""
@@ -162,7 +257,7 @@ def render_results(text: str, model_label: str):
162
  )
163
 
164
  export_lines.append(f"Line {idx}: {line}")
165
- export_lines.append(" " + " ".join(plain_marked) if plain_marked else " (no syllables found)")
166
 
167
  html_result = (
168
  "<div class='legend'><span class='dot long'></span>Long"
 
7
  import torch.nn.functional as F
8
  from transformers import AutoModelForTokenClassification, AutoTokenizer
9
 
10
+ from grc_utils import lower_grc, normalize_word, heavy, vowel, only_bases
11
 
12
  from syllabify import syllabify_joined
13
  from preprocess import process_word
 
136
  return syllable + marker
137
 
138
 
139
+ def _to_final_sigma(text: str) -> str:
140
+ # Step 3: in rendered output, only word-final sigmas become final-sigma.
141
+ def _convert_word(token: str) -> str:
142
+ if not token.strip():
143
+ return token
144
+
145
+ chars = list(token)
146
+ last_greek_idx = -1
147
+ for i, ch in enumerate(chars):
148
+ if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
149
+ last_greek_idx = i
150
+
151
+ if last_greek_idx != -1 and chars[last_greek_idx] == "σ":
152
+ chars[last_greek_idx] = "ς"
153
+
154
+ return "".join(chars)
155
+
156
+ return "".join(_convert_word(tok) for tok in re.findall(r"\S+|\s+", text))
157
+
158
+
159
def _restore_expanded_word(marked_word: str, reference_word: str) -> str:
    """Undo preprocessing expansions in *marked_word* for display.

    Collapses the expanded digraphs back to their single letters (δσ→ζ,
    κσ→ξ, πσ→ψ), restores a rough-breathing rho (ῥ) when the normalized
    *reference_word* contains one, and finally converts any word-final
    medial sigma to final sigma.
    """
    collapsed = marked_word
    for expanded, single in (("δσ", "ζ"), ("κσ", "ξ"), ("πσ", "ψ")):
        collapsed = collapsed.replace(expanded, single)

    # The expansion step strips the breathing from ῥ; put it back on the
    # first plain rho when the reference word shows one was there.
    if "ῥ" in lower_grc(normalize_word(reference_word)):
        idx = collapsed.find("ρ")
        if idx >= 0:
            collapsed = collapsed[:idx] + "ῥ" + collapsed[idx + 1 :]

    return _to_final_sigma(collapsed)
169
+
170
+
171
def _consume_word_alignment(
    aligned: List[Tuple[str, int]],
    start_idx: int,
    expected_syllables: List[str],
) -> Tuple[List[Tuple[str, int]], int]:
    """Consume the aligned syllables belonging to one word.

    Starting at *start_idx*, takes entries from *aligned* until their base
    characters exactly match those of *expected_syllables*. If base-level
    matching fails (or the word has no base characters), falls back to
    taking one aligned entry per expected syllable.

    Returns the consumed (syllable, label) pairs and the next cursor index.
    """
    if start_idx >= len(aligned):
        return [], start_idx

    target = only_bases("".join(expected_syllables))
    if target:
        consumed: List[Tuple[str, int]] = []
        for pos in range(start_idx, len(aligned)):
            consumed.append(aligned[pos])
            seen = only_bases("".join(syl for syl, _ in consumed))
            if seen == target:
                return consumed, pos + 1
            # Overshot without matching the prefix: base matching cannot
            # succeed, so bail out to the count-based fallback.
            if len(seen) > len(target) and not seen.startswith(target):
                break

    fallback = len(expected_syllables)
    if fallback <= 0:
        return [], start_idx

    stop = min(len(aligned), start_idx + fallback)
    return aligned[start_idx:stop], stop
198
+
199
+
200
def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -> str:
    """Rebuild *line* word by word with scansion marks, preserving spacing.

    Each word is re-syllabified the same way the classifier saw it, the
    matching entries are consumed from *aligned*, marked, and restored to
    their display form. Whitespace runs are copied through untouched; any
    aligned syllables left over after all words are appended at the end.
    """
    # Flatten final sigmas for matching only — display uses the original text.
    matching_line = line.replace("ς", "σ")
    original_tokens = re.findall(r"\S+|\s+", line)
    matching_tokens = re.findall(r"\S+|\s+", matching_line)

    pieces: List[str] = []
    cursor = 0

    for original, candidate in zip(original_tokens, matching_tokens):
        if candidate.isspace():
            # Preserve the user's spacing exactly.
            pieces.append(candidate)
            continue

        normalized = lower_grc(normalize_word(candidate)).replace("ς", "σ")
        syllables = syllabify_joined(process_word(normalized))

        consumed, cursor = _consume_word_alignment(aligned, cursor, syllables)
        if not consumed:
            # No alignment found for this word; emit it unmarked.
            pieces.append(candidate)
            continue

        marked = "".join(_mark_syllable_plain(syl, lbl) for syl, lbl in consumed)
        pieces.append(_restore_expanded_word(marked, original))

    if cursor < len(aligned):
        leftover = "".join(
            _mark_syllable_plain(syl, lbl) for syl, lbl in aligned[cursor:]
        )
        pieces.append(_to_final_sigma(leftover))

    return "".join(pieces)
232
+
233
+
234
  def render_results(text: str, model_label: str):
235
  lines = [line.strip() for line in text.splitlines() if line.strip()]
236
  if not lines:
 
244
  for idx, line in enumerate(lines, start=1):
245
  aligned = classify_line(line, model_id)
246
  chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
247
+ plain_line = _render_plain_line_with_spacing(line, aligned)
248
 
249
  cards.append(
250
  f"""
 
257
  )
258
 
259
  export_lines.append(f"Line {idx}: {line}")
260
+ export_lines.append(f" {plain_line}" if plain_line else " (no syllables found)")
261
 
262
  html_result = (
263
  "<div class='legend'><span class='dot long'></span>Long"