Spaces:
Sleeping
Sleeping
Simplify sigma normalization and final-sigma rendering
Browse files
app.py
CHANGED
|
@@ -7,7 +7,7 @@ import torch
|
|
| 7 |
import torch.nn.functional as F
|
| 8 |
from transformers import AutoModelForTokenClassification, AutoTokenizer
|
| 9 |
|
| 10 |
-
from grc_utils import lower_grc, normalize_word, heavy, vowel
|
| 11 |
|
| 12 |
from syllabify import syllabify_joined
|
| 13 |
from preprocess import process_word
|
|
@@ -136,6 +136,101 @@ def _mark_syllable_plain(syllable: str, label_id: int) -> str:
|
|
| 136 |
return syllable + marker
|
| 137 |
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
def render_results(text: str, model_label: str):
|
| 140 |
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
| 141 |
if not lines:
|
|
@@ -149,7 +244,7 @@ def render_results(text: str, model_label: str):
|
|
| 149 |
for idx, line in enumerate(lines, start=1):
|
| 150 |
aligned = classify_line(line, model_id)
|
| 151 |
chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
|
| 152 |
-
|
| 153 |
|
| 154 |
cards.append(
|
| 155 |
f"""
|
|
@@ -162,7 +257,7 @@ def render_results(text: str, model_label: str):
|
|
| 162 |
)
|
| 163 |
|
| 164 |
export_lines.append(f"Line {idx}: {line}")
|
| 165 |
-
export_lines.append(" "
|
| 166 |
|
| 167 |
html_result = (
|
| 168 |
"<div class='legend'><span class='dot long'></span>Long"
|
|
|
|
| 7 |
import torch.nn.functional as F
|
| 8 |
from transformers import AutoModelForTokenClassification, AutoTokenizer
|
| 9 |
|
| 10 |
+
from grc_utils import lower_grc, normalize_word, heavy, vowel, only_bases
|
| 11 |
|
| 12 |
from syllabify import syllabify_joined
|
| 13 |
from preprocess import process_word
|
|
|
|
| 136 |
return syllable + marker
|
| 137 |
|
| 138 |
|
| 139 |
+
def _to_final_sigma(text: str) -> str:
|
| 140 |
+
# Step 3: in rendered output, only word-final sigmas become final-sigma.
|
| 141 |
+
def _convert_word(token: str) -> str:
|
| 142 |
+
if not token.strip():
|
| 143 |
+
return token
|
| 144 |
+
|
| 145 |
+
chars = list(token)
|
| 146 |
+
last_greek_idx = -1
|
| 147 |
+
for i, ch in enumerate(chars):
|
| 148 |
+
if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
|
| 149 |
+
last_greek_idx = i
|
| 150 |
+
|
| 151 |
+
if last_greek_idx != -1 and chars[last_greek_idx] == "σ":
|
| 152 |
+
chars[last_greek_idx] = "ς"
|
| 153 |
+
|
| 154 |
+
return "".join(chars)
|
| 155 |
+
|
| 156 |
+
return "".join(_convert_word(tok) for tok in re.findall(r"\S+|\s+", text))
|
| 157 |
+
|
| 158 |
+
|
| 159 |
def _restore_expanded_word(marked_word: str, reference_word: str) -> str:
    """Undo preprocessing expansions on a marked word before display.

    Collapses the expanded double-consonant digraphs back to single letters,
    re-applies rough-breathing rho when the normalized reference word carries
    one, then converts the word-final sigma for rendering.
    """
    collapsed = marked_word
    for digraph, letter in (("δσ", "ζ"), ("κσ", "ξ"), ("πσ", "ψ")):
        collapsed = collapsed.replace(digraph, letter)

    # Rough breathing only occurs word-initially, so patching the first
    # plain rho matches the reference form.
    if "ῥ" in lower_grc(normalize_word(reference_word)):
        rho_at = collapsed.find("ρ")
        if rho_at != -1:
            collapsed = collapsed[:rho_at] + "ῥ" + collapsed[rho_at + 1 :]

    return _to_final_sigma(collapsed)
| 169 |
+
|
| 170 |
+
|
| 171 |
def _consume_word_alignment(
    aligned: List[Tuple[str, int]],
    start_idx: int,
    expected_syllables: List[str],
) -> Tuple[List[Tuple[str, int]], int]:
    """Take from ``aligned`` the syllable run covering one expected word.

    First tries to match by base characters: entries are consumed from
    ``start_idx`` until their concatenated bases equal the expected word's
    bases; the attempt is abandoned once the consumed bases overshoot the
    target without being a prefix match. If base matching fails, falls back
    to taking exactly one aligned entry per expected syllable.

    Returns the consumed ``(syllable, label)`` pairs and the next cursor.
    """
    total = len(aligned)
    if start_idx >= total:
        return [], start_idx

    target_bases = only_bases("".join(expected_syllables))
    if target_bases:
        gathered = ""
        idx = start_idx
        while idx < total:
            gathered += aligned[idx][0]
            seen_bases = only_bases(gathered)
            if seen_bases == target_bases:
                return aligned[start_idx : idx + 1], idx + 1
            # Overshot without even a prefix match — base matching failed.
            if len(seen_bases) > len(target_bases) and not seen_bases.startswith(target_bases):
                break
            idx += 1

    # Fallback: one aligned entry per expected syllable.
    syllable_count = len(expected_syllables)
    if syllable_count <= 0:
        return [], start_idx
    stop = min(total, start_idx + syllable_count)
    return aligned[start_idx:stop], stop
| 198 |
+
|
| 199 |
+
|
| 200 |
def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -> str:
    """Rebuild ``line`` with per-syllable quantity markers, spacing intact.

    Matching is done against a copy of the line whose final sigmas are
    normalized to medial sigma (step 1); whitespace runs are copied through
    verbatim (step 2). Words that cannot be matched against ``aligned`` are
    emitted unmarked, and any unconsumed aligned syllables are appended at
    the end.
    """
    # Step 1: normalize input final sigma to medial sigma for matching only.
    matchable_line = line.replace("ς", "σ")
    raw_tokens = re.findall(r"\S+|\s+", line)
    match_tokens = re.findall(r"\S+|\s+", matchable_line)

    rendered: List[str] = []
    cursor = 0

    for raw_tok, match_tok in zip(raw_tokens, match_tokens):
        if match_tok.isspace():
            # Step 2: preserve original spacing exactly.
            rendered.append(match_tok)
            continue

        normalized = lower_grc(normalize_word(match_tok)).replace("ς", "σ")
        expected = syllabify_joined(process_word(normalized))

        consumed, cursor = _consume_word_alignment(aligned, cursor, expected)
        if not consumed:
            # No alignment for this word — emit it unmarked.
            rendered.append(match_tok)
            continue

        marked = "".join(_mark_syllable_plain(syl, lab) for syl, lab in consumed)
        rendered.append(_restore_expanded_word(marked, raw_tok))

    if cursor < len(aligned):
        # Leftover syllables the word matching never claimed.
        leftover = "".join(_mark_syllable_plain(syl, lab) for syl, lab in aligned[cursor:])
        rendered.append(_to_final_sigma(leftover))

    return "".join(rendered)
| 232 |
+
|
| 233 |
+
|
| 234 |
def render_results(text: str, model_label: str):
|
| 235 |
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
| 236 |
if not lines:
|
|
|
|
| 244 |
for idx, line in enumerate(lines, start=1):
|
| 245 |
aligned = classify_line(line, model_id)
|
| 246 |
chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
|
| 247 |
+
plain_line = _render_plain_line_with_spacing(line, aligned)
|
| 248 |
|
| 249 |
cards.append(
|
| 250 |
f"""
|
|
|
|
| 257 |
)
|
| 258 |
|
| 259 |
export_lines.append(f"Line {idx}: {line}")
|
| 260 |
+
export_lines.append(f" {plain_line}" if plain_line else " (no syllables found)")
|
| 261 |
|
| 262 |
html_result = (
|
| 263 |
"<div class='legend'><span class='dot long'></span>Long"
|