Upload app.py
Browse files
app.py
CHANGED
|
@@ -165,38 +165,89 @@ def simp_tok(sentence: str):
|
|
| 165 |
# ----------------------------
|
| 166 |
def split_sentences(text: str):
|
| 167 |
"""Split input into sentences.
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
"""
|
| 171 |
-
|
| 172 |
-
|
|
|
|
| 173 |
return []
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
if _HAS_FOTOKENIZER:
|
| 176 |
try:
|
| 177 |
toks = fo_tokenize(s)
|
| 178 |
-
sents = []
|
| 179 |
-
cur = []
|
|
|
|
| 180 |
for tok in toks:
|
| 181 |
-
if tok
|
| 182 |
-
|
| 183 |
continue
|
| 184 |
|
| 185 |
# Descriptor-only token (e.g., sentence boundary markers)
|
| 186 |
descr = FO_TOK.descr.get(tok.kind, "").replace(" ", "_")
|
|
|
|
| 187 |
if descr == "BEGIN_SENT":
|
|
|
|
| 188 |
if cur:
|
| 189 |
sent = "".join(cur).strip()
|
| 190 |
if sent:
|
| 191 |
sents.append(sent)
|
| 192 |
cur = []
|
| 193 |
-
|
|
|
|
|
|
|
| 194 |
sent = "".join(cur).strip()
|
| 195 |
if sent:
|
| 196 |
sents.append(sent)
|
| 197 |
cur = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
else:
|
| 199 |
-
# Ignore other descriptor-only tokens
|
| 200 |
pass
|
| 201 |
|
| 202 |
if cur:
|
|
@@ -204,14 +255,14 @@ def split_sentences(text: str):
|
|
| 204 |
if sent:
|
| 205 |
sents.append(sent)
|
| 206 |
|
| 207 |
-
# If fotokenizer didn't yield markers, treat as one sentence.
|
| 208 |
-
return sents or [s]
|
| 209 |
except Exception:
|
| 210 |
# We'll fall back below
|
| 211 |
pass
|
| 212 |
|
| 213 |
# Fallback: split on end punctuation followed by whitespace.
|
| 214 |
-
parts = re.split(r"(?<=[.!?])\s+", s)
|
| 215 |
return [p.strip() for p in parts if p.strip()]
|
| 216 |
|
| 217 |
|
|
|
|
# ----------------------------
def split_sentences(text: str):
    """Split input text into a list of sentences.

    Uses FO-Tokenizer sentence markers (BEGIN_SENT / END_SENT) when the
    tokenizer is available; otherwise falls back to a regex split on end
    punctuation followed by whitespace.

    Important detail: some FO-Tokenizer builds emit *whitespace* as
    "descriptor-only" tokens (empty `.txt`). Naively joining `.txt` pieces
    can then lose spaces and merge words (e.g. `Núriggarkanska`). This
    function therefore:
      - preserves `.txt` pieces as-is (newline runs normalized to spaces)
      - converts descriptor-only whitespace-like tokens into a single space
      - inserts a best-effort space where whitespace is clearly missing
        but intended (word→word, or ",;:"→word)

    Args:
        text: Raw input text; may be None or empty.

    Returns:
        List of non-empty, stripped sentence strings; [] for blank input.
    """
    s = text or ""
    if not s.strip():
        return []

    def _norm(piece: str) -> str:
        # Collapse CR/LF runs to a single space so joined sentences never
        # embed raw newlines.
        return re.sub(r"[\r\n]+", " ", piece)

    def _append_piece(buf: list[str], piece: str) -> None:
        # Append `piece` to `buf`, inserting a separating space when the
        # join would otherwise fuse tokens (fixes tokenizer builds that
        # drop whitespace between tokens).
        if not piece:
            return
        piece = _norm(piece)
        if buf:
            last = buf[-1]
            last_char = last[-1] if last else ""
            # Insert a space only when the buffer does not already end in
            # whitespace and the join would fuse word→word or ",;:"→word.
            if (
                not last_char.isspace()
                and piece[0].isalnum()
                and (last_char.isalnum() or last_char in {",", ";", ":"})
            ):
                buf.append(" ")
        buf.append(piece)

    def _flush(buf: list[str], out: list[str]) -> list[str]:
        # Emit the buffered sentence (if non-blank) and return a fresh buffer.
        sent = "".join(buf).strip()
        if sent:
            out.append(sent)
        return []

    if _HAS_FOTOKENIZER:
        try:
            sents: list[str] = []
            cur: list[str] = []

            for tok in fo_tokenize(s):
                if getattr(tok, "txt", None):
                    _append_piece(cur, tok.txt)
                    continue

                # Descriptor-only token (e.g., sentence boundary markers).
                descr = FO_TOK.descr.get(tok.kind, "").replace(" ", "_")

                if descr == "BEGIN_SENT":
                    # Flush anything we may have buffered (robustness for
                    # odd streams that omit END_SENT).
                    if cur:
                        cur = _flush(cur, sents)
                    continue

                if descr == "END_SENT":
                    cur = _flush(cur, sents)
                    continue

                # Best-effort: keep whitespace-like descriptor-only tokens
                # as a single space and a descriptor-only dash as "-".
                # Every other descriptor-only token is deliberately ignored.
                up = descr.upper()
                if (
                    "WHITESPACE" in up
                    or "SPACE" in up
                    or "TAB" in up
                    or "NEWLINE" in up
                    or ("LINE" in up and "BREAK" in up)
                ):
                    _append_piece(cur, " ")
                elif up == "DASH":
                    _append_piece(cur, "-")

            if cur:
                _flush(cur, sents)

            # If fotokenizer didn't yield any markers, treat as one sentence.
            return sents or [s.strip()]
        except Exception:
            # Tokenizer failed on this input — fall through to regex split.
            pass

    # Fallback: split on end punctuation followed by whitespace.
    parts = re.split(r"(?<=[.!?])\s+", s.strip())
    return [p.strip() for p in parts if p.strip()]