unijoh committed on
Commit
81b6b5e
·
verified ·
1 Parent(s): 32695b0

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -13
app.py CHANGED
# ----------------------------
def split_sentences(text: str):
    """Split input into sentences.

    Prefer FO-Tokenizer sentence markers (BEGIN_SENT / END_SENT) when available;
    fall back to a simple regex split if FO-Tokenizer is missing or fails.

    Important detail: some FO-Tokenizer builds emit *whitespace* as "descriptor-only"
    tokens (empty `.txt`). If we simply join `.txt` pieces we can lose spaces and end
    up with merged words (e.g. `Núriggarkanska`). This function therefore:
      - preserves `.txt` pieces as-is (with newlines normalized to spaces)
      - converts descriptor-only whitespace-like tokens into a single space
      - inserts a best-effort space between tokens where whitespace is missing but
        clearly intended (word→word, or comma/semicolon/colon→word)

    Returns a list of non-empty sentence strings; [] for blank/empty input.
    """
    s = text or ""
    if not s.strip():
        return []

    def _norm(piece: str) -> str:
        # Embedded newlines would corrupt the joined sentence; flatten to spaces.
        return re.sub(r"[\r\n]+", " ", piece)

    def _append_piece(buf: list[str], piece: str) -> None:
        """Append `piece` to `buf`, inserting a space when one is clearly missing."""
        if not piece:
            return
        piece = _norm(piece)
        if not buf:
            buf.append(piece)
            return

        # If the buffer already ends with whitespace, just append.
        last = buf[-1]
        last_char = last[-1] if last else ""
        if last_char.isspace():
            buf.append(piece)
            return

        # Insert a space when the next piece begins with a letter/digit and the
        # previous piece ends with a letter/digit or one of ",;:" — this repairs
        # missing whitespace from tokenizers that drop their space tokens.
        if piece[0].isalnum() and (last_char.isalnum() or last_char in {",", ";", ":"}):
            buf.append(" ")

        buf.append(piece)

    if _HAS_FOTOKENIZER:
        try:
            toks = fo_tokenize(s)
            sents: list[str] = []
            cur: list[str] = []

            for tok in toks:
                if getattr(tok, "txt", None):
                    _append_piece(cur, tok.txt)
                    continue

                # Descriptor-only token (e.g., sentence boundary markers).
                descr = FO_TOK.descr.get(tok.kind, "").replace(" ", "_")

                if descr == "BEGIN_SENT":
                    # Flush anything we may have buffered (robustness for odd streams).
                    if cur:
                        sent = "".join(cur).strip()
                        if sent:
                            sents.append(sent)
                        cur = []
                    continue

                if descr == "END_SENT":
                    sent = "".join(cur).strip()
                    if sent:
                        sents.append(sent)
                    cur = []
                    continue

                # Best-effort: keep whitespace-like descriptor-only tokens as a
                # single space so adjacent words don't merge.
                up = descr.upper()
                if ("WHITESPACE" in up or "SPACE" in up or "TAB" in up
                        or "NEWLINE" in up or ("LINE" in up and "BREAK" in up)):
                    _append_piece(cur, " ")
                elif up == "DASH":
                    _append_piece(cur, "-")
                # Any other descriptor-only token is intentionally ignored.

            # Flush a trailing sentence not closed by an END_SENT marker.
            if cur:
                sent = "".join(cur).strip()
                if sent:
                    sents.append(sent)

            # If fotokenizer didn't yield any markers, treat input as one sentence.
            return sents or [s.strip()]
        except Exception:
            # Tokenizer failed — fall through to the regex fallback below.
            pass

    # Fallback: split on end punctuation followed by whitespace.
    parts = re.split(r"(?<=[.!?])\s+", s.strip())
    return [p.strip() for p in parts if p.strip()]
267
 
268