unijoh committed on
Commit
81b6b5e
·
verified ·
1 Parent(s): 32695b0

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -13
app.py CHANGED
# ----------------------------
def split_sentences(text: str):
    """Split input into sentences.

    Prefer FO-Tokenizer sentence markers (BEGIN_SENT / END_SENT) when available;
    fall back to a simple regex split if FO-Tokenizer is missing or fails.

    Important detail: some FO-Tokenizer builds emit *whitespace* as "descriptor-only"
    tokens (empty `.txt`). If we simply join `.txt` pieces we can lose spaces and end
    up with merged words (e.g. `Núriggarkanska`). This function therefore:
      - preserves `.txt` pieces as-is (with newlines normalized to spaces)
      - converts descriptor-only whitespace-like tokens into a single space
      - inserts a best-effort space between tokens where whitespace is missing but
        clearly intended (word→word, or comma/semicolon/colon→word)

    Returns a list of non-empty sentence strings; [] for blank/empty input.
    """
    s = text or ""
    if not s.strip():
        return []

    def _norm(piece: str) -> str:
        # Embedded newlines would corrupt the joined sentence; flatten to spaces.
        return re.sub(r"[\r\n]+", " ", piece)

    def _append_piece(buf: list[str], piece: str) -> None:
        """Append `piece` to `buf`, inserting a space when one is clearly missing."""
        if not piece:
            return
        piece = _norm(piece)
        if not buf:
            buf.append(piece)
            return

        # If the buffer already ends with whitespace, just append.
        last = buf[-1]
        last_char = last[-1] if last else ""
        if last_char.isspace():
            buf.append(piece)
            return

        # Insert a space when the next piece begins with a letter/digit and the
        # previous piece ends with a letter/digit or one of ",;:" — this repairs
        # missing whitespace from tokenizers that drop their space tokens.
        if piece[0].isalnum() and (last_char.isalnum() or last_char in {",", ";", ":"}):
            buf.append(" ")

        buf.append(piece)

    if _HAS_FOTOKENIZER:
        try:
            toks = fo_tokenize(s)
            sents: list[str] = []
            cur: list[str] = []

            for tok in toks:
                if getattr(tok, "txt", None):
                    _append_piece(cur, tok.txt)
                    continue

                # Descriptor-only token (e.g., sentence boundary markers).
                descr = FO_TOK.descr.get(tok.kind, "").replace(" ", "_")

                if descr == "BEGIN_SENT":
                    # Flush anything we may have buffered (robustness for odd streams).
                    if cur:
                        sent = "".join(cur).strip()
                        if sent:
                            sents.append(sent)
                        cur = []
                    continue

                if descr == "END_SENT":
                    sent = "".join(cur).strip()
                    if sent:
                        sents.append(sent)
                    cur = []
                    continue

                # Best-effort: keep whitespace-like descriptor-only tokens as a
                # single space so adjacent words don't merge.
                up = descr.upper()
                if ("WHITESPACE" in up or "SPACE" in up or "TAB" in up
                        or "NEWLINE" in up or ("LINE" in up and "BREAK" in up)):
                    _append_piece(cur, " ")
                elif up == "DASH":
                    _append_piece(cur, "-")
                # Any other descriptor-only token is intentionally ignored.

            # Flush a trailing sentence not closed by an END_SENT marker.
            if cur:
                sent = "".join(cur).strip()
                if sent:
                    sents.append(sent)

            # If fotokenizer didn't yield any markers, treat input as one sentence.
            return sents or [s.strip()]
        except Exception:
            # Tokenizer failed — fall through to the regex fallback below.
            pass

    # Fallback: split on end punctuation followed by whitespace.
    parts = re.split(r"(?<=[.!?])\s+", s.strip())
    return [p.strip() for p in parts if p.strip()]
267
 
268