Lior-0618 Claude Sonnet 4.6 committed on
Commit
ca55dbc
·
1 Parent(s): 4c097b5

fix: distribute text by sentence boundaries instead of character proportion

Browse files

Split transcript at punctuation marks before assigning to VAD segments,
so sentences are never cut mid-phrase. Falls back to character-level for
text without punctuation.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. model/voxtral-server/main.py +39 -26
model/voxtral-server/main.py CHANGED
@@ -259,44 +259,57 @@ def _segments_from_vad(audio: np.ndarray, sr: int) -> tuple[list[dict], str]:
259
  return segs, "vad"
260
 
261
 
262
- def _tokenize_text(text: str) -> list[str]:
263
- """Split text into tokens. For CJK text (no spaces), split by character.
264
- For space-separated languages, split by whitespace."""
265
- tokens = text.split()
266
- # If no spaces found (e.g. Chinese/Japanese), split by character instead
267
- if len(tokens) <= 1 and len(text) > 1:
268
- return list(text)
269
- return tokens
270
 
271
 
272
def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
    """Proportionally distribute transcription tokens across segments by duration."""
    tokens = _tokenize_text(full_text)
    if not tokens or not segs:
        return [{**s, "text": ""} for s in segs]

    total_dur = sum(s["end"] - s["start"] for s in segs)
    if total_dur <= 0:
        # Degenerate timing: put the whole transcript on the first segment.
        return [{**segs[0], "text": full_text}] + [{**s, "text": ""} for s in segs[1:]]

    # A transcript with no whitespace (CJK heuristic) is re-joined without spaces.
    joiner = "" if (len(full_text.split()) <= 1 and len(full_text) > 1) else " "

    total_tokens = len(tokens)
    last = len(segs) - 1
    out: list[dict] = []
    cursor = 0
    for idx, seg in enumerate(segs):
        if idx == last:
            # Last segment absorbs whatever rounding left over.
            piece = tokens[cursor:]
        else:
            # Each segment gets at least one token, sized by its share of total time.
            share = round((seg["end"] - seg["start"]) / total_dur * total_tokens)
            piece = tokens[cursor:cursor + max(1, share)]
        out.append({**seg, "text": joiner.join(piece)})
        cursor += len(piece)
    return out
 
 
 
 
300
 
301
 
302
  # ─── Emotion analysis ──────────────────────────────────────────────────────────
 
259
  return segs, "vad"
260
 
261
 
262
+ def _split_sentences(text: str) -> list[str]:
263
+ """Split text into sentences at punctuation boundaries (CJK + Latin)."""
264
+ import re
265
+ parts = re.split(r'(?<=[οΌŸοΌγ€‚?!])\s*', text)
266
+ return [p for p in parts if p.strip()]
 
 
 
267
 
268
 
269
def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
    """Assign complete sentences to segments by time proportion.

    Sentences are never split mid-phrase: each segment receives whole
    sentences until the running count reaches the segment's cumulative
    share of total duration. Falls back to character-level (CJK) or
    word-level splitting when no sentence boundaries are found.

    Args:
        full_text: the whole transcript to distribute.
        segs: VAD segments, each a dict with numeric "start"/"end" keys.

    Returns:
        One dict per input segment — a copy with a "text" key added.
    """
    if not full_text or not segs:
        return [{**s, "text": ""} for s in segs]

    if len(segs) == 1:
        return [{**segs[0], "text": full_text}]

    sentences = _split_sentences(full_text)
    # Fallback: no sentence boundaries — split by character (CJK) or by word.
    if len(sentences) <= 1:
        is_cjk = len(full_text.split()) <= 1
        sentences = list(full_text) if is_cjk else full_text.split()

    total_dur = sum(s["end"] - s["start"] for s in segs)
    if total_dur <= 0:
        # Degenerate timing: give everything to the first segment.
        return [{**segs[0], "text": full_text}] + [{**s, "text": ""} for s in segs[1:]]

    # A transcript with no whitespace (CJK heuristic) is re-joined without spaces.
    is_cjk = len(full_text.split()) <= 1 and len(full_text) > 1
    sep = "" if is_cjk else " "

    # Walk segments in order, handing out sentences until the running total
    # reaches each segment's cumulative time share. A single `assigned`
    # counter replaces repeated sum() scans, keeping this O(sentences).
    n = len(sentences)
    result_texts: list[list[str]] = [[] for _ in segs]
    cumulative = 0.0
    assigned = 0
    for i, seg in enumerate(segs):
        cumulative += (seg["end"] - seg["start"]) / total_dur
        target = round(cumulative * n)
        while assigned < target and assigned < n:
            result_texts[i].append(sentences[assigned])
            assigned += 1

    # Any sentences missed by float rounding go to the last segment.
    result_texts[-1].extend(sentences[assigned:])

    return [{**seg, "text": sep.join(texts)} for seg, texts in zip(segs, result_texts)]
313
 
314
 
315
  # ─── Emotion analysis ──────────────────────────────────────────────────────────