mazesmazes committed on
Commit
2580a47
·
verified ·
1 Parent(s): e864225

Training in progress - step 12000

Browse files
Files changed (1) hide show
  1. asr_pipeline.py +18 -23
asr_pipeline.py CHANGED
@@ -489,28 +489,23 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
489
  text = text.lower()
490
 
491
  # 2. REMOVE REPETITIVE LOOPS
492
- # If the model repeats the same phrase, keep only one instance.
493
  words = text.split()
494
- for n in range(1, min(6, len(words) // 2 + 1)):
495
- last_sequence = words[-n:]
496
- repeat_count = 0
497
- idx = len(words) - n
498
- while idx >= n and words[idx - n : idx] == last_sequence:
499
- repeat_count += 1
500
- idx -= n
501
-
502
- if repeat_count >= 1:
503
- words = words[: idx + n]
504
- text = " ".join(words)
505
- break
506
-
507
- # 3. COMBINE ACRONYMS
508
- # Merge consecutive single letters into one word (e.g., "u s a" -> "usa")
509
- text = re.sub(r"\b([a-z])((?:\s+[a-z])+)\b", lambda m: m.group(0).replace(" ", ""), text)
510
-
511
- # 4. NORMALIZE CURRENCY
512
- # Convert "eur X" to "X euros" for Whisper normalizer compatibility
513
- text = re.sub(r"\beur\s+(\d+)", r"\1 euros", text)
514
-
515
- # 5. STRIP WHITESPACE
516
  return re.sub(r"\s+", " ", text).strip()
 
489
  text = text.lower()
490
 
491
  # 2. REMOVE REPETITIVE LOOPS
492
+ # If the model repeats the same phrase more than twice, remove all repetitions.
493
  words = text.split()
494
+ if len(words) > 10:
495
+ # Check for repeating n-grams (1 to 5 words long)
496
+ for n in range(1, 6):
497
+ last_sequence = words[-n:]
498
+ repeat_count = 0
499
+ idx = len(words) - n
500
+ while idx >= n and words[idx - n : idx] == last_sequence:
501
+ repeat_count += 1
502
+ idx -= n
503
+
504
+ # If more than 2 exact repetitions at the end, remove all of them
505
+ if repeat_count > 2:
506
+ words = words[:idx]
507
+ text = " ".join(words)
508
+ break
509
+
510
+ # 3. STRIP WHITESPACE
 
 
 
 
 
511
  return re.sub(r"\s+", " ", text).strip()