mazesmazes committed on
Commit
4382c4b
·
verified ·
1 Parent(s): d18d57f

Update custom model files, README, and requirements

Browse files
Files changed (1) hide show
  1. asr_pipeline.py +23 -18
asr_pipeline.py CHANGED
@@ -489,23 +489,28 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
489
  text = text.lower()
490
 
491
  # 2. REMOVE REPETITIVE LOOPS
492
- # If the model repeats the same phrase more than twice, remove all repetitions.
493
  words = text.split()
494
- if len(words) > 10:
495
- # Check for repeating n-grams (1 to 5 words long)
496
- for n in range(1, 6):
497
- last_sequence = words[-n:]
498
- repeat_count = 0
499
- idx = len(words) - n
500
- while idx >= n and words[idx - n : idx] == last_sequence:
501
- repeat_count += 1
502
- idx -= n
503
-
504
- # If more than 2 exact repetitions at the end, remove all of them
505
- if repeat_count > 2:
506
- words = words[:idx]
507
- text = " ".join(words)
508
- break
509
-
510
- # 3. STRIP WHITESPACE
 
 
 
 
 
511
  return re.sub(r"\s+", " ", text).strip()
 
489
  text = text.lower()
490
 
491
  # 2. REMOVE REPETITIVE LOOPS
492
+ # If the model repeats the same phrase, keep only one instance.
493
  words = text.split()
494
+ for n in range(1, min(6, len(words) // 2 + 1)):
495
+ last_sequence = words[-n:]
496
+ repeat_count = 0
497
+ idx = len(words) - n
498
+ while idx >= n and words[idx - n : idx] == last_sequence:
499
+ repeat_count += 1
500
+ idx -= n
501
+
502
+ if repeat_count >= 1:
503
+ words = words[: idx + n]
504
+ text = " ".join(words)
505
+ break
506
+
507
+ # 3. COMBINE ACRONYMS
508
+ # Merge consecutive single letters into one word (e.g., "u s a" -> "usa")
509
+ text = re.sub(r"\b([a-z])((?:\s+[a-z])+)\b", lambda m: m.group(0).replace(" ", ""), text)
510
+
511
+ # 4. NORMALIZE CURRENCY
512
+ # Convert "eur X" to "X euros" for Whisper normalizer compatibility
513
+ text = re.sub(r"\beur\s+(\d+)", r"\1 euros", text)
514
+
515
+ # 5. STRIP WHITESPACE
516
  return re.sub(r"\s+", " ", text).strip()