mazesmazes committed on
Commit
fe23936
·
verified ·
1 Parent(s): d0f55fa

Training in progress - step 500

Browse files
Files changed (2) hide show
  1. asr_config.py +1 -1
  2. asr_pipeline.py +4 -1
asr_config.py CHANGED
@@ -71,7 +71,7 @@ class ASRConfig(transformers.PretrainedConfig):
71
  "min_new_tokens": 0,
72
  "repetition_penalty": 1.0,
73
  "length_penalty": 1.0,
74
- "no_repeat_ngram_size": 0,
75
  "use_cache": True,
76
  }
77
 
 
71
  "min_new_tokens": 0,
72
  "repetition_penalty": 1.0,
73
  "length_penalty": 1.0,
74
+ "no_repeat_ngram_size": 0, # 0 disables n-gram blocking; set to 3 to prevent repeating 3-grams like "so so so"
75
  "use_cache": True,
76
  }
77
 
asr_pipeline.py CHANGED
@@ -486,6 +486,7 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
486
  return ""
487
 
488
  original_len = len(text.split())
 
489
 
490
  # 1. LOWERCASE
491
  text = text.lower()
@@ -505,8 +506,10 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
505
  words = words[: idx + n]
506
  text = " ".join(words)
507
  print(
508
- f"[DEBUG] Truncated repetition: {original_len} -> {len(words)} words (n={n}, repeats={repeat_count})"
509
  )
 
 
510
  break
511
 
512
  # 3. COMBINE ACRONYMS
 
486
  return ""
487
 
488
  original_len = len(text.split())
489
+ original_text = text # Keep for debug
490
 
491
  # 1. LOWERCASE
492
  text = text.lower()
 
506
  words = words[: idx + n]
507
  text = " ".join(words)
508
  print(
509
+ f"[POSTPROCESS] Truncated repetition: {original_len} -> {len(words)} words (n={n}, repeats={repeat_count})"
510
  )
511
+ print(f"[POSTPROCESS] Before: {original_text[:100]}...")
512
+ print(f"[POSTPROCESS] After: {text[:100]}...")
513
  break
514
 
515
  # 3. COMBINE ACRONYMS