nexusbert committed on
Commit
a955afe
·
1 Parent(s): 4f2310e

Igbo ASR fix

Browse files
Files changed (1) hide show
  1. app.py +5 -9
app.py CHANGED
@@ -106,12 +106,11 @@ def _get_igbo_asr():
106
  logger.warning("HF_TOKEN not set - Igbo ASR model requires authentication")
107
  return None, None
108
 
109
-
110
  hf_token = hf_token.strip()
111
 
112
  try:
113
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
114
- logger.info("Lazy-loading Igbo ASR model (gated model)...")
115
  igbo_processor = WhisperProcessor.from_pretrained("NCAIR1/Igbo-ASR", token=hf_token)
116
  igbo_model = WhisperForConditionalGeneration.from_pretrained("NCAIR1/Igbo-ASR", token=hf_token)
117
  igbo_model.to(device)
@@ -132,13 +131,12 @@ def _run_whisper(model: WhisperForConditionalGeneration, proc: WhisperProcessor,
132
  generation_kwargs = {
133
  "max_length": 448,
134
  "num_beams": 1,
135
- "do_sample": False,
136
- "early_stopping": True
137
  }
138
 
139
-
140
  if language == "igbo" or "igbo" in str(model.config).lower():
141
- generation_kwargs["language"] = "igbo"
 
142
  generation_kwargs["task"] = "transcribe"
143
 
144
  with torch.no_grad():
@@ -194,7 +192,6 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
194
  def speech_to_text(audio_data: bytes) -> str:
195
  audio_array = preprocess_audio_ffmpeg(audio_data)
196
 
197
- # Try Igbo ASR first for better Igbo detection
198
  igbo_result = _get_igbo_asr()
199
  if igbo_result[0] is not None and igbo_result[1] is not None:
200
  igbo_model, igbo_proc = igbo_result
@@ -203,9 +200,8 @@ def speech_to_text(audio_data: bytes) -> str:
203
  logger.info("Using Igbo ASR result")
204
  return igbo_text
205
 
206
- # Fallback to MMS for other languages
207
  mms_result = _get_mms()
208
- if mms_result[0] is not None and mms_result[1] is not None:
209
  mms_model, mms_proc = mms_result
210
  mms_text = _run_mms(mms_model, mms_proc, audio_array)
211
  if mms_text and mms_text.strip():
 
106
  logger.warning("HF_TOKEN not set - Igbo ASR model requires authentication")
107
  return None, None
108
 
 
109
  hf_token = hf_token.strip()
110
 
111
  try:
112
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
113
+ logger.info("Lazy-loading Igbo ASR model...")
114
  igbo_processor = WhisperProcessor.from_pretrained("NCAIR1/Igbo-ASR", token=hf_token)
115
  igbo_model = WhisperForConditionalGeneration.from_pretrained("NCAIR1/Igbo-ASR", token=hf_token)
116
  igbo_model.to(device)
 
131
  generation_kwargs = {
132
  "max_length": 448,
133
  "num_beams": 1,
134
+ "do_sample": False
 
135
  }
136
 
 
137
  if language == "igbo" or "igbo" in str(model.config).lower():
138
+ pass
139
+ else:
140
  generation_kwargs["task"] = "transcribe"
141
 
142
  with torch.no_grad():
 
192
  def speech_to_text(audio_data: bytes) -> str:
193
  audio_array = preprocess_audio_ffmpeg(audio_data)
194
 
 
195
  igbo_result = _get_igbo_asr()
196
  if igbo_result[0] is not None and igbo_result[1] is not None:
197
  igbo_model, igbo_proc = igbo_result
 
200
  logger.info("Using Igbo ASR result")
201
  return igbo_text
202
 
 
203
  mms_result = _get_mms()
204
+ if mms_result and mms_result[0] is not None and mms_result[1] is not None:
205
  mms_model, mms_proc = mms_result
206
  mms_text = _run_mms(mms_model, mms_proc, audio_array)
207
  if mms_text and mms_text.strip():