Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,14 +1,18 @@
|
|
| 1 |
-
# app.py (
|
| 2 |
|
| 3 |
import os
|
| 4 |
import json
|
| 5 |
import time
|
| 6 |
import uuid
|
| 7 |
import logging
|
|
|
|
|
|
|
|
|
|
| 8 |
import gradio as gr
|
| 9 |
from transformers import pipeline
|
| 10 |
import numpy as np
|
| 11 |
-
import
|
|
|
|
| 12 |
|
| 13 |
# Optional: modest thread hints for CPU Spaces
|
| 14 |
try:
|
|
@@ -101,12 +105,12 @@ def _push_row_to_hf_dataset(row, audio_file_path):
|
|
| 101 |
return "Pushed to HF Dataset."
|
| 102 |
|
| 103 |
# --- Map display names to your HF Hub model IDs ---
|
| 104 |
-
# --- EDIT THIS: map display names to your HF Hub model IDs ---
|
| 105 |
language_models = {
|
| 106 |
"Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
|
| 107 |
"Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
|
| 108 |
"Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
|
| 109 |
-
|
|
|
|
| 110 |
"Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
|
| 111 |
"Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
|
| 112 |
"Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
|
|
@@ -124,21 +128,77 @@ language_models = {
|
|
| 124 |
"Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
|
| 125 |
"Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
|
| 126 |
"Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
|
| 127 |
-
# "WOLOF":
|
| 128 |
-
# "HAITIAN CREOLE":
|
| 129 |
-
# "KABYLE":
|
| 130 |
"Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
|
| 131 |
-
"Luganda": "FarmerlineML/luganda_fkd",
|
| 132 |
"Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
|
| 133 |
"Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
|
| 134 |
"Pidgin": "FarmerlineML/pidgin_nigerian",
|
| 135 |
"Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
|
| 136 |
"Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
# add more as needed
|
| 140 |
}
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
# -------- Lazy-load pipeline cache (Space-safe) --------
|
| 143 |
_PIPELINE_CACHE = {}
|
| 144 |
_CACHE_ORDER = [] # usage order
|
|
@@ -193,15 +253,15 @@ def _model_revision_from_pipeline(pipe) -> str:
|
|
| 193 |
# -------- Inference --------
|
| 194 |
def transcribe(audio_path: str, language: str):
|
| 195 |
"""
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
Returns transcript
|
| 199 |
"""
|
| 200 |
if not audio_path:
|
| 201 |
return "⚠️ Please upload or record an audio clip.", None
|
| 202 |
|
| 203 |
-
speech, sr =
|
| 204 |
-
duration_s = float(
|
| 205 |
|
| 206 |
pipe = get_asr_pipeline(language)
|
| 207 |
decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
|
|
@@ -233,7 +293,6 @@ def transcribe(audio_path: str, language: str):
|
|
| 233 |
def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, audio_file_path):
|
| 234 |
"""
|
| 235 |
Push a minimal row to HF Dataset: model info, language, transcript, optional corrected text, score.
|
| 236 |
-
No WER/CER computations.
|
| 237 |
"""
|
| 238 |
if not meta:
|
| 239 |
return {"status": "No transcription metadata available. Please transcribe first."}
|
|
|
|
| 1 |
+
# app.py (MP3-robust loader + Luganda FKD commented; minimal feedback)
|
| 2 |
|
| 3 |
import os
|
| 4 |
import json
|
| 5 |
import time
|
| 6 |
import uuid
|
| 7 |
import logging
|
| 8 |
+
import shutil
|
| 9 |
+
import subprocess
|
| 10 |
+
import tempfile
|
| 11 |
import gradio as gr
|
| 12 |
from transformers import pipeline
|
| 13 |
import numpy as np
|
| 14 |
+
import soundfile as sf # librosa depends on this; good for wav/flac/ogg
|
| 15 |
+
import librosa # fallback / resampling
|
| 16 |
|
| 17 |
# Optional: modest thread hints for CPU Spaces
|
| 18 |
try:
|
|
|
|
| 105 |
return "Pushed to HF Dataset."
|
| 106 |
|
| 107 |
# --- Map display names to your HF Hub model IDs ---
|
|
|
|
| 108 |
language_models = {
|
| 109 |
"Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
|
| 110 |
"Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
|
| 111 |
"Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
|
| 112 |
+
"Luganda": "FarmerlineML/w2v-bert-2.0_luganda", # active
|
| 113 |
+
# "Luganda (FKD)": "FarmerlineML/luganda_fkd", # commented out per request
|
| 114 |
"Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
|
| 115 |
"Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
|
| 116 |
"Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
|
|
|
|
| 128 |
"Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
|
| 129 |
"Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
|
| 130 |
"Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
|
| 131 |
+
# "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
|
| 132 |
+
# "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
|
| 133 |
+
# "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
|
| 134 |
"Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
|
|
|
|
| 135 |
"Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
|
| 136 |
"Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
|
| 137 |
"Pidgin": "FarmerlineML/pidgin_nigerian",
|
| 138 |
"Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
|
| 139 |
"Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
|
| 140 |
+
"Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
|
|
|
|
|
|
|
| 141 |
}
|
| 142 |
|
| 143 |
+
# -------- Robust audio loader (wav/flac/ogg/opus via soundfile; MP3/M4A and other formats via ffmpeg, with librosa as fallback) --------
|
| 144 |
+
TARGET_SR = 16000
|
| 145 |
+
|
| 146 |
+
def _has_ffmpeg():
    """Report whether an ``ffmpeg`` executable can be located on PATH."""
    ffmpeg_path = shutil.which("ffmpeg")
    return ffmpeg_path is not None
|
| 148 |
+
|
| 149 |
+
def _load_with_soundfile(path):
    """Read *path* with soundfile and return ``(samples, sample_rate)``.

    Arrays with more than one dimension are averaged over axis 1 (the
    channel axis in soundfile's ``(frames, channels)`` layout) to get a
    mono signal. The samples are returned as float32 at the file's native
    sample rate; no resampling happens here.
    """
    samples, rate = sf.read(path, always_2d=False)
    is_multichannel = isinstance(samples, np.ndarray) and samples.ndim > 1
    if is_multichannel:
        # Down-mix by averaging the channels.
        samples = samples.mean(axis=1)
    mono = samples.astype(np.float32)
    return mono, rate
|
| 154 |
+
|
| 155 |
+
def _load_with_ffmpeg(path, target_sr=TARGET_SR):
    """Decode *path* through ffmpeg and return ``(samples, sample_rate)``.

    ffmpeg is asked to write a mono WAV at ``target_sr`` Hz into a temporary
    file, which is then read back with soundfile and cast to float32.

    Args:
        path: Audio file to decode (passed to ffmpeg as ``-i``).
        target_sr: Sample rate requested from ffmpeg via ``-ar``.

    Returns:
        Tuple of the float32 sample array and the sample rate reported by
        soundfile for the converted file.

    Raises:
        RuntimeError: If no ffmpeg executable is available.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if not _has_ffmpeg():
        raise RuntimeError("ffmpeg not available")

    # delete=False + close(): we only need a unique path that ffmpeg can
    # overwrite (-y); the handle itself is not used.
    tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_wav.close()
    try:
        cmd = [
            "ffmpeg", "-hide_banner", "-loglevel", "error",
            "-y", "-i", path,
            "-ac", "1", "-ar", str(target_sr),
            tmp_wav.name,
        ]
        subprocess.run(cmd, check=True)
        data, sr = sf.read(tmp_wav.name, always_2d=False)
    finally:
        # Always remove the temp file, including when ffmpeg or the read
        # fails; otherwise every failed conversion leaks a file on disk.
        try:
            os.remove(tmp_wav.name)
        except OSError:
            # Best-effort cleanup: a failed delete must not mask the result
            # or the original error.
            pass

    # ffmpeg was asked for one channel, but down-mix defensively anyway.
    if isinstance(data, np.ndarray) and data.ndim > 1:
        data = data.mean(axis=1)
    return data.astype(np.float32), sr
|
| 176 |
+
|
| 177 |
+
def _resample_if_needed(y, sr, target_sr=TARGET_SR):
    """Return ``(samples, sample_rate)`` with samples as float32 at ``target_sr``.

    When ``sr`` already equals ``target_sr`` the signal is only cast to
    float32; otherwise it is resampled with ``librosa.resample``.
    """
    samples = y.astype(np.float32)
    if sr != target_sr:
        resampled = librosa.resample(samples, orig_sr=sr, target_sr=target_sr)
        return resampled.astype(np.float32), target_sr
    return samples, sr
|
| 182 |
+
|
| 183 |
+
def load_audio_any(path, target_sr=TARGET_SR):
    """Load an audio file as mono float32 samples at ``target_sr``.

    Strategy, chosen by file extension:
      * ``.wav`` / ``.flac`` / ``.ogg`` / ``.opus`` -> soundfile, then resample.
      * anything else -> ffmpeg when it is installed (its output is returned
        as-is, since ffmpeg was already asked for mono at ``target_sr``),
        otherwise ``librosa.load`` followed by a resample.

    If any step of the primary strategy raises, a warning is logged and the
    file is loaded with ``librosa.load`` directly at ``target_sr``.

    Returns:
        Tuple ``(samples, sample_rate)``.
    """
    suffix = os.path.splitext(path)[1].lower()
    soundfile_suffixes = {".wav", ".flac", ".ogg", ".opus"}
    try:
        if suffix not in soundfile_suffixes and _has_ffmpeg():
            samples, rate = _load_with_ffmpeg(path, target_sr=target_sr)
            return samples, rate  # already mono+16k

        if suffix in soundfile_suffixes:
            samples, rate = _load_with_soundfile(path)
        else:
            # No ffmpeg available: let librosa decode formats like mp3/m4a.
            samples, rate = librosa.load(path, sr=None, mono=True)

        samples, rate = _resample_if_needed(samples, rate, target_sr)
        return samples, rate
    except Exception as e:
        logging.warning(f"[AUDIO] Primary load failed for {path} ({e}). Falling back to librosa.")
        samples, rate = librosa.load(path, sr=target_sr, mono=True)
        return samples.astype(np.float32), rate
|
| 201 |
+
|
| 202 |
# -------- Lazy-load pipeline cache (Space-safe) --------
|
| 203 |
_PIPELINE_CACHE = {}
|
| 204 |
_CACHE_ORDER = [] # usage order
|
|
|
|
| 253 |
# -------- Inference --------
|
| 254 |
def transcribe(audio_path: str, language: str):
|
| 255 |
"""
|
| 256 |
+
Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
|
| 257 |
+
then run it through the chosen ASR pipeline.
|
| 258 |
+
Returns transcript and a meta dict for feedback.
|
| 259 |
"""
|
| 260 |
if not audio_path:
|
| 261 |
return "⚠️ Please upload or record an audio clip.", None
|
| 262 |
|
| 263 |
+
speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
|
| 264 |
+
duration_s = float(len(speech) / float(sr))
|
| 265 |
|
| 266 |
pipe = get_asr_pipeline(language)
|
| 267 |
decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
|
|
|
|
| 293 |
def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, audio_file_path):
|
| 294 |
"""
|
| 295 |
Push a minimal row to HF Dataset: model info, language, transcript, optional corrected text, score.
|
|
|
|
| 296 |
"""
|
| 297 |
if not meta:
|
| 298 |
return {"status": "No transcription metadata available. Please transcribe first."}
|