Spaces:

DarliAI
/

Evaluation

Sleeping

App Files Files Community

FarmerlineML committed on Aug 14, 2025

Commit

57b796f

verified ·

1 Parent(s): 81083e5

Update app.py

Browse files

Files changed (1) hide show

app.py +232 -425

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py (MP3-robust loader + Robust HF Dataset Appending)
 import os
 import json
@@ -13,11 +13,6 @@ from transformers import pipeline
 import numpy as np
 import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
 import librosa  # fallback / resampling
-import pandas as pd
-import pyarrow.parquet as pq
-import pyarrow as pa
-from huggingface_hub import HfApi
-from typing import Optional, Tuple, Dict, Any
 # Optional: modest thread hints for CPU Spaces
 try:
@@ -27,27 +22,95 @@ try:
 except Exception:
     pass
-# Setup logging with more detail
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-# -------- CONFIG: Hub dataset target --------
 HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
-# Initialize HF API client once
-hf_api = HfApi() if PUSH_TO_HF else None
 # --- Map display names to your HF Hub model IDs ---
 language_models = {
     "Akan (Asante Twi)":        "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
     "Ewe":                      "FarmerlineML/w2v-bert-2.0_ewe_2",
     "Kiswahili":                "FarmerlineML/w2v-bert-2.0_swahili_alpha",
-    "Luganda":                  "FarmerlineML/w2v-bert-2.0_luganda",
     "Brazilian Portuguese":     "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
     "Fante":                    "misterkissi/w2v2-lg-xls-r-300m-fante",
     "Bemba":                    "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
@@ -65,180 +128,61 @@ language_models = {
     "Amharic":                  "misterkissi/w2v2-lg-xls-r-1b-amharic",
     "Xhosa":                    "misterkissi/w2v2-lg-xls-r-300m-xhosa",
     "Tsonga":                   "misterkissi/w2v2-lg-xls-r-300m-tsonga",
     "Yoruba":                   "FarmerlineML/w2v-bert-2.0_yoruba_v1",
     "Luo":                      "FarmerlineML/w2v-bert-2.0_luo_v2",
     "Somali":                   "FarmerlineML/w2v-bert-2.0_somali_alpha",
     "Pidgin":                   "FarmerlineML/pidgin_nigerian",
     "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
     "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
-    "Krio":                     "FarmerlineML/w2v-bert-2.0_krio_v3",
 }
-# -------- Robust Dataset Push Function --------
-def _push_row_to_hf_dataset(row: Dict[str, Any], audio_file_path: Optional[str]) -> str:
-    """
-    Append a single example to the HF dataset repo using Parquet files.
-    Each submission creates a new Parquet file to avoid overwrites.
-    """
-    if not PUSH_TO_HF:
-        return "HF push disabled (missing HF_TOKEN or repo)."
-    if not hf_api:
-        return "HF API client not initialized."
-    # Create a copy of the row to avoid modifying the original
-    example = dict(row)
-    # Generate unique identifiers for this submission
-    timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime())
-    unique_id = str(uuid.uuid4())[:8]
-    # Handle audio file if provided and user consented
-    audio_uploaded = False
-    if audio_file_path and os.path.isfile(audio_file_path) and example.get("share_publicly", False):
-        try:
-            # Store reference to audio file in the dataset
-            audio_filename = f"audio_{timestamp}_{unique_id}{os.path.splitext(audio_file_path)[1]}"
-            example["audio_filename"] = audio_filename
-            # Upload audio file separately
-            logger.info(f"Uploading audio file: {audio_filename}")
-            hf_api.upload_file(
-                path_or_fileobj=audio_file_path,
-                path_in_repo=f"audio/{audio_filename}",
-                repo_id=HF_DATASET_REPO,
-                repo_type="dataset",
-                token=HF_TOKEN,
-                commit_message=f"Add audio for feedback {timestamp}"
-            )
-            audio_uploaded = True
-            logger.info("Audio file uploaded successfully")
-        except Exception as e:
-            logger.error(f"Failed to upload audio: {e}")
-            example["audio_filename"] = None
-    else:
-        example["audio_filename"] = None
-    # Normalize data types for Parquet storage
-    def _safe_cast(value, cast_func, default=None):
-        """Safely cast a value to a type, returning default on failure."""
-        try:
-            return cast_func(value) if value is not None else default
-        except (ValueError, TypeError):
-            return default
-    # Type normalization
-    example["latency_ms"] = _safe_cast(example.get("latency_ms"), int)
-    example["score_out_of_10"] = _safe_cast(example.get("score_out_of_10"), int)
-    example["sample_rate"] = _safe_cast(example.get("sample_rate"), int)
-    example["rtf"] = _safe_cast(example.get("rtf"), float)
-    example["audio_duration_s"] = _safe_cast(example.get("audio_duration_s"), float)
-    example["share_publicly"] = bool(example.get("share_publicly", False))
-    # Ensure all string fields are properly handled
-    string_fields = ["timestamp", "session_id", "language_display", "model_id",
-                    "model_revision", "source", "decode_params", "transcript_hyp",
-                    "corrected_text"]
-    for field in string_fields:
-        if field in example and example[field] is not None:
-            example[field] = str(example[field])
-    # Create DataFrame and save as Parquet
-    df = pd.DataFrame([example])
-    # Generate Parquet filename
-    parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
-    # Create temporary Parquet file
-    temp_parquet = None
-    try:
-        with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
-            temp_parquet = tmp_file.name
-            df.to_parquet(temp_parquet, engine='pyarrow', compression='snappy')
-        # Upload Parquet file to dataset repo
-        logger.info(f"Uploading feedback data: {parquet_filename}")
-        hf_api.upload_file(
-            path_or_fileobj=temp_parquet,
-            path_in_repo=f"data/{parquet_filename}",
-            repo_id=HF_DATASET_REPO,
-            repo_type="dataset",
-            token=HF_TOKEN,
-            commit_message=f"Add feedback row {timestamp}"
-        )
-        logger.info("Feedback data uploaded successfully")
-        status_msg = f"Successfully pushed to HF Dataset as {parquet_filename}"
-        if audio_uploaded:
-            status_msg += " (with audio)"
-        return status_msg
-    except Exception as e:
-        logger.error(f"Failed to push to HF Dataset: {e}")
-        return f"Failed to push to HF Dataset: {str(e)}"
-    finally:
-        # Clean up temporary file
-        if temp_parquet and os.path.exists(temp_parquet):
-            try:
-                os.remove(temp_parquet)
-            except Exception as e:
-                logger.warning(f"Failed to remove temp file: {e}")
 # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
 TARGET_SR = 16000
-def _has_ffmpeg() -> bool:
-    """Check if ffmpeg is available in the system."""
     return shutil.which("ffmpeg") is not None
-def _load_with_soundfile(path: str) -> Tuple[np.ndarray, int]:
-    """Load audio using soundfile (for wav/flac/ogg)."""
     data, sr = sf.read(path, always_2d=False)
     if isinstance(data, np.ndarray) and data.ndim > 1:
         data = data.mean(axis=1)
     return data.astype(np.float32), sr
-def _load_with_ffmpeg(path: str, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
-    """Convert audio to mono wav using ffmpeg."""
     if not _has_ffmpeg():
         raise RuntimeError("ffmpeg not available")
     tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     tmp_wav.close()
     try:
-        cmd = [
-            "ffmpeg", "-hide_banner", "-loglevel", "error",
-            "-y", "-i", path,
-            "-ac", "1", "-ar", str(target_sr),
-            tmp_wav.name,
-        ]
-        subprocess.run(cmd, check=True)
-        data, sr = sf.read(tmp_wav.name, always_2d=False)
-        if isinstance(data, np.ndarray) and data.ndim > 1:
-            data = data.mean(axis=1)
-        return data.astype(np.float32), sr
-    finally:
-        try:
-            os.remove(tmp_wav.name)
-        except Exception:
-            pass
-def _resample_if_needed(y: np.ndarray, sr: int, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
-    """Resample audio if needed."""
     if sr == target_sr:
         return y.astype(np.float32), sr
     y_rs = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=target_sr)
     return y_rs.astype(np.float32), target_sr
-def load_audio_any(path: str, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
     """Robust loader: wav/flac/ogg via soundfile; mp3/m4a via ffmpeg; fallback to librosa."""
-    if not os.path.exists(path):
-        raise FileNotFoundError(f"Audio file not found: {path}")
     ext = os.path.splitext(path)[1].lower()
     try:
         if ext in {".wav", ".flac", ".ogg", ".opus"}:
             y, sr = _load_with_soundfile(path)
@@ -248,11 +192,10 @@ def load_audio_any(path: str, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, i
         else:
             # Fallback to librosa for formats like mp3/m4a when ffmpeg isn't present
             y, sr = librosa.load(path, sr=None, mono=True)
         y, sr = _resample_if_needed(y, sr, target_sr)
         return y, sr
     except Exception as e:
-        logger.warning(f"Primary load failed for {path} ({e}). Falling back to librosa.")
         y, sr = librosa.load(path, sr=target_sr, mono=True)
         return y.astype(np.float32), sr
@@ -261,23 +204,20 @@ _PIPELINE_CACHE = {}
 _CACHE_ORDER = []  # usage order
 _CACHE_MAX_SIZE = 3  # tune for RAM
-def _touch_cache(key: str):
-    """Update cache access order."""
     if key in _CACHE_ORDER:
         _CACHE_ORDER.remove(key)
     _CACHE_ORDER.insert(0, key)
 def _evict_if_needed():
-    """Evict least recently used pipelines if cache is full."""
     while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
-        if _CACHE_ORDER:
-            oldest = _CACHE_ORDER.pop()
-            if oldest in _PIPELINE_CACHE:
-                logger.info(f"Evicting pipeline from cache: {oldest}")
-                del _PIPELINE_CACHE[oldest]
 def get_asr_pipeline(language_display: str):
-    """Get or create ASR pipeline for the specified language."""
     if language_display not in language_models:
         raise ValueError(f"Unknown language selection: {language_display}")
@@ -286,15 +226,13 @@ def get_asr_pipeline(language_display: str):
         return _PIPELINE_CACHE[language_display]
     model_id = language_models[language_display]
-    logger.info(f"Loading pipeline for '{language_display}' -> {model_id}")
     pipe = pipeline(
         task="automatic-speech-recognition",
         model=model_id,
-        device=-1,  # CPU on Spaces
         chunk_length_s=30
     )
     _PIPELINE_CACHE[language_display] = pipe
     _touch_cache(language_display)
     _evict_if_needed()
@@ -302,7 +240,7 @@ def get_asr_pipeline(language_display: str):
 # -------- Helpers --------
 def _model_revision_from_pipeline(pipe) -> str:
-    """Best-effort capture of revision/hash for reproducibility."""
     for attr in ("hub_revision", "revision", "_commit_hash"):
         val = getattr(getattr(pipe, "model", None), attr, None)
         if val:
@@ -313,7 +251,7 @@ def _model_revision_from_pipeline(pipe) -> str:
         return "unknown"
 # -------- Inference --------
-def transcribe(audio_path: str, language: str) -> Tuple[str, Optional[Dict[str, Any]]]:
     """
     Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
     then run it through the chosen ASR pipeline.
@@ -321,269 +259,138 @@ def transcribe(audio_path: str, language: str) -> Tuple[str, Optional[Dict[str,
     """
     if not audio_path:
         return "⚠️ Please upload or record an audio clip.", None
-    try:
-        # Load and process audio
-        speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
-        duration_s = float(len(speech) / float(sr))
-        # Get ASR pipeline
-        pipe = get_asr_pipeline(language)
-        decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
-        # Run inference
-        logger.info(f"Running ASR inference for {language} on {duration_s:.2f}s audio")
-        t0 = time.time()
-        result = pipe({"sampling_rate": sr, "raw": speech})
-        latency_ms = int((time.time() - t0) * 1000.0)
-        hyp_text = result.get("text", "")
-        # Calculate real-time factor
-        rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
-        # Prepare metadata
-        meta = {
-            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
-            "session_id": f"anon-{uuid.uuid4()}",
-            "language_display": language,
-            "model_id": language_models.get(language, "unknown"),
-            "model_revision": _model_revision_from_pipeline(pipe),
-            "audio_duration_s": duration_s,
-            "sample_rate": sr,
-            "source": "upload",
-            "decode_params": json.dumps(decode_params),
-            "transcript_hyp": hyp_text,
-            "latency_ms": latency_ms,
-            "rtf": rtf,
-        }
-        logger.info(f"Transcription complete. RTF: {rtf:.3f}")
-        return hyp_text, meta
-    except Exception as e:
-        logger.error(f"Transcription failed: {e}")
-        return f"❌ Transcription failed: {str(e)}", None
-# -------- Feedback submit --------
-def submit_feedback(
-    meta: Optional[Dict[str, Any]],
-    corrected_text: str,
-    score: int,
-    store_audio: bool,
-    share_publicly: bool,
-    audio_file_path: Optional[str]
-) -> Dict[str, Any]:
     """
-    Submit feedback to HF Dataset with improved error handling.
     """
     if not meta:
-        return {
-            "status": "❌ No transcription metadata available. Please transcribe first.",
-            "success": False
-        }
-    # Prepare row data
     row = dict(meta)
     row.update({
         "corrected_text": (corrected_text or "").strip(),
         "score_out_of_10": int(score) if score is not None else None,
         "share_publicly": bool(share_publicly),
     })
-    # Push to HF Dataset
     try:
         audio_to_push = audio_file_path if store_audio else None
         hf_status = _push_row_to_hf_dataset(row, audio_to_push)
-        return {
-            "status": f"✅ {hf_status}",
-            "success": True,
-            "latency_ms": row["latency_ms"],
-            "rtf": f"{row['rtf']:.3f}",
-            "model_id": row["model_id"],
-            "model_revision": row["model_revision"],
-            "language": row["language_display"],
-        }
     except Exception as e:
-        logger.error(f"Failed to submit feedback: {e}")
-        return {
-            "status": f"❌ Failed to submit feedback: {str(e)}",
-            "success": False
-        }
-# -------- Gradio UI --------
-def create_demo():
-    """Create the Gradio demo interface."""
-    with gr.Blocks(
-        title="🌐 Multilingual ASR Demo",
-        theme=gr.themes.Soft()
-    ) as demo:
-        gr.Markdown(
-            """
-            # 🎙️ Multilingual Speech-to-Text Demo
-            Upload an audio file (MP3, WAV, FLAC, M4A, OGG, etc.) or record via your microphone.
-            Then choose the language/model and hit **Transcribe**.
-            ---
-            """
-        )
-        with gr.Row():
-            with gr.Column(scale=1):
-                lang = gr.Dropdown(
-                    choices=list(language_models.keys()),
-                    value=list(language_models.keys())[0],
-                    label="Select Language / Model",
-                    info="Choose the language of your audio"
-                )
-                audio = gr.Audio(
-                    sources=["upload", "microphone"],
-                    type="filepath",
-                    label="Upload or Record Audio",
-                    elem_id="audio-input"
-                )
-                btn = gr.Button("🎯 Transcribe", variant="primary", size="lg")
-            with gr.Column(scale=1):
-                output = gr.Textbox(
-                    label="Transcription",
-                    placeholder="Transcription will appear here...",
-                    lines=5
-                )
-                # Status indicators
-                with gr.Row():
-                    status_box = gr.Textbox(
-                        label="Status",
-                        interactive=False,
-                        placeholder="Ready",
-                        max_lines=1
-                    )
-        # Hidden state to carry metadata from transcribe -> feedback
-        meta_state = gr.State(value=None)
-        # Evaluation section
-        with gr.Accordion("📝 Evaluation & Feedback", open=False):
-            gr.Markdown(
-                """
-                Help us improve! Please provide feedback on the transcription quality.
-                """
-            )
-            with gr.Row():
-                corrected_tb = gr.Textbox(
-                    label="Corrected transcript (optional)",
-                    placeholder="If there are errors, type the correct transcription here...",
-                    lines=4,
-                    value=""
-                )
-            with gr.Row():
-                score_slider = gr.Slider(
-                    minimum=0,
-                    maximum=10,
-                    step=1,
-                    label="Quality Score (0 = terrible, 10 = perfect)",
-                    value=7,
-                    info="Rate the transcription quality"
-                )
-            with gr.Row():
-                store_audio_cb = gr.Checkbox(
-                    label="Allow storing my audio for research/evaluation",
-                    value=False,
-                    info="Audio will be stored securely and used only for improving the models"
-                )
-                share_cb = gr.Checkbox(
-                    label="Allow sharing this example publicly",
-                    value=False,
-                    info="Your example may be used in public datasets or demos"
-                )
-            submit_btn = gr.Button("📤 Submit Feedback", variant="secondary")
-            results_json = gr.JSON(
-                label="Submission Result",
-                visible=True
-            )
-        # Examples section
-        with gr.Accordion("📚 Example Usage", open=False):
-            gr.Markdown(
-                """
-                ### Tips for best results:
-                - Speak clearly and at a normal pace
-                - Minimize background noise
-                - Keep recordings under 30 seconds for optimal performance
-                - Select the correct language before transcribing
-                ### Supported formats:
-                WAV, MP3, FLAC, M4A, OGG, OPUS, and more!
-                """
-            )
-        # Wire up events
-        def _transcribe_and_update(audio_path, language):
-            """Transcribe and update UI components."""
-            if not audio_path:
-                return "", None, "", "⚠️ Please provide audio"
-            status_box_val = f"🔄 Processing {language}..."
-            hyp, meta = transcribe(audio_path, language)
-            if meta:
-                status_msg = f"✅ Done! (RTF: {meta['rtf']:.3f})"
-                # Pre-fill corrected with hypothesis for easy edits
-                return hyp, meta, hyp, status_msg
-            else:
-                return hyp, None, "", "❌ Transcription failed"
-        btn.click(
-            fn=_transcribe_and_update,
-            inputs=[audio, lang],
-            outputs=[output, meta_state, corrected_tb, status_box]
-        )
-        submit_btn.click(
-            fn=submit_feedback,
-            inputs=[
-                meta_state,
-                corrected_tb,
-                score_slider,
-                store_audio_cb,
-                share_cb,
-                audio
-            ],
-            outputs=results_json
         )
-        # Auto-focus on audio input when page loads
-        demo.load(
-            fn=lambda: "Ready",
-            inputs=[],
-            outputs=[status_box]
         )
-    return demo
-# -------- Main --------
 if __name__ == "__main__":
-    # Log startup info
-    logger.info(f"Starting ASR Demo")
-    logger.info(f"HF Dataset Repo: {HF_DATASET_REPO}")
-    logger.info(f"Push to HF enabled: {PUSH_TO_HF}")
-    logger.info(f"Available languages: {len(language_models)}")
-    # Create and launch demo
-    demo = create_demo()
-    demo.queue(max_size=10)  # Limit queue size for stability
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False  # Set to True if you want a public link
-    )

+# app.py (MP3-robust loader + Luganda FKD commented; minimal feedback)
 import os
 import json
 import numpy as np
 import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
 import librosa  # fallback / resampling
 # Optional: modest thread hints for CPU Spaces
 try:
 except Exception:
     pass
+# Basic logging so we can verify which model is loaded per inference
+logging.basicConfig(level=logging.INFO)
+# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
+from datasets import Dataset, Features, Value, Audio, load_dataset
+# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
 HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
+HF_FEATURES = Features({
+    "timestamp":        Value("string"),
+    "session_id":       Value("string"),
+    "language_display": Value("string"),
+    "model_id":         Value("string"),
+    "model_revision":   Value("string"),
+    "audio":            Audio(sampling_rate=None),   # uploaded only if user consents
+    "audio_duration_s": Value("float32"),
+    "sample_rate":      Value("int32"),
+    "source":           Value("string"),
+    "decode_params":    Value("string"),
+    "transcript_hyp":   Value("string"),
+    "corrected_text":   Value("string"),
+    "latency_ms":       Value("int32"),
+    "rtf":              Value("float32"),
+    "score_out_of_10":  Value("int32"),
+    "share_publicly":   Value("bool"),
+})
+def _push_row_to_hf_dataset(row, audio_file_path):
+    """
+    Append a single feedback example to the HF dataset repo (train split).
+
+    Args:
+        row: Mapping of feedback fields. It is copied, never mutated.
+        audio_file_path: Path to the audio clip, or None. The caller decides
+            whether the user consented; here the path is attached only when
+            it points to an existing file, otherwise 'audio' is None.
+
+    Returns:
+        A short human-readable status string.
+
+    Raises:
+        Any error from loading the existing split (other than "dataset not
+        found"), concatenating, or pushing is propagated so the caller can
+        report it. Nothing is pushed in that case, so existing rows are
+        never overwritten by a partial result.
+    """
+    # Function-scope import: only the append path needs this helper.
+    from datasets import concatenate_datasets
+    if not PUSH_TO_HF:
+        return "HF push disabled (missing HF_TOKEN or repo)."
+    example = dict(row)
+    # Audio: only include if a path was supplied and the file exists
+    has_audio = bool(audio_file_path) and os.path.isfile(audio_file_path)
+    example["audio"] = audio_file_path if has_audio else None
+    # Normalize types; anything that cannot be converted becomes None
+    def _coerce(cast, v):
+        try:
+            return cast(v)
+        except (TypeError, ValueError, OverflowError):
+            return None
+    for k in ["latency_ms", "score_out_of_10", "sample_rate"]:
+        example[k] = _coerce(int, example.get(k))
+    for k in ["rtf", "audio_duration_s"]:
+        example[k] = _coerce(float, example.get(k))
+    ds = Dataset.from_list([example], features=HF_FEATURES)
+    # Load existing split if present, then append.
+    # Only a missing dataset falls back to a fresh split. Any other failure
+    # (network, auth, schema mismatch) must propagate, because pushing `ds`
+    # alone would replace every previously stored row with this one.
+    try:
+        existing = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
+    except FileNotFoundError:
+        merged = ds
+    else:
+        # Dataset objects have no .concatenate(); use the module-level helper.
+        merged = concatenate_datasets([existing, ds])
+    merged.push_to_hub(
+        HF_DATASET_REPO,
+        split="train",
+        private=True,
+        token=HF_TOKEN,
+        commit_message="append feedback row"
+    )
+    return "Pushed to HF Dataset."
 # --- Map display names to your HF Hub model IDs ---
 language_models = {
     "Akan (Asante Twi)":        "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
     "Ewe":                      "FarmerlineML/w2v-bert-2.0_ewe_2",
     "Kiswahili":                "FarmerlineML/w2v-bert-2.0_swahili_alpha",
+    "Luganda":                  "FarmerlineML/w2v-bert-2.0_luganda",   # active
+    # "Luganda (FKD)":          "FarmerlineML/luganda_fkd",            # commented out per request
     "Brazilian Portuguese":     "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
     "Fante":                    "misterkissi/w2v2-lg-xls-r-300m-fante",
     "Bemba":                    "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
     "Amharic":                  "misterkissi/w2v2-lg-xls-r-1b-amharic",
     "Xhosa":                    "misterkissi/w2v2-lg-xls-r-300m-xhosa",
     "Tsonga":                   "misterkissi/w2v2-lg-xls-r-300m-tsonga",
+    # "WOLOF":                  "misterkissi/w2v2-lg-xls-r-1b-wolof",
+    # "HAITIAN CREOLE":         "misterkissi/whisper-small-haitian-creole",
+    # "KABYLE":                 "misterkissi/w2v2-lg-xls-r-1b-kabyle",
     "Yoruba":                   "FarmerlineML/w2v-bert-2.0_yoruba_v1",
     "Luo":                      "FarmerlineML/w2v-bert-2.0_luo_v2",
     "Somali":                   "FarmerlineML/w2v-bert-2.0_somali_alpha",
     "Pidgin":                   "FarmerlineML/pidgin_nigerian",
     "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
     "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
+    "Krio":                   "FarmerlineML/w2v-bert-2.0_krio_v3",
 }
 # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
 TARGET_SR = 16000
+def _has_ffmpeg():
     return shutil.which("ffmpeg") is not None
+def _load_with_soundfile(path):
     data, sr = sf.read(path, always_2d=False)
     if isinstance(data, np.ndarray) and data.ndim > 1:
         data = data.mean(axis=1)
     return data.astype(np.float32), sr
+def _load_with_ffmpeg(path, target_sr=TARGET_SR):
+    """
+    Decode `path` with ffmpeg into a temporary mono WAV at `target_sr` and read it back.
+
+    Returns:
+        (samples, sample_rate): float32 numpy array and the rate reported by soundfile.
+
+    Raises:
+        RuntimeError: if ffmpeg is not available.
+        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
+    """
     if not _has_ffmpeg():
         raise RuntimeError("ffmpeg not available")
     tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     tmp_wav.close()
     try:
+        cmd = [
+            "ffmpeg", "-hide_banner", "-loglevel", "error",
+            "-y", "-i", path,
+            "-ac", "1", "-ar", str(target_sr),
+            tmp_wav.name,
+        ]
+        subprocess.run(cmd, check=True)
+        data, sr = sf.read(tmp_wav.name, always_2d=False)
+    finally:
+        # The temp file was created with delete=False, so remove it here even
+        # when ffmpeg or the read fails; otherwise failed decodes leak files.
+        try:
+            os.remove(tmp_wav.name)
+        except OSError:
+            pass
+    # Defensive downmix in case the decoded data still has multiple channels
+    if isinstance(data, np.ndarray) and data.ndim > 1:
+        data = data.mean(axis=1)
+    return data.astype(np.float32), sr
+def _resample_if_needed(y, sr, target_sr=TARGET_SR):
     if sr == target_sr:
         return y.astype(np.float32), sr
     y_rs = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=target_sr)
     return y_rs.astype(np.float32), target_sr
+def load_audio_any(path, target_sr=TARGET_SR):
     """Robust loader: wav/flac/ogg via soundfile; mp3/m4a via ffmpeg; fallback to librosa."""
     ext = os.path.splitext(path)[1].lower()
     try:
         if ext in {".wav", ".flac", ".ogg", ".opus"}:
             y, sr = _load_with_soundfile(path)
         else:
             # Fallback to librosa for formats like mp3/m4a when ffmpeg isn't present
             y, sr = librosa.load(path, sr=None, mono=True)
         y, sr = _resample_if_needed(y, sr, target_sr)
         return y, sr
     except Exception as e:
+        logging.warning(f"[AUDIO] Primary load failed for {path} ({e}). Falling back to librosa.")
         y, sr = librosa.load(path, sr=target_sr, mono=True)
         return y.astype(np.float32), sr
 _CACHE_ORDER = []  # usage order
 _CACHE_MAX_SIZE = 3  # tune for RAM
+def _touch_cache(key):
     if key in _CACHE_ORDER:
         _CACHE_ORDER.remove(key)
     _CACHE_ORDER.insert(0, key)
 def _evict_if_needed():
     while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
+        oldest = _CACHE_ORDER.pop()
+        try:
+            del _PIPELINE_CACHE[oldest]
+        except KeyError:
+            pass
 def get_asr_pipeline(language_display: str):
     if language_display not in language_models:
         raise ValueError(f"Unknown language selection: {language_display}")
         return _PIPELINE_CACHE[language_display]
     model_id = language_models[language_display]
+    logging.info(f"[ASR] Loading pipeline for '{language_display}' -> {model_id}")
     pipe = pipeline(
         task="automatic-speech-recognition",
         model=model_id,
+        device=-1,          # CPU on Spaces (explicit)
         chunk_length_s=30
     )
     _PIPELINE_CACHE[language_display] = pipe
     _touch_cache(language_display)
     _evict_if_needed()
 # -------- Helpers --------
 def _model_revision_from_pipeline(pipe) -> str:
+    # Best-effort capture of revision/hash for reproducibility
     for attr in ("hub_revision", "revision", "_commit_hash"):
         val = getattr(getattr(pipe, "model", None), attr, None)
         if val:
         return "unknown"
 # -------- Inference --------
+def transcribe(audio_path: str, language: str):
     """
     Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
     then run it through the chosen ASR pipeline.
     """
     if not audio_path:
         return "⚠️ Please upload or record an audio clip.", None
+    speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
+    duration_s = float(len(speech) / float(sr))
+    pipe = get_asr_pipeline(language)
+    decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
+    t0 = time.time()
+    result = pipe({"sampling_rate": sr, "raw": speech})
+    latency_ms = int((time.time() - t0) * 1000.0)
+    hyp_text = result.get("text", "")
+    rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
+    meta = {
+        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "session_id": f"anon-{uuid.uuid4()}",
+        "language_display": language,
+        "model_id": language_models.get(language, "unknown"),
+        "model_revision": _model_revision_from_pipeline(pipe),
+        "audio_duration_s": duration_s,
+        "sample_rate": sr,
+        "source": "upload",
+        "decode_params": json.dumps(decode_params),
+        "transcript_hyp": hyp_text,
+        "latency_ms": latency_ms,
+        "rtf": rtf,
+    }
+    return hyp_text, meta
+# -------- Feedback submit (minimal) --------
+def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, audio_file_path):
     """
+    Push a minimal row to HF Dataset: model info, language, transcript, optional corrected text, score.
     """
     if not meta:
+        return {"status": "No transcription metadata available. Please transcribe first."}
     row = dict(meta)
     row.update({
         "corrected_text": (corrected_text or "").strip(),
         "score_out_of_10": int(score) if score is not None else None,
         "share_publicly": bool(share_publicly),
     })
     try:
         audio_to_push = audio_file_path if store_audio else None
         hf_status = _push_row_to_hf_dataset(row, audio_to_push)
+        status = f"Feedback saved. {hf_status}"
     except Exception as e:
+        status = f"Failed to push to HF Dataset: {e}"
+    return {
+        "status": status,
+        "latency_ms": row["latency_ms"],
+        "rtf": row["rtf"],
+        "model_id": row["model_id"],
+        "model_revision": row["model_revision"],
+        "language": row["language_display"],
+    }
+# -------- UI (original preserved; additions appended) --------
+with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
+    gr.Markdown(
+        """
+        ## 🎙️ Multilingual Speech-to-Text
+        Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
+        Then choose the language/model and hit **Transcribe**.
+        """
+    )
+    with gr.Row():
+        lang = gr.Dropdown(
+            choices=list(language_models.keys()),
+            value=list(language_models.keys())[0],
+            label="Select Language / Model"
         )
+    with gr.Row():
+        audio = gr.Audio(
+            sources=["upload", "microphone"],
+            type="filepath",
+            label="Upload or Record Audio"
         )
+    btn = gr.Button("Transcribe")
+    output = gr.Textbox(label="Transcription")
+    # Hidden state to carry metadata from transcribe -> feedback
+    meta_state = gr.State(value=None)
+    # Keep original behavior: output shows transcript
+    # Also capture meta into the hidden state
+    def _transcribe_and_store(audio_path, language):
+        hyp, meta = transcribe(audio_path, language)
+        # Pre-fill corrected with hypothesis for easy edits
+        return hyp, meta, hyp
+    # --- Minimal Evaluation (score + optional corrected text) ---
+    with gr.Accordion("Evaluation", open=False):
+        with gr.Row():
+            corrected_tb = gr.Textbox(label="Corrected transcript (optional)", lines=4, value="")
+        with gr.Row():
+            score_slider = gr.Slider(minimum=0, maximum=10, step=1, label="Score out of 10", value=7)
+        with gr.Row():
+            store_audio_cb = gr.Checkbox(label="Allow storing my audio for research/eval", value=False)
+            share_cb = gr.Checkbox(label="Allow sharing this example publicly", value=False)
+        submit_btn = gr.Button("Submit")
+        results_json = gr.JSON(label="Status")
+    # Wire events
+    btn.click(
+        fn=_transcribe_and_store,
+        inputs=[audio, lang],
+        outputs=[output, meta_state, corrected_tb]
+    )
+    submit_btn.click(
+        fn=submit_feedback,
+        inputs=[
+            meta_state,
+            corrected_tb,
+            score_slider,
+            store_audio_cb,
+            share_cb,
+            audio  # raw file path from gr.Audio
+        ],
+        outputs=results_json
+    )
+# Keep Spaces stable under load
 if __name__ == "__main__":
+    demo.queue()
+    demo.launch()