Update app.py
app.py CHANGED
@@ -1,53 +1,219 @@
 import os
+import tempfile
+import math
 import torch
+import soundfile as sf
 from transformers import pipeline
 import gradio as gr
 
+# Optional: pydub helps with splitting arbitrary audio formats (mp3, m4a, etc.)
+from pydub import AudioSegment
+
 MODEL_ID = "EYEDOL/Yoruba-ASRNEW"
 
+# device for the transformers pipeline
 device = 0 if torch.cuda.is_available() else -1
+
+# Create pipeline (automatic-speech-recognition)
 asr = pipeline("automatic-speech-recognition", model=MODEL_ID, device=device)
 
-
+# Utility: write numpy (rate, data) to wav
+def save_numpy_to_wav(np_tuple):
+    samplerate, data = np_tuple
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    sf.write(tmp.name, data, samplerate)
+    return tmp.name
+
+# Utility: return audio duration in seconds (works for file paths)
+def get_duration_seconds(path):
+    try:
+        info = sf.info(path)
+        return info.duration
+    except Exception:
+        # fallback to pydub
+        seg = AudioSegment.from_file(path)
+        return len(seg) / 1000.0
+
+# Split an audio file into chunks (ms). Returns list of (chunk_path, start_ms, end_ms)
+def split_audio_file(path, chunk_length_ms=25000, overlap_ms=500):
+    audio = AudioSegment.from_file(path)
+    duration_ms = len(audio)
+    chunks = []
+    start = 0
+    while start < duration_ms:
+        end = start + chunk_length_ms
+        if end > duration_ms:
+            end = duration_ms
+        chunk = audio[start:end]
+        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        chunk.export(tmp.name, format="wav")
+        chunks.append((tmp.name, start, end))
+        # advance start by chunk_length - overlap
+        start += chunk_length_ms - overlap_ms
+    return chunks
+
+# Transcribe a single file path (wraps the pipeline call). Optionally requests timestamps.
+def transcribe_file(path, return_timestamps=False):
+    if return_timestamps:
+        # some pipelines accept return_timestamps=True and return timestamped chunks;
+        # the exact format varies by library version, so the caller handles the output flexibly.
+        out = asr(path, return_timestamps=True)
+    else:
+        out = asr(path)
+    return out
+
+# Main: handle any input (numpy tuple or path)
+def transcribe(audio_input, allow_longform_with_timestamps=False, chunk_length_seconds=25, overlap_seconds=0.5):
     """
+    audio_input: either a tuple (sr, numpy array) from the gradio mic, or a filepath string from an upload
+    returns: dict with 'full_text' and a 'segments' list of {start_s, end_s, text}
     """
-
+    # Normalize input to a filepath
+    if audio_input is None:
         return "No audio provided."
 
-
-
-
-        import soundfile as sf
-        temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-        sf.write(temp_wav.name, audio[1], audio[0])
-        audio_path = temp_wav.name
+    if isinstance(audio_input, tuple):
+        # Gradio microphone when type="numpy" sends (sample_rate, numpy_array)
+        audio_path = save_numpy_to_wav(audio_input)
     else:
-        audio_path =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        audio_path = audio_input  # uploaded filepath
+
+    # determine duration
+    duration_s = get_duration_seconds(audio_path)
+
+    # If short enough, just transcribe directly
+    if duration_s <= 30:
+        out = transcribe_file(audio_path, return_timestamps=False)
+        text = out["text"] if isinstance(out, dict) else str(out)
+        segments = [{"start_s": 0.0, "end_s": duration_s, "text": text}]
+        full_text = text
+        # cleanup if we created a temp file
+        if isinstance(audio_input, tuple):
+            try:
+                os.unlink(audio_path)
+            except Exception:
+                pass
+        return {"full_text": full_text, "segments": segments}
+
+    # duration > 30s -> handle long audio
+    if allow_longform_with_timestamps:
+        # try calling the pipeline with return_timestamps=True
+        try:
+            out = transcribe_file(audio_path, return_timestamps=True)
+            # out may contain 'text' plus 'chunks' or 'words' with timestamps depending on the
+            # transformers version, so be flexible about the shape.
+            full_text = out.get("text", None)
+            segments = []
+
+            # If the pipeline returned timestamps in 'chunks' or 'words':
+            if isinstance(out, dict):
+                if "chunks" in out and isinstance(out["chunks"], list):
+                    for c in out["chunks"]:
+                        # a chunk may carry 'text' plus 'timestamp' or 'start'/'end'
+                        ts = c.get("timestamp", (None, None))
+                        if isinstance(ts, (list, tuple)) and len(ts) == 2:
+                            start_s, end_s = ts[0], ts[1]
+                        else:
+                            start_s = c.get("start", None)
+                            end_s = c.get("end", None)
+                        segments.append({
+                            "start_s": start_s,
+                            "end_s": end_s,
+                            "text": c.get("text", "")
+                        })
+                elif "words" in out and isinstance(out["words"], list):
+                    # group words into coarse segments (simple approach: one tiny segment per word)
+                    for w in out["words"]:
+                        segments.append({
+                            "start_s": w.get("start", None),
+                            "end_s": w.get("end", None),
+                            "text": w.get("word", "")
+                        })
+                else:
+                    # fallback: no structured chunks — return whole text as single segment
+                    if full_text is None:
+                        full_text = str(out)
+                    segments = [{"start_s": 0.0, "end_s": duration_s, "text": full_text}]
+            else:
+                # pipeline returned a string or something else
+                full_text = str(out)
+                segments = [{"start_s": 0.0, "end_s": duration_s, "text": full_text}]
+
+            if isinstance(audio_input, tuple):
+                try:
+                    os.unlink(audio_path)
+                except Exception:
+                    pass
+            return {"full_text": full_text, "segments": segments}
+        except Exception as e:
+            # Fall back to chunking if long-form timestamps fail
+            print("Long-form timestamps failed, falling back to chunking:", e)
+
+    # Default: chunking approach
+    chunk_length_ms = int(chunk_length_seconds * 1000)
+    overlap_ms = int(overlap_seconds * 1000)
+
+    chunks = split_audio_file(audio_path, chunk_length_ms=chunk_length_ms, overlap_ms=overlap_ms)
+    stitched_texts = []
+    segments = []
+    for chunk_path, start_ms, end_ms in chunks:
+        try:
+            out = transcribe_file(chunk_path, return_timestamps=False)
+            text = out["text"] if isinstance(out, dict) else str(out)
+        except Exception as e:
+            text = f"[ERROR transcribing chunk: {e}]"
+
+        start_s = start_ms / 1000.0
+        end_s = end_ms / 1000.0
+        segments.append({"start_s": start_s, "end_s": end_s, "text": text})
+        stitched_texts.append(text)
+
+        # cleanup chunk file
+        try:
+            os.unlink(chunk_path)
+        except Exception:
+            pass
+
+    # cleanup original temp if microphone
+    if isinstance(audio_input, tuple):
+        try:
+            os.unlink(audio_path)
+        except Exception:
+            pass
+
+    full_text = " ".join([s for s in stitched_texts if s])
+    return {"full_text": full_text, "segments": segments}
+
+# Gradio UI
+with gr.Blocks(title="Yoruba ASR — long audio ready") as demo:
+    gr.Markdown("## Yoruba ASR — Upload or use microphone. Supports long audio via chunking or long-form timestamps 🎧")
+
+    with gr.Row():
+        with gr.Column():
+            mic = gr.Audio(label="Record from mic (use 'Record' then 'Stop')", type="numpy")
+            upload = gr.Audio(label="Or upload audio file", type="filepath")
+            mode = gr.Radio(choices=["Use microphone input", "Use uploaded file"], value="Use microphone input", label="Input source")
+            longform_checkbox = gr.Checkbox(label="Try model's long-form timestamps (may be supported by some Whisper forks)", value=False)
+            chunk_len = gr.Slider(minimum=10, maximum=60, value=25, step=5, label="Chunk length (seconds) — used when chunking")
+            overlap = gr.Slider(minimum=0, maximum=5, value=0.5, step=0.5, label="Chunk overlap (seconds)")
+            transcribe_btn = gr.Button("Transcribe")
+        with gr.Column():
+            full_text_out = gr.Textbox(label="Full transcription", lines=8)
+            segments_out = gr.JSON(label="Segments (start_s, end_s, text)")
+
+    def handle_transcription(mic_input, upload_input, mode_choice, use_longform, chunk_len_s, overlap_s):
+        audio_src = mic_input if mode_choice == "Use microphone input" else upload_input
+        res = transcribe(audio_src, allow_longform_with_timestamps=use_longform, chunk_length_seconds=chunk_len_s, overlap_seconds=overlap_s)
+        if isinstance(res, str):
+            return res, []
+        return res["full_text"], res["segments"]
+
+    transcribe_btn.click(fn=handle_transcription, inputs=[mic, upload, mode, longform_checkbox, chunk_len, overlap], outputs=[full_text_out, segments_out])
+
+    gr.Markdown("**Notes:**\n\n"
+                "- Chunking is robust and recommended if you experience errors. Default chunk length is 25s with 0.5s overlap.\n"
+                "- If you enable long-form timestamps, the pipeline will attempt `return_timestamps=True` and return timestamps if the model supports it.\n"
+                "- Ensure your Space has enough compute (GPU recommended) for faster transcription.")
 
 if __name__ == "__main__":
     demo.launch()
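
Dependency note: besides torch, transformers, and gradio, the updated file imports soundfile and pydub, and pydub shells out to ffmpeg when decoding or splitting non-wav formats such as mp3 or m4a. A minimal sketch of the two dependency files a Hugging Face Space conventionally reads (assumed contents; these files are not part of this commit):

requirements.txt
    torch
    transformers
    gradio
    soundfile
    pydub

packages.txt
    ffmpeg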
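
Why the overlap math works: split_audio_file advances the window by chunk_length_ms - overlap_ms, so consecutive chunks share overlap_ms of audio, and the last chunk is clamped to the end of the file. A self-contained sketch of just the boundary arithmetic (chunk_bounds is a hypothetical helper, not part of app.py), using the UI defaults of 25 s chunks and 0.5 s overlap on a hypothetical 60 s file:

# Mirrors the loop in split_audio_file without touching any audio.
def chunk_bounds(duration_ms, chunk_length_ms=25000, overlap_ms=500):
    bounds, start = [], 0
    while start < duration_ms:
        bounds.append((start, min(start + chunk_length_ms, duration_ms)))
        start += chunk_length_ms - overlap_ms
    return bounds

print(chunk_bounds(60000))
# [(0, 25000), (24500, 49500), (49000, 60000)]

Because the stitched transcript joins chunk texts with spaces, words that fall inside an overlap window can appear twice; deduplicating across the seams would be a natural follow-up improvement.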
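
For reference, transcribe returns either the bare string "No audio provided." or a dict with full_text and segments; in the chunking path every segment carries float start_s/end_s values. A local smoke test, assuming a sample.wav sits next to app.py (hypothetical file; also note that importing app loads the model and builds the UI, so this is only practical on a machine that can hold the model):

from app import transcribe

res = transcribe("sample.wav")  # hypothetical local file
if isinstance(res, str):
    print(res)  # the "No audio provided." path
else:
    print(res["full_text"])
    for seg in res["segments"]:
        # each segment: {"start_s": float, "end_s": float, "text": str}
        print(f"{seg['start_s']:.1f}s-{seg['end_s']:.1f}s  {seg['text']}")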