Spaces:

ridgerun-ai
/

parakeet-tdt-0.6b-v2

Running

App Files Community

mgruner committed on May 6

Commit

c913b1a

1 Parent(s): 4704268

Add support for timestamps as well

Browse files

Files changed (3) hide show

pyproject.toml +1 -0
run.py +40 -4
uv.lock +2 -0

pyproject.toml CHANGED Viewed

@@ -8,5 +8,6 @@ dependencies = [
     "gradio>=5.29.0",
     "nemo-toolkit[asr]>=2.2.1",
     "numpy<2.0",
     "scipy>=1.15.2",
 ]

     "gradio>=5.29.0",
     "nemo-toolkit[asr]>=2.2.1",
     "numpy<2.0",
+    "pandas>=2.2.3",
     "scipy>=1.15.2",
 ]

run.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 import nemo.collections.asr as nemo_asr
 import numpy as np
 from scipy import signal
 TARGET_SR = 16_000  # Hz
@@ -80,14 +81,32 @@ def _resample(audio: np.ndarray, rate: int, target_rate: int) -> np.ndarray:
     return resampled
-def _invoke_model(audio: np.ndarray):
     global _model
     if not _model:
         _model = nemo_asr.models.ASRModel.from_pretrained(
             model_name="nvidia/parakeet-tdt-0.6b-v2"
         )
-    return _model.transcribe(audio=audio)[0].text
 def transcribe(audio: tuple[np.ndarray, int] | None):
@@ -96,9 +115,12 @@ def transcribe(audio: tuple[np.ndarray, int] | None):
     rate, data = audio
     data = _to_float32(data)
     data = _resample(data, rate, TARGET_SR)
-    return _invoke_model(data)
 app = gr.Interface(
@@ -108,7 +130,21 @@ app = gr.Interface(
         type="numpy",
         label="Upload or record audio",
     ),
-    outputs=gr.Textbox(label="Transcription", show_copy_button=True),
     title=TITLE,
     description=DESCRIPTION,
 )

 import gradio as gr
 import nemo.collections.asr as nemo_asr
 import numpy as np
+import pandas as pd
 from scipy import signal
 TARGET_SR = 16_000  # Hz
     return resampled
+def _load_model():
     global _model
     if not _model:
         _model = nemo_asr.models.ASRModel.from_pretrained(
             model_name="nvidia/parakeet-tdt-0.6b-v2"
         )
+    return _model
+def _to_pandas(prediction, keyword):
+    return pd.DataFrame(prediction.timestamp[keyword])[
+        [keyword, "start", "end"]
+    ]
+def _invoke_model(model, audio: np.ndarray):
+    prediction = model.transcribe(audio=audio, timestamps=True)[0]
+    text = prediction.text
+    chars = _to_pandas(prediction, "char")
+    words = _to_pandas(prediction, "word")
+    segments = _to_pandas(prediction, "segment")
+    return text, chars, words, segments
 def transcribe(audio: tuple[np.ndarray, int] | None):
     rate, data = audio
+    model = _load_model()
     data = _to_float32(data)
     data = _resample(data, rate, TARGET_SR)
+    text, chars, words, segments = _invoke_model(model, data)
+    return text, segments, words, chars
 app = gr.Interface(
         type="numpy",
         label="Upload or record audio",
     ),
+    outputs=[
+        gr.Textbox(label="Transcription", show_copy_button=True),
+        gr.Dataframe(
+            label="Segments",
+            headers=["Segment", "Start", "End"],
+        ),
+        gr.Dataframe(
+            label="Words",
+            headers=["Word", "Start", "End"],
+        ),
+        gr.Dataframe(
+            label="Characters",
+            headers=["Character", "Start", "End"],
+        ),
+    ],
     title=TITLE,
     description=DESCRIPTION,
 )

uv.lock CHANGED Viewed

@@ -2694,6 +2694,7 @@ dependencies = [
     { name = "gradio" },
     { name = "nemo-toolkit", extra = ["asr"] },
     { name = "numpy" },
     { name = "scipy" },
 ]
@@ -2702,6 +2703,7 @@ requires-dist = [
     { name = "gradio", specifier = ">=5.29.0" },
     { name = "nemo-toolkit", extras = ["asr"], specifier = ">=2.2.1" },
     { name = "numpy", specifier = "<2.0" },
     { name = "scipy", specifier = ">=1.15.2" },
 ]

     { name = "gradio" },
     { name = "nemo-toolkit", extra = ["asr"] },
     { name = "numpy" },
+    { name = "pandas" },
     { name = "scipy" },
 ]
     { name = "gradio", specifier = ">=5.29.0" },
     { name = "nemo-toolkit", extras = ["asr"], specifier = ">=2.2.1" },
     { name = "numpy", specifier = "<2.0" },
+    { name = "pandas", specifier = ">=2.2.3" },
     { name = "scipy", specifier = ">=1.15.2" },
 ]