LT4Ryan committed on
Commit
3c9c761
·
1 Parent(s): 7d3fe20

big change commit

Browse files
.AudioDog/Include/site/python3.11/greenlet/greenlet.h ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* -*- indent-tabs-mode: nil; tab-width: 4; -*- */
2
+
3
+ /* Greenlet object interface */
4
+
5
+ #ifndef Py_GREENLETOBJECT_H
6
+ #define Py_GREENLETOBJECT_H
7
+
8
+
9
+ #include <Python.h>
10
+
11
+ #ifdef __cplusplus
12
+ extern "C" {
13
+ #endif
14
+
15
+ /* This is deprecated and undocumented. It does not change. */
16
+ #define GREENLET_VERSION "1.0.0"
17
+
18
+ #ifndef GREENLET_MODULE
19
+ #define implementation_ptr_t void*
20
+ #endif
21
+
22
+ typedef struct _greenlet {
23
+ PyObject_HEAD
24
+ PyObject* weakreflist;
25
+ PyObject* dict;
26
+ implementation_ptr_t pimpl;
27
+ } PyGreenlet;
28
+
29
+ #define PyGreenlet_Check(op) (op && PyObject_TypeCheck(op, &PyGreenlet_Type))
30
+
31
+
32
+ /* C API functions */
33
+
34
+ /* Total number of symbols that are exported */
35
+ #define PyGreenlet_API_pointers 12
36
+
37
+ #define PyGreenlet_Type_NUM 0
38
+ #define PyExc_GreenletError_NUM 1
39
+ #define PyExc_GreenletExit_NUM 2
40
+
41
+ #define PyGreenlet_New_NUM 3
42
+ #define PyGreenlet_GetCurrent_NUM 4
43
+ #define PyGreenlet_Throw_NUM 5
44
+ #define PyGreenlet_Switch_NUM 6
45
+ #define PyGreenlet_SetParent_NUM 7
46
+
47
+ #define PyGreenlet_MAIN_NUM 8
48
+ #define PyGreenlet_STARTED_NUM 9
49
+ #define PyGreenlet_ACTIVE_NUM 10
50
+ #define PyGreenlet_GET_PARENT_NUM 11
51
+
52
+ #ifndef GREENLET_MODULE
53
+ /* This section is used by modules that uses the greenlet C API */
54
+ static void** _PyGreenlet_API = NULL;
55
+
56
+ # define PyGreenlet_Type \
57
+ (*(PyTypeObject*)_PyGreenlet_API[PyGreenlet_Type_NUM])
58
+
59
+ # define PyExc_GreenletError \
60
+ ((PyObject*)_PyGreenlet_API[PyExc_GreenletError_NUM])
61
+
62
+ # define PyExc_GreenletExit \
63
+ ((PyObject*)_PyGreenlet_API[PyExc_GreenletExit_NUM])
64
+
65
+ /*
66
+ * PyGreenlet_New(PyObject *args)
67
+ *
68
+ * greenlet.greenlet(run, parent=None)
69
+ */
70
+ # define PyGreenlet_New \
71
+ (*(PyGreenlet * (*)(PyObject * run, PyGreenlet * parent)) \
72
+ _PyGreenlet_API[PyGreenlet_New_NUM])
73
+
74
+ /*
75
+ * PyGreenlet_GetCurrent(void)
76
+ *
77
+ * greenlet.getcurrent()
78
+ */
79
+ # define PyGreenlet_GetCurrent \
80
+ (*(PyGreenlet * (*)(void)) _PyGreenlet_API[PyGreenlet_GetCurrent_NUM])
81
+
82
+ /*
83
+ * PyGreenlet_Throw(
84
+ * PyGreenlet *greenlet,
85
+ * PyObject *typ,
86
+ * PyObject *val,
87
+ * PyObject *tb)
88
+ *
89
+ * g.throw(...)
90
+ */
91
+ # define PyGreenlet_Throw \
92
+ (*(PyObject * (*)(PyGreenlet * self, \
93
+ PyObject * typ, \
94
+ PyObject * val, \
95
+ PyObject * tb)) \
96
+ _PyGreenlet_API[PyGreenlet_Throw_NUM])
97
+
98
+ /*
99
+ * PyGreenlet_Switch(PyGreenlet *greenlet, PyObject *args)
100
+ *
101
+ * g.switch(*args, **kwargs)
102
+ */
103
+ # define PyGreenlet_Switch \
104
+ (*(PyObject * \
105
+ (*)(PyGreenlet * greenlet, PyObject * args, PyObject * kwargs)) \
106
+ _PyGreenlet_API[PyGreenlet_Switch_NUM])
107
+
108
+ /*
109
+ * PyGreenlet_SetParent(PyObject *greenlet, PyObject *new_parent)
110
+ *
111
+ * g.parent = new_parent
112
+ */
113
+ # define PyGreenlet_SetParent \
114
+ (*(int (*)(PyGreenlet * greenlet, PyGreenlet * nparent)) \
115
+ _PyGreenlet_API[PyGreenlet_SetParent_NUM])
116
+
117
+ /*
118
+ * PyGreenlet_GetParent(PyObject* greenlet)
119
+ *
120
+ * return greenlet.parent;
121
+ *
122
+ * This could return NULL even if there is no exception active.
123
+ * If it does not return NULL, you are responsible for decrementing the
124
+ * reference count.
125
+ */
126
+ # define PyGreenlet_GetParent \
127
+ (*(PyGreenlet* (*)(PyGreenlet*)) \
128
+ _PyGreenlet_API[PyGreenlet_GET_PARENT_NUM])
129
+
130
+ /*
131
+ * deprecated, undocumented alias.
132
+ */
133
+ # define PyGreenlet_GET_PARENT PyGreenlet_GetParent
134
+
135
+ # define PyGreenlet_MAIN \
136
+ (*(int (*)(PyGreenlet*)) \
137
+ _PyGreenlet_API[PyGreenlet_MAIN_NUM])
138
+
139
+ # define PyGreenlet_STARTED \
140
+ (*(int (*)(PyGreenlet*)) \
141
+ _PyGreenlet_API[PyGreenlet_STARTED_NUM])
142
+
143
+ # define PyGreenlet_ACTIVE \
144
+ (*(int (*)(PyGreenlet*)) \
145
+ _PyGreenlet_API[PyGreenlet_ACTIVE_NUM])
146
+
147
+
148
+
149
+
150
+ /* Macro that imports greenlet and initializes C API */
151
+ /* NOTE: This has actually moved to ``greenlet._greenlet._C_API``, but we
152
+ keep the older definition to be sure older code that might have a copy of
153
+ the header still works. */
154
+ # define PyGreenlet_Import() \
155
+ { \
156
+ _PyGreenlet_API = (void**)PyCapsule_Import("greenlet._C_API", 0); \
157
+ }
158
+
159
+ #endif /* GREENLET_MODULE */
160
+
161
+ #ifdef __cplusplus
162
+ }
163
+ #endif
164
+ #endif /* !Py_GREENLETOBJECT_H */
.AudioDog/etc/jupyter/nbconfig/notebook.d/pydeck.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "load_extensions": {
3
+ "pydeck/extension": true
4
+ }
5
+ }
.AudioDog/pyvenv.cfg ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ home = C:\Users\rndav\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0
2
+ include-system-site-packages = false
3
+ version = 3.11.9
4
+ executable = C:\Users\rndav\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe
5
+ command = C:\Users\rndav\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m venv C:\Users\rndav\Documents\GitHub\AudioDog\.AudioDog
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,14 +1,14 @@
1
- ---
2
- title: AudioDog
3
- emoji: 💬
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.0.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: testing out the nvidia parakeet model
12
- ---
13
-
14
  An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
1
+ ---
2
+ title: AudioDog
3
+ emoji: 💬
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.0.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: testing out the nvidia parakeet model
12
+ ---
13
+
14
  An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -1,376 +1,286 @@
1
- from nemo.collections.asr.models import ASRModel
2
- import torch
3
- import gradio as gr
4
- import spaces
5
- import gc
6
- from pathlib import Path
7
- from pydub import AudioSegment
8
- import numpy as np
9
- import os
10
- import tempfile
11
- import gradio.themes as gr_themes
12
- import csv
13
-
14
- device = "cuda" if torch.cuda.is_available() else "cpu"
15
- MODEL_NAME="nvidia/parakeet-tdt-0.6b-v2"
16
-
17
- model = ASRModel.from_pretrained(model_name=MODEL_NAME)
18
- model.eval()
19
-
20
- def get_audio_segment(audio_path, start_second, end_second):
21
- """
22
- Extract a segment of audio from a given audio file.
23
- Parameters:
24
- audio_path (str): Path to the audio file to process
25
- start_second (float): Start time of the segment in seconds
26
- end_second (float): End time of the segment in seconds
27
- Returns:
28
- tuple or None: A tuple containing (frame_rate, samples) where:
29
- - frame_rate (int): The sample rate of the audio
30
- - samples (numpy.ndarray): The audio samples as a numpy array
31
- Returns None if there's an error processing the audio
32
- """
33
- if not audio_path or not Path(audio_path).exists():
34
- print(f"Warning: Audio path '{audio_path}' not found or invalid for clipping.")
35
- return None
36
- try:
37
- start_ms = int(start_second * 1000)
38
- end_ms = int(end_second * 1000)
39
-
40
- start_ms = max(0, start_ms)
41
- if end_ms <= start_ms:
42
- print(f"Warning: End time ({end_second}s) is not after start time ({start_second}s). Adjusting end time.")
43
- end_ms = start_ms + 100
44
-
45
- audio = AudioSegment.from_file(audio_path)
46
- clipped_audio = audio[start_ms:end_ms]
47
-
48
- samples = np.array(clipped_audio.get_array_of_samples())
49
- if clipped_audio.channels == 2:
50
- samples = samples.reshape((-1, 2)).mean(axis=1).astype(samples.dtype)
51
-
52
- frame_rate = clipped_audio.frame_rate
53
- if frame_rate <= 0:
54
- print(f"Warning: Invalid frame rate ({frame_rate}) detected for clipped audio.")
55
- frame_rate = audio.frame_rate
56
-
57
- if samples.size == 0:
58
- print(f"Warning: Clipped audio resulted in empty samples array ({start_second}s to {end_second}s).")
59
- return None
60
-
61
- return (frame_rate, samples)
62
- except FileNotFoundError:
63
- print(f"Error: Audio file not found at path: {audio_path}")
64
- return None
65
- except Exception as e:
66
- print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
67
- return None
68
-
69
- @spaces.GPU
70
- def get_transcripts_and_raw_times(audio_path):
71
- """
72
- Transcribe an audio file and generate timestamps for each segment.
73
- Parameters:
74
- audio_path (str): Path to the audio file to transcribe
75
- Returns:
76
- tuple: A tuple containing:
77
- - vis_data (list): List of [start, end, text] for visualization
78
- - raw_times_data (list): List of [start, end] timestamps
79
- - audio_path (str): Path to the processed audio file
80
- - button_update (gr.DownloadButton): Gradio button component for CSV download
81
- Notes:
82
- - Automatically handles audio preprocessing (resampling to 16kHz, mono conversion)
83
- - Uses NVIDIA's Parakeet TDT model for transcription
84
- - Generates a CSV file with transcription results
85
- """
86
- if not audio_path:
87
- gr.Error("No audio file path provided for transcription.", duration=None)
88
- # Return an update to hide the button
89
- return [], [], None, gr.DownloadButton(visible=False)
90
-
91
- vis_data = [["N/A", "N/A", "Processing failed"]]
92
- raw_times_data = [[0.0, 0.0]]
93
- processed_audio_path = None
94
- temp_file = None
95
- csv_file_path = None
96
- original_path_name = Path(audio_path).name
97
-
98
- try:
99
- try:
100
- gr.Info(f"Loading audio: {original_path_name}", duration=2)
101
- audio = AudioSegment.from_file(audio_path)
102
- except Exception as load_e:
103
- gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
104
- # Return an update to hide the button
105
- return [["Error", "Error", "Load failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
106
-
107
- resampled = False
108
- mono = False
109
-
110
- target_sr = 16000
111
- if audio.frame_rate != target_sr:
112
- try:
113
- audio = audio.set_frame_rate(target_sr)
114
- resampled = True
115
- except Exception as resample_e:
116
- gr.Error(f"Failed to resample audio: {resample_e}", duration=None)
117
- # Return an update to hide the button
118
- return [["Error", "Error", "Resample failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
119
-
120
- if audio.channels == 2:
121
- try:
122
- audio = audio.set_channels(1)
123
- mono = True
124
- except Exception as mono_e:
125
- gr.Error(f"Failed to convert audio to mono: {mono_e}", duration=None)
126
- # Return an update to hide the button
127
- return [["Error", "Error", "Mono conversion failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
128
- elif audio.channels > 2:
129
- gr.Error(f"Audio has {audio.channels} channels. Only mono (1) or stereo (2) supported.", duration=None)
130
- # Return an update to hide the button
131
- return [["Error", "Error", f"{audio.channels}-channel audio not supported"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
132
-
133
- if resampled or mono:
134
- try:
135
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
136
- audio.export(temp_file.name, format="wav")
137
- processed_audio_path = temp_file.name
138
- temp_file.close()
139
- transcribe_path = processed_audio_path
140
- info_path_name = f"{original_path_name} (processed)"
141
- except Exception as export_e:
142
- gr.Error(f"Failed to export processed audio: {export_e}", duration=None)
143
- if temp_file and hasattr(temp_file, 'name') and os.path.exists(temp_file.name): # Check temp_file has 'name' attribute
144
- os.remove(temp_file.name)
145
- # Return an update to hide the button
146
- return [["Error", "Error", "Export failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
147
- else:
148
- transcribe_path = audio_path
149
- info_path_name = original_path_name
150
-
151
- try:
152
- model.to(device)
153
- gr.Info(f"Transcribing {info_path_name} on {device}...", duration=2)
154
- output = model.transcribe([transcribe_path], timestamps=True)
155
-
156
- if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
157
- gr.Error("Transcription failed or produced unexpected output format.", duration=None)
158
- # Return an update to hide the button
159
- return [["Error", "Error", "Transcription Format Issue"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
160
-
161
- segment_timestamps = output[0].timestamp['segment']
162
- csv_headers = ["Start (s)", "End (s)", "Segment"]
163
- vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
164
- raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
165
-
166
- # Default button update (hidden) in case CSV writing fails
167
- button_update = gr.DownloadButton(visible=False)
168
- try:
169
- temp_csv_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode='w', newline='', encoding='utf-8')
170
- writer = csv.writer(temp_csv_file)
171
- writer.writerow(csv_headers)
172
- writer.writerows(vis_data)
173
- csv_file_path = temp_csv_file.name
174
- temp_csv_file.close()
175
- print(f"CSV transcript saved to temporary file: {csv_file_path}")
176
- # If CSV is saved, create update to show button with path
177
- button_update = gr.DownloadButton(value=csv_file_path, visible=True)
178
- except Exception as csv_e:
179
- gr.Error(f"Failed to create transcript CSV file: {csv_e}", duration=None)
180
- print(f"Error writing CSV: {csv_e}")
181
- # csv_file_path remains None, button_update remains hidden
182
-
183
- gr.Info("Transcription complete.", duration=2)
184
- # Return the data and the button update dictionary
185
- return vis_data, raw_times_data, audio_path, button_update
186
-
187
- except torch.cuda.OutOfMemoryError as e:
188
- error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
189
- print(f"CUDA OutOfMemoryError: {e}")
190
- gr.Error(error_msg, duration=None)
191
- # Return an update to hide the button
192
- return [["OOM", "OOM", error_msg]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
193
-
194
- except FileNotFoundError:
195
- error_msg = f"Audio file for transcription not found: {Path(transcribe_path).name}."
196
- print(f"Error: Transcribe audio file not found at path: {transcribe_path}")
197
- gr.Error(error_msg, duration=None)
198
- # Return an update to hide the button
199
- return [["Error", "Error", "File not found for transcription"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
200
-
201
- except Exception as e:
202
- error_msg = f"Transcription failed: {e}"
203
- print(f"Error during transcription processing: {e}")
204
- gr.Error(error_msg, duration=None)
205
- vis_data = [["Error", "Error", error_msg]]
206
- raw_times_data = [[0.0, 0.0]]
207
- # Return an update to hide the button
208
- return vis_data, raw_times_data, audio_path, gr.DownloadButton(visible=False)
209
- finally:
210
- try:
211
- if 'model' in locals() and hasattr(model, 'cpu'):
212
- if device == 'cuda':
213
- model.cpu()
214
- gc.collect()
215
- if device == 'cuda':
216
- torch.cuda.empty_cache()
217
- except Exception as cleanup_e:
218
- print(f"Error during model cleanup: {cleanup_e}")
219
- gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
220
-
221
- finally:
222
- if processed_audio_path and os.path.exists(processed_audio_path):
223
- try:
224
- os.remove(processed_audio_path)
225
- print(f"Temporary audio file {processed_audio_path} removed.")
226
- except Exception as e:
227
- print(f"Error removing temporary audio file {processed_audio_path}: {e}")
228
-
229
- def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
230
- """
231
- Play a selected segment from the transcription results.
232
- Parameters:
233
- evt (gr.SelectData): Gradio select event containing the index of selected segment
234
- raw_ts_list (list): List of [start, end] timestamps for all segments
235
- current_audio_path (str): Path to the current audio file being processed
236
- Returns:
237
- gr.Audio: Gradio Audio component containing the selected segment for playback
238
- Notes:
239
- - Extracts and plays the audio segment corresponding to the selected transcription
240
- - Returns None if segment extraction fails or inputs are invalid
241
- """
242
- if not isinstance(raw_ts_list, list):
243
- print(f"Warning: raw_ts_list is not a list ({type(raw_ts_list)}). Cannot play segment.")
244
- return gr.Audio(value=None, label="Selected Segment")
245
-
246
- if not current_audio_path:
247
- print("No audio path available to play segment from.")
248
- return gr.Audio(value=None, label="Selected Segment")
249
-
250
- selected_index = evt.index[0]
251
-
252
- if selected_index < 0 or selected_index >= len(raw_ts_list):
253
- print(f"Invalid index {selected_index} selected for list of length {len(raw_ts_list)}.")
254
- return gr.Audio(value=None, label="Selected Segment")
255
-
256
- if not isinstance(raw_ts_list[selected_index], (list, tuple)) or len(raw_ts_list[selected_index]) != 2:
257
- print(f"Warning: Data at index {selected_index} is not in the expected format [start, end].")
258
- return gr.Audio(value=None, label="Selected Segment")
259
-
260
- start_time_s, end_time_s = raw_ts_list[selected_index]
261
-
262
- print(f"Attempting to play segment: {current_audio_path} from {start_time_s:.2f}s to {end_time_s:.2f}s")
263
-
264
- segment_data = get_audio_segment(current_audio_path, start_time_s, end_time_s)
265
-
266
- if segment_data:
267
- print("Segment data retrieved successfully.")
268
- return gr.Audio(value=segment_data, autoplay=True, label=f"Segment: {start_time_s:.2f}s - {end_time_s:.2f}s", interactive=False)
269
- else:
270
- print("Failed to get audio segment data.")
271
- return gr.Audio(value=None, label="Selected Segment")
272
-
273
- article = (
274
- "<p style='font-size: 1.1em;'>"
275
- "This demo showcases <code><a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2'>parakeet-tdt-0.6b-v2</a></code>, a 600-million-parameter model designed for high-quality English speech recognition."
276
- "</p>"
277
- "<p><strong style='color: red; font-size: 1.2em;'>Key Features:</strong></p>"
278
- "<ul style='font-size: 1.1em;'>"
279
- " <li>Automatic punctuation and capitalization</li>"
280
- " <li>Accurate word-level timestamps (click on a segment in the table below to play it!)</li>"
281
- " <li>Efficiently transcribes long audio segments (up to 20 minutes) <small>(For even longer audios, see <a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py' target='_blank'>this script</a>)</small></li>"
282
- " <li>Robust performance on spoken numbers, and song lyrics transcription </li>"
283
- "</ul>"
284
- "<p style='font-size: 1.1em;'>"
285
- "This model is <strong>available for commercial and non-commercial use</strong>."
286
- "</p>"
287
- "<p style='text-align: center;'>"
288
- "<a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2' target='_blank'>🎙️ Learn more about the Model</a> | "
289
- "<a href='https://arxiv.org/abs/2305.05084' target='_blank'>📄 Fast Conformer paper</a> | "
290
- "<a href='https://arxiv.org/abs/2304.06795' target='_blank'>📚 TDT paper</a> | "
291
- "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 NeMo Repository</a>"
292
- "</p>"
293
- )
294
-
295
- examples = [
296
- ["data/example-yt_saTD1u8PorI.mp3"],
297
- ]
298
-
299
- # Define an NVIDIA-inspired theme
300
- nvidia_theme = gr_themes.Default(
301
- primary_hue=gr_themes.Color(
302
- c50="#E6F1D9", # Lightest green
303
- c100="#CEE3B3",
304
- c200="#B5D58C",
305
- c300="#9CC766",
306
- c400="#84B940",
307
- c500="#76B900", # NVIDIA Green
308
- c600="#68A600",
309
- c700="#5A9200",
310
- c800="#4C7E00",
311
- c900="#3E6A00", # Darkest green
312
- c950="#2F5600"
313
- ),
314
- neutral_hue="gray", # Use gray for neutral elements
315
- font=[gr_themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
316
- ).set()
317
-
318
- # Apply the custom theme
319
- with gr.Blocks(theme=nvidia_theme) as demo:
320
- model_display_name = MODEL_NAME.split('/')[-1] if '/' in MODEL_NAME else MODEL_NAME
321
- gr.Markdown(f"<h1 style='text-align: center; margin: 0 auto;'>Speech Transcription with {model_display_name}</h1>")
322
- gr.HTML(article)
323
-
324
- current_audio_path_state = gr.State(None)
325
- raw_timestamps_list_state = gr.State([])
326
-
327
- with gr.Tabs():
328
- with gr.TabItem("Audio File"):
329
- file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
330
- gr.Examples(examples=examples, inputs=[file_input], label="Example Audio Files (Click to Load)")
331
- file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")
332
-
333
- with gr.TabItem("Microphone"):
334
- mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
335
- mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")
336
-
337
- gr.Markdown("---")
338
- gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results (Click row to play segment)</strong></p>")
339
-
340
- # Define the DownloadButton *before* the DataFrame
341
- download_btn = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
342
-
343
- vis_timestamps_df = gr.DataFrame(
344
- headers=["Start (s)", "End (s)", "Segment"],
345
- datatype=["number", "number", "str"],
346
- wrap=True,
347
- label="Transcription Segments"
348
- )
349
-
350
- # selected_segment_player was defined after download_btn previously, keep it after df for layout
351
- selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)
352
-
353
- mic_transcribe_btn.click(
354
- fn=get_transcripts_and_raw_times,
355
- inputs=[mic_input],
356
- outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn],
357
- api_name="transcribe_mic"
358
- )
359
-
360
- file_transcribe_btn.click(
361
- fn=get_transcripts_and_raw_times,
362
- inputs=[file_input],
363
- outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn],
364
- api_name="transcribe_file"
365
- )
366
-
367
- vis_timestamps_df.select(
368
- fn=play_segment,
369
- inputs=[raw_timestamps_list_state, current_audio_path_state],
370
- outputs=[selected_segment_player],
371
- )
372
-
373
- if __name__ == "__main__":
374
- print("Launching Gradio Demo...")
375
- demo.queue()
376
  demo.launch()
 
1
+ from nemo.collections.asr.models import ASRModel
2
+ import torch
3
+ import gradio as gr
4
+ import spaces
5
+ import gc
6
+ from pathlib import Path
7
+ from pydub import AudioSegment
8
+ import numpy as np
9
+ import os
10
+ import tempfile
11
+ import gradio.themes as gr_themes
12
+ import csv
13
+
14
+ device = "cuda" if torch.cuda.is_available() else "cpu"
15
+ MODEL_NAME="nvidia/parakeet-tdt-0.6b-v2"
16
+
17
+ model = ASRModel.from_pretrained(model_name=MODEL_NAME)
18
+ model.eval()
19
+
20
+ def get_audio_segment(audio_path, start_second, end_second):
21
+ """
22
+ Extract a segment of audio from a given audio file.
23
+ Parameters:
24
+ audio_path (str): Path to the audio file to process
25
+ start_second (float): Start time of the segment in seconds
26
+ end_second (float): End time of the segment in seconds
27
+ Returns:
28
+ tuple or None: A tuple containing (frame_rate, samples) where:
29
+ - frame_rate (int): The sample rate of the audio
30
+ - samples (numpy.ndarray): The audio samples as a numpy array
31
+ Returns None if there's an error processing the audio
32
+ """
33
+ if not audio_path or not Path(audio_path).exists():
34
+ print(f"Warning: Audio path '{audio_path}' not found or invalid for clipping.")
35
+ return None
36
+ try:
37
+ start_ms = int(start_second * 1000)
38
+ end_ms = int(end_second * 1000)
39
+
40
+ start_ms = max(0, start_ms)
41
+ if end_ms <= start_ms:
42
+ print(f"Warning: End time ({end_second}s) is not after start time ({start_second}s). Adjusting end time.")
43
+ end_ms = start_ms + 100
44
+
45
+ audio = AudioSegment.from_file(audio_path)
46
+ clipped_audio = audio[start_ms:end_ms]
47
+
48
+ samples = np.array(clipped_audio.get_array_of_samples())
49
+ if clipped_audio.channels == 2:
50
+ samples = samples.reshape((-1, 2)).mean(axis=1).astype(samples.dtype)
51
+
52
+ frame_rate = clipped_audio.frame_rate
53
+ if frame_rate <= 0:
54
+ print(f"Warning: Invalid frame rate ({frame_rate}) detected for clipped audio.")
55
+ frame_rate = audio.frame_rate
56
+
57
+ if samples.size == 0:
58
+ print(f"Warning: Clipped audio resulted in empty samples array ({start_second}s to {end_second}s).")
59
+ return None
60
+
61
+ return (frame_rate, samples)
62
+ except FileNotFoundError:
63
+ print(f"Error: Audio file not found at path: {audio_path}")
64
+ return None
65
+ except Exception as e:
66
+ print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
67
+ return None
68
+
69
@spaces.GPU
def get_transcripts_and_raw_times(audio_path):
    """
    Transcribe an audio file in 5-minute chunks and collect segment timestamps.

    Fix: the original applied ``@spaces.GPU`` twice (stacked duplicate
    decorators), double-wrapping the function; a single application is correct.

    Parameters:
        audio_path (str): Filesystem path of the audio file to transcribe.

    Returns:
        tuple: (vis_data, raw_times_data, audio_path, button_update) where
            - vis_data (list): [start_str, end_str, text] rows for the DataFrame
            - raw_times_data (list): [start_s, end_s] float pairs, one per segment
            - audio_path (str): the input path, kept in state for later playback
            - button_update (gr.DownloadButton): CSV download button state
    """
    if not audio_path:
        gr.Error("No audio file path provided for transcription.", duration=None)
        return [], [], None, gr.DownloadButton(visible=False)

    original_path_name = Path(audio_path).name
    try:
        gr.Info(f"Loading audio: {original_path_name}", duration=2)
        full_audio = AudioSegment.from_file(audio_path)
    except Exception as load_e:
        gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
        return [["Error", "Error", "Load failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)

    # Ensure 16kHz mono — the ASR model expects this input format.
    if full_audio.frame_rate != 16000:
        full_audio = full_audio.set_frame_rate(16000)
    if full_audio.channels != 1:
        full_audio = full_audio.set_channels(1)

    chunk_duration_ms = 5 * 60 * 1000  # 5 minutes in milliseconds
    total_duration_ms = len(full_audio)
    # Ceiling division: count a partial trailing chunk as a full chunk.
    total_chunks = (total_duration_ms + chunk_duration_ms - 1) // chunk_duration_ms

    vis_data = []
    raw_times_data = []

    model.to(device)

    for i, start_ms in enumerate(range(0, total_duration_ms, chunk_duration_ms), start=1):
        end_ms = min(start_ms + chunk_duration_ms, total_duration_ms)
        chunk = full_audio[start_ms:end_ms]

        gr.Info(f"Transcribing chunk {i} of {total_chunks} ({start_ms/1000:.0f}s to {end_ms/1000:.0f}s)...", duration=3)

        # Export the chunk to a temp WAV; NeMo's transcribe() takes file paths.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav:
            chunk.export(temp_wav.name, format="wav")
            temp_wav_path = temp_wav.name

        try:
            output = model.transcribe([temp_wav_path], timestamps=True)
            if not output or not output[0].timestamp or 'segment' not in output[0].timestamp:
                continue

            # Shift chunk-relative timestamps to absolute positions in the file.
            for ts in output[0].timestamp['segment']:
                abs_start = ts['start'] + (start_ms / 1000.0)
                abs_end = ts['end'] + (start_ms / 1000.0)
                vis_data.append([f"{abs_start:.2f}", f"{abs_end:.2f}", ts['segment']])
                raw_times_data.append([abs_start, abs_end])
        except Exception as e:
            gr.Warning(f"Chunk {i} failed: {e}", duration=3)
        finally:
            # Always clean up the temp WAV; guard against a prior removal.
            if os.path.exists(temp_wav_path):
                os.remove(temp_wav_path)

    # Release GPU memory once all chunks are processed.
    model.cpu()
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

    # Generate CSV transcript for the download button.
    button_update = gr.DownloadButton(visible=False)
    try:
        csv_headers = ["Start (s)", "End (s)", "Segment"]
        # Context manager guarantees the handle is closed (and flushed)
        # before the path is handed to the DownloadButton.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode='w', newline='', encoding='utf-8') as temp_csv_file:
            writer = csv.writer(temp_csv_file)
            writer.writerow(csv_headers)
            writer.writerows(vis_data)
            csv_file_path = temp_csv_file.name
        button_update = gr.DownloadButton(value=csv_file_path, visible=True)
    except Exception as csv_e:
        gr.Error(f"Failed to create transcript CSV file: {csv_e}", duration=None)

    gr.Info("Transcription complete.", duration=2)
    return vis_data, raw_times_data, audio_path, button_update
145
+
146
@spaces.GPU
def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
    """
    Build an audio player for the transcript row the user clicked.

    Parameters:
        evt (gr.SelectData): Select event carrying the clicked row index.
        raw_ts_list (list): [start, end] second pairs, one per transcript row.
        current_audio_path (str): Path of the audio file the rows came from.

    Returns:
        gr.Audio: Player loaded with the clipped segment (autoplay enabled),
        or an empty player when validation fails or clipping is unsuccessful.
    """
    # Single empty-player update reused by every failure path below.
    empty_player = gr.Audio(value=None, label="Selected Segment")

    if not isinstance(raw_ts_list, list):
        print(f"Warning: raw_ts_list is not a list ({type(raw_ts_list)}). Cannot play segment.")
        return empty_player

    if not current_audio_path:
        print("No audio path available to play segment from.")
        return empty_player

    row_idx = evt.index[0]

    # Reject out-of-range row indices.
    if row_idx < 0 or row_idx >= len(raw_ts_list):
        print(f"Invalid index {row_idx} selected for list of length {len(raw_ts_list)}.")
        return empty_player

    entry = raw_ts_list[row_idx]
    if not isinstance(entry, (list, tuple)) or len(entry) != 2:
        print(f"Warning: Data at index {row_idx} is not in the expected format [start, end].")
        return empty_player

    start_time_s, end_time_s = entry

    print(f"Attempting to play segment: {current_audio_path} from {start_time_s:.2f}s to {end_time_s:.2f}s")

    # get_audio_segment returns (frame_rate, samples) or None on failure.
    segment_data = get_audio_segment(current_audio_path, start_time_s, end_time_s)

    if not segment_data:
        print("Failed to get audio segment data.")
        return empty_player

    print("Segment data retrieved successfully.")
    return gr.Audio(value=segment_data, autoplay=True, label=f"Segment: {start_time_s:.2f}s - {end_time_s:.2f}s", interactive=False)
190
+
191
# HTML blurb rendered under the page title: model description + feature list.
article = (
    "<p style='font-size: 1.1em;'>"
    "AudioDog uses <code><a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2'>parakeet-tdt-0.6b-v2</a></code>, a 600-million-parameter model designed for high-quality English speech recognition."
    "</p>"
    "<p><strong style='color: red; font-size: 1.2em;'>Key Features:</strong></p>"
    "<ul style='font-size: 1.1em;'>"
    " <li>Automatic punctuation and capitalization</li>"
    " <li>Accurate word-level timestamps (click on a segment in the table below to play it!)</li>"
    " <li>Efficiently transcribes long audio segments by chunking them into smaller segments and stitching them together when done.</li>"
    " <li>MP3 support for audio input and output, works well on downloaded YouTube videos.</li>"
    "</ul>"
)
# Example audio files offered by gr.Examples (paths relative to the app root).
examples = [
    ["data/example-yt_saTD1u8PorI.mp3"],
]
206
+
207
+
208
# Define an NVIDIA-inspired theme
nvidia_theme = gr_themes.Default(
    # Custom green palette centered on NVIDIA's brand green (#76B900).
    primary_hue=gr_themes.Color(
        c50="#E6F1D9",  # Lightest green
        c100="#CEE3B3",
        c200="#B5D58C",
        c300="#9CC766",
        c400="#84B940",
        c500="#76B900",  # NVIDIA Green
        c600="#68A600",
        c700="#5A9200",
        c800="#4C7E00",
        c900="#3E6A00",  # Darkest green
        c950="#2F5600"
    ),
    neutral_hue="gray",  # Use gray for neutral elements
    font=[gr_themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
).set()  # NOTE(review): .set() is called with no overrides — presumably a no-op returning the theme; confirm before removing
226
+
227
# Apply the custom theme
# UI layout: component creation order below determines on-page order.
with gr.Blocks(theme=nvidia_theme) as demo:
    gr.Image("pics/AD.jpg", label="AudioDog Logo", show_label=False)
    # Display only the model name, not the full "org/model" repo id.
    model_display_name = MODEL_NAME.split('/')[-1] if '/' in MODEL_NAME else MODEL_NAME
    gr.Markdown(f"<h1 style='text-align: center; margin: 0 auto;'>AudioDog, powered by {model_display_name}</h1>")
    gr.HTML(article)

    # Per-session state: last-transcribed audio path and its raw timestamps,
    # consumed by play_segment when a transcript row is clicked.
    current_audio_path_state = gr.State(None)
    raw_timestamps_list_state = gr.State([])

    with gr.Tabs():
        with gr.TabItem("Audio File"):
            file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
            gr.Examples(examples=examples, inputs=[file_input], label="Example Audio Files (Click to Load)")
            file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")

        with gr.TabItem("Microphone"):
            mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
            mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")

    gr.Markdown("---")
    gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results (Click row to play segment)</strong></p>")

    # Define the DownloadButton *before* the DataFrame
    download_btn = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)

    vis_timestamps_df = gr.DataFrame(
        headers=["Start (s)", "End (s)", "Segment"],
        datatype=["number", "number", "str"],
        wrap=True,
        label="Transcription Segments"
    )

    # selected_segment_player was defined after download_btn previously, keep it after df for layout
    selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)

    # Both transcribe buttons share the same handler and output targets.
    mic_transcribe_btn.click(
        fn=get_transcripts_and_raw_times,
        inputs=[mic_input],
        outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn],
        api_name="transcribe_mic"
    )

    file_transcribe_btn.click(
        fn=get_transcripts_and_raw_times,
        inputs=[file_input],
        outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn],
        api_name="transcribe_file"
    )

    # Clicking a transcript row clips that segment and loads it in the player.
    vis_timestamps_df.select(
        fn=play_segment,
        inputs=[raw_timestamps_list_state, current_audio_path_state],
        outputs=[selected_segment_player],
    )
283
if __name__ == "__main__":
    print("Launching AudioDog...")
    demo.queue()   # enable request queueing so concurrent users are serialized
    demo.launch()  # start the Gradio server
pics/AD.jpg ADDED
qa.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# qa.py — quick environment sanity check: report the installed torch build
# and whether it can see a CUDA device.
import torch, platform  # NOTE(review): `platform` is imported but never used below — confirm before removing
print("torch:", torch.__version__)
print("torch compiled CUDA:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("Device capability (SM):", torch.cuda.get_device_capability(0)) # e.g., (9, 0)
    print("Torch arch list:", torch.cuda.get_arch_list()) # what the wheel contains
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
- Cython
2
- git+https://github.com/NVIDIA/NeMo.git@r2.3.0#egg=nemo_toolkit[asr]
3
- numpy<2.0
4
- cuda-python>=12.3
5
- gradio>=5.39.0
 
1
+ Cython
2
+ git+https://github.com/NVIDIA/NeMo.git@r2.3.0#egg=nemo_toolkit[asr]
3
+ numpy<2.0