WJ88 committed on
Commit
4e726f3
·
verified ·
1 Parent(s): 76ac8d9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -0
app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import nemo.collections.asr as nemo_asr
3
+ import numpy as np
4
+ from pydub import AudioSegment
5
+ from pydub.silence import detect_silence
6
+ import warnings
7
+ import torch
8
+
9
# Suppress noisy library warnings (NeMo/torch emit many deprecation warnings
# that would clutter the Space logs).
warnings.filterwarnings("ignore")

# Global model handle; populated lazily by load_model() on first use so the
# app starts fast and the checkpoint is only downloaded when needed.
model = None
13
+
14
def load_model():
    """Lazily instantiate the Parakeet ASR model (CPU, eval mode).

    The model is created once and cached in the module-level ``model``
    global; subsequent calls return the cached instance.
    """
    global model
    if model is not None:
        return model
    model = nemo_asr.models.ASRModel.from_pretrained(
        model_name="nvidia/parakeet-tdt-0.6b-v3",
        map_location="cpu"
    )
    model.eval()
    return model
23
+
24
class TranscriptionState:
    """Per-session state: a rolling audio buffer plus the transcript so far."""

    def __init__(self):
        # Transcript text accumulated so far for this session.
        self.text = ""
        # No audio yet; becomes a pydub AudioSegment once audio arrives.
        self.buffer = None
28
+
29
def transcribe_segment(segment_array: np.ndarray):
    """Transcribe a normalized mono waveform; return the first hypothesis."""
    asr = load_model()
    # Inference only: no gradients, and keep NeMo's warnings quiet.
    with torch.no_grad(), warnings.catch_warnings():
        warnings.simplefilter("ignore")
        results = asr.transcribe([segment_array])
    return results[0]
36
+
37
def process_live_audio(audio, state):
    """Process a chunk of live microphone audio with VAD-based flushing.

    The chunk is appended to the session buffer; whenever a pause is
    detected, the completed speech is transcribed and APPENDED to the
    transcript (the original overwrote ``state.text`` on every flush,
    silently dropping everything said before the latest pause). The buffer
    is bounded so memory stays flat over long sessions.

    Parameters: ``audio`` — waveform chunk, ``state`` — TranscriptionState.
    Returns the transcript so far and the (mutated) session state.
    """
    if audio is None or len(audio) == 0:
        # Fix: return the existing transcript rather than "" so an empty
        # chunk does not blank the textbox.
        return state.text, state

    # NOTE(review): assumes `audio` is float in [-1, 1] at 16 kHz mono.
    # Gradio's numpy audio component may instead deliver (sample_rate,
    # int16-array) — confirm upstream before relying on this scaling.
    audio_int16 = (audio * 32767).astype(np.int16)
    chunk = AudioSegment(
        data=audio_int16.tobytes(),
        frame_rate=16000,
        sample_width=2,
        channels=1
    )

    # Accumulate the new chunk onto the rolling buffer.
    state.buffer = chunk if state.buffer is None else state.buffer + chunk

    # Bound the buffer: past 60 s, flush everything except the most recent
    # 30 s. Transcribe ONLY the portion being dropped — the original
    # transcribed the whole buffer and then kept the last 30 s, so that
    # tail was transcribed a second time at the next pause flush.
    max_duration_ms = 60000   # hard cap on buffered audio (ms)
    keep_ms = 30000           # tail retained for ongoing context (ms)
    if len(state.buffer) > max_duration_ms:  # len(AudioSegment) is in ms
        dropped = state.buffer[:-keep_ms]
        dropped_array = np.array(dropped.get_array_of_samples(), dtype=np.float32) / 32767.0
        state.text = (state.text + " " + str(transcribe_segment(dropped_array))).strip()
        state.buffer = state.buffer[-keep_ms:]

    # VAD: find silence windows in the current buffer.
    silent_windows = detect_silence(
        state.buffer,
        min_silence_len=500,  # 0.5s pause ends an utterance
        silence_thresh=-40    # dB threshold
    )

    if silent_windows:
        last_silence_end = silent_windows[-1][1]
        if last_silence_end < len(state.buffer):
            # Transcribe completed speech up to the end of the last silence.
            segment = state.buffer[:last_silence_end]
            segment_array = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32767.0
            # NOTE(review): str() coercion — newer NeMo returns Hypothesis
            # objects rather than plain strings; may need `.text` instead.
            partial_text = str(transcribe_segment(segment_array))
            # Fix: append instead of overwrite so earlier speech survives.
            state.text = (state.text + " " + partial_text).strip()
            # Keep the unflushed tail as the new buffer.
            state.buffer = state.buffer[last_silence_end:]

    return state.text, state
85
+
86
def transcribe_file(audio):
    """Batch-transcribe an uploaded audio clip.

    Accepts either a raw waveform ndarray (the original contract) or the
    ``(sample_rate, samples)`` tuple that Gradio's ``type="numpy"`` audio
    component actually emits; integer PCM is normalized to float32 in
    [-1, 1] and multi-channel audio is downmixed to mono.

    Returns the first ASR hypothesis, or "" when there is no input.
    """
    if audio is None:
        return ""
    # Gradio `type="numpy"` delivers (sample_rate, np.ndarray); unwrap it.
    if isinstance(audio, tuple):
        _sample_rate, audio = audio
    audio = np.asarray(audio)
    if audio.size == 0:
        return ""
    # Downmix stereo (or any multi-channel) input to mono.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
    # Normalize integer PCM (Gradio uploads arrive as int16) to float32.
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    asr = load_model()
    with torch.no_grad(), warnings.catch_warnings():
        warnings.simplefilter("ignore")
        output = asr.transcribe([audio])
    # NOTE(review): assumes transcribe() returns a list of strings; newer
    # NeMo versions return Hypothesis objects (use output[0].text) — confirm.
    return output[0]
98
+
99
def clear_session(state):
    """Reset the session: drop buffered audio and accumulated text.

    Fix: the Clear button is wired with two outputs ``[output_text, state]``
    but the original returned only ``""``, so the outputs were mismatched.
    Now returns the cleared textbox value AND the state.
    """
    state.buffer = None
    state.text = ""
    return "", state
104
+
105
# Gradio UI: two tabs — live microphone streaming and one-shot file upload.
with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
    gr.Markdown(
        """
        # NVIDIA Parakeet-TDT 0.6B v3 Real-Time Transcription
        Speak into your microphone for live multilingual transcription. Updates on pauses. Clear to start over.
        Supports 25 European languages automatically. Optimized for CPU.
        """
    )

    with gr.Tab("Live Microphone"):
        # Per-session state (rolling audio buffer + accumulated transcript).
        state = gr.State(TranscriptionState())
        # NOTE(review): `source=` and `live=` are Gradio 3.x arguments;
        # Gradio 4+ renamed them to `sources=[...]` / `streaming=True` —
        # confirm the pinned Gradio version.
        audio_input = gr.Audio(
            source="microphone",
            type="numpy",
            live=True,
            label="Speak now..."
        )
        output_text = gr.Textbox(
            label="Live Transcription",
            lines=10,
            interactive=False
        )
        clear_btn = gr.Button("Clear Session")

        # Re-run transcription whenever the mic component emits new audio.
        audio_input.change(
            process_live_audio,
            inputs=[audio_input, state],
            outputs=[output_text, state]
        )
        # NOTE(review): two outputs are wired here — confirm clear_session
        # returns a ("", state) pair to match.
        clear_btn.click(
            clear_session,
            inputs=state,
            outputs=[output_text, state]
        )

    with gr.Tab("File Upload"):
        file_input = gr.Audio(source="upload", type="numpy")
        file_output = gr.Textbox(label="File Transcription", lines=10)
        transcribe_btn = gr.Button("Transcribe File")
        transcribe_btn.click(
            transcribe_file,
            inputs=file_input,
            outputs=file_output
        )

    gr.Markdown(
        """
        **Notes:** For best results, speak clearly with short pauses. Long sessions (>1 min) may require clearing to maintain speed.
        """
    )

# Launch the Gradio server when run as a script.
if __name__ == "__main__":
    demo.launch()