WJ88 committed on
Commit
16e01ac
·
verified ·
1 Parent(s): 6079a5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -39
app.py CHANGED
@@ -6,6 +6,8 @@ from pydub.silence import detect_silence
6
  import warnings
7
  import torch
8
  import logging
 
 
9
 
10
  warnings.filterwarnings("ignore")
11
  logging.getLogger("nemo").setLevel(logging.ERROR) # Suppress NeMo logs
@@ -25,7 +27,7 @@ def load_model():
25
 
26
  class TranscriptionState:
27
  def __init__(self):
28
- self.buffer = None
29
  self.text = ""
30
 
31
  def transcribe_segment(segment_array: np.ndarray):
@@ -36,24 +38,22 @@ def transcribe_segment(segment_array: np.ndarray):
36
  output = model.transcribe([segment_array])
37
  return output[0]
38
 
39
- def process_live_audio(audio, state: TranscriptionState):
40
- """Process live mic audio chunks with VAD and buffer management."""
41
- if audio is None or len(audio) == 0:
42
  return state.text, state
43
 
44
- sr, audio_data = audio # Unpack Gradio tuple (sr, data)
45
- if sr != 16000:
46
- # Basic resampling placeholder; use librosa if needed
47
- warnings.warn(f"Unexpected SR {sr}; assuming 16000")
48
-
49
- # Convert to int16 for pydub VAD
50
- audio_int16 = (audio_data * 32767).astype(np.int16)
51
- new_segment = AudioSegment(
52
- data=audio_int16.tobytes(),
53
- frame_rate=16000,
54
- sample_width=2,
55
- channels=1
56
- )
57
 
58
  # Append to buffer
59
  if state.buffer is None:
@@ -62,10 +62,9 @@ def process_live_audio(audio, state: TranscriptionState):
62
  state.buffer += new_segment
63
 
64
  # Trim buffer to prevent accumulation (keep last 60s)
65
- max_duration_ms = 60000
66
  if state.buffer.duration_seconds > 60:
67
  # Re-transcribe full current buffer before trimming
68
- full_array = np.array(state.buffer.get_array_of_samples(), dtype=np.float32) / 32767.0
69
  state.text = transcribe_segment(full_array)
70
  # Trim to last 30s for ongoing buffer
71
  state.buffer = state.buffer[-30000:]
@@ -82,7 +81,7 @@ def process_live_audio(audio, state: TranscriptionState):
82
  if last_silence_end < len(state.buffer):
83
  # Transcribe up to end of last silence
84
  segment = state.buffer[:last_silence_end]
85
- segment_array = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32767.0
86
  partial_text = transcribe_segment(segment_array)
87
  state.text = partial_text
88
  # Keep remaining as buffer
@@ -90,15 +89,16 @@ def process_live_audio(audio, state: TranscriptionState):
90
 
91
  return state.text, state
92
 
93
- def transcribe_file(audio):
94
- """Batch transcribe uploaded file."""
95
- if audio is None:
96
  return ""
97
- sr, audio_data = audio # Unpack tuple
98
- if sr != 16000:
99
- warnings.warn(f"Unexpected SR {sr}; assuming 16000")
100
- if len(audio_data.shape) > 1:
101
- audio_data = np.mean(audio_data, axis=1)
 
102
  load_model()
103
  with torch.no_grad(), warnings.catch_warnings():
104
  warnings.simplefilter("ignore")
@@ -111,13 +111,12 @@ def clear_session(state: TranscriptionState):
111
  state.text = ""
112
  return "", state
113
 
114
- # Gradio UI
115
  with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
116
  gr.Markdown(
117
  """
118
  # NVIDIA Parakeet-TDT 0.6B v3 Real-Time Transcription
119
- Speak into your microphone for live multilingual transcription. Updates on pauses. Clear to start over.
120
- Supports 25 European languages automatically. Optimized for CPU.
121
  """
122
  )
123
 
@@ -125,22 +124,23 @@ with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
125
  state = gr.State(TranscriptionState())
126
  audio_input = gr.Audio(
127
  sources=["microphone"],
128
- type="numpy",
129
  streaming=True,
130
- label="Speak now..."
131
  )
132
  output_text = gr.Textbox(
133
  label="Live Transcription",
134
  lines=10,
135
  interactive=False
136
  )
137
- clear_btn = gr.Button("Clear Session")
138
 
139
- # Live updates on chunks
140
  audio_input.change(
141
  process_live_audio,
142
  inputs=[audio_input, state],
143
- outputs=[output_text, state]
 
144
  )
145
  clear_btn.click(
146
  clear_session,
@@ -149,7 +149,7 @@ with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
149
  )
150
 
151
  with gr.Tab("File Upload"):
152
- file_input = gr.Audio(sources=["upload"], type="numpy")
153
  file_output = gr.Textbox(label="File Transcription", lines=10)
154
  transcribe_btn = gr.Button("Transcribe File")
155
  transcribe_btn.click(
@@ -160,9 +160,9 @@ with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
160
 
161
  gr.Markdown(
162
  """
163
- **Notes:** For best results, speak clearly with short pauses. Long sessions (>1 min) may require clearing to maintain speed.
164
  """
165
  )
166
 
167
  if __name__ == "__main__":
168
- demo.launch()
 
6
  import warnings
7
  import torch
8
  import logging
9
+ import io
10
+ import librosa
11
 
12
  warnings.filterwarnings("ignore")
13
  logging.getLogger("nemo").setLevel(logging.ERROR) # Suppress NeMo logs
 
27
 
28
  class TranscriptionState:
29
  def __init__(self):
30
+ self.buffer = None # AudioSegment
31
  self.text = ""
32
 
33
  def transcribe_segment(segment_array: np.ndarray):
 
38
  output = model.transcribe([segment_array])
39
  return output[0]
40
 
41
+ def process_live_audio(chunk_bytes, state: TranscriptionState):
42
+ """Process live mic PCM bytes chunk with VAD and buffer management."""
43
+ if chunk_bytes is None or len(chunk_bytes) == 0:
44
  return state.text, state
45
 
46
+ # Create AudioSegment from raw PCM bytes (16kHz mono int16)
47
+ try:
48
+ new_segment = AudioSegment(
49
+ data=chunk_bytes,
50
+ frame_rate=16000,
51
+ sample_width=2,
52
+ channels=1
53
+ )
54
+ except Exception as e:
55
+ print(f"Chunk creation error: {e}")
56
+ return state.text, state
 
 
57
 
58
  # Append to buffer
59
  if state.buffer is None:
 
62
  state.buffer += new_segment
63
 
64
  # Trim buffer to prevent accumulation (keep last 60s)
 
65
  if state.buffer.duration_seconds > 60:
66
  # Re-transcribe full current buffer before trimming
67
+ full_array = np.array(state.buffer.get_array_of_samples(), dtype=np.float32) / 32768.0
68
  state.text = transcribe_segment(full_array)
69
  # Trim to last 30s for ongoing buffer
70
  state.buffer = state.buffer[-30000:]
 
81
  if last_silence_end < len(state.buffer):
82
  # Transcribe up to end of last silence
83
  segment = state.buffer[:last_silence_end]
84
+ segment_array = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32768.0
85
  partial_text = transcribe_segment(segment_array)
86
  state.text = partial_text
87
  # Keep remaining as buffer
 
89
 
90
  return state.text, state
91
 
92
+ def transcribe_file(audio_path):
93
+ """Batch transcribe uploaded file path."""
94
+ if audio_path is None or not os.path.exists(audio_path):
95
  return ""
96
+ try:
97
+ audio_data, sr = librosa.load(audio_path, sr=16000, mono=True)
98
+ if len(audio_data) == 0:
99
+ return ""
100
+ except Exception:
101
+ return "Error loading file."
102
  load_model()
103
  with torch.no_grad(), warnings.catch_warnings():
104
  warnings.simplefilter("ignore")
 
111
  state.text = ""
112
  return "", state
113
 
114
+ # Gradio UI with Blocks for tabs
115
  with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
116
  gr.Markdown(
117
  """
118
  # NVIDIA Parakeet-TDT 0.6B v3 Real-Time Transcription
119
+ Speak continuously into the microphone—transcription updates live on natural pauses (0.5s+). Supports 25 European languages automatically. Optimized for CPU.
 
120
  """
121
  )
122
 
 
124
  state = gr.State(TranscriptionState())
125
  audio_input = gr.Audio(
126
  sources=["microphone"],
127
+ type="bytes",
128
  streaming=True,
129
+ label="Speak now—updates on pauses"
130
  )
131
  output_text = gr.Textbox(
132
  label="Live Transcription",
133
  lines=10,
134
  interactive=False
135
  )
136
+ clear_btn = gr.Button("Clear Session", variant="secondary")
137
 
138
+ # Stream updates on each chunk
139
  audio_input.change(
140
  process_live_audio,
141
  inputs=[audio_input, state],
142
+ outputs=[output_text, state],
143
+ show_progress=False # Avoid UI flicker during fast chunks
144
  )
145
  clear_btn.click(
146
  clear_session,
 
149
  )
150
 
151
  with gr.Tab("File Upload"):
152
+ file_input = gr.Audio(sources=["upload"], type="filepath")
153
  file_output = gr.Textbox(label="File Transcription", lines=10)
154
  transcribe_btn = gr.Button("Transcribe File")
155
  transcribe_btn.click(
 
160
 
161
  gr.Markdown(
162
  """
163
+ **Tips:** Speak clearly with brief pauses for instant updates. Long monologues auto-update every 60s. Clear resets buffer for fresh starts.
164
  """
165
  )
166
 
167
  if __name__ == "__main__":
168
+ demo.launch(share=False, debug=True)