Scrapyard committed on
Commit
8b3bbb3
·
1 Parent(s): bc075a6

smoother voice experience

Browse files
Files changed (3) hide show
  1. __pycache__/transcriber.cpython-310.pyc +0 -0
  2. app.py +19 -121
  3. transcriber.py +154 -0
__pycache__/transcriber.cpython-310.pyc ADDED
Binary file (4.33 kB). View file
 
app.py CHANGED
@@ -1,123 +1,13 @@
1
  import gradio as gr
2
  import numpy as np
3
- from faster_whisper import WhisperModel
4
- import threading
5
- import time
6
- import scipy.signal as signal
7
 
8
- # Initialize the WhisperModel
9
- audio_model = WhisperModel("tiny.en", device="cpu", compute_type="int8")
10
 
11
- class AudioProcessor:
12
- def __init__(self):
13
- self.audio_buffer = np.array([]) # Stores raw audio for playback
14
- self.sample_rate = 16000 # Default sample rate for whisper
15
- self.lock = threading.Lock() # Thread safety for buffer access
16
- self.transcription = [''] # List of transcription segments
17
- self.min_process_length = 1 * self.sample_rate # Process at least 1 second
18
- self.max_buffer_size = 30 * self.sample_rate # Maximum buffer size (30 seconds)
19
- self.last_process_time = time.time()
20
- self.process_interval = 1.0 # Process every 1 second
21
-
22
- def add_audio(self, audio_data, sr):
23
- """Add audio to the buffer and process if needed"""
24
- with self.lock:
25
- # Convert to mono if stereo
26
- if audio_data.ndim > 1:
27
- audio_data = audio_data.mean(axis=1)
28
-
29
- # Keep original format without normalization
30
- audio_data = audio_data.astype(np.float32)
31
-
32
- # Resample properly if needed
33
- if sr != self.sample_rate:
34
- try:
35
- number_of_samples = int(len(audio_data) * self.sample_rate / sr)
36
- audio_data = signal.resample(audio_data, number_of_samples)
37
- except Exception as e:
38
- print(f"Resampling error: {e}")
39
- ratio = self.sample_rate / sr
40
- audio_data = np.interp(
41
- np.arange(0, len(audio_data) * ratio, ratio),
42
- np.arange(0, len(audio_data)),
43
- audio_data
44
- )
45
-
46
- # Add to buffer without renormalizing
47
- if len(self.audio_buffer) == 0:
48
- self.audio_buffer = audio_data
49
- else:
50
- self.audio_buffer = np.concatenate([self.audio_buffer, audio_data])
51
-
52
- # Trim buffer if it gets too large
53
- if len(self.audio_buffer) > self.max_buffer_size:
54
- self.audio_buffer = self.audio_buffer[-self.max_buffer_size:]
55
-
56
- # Check if we should process now
57
- should_process = (
58
- len(self.audio_buffer) >= self.min_process_length and
59
- time.time() - self.last_process_time >= self.process_interval
60
- )
61
-
62
- if should_process:
63
- self.last_process_time = time.time()
64
- # Process the buffer in a separate thread to avoid blocking
65
- threading.Thread(target=self._process_audio).start()
66
-
67
- return len(self.audio_buffer)
68
-
69
- def _process_audio(self):
70
- """Process the current audio buffer (should be called in a separate thread)"""
71
- with self.lock:
72
- # Make a copy for processing
73
- audio = self.audio_buffer.copy()
74
-
75
- # Normalize for transcription
76
- audio_norm = audio.astype(np.float32)
77
- if np.max(np.abs(audio_norm)) > 0:
78
- audio_norm = audio_norm / np.max(np.abs(audio_norm))
79
-
80
- try:
81
- # Transcribe with whisper
82
- segments, info = audio_model.transcribe(audio_norm, beam_size=5)
83
- result = list(segments)
84
-
85
- if result:
86
- with self.lock:
87
- # Update the transcription
88
- self.transcription = [seg.text for seg in result]
89
- except Exception as e:
90
- print(f"Transcription error: {e}")
91
-
92
- def get_transcription(self):
93
- """Get the current transcription text"""
94
- with self.lock:
95
- return " ".join(self.transcription)
96
-
97
- def clear_buffer(self):
98
- """Clear the audio buffer"""
99
- with self.lock:
100
- self.audio_buffer = np.array([])
101
- self.transcription = ['']
102
- return "Buffers cleared"
103
-
104
- def get_playback_audio(self):
105
- """Get properly formatted audio for Gradio playback"""
106
- with self.lock:
107
- if len(self.audio_buffer) == 0:
108
- return None
109
-
110
- # Make a copy and ensure proper format for Gradio
111
- audio = self.audio_buffer.copy()
112
-
113
- # Ensure audio is in the correct range for playback (-1 to 1)
114
- if np.max(np.abs(audio)) > 0:
115
- audio = audio / max(1.0, np.max(np.abs(audio)))
116
-
117
- return (self.sample_rate, audio)
118
-
119
- # Create processor instance
120
- processor = AudioProcessor()
121
 
122
  def process_mic_audio(audio):
123
  """Process audio from Gradio microphone and update transcription"""
@@ -135,7 +25,7 @@ def process_mic_audio(audio):
135
  # Return status update and transcription
136
  buffer_seconds = buffer_size / processor.sample_rate
137
  return (
138
- f"Buffer size: {buffer_size} samples ({buffer_seconds:.2f} seconds)",
139
  transcription
140
  )
141
 
@@ -147,8 +37,13 @@ def get_current_buffer():
147
  """Get the current buffer for playback"""
148
  return processor.get_playback_audio()
149
 
 
 
 
 
 
150
  # Create Gradio interface
151
- with gr.Blocks() as demo:
152
  gr.Markdown("# Live Speech Recognition with Buffer Playback")
153
 
154
  with gr.Row():
@@ -161,11 +56,12 @@ with gr.Blocks() as demo:
161
  with gr.Row():
162
  clear_btn = gr.Button("Clear Buffer")
163
  play_btn = gr.Button("Get Buffer for Playback")
 
164
 
165
  with gr.Row():
166
  transcription_output = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
167
 
168
- # Connect components - removed the 'every' parameter for compatibility
169
  audio_input.stream(
170
  process_mic_audio,
171
  audio_input,
@@ -174,6 +70,8 @@ with gr.Blocks() as demo:
174
 
175
  clear_btn.click(clear_audio_buffer, None, [status_output, buffer_audio, transcription_output])
176
  play_btn.click(get_current_buffer, None, buffer_audio)
 
177
 
178
- # Launch the interface
179
- demo.launch()
 
 
1
  import gradio as gr
2
  import numpy as np
3
+ from transcriber import AudioProcessor
 
 
 
4
 
5
+ # Create processor instance with more conservative settings
6
+ processor = AudioProcessor(model_size="tiny.en", device="cpu")
7
 
8
+ # Adjust some settings for better quality
9
+ processor.min_process_length = 2 * processor.sample_rate # Need at least 2 seconds before processing
10
+ processor.process_interval = 1.5 # Process at most every 1.5 seconds
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def process_mic_audio(audio):
13
  """Process audio from Gradio microphone and update transcription"""
 
25
  # Return status update and transcription
26
  buffer_seconds = buffer_size / processor.sample_rate
27
  return (
28
+ f"Buffer: {buffer_seconds:.1f}s | Processed: {processor.processed_length/processor.sample_rate:.1f}s",
29
  transcription
30
  )
31
 
 
37
  """Get the current buffer for playback"""
38
  return processor.get_playback_audio()
39
 
40
+ def force_transcribe():
41
+ """Force transcription of current buffer"""
42
+ processor._process_audio()
43
+ return processor.get_transcription()
44
+
45
  # Create Gradio interface
46
+ with gr.Blocks(title="Live Speech Transcription") as demo:
47
  gr.Markdown("# Live Speech Recognition with Buffer Playback")
48
 
49
  with gr.Row():
 
56
  with gr.Row():
57
  clear_btn = gr.Button("Clear Buffer")
58
  play_btn = gr.Button("Get Buffer for Playback")
59
+ force_btn = gr.Button("Force Transcribe")
60
 
61
  with gr.Row():
62
  transcription_output = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
63
 
64
+ # Connect components
65
  audio_input.stream(
66
  process_mic_audio,
67
  audio_input,
 
70
 
71
  clear_btn.click(clear_audio_buffer, None, [status_output, buffer_audio, transcription_output])
72
  play_btn.click(get_current_buffer, None, buffer_audio)
73
+ force_btn.click(force_transcribe, None, transcription_output)
74
 
75
+ if __name__ == "__main__":
76
+ # Launch the interface
77
+ demo.launch()
transcriber.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import threading
3
+ import time
4
+ from faster_whisper import WhisperModel
5
+ import scipy.signal as signal
6
+
7
+ class AudioProcessor:
8
+ def __init__(self, model_size="tiny.en", device="cpu", compute_type="int8"):
9
+ """Initialize the audio processor with configurable parameters"""
10
+ self.audio_buffer = np.array([]) # Stores raw audio for playback
11
+ self.processed_length = 0 # Length of audio already processed
12
+ self.sample_rate = 16000 # Default sample rate for whisper
13
+ self.lock = threading.Lock() # Thread safety for buffer access
14
+ self.transcription = [''] # List of transcription segments
15
+ self.min_process_length = 1 * self.sample_rate # Process at least 1 second
16
+ self.max_buffer_size = 30 * self.sample_rate # Maximum buffer size (30 seconds)
17
+ self.last_process_time = time.time()
18
+ self.process_interval = 1.0 # Process every 1 second
19
+ self.is_processing = False # Flag to prevent concurrent processing
20
+
21
+ # Initialize the whisper model
22
+ self.audio_model = WhisperModel(model_size, device=device, compute_type=compute_type)
23
+ print(f"Initialized {model_size} model on {device}")
24
+
25
+ def add_audio(self, audio_data, sr):
26
+ """
27
+ Add audio to the buffer and process if needed
28
+
29
+ Args:
30
+ audio_data (numpy.ndarray): Audio data to add
31
+ sr (int): Sample rate of the audio data
32
+
33
+ Returns:
34
+ int: Current buffer size in samples
35
+ """
36
+ with self.lock:
37
+ # Convert to mono if stereo
38
+ if audio_data.ndim > 1:
39
+ audio_data = audio_data.mean(axis=1)
40
+
41
+ # Keep original format without normalization
42
+ audio_data = audio_data.astype(np.float32)
43
+
44
+ # Resample properly if needed
45
+ if sr != self.sample_rate:
46
+ try:
47
+ # Use scipy for proper resampling
48
+ number_of_samples = int(len(audio_data) * self.sample_rate / sr)
49
+ audio_data = signal.resample(audio_data, number_of_samples)
50
+ except Exception as e:
51
+ print(f"Resampling error: {e}")
52
+ # Fallback to simple method if scipy fails
53
+ ratio = self.sample_rate / sr
54
+ audio_data = np.interp(
55
+ np.arange(0, len(audio_data) * ratio, ratio),
56
+ np.arange(0, len(audio_data)),
57
+ audio_data
58
+ )
59
+
60
+ # Apply fade-in to prevent clicks at chunk boundaries (5ms fade)
61
+ fade_samples = min(int(0.005 * self.sample_rate), len(audio_data))
62
+ if fade_samples > 0:
63
+ fade_in = np.linspace(0, 1, fade_samples)
64
+ audio_data[:fade_samples] = audio_data[:fade_samples] * fade_in
65
+
66
+ # Add to buffer
67
+ if len(self.audio_buffer) == 0:
68
+ self.audio_buffer = audio_data
69
+ else:
70
+ self.audio_buffer = np.concatenate([self.audio_buffer, audio_data])
71
+
72
+ # Trim buffer if it gets too large
73
+ if len(self.audio_buffer) > self.max_buffer_size:
74
+ excess = len(self.audio_buffer) - self.max_buffer_size
75
+ self.audio_buffer = self.audio_buffer[excess:]
76
+ # Adjust processed length when trimming
77
+ self.processed_length = max(0, self.processed_length - excess)
78
+
79
+ # Check if we should process now
80
+ should_process = (
81
+ len(self.audio_buffer) >= self.min_process_length and
82
+ time.time() - self.last_process_time >= self.process_interval and
83
+ not self.is_processing
84
+ )
85
+
86
+ if should_process:
87
+ self.last_process_time = time.time()
88
+ self.is_processing = True
89
+ # Process the buffer in a separate thread to avoid blocking
90
+ threading.Thread(target=self._process_audio).start()
91
+
92
+ return len(self.audio_buffer)
93
+
94
+ def _process_audio(self):
95
+ """Process the current audio buffer (should be called in a separate thread)"""
96
+ try:
97
+ with self.lock:
98
+ # Get unprocessed portion of the buffer
99
+ if self.processed_length >= len(self.audio_buffer):
100
+ self.is_processing = False
101
+ return
102
+
103
+ # Make a copy of the full buffer for processing
104
+ audio = self.audio_buffer.copy()
105
+
106
+ # Normalize for transcription
107
+ audio_norm = audio.astype(np.float32)
108
+ if np.max(np.abs(audio_norm)) > 0:
109
+ audio_norm = audio_norm / np.max(np.abs(audio_norm))
110
+
111
+ # Transcribe with whisper
112
+ segments, info = self.audio_model.transcribe(audio_norm, beam_size=5)
113
+ result = list(segments)
114
+
115
+ if result:
116
+ with self.lock:
117
+ # Update the transcription
118
+ self.transcription = [seg.text for seg in result]
119
+ # Mark the whole buffer as processed
120
+ self.processed_length = len(self.audio_buffer)
121
+ except Exception as e:
122
+ print(f"Transcription error: {e}")
123
+ finally:
124
+ # Reset processing flag
125
+ self.is_processing = False
126
+
127
+ def get_transcription(self):
128
+ """Get the current transcription text"""
129
+ with self.lock:
130
+ return " ".join(self.transcription)
131
+
132
+ def clear_buffer(self):
133
+ """Clear the audio buffer"""
134
+ with self.lock:
135
+ self.audio_buffer = np.array([])
136
+ self.processed_length = 0
137
+ self.transcription = ['']
138
+ self.is_processing = False
139
+ return "Buffers cleared"
140
+
141
+ def get_playback_audio(self):
142
+ """Get properly formatted audio for Gradio playback"""
143
+ with self.lock:
144
+ if len(self.audio_buffer) == 0:
145
+ return None
146
+
147
+ # Make a copy and ensure proper format for Gradio
148
+ audio = self.audio_buffer.copy()
149
+
150
+ # Ensure audio is in the correct range for playback (-1 to 1)
151
+ if np.max(np.abs(audio)) > 0:
152
+ audio = audio / max(1.0, np.max(np.abs(audio)))
153
+
154
+ return (self.sample_rate, audio)