import pyaudio
import numpy as np
import tensorflow as tf
import zipfile
import wave
import time
# Audio stream configuration
FORMAT = pyaudio.paInt16  # 16-bit signed PCM samples
CHANNELS = 1  # mono input
RATE = 16000  # 16 kHz sample rate
CHUNK = 1024  # frames read from the stream per iteration
TARGET_LENGTH = 15600  # model input window length, in samples
SILENCE_THRESHOLD = 5000  # silence timeout in milliseconds (compared as /1000 seconds below)

# Rolling window of the most recent TARGET_LENGTH samples, float32 in [-1, 1).
# NOTE(review): record_audio reassigns this name without a `global` declaration —
# verify the module-level buffer is actually the one being used.
audio_buffer = np.zeros(TARGET_LENGTH, dtype=np.float32)

# Load the TFLite classifier and look up its input/output tensor indices.
model_path = '1.tflite'
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
waveform_input_index = input_details[0]['index']
scores_output_index = output_details[0]['index']

# The .tflite file is also a valid zip archive; the class-label list is
# stored inside it as yamnet_label_list.txt (one label per line).
with zipfile.ZipFile(model_path) as z:
    with z.open('yamnet_label_list.txt') as f:
        labels = [line.decode('utf-8').strip() for line in f]

# Resize the waveform input to exactly TARGET_LENGTH samples, then
# re-allocate tensors (required after any resize).
interpreter.resize_tensor_input(waveform_input_index, [TARGET_LENGTH], strict=False)
interpreter.allocate_tensors()

# Initialize PyAudio (terminated inside record_audio's cleanup).
p = pyaudio.PyAudio()
def record_audio():
    """Record microphone audio to audio.wav while classifying it live.

    Streams 16 kHz mono PCM from the default input device, keeps a rolling
    window of TARGET_LENGTH samples, runs the TFLite model on every chunk,
    and prints the top-scoring label. Recording stops after
    SILENCE_THRESHOLD milliseconds of continuous silence, or on Ctrl+C.

    Returns:
        str: Path of the WAV file that was written ("audio.wav").
    """
    # Use a local rolling buffer. The original read the module-level
    # `audio_buffer` and then assigned it without `global`, which makes the
    # name local and raises UnboundLocalError on the first iteration.
    audio_buffer = np.zeros(TARGET_LENGTH, dtype=np.float32)
    # Pre-bind so the finally block can safely test what was opened.
    stream = None
    wf = None
    try:
        # Open the audio input stream.
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        print("Recording... Press Ctrl+C to stop.")
        # Open a .wav file to save the raw captured audio.
        wf = wave.open("audio.wav", 'wb')
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        last_speech_time = time.time()
        while True:
            audio_data = stream.read(CHUNK)
            # Convert 16-bit PCM to float32 in [-1, 1) for the model input.
            audio_chunk = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
            # Slide the window left and append the newest chunk at the end.
            audio_buffer = np.roll(audio_buffer, -len(audio_chunk))
            audio_buffer[-len(audio_chunk):] = audio_chunk
            # Persist the raw bytes to the .wav file.
            wf.writeframes(audio_data)
            # Classify the current window and print the top label.
            interpreter.set_tensor(waveform_input_index, audio_buffer)
            interpreter.invoke()
            scores = interpreter.get_tensor(scores_output_index)
            print(labels[scores.argmax()])
            # Silence detection: any sample above the amplitude floor resets
            # the timer; otherwise stop once the timeout (ms -> s) elapses.
            if np.max(np.abs(audio_chunk)) > 0.01:
                last_speech_time = time.time()
            elif time.time() - last_speech_time > SILENCE_THRESHOLD / 1000:
                print("Silence detected. Stopping recording.")
                break
    except KeyboardInterrupt:
        # Handle the KeyboardInterrupt to stop recording gracefully.
        print("\nRecording stopped by user.")
    finally:
        # Guarded cleanup: stream/wf may be None if p.open()/wave.open()
        # raised — the original referenced them unconditionally here, turning
        # any setup failure into a NameError that masked the real error.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        if wf is not None:
            wf.close()
        p.terminate()
        print("Stream closed and resources released.")
    return "audio.wav"
if __name__ == "__main__":
    # Script entry point: start recording/classifying when run directly.
    record_audio()