"""Live microphone classification with a YAMNet TFLite model.

Continuously records 16 kHz mono audio, maintains a rolling ~1 s sample
buffer, classifies each buffer with YAMNet, saves the raw audio to
``audio.wav``, and stops after 5 s of silence or on Ctrl+C.
"""

import time
import wave
import zipfile

import numpy as np
import pyaudio
import tensorflow as tf

# Audio stream configuration
FORMAT = pyaudio.paInt16   # 16-bit PCM
CHANNELS = 1               # Mono channel
RATE = 16000               # 16 kHz sample rate (what YAMNet expects)
CHUNK = 1024               # Frames read per stream.read() call
TARGET_LENGTH = 15600      # YAMNet input length: 0.975 s at 16 kHz
SILENCE_THRESHOLD = 5000   # Milliseconds of silence before stopping (5 s)

# Rolling buffer holding the most recent TARGET_LENGTH samples as float32
# in [-1, 1), which is the input format the model consumes.
audio_buffer = np.zeros(TARGET_LENGTH, dtype=np.float32)

# Load the TFLite model and look up its input/output tensor indices.
model_path = '1.tflite'
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
waveform_input_index = input_details[0]['index']
scores_output_index = output_details[0]['index']

# The class-name list ships inside the .tflite file (it is a zip archive).
with zipfile.ZipFile(model_path) as z:
    with z.open('yamnet_label_list.txt') as f:
        labels = [line.decode('utf-8').strip() for line in f]

# Ensure the input tensor is correctly sized for one model frame.
interpreter.resize_tensor_input(waveform_input_index, [TARGET_LENGTH], strict=False)
interpreter.allocate_tensors()

# Initialize PyAudio
p = pyaudio.PyAudio()


def record_audio():
    """Record from the default microphone until silence or Ctrl+C.

    Saves the captured PCM audio to ``audio.wav`` and prints the top
    YAMNet class label for each processed chunk.

    Returns:
        str: Path of the saved WAV file ("audio.wav").
    """
    # BUG FIX: without this declaration the assignment below made
    # audio_buffer a local and raised UnboundLocalError on first use.
    global audio_buffer

    # Pre-bind so the finally block is safe even if opening fails partway.
    stream = None
    wf = None
    try:
        # Open the audio stream
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        print("Recording... \nPress Ctrl+C to stop.")

        # Open a .wav file to save the audio
        wf = wave.open("audio.wav", 'wb')
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)

        last_speech_time = time.time()

        # Continuously read from the stream and feed the rolling buffer.
        while True:
            # exception_on_overflow=False: drop frames instead of raising
            # if model inference makes us fall behind the hardware.
            audio_data = stream.read(CHUNK, exception_on_overflow=False)
            audio_chunk = (np.frombuffer(audio_data, dtype=np.int16)
                           .astype(np.float32) / 32768.0)

            # Shift the buffer left and append the newest samples.
            audio_buffer = np.roll(audio_buffer, -len(audio_chunk))
            audio_buffer[-len(audio_chunk):] = audio_chunk

            # Write audio data to the .wav file
            wf.writeframes(audio_data)

            # Run the model on the current buffer contents.
            interpreter.set_tensor(waveform_input_index, audio_buffer)
            interpreter.invoke()
            scores = interpreter.get_tensor(scores_output_index)

            # Get the top classification result
            top_class_index = scores.argmax()
            prediction = labels[top_class_index]
            print(prediction)

            # Silence detection: reset the timer on any audible sample,
            # stop once SILENCE_THRESHOLD milliseconds pass without one.
            if np.max(np.abs(audio_chunk)) > 0.01:
                last_speech_time = time.time()
            elif time.time() - last_speech_time > SILENCE_THRESHOLD / 1000:
                print("Silence detected. Stopping recording.")
                break
    except KeyboardInterrupt:
        # Handle the KeyboardInterrupt to stop recording
        print("\nRecording stopped by user.")
    finally:
        # Stop and close the stream and terminate PyAudio; guards keep this
        # safe when setup failed before stream/wf were assigned.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        # NOTE(review): terminating here means p cannot be reused by a
        # second call to record_audio() — matches the original behavior.
        p.terminate()
        if wf is not None:
            wf.close()
        print("Stream closed and resources released.")
    return "audio.wav"


if __name__ == "__main__":
    record_audio()