File size: 3,280 Bytes
7fe32cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pyaudio
import numpy as np
import tensorflow as tf
import zipfile
import wave
import time

# Audio stream configuration
FORMAT = pyaudio.paInt16  # 16-bit PCM
CHANNELS = 1  # Mono channel
RATE = 16000  # 16kHz sample rate
CHUNK = 1024  # Buffer size (frames per read)
TARGET_LENGTH = 15600  # samples per model input window (0.975 s at 16 kHz)
SILENCE_THRESHOLD = 5000  # milliseconds of silence before auto-stop (5 s); consumer divides by 1000

# Rolling window of the most recent TARGET_LENGTH samples, fed to the model on
# every chunk. Mutated in place (and rebound) by record_audio().
audio_buffer = np.zeros(TARGET_LENGTH, dtype=np.float32)

# Load the TFLite classifier. The embedded label file name suggests this is
# YAMNet — TODO confirm the model file actually is.
model_path = '1.tflite'
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Tensor indices for feeding the waveform in and reading class scores out.
waveform_input_index = input_details[0]['index']
scores_output_index = output_details[0]['index']

# A .tflite file is a zip archive; the class-label list ships inside it.
with zipfile.ZipFile(model_path) as z:
    with z.open('yamnet_label_list.txt') as f:
        labels = [line.decode('utf-8').strip() for line in f]

# Ensure the input tensor is correctly sized, then re-allocate (required after
# any resize_tensor_input call).
interpreter.resize_tensor_input(waveform_input_index, [TARGET_LENGTH], strict=False)
interpreter.allocate_tensors()
# Initialize PyAudio
p = pyaudio.PyAudio()

def record_audio():
    """Record microphone audio to ``audio.wav`` while classifying it live.

    Reads CHUNK-sample frames from the default input device, maintains a
    rolling TARGET_LENGTH-sample float32 window in the module-level
    ``audio_buffer``, runs the TFLite interpreter on that window after every
    chunk, and prints the top-scoring label. Recording stops on Ctrl+C or
    once SILENCE_THRESHOLD milliseconds pass without a chunk whose peak
    amplitude exceeds 0.01.

    Returns:
        str: Path of the written WAV file ("audio.wav").
    """
    # BUG FIX: audio_buffer is assigned below; without this declaration Python
    # treats it as a local and np.roll(audio_buffer, ...) raised
    # UnboundLocalError on the very first iteration.
    global audio_buffer

    # BUG FIX: pre-bind so the finally block can't hit NameError when
    # p.open() or wave.open() raises before these are assigned.
    stream = None
    wf = None
    try:
        # Open the audio stream
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)

        print("Recording... Press Ctrl+C to stop.")

        # Open a .wav file to save the audio
        wf = wave.open("audio.wav", 'wb')
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)

        last_speech_time = time.time()

        # Continuously read from the stream, archive to disk, and classify.
        while True:
            audio_data = stream.read(CHUNK)
            # 16-bit PCM -> float32 in [-1, 1), the scale the model expects.
            audio_chunk = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

            # Slide the rolling window: drop the oldest samples, append new.
            audio_buffer = np.roll(audio_buffer, -len(audio_chunk))
            audio_buffer[-len(audio_chunk):] = audio_chunk

            # Write raw audio data to the .wav file
            wf.writeframes(audio_data)

            # Run the model on the current window and print the top label.
            interpreter.set_tensor(waveform_input_index, audio_buffer)
            interpreter.invoke()
            scores = interpreter.get_tensor(scores_output_index)
            print(labels[scores.argmax()])

            # Silence detection: any loud-enough chunk resets the timer;
            # otherwise stop once the silence window elapses.
            if np.max(np.abs(audio_chunk)) > 0.01:
                last_speech_time = time.time()
            elif time.time() - last_speech_time > SILENCE_THRESHOLD / 1000:
                print("Silence detected. Stopping recording.")
                break

    except KeyboardInterrupt:
        # Handle the KeyboardInterrupt to stop recording
        print("\nRecording stopped by user.")

    finally:
        # Release only what was actually acquired, then tear down PyAudio.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        if wf is not None:
            wf.close()
        p.terminate()
        print("Stream closed and resources released.")

    return "audio.wav"

# Script entry point: record until silence or Ctrl+C when run directly.
if __name__ == "__main__":
    record_audio()