| from google.cloud import speech |
| import wave |
| from pydub import AudioSegment |
|
|
def get_audio_properties(file_path):
    """Read the sample rate and channel count from a WAV file header.

    Args:
        file_path: Path of the WAV file; it is opened read-only with
            ``wave.open``.

    Returns:
        A ``(sample_rate, channels)`` tuple: the frame rate and the number
        of channels reported by the file.
    """
    with wave.open(file_path, "rb") as reader:
        properties = (reader.getframerate(), reader.getnchannels())
    return properties
|
|
def convert_to_mono(input_path, output_path):
    """Downmix a WAV file to one channel and write the result as WAV.

    Args:
        input_path: Path of the source WAV file (two or more channels, or
            already mono).
        output_path: Path the single-channel WAV file is exported to.
    """
    AudioSegment.from_wav(input_path).set_channels(1).export(
        output_path, format="wav"
    )
| |
def split_audio(file_path, chunk_length_ms=60000, output_dir=None):
    """Split a WAV file into consecutive fixed-length chunks saved as WAV.

    Chunks are named ``chunk_<index>.wav``. The last chunk holds whatever
    audio remains and may be shorter than ``chunk_length_ms``.

    Args:
        file_path: Path of the WAV file to split.
        chunk_length_ms: Length of each chunk in milliseconds; must be
            positive.
        output_dir: Existing directory to write the chunks into. ``None``
            (the default) writes them to the current working directory,
            and the returned names are then bare file names.

    Returns:
        The chunk file paths, in playback order.

    Raises:
        ValueError: If ``chunk_length_ms`` is not positive.
    """
    import os

    # Reject bad lengths before touching the audio: zero would make range()
    # fail with an unrelated message, and a negative step would silently
    # produce no chunks at all.
    if chunk_length_ms <= 0:
        raise ValueError(
            f"chunk_length_ms must be positive, got {chunk_length_ms!r}"
        )

    audio = AudioSegment.from_wav(file_path)

    chunk_files = []
    # pydub measures and slices AudioSegment objects in milliseconds.
    for index, start_ms in enumerate(range(0, len(audio), chunk_length_ms)):
        chunk_name = f"chunk_{index}.wav"
        if output_dir is not None:
            chunk_name = os.path.join(output_dir, chunk_name)
        audio[start_ms:start_ms + chunk_length_ms].export(
            chunk_name, format="wav"
        )
        chunk_files.append(chunk_name)

    return chunk_files
|
|
def transcribe_audio(file_path, language_code="en-US"):
    """Transcribe a WAV file with Google Cloud Speech-to-Text.

    The whole file is read into memory and sent inline in a single
    synchronous ``recognize`` request.
    NOTE(review): synchronous recognition is documented as being for short
    audio (roughly one minute) -- confirm the current quota; longer
    recordings likely need chunking first (see ``split_audio``).

    Args:
        file_path: Path of the WAV file to transcribe. It is declared to
            the API as LINEAR16; the sample rate and channel count are read
            from the file itself.
        language_code: Language tag passed to the recognizer. Defaults to
            ``"en-US"``.

    Returns:
        The top-ranked transcript of every recognition result, joined with
        ``". "``. An empty string when the response has no usable results.
    """
    client = speech.SpeechClient()

    sample_rate, channels = get_audio_properties(file_path)

    with open(file_path, "rb") as audio_file:
        audio_content = audio_file.read()

    audio = speech.RecognitionAudio(content=audio_content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code=language_code,
        audio_channel_count=channels,
        # Per-channel recognition only makes sense for multi-channel input.
        enable_separate_recognition_per_channel=channels > 1,
    )

    response = client.recognize(config=config, audio=audio)

    # Skip results that carry no alternatives instead of failing with an
    # IndexError on alternatives[0].
    transcripts = [
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    ]
    return ". ".join(transcripts)
|
|
def get_transcription_from_sound(file_path: str) -> str:
    """Downmix a WAV file to mono and return its transcript.

    Args:
        file_path: Path of the WAV file to transcribe.

    Returns:
        The transcript produced by ``transcribe_audio`` for the mono
        version of the file.
    """
    import os
    import tempfile

    # A private temporary directory replaces the fixed "audio_mono.wav" in
    # the working directory: concurrent calls no longer overwrite each
    # other's intermediate file, and it is removed even when conversion or
    # transcription raises.
    with tempfile.TemporaryDirectory() as work_dir:
        mono_path = os.path.join(work_dir, "audio_mono.wav")
        convert_to_mono(file_path, mono_path)
        return transcribe_audio(mono_path)
|
|
if __name__ == "__main__":
    # Script entry point: transcribe a sample recording and print the text.
    # Delegates to get_transcription_from_sound rather than repeating its
    # convert-then-transcribe steps, so the script and library callers
    # always run the same pipeline.
    file_path = "jackhammer.wav"
    print(get_transcription_from_sound(file_path))