# MeetingUnderstandingDemo / cloud_speech.py
# Author: Vlad Bastina (initial commit 599e594)
from google.cloud import speech
from google.cloud import storage
import os
import wave
from pydub import AudioSegment
def get_audio_properties(file_path):
    """Read basic format metadata from a WAV file.

    Args:
        file_path: Path to a RIFF/WAV file on disk.

    Returns:
        A ``(sample_rate, channels)`` tuple: frames per second and the
        number of audio channels.
    """
    with wave.open(file_path, "rb") as wav:
        return wav.getframerate(), wav.getnchannels()
def convert_to_mono(input_path, output_path):
    """Downmix a WAV file to a single channel and write it back out.

    The Speech-to-Text config used in this module expects one channel,
    so multi-channel recordings are collapsed here first.

    Args:
        input_path: Source WAV file (any channel count).
        output_path: Destination WAV path; may equal ``input_path`` to
            overwrite in place.
    """
    source = AudioSegment.from_wav(input_path)
    source.set_channels(1).export(output_path, format="wav")
def upload_to_gcs(bucket_name, local_file_path, gcs_file_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the target GCS bucket (must already exist).
        local_file_path: Path of the file to upload.
        gcs_file_name: Object name to store the file under in the bucket.

    Returns:
        The ``gs://`` URI of the uploaded object, suitable for passing
        to the Speech-to-Text API.
    """
    client = storage.Client()
    target_blob = client.bucket(bucket_name).blob(gcs_file_name)
    target_blob.upload_from_filename(local_file_path)
    return f"gs://{bucket_name}/{gcs_file_name}"
def transcribe_gcs(gcs_uri, local_audio_path="temp_audio.wav", timeout=600):
    """Transcribe an audio file stored in GCS with Speech-to-Text.

    The sample rate reported to the API is probed from a local copy of
    the audio, since the API cannot infer it for raw LINEAR16 input.

    Args:
        gcs_uri: ``gs://bucket/object`` URI of the uploaded WAV file.
        local_audio_path: Local copy of the same audio, used only to
            read its sample rate. Defaults to ``"temp_audio.wav"`` for
            backward compatibility with the previous hard-coded path.
        timeout: Seconds to wait for the long-running operation.

    Returns:
        All recognized segments' top alternatives joined with newlines.
    """
    # Probe the sample rate locally; it must match the uploaded file.
    sample_rate, _ = get_audio_properties(local_audio_path)

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code="en-US",
    )

    # long_running_recognize is required for audio longer than ~1 minute.
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=timeout)

    return "\n".join(
        result.alternatives[0].transcript for result in response.results
    )
def get_transcription(bucket_name="meeting-audio-bucket", audio_file="temp_audio.wav"):
    """End-to-end pipeline: downmix, upload, and transcribe a recording.

    Args:
        bucket_name: GCS bucket used as the staging area for the audio.
            Defaults to the project's demo bucket for backward
            compatibility with the previous hard-coded value.
        audio_file: Path to the extracted WAV file to transcribe.
            Defaults to ``"temp_audio.wav"``, which is also the local
            path ``transcribe_gcs`` probes for the sample rate.

    Returns:
        The newline-joined transcript produced by Speech-to-Text.
    """
    # Overwrite the file in place: the API config below assumes mono.
    convert_to_mono(audio_file, audio_file)
    gcs_uri = upload_to_gcs(bucket_name, audio_file, os.path.basename(audio_file))
    transcript = transcribe_gcs(gcs_uri)
    return transcript