"""Offline speech-to-text against NVIDIA Riva ASR hosted on NVCF.

Loads an API key (``NVIDIA_API``) from the environment / a local ``.env``
file, builds an authenticated ASR service client, and transcribes 16-bit
PCM WAV audio.  Speech contexts bias the recognizer toward spelling the
name "Deepak" correctly.
"""
import os

import riva.client
import riva.client.realtime
from dotenv import load_dotenv
from riva.client.argparse_utils import (
    add_asr_config_argparse_parameters,
    add_connection_argparse_parameters,
)

# Pull NVIDIA_API (and anything else defined there) out of a local .env file.
load_dotenv()

# NVIDIA Cloud Functions gRPC endpoint; the function-id metadata below
# selects the hosted Riva ASR deployment behind it.
uri = "grpc.nvcf.nvidia.com:443"
auth = riva.client.Auth(
    uri=uri,
    use_ssl=True,
    metadata_args=[
        ["function-id", "b702f636-f60c-4a3d-a6f4-f3568c13bd7d"],
        # A KeyError here means NVIDIA_API is missing from the env / .env file.
        ["authorization", f"Bearer {os.environ['NVIDIA_API']}"],
    ],
)
asr_service = riva.client.ASRService(auth)

# Spelling / phonetic variants of "Deepak" supplied as speech contexts so
# the recognizer is boosted toward transcribing the name correctly.
name_variants = [
    "deepak",       # standard spelling
    "dee-pak",      # phonetic-like spelling
    "deepuck",      # alternative spelling to match pronunciation
    "D IY P AH K",  # ARPAbet phonetic
]

offline_config = riva.client.RecognitionConfig(
    language_code="en-US",
    max_alternatives=1,
    enable_automatic_punctuation=True,
    verbatim_transcripts=False,  # allow ITN (numbers, punctuation normalization)
    speech_contexts=[{"phrases": name_variants, "boost": 20.0}],
)

# Sample audio is loaded at import time so foo() can run with no arguments.
with open("./en-US_sample.wav", "rb") as fh:
    data = fh.read()


def asr_transcribe(audio: bytes) -> str:
    """Run offline recognition on raw audio bytes and return the transcript.

    Joins the top alternative of every result segment with single spaces.
    (No ``global`` needed: the module-level service/config are only read.)
    """
    response = asr_service.offline_recognize(audio, offline_config)
    return " ".join(result.alternatives[0].transcript for result in response.results)


def foo() -> str:
    """Transcribe the bundled sample WAV (smoke-test entry point).

    Previously a byte-for-byte copy of ``asr_transcribe``'s body; delegate
    instead of duplicating the recognition logic.
    """
    return asr_transcribe(data)


if __name__ == "__main__":
    print(foo())