# Run this in your terminal first (Windows PowerShell), Olaolu:
#   Set-ExecutionPolicy Unrestricted -Scope Process
import torch
import gradio as gr
import librosa
import os
import base64
import tempfile
import io
import numpy as np
from dotenv import load_dotenv
from transformers import pipeline
from huggingface_hub import login
from google.cloud import translate_v3
from gradio.routes import mount_gradio_app
from spitch import Spitch
from pydub import AudioSegment
import requests

load_dotenv()
spitch_client = Spitch()  # activate Spitch

# ===========================
# INITIAL SETUP
# ===========================

# Log in to Hugging Face
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    print("⚠️ No Hugging Face token found. You cannot access private models.")

# Load and decode Google credentials (stored as Base64)
creds_b64 = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")  # get the b64 string
if creds_b64:
    creds_json = base64.b64decode(creds_b64).decode("utf-8")  # decode to a JSON string
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json")  # create a temp file
    temp_file.write(creds_json.encode("utf-8"))  # write the JSON to that file
    temp_file.flush()  # make sure it is on disk
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file.name  # point the SDK at it
else:
    print("⚠️ No GCP creds found.")

# Google Cloud project ID
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")

# ===========================
# LOAD ASR MODELS
# ===========================

# use device=0 for GPU if available, otherwise CPU (-1)
device = 0 if torch.cuda.is_available() else -1

try:
    asr_yoruba = pipeline("automatic-speech-recognition", model="NCAIR1/Yoruba-ASR", device=device)
except Exception as e:
    print(f"⚠️ Could not load Yoruba ASR: {e}")
    asr_yoruba = None

# English ASR: default to facebook/wav2vec2-base-960h when EN_ASR_MODEL is not provided
EN_ASR_MODEL = os.getenv("EN_ASR_MODEL", "facebook/wav2vec2-base-960h")
try:
    asr_english = pipeline("automatic-speech-recognition", model=EN_ASR_MODEL, device=device)
    # print(f"✅ English ASR loaded: {EN_ASR_MODEL}")
except Exception as e:
    print(f"⚠️ Could not load English ASR ({EN_ASR_MODEL}): {e}")
    asr_english = None

print("✅ Done loading models!\n")

# ===========================
# TRANSLATION FUNCTION
# ===========================

def translate_text(text: str, mode: str):
    """Translate text according to mode: 'Yoruba → English' or 'English → Yoruba'."""
    if not text:
        return ""
    if mode == "Yoruba → English":
        source = "yo"
        target = "en"  # bare "en", consistent with the codes used in the other branch
    else:
        source = "en"
        target = "yo"
    try:
        google_client = translate_v3.TranslationServiceClient()
        parent = f"projects/{PROJECT_ID}/locations/global"
        response = google_client.translate_text(
            contents=[text],
            parent=parent,
            mime_type="text/plain",
            source_language_code=source,
            target_language_code=target,
        )
        return response.translations[0].translated_text
    except Exception as e:
        print(f"⚠️ Translation failed: {e}")
        return ""  # fail gracefully
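# Illustration of the expected round-trip behaviour (illustrative only: the
# actual output depends on the Translation API, and valid GCP credentials plus
# PROJECT_ID must be configured for the call to succeed):
#
#   translate_text("Báwo ni?", "Yoruba → English")       # -> e.g. "How are you?"
#   translate_text("Good morning", "English → Yoruba")   # -> e.g. "Ẹ káàárọ̀"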
# ===========================
# Google English ASR (kept for reference, currently disabled)
# ===========================
# import os
# from google.cloud.speech_v2 import SpeechClient
# from google.cloud.speech_v2.types import cloud_speech

# def quickstart_v2(audio_file: str) -> cloud_speech.RecognizeResponse:
#     """Transcribe an audio file.
#
#     Args:
#         audio_file (str): Path to the local audio file to be transcribed.
#
#     Returns:
#         cloud_speech.RecognizeResponse: The response from the recognize request,
#         containing the transcription results.
#     """
#     # Read the file as bytes
#     with open(audio_file, "rb") as f:
#         audio_content = f.read()
#
#     # Instantiate a client
#     client = SpeechClient()
#     config = cloud_speech.RecognitionConfig(
#         auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
#         language_codes=["en-US"],
#         model="long",
#     )
#     request = cloud_speech.RecognizeRequest(
#         recognizer=f"projects/{PROJECT_ID}/locations/global/recognizers/_",
#         config=config,
#         content=audio_content,
#     )
#
#     # Transcribe the audio into text
#     response = client.recognize(request=request)
#     for result in response.results:
#         print(f"Transcript: {result.alternatives[0].transcript}")
#     return response

# ===========================
# ASR + TRANSLATE WRAPPER
# ===========================

def transcribe_and_translate(file_path, mode: str):
    """Return (transcription, translation) for the given file and mode."""
    if not file_path:
        return "...", ""
    # load and resample the audio to the 16 kHz the ASR models expect
    audio, _sr = librosa.load(file_path, sr=16000)
    # choose the ASR pipeline; either may be None if loading failed at startup
    asr = asr_yoruba if mode == "Yoruba → English" else asr_english
    if asr is None:
        return "⚠️ ASR model unavailable.", ""
    asr_result = asr(audio)
    # get the transcription (ASR pipelines return {"text": ...})
    transcription = asr_result.get("text", "") if isinstance(asr_result, dict) else str(asr_result)
    transcription = transcription.capitalize()  # tidy casing (wav2vec2 emits all caps)
    translation = translate_text(transcription, mode) if transcription else ""
    return transcription, translation

# # ===========================
# # SPITCH TTS (kept for reference, currently disabled)
# # ===========================
# def synthesize_tts_to_array(text: str, language: str = "yo", voice: str = "segun"):
#     """Given the translation, use Spitch to produce MP3 bytes and convert
#     them to (sr, numpy_array) for gr.Audio."""
#     if not text:
#         return None
#     # Invoke Spitch
#     resp = spitch_client.speech.generate(text=text, language=language, voice=voice, format="mp3")
#     # Get the MP3 bytes
#     mp3_bytes = resp.read()
#     # Decode the audio
#     audio = AudioSegment.from_file(io.BytesIO(mp3_bytes), format="mp3")
#     sr = audio.frame_rate
#     samples = np.array(audio.get_array_of_samples())
#     # de-interleave multi-channel audio (see the note in the YarnGPT version below)
#     if audio.channels > 1:
#         samples = samples.reshape((-1, audio.channels))
#     # normalize integer samples -> float32 in [-1, 1]
#     max_val = float(1 << (8 * audio.sample_width - 1))
#     samples = samples.astype(np.float32) / max_val
#     return (sr, samples)

# ===========================
# YARNGPT TTS
# ===========================

def synthesize_tts_to_array(text: str, language: str = "yo", voice: str = "Femi"):
    """Given the translation, use YarnGPT to produce MP3 bytes and convert
    them to (sr, numpy_array) for gr.Audio.

    Note: `language` is currently unused; the YarnGPT payload below only
    carries text and voice.
    """
    YARNGPT_API_URL = "https://yarngpt.ai/api/v1/tts"
    YARNGPT_API_KEY = os.getenv("YARNGPT_API_KEY")
    if not text or not YARNGPT_API_KEY:
        print("⚠️ Translation or API key is missing")
        return None

    headers = {"Authorization": f"Bearer {YARNGPT_API_KEY}"}
    payload = {
        "text": text,
        "voice": voice,
    }

    # Invoke YarnGPT
    response = requests.post(YARNGPT_API_URL, headers=headers, json=payload, stream=True)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)  # the error body may not be JSON, so print it raw
        return None

    # Get the MP3 bytes
    mp3_bytes = response.content

    # Use io.BytesIO to treat the bytes content as an in-memory file
    try:
        audio = AudioSegment.from_file(io.BytesIO(mp3_bytes), format="mp3")
    except Exception as e:
        print(f"Error processing audio with pydub: {e}")
        return None

    # Get the sampling rate
    sr = audio.frame_rate
    samples = np.array(audio.get_array_of_samples())
    # pydub returns one flat, interleaved array for multi-channel audio
    # (L0, R0, L1, R1, ...); reshape it to (n_frames, n_channels), the
    # layout gr.Audio expects for stereo numpy input
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels))
    # normalize integer samples -> float32 in [-1, 1]:
    # sample_width is bytes per sample, so 1 << (8 * sample_width - 1) is the
    # magnitude of the most negative value (32768 for 16-bit PCM)
    max_val = float(1 << (8 * audio.sample_width - 1))
    samples = samples.astype(np.float32) / max_val
    return (sr, samples)
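# A minimal, self-contained illustration of the reshape/normalize step above.
# `_demo_pcm_to_float` is a documentation sketch only and is not used by the
# app; the sample values are synthetic.
def _demo_pcm_to_float():
    # simulate interleaved 16-bit stereo PCM as pydub would return it: L0 R0 L1 R1 ...
    interleaved = np.array([0, 32767, -32768, 16384], dtype=np.int16)
    stereo = interleaved.reshape((-1, 2))  # -> [[0, 32767], [-32768, 16384]]
    # for 16-bit samples, 1 << (8 * 2 - 1) == 32768
    return stereo.astype(np.float32) / 32768.0  # values lie in [-1.0, 1.0)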
# ===========================
# GRADIO INTERFACE (OLD)
# ===========================
# with gr.Blocks(title="Olùkọ́ | Learn Yoruba") as app:
#     gr.Markdown("# 🇳🇬 Olùkọ́")
#     gr.Markdown(
#         "Comprehensive Yoruba learning tool!"
#     )
#     # --- Tab 1: ASR + Translator ---
#     # single editable textbox + a microphone recorder placed beside it
#     with gr.Row():
#         output_transcription = gr.Textbox(
#             label="✍️ Speak/Type...",
#             interactive=True,
#             placeholder="Type here or press the mic to speak..."
#         )
#         mic_recorder = gr.Audio(
#             sources=["microphone"],  # accept microphone input
#             type="filepath",         # store the recording in a file path
#             label="🎙️",
#             show_label=True
#         )
#     output_translation = gr.Textbox(label="💬 Translation (English)")
#     # When the mic recorder finishes, transcribe and place text into the editable textbox
#     mic_recorder.change(
#         transcribe_audio,
#         inputs=mic_recorder,           # input to the transcribe_audio function
#         outputs=output_transcription,  # output goes into this text box
#     )
#     # When the user types / changes the textbox, translate
#     output_transcription.change(
#         translate_text,
#         inputs=output_transcription,   # input to the translate_text function
#         outputs=output_translation,    # where the translate_text output is stored
#     )

# ===========================
# GRADIO INTERFACE - Tab 1 (updated)
# ===========================

with gr.Blocks(title="Olùkọ́ | Learn Yoruba") as app:
    gr.Markdown("# 🇳🇬 Olùkọ́")
    gr.Markdown("Comprehensive Yoruba learning tool!")

    # direction selector
    mode = gr.Radio(
        choices=["Yoruba → English", "English → Yoruba"],
        value="Yoruba → English",
        label="Direction"
    )

    # single editable textbox + microphone next to it
    with gr.Row():
        # user input textbox
        output_transcription = gr.Textbox(
            label="✍️ Speak/Type...",
            interactive=True
        )
        # user input microphone
        mic_recorder = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="🎙️",
            show_label=True
        )

    # translation textbox + TTS controls in the same row
    with gr.Row():
        # translation textbox
        output_translation = gr.Textbox(label="💬 Translation")
        # TODO: button to trigger TTS
        # tts_button = gr.Button("Play TTS")
        # TODO: audio component for TTS playback
        # tts_audio = gr.Audio(label="TTS Playback", type="numpy", interactive=False)

    # TODO: enable together with the tts_button/tts_audio components above
    # def _on_tts_click(text, direction):
    #     """Generate TTS from the translation textbox (no disk write)
    #     and return (sr, samples)."""
    #     if not text:
    #         return None
    #     # select the language/voice mapping as needed
    #     if direction == "English → Yoruba":
    #         lang = "yo"
    #         voice = "Femi"
    #     else:
    #         lang = "en"
    #         voice = "Mary"
    #     try:
    #         result = synthesize_tts_to_array(text, language=lang, voice=voice)
    #         return result  # (sr, numpy_array) or None
    #     except Exception as e:
    #         print("TTS generation failed:", e)
    #         return None

    # when the mic finishes: transcribe + translate and populate both boxes
    mic_recorder.change(
        transcribe_and_translate,
        inputs=[mic_recorder, mode],
        outputs=[output_transcription, output_translation],
    )

    # when the user types/edits the transcription box, translate according to mode
    output_transcription.change(
        translate_text,
        inputs=[output_transcription, mode],
        outputs=output_translation,
    )

    # TODO: when tts_button is clicked, call _on_tts_click and send the
    # resulting (sr, numpy_array) to the tts_audio component
    # tts_button.click(
    #     _on_tts_click,
    #     inputs=[output_translation, mode],
    #     outputs=tts_audio
    # )

# ===========================
# APP LAUNCH
# ===========================

# mount_gradio_app(api_app, app, path="/")
if __name__ == "__main__":
    app.launch()  # or: app.launch(server_name="0.0.0.0", server_port=7860)
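# If the app should be served from FastAPI instead of app.launch(), a minimal
# sketch using the already-imported mount_gradio_app (assumptions: fastapi is
# installed, and `api_app` is not defined anywhere in this file; <module> is a
# placeholder for this file's module name):
#
#   from fastapi import FastAPI
#   api_app = FastAPI()
#   mount_gradio_app(api_app, app, path="/")
#   # then run: uvicorn <module>:api_app --host 0.0.0.0 --port 7860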