# Source: Hugging Face Space "Oluko/yoruba-practice" (status when captured: Sleeping).
# File size: 12,440 bytes.

#run this in your terminal, Olaolu
# Set-ExecutionPolicy Unrestricted -Scope Process
import torch
import gradio as gr
import librosa
import os
import base64
import tempfile
import io
import numpy as np

from dotenv import load_dotenv
from transformers import pipeline
from huggingface_hub import login
from google.cloud import translate_v3
from gradio.routes import mount_gradio_app
from spitch import Spitch
from pydub import AudioSegment
import requests

# Pull configuration/secrets from a local .env file into the process environment
load_dotenv()

# Spitch client, created once at import time
spitch_client = Spitch()

# ===========================
#  INITIAL SETUP
# ===========================

# Authenticate with Hugging Face when a token is configured
hf_token = os.getenv("HUGGINGFACE_TOKEN")

if not hf_token:
    print("⚠️ No Hugging Face token found. You cannot access private models.")
else:
    login(token=hf_token)

# Load and decode Google credentials.
# GOOGLE_APPLICATION_CREDENTIALS_JSON holds the service-account JSON as a
# Base64 string; it is decoded, written to a temporary .json file, and
# GOOGLE_APPLICATION_CREDENTIALS is pointed at that file.
creds_b64 = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")  # the Base64 string
if creds_b64:
    try:
        # binascii.Error (bad padding) and UnicodeDecodeError are both ValueErrors
        creds_json = base64.b64decode(creds_b64).decode("utf-8")
    except ValueError as e:
        print(f"⚠️ Could not decode GCP creds: {e}")
    else:
        # delete=False: the file must outlive this handle so it can be read
        # later by path. The `with` block flushes and closes the handle, which
        # avoids leaking it and lets the file be reopened (required on Windows).
        with tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", delete=False, suffix=".json"
        ) as temp_file:
            temp_file.write(creds_json)
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file.name
else:
    print("⚠️ No GCP creds found.")

# Google Cloud project ID
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")

# ===========================
#  LOAD ASR MODEL
# ===========================
# Pipeline device index: 0 selects the first GPU, -1 falls back to CPU
device = -1 if not torch.cuda.is_available() else 0


def _load_asr(model_id, label):
    """Build an ASR pipeline for `model_id`; on any failure print a warning and return None.

    `label` is the human-readable name used in the warning message.
    """
    try:
        return pipeline("automatic-speech-recognition", model=model_id, device=device)
    except Exception as e:
        print(f"⚠️ Could not load {label}: {e}")
        return None


# Yoruba ASR
asr_yoruba = _load_asr("NCAIR1/Yoruba-ASR", "Yoruba ASR")

# English ASR: falls back to facebook/wav2vec2-base-960h when EN_ASR_MODEL is not set
EN_ASR_MODEL = os.getenv("EN_ASR_MODEL", "facebook/wav2vec2-base-960h")
asr_english = _load_asr(EN_ASR_MODEL, f"English ASR ({EN_ASR_MODEL})")

print("✅ Done loading models!\n")

# ===========================
#  TRANSLATION FUNCTION
# ===========================
# Shared Google Translate client. It is created on first use (not at import)
# so that importing this module never requires valid credentials, and reused
# afterwards so a new client is not built on every textbox change.
_translate_client = None


def _get_translate_client():
    """Return the shared TranslationServiceClient, creating it on first use."""
    global _translate_client
    if _translate_client is None:
        _translate_client = translate_v3.TranslationServiceClient()
    return _translate_client


def translate_text(text: str, mode: str):
    """Translate `text` with Google Cloud Translation according to `mode`.

    Args:
        text: The text to translate. Empty, None, or whitespace-only input
            short-circuits to "" without calling the API.
        mode: 'Yoruba → English' translates yo -> en-US; any other value
            (normally 'English → Yoruba') translates en -> yo.

    Returns:
        The translated string, or "" when there is nothing to translate or
        the request fails (failures are printed, never raised).
    """
    if not text or not text.strip():
        return ""

    if mode == "Yoruba → English":
        source, target = "yo", "en-US"
    else:
        source, target = "en", "yo"

    # Without a project ID the request would go to "projects/None/..." and
    # fail remotely; report the real cause instead.
    if not PROJECT_ID:
        print("⚠️ Translation failed: GOOGLE_CLOUD_PROJECT is not set")
        return ""

    try:
        response = _get_translate_client().translate_text(
            contents=[text],
            parent=f"projects/{PROJECT_ID}/locations/global",
            mime_type="text/plain",
            source_language_code=source,
            target_language_code=target,
        )
    except Exception as e:
        print(f"⚠️ Translation failed: {e}")
        return ""  # fail gracefully

    if not response.translations:
        return ""
    return response.translations[0].translated_text

# ===========================
# Google English ASR
# ===========================
# import os

# from google.cloud.speech_v2 import SpeechClient
# from google.cloud.speech_v2.types import cloud_speech

# def quickstart_v2(audio_file: str) -> cloud_speech.RecognizeResponse:
#     """Transcribe an audio file.
#     Args:
#         audio_file (str): Path to the local audio file to be transcribed.
#     Returns:
#         cloud_speech.RecognizeResponse: The response from the recognize request, containing
#         the transcription results
#     """
#     # Reads a file as bytes
#     with open(audio_file, "rb") as f:
#         audio_content = f.read()

#     # Instantiates a client
#     client = SpeechClient()

#     config = cloud_speech.RecognitionConfig(
#         auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
#         language_codes=["en-US"],
#         model="long",
#     )

#     request = cloud_speech.RecognizeRequest(
#         recognizer=f"projects/{PROJECT_ID}/locations/global/recognizers/_",
#         config=config,
#         content=audio_content,
#     )

#     # Transcribes the audio into text
#     response = client.recognize(request=request)

#     for result in response.results:
#         print(f"Transcript: {result.alternatives[0].transcript}")

#     return response


# ===========================
#  ASR + translate wrapper
# ===========================
def transcribe_and_translate(file_path, mode: str):
    """Transcribe a recorded audio file and translate the transcription.

    Args:
        file_path: Path to the recorded audio file; falsy when there is no
            recording.
        mode: 'Yoruba → English' uses the Yoruba ASR pipeline; any other
            value uses the English ASR pipeline. The same mode is passed on
            to translate_text.

    Returns:
        A (transcription, translation) tuple. ("...", "") when no file is
        given; ("", "") when the required ASR pipeline failed to load or the
        transcription itself fails (the cause is printed, never raised).
    """
    if not file_path:
        return "...", ""

    # choose ASR pipeline; either may be None if it failed to load at startup
    asr = asr_yoruba if mode == "Yoruba → English" else asr_english
    if asr is None:
        print(f"⚠️ ASR model for '{mode}' is not loaded; cannot transcribe.")
        return "", ""

    try:
        # load and standardize audio (resampled to 16 kHz)
        audio, _ = librosa.load(file_path, sr=16000)
        asr_result = asr(audio)
    except Exception as e:
        print(f"⚠️ Transcription failed: {e}")
        return "", ""

    # get the transcription; tolerate a missing/None "text" entry
    if isinstance(asr_result, dict):
        transcription = asr_result.get("text") or ""
    else:
        transcription = str(asr_result)
    # capitalize() upper-cases the first character and lower-cases the rest
    transcription = transcription.capitalize()

    translation = translate_text(transcription, mode) if transcription else ""
    return transcription, translation


# # ===========================
# #  SPITCH TTS
# # ===========================
# def synthesize_tts_to_array(text: str, language: str = "yo", voice: str = "segun"):
#     """Given the translation,
#     Use Spitch to produce MP3 bytes and convert 
#     to (sr, numpy_array) for gr.Audio."""

#     if not text:
#         return None
    
#     #Invoke Spitch
#     resp = spitch_client.speech.generate(text=text, language=language, voice=voice, format="mp3")
    
#     #Get the mp3 bytes
#     mp3_bytes = resp.read()

#     #Get the audio file
#     audio = AudioSegment.from_file(io.BytesIO(mp3_bytes), format="mp3")
#     sr = audio.frame_rate
#     samples = np.array(audio.get_array_of_samples())

#     #TODO
#     if audio.channels > 1:
#         samples = samples.reshape((-1, audio.channels))
#     # normalize integer samples -> float32 in [-1, 1]
#     max_val = float(1 << (8 * audio.sample_width - 1))
#     samples = samples.astype(np.float32) / max_val
#     return (sr, samples)


# ===========================
#  YARNGPT TTS
# ===========================

def synthesize_tts_to_array(text: str, language: str = "yo", voice: str = "Femi",
                            timeout: float = 60.0):
    """Synthesize `text` with the YarnGPT TTS API and decode it for gr.Audio.

    Args:
        text: Text to speak. Empty or whitespace-only text returns None.
        language: Currently unused; kept so existing callers keep working.
        voice: Voice name sent to the API.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        (sample_rate, samples) where samples is a float32 numpy array scaled
        to [-1, 1], shaped (n_samples,) for mono or (n_frames, channels) for
        multi-channel audio. Returns None when the text or the YARNGPT_API_KEY
        environment variable is missing, the request fails, or the audio
        cannot be decoded (the cause is printed, never raised).
    """
    YARNGPT_API_URL = "https://yarngpt.ai/api/v1/tts"
    YARNGPT_API_KEY = os.getenv("YARNGPT_API_KEY")

    if not text or not text.strip() or not YARNGPT_API_KEY:
        print("⚠️Translation or API key is missing")
        return None

    headers = {"Authorization": f"Bearer {YARNGPT_API_KEY}"}
    payload = {"text": text, "voice": voice}

    # Invoke YarnGPT. A timeout is required: without one, requests can block
    # forever on an unresponsive server. The whole body is needed at once, so
    # the response is not streamed.
    try:
        response = requests.post(
            YARNGPT_API_URL,
            headers=headers,
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"Error calling YarnGPT: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        # The error body is not guaranteed to be JSON, so print it as text
        print(response.text)
        return None

    # Get the mp3 bytes
    mp3_bytes = response.content

    # Use io.BytesIO to treat the bytes content as a file in memory
    try:
        audio = AudioSegment.from_file(io.BytesIO(mp3_bytes), format="mp3")
    except Exception as e:
        print(f"Error processing audio with pydub: {e}")
        return None

    # Get the sampling rate and the raw integer samples
    sr = audio.frame_rate
    samples = np.array(audio.get_array_of_samples())

    # pydub returns multi-channel audio as one flat, interleaved sequence
    # (L, R, L, R, ...). Reshape it to (n_frames, channels) so each column is
    # one channel.
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels))

    # normalize integer samples -> float32 in [-1, 1];
    # max_val is 2**(bits - 1), e.g. 32768 for 16-bit audio
    max_val = float(1 << (8 * audio.sample_width - 1))
    samples = samples.astype(np.float32) / max_val
    return (sr, samples)


# =========================== OLD
#  GRADIO INTERFACE
# ===========================
# with gr.Blocks(title="Olùkọ́ | Learn Yoruba") as app:
#     gr.Markdown("# 🇳🇬 Olùkọ́")
#     gr.Markdown(
#         "Comprehensive Yoruba learning tool!"
#     )


# # --- Tab 1: ASR + Translator ---
#     # single editable textbox + a microphone recorder placed beside it
#     with gr.Row():
#         output_transcription = gr.Textbox(
#             label="✍️ Speak/Type...",
#             interactive=True,
#             placeholder="Type here or press the mic to speak..."
#         )
#         mic_recorder = gr.Audio(
#             sources="microphone", #accept microphone input
#             type="filepath", #store this in a filepath
#             label="🎙️",
#             show_label=True
#         )

#     output_translation = gr.Textbox(label="💬 Translation (English)")

#     # When the mic recorder finishes, transcribe and place text into the editable textbox
#     mic_recorder.change(
#         transcribe_audio,
#         inputs=mic_recorder, #input to the transcribe_audio function
#         outputs=output_transcription, #output goes into this text box
#     )

#     # When the user types / changes the textbox, translate
#     output_transcription.change(
#         translate_text,
#         inputs=output_transcription, #input to the translate_text function
#         outputs=output_translation, #output of the translate_text function gets stored in outputs
#     )

# ===========================
#  GRADIO INTERFACE - Tab 1 (updated)
# ===========================
# Build the Gradio UI: a direction selector, an editable transcription box
# next to a microphone recorder, and a translation box, plus the event wiring
# that connects them to transcribe_and_translate / translate_text.
with gr.Blocks(title="Olùkọ́ | Learn Yoruba") as app:
    # Page heading and tagline
    gr.Markdown("# 🇳🇬 Olùkọ́")
    gr.Markdown("Comprehensive Yoruba learning tool!")

    # Direction selector; its value is passed as `mode` to both handlers below
    mode = gr.Radio(
        choices=["Yoruba → English", "English → Yoruba"],
        value="Yoruba → English",
        label="Direction"
    )
    with gr.Row():

        # Single editable textbox with the microphone next to it.
        # User input textbox: typed by the user or filled in by the mic handler
        output_transcription = gr.Textbox(
            label="✍️ Speak/Type...",
            interactive=True
        )

        # User input microphone; type="filepath" hands the handler a file path
        mic_recorder = gr.Audio(
            sources="microphone",
            type="filepath",
            label="🎙️",
            show_label=True
        )

    # Translation textbox (and, once enabled, the TTS controls) share a row
    with gr.Row():
        # Translation textbox
        output_translation = gr.Textbox(label="💬 Translation")

        # TODO: button that triggers TTS
        # tts_button = gr.Button("Play TTS")

        # TODO: audio component for TTS playback
        # tts_audio = gr.Audio(label="TTS Playback", type="numpy", interactive=False)

    # TODO: enable together with the TTS button/audio above
    # def _on_tts_click(text, direction):
    #     """Generate TTS from the translation textbox (no disk write) 
    #     and return (sr, samples)."""
    #     if not text:
    #         return None
        
    #     # select language/voice mapping as needed
    #     if direction == "English → Yoruba":
    #         lang = "yo"
    #         voice = "Femi"
    #     else:
    #         lang = "en"
    #         voice = "Mary"

    #     try:
    #         result = synthesize_tts_to_array(text, language=lang, voice=voice)
    #         return result  # (sr, numpy_array) or None
    #     except Exception as e:
    #         print("TTS generation failed:", e)
    #         return None

    # When the recorder's value changes: transcribe + translate and populate
    # both boxes.
    # NOTE(review): this handler also writes output_transcription, so the
    # textbox change handler below presumably fires again and translates a
    # second time — confirm against Gradio's change-event semantics.
    mic_recorder.change(
        transcribe_and_translate,
        inputs=[mic_recorder, mode],
        outputs=[output_transcription, output_translation],
    )

    # When the transcription box changes, translate its text according to mode
    output_transcription.change(
        translate_text,
        inputs=[output_transcription, mode],
        outputs=output_translation,
    )

    # TODO: when tts_button is clicked, call _on_tts_click and send the
    # resulting audio (sr, numpy_array) to the tts_audio component
    # tts_button.click(
    #     _on_tts_click, 
    #     inputs=[output_translation, mode], 
    #     outputs=tts_audio
    # )

# ===========================
#  APP LAUNCH
# ===========================
# mount_gradio_app(api_app, app, path="/")

# Start the Gradio server only when this file is run as a script (not on import).
if __name__ == "__main__":
    app.launch()  # alternative: app.launch(server_name="0.0.0.0", server_port=7860)