Spaces:

Senath
/

iVoiceSeamless

Sleeping

File size: 1,871 Bytes

8ab6697
5ca0193
8ab6697
 
9acc204
 
 
 
 
 
c3764b1
ba62404
8ab6697
 
 
9acc204
 
 
 
5ca0193
ba62404
9acc204
 
 
7ce9df0
9acc204
e8bbdcb
9acc204
 
ba62404
9acc204
 
 
ba62404
9acc204
 
 
48bd16f
9acc204
e8bbdcb
7ce9df0
ba62404
c3764b1
 
9acc204
8ab6697
 
48bd16f
c3764b1
 
8ab6697
5ca0193
c3764b1
9acc204
ba62404
c06020e
9acc204
ba62404
7ce9df0

import os
import torch
import torchaudio
import gradio as gr
from transformers import (
    AutoProcessor,
    SeamlessM4TProcessor,
    SeamlessM4TForTextToText,
    SeamlessM4TForTextToSpeech
)

# Checkpoint identifier shared by the processor and both model heads.
MODEL_NAME = "facebook/hf-seamless-m4t-medium"

# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor prepares text inputs and decodes generated token ids.
processor = SeamlessM4TProcessor.from_pretrained(MODEL_NAME)

# Text-to-text head: moved to the chosen device and put in eval mode.
t2t_model = SeamlessM4TForTextToText.from_pretrained(MODEL_NAME)
t2t_model = t2t_model.to(device).eval()

# Text-to-speech head: same checkpoint, same device, eval mode.
t2s_model = SeamlessM4TForTextToSpeech.from_pretrained(MODEL_NAME)
t2s_model = t2s_model.to(device).eval()

# Main translation function
def translate(text_input, source_lang, target_lang, auto_detect):
    """Translate text and synthesize the translation as speech.

    Args:
        text_input: Text to translate. ``None``, empty, or whitespace-only
            input returns a message without calling the models.
        source_lang: Source language code (e.g. ``"eng"``). Surrounding
            whitespace is stripped. Ignored when ``auto_detect`` is true;
            a blank value is treated the same way.
        target_lang: Target language code (e.g. ``"fra"``). Surrounding
            whitespace is stripped. A blank value returns a message without
            calling the models.
        auto_detect: When true, ``src_lang=None`` is passed to the processor.

    Returns:
        A ``(translated_text, translated_audio)`` tuple. ``translated_audio``
        is a ``(sample_rate, waveform)`` pair, or ``None`` when input
        validation fails (``translated_text`` then holds the message).
    """
    # Whitespace-only text would otherwise be sent to the model as real input.
    if not text_input or not text_input.strip():
        return "No input text provided.", None

    # The language fields are free-text boxes, so tolerate stray whitespace
    # and reject a missing target code before it reaches generate().
    tgt = (target_lang or "").strip()
    if not tgt:
        return "No target language provided.", None

    # A blank source code takes the same path as auto-detect instead of
    # being forwarded as an empty language code.
    # NOTE(review): how the processor behaves with src_lang=None (detection
    # vs. a tokenizer default) is not visible here - confirm.
    src = None if auto_detect else ((source_lang or "").strip() or None)

    # Prepare input
    inputs = processor(text=text_input, src_lang=src, return_tensors="pt").to(device)

    # Text-to-Text
    text_tokens = t2t_model.generate(**inputs, tgt_lang=tgt)
    translated_text = processor.decode(text_tokens[0].tolist(), skip_special_tokens=True)

    # Text-to-Speech: the first element of the generate() result is the waveform.
    speech_waveform = t2s_model.generate(**inputs, tgt_lang=tgt)[0].cpu().numpy().squeeze()

    # Prefer the model's configured sample rate; fall back to the previous
    # hard-coded 16000 if the config does not expose one.
    # NOTE(review): assumes the config attribute is named `sampling_rate` - confirm.
    sample_rate = getattr(t2s_model.config, "sampling_rate", 16000)
    translated_audio = (sample_rate, speech_waveform)

    return translated_text, translated_audio

# Gradio UI: four inputs (text, source code, target code, auto-detect flag)
# mapped positionally onto translate()'s parameters, two outputs mapped onto
# its returned (text, audio) pair.
_input_widgets = [
    gr.Textbox(label="Input Text"),
    gr.Textbox(label="Source Language (e.g. eng)"),
    gr.Textbox(label="Target Language (e.g. fra)"),
    gr.Checkbox(label="Auto-detect source language"),
]
_output_widgets = [
    gr.Textbox(label="Translated Text"),
    gr.Audio(label="Translated Speech"),
]

# Build the interface and enable request queuing; iface is whatever
# .queue() returns, as before.
iface = gr.Interface(
    fn=translate,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="iVoice Translate (T2T + T2S)",
).queue()

# Entry point: start the Gradio server only when run as a script.
if __name__ == "__main__":
    # Port comes from the PORT environment variable, defaulting to 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    iface.launch(
        server_name="0.0.0.0",
        share=True,
        server_port=listen_port,
    )