Spaces:
Sleeping
Sleeping
File size: 1,871 Bytes
8ab6697 5ca0193 8ab6697 9acc204 c3764b1 ba62404 8ab6697 9acc204 5ca0193 ba62404 9acc204 7ce9df0 9acc204 e8bbdcb 9acc204 ba62404 9acc204 ba62404 9acc204 48bd16f 9acc204 e8bbdcb 7ce9df0 ba62404 c3764b1 9acc204 8ab6697 48bd16f c3764b1 8ab6697 5ca0193 c3764b1 9acc204 ba62404 c06020e 9acc204 ba62404 7ce9df0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | import os
import torch
import torchaudio
import gradio as gr
from transformers import (
AutoProcessor,
SeamlessM4TProcessor,
SeamlessM4TForTextToText,
SeamlessM4TForTextToSpeech
)
# Constants
# Single checkpoint id shared by the processor and both task-specific models.
MODEL_NAME = "facebook/hf-seamless-m4t-medium"

# Run on CUDA when torch reports it as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load processor and models
processor = SeamlessM4TProcessor.from_pretrained(MODEL_NAME)

# Each model is loaded, then moved to the chosen device and put in eval mode.
t2t_model = SeamlessM4TForTextToText.from_pretrained(MODEL_NAME)
t2t_model = t2t_model.to(device).eval()

t2s_model = SeamlessM4TForTextToSpeech.from_pretrained(MODEL_NAME)
t2s_model = t2s_model.to(device).eval()
def _normalize_lang(code):
    """Return *code* without surrounding whitespace, or None when it is blank."""
    if isinstance(code, str):
        code = code.strip()
    return code or None


# Main translation function
def translate(text_input, source_lang, target_lang, auto_detect):
    """Translate text and synthesize the translation as speech.

    Args:
        text_input: Text to translate. ``None``, empty, or whitespace-only
            input is rejected before any model is invoked.
        source_lang: Source language code (the UI suggests e.g. ``"eng"``).
            Ignored when ``auto_detect`` is truthy; a blank value is treated
            as "not specified".
        target_lang: Target language code (the UI suggests e.g. ``"fra"``).
            Required; a blank value is rejected with a message.
        auto_detect: When truthy, no source language is passed to the
            processor.

    Returns:
        A ``(translated_text, translated_audio)`` tuple. On success
        ``translated_audio`` is a ``(16000, waveform)`` pair. When the request
        is rejected, ``translated_text`` is a message for the user and
        ``translated_audio`` is ``None``.
    """
    # Reject missing or whitespace-only text up front instead of running both
    # models on input that carries nothing to translate.
    if not text_input or (isinstance(text_input, str) and not text_input.strip()):
        return "No input text provided.", None

    # Language codes come from free-text boxes, so strip stray whitespace and
    # refuse a blank target rather than letting generate() fail on it.
    target = _normalize_lang(target_lang)
    if target is None:
        return "No target language provided.", None

    # NOTE(review): with src_lang=None the processor presumably keeps its own
    # current/default source language rather than detecting one -- confirm
    # against the SeamlessM4TProcessor documentation.
    src = None if auto_detect else _normalize_lang(source_lang)

    # Prepare input (the same encoded tensors feed both models below)
    inputs = processor(text=text_input, src_lang=src, return_tensors="pt").to(device)

    # Text-to-Text
    text_tokens = t2t_model.generate(**inputs, tgt_lang=target)
    translated_text = processor.decode(text_tokens[0].tolist(), skip_special_tokens=True)

    # Text-to-Speech: the first element of generate()'s result is used as the
    # waveform and flattened to a NumPy array on the CPU.
    speech_waveform = t2s_model.generate(**inputs, tgt_lang=target)[0].cpu().numpy().squeeze()
    # 16000 is passed as the sample rate paired with the waveform
    # (presumably the model's output rate -- TODO confirm).
    translated_audio = (16000, speech_waveform)
    return translated_text, translated_audio
# Gradio Interface
# The four inputs map positionally onto translate(text_input, source_lang,
# target_lang, auto_detect); the two outputs receive its returned pair.
iface = gr.Interface(
    title="iVoice Translate (T2T + T2S)",
    fn=translate,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Source Language (e.g. eng)"),
        gr.Textbox(label="Target Language (e.g. fra)"),
        gr.Checkbox(label="Auto-detect source language"),
    ],
    outputs=[
        gr.Textbox(label="Translated Text"),
        gr.Audio(label="Translated Speech"),
    ],
)
# Enable request queuing on the interface before it is launched.
iface = iface.queue()
# Launch
if __name__ == "__main__":
    # Listen on all interfaces, take the port from $PORT (falling back to
    # 7860), and start with sharing enabled.
    iface.launch(
        server_port=int(os.getenv("PORT", 7860)),
        server_name="0.0.0.0",
        share=True,
    )
|