"""Gradio demo: text-to-text and text-to-speech translation with SeamlessM4T."""

import os

import gradio as gr
import torch
import torchaudio  # noqa: F401  (kept: imported by the original file)
from transformers import (
    AutoProcessor,  # noqa: F401  (kept: imported by the original file)
    SeamlessM4TForTextToSpeech,
    SeamlessM4TForTextToText,
    SeamlessM4TProcessor,
)

# Constants
MODEL_NAME = "facebook/hf-seamless-m4t-medium"
# Sample rate reported to Gradio when the model config does not expose one.
DEFAULT_SAMPLE_RATE = 16000
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load processor and models once at import time so every request reuses them.
processor = SeamlessM4TProcessor.from_pretrained(MODEL_NAME)
t2t_model = SeamlessM4TForTextToText.from_pretrained(MODEL_NAME).to(device).eval()
t2s_model = SeamlessM4TForTextToSpeech.from_pretrained(MODEL_NAME).to(device).eval()


# Main translation function
def translate(text_input, source_lang, target_lang, auto_detect):
    """Translate *text_input* and synthesize the translation as speech.

    Args:
        text_input: Text to translate.
        source_lang: Source language code (e.g. ``"eng"``). Ignored when
            *auto_detect* is true.
        target_lang: Target language code (e.g. ``"fra"``).
        auto_detect: When true, no source language is passed to the processor.

    Returns:
        A ``(translated_text, audio)`` pair. ``audio`` is a
        ``(sample_rate, waveform)`` tuple for ``gr.Audio``, or ``None`` when
        the request is rejected; in that case the first element is a message
        describing the problem.
    """
    # Guard clauses: reject unusable input before touching the models.
    if not text_input or not text_input.strip():
        return "No input text provided.", None

    target = (target_lang or "").strip()
    if not target:
        return "No target language provided.", None

    if auto_detect:
        # NOTE(review): with src_lang=None the processor/tokenizer falls back
        # to whatever source language it currently holds -- confirm this is
        # the intended "auto-detect" behaviour for text input.
        src = None
    else:
        src = (source_lang or "").strip()
        if not src:
            return "No source language provided.", None

    # Prepare input (the same encoded tensors feed both models).
    inputs = processor(text=text_input, src_lang=src, return_tensors="pt").to(device)

    # Text-to-Text
    text_tokens = t2t_model.generate(**inputs, tgt_lang=target)
    translated_text = processor.decode(
        text_tokens[0].tolist(), skip_special_tokens=True
    )

    # Text-to-Speech: element 0 of the generate() output is the waveform.
    speech_output = t2s_model.generate(**inputs, tgt_lang=target)
    speech_waveform = speech_output[0].cpu().numpy().squeeze()

    # Prefer the rate declared by the model config over a hard-coded value.
    sample_rate = getattr(t2s_model.config, "sampling_rate", DEFAULT_SAMPLE_RATE)
    translated_audio = (sample_rate, speech_waveform)

    return translated_text, translated_audio


# Gradio Interface
iface = gr.Interface(
    fn=translate,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Source Language (e.g. eng)"),
        gr.Textbox(label="Target Language (e.g. fra)"),
        gr.Checkbox(label="Auto-detect source language"),
    ],
    outputs=[
        gr.Textbox(label="Translated Text"),
        gr.Audio(label="Translated Speech"),
    ],
    title="iVoice Translate (T2T + T2S)",
).queue()

# Launch
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        share=True,
        # PORT may be injected by the hosting platform; default to 7860.
        server_port=int(os.environ.get("PORT", 7860)),
    )