import os
import torch
import torchaudio
import tempfile
from TTS.api import TTS # Offline TTS
from transformers import (
SeamlessM4TProcessor,
SeamlessM4TForSpeechToText,
SeamlessM4TForSpeechToSpeech,
)
import gradio as gr
# Constants
MODEL_NAME = "facebook/hf-seamless-m4t-medium"

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load processor and models.
# Both models are loaded from the same checkpoint name, moved to `device`,
# and switched to eval mode.
processor = SeamlessM4TProcessor.from_pretrained(MODEL_NAME)

s2t_model = SeamlessM4TForSpeechToText.from_pretrained(MODEL_NAME)
s2t_model = s2t_model.to(device).eval()

s2s_model = SeamlessM4TForSpeechToSpeech.from_pretrained(MODEL_NAME)
s2s_model = s2s_model.to(device).eval()

# Load offline TTS model (English-only for now); it is created with gpu=False
# regardless of `device`.
tts_engine = TTS(
    gpu=False,
    progress_bar=False,
    model_name="tts_models/en/ljspeech/tacotron2-DDC",
)
# Main translation function
def translate_from_text(text_input, source_lang, target_lang, auto_detect):
    """Translate text by synthesizing it to speech and translating that audio.

    The text is rendered to a WAV file with the module-level ``tts_engine``,
    resampled to 16 kHz, and passed through the speech-to-text and
    speech-to-speech models.

    Args:
        text_input: Text to translate. ``None``, empty, or whitespace-only
            input short-circuits with an error message.
        source_lang: Source language code passed to the processor as
            ``src_lang`` (ignored when ``auto_detect`` is truthy).
        target_lang: Target language code passed to both models as
            ``tgt_lang``.
        auto_detect: When truthy, ``src_lang=None`` is passed to the
            processor instead of ``source_lang``.

    Returns:
        A ``(translated_text, translated_audio)`` tuple, where
        ``translated_audio`` is ``(16000, numpy_waveform)``. For empty input
        the result is ``("Empty input text.", None)``.
    """
    # Guard first: a missing value must not crash on ``.strip()``.
    if text_input is None or not text_input.strip():
        return "Empty input text.", None

    sample_rate = 16000

    # Step 1: Convert input text to speech using offline TTS.
    # A temporary directory is used so the WAV is removed on every exit path
    # (including exceptions) and no handle is held open on the path while
    # the TTS engine writes to it.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, "tts_input.wav")
        tts_engine.tts_to_file(text=text_input, file_path=wav_path)
        waveform, sr = torchaudio.load(wav_path)

    # Step 2: Resample to 16kHz
    waveform = torchaudio.functional.resample(waveform, sr, sample_rate)

    src = None if auto_detect else source_lang

    # Step 3: Prepare processor input
    inputs = processor(audios=waveform, src_lang=src, return_tensors="pt").to(device)

    # Step 4: Speech-to-Text
    text_tokens = s2t_model.generate(**inputs, tgt_lang=target_lang)
    translated_text = processor.decode(text_tokens[0].tolist(), skip_special_tokens=True)

    # Step 5: Speech-to-Speech
    # The first element of the generate output is used as the waveform; it is
    # moved to the CPU and squeezed into a numpy array for the audio output.
    speech_output = s2s_model.generate(**inputs, tgt_lang=target_lang)
    speech_waveform = speech_output[0].cpu().numpy().squeeze()
    translated_audio = (sample_rate, speech_waveform)

    return translated_text, translated_audio
# Gradio Interface
# Four inputs map positionally onto translate_from_text's parameters
# (text, source language, target language, auto-detect flag); the two outputs
# receive the returned text and the (sample_rate, waveform) audio tuple.
iface = gr.Interface(
    title="iVoice Translate (T2T + T2S → S2T + S2S)",
    fn=translate_from_text,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Source Language (e.g. eng)"),
        gr.Textbox(label="Target Language (e.g. hin)"),
        gr.Checkbox(label="Auto-detect Source Language"),
    ],
    outputs=[
        gr.Textbox(label="Translated Text"),
        gr.Audio(label="Translated Speech", type="numpy"),
    ],
)
iface = iface.queue()

# Launch server
# Binds to all interfaces; the port comes from the PORT environment variable,
# defaulting to 7860 when it is unset.
if __name__ == "__main__":
    iface.launch(
        server_port=int(os.environ.get("PORT", 7860)),
        server_name="0.0.0.0",
    )