File size: 2,514 Bytes
bae3072
 
3d6d4c6
3787c01
e9e99ba
3d6d4c6
 
 
3787c01
3d6d4c6
3787c01
bae3072
3d6d4c6
bae3072
 
 
3d6d4c6
1967070
3d6d4c6
 
 
e9e99ba
 
 
3d6d4c6
3787c01
 
 
3d6d4c6
e9e99ba
 
 
 
3787c01
e9e99ba
3787c01
3d6d4c6
 
e9e99ba
3d6d4c6
 
e9e99ba
3d6d4c6
1967070
e475210
e9e99ba
3d6d4c6
 
 
 
e475210
3d6d4c6
bae3072
3787c01
bae3072
3787c01
3d6d4c6
3787c01
 
bae3072
 
 
3787c01
bae3072
3787c01
bae3072
 
3787c01
bae3072
3787c01
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import torch
import torchaudio
import tempfile
from TTS.api import TTS  # Offline TTS
from transformers import (
    SeamlessM4TProcessor,
    SeamlessM4TForSpeechToText,
    SeamlessM4TForSpeechToSpeech,
)
import gradio as gr

# Constants
MODEL_NAME = "facebook/hf-seamless-m4t-medium"  # SeamlessM4T medium checkpoint on the HF hub
device = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when available

# Load processor and models once at import time; both models are moved to
# `device` and put in eval mode (inference only, disables dropout etc.).
processor = SeamlessM4TProcessor.from_pretrained(MODEL_NAME)
s2t_model = SeamlessM4TForSpeechToText.from_pretrained(MODEL_NAME).to(device).eval()
s2s_model = SeamlessM4TForSpeechToSpeech.from_pretrained(MODEL_NAME).to(device).eval()

# Load offline TTS model (English-only for now)
# NOTE(review): TTS is pinned to CPU (gpu=False) even when CUDA is available — confirm intentional.
tts_engine = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)

# Main translation function
def translate_from_text(text_input, source_lang, target_lang, auto_detect):
    """Translate text by synthesizing speech and running SeamlessM4T on it.

    Pipeline: offline TTS renders `text_input` to a wav, which is fed to the
    speech-to-text model (translated text) and the speech-to-speech model
    (translated audio).

    Args:
        text_input: Source text to translate. Blank input short-circuits.
        source_lang: Source language code (e.g. "eng"); ignored when
            `auto_detect` is true.
        target_lang: Target language code (e.g. "hin").
        auto_detect: If true, pass src_lang=None and let the model detect.

    Returns:
        (translated_text, (16000, waveform ndarray)) on success, or
        ("Empty input text.", None) for blank input.
    """
    if not text_input.strip():
        return "Empty input text.", None

    # Step 1: Convert input text to speech using offline TTS.
    # delete=False so the TTS engine can write to the path after we close it
    # (required on Windows); we remove the file ourselves to avoid leaking
    # one temp wav per call.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    try:
        tts_engine.tts_to_file(text=text_input, file_path=tmp.name)
        waveform, sr = torchaudio.load(tmp.name)
    finally:
        os.remove(tmp.name)

    # Step 2: Downmix to mono (torchaudio returns (channels, samples)) and
    # resample to the 16 kHz rate SeamlessM4T expects.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    waveform = torchaudio.functional.resample(waveform, sr, 16000)
    src = None if auto_detect else source_lang

    # Step 3: Prepare processor input (declare the sampling rate explicitly
    # rather than relying on the feature extractor's default).
    inputs = processor(
        audios=waveform, sampling_rate=16000, src_lang=src, return_tensors="pt"
    ).to(device)

    # Steps 4-5: run both generations without autograd bookkeeping.
    with torch.inference_mode():
        # Speech-to-Text
        text_tokens = s2t_model.generate(**inputs, tgt_lang=target_lang)
        # Speech-to-Speech
        speech = s2s_model.generate(**inputs, tgt_lang=target_lang)[0]

    translated_text = processor.decode(text_tokens[0].tolist(), skip_special_tokens=True)
    translated_audio = (16000, speech.cpu().numpy().squeeze())

    return translated_text, translated_audio

# Gradio Interface: wires translate_from_text to four inputs (text, source
# language code, target language code, auto-detect flag) and two outputs
# (translated text, translated audio as a (rate, ndarray) numpy tuple).
# .queue() enables request queuing so long-running generations don't time out.
iface = gr.Interface(
    fn=translate_from_text,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Source Language (e.g. eng)"),
        gr.Textbox(label="Target Language (e.g. hin)"),
        gr.Checkbox(label="Auto-detect Source Language")
    ],
    outputs=[
        gr.Textbox(label="Translated Text"),
        gr.Audio(label="Translated Speech", type="numpy")  # matches the (16000, ndarray) return
    ],
    title="iVoice Translate (T2T + T2S → S2T + S2S)"
).queue()

# Launch server: bind on all interfaces; port comes from the PORT env var
# (container/PaaS convention), falling back to Gradio's default 7860.
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))