Spaces:

Senath
/

iVoiceSeamless

Runtime error

App Files Community

Senath committed on May 28, 2025

Commit

8ab6697

verified ·

1 Parent(s): dbf8c6e

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -30

app.py CHANGED Viewed

@@ -1,48 +1,58 @@
-import gradio as gr
-import torchaudio
 import torch
 from transformers import AutoProcessor, SeamlessM4TModel
-# Load model and processor
-model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
-processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
-def translate(text_input, audio_file, target_lang):
-    results = []
     if text_input:
-        text_inputs = processor(text=text_input, return_tensors="pt")
-        audio_out = model.generate(**text_inputs, tgt_lang=target_lang)[0].cpu().numpy().squeeze()
-        results.append(("Translated from text", audio_out))
-    if audio_file:
-        audio_waveform, sr = torchaudio.load(audio_file)
-        audio_waveform = torchaudio.functional.resample(audio_waveform, sr, 16000)
-        audio_inputs = processor(audios=audio_waveform, return_tensors="pt")
-        audio_out = model.generate(**audio_inputs, tgt_lang=target_lang)[0].cpu().numpy().squeeze()
-        results.append(("Translated from audio", audio_out))
-    if results:
-        combined_text = "\n".join([r[0] for r in results])
-        combined_audio = results[0][1]
-        return combined_text, (16000, combined_audio)
     return "No input provided.", None
-demo = gr.Interface(
     fn=translate,
     inputs=[
-        gr.Textbox(label="Input Text", placeholder="Enter text to translate (optional)"),
         gr.Audio(type="filepath", label="Input Audio (optional)"),
-        gr.Dropdown(choices=["eng", "hin", "spa", "fra", "por"], label="Target Language", value="hin")
     ],
     outputs=[
-        gr.Textbox(label="Translation Info"),
         gr.Audio(label="Translated Speech")
     ],
-    title="SeamlessM4T Translation (Text & Audio)",
-    description="Upload audio or enter text, pick a target language, and get translated text + speech."
-)
 if __name__ == "__main__":
-    demo.launch()

+import os
 import torch
+import torchaudio
+import gradio as gr
 from transformers import AutoProcessor, SeamlessM4TModel
+MODEL_NAME = "facebook/hf-seamless-m4t-medium"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+processor = AutoProcessor.from_pretrained(MODEL_NAME)
+model = SeamlessM4TModel.from_pretrained(MODEL_NAME).to(device).eval()
+def translate(text_input, audio_input, source_lang, target_lang, auto_detect):
+    outputs = []
+    src = None if auto_detect else source_lang
+    # From text input
     if text_input:
+        inputs = processor(text=text_input, src_lang=src, return_tensors="pt").to(device)
+        output = model.generate(**inputs, tgt_lang=target_lang)
+        text_out = processor.decode(output[0].tolist(), skip_special_tokens=True)
+        speech_out = output[1].cpu().numpy().squeeze()
+        outputs.append((f"Text translated", text_out, (16000, speech_out)))
+    # From audio input
+    elif audio_input:
+        waveform, sr = torchaudio.load(audio_input)
+        waveform = torchaudio.functional.resample(waveform, sr, 16000)
+        inputs = processor(audios=waveform, src_lang=src, return_tensors="pt").to(device)
+        output = model.generate(**inputs, tgt_lang=target_lang)
+        text_out = processor.decode(output[0].tolist(), skip_special_tokens=True)
+        speech_out = output[1].cpu().numpy().squeeze()
+        outputs.append((f"Audio translated", text_out, (16000, speech_out)))
+    if outputs:
+        _, txt, aud = outputs[0]
+        return txt, aud
     return "No input provided.", None
+iface = gr.Interface(
     fn=translate,
     inputs=[
+        gr.Textbox(label="Input Text (optional)"),
         gr.Audio(type="filepath", label="Input Audio (optional)"),
+        gr.Textbox(label="Source Language (e.g. eng)"),
+        gr.Textbox(label="Target Language (e.g. fra)"),
+        gr.Checkbox(label="Auto-detect source language")
     ],
     outputs=[
+        gr.Textbox(label="Translated Text"),
         gr.Audio(label="Translated Speech")
     ],
+    title="iVoice Translate (Text + Speech)"
+).queue()
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", share=True, server_port=int(os.environ.get("PORT", 7860)))