File size: 3,852 Bytes
d4f129c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""Untitled2.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1dwiOTRmj8MDuVOgv2OKzE7qx3UXPKCQH
"""

# Install necessary packages before running this script.
# NOTE: the notebook's "!pip install ..." cell is IPython shell-escape syntax
# and is a SyntaxError in a plain .py file, so it is kept here as a comment.
# In a notebook cell, prefix the command below with "!".
#   pip install gradio git+https://github.com/huggingface/transformers.git sentencepiece torchaudio

import gradio as gr
from transformers import AutoProcessor, SeamlessM4Tv2Model
import torchaudio
import numpy as np

# Load the processor and model once at import time; both come from the same
# checkpoint, so the identifier is spelled out in a single place.
_CHECKPOINT = "facebook/seamless-m4t-v2-large"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = SeamlessM4Tv2Model.from_pretrained(_CHECKPOINT)

# Sampling rate reported by the model config; returned alongside every
# generated waveform so Gradio can play it back.
sample_rate = model.config.sampling_rate

# Text-to-Speech function
def text_to_speech(text, src_lang="eng", tgt_lang="arb"):
    """Translate ``text`` and synthesize the translation as speech.

    Args:
        text: Input text to translate.
        src_lang: Language code of ``text`` (default ``"eng"``).
        tgt_lang: Language code of the generated speech (default ``"arb"``).

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is the
        squeezed NumPy array taken from the first element of the model's
        ``generate`` output.
    """
    encoded = processor(text=text, src_lang=src_lang, return_tensors="pt")
    generated = model.generate(**encoded, tgt_lang=tgt_lang)
    # The first element of the output holds the waveform tensor.
    waveform = generated[0].cpu().numpy().squeeze()
    return sample_rate, waveform

# Sampling rate (Hz) that input audio is resampled to before it is handed to
# the processor.
_INPUT_SAMPLE_RATE = 16000


def _load_audio_for_model(audio):
    """Load an audio file as a mono waveform resampled to 16 kHz.

    Args:
        audio: Path of the audio file (Gradio passes ``None`` when the user
            submits without recording or uploading anything).

    Returns:
        The waveform tensor returned by ``torchaudio.load``, down-mixed to a
        single channel and resampled to ``_INPUT_SAMPLE_RATE``.

    Raises:
        gr.Error: If no audio file was supplied.
    """
    if not audio:
        # Surface a readable message in the UI instead of an opaque loader error.
        raise gr.Error("Please record or upload an audio clip first.")

    waveform, orig_freq = torchaudio.load(audio)
    # The feature extractor is documented as mono-only; average the channels
    # so multi-channel uploads are reduced to one channel explicitly.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    return torchaudio.functional.resample(
        waveform, orig_freq=orig_freq, new_freq=_INPUT_SAMPLE_RATE
    )


# Speech-to-Speech function
def speech_to_speech(audio, src_lang="eng", tgt_lang="rus"):
    """Translate spoken audio into speech in another language.

    Args:
        audio: Path of the input audio file.
        src_lang: Accepted so the signature matches the Gradio form, but not
            used: no source language is passed to the processor for audio
            input.
        tgt_lang: Language code of the generated speech (default ``"rus"``).

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is the
        squeezed NumPy array taken from the first element of the model's
        ``generate`` output.

    Raises:
        gr.Error: If no audio file was supplied.
    """
    waveform = _load_audio_for_model(audio)
    # Passing the sampling rate lets the feature extractor validate the input.
    audio_inputs = processor(
        audios=waveform, sampling_rate=_INPUT_SAMPLE_RATE, return_tensors="pt"
    )
    generated = model.generate(**audio_inputs, tgt_lang=tgt_lang)
    return sample_rate, generated[0].cpu().numpy().squeeze()


# Speech-to-Text function
def speech_to_text(audio, src_lang="eng", tgt_lang="ces"):
    """Translate spoken audio into text in another language.

    Args:
        audio: Path of the input audio file.
        src_lang: Accepted so the signature matches the Gradio form, but not
            used: no source language is passed to the processor for audio
            input.
        tgt_lang: Language code of the translated text (default ``"ces"``).

    Returns:
        The decoded translation string, with special tokens removed.

    Raises:
        gr.Error: If no audio file was supplied.
    """
    waveform = _load_audio_for_model(audio)
    # Passing the sampling rate lets the feature extractor validate the input.
    audio_inputs = processor(
        audios=waveform, sampling_rate=_INPUT_SAMPLE_RATE, return_tensors="pt"
    )
    # generate_speech=False makes the model return text tokens only.
    output_tokens = model.generate(**audio_inputs, tgt_lang=tgt_lang, generate_speech=False)
    token_ids = output_tokens[0].tolist()[0]
    return processor.decode(token_ids, skip_special_tokens=True)

# Text-to-Text function
def text_to_text(text, src_lang="eng", tgt_lang="ces"):
    """Translate ``text`` from ``src_lang`` into ``tgt_lang``.

    Args:
        text: Input text to translate.
        src_lang: Language code of ``text`` (default ``"eng"``).
        tgt_lang: Language code of the translation (default ``"ces"``).

    Returns:
        The decoded translation string, with special tokens removed.
    """
    encoded = processor(text=text, src_lang=src_lang, return_tensors="pt")
    # generate_speech=False makes the model return text tokens only.
    generated = model.generate(**encoded, tgt_lang=tgt_lang, generate_speech=False)
    token_ids = generated[0].tolist()[0]
    return processor.decode(token_ids, skip_special_tokens=True)

# Create Gradio interfaces: one per translation mode. Every form takes the
# payload (text or an audio file path) followed by source and target language
# codes, pre-filled with the wrapped function's defaults.
text_to_speech_interface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="arb"),
    ],
    outputs=[gr.Audio(label="Output Audio")],
)

speech_to_speech_interface = gr.Interface(
    fn=speech_to_speech,
    inputs=[
        # type="filepath" hands the function a path rather than raw samples.
        gr.Audio(type="filepath"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="rus"),
    ],
    outputs=[gr.Audio(label="Output Audio")],
)

speech_to_text_interface = gr.Interface(
    fn=speech_to_text,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="ces"),
    ],
    outputs=gr.Textbox(label="Translated Text"),
)

text_to_text_interface = gr.Interface(
    fn=text_to_text,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="ces"),
    ],
    outputs=gr.Textbox(label="Translated Text"),
)

# Combine all interfaces into a single tabbed interface; tab titles are
# matched to the interfaces by position.
app = gr.TabbedInterface(
    interface_list=[
        text_to_speech_interface,
        speech_to_speech_interface,
        speech_to_text_interface,
        text_to_text_interface,
    ],
    tab_names=[
        "Text-to-Speech",
        "Speech-to-Speech",
        "Speech-to-Text",
        "Text-to-Text",
    ],
)

# Launch the app. This call sits at module level, so it runs as soon as the
# script is executed (and also whenever the module is imported).
app.launch()