# NOTE: the following lines are web-page scrape residue (Hugging Face Spaces
# header), kept as comments so the file remains valid Python:
# Spaces: Runtime error
# File size: 2,505 Bytes
# commit 044b155
import gradio as gr
import torch
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from googletrans import Translator
from pydub import AudioSegment
import io
# Whisper checkpoint used for speech recognition. Processor and model are
# loaded once at import time so every request reuses the same weights.
WHISPER_MODEL_ID = "openai/whisper-large-v2"
processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_ID)
def transcribe_and_translate(audio, target_language=None):
    """Transcribe an audio input with Whisper and optionally translate it.

    Args:
        audio: The Gradio audio input — either a file path (``str``, as
            produced by ``gr.Audio(type="filepath")``) or raw encoded audio
            bytes.
        target_language: Optional language code for googletrans (e.g. ``'es'``).
            When falsy, no translation is attempted.

    Returns:
        A ``(transcription, translation)`` tuple. ``translation`` is ``None``
        when no target language was given. On any failure, returns an error
        message string in place of the transcription and ``None``.
    """
    try:
        # Accept both a file path and raw bytes; pydub sniffs the format.
        source = audio if isinstance(audio, str) else io.BytesIO(audio)
        audio_segment = AudioSegment.from_file(source)
        # Whisper's feature extractor expects 16 kHz mono; force 16-bit
        # samples so the int16 normalization below is correct.
        audio_segment = (
            audio_segment.set_channels(1).set_frame_rate(16000).set_sample_width(2)
        )
        # BUG FIX: the original re-exported to WAV and ran np.frombuffer over
        # the whole buffer, which decoded the 44-byte RIFF header as int16
        # audio samples. get_array_of_samples() yields only the PCM payload.
        waveform = (
            np.array(audio_segment.get_array_of_samples()).astype(np.float32)
            / 32768.0
        )
        inputs = processor(waveform, return_tensors="pt", sampling_rate=16000)
        with torch.no_grad():
            predicted_ids = model.generate(inputs["input_features"])
        transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]
        if target_language:
            translation = Translator().translate(transcription, dest=target_language)
            return transcription, translation.text
        return transcription, None
    except Exception as e:
        # Surface the error in the UI instead of crashing the Gradio handler.
        return f"An error occurred: {e}", None
# Gradio interface
def process_audio(audio, target_language):
    """Gradio callback: run transcription/translation and fill both outputs.

    Returns the transcription and either the translation text or a
    placeholder message when no translation was produced.
    """
    transcription, translation = transcribe_and_translate(audio, target_language)
    return transcription, translation if translation else "No translation requested."
# Define the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## Audio Transcription and Translation")
    # BUG FIX: type="file" is not a supported gr.Audio value (valid options
    # are "numpy" and "filepath"); "filepath" hands the handler a path string.
    audio_input = gr.Audio(source="microphone", type="filepath", label="Record or Upload Audio")
    target_language = gr.Textbox(label="Target Language (e.g., 'es' for Spanish)", placeholder="Leave blank for no translation")
    transcription_output = gr.Textbox(label="Transcription")
    translation_output = gr.Textbox(label="Translation")
    submit_button = gr.Button("Transcribe and Translate")
    submit_button.click(process_audio, inputs=[audio_input, target_language], outputs=[transcription_output, translation_output])
# Launch the Gradio interface.
demo.launch()