# mini / app.py
# Author: Manikeerthan01 — "Create app.py" (commit 044b155, verified)
import gradio as gr
import torch
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from googletrans import Translator
from pydub import AudioSegment
import io
# Load the Whisper model and processor once at import time so every request
# reuses them (the large-v2 checkpoint is several GB; reloading per call
# would be prohibitively slow).
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
# Inference only: switch off training-mode behaviour (dropout etc.).
model.eval()
def transcribe_and_translate(audio, target_language=None):
    """Transcribe an audio clip with Whisper and optionally translate the text.

    Parameters
    ----------
    audio : str | bytes | file-like
        A filesystem path (Gradio ``type="filepath"``), raw WAV bytes, or a
        file-like object — pydub can load all three.
    target_language : str | None
        googletrans language code (e.g. ``'es'``); ``None`` skips translation.

    Returns
    -------
    tuple[str, str | None]
        ``(transcription, translation)`` — ``translation`` is ``None`` when no
        target language was given.  On failure, an error string and ``None``.
    """
    try:
        # Accept path / file-like / raw bytes; only raw bytes need wrapping.
        if isinstance(audio, (bytes, bytearray)):
            audio_segment = AudioSegment.from_file(io.BytesIO(audio), format="wav")
        else:
            audio_segment = AudioSegment.from_file(audio)

        # Whisper expects 16 kHz mono; force 16-bit width so the /32768
        # scaling below is correct regardless of the source sample width.
        audio_segment = (
            audio_segment.set_channels(1).set_frame_rate(16000).set_sample_width(2)
        )

        # Pull the raw PCM samples directly.  (Re-exporting to WAV and
        # np.frombuffer-ing the whole buffer — the previous approach — also
        # decoded the 44-byte WAV header as if it were audio samples.)
        samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
        waveform = samples / 32768.0  # int16 -> [-1.0, 1.0]

        # Feature extraction + transcription.
        inputs = processor(waveform, return_tensors="pt", sampling_rate=16000)
        with torch.no_grad():
            generated_ids = model.generate(inputs["input_features"])
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        if target_language:
            translation = Translator().translate(transcription, dest=target_language)
            return transcription, translation.text
        return transcription, None
    except Exception as e:
        # UI boundary: surface the error as text instead of crashing the
        # Gradio callback.
        return f"An error occurred: {e}", None
# Gradio interface
def process_audio(audio, target_language):
    """Gradio callback: transcribe *audio* and translate when a target
    language code was entered; always returns two display strings."""
    transcription, translation = transcribe_and_translate(audio, target_language)
    return transcription, (translation if translation else "No translation requested.")
# Define the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## Audio Transcription and Translation")
    # `source=` / `type="file"` were removed from gr.Audio; current Gradio
    # uses `sources=[...]` and type "filepath" (the callback then receives a
    # path string, which transcribe_and_translate loads via pydub).
    audio_input = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Record or Upload Audio",
    )
    target_language = gr.Textbox(
        label="Target Language (e.g., 'es' for Spanish)",
        placeholder="Leave blank for no translation",
    )
    transcription_output = gr.Textbox(label="Transcription")
    translation_output = gr.Textbox(label="Translation")
    submit_button = gr.Button("Transcribe and Translate")
    submit_button.click(
        process_audio,
        inputs=[audio_input, target_language],
        outputs=[transcription_output, translation_output],
    )

# Launch the Gradio interface.
demo.launch()