import gradio as gr
import torch
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from googletrans import Translator
from pydub import AudioSegment

# Load the Whisper model and processor once
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

def transcribe_and_translate(audio_path, target_language=None):
    try:
        # Load the recorded/uploaded file and normalize to mono, 16 kHz,
        # 16-bit samples -- the format Whisper's feature extractor expects
        audio_segment = AudioSegment.from_file(audio_path)
        audio_segment = (
            audio_segment.set_channels(1).set_frame_rate(16000).set_sample_width(2)
        )

        # Convert the raw int16 samples to a float32 waveform in [-1, 1].
        # Reading samples directly avoids pulling the WAV header bytes into
        # the waveform, which np.frombuffer on a full export would do.
        samples = (
            np.array(audio_segment.get_array_of_samples()).astype(np.float32) / 32768.0
        )

        # Transcription: generate() returns predicted token IDs (not logits),
        # which the processor decodes back into text
        inputs = processor(samples, return_tensors="pt", sampling_rate=16000)
        with torch.no_grad():
            predicted_ids = model.generate(inputs["input_features"])
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        if target_language:
            translator = Translator()
            translation = translator.translate(transcription, dest=target_language)
            return transcription, translation.text
        return transcription, None
    except Exception as e:
        return f"An error occurred: {e}", None

# Gradio callback: wraps the pipeline and fills in a placeholder when no
# target language was given
def process_audio(audio_path, target_language):
    transcription, translation = transcribe_and_translate(audio_path, target_language)
    if translation:
        return transcription, translation
    return transcription, "No translation requested."

# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Audio Transcription and Translation")
    # type="filepath" hands the callback a path on disk that pydub can read
    # directly ("file" is not a valid Audio type)
    audio_input = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Record or Upload Audio",
    )
    target_language = gr.Textbox(
        label="Target Language (e.g., 'es' for Spanish)",
        placeholder="Leave blank for no translation",
    )
    transcription_output = gr.Textbox(label="Transcription")
    translation_output = gr.Textbox(label="Translation")
    submit_button = gr.Button("Transcribe and Translate")
    submit_button.click(
        process_audio,
        inputs=[audio_input, target_language],
        outputs=[transcription_output, translation_output],
    )

# Launch the Gradio interface
demo.launch()
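
# A rough sketch of the dependencies this script assumes (the exact versions
# are an assumption, not pinned by the original):
#
#   pip install gradio transformers torch numpy pydub googletrans==3.1.0a0
#
# googletrans 3.1.0a0 matches the synchronous Translator API used above;
# later releases have changed that API, so pinning is advisable. pydub also
# relies on ffmpeg being installed on the system to decode non-WAV inputs.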