import base64
import time

import numpy as np
import torch
from google.colab.output import eval_js
from googletrans import Translator
from IPython.display import Javascript, display
from pydub import AudioSegment
from transformers import WhisperForConditionalGeneration, WhisperProcessor


def record():
    # Inject a small recorder UI into the notebook output. MediaRecorder
    # captures microphone audio, and the finished recording is returned to
    # Python as a base64 string via eval_js.
    js = Javascript("""
      async function recordAudio() {
        const div = document.createElement('div');
        const audio = document.createElement('audio');
        const strtButton = document.createElement('button');
        const stopButton = document.createElement('button');

        strtButton.textContent = 'Start Recording';
        stopButton.textContent = 'Stop Recording';

        document.body.appendChild(div);
        div.appendChild(strtButton);
        div.appendChild(audio);

        const stream = await navigator.mediaDevices.getUserMedia({audio: true});
        let recorder = new MediaRecorder(stream);

        audio.style.display = 'block';
        audio.srcObject = stream;
        audio.controls = true;
        audio.muted = true;

        await new Promise((resolve) => strtButton.onclick = resolve);
        strtButton.replaceWith(stopButton);
        recorder.start();

        await new Promise((resolve) => stopButton.onclick = resolve);
        recorder.stop();

        let recData = await new Promise((resolve) => recorder.ondataavailable = resolve);
        let arrBuff = await recData.data.arrayBuffer();
        stream.getAudioTracks()[0].stop();
        div.remove();

        let binaryString = '';
        let bytes = new Uint8Array(arrBuff);
        bytes.forEach((byte) => { binaryString += String.fromCharCode(byte); });

        // Show a playback widget for the finished recording.
        const url = URL.createObjectURL(recData.data);
        const player = document.createElement('audio');
        player.controls = true;
        player.src = url;
        document.body.appendChild(player);

        return btoa(binaryString);
      }""")
    display(js)
    output = eval_js('recordAudio({})')

    # Generate a unique filename using the current timestamp. MediaRecorder
    # typically emits WebM/Opus rather than WAV, so save with a matching
    # extension; pydub (via ffmpeg) decodes it later regardless of container.
    filename = f"audio_{int(time.time())}.webm"
    with open(filename, 'wb') as file:
        binary = base64.b64decode(output)
        file.write(binary)
        print('Recording saved to:', file.name)
    return filename


def transcribe_and_translate(audio_filename, target_language=None):
    # Load the processor and model from Hugging Face's transformers library
    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

    # Load the audio file (ffmpeg autodetects the container) and convert it
    # to mono 16 kHz, the input Whisper expects.
    audio = AudioSegment.from_file(audio_filename)
    audio = audio.set_channels(1).set_frame_rate(16000)

    # Convert raw PCM samples to a float32 array normalized to [-1, 1].
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)
    samples /= float(1 << (8 * audio.sample_width - 1))

    # Process the audio and perform transcription. Whisper is a
    # sequence-to-sequence model: the processor builds log-mel input
    # features, and decoding is done with generate().
    inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        predicted_ids = model.generate(inputs.input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    print("Transcription:", transcription[0])

    # Translate the transcription if a target language is provided
    if target_language:
        translator = Translator()
        translation = translator.translate(transcription[0], dest=target_language)
        print(f"Translation to {target_language}: {translation.text}")
        return transcription[0], translation.text
    else:
        return transcription[0], None


def main():
    ad = record()

    # Prompt the user for a target language
    target_language = input(
        "Enter the target language code (e.g., 'es' for Spanish, 'fr' for French, etc.), "
        "or press Enter to skip translation: "
    )

    # Transcribe and optionally translate
    transcribe_and_translate(ad, target_language if target_language else None)


if __name__ == "__main__":
    main()
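
# --- Usage sketch ---
# A minimal, hedged example of calling transcribe_and_translate() directly on
# a pre-recorded file, bypassing the in-notebook recorder. "sample.webm" is a
# hypothetical path, not something this script produces; any format ffmpeg can
# decode should work, since loading goes through pydub's AudioSegment.from_file().
#
#   text, translated = transcribe_and_translate("sample.webm", target_language="fr")
#   print(text)
#   print(translated)  # None when no target language is given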