import base64
import time

import numpy as np
import torch
from google.colab.output import eval_js
from googletrans import Translator
from IPython.display import Javascript, display
from pydub import AudioSegment
from transformers import WhisperProcessor, WhisperForConditionalGeneration
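
# A minimal setup sketch for a fresh Colab runtime (the package list and
# versions below are assumptions, not pinned by this script):
#
#   !pip install transformers googletrans==4.0.0rc1 pydub
#   !apt-get -qq install ffmpeg   # pydub relies on ffmpeg to decode WebM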

def record():
    """Record audio in the browser via the MediaRecorder API and save it to disk."""
    js = Javascript("""
        async function recordAudio() {
            const div = document.createElement('div');
            const audio = document.createElement('audio');
            const startButton = document.createElement('button');
            const stopButton = document.createElement('button');

            startButton.textContent = 'Start Recording';
            stopButton.textContent = 'Stop Recording';
            document.body.appendChild(div);
            div.appendChild(startButton);
            div.appendChild(audio);

            const stream = await navigator.mediaDevices.getUserMedia({audio: true});
            const recorder = new MediaRecorder(stream);

            // Show a live (muted) preview of the microphone stream.
            audio.style.display = 'block';
            audio.srcObject = stream;
            audio.controls = true;
            audio.muted = true;

            // Wait for the user to start, then stop, the recording.
            await new Promise((resolve) => startButton.onclick = resolve);
            startButton.replaceWith(stopButton);
            recorder.start();
            await new Promise((resolve) => stopButton.onclick = resolve);
            recorder.stop();

            const recData = await new Promise((resolve) => recorder.ondataavailable = resolve);
            const arrBuff = await recData.data.arrayBuffer();
            stream.getAudioTracks()[0].stop();
            div.remove();

            // Base64-encode the recorded bytes so they can be returned to Python.
            let binaryString = '';
            const bytes = new Uint8Array(arrBuff);
            bytes.forEach((byte) => { binaryString += String.fromCharCode(byte); });

            // Leave a playback widget on the page for the finished recording.
            const url = URL.createObjectURL(recData.data);
            const player = document.createElement('audio');
            player.controls = true;
            player.src = url;
            document.body.appendChild(player);

            return btoa(binaryString);
        }""")
    display(js)
    output = eval_js('recordAudio()')
    # Generate a unique filename using the current timestamp. MediaRecorder
    # produces a WebM/Opus container by default, so save with a .webm extension.
    filename = f"audio_{int(time.time())}.webm"
    with open(filename, 'wb') as file:
        file.write(base64.b64decode(output))
    print('Recording saved to:', filename)
    return filename

def transcribe_and_translate(audio_filename, target_language=None):
    # Load the processor and model from Hugging Face's transformers library
    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

    # Load the audio file (ffmpeg autodetects the WebM container) and convert it
    # to the mono, 16 kHz format Whisper expects.
    audio = AudioSegment.from_file(audio_filename)
    audio = audio.set_channels(1).set_frame_rate(16000)
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)
    samples /= float(1 << (8 * audio.sample_width - 1))  # scale integer PCM to [-1.0, 1.0]

    # Process the audio and perform transcription. Whisper is a sequence-to-sequence
    # model, so text comes from generate() rather than from raw logits.
    input_features = processor(samples, sampling_rate=16000, return_tensors="pt").input_features
    with torch.no_grad():
        predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    print("Transcription:", transcription[0])

    # Translate the transcription if a target language is provided
    if target_language:
        translator = Translator()
        translation = translator.translate(transcription[0], dest=target_language)
        print(f"Translation to {target_language}: {translation.text}")
        return transcription[0], translation.text
    return transcription[0], None
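
# Example call (hypothetical file path), useful when a recording already exists
# on disk and you want to skip the browser recording widget:
#
#   text, translated = transcribe_and_translate("audio_1700000000.webm", target_language="es")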

def main():
    audio_file = record()
    # Prompt the user for a target language
    target_language = input(
        "Enter the target language code (e.g., 'es' for Spanish, 'fr' for French, etc.), "
        "or press Enter to skip translation: "
    )
    # Transcribe and optionally translate (an empty string means no translation)
    transcribe_and_translate(audio_file, target_language or None)


if __name__ == "__main__":
    main()