from IPython.display import display, Javascript
from google.colab.output import eval_js
import base64
import time

import numpy as np
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from googletrans import Translator
from pydub import AudioSegment
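# Setup sketch (assumption): a stock Colab runtime usually ships torch and
# transformers preinstalled, but googletrans and pydub do not, and pydub
# needs ffmpeg to decode the browser's WebM recording. A minimal install
# cell might look like:
#   !pip install transformers googletrans==3.1.0a0 pydub
#   !apt-get -y install ffmpeg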
def record():
    js = Javascript("""
        async function recordAudio() {
          const div = document.createElement('div');
          const audio = document.createElement('audio');
          const startButton = document.createElement('button');
          const stopButton = document.createElement('button');
          startButton.textContent = 'Start Recording';
          stopButton.textContent = 'Stop Recording';
          document.body.appendChild(div);
          div.appendChild(startButton);
          div.appendChild(audio);
          const stream = await navigator.mediaDevices.getUserMedia({audio: true});
          let recorder = new MediaRecorder(stream);
          audio.style.display = 'block';
          audio.srcObject = stream;
          audio.controls = true;
          audio.muted = true;
          await new Promise((resolve) => startButton.onclick = resolve);
          startButton.replaceWith(stopButton);
          recorder.start();
          await new Promise((resolve) => stopButton.onclick = resolve);
          recorder.stop();
          let recData = await new Promise((resolve) => recorder.ondataavailable = resolve);
          let arrBuff = await recData.data.arrayBuffer();
          stream.getAudioTracks()[0].stop();
          div.remove();
          // Base64-encode the recording so eval_js can return it to Python
          let binaryString = '';
          let bytes = new Uint8Array(arrBuff);
          bytes.forEach((byte) => { binaryString += String.fromCharCode(byte); });
          // Leave a playback widget for the finished recording
          const url = URL.createObjectURL(recData.data);
          const player = document.createElement('audio');
          player.controls = true;
          player.src = url;
          document.body.appendChild(player);
          return btoa(binaryString);
        }""")
    display(js)
    output = eval_js('recordAudio()')
    # MediaRecorder typically emits WebM/Opus rather than WAV, so save the
    # bytes with a matching extension; the timestamp keeps the name unique
    filename = f"audio_{int(time.time())}.webm"
    with open(filename, 'wb') as file:
        binary = base64.b64decode(output)
        file.write(binary)
    print('Recording saved to:', filename)
    return filename
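# Sanity-check sketch (hypothetical filename): pydub can confirm the saved
# clip decodes correctly before handing it to Whisper, e.g.
#   clip = AudioSegment.from_file("audio_1700000000.webm")
#   print(len(clip) / 1000.0, "seconds at", clip.frame_rate, "Hz")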
def transcribe_and_translate(audio_filename, target_language=None):
    # Load the processor and model from Hugging Face's transformers library
    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
    # Load the recording (from_file lets ffmpeg detect the container format)
    # and resample to the mono 16 kHz audio Whisper expects
    audio = AudioSegment.from_file(audio_filename)
    audio = audio.set_channels(1).set_frame_rate(16000)
    # Convert the raw integer samples to a float32 waveform in [-1, 1]
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)
    samples /= float(1 << (8 * audio.sample_width - 1))
    # Process the audio and perform transcription; Whisper is a seq2seq model,
    # so decoding goes through generate() rather than raw logits
    inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        predicted_ids = model.generate(inputs.input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    print("Transcription:", transcription[0])
    # Translate the transcription if a target language is provided
    if target_language:
        translator = Translator()
        translation = translator.translate(transcription[0], dest=target_language)
        print(f"Translation to {target_language}: {translation.text}")
        return transcription[0], translation.text
    return transcription[0], None
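# Usage sketch (hypothetical filename): the function can also be called
# directly on an existing recording, e.g.
#   text, translated = transcribe_and_translate("audio_1700000000.webm", "es")
# Passing target_language=None skips googletrans and returns (text, None).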
def main():
    audio_file = record()
    # Prompt the user for a target language
    target_language = input("Enter the target language code (e.g., 'es' for Spanish, 'fr' for French), or press Enter to skip translation: ")
    # Transcribe and optionally translate
    transcribe_and_translate(audio_file, target_language or None)

if __name__ == "__main__":
    main()