

import base64
import io
import time

import numpy as np
import torch
from google.colab.output import eval_js
from googletrans import Translator
from IPython.display import HTML, Javascript
from IPython.display import display
from pydub import AudioSegment
from transformers import WhisperProcessor, WhisperForConditionalGeneration

def record():
    """Record audio from the browser microphone (Colab) and save it as WAV.

    Injects JavaScript that shows start/stop buttons and captures audio with
    MediaRecorder, receives the recorded bytes back as base64, transcodes
    them to a real WAV file, and returns the filename.

    Returns:
        str: path of the saved ``.wav`` file (named from the current
        timestamp so repeated recordings do not overwrite each other).
    """
    js = Javascript("""
    async function recordAudio() {
        const div = document.createElement('div');
        const audio = document.createElement('audio');
        const strtButton = document.createElement('button');
        const stopButton = document.createElement('button');

        strtButton.textContent = 'Start Recording';
        stopButton.textContent = 'Stop Recording';

        document.body.appendChild(div);
        div.appendChild(strtButton);
        div.appendChild(audio);

        const stream = await navigator.mediaDevices.getUserMedia({audio:true});
        let recorder = new MediaRecorder(stream);
        audio.style.display = 'block';
        audio.srcObject = stream;
        audio.controls = true;
        audio.muted = true;

        await new Promise((resolve) => strtButton.onclick = resolve);
        strtButton.replaceWith(stopButton);
        recorder.start();

        await new Promise((resolve) => stopButton.onclick = resolve);
        recorder.stop();
        let recData = await new Promise((resolve) => recorder.ondataavailable = resolve);
        let arrBuff = await recData.data.arrayBuffer();
        stream.getAudioTracks()[0].stop();
        div.remove();

        let binaryString = '';
        let bytes = new Uint8Array(arrBuff);
        bytes.forEach((byte) => { binaryString += String.fromCharCode(byte); });

        const url = URL.createObjectURL(recData.data);
        const player = document.createElement('audio');
        player.controls = true;
        player.src = url;
        document.body.appendChild(player);

        return btoa(binaryString);
    }""")
    display(js)
    output = eval_js('recordAudio({})')

    # Unique filename based on the current timestamp.
    filename = f"audio_{int(time.time())}.wav"

    binary = base64.b64decode(output)
    # MediaRecorder typically emits WebM/Opus (the JS above does not request a
    # WAV mimeType), so writing the raw bytes under a .wav name would break
    # AudioSegment.from_wav() downstream. Transcode via pydub/ffmpeg so the
    # file really is WAV.
    AudioSegment.from_file(io.BytesIO(binary)).export(filename, format="wav")

    print('Recording saved to:', filename)
    return filename

def transcribe_and_translate(audio_filename, target_language=None):
    """Transcribe a WAV file with Whisper and optionally translate the text.

    Args:
        audio_filename: path to a WAV file readable by pydub.
        target_language: googletrans destination language code (e.g. 'es',
            'fr'); None or empty skips translation.

    Returns:
        tuple: (transcription, translation) where translation is None when no
        target_language was given.
    """
    # Load the processor and model from Hugging Face's transformers library.
    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

    # Whisper expects 16 kHz mono float samples in [-1, 1]. The original code
    # rebound `audio` to an empty BytesIO and then called `audio.export(...)`
    # (an AttributeError on BytesIO) and fed raw container bytes to the model;
    # instead, pull the PCM samples out of the AudioSegment and normalize.
    segment = AudioSegment.from_wav(audio_filename)
    segment = segment.set_channels(1).set_frame_rate(16000)
    samples = np.array(segment.get_array_of_samples()).astype(np.float32)
    # Scale by the signed max for this sample width (32768 for 16-bit audio).
    samples /= float(1 << (8 * segment.sample_width - 1))

    # Whisper is an encoder-decoder model: the processor produces log-mel
    # `input_features` and transcription comes from `generate`, not from the
    # Wav2Vec2-style `input_values`/`logits`/CTC decode the original used.
    inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        predicted_ids = model.generate(inputs.input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    print("Transcription:", transcription[0])

    # Translate the transcription if a target language is provided.
    if target_language:
        translator = Translator()
        translation = translator.translate(transcription[0], dest=target_language)
        print(f"Translation to {target_language}: {translation.text}")
        return transcription[0], translation.text
    return transcription[0], None

def main():
    """Entry point: record audio, then transcribe and optionally translate it."""
    recorded_path = record()

    # Ask for an optional googletrans language code; empty input skips translation.
    target_language = input("Enter the target language code (e.g., 'es' for Spanish, 'fr' for French, etc.), or press Enter to skip translation: ")

    # An empty string is falsy, so it collapses to None (no translation).
    transcribe_and_translate(recorded_path, target_language if target_language else None)

# Run the record -> transcribe/translate pipeline only when executed as a
# script, not when imported as a module.
if __name__ == "__main__":
    main()