File size: 1,974 Bytes
08309f4
3057bfd
08309f4
3057bfd
08309f4
3057bfd
 
08309f4
e186284
3057bfd
 
 
08309f4
3057bfd
e186284
 
 
 
3057bfd
 
 
 
08309f4
3057bfd
 
e186284
08309f4
e186284
08309f4
 
3057bfd
 
 
08309f4
3057bfd
08309f4
3057bfd
08309f4
3057bfd
e186284
3057bfd
e186284
3057bfd
 
e186284
 
 
 
3057bfd
 
 
 
e186284
 
 
 
 
 
 
08309f4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import tempfile
import uuid

import gradio as gr
import torch
from docx import Document
from nemo.collections.asr.models import ASRModel
from pydub import AudioSegment

# Load model
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fetch the pretrained checkpoint by name, then prepare it for inference:
# evaluation mode, target device, and bfloat16 parameters.
model = ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")
model.eval()
model.to(device)
model.to(torch.bfloat16)

def transcribe_to_docx(audio_path):
    """Transcribe an audio file and save the transcript as a Word document.

    The input is re-encoded to a mono 16 kHz WAV in a scratch directory,
    passed to the module-level ``model`` for transcription, and the resulting
    text is written to a ``.docx`` file with a "Transcription" heading.

    Args:
        audio_path: Path to the audio file to transcribe. ``None`` (what the
            UI passes when nothing has been uploaded) is rejected explicitly.

    Returns:
        A ``(transcript, docx_path)`` tuple: the transcript text and the path
        of the generated ``.docx`` file. The document is left in the system
        temp directory so the caller can serve it for download.

    Raises:
        ValueError: If ``audio_path`` is ``None``.
        FileNotFoundError: If ``audio_path`` does not exist.
    """
    # Without this guard a missing upload surfaces as an opaque TypeError
    # from os.path.exists(None).
    if audio_path is None:
        raise ValueError("No audio file provided")
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"File not found: {audio_path}")

    # Convert to mono 16kHz wav
    audio = AudioSegment.from_file(audio_path)
    audio = audio.set_channels(1).set_frame_rate(16000)

    # The intermediate WAV is only needed while transcribing; keeping it in a
    # TemporaryDirectory guarantees it is removed even if transcription fails.
    with tempfile.TemporaryDirectory() as work_dir:
        tmp_wav = os.path.join(work_dir, f"{uuid.uuid4()}.wav")
        # export() hands back the open output file; close it so the data is
        # fully on disk before the model reads it.
        audio.export(tmp_wav, format="wav").close()

        # Transcribe
        output = model.transcribe([tmp_wav])
        transcript = output[0].text

    # Save to Word
    doc = Document()
    doc.add_heading("Transcription", level=1)
    doc.add_paragraph(transcript)
    # Use the platform temp directory rather than a hard-coded "/tmp".
    docx_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.docx")
    doc.save(docx_path)

    return transcript, docx_path

# UI
# Single-page layout: upload audio, press "Transcribe", then read the
# transcript and download it as a .docx file.
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Upload Audio and Download Word Transcription")

    audio_input = gr.Audio(type="filepath", label="Upload Audio File")
    transcribe_button = gr.Button("Transcribe", variant="primary")
    transcript_output = gr.Textbox(label="Transcript")
    docx_file_output = gr.File(label="Download .docx")
    # Hidden until a transcription has finished; it has no click handler and
    # only serves as a visual "ready" indicator.
    download_button = gr.Button("Ready to Download", visible=False, variant="secondary")

    def enable_download(transcript, file):
        """Reveal the download indicator and pass the results through unchanged."""
        return gr.update(visible=True, variant="primary"), transcript, file

    # Chain with `.success()` rather than `.then()`: `.then()` also fires when
    # the transcription step raised, which would reveal "Ready to Download"
    # even though no document was produced.
    transcribe_button.click(
        fn=transcribe_to_docx,
        inputs=audio_input,
        outputs=[transcript_output, docx_file_output],
        show_progress=True,
        api_name="transcribe"
    ).success(
        fn=enable_download,
        inputs=[transcript_output, docx_file_output],
        outputs=[download_button, transcript_output, docx_file_output]
    )

demo.launch()