# Hugging Face Space: David-Chew-HL / Transcriber-2.0
# Status: Sleeping
# File size: 1,974 Bytes

import os
import tempfile
import uuid

import gradio as gr
import torch
from docx import Document
from nemo.collections.asr.models import ASRModel
from pydub import AudioSegment

# Load the pretrained ASR model once at import time so every request reuses it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")
model.eval()  # switch to evaluation (inference) mode
model.to(device)  # move the weights to the selected device
model.to(torch.bfloat16)  # then cast the parameters to bfloat16

def transcribe_to_docx(audio_path):
    """Transcribe an audio file and save the transcript as a Word document.

    The input is converted to a mono, 16 kHz WAV file, which is passed to the
    module-level ASR ``model``.  The intermediate WAV is removed as soon as
    transcription finishes (whether it succeeded or not).  The ``.docx`` is
    left in the system temporary directory so the caller can offer it for
    download.

    Args:
        audio_path: Filesystem path of the audio file to transcribe.

    Returns:
        A ``(transcript, docx_path)`` tuple: the recognised text and the path
        of the generated ``.docx`` file.

    Raises:
        FileNotFoundError: If ``audio_path`` is ``None``/empty or does not
            exist on disk.
    """
    if not audio_path:
        # A UI callback may pass None when nothing was uploaded; fail with a
        # clear message instead of a TypeError from os.path.exists(None).
        raise FileNotFoundError("No audio file was provided")
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"File not found: {audio_path}")

    # Use the platform's temp directory rather than a hard-coded "/tmp".
    tmp_dir = tempfile.gettempdir()

    # Convert to mono 16kHz wav
    audio = AudioSegment.from_file(audio_path)
    audio = audio.set_channels(1).set_frame_rate(16000)
    tmp_wav = os.path.join(tmp_dir, f"{uuid.uuid4()}.wav")
    try:
        # export() hands back the open output file object; close it explicitly
        # so the handle is released before the model reads the file.
        audio.export(tmp_wav, format="wav").close()

        # Transcribe
        output = model.transcribe([tmp_wav])
        transcript = output[0].text
    finally:
        # The WAV is only an intermediate artefact; never leave it behind.
        if os.path.exists(tmp_wav):
            os.remove(tmp_wav)

    # Save to Word
    doc = Document()
    doc.add_heading("Transcription", level=1)
    doc.add_paragraph(transcript)
    docx_path = os.path.join(tmp_dir, f"{uuid.uuid4()}.docx")
    doc.save(docx_path)

    return transcript, docx_path

# UI: upload widget, trigger button, and the two result widgets.
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Upload Audio and Download Word Transcription")

    audio_input = gr.Audio(label="Upload Audio File", type="filepath")
    transcribe_button = gr.Button("Transcribe", variant="primary")
    transcript_output = gr.Textbox(label="Transcript")
    docx_file_output = gr.File(label="Download .docx")
    # Hidden until a transcription has finished.
    download_button = gr.Button(
        "Ready to Download",
        variant="secondary",
        visible=False,
    )

    def enable_download(transcript, file):
        """Reveal the download button and pass both results through unchanged."""
        button_update = gr.update(visible=True, variant="primary")
        return button_update, transcript, file

    # Step 1: run the transcription and fill the text box and file widget.
    transcription_event = transcribe_button.click(
        fn=transcribe_to_docx,
        inputs=audio_input,
        outputs=[transcript_output, docx_file_output],
        show_progress=True,
        api_name="transcribe",
    )
    # Step 2: once step 1 has completed, show the download button.
    transcription_event.then(
        fn=enable_download,
        inputs=[transcript_output, docx_file_output],
        outputs=[download_button, transcript_output, docx_file_output],
    )

demo.launch()