"""Gradio app: upload an audio file, transcribe it, and download a .docx."""

import os
import tempfile
import uuid

import gradio as gr
import torch
from docx import Document
from nemo.collections.asr.models import ASRModel
from pydub import AudioSegment

# Load model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")
# NOTE(review): bfloat16 is applied on CPU as well as CUDA — confirm that is intended.
model.eval().to(device).to(torch.bfloat16)


def transcribe_to_docx(audio_path):
    """Transcribe an audio file and save the transcript as a Word document.

    Args:
        audio_path: Filesystem path of the audio file to transcribe. Gradio
            passes ``None`` when the user has not uploaded anything.

    Returns:
        A ``(transcript, docx_path)`` tuple: the transcribed text and the path
        of the generated ``.docx`` file in the system temp directory.

    Raises:
        FileNotFoundError: If ``audio_path`` is empty/``None`` or does not
            exist on disk.
    """
    # Guard against None/"" first: os.path.exists(None) raises TypeError.
    if not audio_path or not os.path.exists(audio_path):
        raise FileNotFoundError(f"File not found: {audio_path}")

    # Convert to mono 16kHz wav
    audio = AudioSegment.from_file(audio_path)
    audio = audio.set_channels(1).set_frame_rate(16000)

    # The intermediate WAV is only needed while transcribing, so keep it in a
    # scratch directory that is removed afterwards, even if transcription fails.
    with tempfile.TemporaryDirectory() as work_dir:
        tmp_wav = os.path.join(work_dir, f"{uuid.uuid4()}.wav")
        # export() hands back the open output file; close it so the handle
        # is not leaked and the directory can be removed on every platform.
        audio.export(tmp_wav, format="wav").close()

        # Transcribe
        output = model.transcribe([tmp_wav])

    transcript = output[0].text

    # Save to Word
    doc = Document()
    doc.add_heading("Transcription", level=1)
    doc.add_paragraph(transcript)
    # The .docx must outlive this call so the UI can offer it for download.
    docx_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.docx")
    doc.save(docx_path)

    return transcript, docx_path


# UI
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Upload Audio and Download Word Transcription")

    audio_input = gr.Audio(type="filepath", label="Upload Audio File")
    transcribe_button = gr.Button("Transcribe", variant="primary")
    transcript_output = gr.Textbox(label="Transcript")
    docx_file_output = gr.File(label="Download .docx")
    download_button = gr.Button(
        "Ready to Download", visible=False, variant="secondary"
    )

    def enable_download(transcript, file):
        """Reveal the download button and pass the transcript/file through."""
        return gr.update(visible=True, variant="primary"), transcript, file

    # Run the transcription first, then reveal the download button once the
    # transcript and document outputs have been populated.
    transcribe_button.click(
        fn=transcribe_to_docx,
        inputs=audio_input,
        outputs=[transcript_output, docx_file_output],
        show_progress=True,
        api_name="transcribe",
    ).then(
        fn=enable_download,
        inputs=[transcript_output, docx_file_output],
        outputs=[download_button, transcript_output, docx_file_output],
    )

demo.launch()