Mohammadp commited on
Commit
f309d86
·
verified ·
1 Parent(s): c6ee3dc

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -0
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import urllib.request

import gradio as gr
import torch
from moviepy.editor import VideoFileClip
from nemo.collections.asr.models import EncDecCTCModelBPE  # Adjust based on your model type
from pydub import AudioSegment
8
+ # Load trained NeMo model
9
+ MODEL_PATH = "https://huggingface.co/Mohammadp/Persian-ASR/blob/main/conformer_transducer_persian.nemo"
10
+
11
+ model = EncDecCTCModelBPE.restore_from(MODEL_PATH) # Adjust based on your model type
12
+
13
+ # Constants
14
+ SAMPLE_RATE = 16000
15
+ MAX_CHUNK_LENGTH_MS = 10 * 1000 # 10 seconds per chunk
16
+
17
+ # Helper functions
18
+ def extract_audio_from_video(video_path):
19
+ """Extracts audio from a video file and saves it as a WAV file."""
20
+ video = VideoFileClip(video_path)
21
+ audio_path = "extracted_audio.wav"
22
+ video.audio.write_audiofile(audio_path)
23
+ return audio_path
24
+
25
+ def resample_audio(audio_path, target_sample_rate=SAMPLE_RATE):
26
+ """Resamples an audio file to 16kHz."""
27
+ audio = AudioSegment.from_file(audio_path)
28
+ audio = audio.set_frame_rate(target_sample_rate)
29
+ resampled_path = "resampled_audio.wav"
30
+ audio.export(resampled_path, format="wav")
31
+ return resampled_path
32
+
33
+ def split_audio(audio_path, max_length_ms=MAX_CHUNK_LENGTH_MS):
34
+ """Splits audio into chunks of max_length_ms each."""
35
+ audio = AudioSegment.from_file(audio_path)
36
+ chunks = []
37
+ for i in range(0, len(audio), max_length_ms):
38
+ chunk = audio[i:i + max_length_ms]
39
+ chunk_path = f"chunk_{i // max_length_ms}.wav"
40
+ chunk.export(chunk_path, format="wav")
41
+ chunks.append(chunk_path)
42
+ return chunks
43
+
44
+ def transcribe_audio(audio_path):
45
+ """Transcribes a single audio file using the NeMo model."""
46
+ return model.transcribe([audio_path])[0]
47
+
48
+ def process_audio(audio_path):
49
+ """Processes an audio file: resamples, splits, and transcribes."""
50
+ resampled_path = resample_audio(audio_path)
51
+ chunks = split_audio(resampled_path)
52
+ transcriptions = [transcribe_audio(chunk) for chunk in chunks]
53
+ return " ".join(transcriptions)
54
+
55
+ def process_video(video_path):
56
+ """Extracts and processes audio from a video file."""
57
+ audio_path = extract_audio_from_video(video_path)
58
+ return process_audio(audio_path)
59
+
60
+ def process_microphone(audio_path):
61
+ """Processes live-recorded microphone audio."""
62
+ return process_audio(audio_path)
63
+
64
+ # Gradio Interface
65
+ def process_input(video=None, audio=None, microphone=None):
66
+ if video is not None:
67
+ return f"Transcription: {process_video(video)}"
68
+ elif audio is not None:
69
+ return f"Transcription: {process_audio(audio)}"
70
+ elif microphone is not None:
71
+ return f"Transcription: {process_microphone(microphone)}"
72
+ else:
73
+ return "No input provided."
74
+
75
+ # ** WAV FILE EXAMPLES ONLY **
76
+ example_wav_files = [
77
+ "example1.wav", # Replace with actual WAV file paths
78
+ "example2.wav",
79
+ "example3.wav"
80
+ ]
81
+
82
+ iface = gr.Interface(
83
+ fn=process_input,
84
+ inputs=[
85
+ gr.Video(label="Upload Video"),
86
+ gr.Audio(label="Upload Audio File", type="filepath"),
87
+ gr.Microphone(label="Record from Microphone", type="filepath")
88
+ ],
89
+ outputs="text",
90
+ title="NeMo ASR Transcription Interface",
91
+ description="Upload a video, an audio file, or record from the microphone to transcribe the audio using a trained NeMo model.",
92
+ examples=[[None, wav, None] for wav in example_wav_files] # **Only WAV examples**
93
+ )
94
+
95
+ iface.launch()