Spaces:

Mohammadp
/

gr

Sleeping

App Files Files Community

Mohammadp committed on Feb 4, 2025

Commit

aeca815

verified ·

1 Parent(s): f5ab0c0

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -94

app.py CHANGED Viewed

@@ -1,95 +1,103 @@
-import gradio as gr
-import os
-from moviepy.editor import VideoFileClip
-from pydub import AudioSegment
-import torch
-from nemo.collections.asr.models import EncDecCTCModelBPE  # Adjust based on your model type
-# Load trained NeMo model
-MODEL_PATH = "https://huggingface.co/Mohammadp/Persian-ASR/blob/main/conformer_transducer_persian.nemo"
-model = EncDecCTCModelBPE.restore_from(MODEL_PATH)  # Adjust based on your model type
-# Constants
-SAMPLE_RATE = 16000
-MAX_CHUNK_LENGTH_MS = 10 * 1000  # 10 seconds per chunk
-# Helper functions
-def extract_audio_from_video(video_path):
-    """Extracts audio from a video file and saves it as a WAV file."""
-    video = VideoFileClip(video_path)
-    audio_path = "extracted_audio.wav"
-    video.audio.write_audiofile(audio_path)
-    return audio_path
-def resample_audio(audio_path, target_sample_rate=SAMPLE_RATE):
-    """Resamples an audio file to 16kHz."""
-    audio = AudioSegment.from_file(audio_path)
-    audio = audio.set_frame_rate(target_sample_rate)
-    resampled_path = "resampled_audio.wav"
-    audio.export(resampled_path, format="wav")
-    return resampled_path
-def split_audio(audio_path, max_length_ms=MAX_CHUNK_LENGTH_MS):
-    """Splits audio into chunks of max_length_ms each."""
-    audio = AudioSegment.from_file(audio_path)
-    chunks = []
-    for i in range(0, len(audio), max_length_ms):
-        chunk = audio[i:i + max_length_ms]
-        chunk_path = f"chunk_{i // max_length_ms}.wav"
-        chunk.export(chunk_path, format="wav")
-        chunks.append(chunk_path)
-    return chunks
-def transcribe_audio(audio_path):
-    """Transcribes a single audio file using the NeMo model."""
-    return model.transcribe([audio_path])[0]
-def process_audio(audio_path):
-    """Processes an audio file: resamples, splits, and transcribes."""
-    resampled_path = resample_audio(audio_path)
-    chunks = split_audio(resampled_path)
-    transcriptions = [transcribe_audio(chunk) for chunk in chunks]
-    return " ".join(transcriptions)
-def process_video(video_path):
-    """Extracts and processes audio from a video file."""
-    audio_path = extract_audio_from_video(video_path)
-    return process_audio(audio_path)
-def process_microphone(audio_path):
-    """Processes live-recorded microphone audio."""
-    return process_audio(audio_path)
-# Gradio Interface
-def process_input(video=None, audio=None, microphone=None):
-    if video is not None:
-        return f"Transcription: {process_video(video)}"
-    elif audio is not None:
-        return f"Transcription: {process_audio(audio)}"
-    elif microphone is not None:
-        return f"Transcription: {process_microphone(microphone)}"
-    else:
-        return "No input provided."
-# ** WAV FILE EXAMPLES ONLY **
-example_wav_files = [
-    "example1.wav",  # Replace with actual WAV file paths
-    "example2.wav",
-    "example3.wav"
-]
-iface = gr.Interface(
-    fn=process_input,
-    inputs=[
-        gr.Video(label="Upload Video"),
-        gr.Audio(label="Upload Audio File", type="filepath"),
-        gr.Microphone(label="Record from Microphone", type="filepath")
-    ],
-    outputs="text",
-    title="NeMo ASR Transcription Interface",
-    description="Upload a video, an audio file, or record from the microphone to transcribe the audio using a trained NeMo model.",
-    examples=[[None, wav, None] for wav in example_wav_files]  # **Only WAV examples**
-)
 iface.launch()

+import gradio as gr
+import os
+from moviepy.editor import VideoFileClip
+from pydub import AudioSegment
+import torch
+from nemo.collections.asr.models import EncDecCTCModelBPE  # Adjust based on your model type
+import wget
+# Direct-download ("resolve") URL of the checkpoint, and the local file name it is stored under.
+MODEL_URL = "https://huggingface.co/Mohammadp/Persian-ASR/resolve/main/conformer_transducer_persian.nemo"
+MODEL_PATH = "conformer_transducer_persian.nemo"
+# Download the checkpoint only when no file named MODEL_PATH exists yet
+if not os.path.exists(MODEL_PATH):
+    print("Downloading model...")
+    wget.download(MODEL_URL, MODEL_PATH)
+    print("\nModel downloaded successfully.")
+# Load the model once at import time; transcribe_audio() uses this module-level instance.
+# NOTE(review): the checkpoint file name says "transducer" but it is restored through the
+# CTC-BPE class - confirm the class matches how the checkpoint was trained.
+model = EncDecCTCModelBPE.restore_from(MODEL_PATH)
+print("Model loaded successfully!")
+# Constants
+SAMPLE_RATE = 16000  # default target frame rate (Hz) used by resample_audio()
+MAX_CHUNK_LENGTH_MS = 10 * 1000  # 10 seconds per chunk
+# Helper functions
+def extract_audio_from_video(video_path):
+    """Extract the audio track of a video file into a new WAV file.
+
+    Args:
+        video_path: Path of the video file to read.
+
+    Returns:
+        Path of the WAV file that was written. Each call writes a fresh,
+        uniquely named file; the caller is responsible for deleting it.
+
+    Raises:
+        ValueError: If the video has no audio track.
+    """
+    import tempfile  # local import: only needed for the unique output name
+    video = VideoFileClip(video_path)
+    try:
+        if video.audio is None:
+            raise ValueError(f"No audio track found in video: {video_path}")
+        # A unique name per call keeps concurrent requests from overwriting
+        # each other's output, which a single fixed file name would allow.
+        fd, audio_path = tempfile.mkstemp(prefix="extracted_audio_", suffix=".wav")
+        os.close(fd)
+        video.audio.write_audiofile(audio_path)
+    finally:
+        # Always release the clip's reader resources, even on failure.
+        video.close()
+    return audio_path
+def resample_audio(audio_path, target_sample_rate=SAMPLE_RATE):
+    """Convert an audio file to mono WAV at ``target_sample_rate``.
+
+    Args:
+        audio_path: Path of the audio file to read.
+        target_sample_rate: Frame rate in Hz of the output (defaults to SAMPLE_RATE).
+
+    Returns:
+        Path of the newly written WAV file. Each call writes a fresh,
+        uniquely named file; the caller is responsible for deleting it.
+    """
+    import tempfile  # local import: only needed for the unique output name
+    audio = AudioSegment.from_file(audio_path)
+    # Down-mix to one channel as well as resampling, so stereo uploads reach
+    # the recognizer as single-channel audio.
+    audio = audio.set_frame_rate(target_sample_rate).set_channels(1)
+    # Unique name per call: a fixed file name is overwritten by concurrent requests.
+    fd, resampled_path = tempfile.mkstemp(prefix="resampled_audio_", suffix=".wav")
+    os.close(fd)
+    audio.export(resampled_path, format="wav")
+    return resampled_path
+def split_audio(audio_path, max_length_ms=MAX_CHUNK_LENGTH_MS):
+    """Split an audio file into consecutive WAV chunks of at most ``max_length_ms``.
+
+    Args:
+        audio_path: Path of the audio file to split.
+        max_length_ms: Maximum chunk duration in milliseconds; must be positive.
+
+    Returns:
+        List of chunk file paths in playback order (empty for empty audio).
+        The files are uniquely named; the caller is responsible for deleting them.
+
+    Raises:
+        ValueError: If ``max_length_ms`` is not positive.
+    """
+    import tempfile  # local import: only needed for the unique chunk names
+    if max_length_ms <= 0:
+        raise ValueError("max_length_ms must be a positive number of milliseconds")
+    audio = AudioSegment.from_file(audio_path)
+    chunks = []
+    for index, start_ms in enumerate(range(0, len(audio), max_length_ms)):
+        # Unique name per chunk and per call: fixed "chunk_N.wav" names are
+        # overwritten when two requests are processed at the same time.
+        fd, chunk_path = tempfile.mkstemp(prefix=f"chunk_{index}_", suffix=".wav")
+        os.close(fd)
+        audio[start_ms:start_ms + max_length_ms].export(chunk_path, format="wav")
+        chunks.append(chunk_path)
+    return chunks
+def transcribe_audio(audio_path):
+    """Transcribe a single audio file with the module-level NeMo model.
+
+    Args:
+        audio_path: Path of the audio file to transcribe.
+
+    Returns:
+        The transcription as a plain string.
+    """
+    result = model.transcribe([audio_path])
+    # Depending on the model class and NeMo version the result may be a
+    # (best_hypotheses, all_hypotheses) tuple rather than a flat list.
+    if isinstance(result, tuple):
+        result = result[0]
+    first = result[0]
+    if isinstance(first, str):
+        return first
+    # Hypothesis-style objects carry the text in a ``text`` attribute; callers
+    # join the results with str.join, so always hand back a string.
+    return str(getattr(first, "text", first))
+def process_audio(audio_path):
+    """Resample, split and transcribe an audio file.
+
+    Args:
+        audio_path: Path of the audio file to transcribe. It is not modified.
+
+    Returns:
+        The chunk transcriptions joined with single spaces, in playback order.
+    """
+    import contextlib  # local import: only needed for best-effort cleanup
+    temp_paths = []
+    try:
+        resampled_path = resample_audio(audio_path)
+        temp_paths.append(resampled_path)
+        chunks = split_audio(resampled_path)
+        temp_paths.extend(chunks)
+        transcriptions = [transcribe_audio(chunk) for chunk in chunks]
+    finally:
+        # Remove the intermediate WAV files so they do not pile up on disk;
+        # a failed delete must not mask the transcription result or error.
+        for path in temp_paths:
+            with contextlib.suppress(OSError):
+                os.remove(path)
+    return " ".join(transcriptions)
+def process_video(video_path):
+    """Transcribe the audio track of a video file.
+
+    Args:
+        video_path: Path of the video file.
+
+    Returns:
+        The transcription text produced by process_audio().
+    """
+    import contextlib  # local import: only needed for best-effort cleanup
+    audio_path = extract_audio_from_video(video_path)
+    try:
+        return process_audio(audio_path)
+    finally:
+        # The extracted WAV is an intermediate file; drop it once transcribed.
+        with contextlib.suppress(OSError):
+            os.remove(audio_path)
+def process_microphone(audio_path):
+    """Transcribe a microphone recording; it is handled like any other audio file."""
+    transcription = process_audio(audio_path)
+    return transcription
+# Gradio Interface
+def process_input(video=None, audio=None, microphone=None):
+    """Transcribe the first input supplied, checked in the order video, audio, microphone.
+
+    Returns the transcription prefixed with "Transcription: ", or a notice
+    when all three inputs are None.
+    """
+    # Guard clauses: the first non-None input wins and the rest are ignored.
+    if video is not None:
+        text = process_video(video)
+        return f"Transcription: {text}"
+    if audio is not None:
+        text = process_audio(audio)
+        return f"Transcription: {text}"
+    if microphone is not None:
+        text = process_microphone(microphone)
+        return f"Transcription: {text}"
+    return "No input provided."
+# ** WAV FILE EXAMPLES ONLY **
+# The entries below are placeholders. Only paths that actually exist are
+# offered as examples, so a missing file cannot break building the interface.
+example_wav_files = [
+    wav_path
+    for wav_path in (
+        "example1.wav",  # Replace with actual WAV file paths
+        "example2.wav",
+        "example3.wav",
+    )
+    if os.path.exists(wav_path)
+]
+# One input slot per source; process_input() receives them in this order
+# (video, audio, microphone) and transcribes the first one that is set.
+iface = gr.Interface(
+    fn=process_input,
+    inputs=[
+        gr.Video(label="Upload Video"),
+        gr.Audio(label="Upload Audio File", type="filepath"),
+        gr.Microphone(label="Record from Microphone", type="filepath")
+    ],
+    outputs="text",
+    title="NeMo ASR Transcription Interface",
+    description="Upload a video, an audio file, or record from the microphone to transcribe the audio using a trained NeMo model.",
+    # Each example row fills only the audio slot; with no usable example
+    # files, None is passed so no examples section is requested.
+    examples=[[None, wav, None] for wav in example_wav_files] or None  # **Only WAV examples**
+)
 iface.launch()