Mohammadp commited on
Commit
f309d86
·
verified ·
1 Parent(s): c6ee3dc

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -0
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import urllib.request

import gradio as gr
import torch
from moviepy.editor import VideoFileClip
from nemo.collections.asr.models import EncDecCTCModelBPE  # Adjust based on your model type
from pydub import AudioSegment
8
+ # Load trained NeMo model
9
+ MODEL_PATH = "https://huggingface.co/Mohammadp/Persian-ASR/blob/main/conformer_transducer_persian.nemo"
10
+
11
+ model = EncDecCTCModelBPE.restore_from(MODEL_PATH) # Adjust based on your model type
12
+
13
+ # Constants
14
+ SAMPLE_RATE = 16000
15
+ MAX_CHUNK_LENGTH_MS = 10 * 1000 # 10 seconds per chunk
16
+
17
+ # Helper functions
18
+ def extract_audio_from_video(video_path):
19
+ """Extracts audio from a video file and saves it as a WAV file."""
20
+ video = VideoFileClip(video_path)
21
+ audio_path = "extracted_audio.wav"
22
+ video.audio.write_audiofile(audio_path)
23
+ return audio_path
24
+
25
+ def resample_audio(audio_path, target_sample_rate=SAMPLE_RATE):
26
+ """Resamples an audio file to 16kHz."""
27
+ audio = AudioSegment.from_file(audio_path)
28
+ audio = audio.set_frame_rate(target_sample_rate)
29
+ resampled_path = "resampled_audio.wav"
30
+ audio.export(resampled_path, format="wav")
31
+ return resampled_path
32
+
33
+ def split_audio(audio_path, max_length_ms=MAX_CHUNK_LENGTH_MS):
34
+ """Splits audio into chunks of max_length_ms each."""
35
+ audio = AudioSegment.from_file(audio_path)
36
+ chunks = []
37
+ for i in range(0, len(audio), max_length_ms):
38
+ chunk = audio[i:i + max_length_ms]
39
+ chunk_path = f"chunk_{i // max_length_ms}.wav"
40
+ chunk.export(chunk_path, format="wav")
41
+ chunks.append(chunk_path)
42
+ return chunks
43
+
44
+ def transcribe_audio(audio_path):
45
+ """Transcribes a single audio file using the NeMo model."""
46
+ return model.transcribe([audio_path])[0]
47
+
48
+ def process_audio(audio_path):
49
+ """Processes an audio file: resamples, splits, and transcribes."""
50
+ resampled_path = resample_audio(audio_path)
51
+ chunks = split_audio(resampled_path)
52
+ transcriptions = [transcribe_audio(chunk) for chunk in chunks]
53
+ return " ".join(transcriptions)
54
+
55
+ def process_video(video_path):
56
+ """Extracts and processes audio from a video file."""
57
+ audio_path = extract_audio_from_video(video_path)
58
+ return process_audio(audio_path)
59
+
60
+ def process_microphone(audio_path):
61
+ """Processes live-recorded microphone audio."""
62
+ return process_audio(audio_path)
63
+
64
+ # Gradio Interface
65
+ def process_input(video=None, audio=None, microphone=None):
66
+ if video is not None:
67
+ return f"Transcription: {process_video(video)}"
68
+ elif audio is not None:
69
+ return f"Transcription: {process_audio(audio)}"
70
+ elif microphone is not None:
71
+ return f"Transcription: {process_microphone(microphone)}"
72
+ else:
73
+ return "No input provided."
74
+
75
+ # ** WAV FILE EXAMPLES ONLY **
76
+ example_wav_files = [
77
+ "example1.wav", # Replace with actual WAV file paths
78
+ "example2.wav",
79
+ "example3.wav"
80
+ ]
81
+
82
+ iface = gr.Interface(
83
+ fn=process_input,
84
+ inputs=[
85
+ gr.Video(label="Upload Video"),
86
+ gr.Audio(label="Upload Audio File", type="filepath"),
87
+ gr.Microphone(label="Record from Microphone", type="filepath")
88
+ ],
89
+ outputs="text",
90
+ title="NeMo ASR Transcription Interface",
91
+ description="Upload a video, an audio file, or record from the microphone to transcribe the audio using a trained NeMo model.",
92
+ examples=[[None, wav, None] for wav in example_wav_files] # **Only WAV examples**
93
+ )
94
+
95
+ iface.launch()