Mohammadp committed on
Commit
aeca815
·
verified ·
1 Parent(s): f5ab0c0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -94
app.py CHANGED
@@ -1,95 +1,103 @@
1
- import gradio as gr
2
- import os
3
- from moviepy.editor import VideoFileClip
4
- from pydub import AudioSegment
5
- import torch
6
- from nemo.collections.asr.models import EncDecCTCModelBPE # Adjust based on your model type
7
-
8
- # Load trained NeMo model
9
- MODEL_PATH = "https://huggingface.co/Mohammadp/Persian-ASR/blob/main/conformer_transducer_persian.nemo"
10
-
11
- model = EncDecCTCModelBPE.restore_from(MODEL_PATH) # Adjust based on your model type
12
-
13
- # Constants
14
- SAMPLE_RATE = 16000
15
- MAX_CHUNK_LENGTH_MS = 10 * 1000 # 10 seconds per chunk
16
-
17
- # Helper functions
18
- def extract_audio_from_video(video_path):
19
- """Extracts audio from a video file and saves it as a WAV file."""
20
- video = VideoFileClip(video_path)
21
- audio_path = "extracted_audio.wav"
22
- video.audio.write_audiofile(audio_path)
23
- return audio_path
24
-
25
- def resample_audio(audio_path, target_sample_rate=SAMPLE_RATE):
26
- """Resamples an audio file to 16kHz."""
27
- audio = AudioSegment.from_file(audio_path)
28
- audio = audio.set_frame_rate(target_sample_rate)
29
- resampled_path = "resampled_audio.wav"
30
- audio.export(resampled_path, format="wav")
31
- return resampled_path
32
-
33
- def split_audio(audio_path, max_length_ms=MAX_CHUNK_LENGTH_MS):
34
- """Splits audio into chunks of max_length_ms each."""
35
- audio = AudioSegment.from_file(audio_path)
36
- chunks = []
37
- for i in range(0, len(audio), max_length_ms):
38
- chunk = audio[i:i + max_length_ms]
39
- chunk_path = f"chunk_{i // max_length_ms}.wav"
40
- chunk.export(chunk_path, format="wav")
41
- chunks.append(chunk_path)
42
- return chunks
43
-
44
- def transcribe_audio(audio_path):
45
- """Transcribes a single audio file using the NeMo model."""
46
- return model.transcribe([audio_path])[0]
47
-
48
- def process_audio(audio_path):
49
- """Processes an audio file: resamples, splits, and transcribes."""
50
- resampled_path = resample_audio(audio_path)
51
- chunks = split_audio(resampled_path)
52
- transcriptions = [transcribe_audio(chunk) for chunk in chunks]
53
- return " ".join(transcriptions)
54
-
55
- def process_video(video_path):
56
- """Extracts and processes audio from a video file."""
57
- audio_path = extract_audio_from_video(video_path)
58
- return process_audio(audio_path)
59
-
60
- def process_microphone(audio_path):
61
- """Processes live-recorded microphone audio."""
62
- return process_audio(audio_path)
63
-
64
- # Gradio Interface
65
- def process_input(video=None, audio=None, microphone=None):
66
- if video is not None:
67
- return f"Transcription: {process_video(video)}"
68
- elif audio is not None:
69
- return f"Transcription: {process_audio(audio)}"
70
- elif microphone is not None:
71
- return f"Transcription: {process_microphone(microphone)}"
72
- else:
73
- return "No input provided."
74
-
75
- # ** WAV FILE EXAMPLES ONLY **
76
- example_wav_files = [
77
- "example1.wav", # Replace with actual WAV file paths
78
- "example2.wav",
79
- "example3.wav"
80
- ]
81
-
82
- iface = gr.Interface(
83
- fn=process_input,
84
- inputs=[
85
- gr.Video(label="Upload Video"),
86
- gr.Audio(label="Upload Audio File", type="filepath"),
87
- gr.Microphone(label="Record from Microphone", type="filepath")
88
- ],
89
- outputs="text",
90
- title="NeMo ASR Transcription Interface",
91
- description="Upload a video, an audio file, or record from the microphone to transcribe the audio using a trained NeMo model.",
92
- examples=[[None, wav, None] for wav in example_wav_files] # **Only WAV examples**
93
- )
94
-
 
 
 
 
 
 
 
 
95
  iface.launch()
 
1
+ import gradio as gr
2
+ import os
3
+ from moviepy.editor import VideoFileClip
4
+ from pydub import AudioSegment
5
+ import torch
6
+ from nemo.collections.asr.models import EncDecCTCModelBPE # Adjust based on your model type
7
+ import wget
8
+
9
# Remote location of the checkpoint and the local file it is cached in.
MODEL_URL = "https://huggingface.co/Mohammadp/Persian-ASR/resolve/main/conformer_transducer_persian.nemo"
MODEL_PATH = "conformer_transducer_persian.nemo"


def _fetch_model_if_missing(url, destination):
    """Download the checkpoint from url to destination unless it already exists."""
    if os.path.exists(destination):
        return
    print("Downloading model...")
    wget.download(url, destination)
    # Leading newline moves past the progress output written by wget.
    print("\nModel downloaded successfully.")


_fetch_model_if_missing(MODEL_URL, MODEL_PATH)

# Restore the model from the local checkpoint file.
model = EncDecCTCModelBPE.restore_from(MODEL_PATH)
print("Model loaded successfully!")

# Audio settings shared by the helper functions below.
SAMPLE_RATE = 16000
MAX_CHUNK_LENGTH_MS = 10 * 1000  # 10 seconds per chunk
24
+
25
+ # Helper functions
26
def extract_audio_from_video(video_path):
    """Extract the audio track of a video file and save it as a WAV file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the written WAV file. The name is fixed
        ("extracted_audio.wav" in the current working directory), so each
        call overwrites the result of the previous one.

    Raises:
        ValueError: If the video has no audio track.
    """
    audio_path = "extracted_audio.wav"
    video = VideoFileClip(video_path)
    try:
        # A clip without an audio stream exposes ``audio`` as None; fail with
        # a clear message instead of an AttributeError on the next line.
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Release the readers held by the clip even when extraction fails.
        video.close()
    return audio_path
32
+
33
def resample_audio(audio_path, target_sample_rate=SAMPLE_RATE):
    """Convert an audio file to a mono WAV at the target sample rate.

    Args:
        audio_path: Path of the audio file to convert (any format pydub can
            decode).
        target_sample_rate: Desired sample rate in Hz (16 kHz by default).

    Returns:
        Path of the converted WAV file. The name is fixed
        ("resampled_audio.wav" in the current working directory), so each
        call overwrites the result of the previous one.
    """
    audio = AudioSegment.from_file(audio_path)
    # Downmix to a single channel as well as resampling, so stereo uploads
    # reach the speech model as mono audio.
    audio = audio.set_frame_rate(target_sample_rate).set_channels(1)
    resampled_path = "resampled_audio.wav"
    # export() hands back the open output file object; close it explicitly
    # rather than leaving that to garbage collection.
    audio.export(resampled_path, format="wav").close()
    return resampled_path
40
+
41
def split_audio(audio_path, max_length_ms=MAX_CHUNK_LENGTH_MS):
    """Split an audio file into consecutive chunks of at most max_length_ms.

    Args:
        audio_path: Path of the audio file to split.
        max_length_ms: Maximum chunk duration in milliseconds; must be
            positive.

    Returns:
        List of chunk file paths ("chunk_0.wav", "chunk_1.wav", ...) in
        playback order, written to the current working directory. The list
        is empty when the audio has zero length.

    Raises:
        ValueError: If max_length_ms is not positive.
    """
    if max_length_ms <= 0:
        raise ValueError("max_length_ms must be a positive number of milliseconds")

    audio = AudioSegment.from_file(audio_path)
    chunks = []
    # len(audio) is the duration in milliseconds, and slicing is in ms too.
    for index, start_ms in enumerate(range(0, len(audio), max_length_ms)):
        chunk_path = f"chunk_{index}.wav"
        chunk = audio[start_ms:start_ms + max_length_ms]
        # export() returns the open output file object; close it right away.
        chunk.export(chunk_path, format="wav").close()
        chunks.append(chunk_path)
    return chunks
51
+
52
def _result_to_text(result):
    """Reduce a value returned by ``model.transcribe`` to a plain string."""
    # Unwrap containers: depending on the NeMo version and model type the
    # result may be a list of strings, a (best, all) tuple of lists, or a
    # list of hypothesis objects. Always follow the first entry.
    while isinstance(result, (list, tuple)):
        if not result:
            return ""
        result = result[0]
    if isinstance(result, str):
        return result
    # Hypothesis-style objects carry the transcript in a ``text`` attribute.
    text = getattr(result, "text", None)
    return text if isinstance(text, str) else str(result)


def transcribe_audio(audio_path):
    """Transcribe a single audio file using the NeMo model.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcript as a plain string, so callers can join results
        regardless of the exact return shape of ``model.transcribe``.
    """
    return _result_to_text(model.transcribe([audio_path]))
55
+
56
def process_audio(audio_path):
    """Resample, split, and transcribe an audio file.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The chunk transcriptions joined with single spaces, in playback
        order.
    """
    resampled_path = resample_audio(audio_path)
    chunks = []
    try:
        chunks = split_audio(resampled_path)
        transcriptions = [transcribe_audio(chunk) for chunk in chunks]
    finally:
        # Remove the intermediate WAV files so they do not pile up in the
        # working directory. Cleanup is best-effort: a file that cannot be
        # deleted must not hide the transcription result or the real error.
        for temp_path in [resampled_path, *chunks]:
            try:
                os.remove(temp_path)
            except OSError:
                pass
    return " ".join(transcriptions)
62
+
63
def process_video(video_path):
    """Transcribe a video file by extracting its audio and processing that.

    Returns whatever process_audio returns for the extracted audio file.
    """
    return process_audio(extract_audio_from_video(video_path))
67
+
68
def process_microphone(audio_path):
    """Transcribe a microphone recording.

    The recording is handled exactly like any other audio file.
    """
    transcript = process_audio(audio_path)
    return transcript
71
+
72
+ # Gradio Interface
73
def process_input(video=None, audio=None, microphone=None):
    """Transcribe the first provided input and format it for display.

    Inputs are checked in priority order: video, then audio file, then
    microphone. Only the first one that is not None is processed.

    Returns:
        "Transcription: <text>" for the chosen input, or
        "No input provided." when all three inputs are None.
    """
    handlers = (
        (video, process_video),
        (audio, process_audio),
        (microphone, process_microphone),
    )
    for source, handler in handlers:
        if source is not None:
            return f"Transcription: {handler(source)}"
    return "No input provided."
82
+
83
+ # ** WAV FILE EXAMPLES ONLY **
84
example_wav_files = [
    "example1.wav",  # Replace with actual WAV file paths
    "example2.wav",
    "example3.wav",
]

# Only offer examples whose files exist on disk. The names above are
# placeholders, and passing missing files to Gradio is expected to fail when
# the interface is built (NOTE(review): confirm against the Gradio version
# in use).
available_examples = [
    [None, wav, None] for wav in example_wav_files if os.path.isfile(wav)
]

iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Audio(label="Upload Audio File", type="filepath"),
        gr.Microphone(label="Record from Microphone", type="filepath"),
    ],
    outputs="text",
    title="NeMo ASR Transcription Interface",
    description="Upload a video, an audio file, or record from the microphone to transcribe the audio using a trained NeMo model.",
    # Each example fills only the audio-file input; None disables the
    # examples section when no example file is available.
    examples=available_examples or None,
)

iface.launch()