Hadi32 committed on
Commit
dca6b45
·
verified ·
1 Parent(s): 9612a6d

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +189 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
from faster_whisper import WhisperModel
import torch
import os  # NOTE(review): appears unused in this file — confirm before removing

# --- Configuration ---
# You can change the model size here (tiny, base, small, medium, large-v2, large-v3)
# The user specifically requested "tiny" (guillaumekln/faster-whisper-tiny equivalent)
MODEL_SIZE = "tiny"

# Check for CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"
# Use float16 only if on CUDA, otherwise int8 or float32 for CPU
compute_type = "float16" if device == "cuda" else "int8"

print(f"Initializing Faster Whisper Model: {MODEL_SIZE}")
print(f"Device: {device}, Compute Type: {compute_type}")

# Load the model.
# download_root is not specified, so it defaults to the user's cache directory
# (which persists in HF Spaces if caching is enabled, or redownloads if ephemeral)
try:
    model = WhisperModel(MODEL_SIZE, device=device, compute_type=compute_type)
except Exception as e:
    # GPU initialization can fail even when torch reports CUDA available
    # (e.g. missing cuDNN/cuBLAS in the runtime image); fall back to the
    # safest configuration instead of crashing at import time.
    print(f"Error loading model: {e}")
    print("Attempting to load on CPU with int8...")
    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
# --- Language Options ---
# A selection of common languages supported by Whisper.
# Label format is "Name (code)"; transcribe_audio() extracts the ISO-639-1
# code from inside the parentheses, and "Auto-Detect" maps to language=None.
LANGUAGES = [
    "Auto-Detect",
    "Bengali (bn)",
    "English (en)",
    "Hindi (hi)",
    "Chinese (zh)",
    "Spanish (es)",
    "French (fr)",
    "German (de)",
    "Japanese (ja)",
    "Russian (ru)",
    "Portuguese (pt)",
    "Arabic (ar)",
    "Urdu (ur)",
    "Italian (it)",
    "Korean (ko)",
    "Turkish (tr)",
    "Polish (pl)",
    "Dutch (nl)",
    "Thai (th)",
    "Vietnamese (vi)",
    "Indonesian (id)"
]
def format_timestamp(seconds):
    """Render a duration in seconds as an ``MM:SS.ss`` string.

    The fractional part is kept to two decimal places; minutes are not
    wrapped at 60, so inputs of an hour or more simply show >= 60 minutes.
    """
    whole_minutes, remainder = divmod(seconds, 60)
    return f"{int(whole_minutes):02d}:{remainder:05.2f}"
def transcribe_audio(audio_path, language, beam_size, vad_filter):
    """Transcribe an audio file with Faster Whisper, streaming partial results.

    Generator used as a Gradio event handler: it yields
    ``(transcript_so_far, status_message)`` tuples so the UI updates as each
    segment is decoded.

    Args:
        audio_path: Filesystem path to the audio file (Gradio ``type="filepath"``),
            or a falsy value when nothing was uploaded/recorded.
        language: A label from LANGUAGES, e.g. "English (en)" or "Auto-Detect".
        beam_size: Decoder beam width; coerced to int before use.
        vad_filter: Whether to apply voice-activity-detection silence filtering.

    Yields:
        (str, str): the accumulated timestamped transcript and a status line.
    """
    if not audio_path:
        yield "Please upload or record an audio file first.", "Waiting..."
        return

    # Extract the ISO code from a label like "Bengali (bn)" -> "bn".
    # FIX: the original wrapped this in a bare `except:` (which can never fire
    # for a str and would also swallow KeyboardInterrupt) and, for a label
    # without parentheses, passed the raw label through as a language code.
    # Now any malformed label cleanly falls back to auto-detect (None).
    lang_code = None
    if language and language != "Auto-Detect":
        open_paren = language.rfind("(")
        if open_paren != -1:
            lang_code = language[open_paren + 1:].strip(") ") or None

    print(f"Transcribing {audio_path} with language={lang_code}, beam_size={beam_size}, vad={vad_filter}")

    try:
        # transcribe() returns a lazy segment generator plus detection info.
        segments, info = model.transcribe(
            audio_path,
            language=lang_code,
            beam_size=int(beam_size),
            vad_filter=vad_filter
        )

        detected_lang_info = f"Detected Language: {info.language} (Prob: {info.language_probability:.2f})"

        full_transcript = ""

        # Iterate over segments generator; each iteration performs the actual
        # decoding work, so yielding here gives the UI a live-update effect.
        for segment in segments:
            start_fmt = format_timestamp(segment.start)
            end_fmt = format_timestamp(segment.end)

            # Format: [00:00.00 -> 00:05.00] Text
            segment_text = f"[{start_fmt} -> {end_fmt}] {segment.text}"
            full_transcript += segment_text + "\n"

            # Yielding the updated transcript and status
            yield full_transcript, f"{detected_lang_info} | Processing segment endings at {end_fmt}s"

        yield full_transcript, f"{detected_lang_info} | Completed"

    except Exception as e:
        # Boundary handler: surface the failure in the UI instead of raising
        # into Gradio's generic error page.
        yield f"Error during transcription: {str(e)}", "Error"
# --- Gradio UI ---
# Module-level Blocks construction; `demo` is the app object Spaces serves.
theme = gr.themes.Soft(primary_hue="blue", neutral_hue="slate")

with gr.Blocks(theme=theme, title="Faster Whisper Tiny Demo") as demo:

    # Header / description banner (English + Bengali).
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
                # 🎙️ Faster Whisper Tiny STT Demo
                ### Bengali & Multilingual Support | বাংলা এবং বহুভাষিক সমর্থন

                This Space uses the `faster-whisper` library with the **'tiny'** model for fast and efficient speech-to-text transcription.
                Run entirely on CPU/GPU seamlessly.
                """
            )

    with gr.Row():
        # Left column: inputs and settings.
        with gr.Column(scale=1):
            # Audio Input: allow file upload and microphone
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",  # hands transcribe_audio a path, not raw samples
                label="Audio Input (Audio File or Microphone) | অডিও ইনপুট"
            )

            with gr.Accordion("Advanced Settings | উন্নত সেটিংস", open=True):
                language_dropdown = gr.Dropdown(
                    choices=LANGUAGES,
                    value="Auto-Detect",
                    label="Language | ভাষা",
                    info="Select 'Auto-Detect' or specify a language."
                )

                beam_size_slider = gr.Slider(
                    minimum=1,
                    maximum=10,
                    step=1,
                    value=5,
                    label="Beam Size",
                    info="Higher values search more paths (slower but potentially more accurate)."
                )

                vad_filter_checkbox = gr.Checkbox(
                    value=True,
                    label="VAD Filter",
                    info="Filter out silence using Voice Activity Detection."
                )

            transcribe_btn = gr.Button("Transcribe Audio | প্রতিলিপি করুন", variant="primary", size="lg")

        # Right column: streaming status and transcript output.
        with gr.Column(scale=1):
            status_output = gr.Textbox(label="Status | অবস্থা", interactive=False)
            transcript_output = gr.Textbox(
                label="Transcription Output | প্রতিলিপি ফলাফল",
                show_copy_button=True,
                lines=20,
                max_lines=30,
                placeholder="Transcription will appear here..."
            )

    # Event Handlers
    # transcribe_audio is a generator, so Gradio streams each yielded
    # (transcript, status) pair into the two outputs as decoding progresses.
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_dropdown, beam_size_slider, vad_filter_checkbox],
        outputs=[transcript_output, status_output]
    )

    gr.Markdown(
        """
        ---
        **Note:** The model downloads automatically on the first run.
        Powered by [faster-whisper](https://github.com/guillaumekln/faster-whisper) and Hugging Face Spaces.
        """
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ faster-whisper
3
+ torch