Update app.py
app.py CHANGED
@@ -24,17 +24,80 @@ status_placeholder.info("Loading Whisper model from Hugging Face...")
 def load_whisper_model():
     """
     Load the Whisper model and processor from Hugging Face.
-    Change 'openai/whisper-
+    Change 'openai/whisper-small' to another variant if needed.
     """
-    model_name = "openai/whisper-
+    model_name = "openai/whisper-small"  # You can change to "tiny", "base", "medium", or "large" based on resources.
     processor = WhisperProcessor.from_pretrained(model_name)
     model = WhisperForConditionalGeneration.from_pretrained(model_name)
     return processor, model
 
 processor, model = load_whisper_model()
-
 status_placeholder.info("Whisper model loaded successfully!")
 
+# Comprehensive dictionary of languages supported by Whisper (most common ones)
+LANGUAGES = {
+    "en": "English",
+    "zh": "Chinese",
+    "de": "German",
+    "es": "Spanish",
+    "ru": "Russian",
+    "ko": "Korean",
+    "fr": "French",
+    "ja": "Japanese",
+    "pt": "Portuguese",
+    "tr": "Turkish",
+    "pl": "Polish",
+    "ca": "Catalan",
+    "nl": "Dutch",
+    "ar": "Arabic",
+    "sv": "Swedish",
+    "it": "Italian",
+    "id": "Indonesian",
+    "hi": "Hindi",
+    "fi": "Finnish",
+    "vi": "Vietnamese",
+    "fa": "Persian",
+    "mr": "Marathi",
+    "uk": "Ukrainian",
+    "el": "Greek",
+    "ms": "Malay",
+    "cs": "Czech",
+    "ro": "Romanian",
+    "da": "Danish",
+    "hu": "Hungarian",
+    "ta": "Tamil",
+    "no": "Norwegian",
+    "th": "Thai",
+    "ur": "Urdu",
+    "hr": "Croatian",
+    "bg": "Bulgarian",
+    "lt": "Lithuanian",
+    "la": "Latin",
+    "mi": "Maori",
+    "ml": "Malayalam",
+    "cy": "Welsh",
+    "sk": "Slovak",
+    "te": "Telugu",
+    "ka": "Georgian",
+    "sl": "Slovenian",
+    "kn": "Kannada",
+    "et": "Estonian",
+    "mk": "Macedonian",
+    "br": "Breton",
+    "eu": "Basque",
+    "is": "Icelandic",
+    "hy": "Armenian",
+    "af": "Afrikaans"
+}
+
+# Create a sorted list of language names for the selectbox
+language_names = sorted(LANGUAGES.values())
+default_language = "English"  # Default language
+
+selected_lang_name = st.selectbox("Select transcription language", language_names, index=language_names.index(default_language))
+# Find the language code by reverse lookup in LANGUAGES
+selected_language = [code for code, name in LANGUAGES.items() if name == selected_lang_name][0]
+
 def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
     """
     Split an audio file into chunks using silence detection.
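The reverse lookup that recovers selected_language scans LANGUAGES with a list comprehension on every rerun, which is fine at this size; a table this shape can also be inverted once up front. A minimal alternative sketch (not part of this commit), assuming language names stay unique:

# Hypothetical alternative to the list-comprehension reverse lookup.
NAME_TO_CODE = {name: code for code, name in LANGUAGES.items()}
selected_language = NAME_TO_CODE[selected_lang_name]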
@@ -50,42 +113,29 @@ def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=
     status_placeholder.info(f"Audio split into {len(chunks)} chunks.")
     return chunks
 
-
-# """
-# Transcribe an audio file using the locally loaded Whisper model from Hugging Face.
-# This uses librosa to load and resample the audio as required.
-# """
-# # Load audio with librosa at 16kHz (as required by Whisper)
-# speech, sr = librosa.load(audio_file, sr=16000)
-# input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
-# predicted_ids = model.generate(input_features)
-# transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-# return transcription
-
-def transcribe(audio_file):
+def transcribe(audio_file, language):
     """
     Transcribe an audio file using the locally loaded Whisper model from Hugging Face.
     This uses librosa to load and resample the audio as required.
-    The transcription is forced to
+    The transcription is forced to the specified language.
 
     Args:
         audio_file (str): Path to the audio file.
+        language (str): Language code (e.g., "en", "es").
 
     Returns:
-        str: Transcribed text
+        str: Transcribed text.
     """
     # Load audio with librosa at 16kHz (as required by Whisper)
     speech, sr = librosa.load(audio_file, sr=16000)
     input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
-    # Force the transcription output to
-    forced_ids = processor.get_decoder_prompt_ids(language=
+    # Force the transcription output to the chosen language:
+    forced_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
     predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription
 
-
-
-def transcribe_chunk(chunk, index, min_length_ms=100):
+def transcribe_chunk(chunk, index, language, min_length_ms=100):
     """
     Transcribe an individual audio chunk.
     """
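get_decoder_prompt_ids is what pins the output language: it returns the forced decoder tokens (language tag plus task token) so Whisper skips language auto-detection. A self-contained sketch of the same call, assuming the transformers Whisper API used in this file and a hypothetical sample.wav:

import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

speech, _ = librosa.load("sample.wav", sr=16000)  # Whisper expects 16 kHz audio
input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features

# Pin the decoder to Spanish transcription instead of auto-detect.
forced_ids = processor.get_decoder_prompt_ids(language="es", task="transcribe")
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

Recent transformers releases also accept model.generate(input_features, language="es", task="transcribe") directly; forced_decoder_ids is the older spelling this diff uses.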
@@ -96,13 +146,13 @@ def transcribe_chunk(chunk, index, min_length_ms=100):
     with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
         chunk.export(temp_audio_file.name, format="wav")
         temp_audio_file_path = temp_audio_file.name
-    status_placeholder.info(f"Transcribing chunk {index}...")
-    transcription = transcribe(temp_audio_file_path)
+    status_placeholder.info(f"Transcribing chunk {index} in {selected_lang_name}...")
+    transcription = transcribe(temp_audio_file_path, language)
     os.remove(temp_audio_file_path)
     st.write(f"Transcription for chunk {index}: {transcription}")
     return (index, transcription)
 
-def process_audio_chunks(audio_chunks):
+def process_audio_chunks(audio_chunks, language):
     """
     Process and transcribe each audio chunk in sequence.
     Reports the total time taken.
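transcribe_chunk uses NamedTemporaryFile(delete=False) because librosa reopens the file by name after the with block closes it, and the explicit os.remove then cleans up. If transcribe raises, that removal is skipped; a try/finally variant (a sketch, not in this commit) closes the leak:

import os
from tempfile import NamedTemporaryFile

def transcribe_chunk_safe(chunk, index, language, min_length_ms=100):
    # Sketch: same flow as transcribe_chunk, but the temp file is
    # removed even when transcribe() raises.
    with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        chunk.export(temp_audio_file.name, format="wav")
        temp_audio_file_path = temp_audio_file.name
    try:
        transcription = transcribe(temp_audio_file_path, language)
    finally:
        os.remove(temp_audio_file_path)
    return (index, transcription)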
@@ -111,7 +161,7 @@ def process_audio_chunks(audio_chunks):
     min_length_ms = 100  # minimum duration for processing
     start_transcription = time.time()
     for i, chunk in enumerate(audio_chunks):
-        index, text = transcribe_chunk(chunk, i, min_length_ms)
+        index, text = transcribe_chunk(chunk, i, language, min_length_ms)
         transcriptions.append((index, text))
     transcriptions.sort(key=lambda x: x[0])
     total_time = time.time() - start_transcription
@@ -152,7 +202,7 @@ if uploaded_file is not None and st.session_state.transcription is None:
     processing_start = time.time()
     with st.spinner('Processing audio...'):
         audio_chunks = split_audio_on_silence(temp_audio_file)
-        transcription = process_audio_chunks(audio_chunks)
+        transcription = process_audio_chunks(audio_chunks, selected_language)
         if transcription:
             st.session_state.transcription = transcription
             st.success('Transcription complete!')
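The body of split_audio_on_silence is untouched by this commit and mostly elided from the diff. Its signature mirrors pydub's split_on_silence, and the chunk.export(...) call in transcribe_chunk implies pydub AudioSegment chunks, so the implementation is plausibly along these lines (an assumption, not shown in the diff):

from pydub import AudioSegment
from pydub.silence import split_on_silence

def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
    # Sketch under the pydub assumption stated above.
    audio = AudioSegment.from_file(audio_file_path)
    return split_on_silence(
        audio,
        min_silence_len=min_silence_len,  # ms of quiet that ends a chunk
        silence_thresh=silence_thresh,    # dBFS level counted as silence
        keep_silence=keep_silence,        # ms of padding kept on each chunk
    )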
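One side effect of the new selectbox: changing the language triggers a Streamlit rerun, and the visible lines of load_whisper_model carry no caching decorator, so the processor and model would be re-instantiated on every rerun. Where that matters, the usual fix is resource caching (a sketch, assuming Streamlit 1.18+ where st.cache_resource exists):

import streamlit as st
from transformers import WhisperProcessor, WhisperForConditionalGeneration

@st.cache_resource  # build once per process; reruns reuse the same objects
def load_whisper_model():
    model_name = "openai/whisper-small"
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    return processor, model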