Update app.py
app.py CHANGED
@@ -24,17 +24,80 @@ status_placeholder.info("Loading Whisper model from Hugging Face...")
 def load_whisper_model():
     """
     Load the Whisper model and processor from Hugging Face.
-    Change 'openai/whisper-
+    Change 'openai/whisper-small' to another variant if needed.
     """
-    model_name = "openai/whisper-
+    model_name = "openai/whisper-small"  # You can change to "tiny", "base", "medium", or "large" based on resources.
     processor = WhisperProcessor.from_pretrained(model_name)
     model = WhisperForConditionalGeneration.from_pretrained(model_name)
     return processor, model
 
 processor, model = load_whisper_model()
-
 status_placeholder.info("Whisper model loaded successfully!")
 
+# Comprehensive dictionary of languages supported by Whisper (most common ones)
+LANGUAGES = {
+    "en": "English",
+    "zh": "Chinese",
+    "de": "German",
+    "es": "Spanish",
+    "ru": "Russian",
+    "ko": "Korean",
+    "fr": "French",
+    "ja": "Japanese",
+    "pt": "Portuguese",
+    "tr": "Turkish",
+    "pl": "Polish",
+    "ca": "Catalan",
+    "nl": "Dutch",
+    "ar": "Arabic",
+    "sv": "Swedish",
+    "it": "Italian",
+    "id": "Indonesian",
+    "hi": "Hindi",
+    "fi": "Finnish",
+    "vi": "Vietnamese",
+    "fa": "Persian",
+    "mr": "Marathi",
+    "uk": "Ukrainian",
+    "el": "Greek",
+    "ms": "Malay",
+    "cs": "Czech",
+    "ro": "Romanian",
+    "da": "Danish",
+    "hu": "Hungarian",
+    "ta": "Tamil",
+    "no": "Norwegian",
+    "th": "Thai",
+    "ur": "Urdu",
+    "hr": "Croatian",
+    "bg": "Bulgarian",
+    "lt": "Lithuanian",
+    "la": "Latin",
+    "mi": "Maori",
+    "ml": "Malayalam",
+    "cy": "Welsh",
+    "sk": "Slovak",
+    "te": "Telugu",
+    "ka": "Georgian",
+    "sl": "Slovenian",
+    "kn": "Kannada",
+    "et": "Estonian",
+    "mk": "Macedonian",
+    "br": "Breton",
+    "eu": "Basque",
+    "is": "Icelandic",
+    "hy": "Armenian",
+    "af": "Afrikaans"
+}
+
+# Create a sorted list of language names for the selectbox
+language_names = sorted(LANGUAGES.values())
+default_language = "English"  # Default language
+
+selected_lang_name = st.selectbox("Select transcription language", language_names, index=language_names.index(default_language))
+# Find the language code by reverse lookup in LANGUAGES
+selected_language = [code for code, name in LANGUAGES.items() if name == selected_lang_name][0]
+
 def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
     """
     Split an audio file into chunks using silence detection.
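The reverse lookup that recovers selected_language scans LANGUAGES with a list comprehension on every rerun, which is fine at this size; a table this shape can also be inverted once up front. A minimal alternative sketch (not part of this commit), assuming language names stay unique:

# Hypothetical alternative to the list-comprehension reverse lookup.
NAME_TO_CODE = {name: code for code, name in LANGUAGES.items()}
selected_language = NAME_TO_CODE[selected_lang_name]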
@@ -50,42 +113,29 @@ def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=
     status_placeholder.info(f"Audio split into {len(chunks)} chunks.")
     return chunks
 
-
-# """
-# Transcribe an audio file using the locally loaded Whisper model from Hugging Face.
-# This uses librosa to load and resample the audio as required.
-# """
-# # Load audio with librosa at 16kHz (as required by Whisper)
-# speech, sr = librosa.load(audio_file, sr=16000)
-# input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
-# predicted_ids = model.generate(input_features)
-# transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-# return transcription
-
-def transcribe(audio_file):
+def transcribe(audio_file, language):
     """
     Transcribe an audio file using the locally loaded Whisper model from Hugging Face.
     This uses librosa to load and resample the audio as required.
-    The transcription is forced to
+    The transcription is forced to the specified language.
 
     Args:
         audio_file (str): Path to the audio file.
+        language (str): Language code (e.g., "en", "es").
 
     Returns:
-        str: Transcribed text
+        str: Transcribed text.
     """
     # Load audio with librosa at 16kHz (as required by Whisper)
     speech, sr = librosa.load(audio_file, sr=16000)
     input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
-    # Force the transcription output to
-    forced_ids = processor.get_decoder_prompt_ids(language=
+    # Force the transcription output to the chosen language:
+    forced_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
     predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription
 
-
-
-def transcribe_chunk(chunk, index, min_length_ms=100):
+def transcribe_chunk(chunk, index, language, min_length_ms=100):
     """
     Transcribe an individual audio chunk.
     """
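get_decoder_prompt_ids is what pins the output language: it returns the forced decoder tokens (language tag plus task token) so Whisper skips language auto-detection. A self-contained sketch of the same call, assuming the transformers Whisper API used in this file and a hypothetical sample.wav:

import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

speech, _ = librosa.load("sample.wav", sr=16000)  # Whisper expects 16 kHz audio
input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features

# Pin the decoder to Spanish transcription instead of auto-detect.
forced_ids = processor.get_decoder_prompt_ids(language="es", task="transcribe")
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

Recent transformers releases also accept model.generate(input_features, language="es", task="transcribe") directly; forced_decoder_ids is the older spelling this diff uses.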
@@ -96,13 +146,13 @@ def transcribe_chunk(chunk, index, min_length_ms=100):
     with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
         chunk.export(temp_audio_file.name, format="wav")
         temp_audio_file_path = temp_audio_file.name
-    status_placeholder.info(f"Transcribing chunk {index}...")
-    transcription = transcribe(temp_audio_file_path)
+    status_placeholder.info(f"Transcribing chunk {index} in {selected_lang_name}...")
+    transcription = transcribe(temp_audio_file_path, language)
     os.remove(temp_audio_file_path)
     st.write(f"Transcription for chunk {index}: {transcription}")
     return (index, transcription)
 
-def process_audio_chunks(audio_chunks):
+def process_audio_chunks(audio_chunks, language):
     """
     Process and transcribe each audio chunk in sequence.
     Reports the total time taken.
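transcribe_chunk uses NamedTemporaryFile(delete=False) because librosa reopens the file by name after the with block closes it, and the explicit os.remove then cleans up. If transcribe raises, that removal is skipped; a try/finally variant (a sketch, not in this commit) closes the leak:

import os
from tempfile import NamedTemporaryFile

def transcribe_chunk_safe(chunk, index, language, min_length_ms=100):
    # Sketch: same flow as transcribe_chunk, but the temp file is
    # removed even when transcribe() raises.
    with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        chunk.export(temp_audio_file.name, format="wav")
        temp_audio_file_path = temp_audio_file.name
    try:
        transcription = transcribe(temp_audio_file_path, language)
    finally:
        os.remove(temp_audio_file_path)
    return (index, transcription)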
@@ -111,7 +161,7 @@ def process_audio_chunks(audio_chunks):
     min_length_ms = 100  # minimum duration for processing
     start_transcription = time.time()
     for i, chunk in enumerate(audio_chunks):
-        index, text = transcribe_chunk(chunk, i, min_length_ms)
+        index, text = transcribe_chunk(chunk, i, language, min_length_ms)
         transcriptions.append((index, text))
     transcriptions.sort(key=lambda x: x[0])
     total_time = time.time() - start_transcription
@@ -152,7 +202,7 @@ if uploaded_file is not None and st.session_state.transcription is None:
     processing_start = time.time()
     with st.spinner('Processing audio...'):
         audio_chunks = split_audio_on_silence(temp_audio_file)
-        transcription = process_audio_chunks(audio_chunks)
+        transcription = process_audio_chunks(audio_chunks, selected_language)
         if transcription:
             st.session_state.transcription = transcription
             st.success('Transcription complete!')
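The body of split_audio_on_silence is untouched by this commit and mostly elided from the diff. Its signature mirrors pydub's split_on_silence, and the chunk.export(...) call in transcribe_chunk implies pydub AudioSegment chunks, so the implementation is plausibly along these lines (an assumption, not shown in the diff):

from pydub import AudioSegment
from pydub.silence import split_on_silence

def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
    # Sketch under the pydub assumption stated above.
    audio = AudioSegment.from_file(audio_file_path)
    return split_on_silence(
        audio,
        min_silence_len=min_silence_len,  # ms of quiet that ends a chunk
        silence_thresh=silence_thresh,    # dBFS level counted as silence
        keep_silence=keep_silence,        # ms of padding kept on each chunk
    )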
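One side effect of the new selectbox: changing the language triggers a Streamlit rerun, and the visible lines of load_whisper_model carry no caching decorator, so the processor and model would be re-instantiated on every rerun. Where that matters, the usual fix is resource caching (a sketch, assuming Streamlit 1.18+ where st.cache_resource exists):

import streamlit as st
from transformers import WhisperProcessor, WhisperForConditionalGeneration

@st.cache_resource  # build once per process; reruns reuse the same objects
def load_whisper_model():
    model_name = "openai/whisper-small"
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    return processor, model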