Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import os | |
| import librosa | |
| import torch | |
| from pydub import AudioSegment | |
| from pydub.silence import split_on_silence | |
| from dotenv import load_dotenv | |
| from tempfile import NamedTemporaryFile | |
| import math | |
| from docx import Document | |
| import time | |
| from transformers import WhisperProcessor, WhisperForConditionalGeneration | |
# Pull optional configuration from a local .env file, if one is present.
load_dotenv()

# One reusable slot on the page; each status message below is written into it.
status_placeholder = st.empty()

# Announce that the model load is about to start.
status_placeholder.info("Loading Whisper model from Hugging Face...")
@st.cache_resource(show_spinner=False)
def load_whisper_model(model_name="openai/whisper-small"):
    """
    Load the Whisper processor and model from Hugging Face.

    Streamlit re-executes the whole script on every widget interaction, so
    the loaded objects are cached with ``st.cache_resource``: the checkpoint
    is read once per server process and reused on later reruns instead of
    being reloaded each time.

    Args:
        model_name (str): Hugging Face model id of the Whisper checkpoint,
            e.g. "openai/whisper-tiny", "openai/whisper-base",
            "openai/whisper-medium". Defaults to "openai/whisper-small".
    Returns:
        tuple: ``(processor, model)`` - the ``WhisperProcessor`` and the
        ``WhisperForConditionalGeneration`` instance for ``model_name``.
    """
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    return processor, model
# Module-level processor/model pair; transcribe() reads these globals.
processor, model = load_whisper_model()

# Replace the "loading" notice with a confirmation.
status_placeholder.info("Whisper model loaded successfully!")
# Language code -> display name for the transcription languages offered in the
# UI (a subset of the languages Whisper supports).
LANGUAGES = {
    "en": "English",
    "zh": "Chinese",
    "de": "German",
    "es": "Spanish",
    "ru": "Russian",
    "ko": "Korean",
    "fr": "French",
    "ja": "Japanese",
    "pt": "Portuguese",
    "tr": "Turkish",
    "pl": "Polish",
    "ca": "Catalan",
    "nl": "Dutch",
    "ar": "Arabic",
    "sv": "Swedish",
    "it": "Italian",
    "id": "Indonesian",
    "hi": "Hindi",
    "fi": "Finnish",
    "vi": "Vietnamese",
    "fa": "Persian",
    "mr": "Marathi",
    "uk": "Ukrainian",
    "el": "Greek",
    "ms": "Malay",
    "cs": "Czech",
    "ro": "Romanian",
    "da": "Danish",
    "hu": "Hungarian",
    "ta": "Tamil",
    "no": "Norwegian",
    "th": "Thai",
    "ur": "Urdu",
    "hr": "Croatian",
    "bg": "Bulgarian",
    "lt": "Lithuanian",
    "la": "Latin",
    "mi": "Maori",
    "ml": "Malayalam",
    "cy": "Welsh",
    "sk": "Slovak",
    "te": "Telugu",
    "ka": "Georgian",
    "sl": "Slovenian",
    "kn": "Kannada",
    "et": "Estonian",
    "mk": "Macedonian",
    "br": "Breton",
    "eu": "Basque",
    "is": "Icelandic",
    "hy": "Armenian",
    "af": "Afrikaans",
}
# Offer the languages alphabetically by display name, with English pre-selected.
default_language = "English"
language_names = sorted(LANGUAGES.values())
selected_lang_name = st.selectbox(
    "Select transcription language",
    language_names,
    index=language_names.index(default_language),
)

# Translate the chosen display name back into its language code.
selected_language = next(
    code for code, name in LANGUAGES.items() if name == selected_lang_name
)
def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
    """
    Cut an audio file into chunks at silent passages.

    The file is opened with pydub's ``AudioSegment.from_file`` and handed to
    ``pydub.silence.split_on_silence``; progress is reported through the
    shared status placeholder.

    Args:
        audio_file_path (str): Path of the audio file to split.
        min_silence_len: Forwarded unchanged to ``split_on_silence``.
        silence_thresh: Forwarded unchanged to ``split_on_silence``.
        keep_silence: Forwarded unchanged to ``split_on_silence``.
    Returns:
        list: The chunks produced by ``split_on_silence``.
    """
    status_placeholder.info("Splitting audio on silence...")
    segments = split_on_silence(
        AudioSegment.from_file(audio_file_path),
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )
    status_placeholder.info(f"Audio split into {len(segments)} chunks.")
    return segments
def transcribe(audio_file, language):
    """
    Transcribe an audio file with the locally loaded Whisper model.

    The audio is loaded and resampled to 16 kHz with librosa, and decoding is
    forced to the given language. Whisper's feature extractor pads or
    truncates every input to 30 seconds, so audio longer than that is
    transcribed in consecutive 30-second windows whose texts are joined;
    passing it in one piece would silently drop everything after the first
    30 seconds.

    Args:
        audio_file (str): Path to the audio file.
        language (str): Language code (e.g., "en", "es").
    Returns:
        str: Transcribed text with surrounding whitespace stripped; an empty
        string when the file contains no samples.
    """
    sampling_rate = 16000  # sample rate the Whisper processor is called with
    window_samples = 30 * sampling_rate  # one 30-second Whisper input

    speech, _ = librosa.load(audio_file, sr=sampling_rate)
    if len(speech) == 0:
        return ""

    # The language/task prompt is the same for every window, so build it once.
    forced_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")

    pieces = []
    for start in range(0, len(speech), window_samples):
        window = speech[start:start + window_samples]
        input_features = processor(
            window, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features
        predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        pieces.append(text.strip())

    # Drop windows that produced no text so the join adds no stray spaces.
    return " ".join(piece for piece in pieces if piece)
def transcribe_chunk(chunk, index, language, min_length_ms=100):
    """
    Transcribe an individual audio chunk.

    The chunk is exported to a temporary WAV file, transcribed, and the file
    is removed again even if export or transcription raises.

    Args:
        chunk: Audio chunk exposing ``len()`` and ``export()`` (a pydub
            segment as produced by ``split_audio_on_silence``).
        index (int): Position of the chunk, used in messages and returned
            unchanged.
        language (str): Language code passed on to ``transcribe``.
        min_length_ms (int): Chunks whose ``len()`` is below this value are
            skipped with a warning.
    Returns:
        tuple: ``(index, text)``; ``text`` is "" for a skipped chunk.
    """
    if len(chunk) < min_length_ms:
        st.warning(f"Chunk {index} is too short to be processed.")
        return (index, "")

    # Reserve a unique path, then close our handle before pydub writes to it:
    # an already-open temporary file cannot be reopened on every platform.
    with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_file_path = temp_audio_file.name

    try:
        # export() returns the file handle it wrote to; close it right away.
        chunk.export(temp_audio_file_path, format="wav").close()
        status_placeholder.info(f"Transcribing chunk {index} in {selected_lang_name}...")
        transcription = transcribe(temp_audio_file_path, language)
    finally:
        # Always clean up, including when export or transcription fails.
        if os.path.exists(temp_audio_file_path):
            os.remove(temp_audio_file_path)

    st.write(f"Transcription for chunk {index}: {transcription}")
    return (index, transcription)
def process_audio_chunks(audio_chunks, language, min_length_ms=100):
    """
    Transcribe each audio chunk in sequence and join the results.

    Reports the total time taken through the status placeholder.

    Args:
        audio_chunks: Iterable of audio chunks, in playback order.
        language (str): Language code passed on to ``transcribe_chunk``.
        min_length_ms (int): Minimum chunk duration for processing; shorter
            chunks are skipped by ``transcribe_chunk``.
    Returns:
        str: The chunk transcriptions joined by single spaces. Chunks that
        yielded no text are left out, so skipped chunks do not produce
        doubled spaces. Empty string when nothing was transcribed.
    """
    start_transcription = time.time()

    # Chunks are handled strictly in order, so the results are already
    # sorted by index and need no re-sorting afterwards.
    transcriptions = [
        transcribe_chunk(chunk, i, language, min_length_ms)
        for i, chunk in enumerate(audio_chunks)
    ]

    total_time = time.time() - start_transcription
    status_placeholder.info(f"All chunks transcribed in {total_time:.2f} seconds.")

    cleaned = (text.strip() for _, text in transcriptions if text)
    return " ".join(text for text in cleaned if text)
def save_transcription_to_docx(transcription, audio_file_path):
    """
    Write the transcription into a .docx file named after the audio file.

    The output name is the audio file's base name without its extension,
    followed by "_full_transcription.docx"; the file is saved under that
    relative name.

    Args:
        transcription (str): Text to store as a single paragraph.
        audio_file_path (str): Path or name of the source audio file.
    Returns:
        str: Name of the .docx file that was written.
    """
    stem, _ = os.path.splitext(os.path.basename(audio_file_path))
    output_file_name = f"{stem}_full_transcription.docx"

    status_placeholder.info("Saving transcription to DOCX...")
    document = Document()
    document.add_paragraph(transcription)
    document.save(output_file_name)
    status_placeholder.info("Transcription saved as DOCX.")

    return output_file_name
st.title("Audio Transcription with Whisper (Local via Hugging Face)")

# Allow uploading of audio or video files
uploaded_file = st.file_uploader(
    "Upload an audio or video file",
    type=["wav", "mp3", "ogg", "m4a", "mp4", "mov"],
)

# Results are kept in session state so they survive Streamlit's reruns.
if 'transcription' not in st.session_state:
    st.session_state.transcription = None
if 'output_docx_file' not in st.session_state:
    st.session_state.output_docx_file = None
if 'processed_key' not in st.session_state:
    st.session_state.processed_key = None

if uploaded_file is not None:
    st.audio(uploaded_file)

    # Identify the request by file and language. Checking only whether a
    # transcription already exists would ignore every upload after the first
    # one, and would re-run the whole pipeline on each rerun whenever a file
    # produced no text.
    upload_key = (uploaded_file.name, uploaded_file.size, selected_language)

    if st.session_state.processed_key != upload_key:
        # Forget the previous result so a failure cannot show stale text.
        st.session_state.transcription = None
        st.session_state.output_docx_file = None

        # Save the upload under a unique temporary name. The extension is
        # kept because the name is what the audio loader sees; a fixed name
        # in the working directory would collide between sessions.
        suffix = os.path.splitext(uploaded_file.name)[1]
        with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(uploaded_file.getbuffer())
            temp_audio_file = tmp.name

        processing_start = time.time()
        try:
            with st.spinner('Processing audio...'):
                audio_chunks = split_audio_on_silence(temp_audio_file)
                transcription = process_audio_chunks(audio_chunks, selected_language)
        finally:
            # Remove the temporary copy even when processing fails.
            if os.path.exists(temp_audio_file):
                os.remove(temp_audio_file)

        # Mark this file/language pair as handled, with or without text.
        st.session_state.processed_key = upload_key

        if transcription:
            st.session_state.transcription = transcription
            st.success('Transcription complete!')
            output_docx_file = save_transcription_to_docx(transcription, uploaded_file.name)
            st.session_state.output_docx_file = output_docx_file
        else:
            st.warning("No speech could be transcribed from this file.")

        processing_duration = time.time() - processing_start
        status_placeholder.info(f"Total processing time: {processing_duration:.2f} seconds.")

if st.session_state.transcription:
    st.text_area("Transcription", st.session_state.transcription, key="transcription_area_final")
    docx_path = st.session_state.output_docx_file
    # Only offer the download while the generated file is still on disk.
    if docx_path and os.path.exists(docx_path):
        with open(docx_path, "rb") as docx_file:
            st.download_button(
                label="Download Transcription (.docx)",
                data=docx_file,
                file_name=docx_path,
                mime='application/vnd.openxmlformats-officedocument.wordprocessingml.document'
            )