import streamlit as st
from transformers import pipeline
from pydub import AudioSegment
import pysrt
import os
| st.title("Speech-to-Text with Transformers") | |
| with st.expander("README"): | |
| st.write("This tool transcribes audio files using Hugging Face Transformers. Upload an audio file, choose your model size, and optionally translate to English. A WebVTT/SRT file will be generated and can be downloaded. This is suitable for use as a subtitle file (e.g., in DaVinci Resolve Import Subtitles).") | |
# Upload audio file
uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])

# Model selection
# Note: For Hugging Face Spaces, larger models might require more resources (GPU).
# "tiny", "base", "small", "medium" are common Whisper sizes.
model_size = st.selectbox(
    "Model Size (select a smaller model for faster inference or limited resources)",
    ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium")
)
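# Note (assumption about available resources): larger checkpoints such as
# "openai/whisper-large-v3" also exist on the Hub, but they are usually too heavy
# for a free CPU Space, which is presumably why they are omitted from this list.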
# Should the output be translated to English?
translate = st.checkbox("Would you like a translation to English?")

# Information about resource usage on Hugging Face Spaces
st.info("When running on Hugging Face Spaces, model inference is limited by the Space's compute resources. Larger models consume more time and memory.")
@st.cache_resource
def load_whisper_pipeline(model_name):
    """
    Loads the Hugging Face Whisper ASR pipeline.
    Uses st.cache_resource to avoid reloading the model on every rerun.
    We explicitly tell the pipeline to return timestamps for long-form audio.
    """
    st.info(f"Loading {model_name} model... This may take a moment.")
    # Set return_timestamps=True to handle audio longer than 30 seconds
    return pipeline("automatic-speech-recognition", model=model_name, return_timestamps=True)
def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
    """
    Transcribes audio using the Hugging Face Transformers pipeline and generates an SRT file.
    """
    try:
        asr_pipeline = load_whisper_pipeline(model_name)
        st.info("Transcribing audio... Please wait.")

        # Configure generation arguments for translation if requested
        generate_kwargs = {}
        if translate_to_english:
            generate_kwargs["task"] = "translate"
        # Pass the audio file path and any generation arguments to the pipeline
        prediction = asr_pipeline(audio_file_path, generate_kwargs=generate_kwargs)
        transcribed_text = prediction["text"]

        st.subheader("Full Transcription Output:")
        st.write(transcribed_text)
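        # With return_timestamps=True, prediction["chunks"] is a list of dicts of the
        # form {"text": " ...", "timestamp": (start_seconds, end_seconds)}; either
        # timestamp can be None, which the fallbacks below account for.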
        srt_content = pysrt.SubRipFile()

        def seconds_to_srt_time(total_seconds):
            """Convert float seconds to pysrt.SubRipTime, e.g. 3723.5 -> 01:02:03,500."""
            hours = int(total_seconds / 3600)
            minutes = int((total_seconds % 3600) / 60)
            seconds = int(total_seconds % 60)
            milliseconds = int((total_seconds - int(total_seconds)) * 1000)
            return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)

        # The 'chunks' key is present because return_timestamps=True was set
        if "chunks" in prediction:
            for i, chunk in enumerate(prediction["chunks"]):
                start_time_seconds = chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0
                # Default to a 1-second cue if the end timestamp is missing
                end_time_seconds = chunk["timestamp"][1] if chunk["timestamp"][1] is not None else start_time_seconds + 1.0
                item = pysrt.SubRipItem(
                    index=i + 1,
                    start=seconds_to_srt_time(start_time_seconds),
                    end=seconds_to_srt_time(end_time_seconds),
                    text=chunk["text"].strip()
                )
                srt_content.append(item)
        else:
            st.warning("Could not retrieve segmented timestamps. Generating a single subtitle entry.")
            # Fallback: create one subtitle entry spanning the whole file so there is
            # still usable output even without timestamps
            audio_duration_seconds = 0.0
            try:
                audio = AudioSegment.from_file(audio_file_path)
                audio_duration_seconds = audio.duration_seconds
            except Exception:
                # Very rough estimate (~0.1 s per character) if pydub cannot read the file
                audio_duration_seconds = len(transcribed_text) * 0.1
            item = pysrt.SubRipItem(
                index=1,
                start=pysrt.SubRipTime(0, 0, 0, 0),
                end=seconds_to_srt_time(audio_duration_seconds),
                text=transcribed_text
            )
            srt_content.append(item)
| srt_file_path = "audio.srt" | |
| srt_content.save(srt_file_path, encoding='utf-8') | |
| st.success("Transcription successful! Download subtitle file?") | |
| with open(srt_file_path, "rb") as f: | |
| st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt") | |
| os.remove(srt_file_path) | |
| except Exception as e: | |
| st.error(f"Error during transcription: {str(e)}") | |
| st.info("Common issues: File format not supported, model loading failed (check Hugging Face Space logs), or audio too large for available memory.") | |
if uploaded_file is not None:
    # Save the upload to a temporary file: the transformers pipeline also accepts
    # bytes or file-like objects, but a real path is the most robust option.
    # Use the original file name's extension so pydub/ffmpeg can identify the format
    # (the MIME type, e.g. "audio/mpeg", does not always map to a valid extension).
    temp_file_name = "temp_audio_file" + os.path.splitext(uploaded_file.name)[1]
    with open(temp_file_name, "wb") as f:
        f.write(uploaded_file.getbuffer())
    try:
        transcribe_with_transformers(temp_file_name, model_size, translate)
    finally:
        # Clean up the temporary file
        if os.path.exists(temp_file_name):
            os.remove(temp_file_name)
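# Deployment note (assumption about the target environment): a Space running this app
# would need a requirements.txt along the lines of streamlit, transformers, torch,
# pydub and pysrt, plus ffmpeg installed on the system for audio decoding.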