Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import time | |
| import os | |
| import zipfile | |
| import torch | |
| import librosa | |
| import soundfile as sf | |
| from transformers import pipeline | |
| from typing import List, Tuple, Generator | |
| import datetime | |
| from pydub import AudioSegment | |
# Initial model name (the dropdown in the UI may select a different one per run).
MODEL_NAME = "primeline/whisper-tiny-german-1224"
# Preload an ASR pipeline at import time so the app starts with a working model;
# the submit callback rebinds this global with the user-selected model.
speech_to_text = pipeline("automatic-speech-recognition", model=MODEL_NAME)
# Initial status message shown in the progress Markdown component.
STANDARD_OUTPUT_TEXT = "**Status:**<br>"
def get_file_creation_date(file_path: str) -> str:
    """Return a file's creation timestamp as a human-readable string.

    Args:
        file_path (str): Path to the file on disk.

    Returns:
        str: Timestamp formatted as ``YYYY-MM-DD HH:MM:SS``, or the
        string ``"File not found."`` when the path does not exist.

    Note:
        ``st_ctime`` is the inode-change time on Unix; it matches the
        true creation time only on Windows.
    """
    try:
        stamp = os.stat(file_path).st_ctime
        return datetime.datetime.fromtimestamp(stamp).strftime("%Y-%m-%d %H:%M:%S")
    except FileNotFoundError:
        return "File not found."
def load_model(model_name: str):
    """Build a fresh automatic-speech-recognition pipeline.

    Args:
        model_name (str): Hugging Face model identifier to load.

    Returns:
        pipeline: A ``transformers`` ASR pipeline backed by the model.
    """
    asr_pipeline = pipeline("automatic-speech-recognition", model=model_name)
    return asr_pipeline
def convert_to_wav(file_path: str) -> str:
    """Convert an audio file to WAV when librosa cannot read it directly.

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to a sibling ``.wav`` file for ``.m4a``/``.aac`` inputs
        (matched case-insensitively), otherwise the original path unchanged.
    """
    # Tuple form of endswith + lower() also catches ".M4A"/".AAC" uploads,
    # which the original two equality-style checks missed.
    if file_path.lower().endswith((".m4a", ".aac")):
        audio = AudioSegment.from_file(file_path)
        wav_path = os.path.splitext(file_path)[0] + ".wav"
        audio.export(wav_path, format="wav")
        return wav_path
    return file_path
def preprocess_audio(file_path: str) -> str:
    """Resample an audio file to 16 kHz and save it as WAV for the ASR model.

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the resampled ``*_processed.wav`` file.
    """
    file_path = convert_to_wav(file_path)  # .m4a/.aac -> .wav first
    # Whisper checkpoints expect 16 kHz input; librosa resamples on load.
    y, sr = librosa.load(file_path, sr=16000)
    # Derive the output name from the stem. The original chained
    # str.replace() calls turned "song.mp3" into "song_processed_processed.wav"
    # (the first replace produced "_processed.wav", the second matched it again).
    processed_path = os.path.splitext(file_path)[0] + "_processed.wav"
    sf.write(processed_path, y, sr)  # Save the resampled audio
    return processed_path
def process_files_with_live_updates(
    files: List[gr.File],
    model_option: str,
    output_format: str
) -> Generator[Tuple[str, List[str]], None, None]:
    """Transcribe uploaded audio files sequentially with live progress updates.

    Args:
        files (List[gr.File]): Files uploaded through the Gradio UI.
        model_option (str): Hugging Face model identifier from the dropdown.
        output_format (str): Selected output format option (display only).

    Yields:
        Tuple[str, List[str]]: Progress Markdown and the list of generated
        output file paths (transcription .txt files, plus a final .zip).
    """
    global speech_to_text
    speech_to_text = load_model(model_option)

    # Guard the empty upload: the original divided by len(files) and read
    # the loop variable `idx` after the loop, raising ZeroDivisionError /
    # NameError when no files were provided.
    if not files:
        yield "**Status:** No files uploaded.<br>", []
        return

    file_details = []
    total_files = len(files)
    output_files = []

    # Folder that temporarily stores the generated output files
    output_dir = "output_files"
    os.makedirs(output_dir, exist_ok=True)

    for idx, file in enumerate(files):
        # Preprocess (convert + resample) the audio file
        preprocessed_path = preprocess_audio(file.name)
        # Transcribe with timestamp support so long audio is chunked correctly
        transcription_result = speech_to_text(preprocessed_path, return_timestamps=True)
        transcription = transcription_result["text"]

        # Save the transcription next to the other outputs; os.path.basename
        # is portable, unlike the original split('/') (breaks on Windows paths).
        base_name = os.path.basename(file.name)
        txt_filename = os.path.join(
            output_dir, f"transcription_{os.path.splitext(base_name)[0]}.txt"
        )
        with open(txt_filename, "w", encoding="utf-8") as txt_file:
            txt_file.write(transcription)
        output_files.append(txt_filename)

        detail = (
            f"**File Name**: {base_name}<br>"
            # The original passed the gr.File object itself to os.stat(),
            # which raised TypeError instead of returning a date.
            f"**File Date**: {get_file_creation_date(file.name)}<br>"
            f"**Options**: {model_option} - {output_format}<br>"
            f"**Transcription**: {transcription}<br><br>"
        )
        file_details.append(detail)

        # Yield a progress update after each finished file
        progress = int(((idx + 1) / total_files) * 100)
        yield (
            f"**Status: {progress}%**<br>" + "".join(file_details),
            output_files,
        )

    # Bundle all transcriptions into a zip archive for convenient download
    zip_filename = os.path.join(output_dir, "output_files.zip")
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for file_path in output_files:
            zipf.write(file_path, os.path.basename(file_path))
    output_files.append(zip_filename)

    # Final yield: all files processed
    yield (
        "**Status: 100%**<br>" + "".join(file_details),
        output_files,
    )
# Gradio app layout: file input + model/format dropdowns, action buttons,
# a live-updating Markdown progress area, and downloadable output files.
with gr.Blocks() as demo:
    # Title and Description
    gr.Markdown("# Speech-to-Text Batch Processor (German)")
    gr.Markdown(
        """
Upload multiple audio files (.wav, .mp3, .m4a, .aac), select desired processing options (i.e. the model), and view real-time updates as files are transcribed.
The application uses advanced AI models for sequential speech-to-text translation.
"""
    )
    # Input section
    with gr.Row():
        with gr.Column():
            file_input = gr.Files(file_types=[".wav", ".mp3", ".m4a", ".aac"], label="Upload your audio files")
        with gr.Column():
            # NOTE(review): default is the large model while the module-level
            # pipeline preloads the tiny one; the selected model is reloaded
            # on every submit, so the preload is only a warm-up.
            model_dropdown = gr.Dropdown(
                choices=[
                    "primeline/whisper-large-v3-german",
                    "primeline/whisper-tiny-german-1224",
                    "primeline/whisper-tiny-german"
                ],
                label="Select Model",
                value="primeline/whisper-large-v3-german",
            )
            # Single-choice placeholder: only plain-text output is implemented.
            dropdown_2 = gr.Dropdown(
                choices=["Format: Plain Text"],
                label="Select Output Format",
                value="Format: Plain Text",
            )
    # Buttons
    with gr.Row():
        submit_button = gr.Button("Start Transcription")
        clear_button = gr.Button("Clear")
    # Output section
    output_md = gr.Markdown(label="Transcription Progress", value=STANDARD_OUTPUT_TEXT)
    output_files = gr.Files(label="Generated Output Files")
    # Button actions
    # Generator callback: each yield refreshes the Markdown and the file list.
    submit_button.click(
        process_files_with_live_updates,
        inputs=[file_input, model_dropdown, dropdown_2],
        outputs=[output_md, output_files],
    )
    # Reset every input/output component back to its initial value.
    clear_button.click(
        lambda: (None, "primeline/whisper-large-v3-german", "Format: Plain Text", STANDARD_OUTPUT_TEXT, None),
        inputs=[],  # No inputs
        outputs=[file_input, model_dropdown, dropdown_2, output_md, output_files],
    )
    gr.Image("Fraunhofer-IPA-Logo.jpg", show_label=False)
    # Centered Footer with Logo and Licensing Text
    with gr.Row():
        gr.Markdown(
            """
**Fraunhofer IPA**
This application is provided under a basic licensing agreement for non-commercial use only.
For inquiries, visit [Fraunhofer IPA](https://www.ipa.fraunhofer.de).
""",
            elem_id="footer-markdown",
        )
# CSS to center the footer content
demo.css = """
#footer-markdown {
    text-align: center;
    margin-top: 20px;
    padding-top: 10px;
    border-top: 1px solid #ccc;
}
"""
# Launch app
demo.launch()