Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from faster_whisper import WhisperModel | |
| import torch | |
| import numpy as np | |
| import os | |
| import wave | |
def model_init():
    """Build the shared faster-whisper model for the best available device.

    Uses the "large-v3" checkpoint. When CUDA is available the model is
    created on the GPU with float16 compute and a short banner is printed;
    otherwise it is created on the CPU with int8 compute.

    Returns:
        The constructed ``WhisperModel`` instance.
    """
    model_size = "large-v3"

    # CPU fallback first so the GPU path reads straight through below.
    if not torch.cuda.is_available():
        return WhisperModel(model_size, device="cpu", compute_type="int8")

    whisper = WhisperModel(model_size, device="cuda", compute_type="float16")
    for line in ("--------------", "Model runs on GPU", "--------------"):
        print(line)
    return whisper
# Build the model once at import time; both transcription callbacks below
# read this module-level `model`.
| model = model_init() | |
| import time | |
def _prepare_chunk(sr, y, target_sr=16000):
    """Convert one microphone chunk to mono float32 audio at ``target_sr`` Hz.

    Args:
        sr: Sample rate of the incoming chunk in Hz.
        y: Raw samples as a numpy array (any numeric dtype).
        target_sr: Sample rate the transcription model is fed with.

    Returns:
        A 1-D float32 array, peak-normalised to [-1, 1] unless the chunk is
        silent or empty, resampled to ``target_sr``.
    """
    samples = np.asarray(y).astype(np.float32)

    # Assumes multi-channel audio arrives as (samples, channels) -- TODO
    # confirm against the Gradio version in use. Downmix to mono.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if samples.size == 0:
        return samples

    # Skip normalisation for an all-zero chunk: dividing by a zero peak
    # would fill the buffer with NaN.
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples /= peak

    # A raw waveform handed to faster-whisper is interpreted as 16 kHz audio,
    # so bring the capture rate down with a simple linear interpolation.
    if sr != target_sr:
        n_out = max(1, int(round(samples.size * target_sr / sr)))
        src_t = np.arange(samples.size) / sr
        dst_t = np.arange(n_out) / target_sr
        samples = np.interp(dst_t, src_t, samples).astype(np.float32)

    return samples


def transcribe_moon(stream, new_chunk):
    """Append a streamed microphone chunk and transcribe the whole buffer.

    Args:
        stream: Audio accumulated by earlier calls (1-D float32 array), or
            ``None`` on the first call.
        new_chunk: ``(sample_rate, samples)`` tuple for the latest chunk, or
            ``None`` when the callback fires without audio.

    Returns:
        A 4-tuple ``(stream, transcript, language_info, execution_time)``:
        the updated buffer followed by three display strings. The strings are
        empty when there is nothing to transcribe.
    """
    # Nothing new to process: keep the buffer and avoid unpacking None.
    if new_chunk is None:
        return stream, "", "", ""

    start_time = time.time()  # Start timing

    sr, y = new_chunk
    chunk = _prepare_chunk(sr, y)
    stream = chunk if stream is None else np.concatenate([stream, chunk])

    if stream.size == 0:
        return stream, "", "", ""

    # The entire accumulated buffer is re-transcribed on every chunk.
    segments, info = model.transcribe(
        stream,
    )

    # Compile the transcript with timestamps
    transcript = "\n".join([f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments])
    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"

    end_time = time.time()  # End timing
    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"
    return stream, transcript, language_info, execution_time
def transcribe(audio_file):
    """Transcribe an uploaded or recorded audio file.

    Args:
        audio_file: Path of the audio file to transcribe. May be ``None`` or
            empty when the callback fires without a file.

    Returns:
        A 3-tuple ``(transcript, language_info, execution_time)`` of display
        strings; all three are empty when no file was supplied.
    """
    # With no input file there is nothing to hand to the model.
    if not audio_file:
        return "", "", ""

    start_time = time.time()  # Start timing

    # Perform the transcription using the specified model and settings
    segments, info = model.transcribe(
        audio_file,
        beam_size=5,
        vad_filter=True,
        vad_parameters={'min_silence_duration_ms': 500},
    )

    # Compile the transcript with timestamps
    transcript = "\n".join([f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments])
    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"

    end_time = time.time()  # End timing
    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"
    return transcript, language_info, execution_time
# "Upload File" tab: accepts an uploaded file or a finished microphone
# recording and passes its path to `transcribe`.
input_audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
file_upload_interface = gr.Interface(
    fn=transcribe,
    inputs=input_audio,
    outputs=["text", "text", "text"],
    live=True,
    title="Speak to Transcript",
    description="Upload an audio file to transcribe and detect the spoken language."
)

# "Live Stream" tab: streams microphone chunks to `transcribe_moon`, with a
# "state" slot carrying the accumulated audio between calls. The labeled
# microphone component is passed in directly rather than building a second,
# unlabeled one.
input_audio_mic = gr.Audio(sources=["microphone"], label="Record Audio", streaming=True)
streaming_interface = gr.Interface(
    fn=transcribe_moon,
    inputs=["state", input_audio_mic],
    outputs=["state", "text", "text", "text"],
    live=True,
)

# Combine both interfaces in a single Gradio app using Tabs
tabbed_interface = gr.TabbedInterface(
    interface_list=[file_upload_interface, streaming_interface],
    tab_names=["Upload File", "Live Stream"]
)
tabbed_interface.launch(debug=True)