Spaces:
Build error
```python
import os

# Read the Hugging Face token from the environment (on Spaces, store it as a
# repository secret so it is exposed to the app as an environment variable)
huggingface_token = os.getenv("Hugging_Face_Token")

# Persist the token so downstream Hub calls are authenticated
from huggingface_hub.hf_api import HfFolder
HfFolder.save_token(huggingface_token)

# List the pyannote audio pipelines available on the Hub
from huggingface_hub import HfApi
available_pipelines = [p.modelId for p in HfApi().list_models(filter="pyannote-audio-pipeline")]
print(list(filter(lambda p: p.startswith("pyannote/"), available_pipelines)))

# Load the gated speaker-diarization pipeline; this requires having accepted
# the model's user agreement and a valid token
from pyannote.audio import Pipeline
pipeline_diar = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=huggingface_token,
)
```
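If the Space runs on GPU hardware, the diarization pipeline can be moved onto it explicitly; a minimal sketch (the CPU default above also works):

```python
import torch

# Optional: run diarization on the GPU when one is available (sketch)
if torch.cuda.is_available():
    pipeline_diar.to(torch.device("cuda"))
```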
```python
import base64
from io import BytesIO

from pyannote.core import Annotation
from pydub import AudioSegment
from transformers import pipeline
import gradio as gr

# Whisper ASR pipeline used to transcribe each diarized speaker turn
transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v2",
    return_timestamps=False,
)
```
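As a quick sanity check outside the app, the ASR pipeline can be called directly on a file; `sample.wav` here is a hypothetical local path:

```python
# Standalone check of the transcriber (sample.wav is a hypothetical file)
print(transcriber("sample.wav")["text"])
```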
```python
def transcribe_recording(base64_encoded_audio):
    """
    Transcribe base64-encoded audio, one diarized speaker turn at a time.

    Parameters:
    - base64_encoded_audio (str): Base64-encoded byte string of audio data.

    Returns:
    - list[dict]: One entry per speaker turn, with start/end times, the
      speaker label, and the transcribed text.
    """
    # Decode the base64-encoded string into raw audio bytes
    base64_decoded_audio = base64.b64decode(base64_encoded_audio)
    with BytesIO(base64_decoded_audio) as audio_buffer:
        audio_buffer.seek(0)
        # Load audio with pydub for easy slicing
        audio = AudioSegment.from_file(audio_buffer)
    audio.export("out.mp3", format="mp3")

    # Run the diarization pipeline on the full recording
    dia = pipeline_diar("out.mp3")
    assert isinstance(dia, Annotation)

    data = []
    for speech_turn, _track, speaker in dia.itertracks(yield_label=True):
        start_time, end_time = speech_turn.start, speech_turn.end
        # Slice out this speaker turn (pydub indexes in milliseconds) and
        # export it to a separate file so "out.mp3" is not overwritten
        segment_audio = audio[int(start_time * 1000):int(end_time * 1000)]
        segment_audio.export("segment.mp3", format="mp3")
        text = transcriber("segment.mp3")["text"]
        data.append({
            "Start Time": start_time,
            "End Time": end_time,
            "Speaker": speaker,
            "Transcription": text,
        })
    return data
```
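For local testing without the UI, the function can be driven by base64-encoding a file first; `meeting.wav` is a hypothetical example path:

```python
# Smoke test (meeting.wav is a hypothetical example file)
with open("meeting.wav", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

for turn in transcribe_recording(encoded):
    print(f"[{turn['Start Time']:.1f}s-{turn['End Time']:.1f}s] "
          f"{turn['Speaker']}: {turn['Transcription']}")
```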
```python
# Gradio UI: paste base64-encoded audio in, get the diarized transcript out
demo = gr.Interface(
    fn=transcribe_recording,
    inputs="text",
    outputs="json",  # the function returns a list of dicts
    title="Whisper transcriber",
)
demo.launch()
```
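Since the thread title mentions a build error, it is also worth checking the Space's dependency files. A sketch of what this app plausibly needs, with package names taken from the imports above (exact versions are assumptions; pin them to match your build logs):

```text
# requirements.txt (sketch)
pyannote.audio>=3.1
transformers
torch
torchaudio
pydub
gradio

# packages.txt (apt-level dependency; pydub's mp3 export and the
# transformers audio loader both rely on ffmpeg)
ffmpeg
```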