# transcriber / app.py
# Uploaded by Alex0505 — commit 4d7b12b (verified), "Update app.py"
import os

# Hugging Face token for authenticated downloads of gated models
# (pyannote pipelines require accepting their terms + a valid token).
# NOTE(review): env var name 'Hugging_Face_Token' is unusual casing — confirm
# it matches the Space's configured secret name.
huggingface_token = os.getenv('Hugging_Face_Token')

# Persist the token so downstream hub calls can authenticate implicitly
# (Pipeline.from_pretrained below relies on this via use_auth_token=True).
from huggingface_hub.hf_api import HfFolder
HfFolder.save_token(huggingface_token)

from huggingface_hub import HfApi

# List pyannote audio pipelines visible with this token, keeping only the
# official "pyannote/" namespace. (The original discarded the filtered list;
# keep it so the diagnostic value is actually usable.)
available_pipelines = [p.modelId for p in HfApi().list_models(filter="pyannote-audio-pipeline")]
available_pipelines = [p for p in available_pipelines if p.startswith("pyannote/")]

from pyannote.audio import Pipeline

# Speaker-diarization pipeline; use_auth_token=True reads the token saved above.
pipeline_diar = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=True)

import torch
import torchaudio
import base64
from io import BytesIO
from pyannote.core import Annotation
from pydub import AudioSegment
from transformers import pipeline

# Whisper large-v2 ASR pipeline used to transcribe each diarized segment.
transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=False)

import gradio as gr
def transcribe_recording(base64_encoded_audio):
    """
    Transcribe base64-encoded audio, attributing text to speakers.

    The payload is decoded, diarized with the pyannote pipeline, and each
    speaker turn is transcribed individually with Whisper.

    Parameters:
    - base64_encoded_audio (str): Base64 encoded byte string of audio data.

    Returns:
    - list[dict]: one entry per speaker turn, with keys
      "Start Time", "End Time", "Speaker", "Transcription".

    Raises:
    - TypeError: if the diarization pipeline does not return an Annotation.
    """
    # Decode the base64 payload into raw audio bytes.
    base64_decoded_audio = base64.b64decode(base64_encoded_audio)
    with BytesIO(base64_decoded_audio) as audio_buffer:
        # Load with pydub so we can slice by milliseconds below.
        audio = AudioSegment.from_file(audio_buffer)
        # The diarization pipeline expects a file path, so export the full
        # recording first.
        audio.export("out.mp3", format="mp3")
        dia = pipeline_diar("out.mp3")
        if not isinstance(dia, Annotation):
            # Explicit check instead of `assert` (asserts are stripped under -O).
            raise TypeError(f"expected Annotation, got {type(dia).__name__}")
        data = []
        for speech_turn, track, speaker in dia.itertracks(yield_label=True):
            start_time, end_time = speech_turn.start, speech_turn.end
            # Slice this turn (pydub indexes in milliseconds) and transcribe it.
            # NOTE: segments reuse "out.mp3"; safe here because diarization of
            # the full file has already completed.
            segment_audio = audio[int(start_time * 1000):int(end_time * 1000)]
            segment_audio.export("out.mp3", format="mp3")
            text = transcriber("out.mp3")['text']
            data.append({
                "Start Time": start_time,
                "End Time": end_time,
                "Speaker": speaker,
                "Transcription": text,
            })
        return data
# Minimal Gradio UI: paste a base64-encoded audio string in, get the
# per-speaker transcription data back as text.
# Named `demo` rather than correcting the original typo `tanscriber` to
# `transcriber`, which would shadow the module-level Whisper pipeline that
# transcribe_recording still needs at call time. The Interface is kept in a
# variable and launched separately instead of storing the (useless) return
# value of .launch().
demo = gr.Interface(
    fn=transcribe_recording,
    inputs="text",
    outputs="text",
    title="whisper transcriber",
)
demo.launch()