File size: 1,679 Bytes
d7d44ed
 
 
 
 
7954582
 
d7d44ed
c207b4a
d7d44ed
c39770b
4a6a287
 
 
 
 
 
 
359777d
 
1e150f3
5bed895
4a6a287
359777d
4a6a287
 
c8b8068
359777d
4a6a287
5bed895
4a6a287
 
5bed895
4a6a287
5bed895
 
 
 
 
359777d
 
 
7ed8f5e
c8b8068
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import gradio as gr
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
import torch
import soundfile as sf
import librosa

# Module-level ASR pipeline (loaded once at import time).
# Whisper expects 16 kHz mono float32 audio when fed raw numpy arrays.
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")

def audio_to_text(audio):
    """Transcribe a Gradio audio input to text with the Whisper pipeline.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | str
        Either ``(sample_rate, samples)`` as delivered by ``gr.Audio`` in
        numpy mode, or a path to an audio file.

    Returns
    -------
    str
        The transcribed text, or a fallback message when the pipeline
        output contains no usable text.
    """
    if isinstance(audio, tuple):
        sample_rate, audio_data = audio
        audio_data = np.asarray(audio_data)

        # Gradio delivers int16 PCM; normalize integer samples to
        # float32 in [-1, 1], the range the model expects.
        if np.issubdtype(audio_data.dtype, np.integer):
            audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
        else:
            audio_data = audio_data.astype(np.float32)

        # Downmix multi-channel audio to mono before resampling.
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)

        # BUG FIX: the original ignored sample_rate for tuple input, so
        # non-16kHz recordings were transcribed at the wrong speed.
        # Whisper assumes 16 kHz when given a bare array.
        if sample_rate != 16000:
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    else:
        # File path: librosa loads as mono float32 and resamples to 16 kHz.
        audio_data, sample_rate = librosa.load(audio, sr=16000)
        audio_data = audio_data.astype(np.float32)

    # Transcribe; return_timestamps=True lets Whisper handle clips >30 s.
    transcription = pipe1(audio_data, return_timestamps=True)

    # BUG FIX: the HF ASR pipeline returns "text" and (with timestamps)
    # "chunks" — never "segments", so that branch was unreachable.
    if "text" in transcription:
        transcription_text = transcription["text"]
    elif "chunks" in transcription:
        transcription_text = " ".join(chunk["text"] for chunk in transcription["chunks"])
    else:
        transcription_text = "No transcription available."

    print("Transcription:", transcription_text)
    return transcription_text

# Build the UI unconditionally so `demo` is importable, but only launch
# (and open a public share link) when run as a script — importing this
# module should not start a server.
demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")

if __name__ == "__main__":
    demo.launch(share=True)