# app.py — Whisper speech-to-text Gradio demo (Hugging Face Space)
# (Space viewer chrome removed: commit c207b4a, 1.68 kB)
import gradio as gr
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
import torch
import soundfile as sf
import librosa
# Whisper-base ASR pipeline, created once at import time (downloads the
# model on first run; runs on CPU unless a device is configured elsewhere).
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
def audio_to_text(audio):
    """Transcribe an audio input to text with the Whisper pipeline.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | str
        Either a ``(sample_rate, data)`` tuple as produced by ``gr.Audio``
        (numpy mode), or a path to an audio file.

    Returns
    -------
    str
        The transcribed text, or a fallback message if the pipeline
        returned neither ``"text"`` nor ``"chunks"``.
    """
    if isinstance(audio, tuple):
        # Gradio numpy mode: (sample_rate, samples). The rate is the
        # recording device's native rate (often 44.1/48 kHz), NOT 16 kHz.
        sample_rate, audio_data = audio
    else:
        # File path: let librosa decode and resample to Whisper's 16 kHz.
        audio_data, sample_rate = librosa.load(audio, sr=16000)

    audio_array = np.asarray(audio_data)

    # Fix: Gradio delivers integer PCM (typically int16); normalize to
    # [-1, 1] floats as the pipeline expects, instead of a bare astype
    # that would leave values in the +/-32768 range.
    if np.issubdtype(audio_array.dtype, np.integer):
        audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
    else:
        audio_array = audio_array.astype(np.float32)

    # Down-mix multi-channel audio (shape (samples, channels)) to mono.
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)

    # Fix: pass the true sampling rate alongside the samples so the
    # pipeline resamples internally — a bare array is assumed to already
    # be at the model's 16 kHz, which is wrong for microphone input.
    transcription = pipe1(
        {"sampling_rate": sample_rate, "raw": audio_array},
        return_timestamps=True,
    )

    # The HF ASR pipeline returns {"text": ..., "chunks": [...]} when
    # return_timestamps=True (fix: the key is "chunks", not "segments").
    if "text" in transcription:
        transcription_text = transcription["text"]
    elif "chunks" in transcription:
        transcription_text = " ".join(chunk["text"] for chunk in transcription["chunks"])
    else:
        transcription_text = "No transcription available."

    print("Transcription:", transcription_text)
    return transcription_text
# Gradio UI: audio in (microphone or file upload), transcription text out.
demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
# share=True requests a public tunnel URL (ignored when hosted on Spaces).
demo.launch(share=True)