# EAG_s9_sample / app.py
# (Hugging Face Spaces page residue preserved as comments: author "Rahuluni",
#  commit message "add eng only", commit id 1095508)
# app.py
import os
import tempfile
import uuid
import soundfile as sf
from pathlib import Path
import numpy as np
import logging
import gradio as gr
from transformers import pipeline
# Load the Hugging Face automatic speech recognition pipeline.
# The model "openai/whisper-small" is public and works on CPU (smaller memory footprint).
# Loading may take a few seconds at startup.
ASR_MODEL = "openai/whisper-small"
# Use Whisper's translate task so output is English regardless of input language
asr = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL,
    # Decode long inputs in 30-second chunks.
    chunk_length_s=30,
    # NOTE(review): presumably silences the pipeline's chunking warning — confirm
    # this kwarg is still accepted by the installed transformers version.
    ignore_warning=True,
    generate_kwargs={"task": "translate"},
)
# Debug flag: set True to print audio shapes/dtypes and save resampled temp WAVs
DEBUG = False
logger = logging.getLogger(__name__)
if DEBUG:
    logging.basicConfig(level=logging.DEBUG)
def save_audio_to_wav(audio, sr):
    """
    Persist audio samples to a uniquely named WAV file in the temp directory.

    audio: numpy array (samples,), list/tuple of samples, or a container
           holding an array (e.g. (array, sr) or (sr, array))
    sr: sample rate in Hz
    Returns the path of the written WAV file as a string.
    """
    # Unwrap common container forms by taking the first array-like member.
    if isinstance(audio, (list, tuple)):
        candidate = None
        for item in audio:
            if isinstance(item, (list, tuple, np.ndarray)):
                candidate = item
                break
        if isinstance(candidate, (list, tuple)):
            audio = np.asarray(candidate)
        elif isinstance(candidate, np.ndarray):
            audio = candidate
        else:
            # No array-like member found; fall back to the first element.
            audio = np.asarray(audio[0])
    audio = np.asarray(audio)
    # A (channels, frames) layout (1-2 rows, more columns) is flipped to
    # soundfile's expected (frames, channels) orientation.
    if audio.ndim == 2 and audio.shape[0] <= 2 and audio.shape[1] > audio.shape[0]:
        audio = audio.T
    # Integer PCM is normalized into float32 [-1, 1]; other dtypes just cast.
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio.astype("float32") / float(np.iinfo(audio.dtype).max)
    else:
        audio = audio.astype("float32")
    out_path = Path(tempfile.gettempdir()) / f"hf_audio_{uuid.uuid4().hex}.wav"
    sf.write(str(out_path), audio, sr, format="WAV")
    return str(out_path)
def _extract_samples_and_rate(audio):
    """Pull (samples, sample_rate) out of Gradio's tuple/list audio payload.

    Handles both (samples, sr) and (sr, samples) orderings; otherwise scans the
    container for the first array-like and the first integer member.
    Either element of the returned pair may be None when it cannot be found.
    """
    samples = None
    sr = None
    if isinstance(audio, (list, tuple)):
        if len(audio) >= 2:
            first, second = audio[0], audio[1]
            if isinstance(first, (list, tuple, np.ndarray)):
                samples, sr = first, second
            elif isinstance(second, (list, tuple, np.ndarray)):
                samples, sr = second, first
        if samples is None:
            samples = next((x for x in audio if isinstance(x, (list, tuple, np.ndarray))), None)
            # Accept numpy integer scalars as well as plain ints for the rate.
            sr = next((x for x in audio if isinstance(x, (int, np.integer))), None)
    else:
        samples = audio
    return samples, sr


def _to_mono_float32(audio_array):
    """Collapse 2-D audio to mono and normalize dtype to float32.

    (channels, frames) layouts (1-2 rows, more columns) are transposed first,
    then channels are averaged; integer PCM is scaled into [-1, 1].
    """
    if audio_array.ndim == 2:
        if audio_array.shape[0] <= 2 and audio_array.shape[1] > audio_array.shape[0]:
            audio_array = audio_array.T
        audio_array = np.mean(audio_array, axis=1)
    if np.issubdtype(audio_array.dtype, np.integer):
        maxv = np.iinfo(audio_array.dtype).max
        audio_array = audio_array.astype("float32") / float(maxv)
    else:
        audio_array = audio_array.astype("float32")
    return audio_array


def _resample_linear(audio_array, src_sr, dst_sr):
    """Linearly resample 1-D audio from src_sr to dst_sr via np.interp.

    Returns the resampled float32 array, or None when the target length would
    be non-positive (degenerate input).
    """
    orig_len = audio_array.shape[0]
    new_len = int(round(orig_len * float(dst_sr) / float(src_sr)))
    if new_len <= 0:
        return None
    new_indices = np.linspace(0, orig_len - 1, new_len)
    old_indices = np.arange(orig_len)
    return np.interp(new_indices, old_indices, audio_array).astype("float32")


def _debug_dump(audio_array, sampling_rate):
    """Best-effort debug aid: log array stats and save the exact audio sent to the model."""
    try:
        logger.debug(f"Calling ASR with audio_array.shape={audio_array.shape}, dtype={audio_array.dtype}, sampling_rate={sampling_rate}")
        tmpdir = tempfile.gettempdir()
        dbg_fname = Path(tmpdir) / f"hf_debug_audio_{uuid.uuid4().hex}.wav"
        sf.write(str(dbg_fname), audio_array, sampling_rate, format="WAV")
        logger.debug(f"Wrote debug WAV to {dbg_fname}")
    except Exception as e:
        logger.debug(f"Debug save failed: {e}")


def transcribe(audio):
    """Transcribe/translate audio to English with the module-level Whisper pipeline.

    audio: either a file path string (Gradio sometimes returns a path)
           or a tuple (np_array, sample_rate) from Gradio's audio component
           (either ordering is accepted).
    Returns the transcription text, or a human-readable error message string;
    this function never raises.
    """
    if audio is None:
        return "No audio provided."
    # If Gradio gives a filepath (str), read it with soundfile to avoid ffmpeg requirement
    if isinstance(audio, str):
        try:
            audio_array, sampling_rate = sf.read(audio)
        except Exception as e:
            return f"Could not read audio file: {e}"
    else:
        samples, sampling_rate = _extract_samples_and_rate(audio)
        if samples is None:
            return "Unsupported audio format."
        if sampling_rate is None:
            # Default to 16 kHz when the payload carried no rate.
            sampling_rate = 16000
        audio_array = samples
    try:
        audio_array = np.asarray(audio_array)
    except Exception:
        return "Unsupported audio data - cannot convert to numpy array."
    audio_array = _to_mono_float32(audio_array)
    # The pipeline is called with a bare array (no sampling_rate kwarg), so the
    # data must already be at the model's expected rate; resample if needed.
    try:
        model_sr = getattr(getattr(asr, "feature_extractor", None), "sampling_rate", None)
    except Exception:
        model_sr = None
    if model_sr is None:
        model_sr = 16000
    # if incoming sampling_rate is missing, assume model rate
    if sampling_rate is None:
        sampling_rate = model_sr
    if sampling_rate != model_sr:
        try:
            resampled = _resample_linear(audio_array, sampling_rate, model_sr)
        except Exception as e:
            return f"Transcription failed during resampling: {e}"
        if resampled is None:
            return "Transcription failed: invalid resample length"
        audio_array = resampled
        sampling_rate = model_sr
    # Debug: log and optionally save the resampled audio
    if DEBUG:
        _debug_dump(audio_array, sampling_rate)
    # Use the pipeline to transcribe by passing just the numpy array
    try:
        result = asr(audio_array)
    except Exception as e:
        return f"Transcription failed: {e}"
    text = result.get("text", "").strip()
    if not text:
        return "No speech detected / transcription empty."
    return text
def clear_audio():
    """Return the reset values for (audio widget, transcript box)."""
    return (None, "")
# Build the Gradio UI: two audio inputs (record / upload), transcribe + clear
# buttons, and a transcript textbox with a copy button.
with gr.Blocks(title="Whisper-Small Speech-to-English") as demo:
    gr.Markdown(
        """
# 🎙️ Whisper-Small Speech-to-English
Record or upload audio and click **Transcribe**.
This app uses `openai/whisper-small` in translate mode and returns English text.
"""
    )
    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(type="numpy", label="Record or upload audio")
            upload_input = gr.Audio(type="numpy", label="Or upload an audio file")
            transcribe_btn = gr.Button("Transcribe")
            clear_btn = gr.Button("Clear")
        with gr.Column(scale=1):
            transcript = gr.Textbox(label="Transcription", lines=8)
            copy_btn = gr.Button("Copy transcript")

    # When clicking the transcribe button, prefer recorded audio if present,
    # otherwise use uploaded audio.
    def _get_preferred_audio(recorded, uploaded):
        # recorded or uploaded may be numpy tuples or file paths depending on Gradio.
        # NOTE(review): plain truthiness is used here — if Gradio ever passes a bare
        # numpy array (not a tuple/path), `if recorded:` would raise; confirm the
        # component always yields a (sr, array) tuple or a path.
        if recorded:
            return recorded
        if uploaded:
            return uploaded
        return None

    transcribe_btn.click(
        fn=lambda rec, up: transcribe(_get_preferred_audio(rec, up)),
        inputs=[audio_input, upload_input],
        outputs=transcript,
    )
    # Clear resets both the recorder widget and the transcript textbox.
    clear_btn.click(
        fn=clear_audio,
        inputs=None,
        outputs=[audio_input, transcript],
    )
    # Copy transcript to clipboard (Gradio has `copy` action for buttons)
    # NOTE(review): as wired, this handler is a no-op (returns the text to no
    # output and never touches the clipboard); consider
    # gr.Textbox(show_copy_button=True) instead — verify against the installed
    # Gradio version.
    copy_btn.click(
        fn=lambda txt: txt,
        inputs=transcript,
        outputs=None,
    )
    gr.Markdown(
        "Notes: The app translates spoken audio to English using Whisper (translate task). "
        "Small model runs on CPU and may take time for longer files. For lower latency or other target languages, consider the HF Inference API or additional translation pipelines."
    )
if __name__ == "__main__":
    # Bind to all interfaces (required inside containers); the PORT env var
    # overrides the default 7860.
    # NOTE(review): share=True also requests a public tunnel link — confirm
    # that is intended for this deployment.
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)), share=True)