Spaces:

EGuihaire
/

NeoScribe

Sleeping

App Files Files Community

NeoScribe / app.py

EGuihaire

Add app.py

e081a72 verified 2 months ago

raw

history blame contribute delete

3.65 kB

	import gradio as gr
	import numpy as np
	from transformers import pipeline

	# Hugging Face model identifier handed to the pipeline constructed below.
	MODEL_ID = "openai/whisper-tiny"

	# Speech-recognition pipeline, built once at import time and called by run_asr().
	# NOTE(review): device=-1 presumably selects CPU and chunk_length_s=10 presumably
	# enables 10-second chunked inference -- confirm against the installed
	# transformers version.
	asr = pipeline(
	"automatic-speech-recognition",
	model=MODEL_ID,
	chunk_length_s=10,
	device=-1,
	)

	# Sample rate (Hz) that every incoming chunk is resampled to before transcription.
	TARGET_SR = 16000


	def to_mono(audio_tuple):
	    """Convert a ``(sample_rate, samples)`` pair to mono float32 audio.

	    Args:
	        audio_tuple: ``(sr, data)`` pair, or ``None``. ``data`` may be 1-D,
	            or 2-D, in which case axis 1 is averaged away. Integer data is
	            scaled by the maximum value of its dtype; other data is only
	            cast to float32.

	    Returns:
	        ``(sr, samples)`` where ``samples`` is a float32 array, or
	        ``(None, None)`` when the pair or its data is ``None``.
	    """
	    if audio_tuple is None:
	        return None, None
	    sr, data = audio_tuple
	    if data is None:
	        return None, None
	    data = np.asarray(data)

	    # Decide on integer scaling BEFORE down-mixing: mean() promotes integer
	    # input to float64, so a dtype check made afterwards would silently skip
	    # normalisation for multi-channel integer audio.
	    scale = None
	    if np.issubdtype(data.dtype, np.integer):
	        scale = np.iinfo(data.dtype).max

	    if data.ndim == 2:
	        data = data.mean(axis=1)

	    data = data.astype(np.float32)
	    if scale is not None:
	        data = data / scale
	    return sr, data


	def linear_resample(audio, orig_sr, target_sr=TARGET_SR):
	    """Resample 1-D audio from ``orig_sr`` to ``target_sr`` by linear interpolation.

	    No filtering is applied; samples are interpolated on a uniform time grid.

	    Args:
	        audio: 1-D sequence of samples.
	        orig_sr: Sample rate of ``audio``.
	        target_sr: Desired sample rate.

	    Returns:
	        ``audio`` unchanged when the rates already match; otherwise a new
	        float32 array (empty when ``audio`` is empty).
	    """
	    if orig_sr == target_sr:
	        return audio

	    n_in = len(audio)
	    if n_in == 0:
	        # np.interp raises ValueError on an empty sample grid, and there is
	        # nothing to resample anyway.
	        return np.zeros(0, dtype=np.float32)

	    duration = n_in / orig_sr
	    # Integer arithmetic: int(duration * target_sr) can land one sample short
	    # when the float product falls just below a whole number.
	    n_out = int(n_in * target_sr // orig_sr)

	    old_times = np.linspace(0, duration, num=n_in, endpoint=False)
	    new_times = np.linspace(0, duration, num=n_out, endpoint=False)
	    return np.interp(new_times, old_times, audio).astype(np.float32)


	def run_asr(audio_np, sr):
	    """Transcribe ``audio_np`` with the module-level ``asr`` pipeline.

	    Args:
	        audio_np: Sequence of samples, or ``None``.
	        sr: Sample rate of ``audio_np``.

	    Returns:
	        The stripped transcript, or ``""`` when there is no audio or the
	        clip holds fewer than ``sr * 0.4`` samples.
	    """
	    if audio_np is None:
	        return ""
	    # Skip clips shorter than 0.4 seconds.
	    if len(audio_np) < sr * 0.4:
	        return ""

	    output = asr({"sampling_rate": sr, "raw": audio_np})
	    # Fall back to the string form when the pipeline does not return a dict.
	    text = output.get("text", "") if isinstance(output, dict) else str(output)
	    return text.strip()


	def stream_transcribe(audio, state):
	    """Fold one streamed audio chunk into the session state and re-transcribe.

	    The chunk is down-mixed, resampled to ``TARGET_SR`` and appended to the
	    rolling buffer, which is capped at ``TARGET_SR * 20`` samples. Only the
	    trailing ``TARGET_SR * 8`` samples are transcribed.

	    Args:
	        audio: ``(sample_rate, samples)`` pair, or ``None``.
	        state: Session dict with ``"buffer"``, ``"stable"`` and ``"partial"``
	            keys, or ``None`` to start from an empty state.

	    Returns:
	        ``(state, partial, live)`` where ``live`` is the stable text followed
	        by the new partial text. When the chunk carries no audio, the stored
	        partial and stable texts are returned unchanged instead.
	    """
	    if state is None:
	        state = {"buffer": np.zeros(0, dtype=np.float32), "stable": "", "partial": ""}

	    sample_rate, samples = to_mono(audio)
	    if samples is None:
	        return state, state.get("partial", ""), state.get("stable", "")

	    history_limit = TARGET_SR * 20
	    window = TARGET_SR * 8

	    resampled = linear_resample(samples, sample_rate, TARGET_SR)
	    combined = np.concatenate([state["buffer"], resampled])
	    if len(combined) > history_limit:
	        # Keep only the most recent samples.
	        combined = combined[-history_limit:]
	    state["buffer"] = combined

	    text = run_asr(combined[-window:], TARGET_SR)
	    state["partial"] = text
	    return state, text, (state["stable"] + " " + text).strip()


	def finalize(state):
	    """Commit the pending partial text into the stable transcript.

	    Args:
	        state: Session dict with ``"buffer"``, ``"stable"`` and ``"partial"``
	            keys, or ``None`` to start from an empty state.

	    Returns:
	        ``(state, "", stable)``: the state with its partial text cleared and
	        buffer emptied, an empty partial string, and the updated transcript.
	    """
	    if state is None:
	        state = {"buffer": np.zeros(0, dtype=np.float32), "stable": "", "partial": ""}

	    committed = state.get("stable", "").strip()
	    pending = state.get("partial", "").strip()
	    # Both pieces are already stripped, so joining the non-empty ones with a
	    # single space gives the combined transcript.
	    transcript = " ".join(piece for piece in (committed, pending) if piece)

	    state.update(
	        stable=transcript,
	        partial="",
	        buffer=np.zeros(0, dtype=np.float32),
	    )
	    return state, "", transcript


	def clear():
	    """Return values that reset the audio input, session state and both text boxes.

	    Returns:
	        ``(None, state, "", "")`` where ``state`` is a fresh dict holding an
	        empty float32 buffer and empty stable/partial strings.
	    """
	    empty_buffer = np.zeros(0, dtype=np.float32)
	    fresh_state = {"buffer": empty_buffer, "stable": "", "partial": ""}
	    return None, fresh_state, "", ""


	# Gradio UI: a streaming microphone input feeding two text boxes, plus a
	# button that resets everything.
	with gr.Blocks() as demo:
	    # The Markdown text is kept flush with the block margin so the string
	    # passed to gr.Markdown is unchanged.
	    gr.Markdown(
	        """
	# NeoScribe (pseudo-live)
	Stream audio from the browser microphone and transcribe in near real time.

	Next step: send audio chunks from your browser extension to this backend.
	"""
	    )

	    # Session state: rolling audio buffer plus stable and partial transcripts.
	    state = gr.State(
	        {"buffer": np.zeros(0, dtype=np.float32), "stable": "", "partial": ""}
	    )

	    audio = gr.Audio(
	        sources=["microphone"],
	        streaming=True,
	        type="numpy",
	        label="Input audio",
	    )
	    partial = gr.Textbox(label="Partial text", lines=3)
	    final = gr.Textbox(label="Stable transcript", lines=10)

	    clear_btn = gr.Button("Clear")

	    # Re-transcribe as audio streams in.
	    audio.stream(
	        fn=stream_transcribe,
	        inputs=[audio, state],
	        outputs=[state, partial, final],
	        stream_every=0.8,
	        time_limit=120,
	    )

	    # Fold the last partial text into the stable transcript when recording stops.
	    audio.stop_recording(
	        fn=finalize,
	        inputs=[state],
	        outputs=[state, partial, final],
	    )

	    clear_btn.click(
	        fn=clear,
	        outputs=[audio, state, partial, final],
	    )


	demo.launch()