# NeoScribe / app.py
# EGuihaire's picture
# Add app.py
# e081a72 verified
import gradio as gr
import numpy as np
from transformers import pipeline
# Hugging Face checkpoint loaded by the speech-recognition pipeline below.
MODEL_ID = "openai/whisper-tiny"

# Sample rate (Hz) that incoming audio is resampled to before transcription.
TARGET_SR = 16000

# Shared ASR pipeline, constructed once when the module is loaded.
asr = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_ID,
    device=-1,  # -1 means CPU per the transformers pipeline docs
    chunk_length_s=10,
)
def to_mono(audio_tuple):
    """Convert a ``(sample_rate, samples)`` tuple to mono float32 audio.

    Integer samples are divided by the maximum value of their dtype, so the
    result lies roughly in ``[-1.0, 1.0]``. Floating-point samples are only
    cast to ``float32``. Two-dimensional input is down-mixed by averaging
    along axis 1.

    Args:
        audio_tuple: ``(sample_rate, samples)`` pair, or ``None``.

    Returns:
        ``(sample_rate, mono_samples)`` with ``mono_samples`` as a float32
        array, or ``(None, None)`` when ``audio_tuple`` or its sample array
        is ``None``.
    """
    if audio_tuple is None:
        return None, None

    sr, data = audio_tuple
    if data is None:
        return None, None

    data = np.asarray(data)

    # Capture the integer scale BEFORE down-mixing: ``mean`` promotes integer
    # input to float64, so checking the dtype afterwards would wrongly skip
    # normalisation for multi-channel integer audio.
    scale = None
    if np.issubdtype(data.dtype, np.integer):
        scale = np.float32(np.iinfo(data.dtype).max)

    if data.ndim == 2:
        data = data.mean(axis=1)

    data = data.astype(np.float32)
    if scale is not None:
        data = data / scale

    return sr, data
def linear_resample(audio, orig_sr, target_sr=TARGET_SR):
    """Resample 1-D audio from ``orig_sr`` to ``target_sr`` by linear interpolation.

    No low-pass filtering is applied; samples are interpolated with
    ``np.interp`` on evenly spaced time grids covering the clip duration.

    Args:
        audio: 1-D sequence of samples.
        orig_sr: Sample rate of ``audio``.
        target_sr: Desired sample rate.

    Returns:
        ``audio`` itself (unchanged, not copied) when the two rates are
        equal; otherwise a new float32 array of
        ``floor(len(audio) * target_sr / orig_sr)`` samples. Empty input, or
        input too short to yield a single output sample, gives an empty
        float32 array.
    """
    if orig_sr == target_sr:
        return audio

    n_in = len(audio)
    # Floor division on the sample counts avoids the float round-off of
    # ``int(duration * target_sr)``, which can land just below a whole number
    # and silently drop one output sample.
    n_out = int(n_in * target_sr // orig_sr)
    if n_in == 0 or n_out == 0:
        # np.interp raises ValueError when the source grid is empty.
        return np.zeros(0, dtype=np.float32)

    duration = n_in / orig_sr
    old_times = np.linspace(0, duration, num=n_in, endpoint=False)
    new_times = np.linspace(0, duration, num=n_out, endpoint=False)
    return np.interp(new_times, old_times, audio).astype(np.float32)
def run_asr(audio_np, sr):
    """Transcribe a waveform with the module-level ``asr`` pipeline.

    Args:
        audio_np: Sample array, or ``None``.
        sr: Sample rate of ``audio_np``.

    Returns:
        The stripped transcription text. An empty string is returned without
        calling the model when ``audio_np`` is ``None`` or holds fewer than
        ``sr * 0.4`` samples.
    """
    if audio_np is None:
        return ""
    if len(audio_np) < sr * 0.4:
        return ""

    output = asr({"sampling_rate": sr, "raw": audio_np})
    # Dict results carry the transcript under "text"; anything else is
    # stringified as-is.
    text = output.get("text", "") if isinstance(output, dict) else str(output)
    return text.strip()
def stream_transcribe(audio, state):
    """Handle one streamed audio chunk and refresh the live transcript.

    The chunk is converted to mono, resampled to ``TARGET_SR`` and appended
    to ``state["buffer"]`` (capped at the most recent 20 s). The last 8 s of
    the buffer are then transcribed to produce the partial text.

    Args:
        audio: ``(sample_rate, samples)`` tuple for the new chunk, or ``None``.
        state: Dict with ``"buffer"``, ``"stable"`` and ``"partial"`` keys,
            or ``None`` to start from an empty state. Mutated in place.

    Returns:
        ``(state, partial, live)`` where ``live`` is the stable transcript
        followed by the current partial text.
    """
    if state is None:
        state = {"buffer": np.zeros(0, dtype=np.float32), "stable": "", "partial": ""}

    sr, chunk = to_mono(audio)
    if chunk is None or len(chunk) == 0:
        # Nothing new to transcribe. Re-emit the same texts the normal path
        # would show, so the live box does not lose the partial text on an
        # empty tick (and an empty chunk never reaches the resampler).
        partial = state.get("partial", "")
        live = (state.get("stable", "") + " " + partial).strip()
        return state, partial, live

    chunk = linear_resample(chunk, sr, TARGET_SR)

    # Keep at most the latest 20 s of audio; slicing is a no-op when shorter.
    max_samples = TARGET_SR * 20
    buffer = np.concatenate([state["buffer"], chunk])[-max_samples:]
    state["buffer"] = buffer

    # NOTE(review): only the trailing 8 s window is transcribed and
    # state["stable"] is never advanced here, so speech older than that
    # window is not reflected in the returned text until a commit strategy
    # is added.
    preview_samples = TARGET_SR * 8
    partial = run_asr(buffer[-preview_samples:], TARGET_SR)
    state["partial"] = partial

    live = (state["stable"] + " " + partial).strip()
    return state, partial, live
def finalize(state):
    """Commit the pending partial text to the stable transcript.

    Args:
        state: Dict with ``"buffer"``, ``"stable"`` and ``"partial"`` keys,
            or ``None`` to start from an empty state. Mutated in place.

    Returns:
        ``(state, "", stable)``: the updated state (partial cleared, buffer
        emptied), an empty partial text, and the merged stable transcript.
    """
    if state is None:
        state = {"buffer": np.zeros(0, dtype=np.float32), "stable": "", "partial": ""}

    committed = state.get("stable", "").strip()
    pending = state.get("partial", "").strip()
    # Join only the non-empty pieces so no stray space is introduced.
    merged = " ".join(piece for piece in (committed, pending) if piece)

    state.update(
        stable=merged,
        partial="",
        buffer=np.zeros(0, dtype=np.float32),
    )
    return state, "", merged
def clear():
    """Build the reset values for the audio input, state and both text boxes.

    Returns:
        ``(None, state, "", "")`` where ``state`` is a fresh dict holding an
        empty float32 buffer and empty ``"stable"`` / ``"partial"`` strings.
    """
    empty_buffer = np.zeros(0, dtype=np.float32)
    fresh_state = dict(buffer=empty_buffer, stable="", partial="")
    return (None, fresh_state, "", "")
# Gradio UI: a streaming microphone input wired to the transcription
# callbacks, two text boxes for the partial and accumulated text, and a
# Clear button.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# NeoScribe (pseudo-live)
Stream audio from the browser microphone and transcribe in near real time.
Next step: send audio chunks from your browser extension to this backend.
"""
    )

    # Initial transcription state handed to the callbacks.
    state = gr.State({"buffer": np.zeros(0, dtype=np.float32), "stable": "", "partial": ""})
    audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy", label="Input audio")
    partial = gr.Textbox(label="Partial text", lines=3)
    final = gr.Textbox(label="Stable transcript", lines=10)
    clear_btn = gr.Button("Clear")

    # Each streamed chunk updates the state and both text boxes.
    # NOTE(review): stream_every / time_limit are presumably seconds, per
    # Gradio's streaming API; confirm against the installed Gradio version.
    audio.stream(
        stream_transcribe,
        inputs=[audio, state],
        outputs=[state, partial, final],
        stream_every=0.8,
        time_limit=120,
    )
    # Stopping the recording folds the pending partial text into the
    # stable transcript.
    audio.stop_recording(finalize, inputs=[state], outputs=[state, partial, final])
    clear_btn.click(clear, outputs=[audio, state, partial, final])

# Launch only when executed as a script, so importing this module (tests,
# tooling) does not start a web server as a side effect.
if __name__ == "__main__":
    demo.launch()