# Source note (from the hosting page, kept as comments so the file parses):
#   gnumanth's picture
#   FastAPI + WebSocket streaming with newspaper UI
#   ea31d8c verified
"""
FastAPI + WebSocket backend for real-time speech transcription.
Uses NeMo ASR model directly (no Triton required).
"""
import asyncio
import json
import uuid
import sys
from pathlib import Path
from typing import Optional, AsyncIterator
from datetime import datetime
import numpy as np
import torch
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from loguru import logger
# Configure logging
# Replace loguru's default handler with a compact, colorized stderr format.
logger.remove()
logger.add(
sys.stderr,
format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
level="INFO",
)
# Global model
# ASR_MODEL is populated by load_model() on app startup; None until then
# (endpoints check for None and degrade gracefully).
ASR_MODEL = None
# Preferred torch device for inference; also reported by the /health endpoint.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def load_model():
    """Load the NeMo streaming ASR model into the global ASR_MODEL.

    Downloads/initializes the pretrained model, puts it in eval mode, and
    moves it to the module-level DEVICE so placement stays consistent with
    what /health reports.

    Returns:
        bool: True on success, False on any failure (missing package,
        download error, OOM, ...).
    """
    global ASR_MODEL
    logger.info("Loading NeMo ASR Model...")
    try:
        # Imported lazily so the server can still start (and report an
        # unhealthy model) when NeMo is not installed.
        import nemo.collections.asr as nemo_asr
        ASR_MODEL = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/nemotron-speech-streaming-en-0.6b"
        )
        # Inference-only: disable dropout / training-mode behavior.
        ASR_MODEL.eval()
        if DEVICE == "cuda":
            logger.info("Moving model to CUDA")
        else:
            logger.warning("CUDA not available, using CPU (will be slow)")
        # Single placement call driven by the DEVICE constant (instead of a
        # second cuda-availability check) keeps model location and /health
        # in agreement.
        ASR_MODEL = ASR_MODEL.to(DEVICE)
        logger.info("Model loaded successfully!")
        return True
    except Exception as e:
        # Best-effort: callers (startup, /health, the websocket handler)
        # all tolerate a missing model, so log and report failure.
        logger.error(f"Failed to load model: {e}")
        return False
# Create FastAPI app
# All routes and the websocket endpoint below are registered on this instance.
app = FastAPI(title="Nemotron Speech Streaming")
@app.on_event("startup")
async def startup():
    """Load the ASR model when the server starts.

    load_model() is synchronous and can take a long time (model download +
    initialization), so run it in the default thread-pool executor instead
    of blocking the event loop. Startup still waits for it to finish.
    """
    await asyncio.get_running_loop().run_in_executor(None, load_model)
@app.get("/health")
async def health():
    """Report service liveness plus model/device status."""
    model_ready = ASR_MODEL is not None
    return {
        "status": "healthy",
        "model_loaded": model_ready,
        "device": DEVICE,
    }
@app.get("/")
async def root():
    """Serve the single-page frontend bundled next to this module."""
    index_path = Path(__file__).parent / "static" / "index.html"
    return FileResponse(index_path)
def _extract_text(hypothesis) -> str:
    """Best-effort extraction of the transcript string from a NeMo result.

    Depending on model/version, transcribe() may return plain strings or
    Hypothesis-like objects, so probe the common attributes in order.
    """
    if isinstance(hypothesis, str):
        return hypothesis.strip()
    if hasattr(hypothesis, 'text'):
        return hypothesis.text.strip()
    if hasattr(hypothesis, 'pred_text'):
        return hypothesis.pred_text.strip()
    return str(hypothesis).strip()
def _run_inference(audio):
    """Run blocking ASR inference on a float32 mono 16 kHz waveform (sync)."""
    with torch.no_grad():
        return ASR_MODEL.transcribe([audio])
@app.websocket("/ws/transcribe")
async def websocket_transcribe(websocket: WebSocket):
    """
    WebSocket endpoint for streaming transcription.
    Protocol:
    - Client sends binary PCM audio data (16-bit, 16kHz, mono)
    - Client may send JSON control messages: {"type": "reset"} / {"type": "ping"}
    - Server sends JSON: {"type": "transcript", "text": "...", "is_final": bool}
    """
    await websocket.accept()
    session_id = str(uuid.uuid4())[:8]
    logger.info(f"[{session_id}] Client connected")
    # Send ready message
    await websocket.send_json({
        "type": "ready",
        "session_id": session_id,
        "model_loaded": ASR_MODEL is not None,
    })
    if ASR_MODEL is None:
        await websocket.send_json({
            "type": "error",
            "message": "Model not loaded. Please wait and try again.",
        })
        await websocket.close()
        return
    # Rolling audio buffer of normalized float32 samples for this session.
    audio_buffer = np.array([], dtype=np.float32)
    chunk_count = 0
    last_transcript = ""
    # Processing settings
    MIN_AUDIO_LENGTH = 8000  # 0.5 seconds at 16kHz
    MAX_AUDIO_LENGTH = 80000  # 5 seconds at 16kHz
    PROCESS_EVERY_N_CHUNKS = 3  # Process every N chunks for efficiency
    try:
        while True:
            message = await websocket.receive()
            if message["type"] == "websocket.disconnect":
                break
            # Handle binary audio data
            if "bytes" in message:
                chunk_count += 1
                # Convert bytes to numpy array (expecting 16-bit PCM),
                # normalized to float32 in [-1, 1).
                audio_chunk = np.frombuffer(message["bytes"], dtype=np.int16).astype(np.float32) / 32768.0
                audio_buffer = np.concatenate([audio_buffer, audio_chunk])
                # Log periodically
                if chunk_count % 20 == 0:
                    logger.debug(f"[{session_id}] Chunks: {chunk_count}, Buffer: {len(audio_buffer)} samples")
                # Process when we have enough audio
                if len(audio_buffer) >= MIN_AUDIO_LENGTH and chunk_count % PROCESS_EVERY_N_CHUNKS == 0:
                    # Use at most the last MAX_AUDIO_LENGTH samples as context
                    # (slice is a no-op when the buffer is shorter).
                    audio_context = audio_buffer[-MAX_AUDIO_LENGTH:]
                    try:
                        start_time = datetime.now()
                        # Off-load the blocking inference call to a worker
                        # thread so this (and other) sessions' event-loop
                        # traffic keeps flowing during transcription.
                        results = await asyncio.get_running_loop().run_in_executor(
                            None, _run_inference, audio_context
                        )
                        inference_time = (datetime.now() - start_time).total_seconds() * 1000
                        if results and len(results) > 0:
                            text = _extract_text(results[0])
                            # Only emit when the transcript actually changed.
                            if text and text != last_transcript:
                                last_transcript = text
                                logger.info(f"[{session_id}] ({inference_time:.0f}ms) {text[:60]}...")
                                await websocket.send_json({
                                    "type": "transcript",
                                    "text": text,
                                    "is_final": False,
                                    "latency_ms": inference_time,
                                })
                    except Exception as e:
                        # Inference failures are per-chunk; keep the session alive.
                        logger.error(f"[{session_id}] Inference error: {e}")
                # Trim buffer to prevent memory growth
                if len(audio_buffer) > MAX_AUDIO_LENGTH:
                    audio_buffer = audio_buffer[-MAX_AUDIO_LENGTH:]
            # Handle JSON control messages
            elif "text" in message:
                try:
                    data = json.loads(message["text"])
                    msg_type = data.get("type")
                    if msg_type == "reset":
                        # Client asked to start over: drop all session state.
                        audio_buffer = np.array([], dtype=np.float32)
                        chunk_count = 0
                        last_transcript = ""
                        logger.info(f"[{session_id}] Session reset")
                        await websocket.send_json({"type": "reset_ack"})
                    elif msg_type == "ping":
                        await websocket.send_json({"type": "pong"})
                except json.JSONDecodeError:
                    # Ignore malformed control messages.
                    pass
    except WebSocketDisconnect:
        logger.info(f"[{session_id}] Client disconnected")
    except Exception as e:
        logger.error(f"[{session_id}] WebSocket error: {e}")
    finally:
        logger.info(f"[{session_id}] Session ended (processed {chunk_count} chunks)")
# Mount static files
# Expose bundled frontend assets under /static when the directory ships
# alongside this module; skip silently otherwise.
static_dir = Path(__file__).parent / "static"
if static_dir.exists():
    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
if __name__ == "__main__":
    # Imported here so uvicorn is only required when running as a script.
    import uvicorn
    # Bind on all interfaces; 7860 is presumably chosen for a Hugging Face
    # Spaces deployment — confirm against the hosting environment.
    uvicorn.run(app, host="0.0.0.0", port=7860)