<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>WASM Streaming Speech Recognition</title>
    <style>
      @import url("https://fonts.googleapis.com/css2?family=Source+Sans+3:wght@300;400;600;700&display=swap");
      html, body { font-family: "Source Sans 3", system-ui, -apple-system, Segoe UI, Roboto, sans-serif; }
    </style>
    <script src="css/tailwind-3.4.17.js"></script>
    <script type="module">
      const MODEL_ID = "moshi_1b_en_fr_q4k";
      const WEIGHTS_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/model-q4k.gguf";
      const MIMI_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/mimi-pytorch-e351c8d8@125.safetensors";
      const TOKENIZER_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/tokenizer_en_fr_audio_8000.json";
      const CONFIG_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/config.json";
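
      // These assets are downloaded by moshiWorker.js; the quantized weights
      // alone are ~950 MB, so the first load can take a while.

      // Run inference in a module worker so the UI thread stays responsive.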
      const moshiWorker = new Worker("./moshiWorker.js", { type: "module" });

      let isRecording = false;
      let audioStream = null;
      let audioContext = null;
      let processor = null;
      let source = null;
      let modelInitialized = false;
      let pendingStart = false;

      // Performance tracking
      let audioChunksProcessed = 0;
      let sessionStartTime = 0;

      function updateStatusDiv(message) {
        document.querySelector("#status-div").textContent = message;
      }

      function updateDiagnostics() {
        const diagnostics = document.querySelector("#diagnostics");
        if (!diagnostics) return;
        const cpuCount = navigator.hardwareConcurrency || "unknown";
        // Only update metrics while recording; otherwise keep the final values.
        if (isRecording && sessionStartTime) {
          // Calculate the real-time factor (audio processed / wall-clock time):
          // >1 means faster than real time, <1 means slower than real time.
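          // Each chunk is 1024 samples at 24 kHz, i.e. ~42.7 ms of audio.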
          const audioProcessed = audioChunksProcessed * (1024 / 24000);
          const audioSessionDuration = (Date.now() - sessionStartTime) / 1000;
          const realTimeFactor = audioSessionDuration > 0 ? (audioProcessed / audioSessionDuration) : 0;
          // Color-code based on performance
          let factorColor = "";
          if (realTimeFactor >= 0.95) {
            factorColor = "text-green-600";
          } else if (realTimeFactor >= 0.8) {
            factorColor = "text-yellow-600";
          } else {
            factorColor = "text-red-600";
          }
          diagnostics.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="${factorColor}">${realTimeFactor.toFixed(2)}x</span>, Duration: ${audioSessionDuration.toFixed(1)}s`;
        } else if (!sessionStartTime) {
          diagnostics.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="text-gray-600">0.00x</span>, Duration: 0.0s`;
        }
      }

      window.addEventListener("load", updateDiagnostics);
      setInterval(updateDiagnostics, 200);

      function initializeModel() {
        if (modelInitialized) return;
        const button = document.querySelector("#speech-button");
        button.disabled = true;
        button.className = "bg-gray-400 text-gray-700 font-normal py-2 px-4 rounded cursor-not-allowed";
        moshiWorker.postMessage({
          command: "initialize",
          weightsURL: WEIGHTS_URL,
          modelID: MODEL_ID,
          mimiURL: MIMI_URL,
          tokenizerURL: TOKENIZER_URL,
          configURL: CONFIG_URL,
        });
      }
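
      // Expected worker replies: { status: "loading", message },
      // { status: "model_ready" }, { status: "streaming", word },
      // { status: "chunk_processed" }, or { error } on failure.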
      // Handle messages from the worker
      moshiWorker.addEventListener("message", async (event) => {
        const data = event.data;
        if (data.status === "model_ready") {
          modelInitialized = true;
          updateStatusDiv("Model loaded - Ready to start");
          const button = document.querySelector("#speech-button");
          button.disabled = false;
          button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
          if (pendingStart) {
            pendingStart = false;
            await startRecording();
          }
        } else if (data.status === "streaming") {
          // Append the new word to the transcription in real time
          const outputDiv = document.querySelector("#output-generation");
          const placeholder = document.querySelector("#output-placeholder");
          if (placeholder) placeholder.hidden = true;
          if (outputDiv.textContent) {
            outputDiv.textContent += " " + data.word;
          } else {
            outputDiv.textContent = data.word;
          }
          outputDiv.hidden = false;
        } else if (data.status === "chunk_processed") {
          audioChunksProcessed++;
        } else if (data.status === "loading") {
          updateStatusDiv(data.message);
        } else if (data.error) {
          updateStatusDiv("Error: " + data.error);
          pendingStart = false;
        }
      });

      async function startMicrophone() {
        try {
          audioStream = await navigator.mediaDevices.getUserMedia({ audio: true });
          updateStatusDiv("Microphone access granted");
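          // 24 kHz matches the sample rate the Mimi audio codec expects.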
          audioContext = new AudioContext({ sampleRate: 24000 });
          source = audioContext.createMediaStreamSource(audioStream);
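          // ScriptProcessorNode is deprecated in favor of AudioWorkletNode,
          // but it keeps this demo simple: 1024-sample buffers, mono in/out.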
          processor = audioContext.createScriptProcessor(1024, 1, 1);
          processor.onaudioprocess = function (event) {
            if (!isRecording || !modelInitialized) return;
            const inputBuffer = event.inputBuffer;
            const inputData = inputBuffer.getChannelData(0);
            // Send the audio chunk to the worker.
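            // Copy the samples (the audio engine reuses the input buffer), then
            // transfer the copy's buffer, which hands over ownership instead of
            // cloning the data a second time.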
            const audioChunk = new Float32Array(inputData);
            moshiWorker.postMessage({
              command: "process_audio",
              audioData: audioChunk,
            }, [audioChunk.buffer]);
          };
          source.connect(processor);
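          // In most browsers, onaudioprocess only fires while the node is
          // connected to a destination; it writes no output, so nothing is audible.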
          processor.connect(audioContext.destination);
        } catch (error) {
          updateStatusDiv("Microphone access denied: " + error.message);
          throw error;
        }
      }

      function stopMicrophone() {
        // Disconnect the audio nodes
        if (processor) {
          processor.disconnect();
          processor = null;
        }
        if (source) {
          source.disconnect();
          source = null;
        }
        if (audioContext) {
          audioContext.close();
          audioContext = null;
        }
        // Stop the media stream
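        // (stopping the tracks releases the microphone and turns off the
        // browser's recording indicator)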
        if (audioStream) {
          audioStream.getTracks().forEach((track) => track.stop());
          audioStream = null;
        }
        updateStatusDiv("Microphone stopped");
      }

      async function startRecording() {
        const button = document.querySelector("#speech-button");
        try {
          updateStatusDiv("Requesting microphone access...");
          await startMicrophone();
          // Reset the performance counters
          audioChunksProcessed = 0;
          sessionStartTime = Date.now();
          // Start the streaming session
          moshiWorker.postMessage({ command: "start_stream" });
          isRecording = true;
          button.textContent = "Stop Speech";
          button.className = "bg-red-600 hover:bg-red-700 text-white font-normal py-2 px-4 rounded";
          updateStatusDiv("Listening...");
          // Clear the previous transcription
          document.querySelector("#output-generation").textContent = "";
          document.querySelector("#output-generation").hidden = true;
          document.querySelector("#output-placeholder").hidden = true;
        } catch (error) {
          console.error("Error starting microphone:", error);
          updateStatusDiv("Error: " + error.message);
          pendingStart = false;
        }
      }

      document.querySelector("#speech-button").addEventListener("click", async () => {
        const button = document.querySelector("#speech-button");
        if (!isRecording) {
          // Check whether the model is ready
          if (!modelInitialized) {
            pendingStart = true;
            initializeModel();
            return;
          }
          await startRecording();
        } else {
          stopMicrophone();
          // End the streaming session
          moshiWorker.postMessage({ command: "stop_stream" });
          isRecording = false;
          button.textContent = "Start Speech";
          button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
          updateStatusDiv("Ready to start");
        }
      });
    </script>
  </head>
  <body class="container max-w-4xl mx-auto p-4">
    <main class="grid grid-cols-1 gap-8 relative">
      <div>
        <h1 class="text-4xl font-bold">WASM Streaming Speech Recognition</h1>
        <p class="text-gray-700">
          Transcribe audio from your microphone in real time, directly in the browser, using Rust/WASM.
          This demo runs entirely offline on your CPU after downloading a ~950 MB model.
          It understands English and French, and uses the
          <a href="https://huggingface.co/kyutai/stt-1b-en_fr" target="_blank" class="underline hover:text-blue-600">Kyutai STT model</a>
          together with a WASM runtime built with
          <a href="https://github.com/huggingface/candle/" target="_blank" class="underline hover:text-blue-600">Candle</a>.
        </p>
      </div>
      <div>
        <button id="speech-button" class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded">
          Start Speech
        </button>
        <div class="mt-2 text-gray-600 text-sm space-y-1">
          <div>Status: <span id="status-div">Click "Start Speech" to begin</span></div>
          <div id="diagnostics">CPUs: -, Real-time factor: 0.00x, Duration: 0.0s</div>
        </div>
      </div>
      <div>
        <h3 class="font-medium">Transcription:</h3>
        <div class="min-h-[200px] bg-slate-100 text-gray-700 p-4 rounded-md">
          <p id="output-generation" hidden></p>
          <span id="output-placeholder" class="font-light text-gray-500">Click "Start Speech" to begin transcription</span>
        </div>
      </div>
      <div class="mt-4 p-3 bg-gray-50 text-gray-700 rounded-md">
        💡 This demo shows offline transcription in your browser.
        For more accurate cloud transcription and real-time LLM grammar correction, check out
        <a href="https://voicewriter.io" target="_blank" class="underline hover:text-blue-600">Voice Writer</a>.
      </div>
    </main>
  </body>
</html>