| | |
| | |
| | |
| | |
| | |
| |
|
// Target sample rate in Hz for captured audio — presumably the rate the
// Whisper speech-to-text model expects (16 kHz); all recorded chunks are
// resampled to this rate before delivery.
const WHISPER_SAMPLING_RATE = 16000;
| |
|
export class AudioRecorder {
  /**
   * Captures microphone audio via the Web Audio API and delivers mono
   * Float32 chunks resampled to WHISPER_SAMPLING_RATE (16 kHz).
   *
   * @param {(chunk: Float32Array) => void} [onDataAvailable] - callback
   *   invoked with each resampled chunk while recording is active.
   */
  constructor(onDataAvailable) {
    this.onDataAvailable = onDataAvailable;
    this.audioContext = null;
    this.stream = null;
    this.source = null;
    this.processor = null;
    this.isRecording = false;
    this.audioChunks = [];
  }

  /**
   * Starts capturing audio from the default (or specified) input device.
   *
   * @param {string|null} [deviceId=null] - exact media device id, or null
   *   to let the browser pick the default input.
   * @returns {Promise<boolean>} resolves true once recording has started.
   * @throws rethrows any getUserMedia / AudioContext failure, after
   *   releasing any partially-acquired resources.
   */
  async start(deviceId = null) {
    try {
      // Request raw mono audio: browser DSP (echo cancellation, noise
      // suppression, AGC) is disabled so downstream consumers receive the
      // unprocessed signal.
      const audioConstraints = {
        channelCount: 1,
        echoCancellation: false,
        noiseSuppression: false,
        autoGainControl: false,
      };
      if (deviceId) {
        audioConstraints.deviceId = { exact: deviceId };
      }

      this.stream = await navigator.mediaDevices.getUserMedia({
        audio: audioConstraints,
      });

      this.audioContext = new AudioContext();
      const nativeSampleRate = this.audioContext.sampleRate;

      // Some browsers create the context suspended until a user gesture;
      // resume it explicitly so audio flows immediately.
      if (this.audioContext.state === 'suspended') {
        await this.audioContext.resume();
      }

      this.source = this.audioContext.createMediaStreamSource(this.stream);

      // NOTE(review): ScriptProcessorNode is deprecated in favor of
      // AudioWorklet; kept as-is here, presumably for broad compatibility.
      const bufferSize = 4096;
      this.processor = this.audioContext.createScriptProcessor(bufferSize, 1, 1);

      this.processor.onaudioprocess = (event) => {
        if (!this.isRecording) return;

        const inputData = event.inputBuffer.getChannelData(0);

        // Convert from the hardware rate to the model's expected 16 kHz.
        const resampled = this.resample(inputData, nativeSampleRate, WHISPER_SAMPLING_RATE);

        this.audioChunks.push(resampled);
        this.onDataAvailable?.(resampled);
      };

      // The processor only fires onaudioprocess while connected into the
      // graph, so route it through to the destination.
      this.source.connect(this.processor);
      this.processor.connect(this.audioContext.destination);

      this.isRecording = true;
      return true;
    } catch (error) {
      // BUG FIX: release partially-acquired resources. Previously, if
      // AudioContext setup failed after getUserMedia succeeded, the media
      // stream tracks were leaked (mic indicator stayed on).
      this.cleanup();
      console.error('Failed to start recording:', error);
      throw error;
    }
  }

  /**
   * Resamples audio with linear interpolation.
   *
   * @param {Float32Array} audioData - input samples.
   * @param {number} sourceSampleRate - input rate in Hz.
   * @param {number} targetSampleRate - output rate in Hz.
   * @returns {Float32Array} resampled copy (always a new array, even when
   *   the rates already match).
   */
  resample(audioData, sourceSampleRate, targetSampleRate) {
    if (sourceSampleRate === targetSampleRate) {
      return new Float32Array(audioData);
    }

    const ratio = sourceSampleRate / targetSampleRate;
    const newLength = Math.round(audioData.length / ratio);
    const result = new Float32Array(newLength);

    for (let i = 0; i < newLength; i++) {
      const srcIndex = i * ratio;
      const srcIndexFloor = Math.floor(srcIndex);
      const srcIndexCeil = Math.min(srcIndexFloor + 1, audioData.length - 1);
      const t = srcIndex - srcIndexFloor;

      // Linear interpolation between the two nearest source samples.
      result[i] = audioData[srcIndexFloor] * (1 - t) + audioData[srcIndexCeil] * t;
    }

    return result;
  }

  /**
   * Intentional no-op. Chunks are already delivered incrementally through
   * onDataAvailable as they arrive, so there is no buffered data to flush.
   * Kept for interface compatibility with callers that expect it.
   */
  requestData() {}

  /**
   * Stops recording, tears down the audio graph, and returns everything
   * captured so far as one contiguous 16 kHz mono buffer.
   *
   * @returns {Promise<Float32Array>} all recorded audio concatenated.
   */
  async stop() {
    // Rewritten as a plain async method: the previous implementation
    // wrapped fully synchronous code in `new Promise(...)` unnecessarily.
    this.isRecording = false;

    if (this.processor) {
      this.processor.disconnect();
      this.processor = null;
    }
    if (this.source) {
      this.source.disconnect();
      this.source = null;
    }

    // Concatenate all recorded chunks into one contiguous buffer.
    const totalLength = this.audioChunks.reduce((sum, chunk) => sum + chunk.length, 0);
    const completeAudio = new Float32Array(totalLength);
    let offset = 0;
    for (const chunk of this.audioChunks) {
      completeAudio.set(chunk, offset);
      offset += chunk.length;
    }

    // Build the result BEFORE cleanup(), which clears audioChunks.
    this.cleanup();
    return completeAudio;
  }

  /**
   * Releases the media stream and audio context and resets all recording
   * state. Safe to call multiple times or on a partially-started recorder.
   */
  cleanup() {
    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
    }

    if (this.audioContext && this.audioContext.state !== 'closed') {
      this.audioContext.close();
      this.audioContext = null;
    }

    this.audioChunks = [];
    this.isRecording = false;
  }
}
| |
|
export class AudioProcessor {
  /**
   * Accumulates Float32 audio samples into a single growing buffer and
   * answers simple questions about it (duration, trimming).
   *
   * @param {number} [sampleRate=WHISPER_SAMPLING_RATE] - samples per
   *   second used for duration/size conversions.
   */
  constructor(sampleRate = WHISPER_SAMPLING_RATE) {
    this.sampleRate = sampleRate;
    this.audioBuffer = new Float32Array(0);
  }

  /**
   * Appends a chunk of samples to the end of the buffer.
   *
   * Note: this reallocates the whole buffer on every call (O(total) per
   * append), which is fine for modest recording lengths.
   *
   * @param {Float32Array} chunk - samples to append.
   */
  appendChunk(chunk) {
    const newBuffer = new Float32Array(this.audioBuffer.length + chunk.length);
    newBuffer.set(this.audioBuffer);
    newBuffer.set(chunk, this.audioBuffer.length);
    this.audioBuffer = newBuffer;
  }

  /**
   * @returns {Float32Array} the accumulated samples (the live buffer, not
   *   a copy — callers must not mutate it).
   */
  getBuffer() {
    return this.audioBuffer;
  }

  /**
   * @returns {number} buffered audio duration in seconds.
   */
  getDuration() {
    return this.audioBuffer.length / this.sampleRate;
  }

  /**
   * Discards all buffered audio.
   */
  reset() {
    this.audioBuffer = new Float32Array(0);
  }

  /**
   * Keeps only the most recent maxDuration seconds of audio, discarding
   * the oldest samples.
   *
   * @param {number} maxDuration - maximum retained duration in seconds.
   */
  trimToSize(maxDuration) {
    const maxSamples = Math.floor(maxDuration * this.sampleRate);
    if (this.audioBuffer.length > maxSamples) {
      // BUG FIX: use a non-negative start index. The previous
      // slice(-maxSamples) kept the ENTIRE buffer when maxSamples was 0,
      // because slice(-0) is identical to slice(0).
      this.audioBuffer = this.audioBuffer.slice(this.audioBuffer.length - maxSamples);
    }
  }
}
| |
|
// Re-exported so consumers can size or validate their own buffers against
// the same target rate the recorder resamples to.
export { WHISPER_SAMPLING_RATE };
| |
|