// hackernews_radio/src/lib/voxtral-local.ts

// @huggingface/transformers is loaded from CDN at runtime (not bundled)
// to avoid OOMing the Docker build with its 139 MB package size.
const HF_CDN = 'https://esm.sh/@huggingface/transformers@3.8.1'
const MODEL_ID = 'onnx-community/Voxtral-Mini-3B-2507-ONNX'
export type LocalModelStatus = 'idle' | 'loading' | 'ready' | 'error'
export interface LoadProgress {
status: LocalModelStatus
progress: number // 0–100
message: string
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let _lib: any = null
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let _processor: any = null
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let _model: any = null
let _loadPromise: Promise<void> | null = null
/** Lazy-load the CDN module once */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async function getLib(): Promise<any> {
if (_lib) return _lib
// @vite-ignore tells rolldown/vite not to try to bundle this URL
_lib = await import(/* @vite-ignore */ HF_CDN)
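  // Skip same-origin /models/ lookups; always resolve MODEL_ID against the HF Hub.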
_lib.env.allowLocalModels = false
return _lib
}
/** Decode an audio Blob (WebM or any browser-supported format) and resample it to 16 kHz mono Float32Array */
async function decodeAudioTo16kHz(blob: Blob): Promise<Float32Array> {
const arrayBuffer = await blob.arrayBuffer()
const audioCtx = new AudioContext()
const decoded = await audioCtx.decodeAudioData(arrayBuffer)
  void audioCtx.close() // fire-and-forget; the AudioBuffer is already decoded
const targetRate = 16_000
if (decoded.sampleRate === targetRate) {
return decoded.getChannelData(0)
}
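  // Resample via OfflineAudioContext. E.g. a 3 s clip decoded at 48 kHz
  // (144 000 samples) renders to 48 000 samples at 16 kHz.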
const length = Math.round(decoded.length * targetRate / decoded.sampleRate)
const offlineCtx = new OfflineAudioContext(1, length, targetRate)
const source = offlineCtx.createBufferSource()
source.buffer = decoded
source.connect(offlineCtx.destination)
source.start(0)
const rendered = await offlineCtx.startRendering()
return rendered.getChannelData(0)
}
/**
* Load the Voxtral Mini (3B) model from HuggingFace Hub (cached after first download).
* Safe to call multiple times — subsequent calls return the same promise.
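 * @example
 * // Illustrative: wire progress into any UI callback.
 * await loadVoxtralLocal(({ status, progress, message }) => {
 *   console.log(`[voxtral] ${status} ${Math.round(progress)}%: ${message}`)
 * })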
*/
export function loadVoxtralLocal(onProgress?: (info: LoadProgress) => void): Promise<void> {
if (_processor && _model) {
onProgress?.({ status: 'ready', progress: 100, message: 'Model ready' })
return Promise.resolve()
}
if (_loadPromise) return _loadPromise
_loadPromise = (async () => {
const report = (progress: number, message: string) =>
onProgress?.({ status: 'loading', progress, message })
report(0, 'Loading transformers.js from CDN…')
const { VoxtralProcessor, VoxtralForConditionalGeneration } = await getLib()
report(2, 'Loading processor…')
_processor = await VoxtralProcessor.from_pretrained(MODEL_ID)
report(5, 'Downloading model weights (this may take several minutes on first run)…')
let lastPct = 5
_model = await VoxtralForConditionalGeneration.from_pretrained(MODEL_ID, {
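      // Mixed precision per module: fp16 token embeddings, 4-bit quantized audio
      // encoder and decoder (keeps the 3B model small enough for browser WebGPU).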
dtype: {
embed_tokens: 'fp16',
audio_encoder: 'q4',
decoder_model_merged: 'q4',
},
device: 'webgpu',
// eslint-disable-next-line @typescript-eslint/no-explicit-any
progress_callback: (p: any) => {
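        // transformers.js emits per-file progress events (0–100 per file); map them
        // into the 5–98 band reserved for the download phase. The lastPct guard keeps
        // the overall bar monotonic across interleaved per-file events.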
if (typeof p.progress === 'number') {
const scaled = 5 + (p.progress / 100) * 93
if (scaled > lastPct) {
lastPct = scaled
const file = p.file ? ` (${p.file})` : ''
          report(scaled, `Downloading${file}: ${Math.round(p.progress)}%`)
}
}
},
})
onProgress?.({ status: 'ready', progress: 100, message: 'Voxtral Mini ready' })
})().catch(err => {
_loadPromise = null // allow retry on next call
onProgress?.({ status: 'error', progress: 0, message: String(err) })
throw err
})
return _loadPromise
}
export function isModelLoaded(): boolean {
return _processor !== null && _model !== null
}
/**
* Transcribe an audio Blob locally using the Voxtral Mini (3B) ONNX model.
* The model must be loaded first via `loadVoxtralLocal()`.
*/
export async function transcribeWithVoxtralLocal(audioBlob: Blob): Promise<string> {
if (!_processor || !_model) {
throw new Error('Model not loaded — call loadVoxtralLocal() first.')
}
const audio = await decodeAudioTo16kHz(audioBlob)
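  // Voxtral's dedicated transcription mode: an audio placeholder followed by a
  // language hint and the [TRANSCRIBE] directive (the model card's prompt format).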
const conversation = [
{
role: 'user',
content: [
{ type: 'audio' },
{ type: 'text', text: 'lang:en [TRANSCRIBE]' },
],
},
]
const text = _processor.apply_chat_template(conversation, { tokenize: false })
const inputs = await _processor(text, audio)
const generated_ids = await _model.generate({
...inputs,
max_new_tokens: 256,
})
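  // generate() returns prompt + completion; slice off the prompt so only the
  // newly generated token ids are decoded.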
const promptLen = inputs.input_ids?.dims?.at(-1) ?? 0
const new_tokens = generated_ids.slice(null, [promptLen, null])
const [transcription] = _processor.batch_decode(new_tokens, { skip_special_tokens: true })
return transcription.trim()
}
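
// Illustrative end-to-end usage (a sketch, not part of this module's API):
// `stream` would come from getUserMedia, and `renderProgress` is a hypothetical
// UI callback with the (info: LoadProgress) => void shape.
//
//   await loadVoxtralLocal(renderProgress)
//   const recorder = new MediaRecorder(stream, { mimeType: 'audio/webm' })
//   const chunks: Blob[] = []
//   recorder.ondataavailable = e => chunks.push(e.data)
//   recorder.onstop = async () => {
//     const blob = new Blob(chunks, { type: 'audio/webm' })
//     console.log(await transcribeWithVoxtralLocal(blob))
//   }
//   recorder.start()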