/* eslint-disable camelcase */
import { pipeline, env } from "@xenova/transformers";

// Disable local models
env.allowLocalModels = false;

// Define model factories
// Ensures only one model is created of each type
class PipelineFactory {
    static task = null;
    static model = null;
    static quantized = null;
    static instance = null;

    constructor(tokenizer, model, quantized) {
        this.tokenizer = tokenizer;
        this.model = model;
        this.quantized = quantized;
    }

    static async getInstance(progress_callback = null) {
        if (this.instance === null) {
            this.instance = pipeline(this.task, this.model, {
                quantized: this.quantized,
                progress_callback,

                // For medium models, we need to load the `no_attentions` revision to avoid running out of memory
                revision: this.model.includes("/whisper-medium") ? "no_attentions" : "main",
            });
        }
        return this.instance;
    }
}
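
// Usage note: PipelineFactory is consumed statically through subclasses (see
// AutomaticSpeechRecognitionPipelineFactory below), so getInstance() caches a
// single shared pipeline per subclass. An illustrative sketch (the model id
// is an assumption, not fixed by this file):
//
//   AutomaticSpeechRecognitionPipelineFactory.model = "Xenova/whisper-tiny";
//   const transcriber = await AutomaticSpeechRecognitionPipelineFactory.getInstance();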
self.addEventListener("message", async (event) => {
    const message = event.data;

    // Do some work...
    // TODO use message data
    let transcript = await transcribe(
        message.audio,
        message.model,
        message.multilingual,
        message.quantized,
        message.subtask,
        message.language,
    );
    if (transcript === null) return;

    // Send the result back to the main thread
    self.postMessage({
        status: "complete",
        task: "automatic-speech-recognition",
        data: transcript,
    });
});
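
// For reference, a sketch of the message shape this listener expects from the
// main thread. Field names mirror the reads above; the model id and audio
// format are assumptions, not fixed by this file:
//
//   worker.postMessage({
//       audio,                   // e.g. a Float32Array of mono PCM at 16 kHz
//       model: "Xenova/whisper-tiny",
//       multilingual: false,
//       quantized: true,
//       subtask: "transcribe",   // or "translate"
//       language: "english",
//   });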
class AutomaticSpeechRecognitionPipelineFactory extends PipelineFactory {
    static task = "automatic-speech-recognition";
    static model = null;
    static quantized = null;
}
const transcribe = async (
    audio,
    model,
    multilingual,
    quantized,
    subtask,
    language,
) => {
    const isDistilWhisper = model.startsWith("distil-whisper/");

    let modelName = model;
    if (!isDistilWhisper && !multilingual) {
        modelName += ".en";
    }

    const p = AutomaticSpeechRecognitionPipelineFactory;
    if (p.model !== modelName || p.quantized !== quantized) {
        // Invalidate model if different
        p.model = modelName;
        p.quantized = quantized;

        if (p.instance !== null) {
            (await p.getInstance()).dispose();
            p.instance = null;
        }
    }

    // Load transcriber model
    let transcriber = await p.getInstance((data) => {
        self.postMessage(data);
    });

    const time_precision =
        transcriber.processor.feature_extractor.config.chunk_length /
        transcriber.model.config.max_source_positions;
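
    // For standard Whisper checkpoints this works out to 30 s / 1500 source
    // positions, i.e. a time_precision of 0.02 s per predicted timestamp
    // position.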
    // Storage for chunks to be processed. Initialise with an empty chunk.
    let chunks_to_process = [
        {
            tokens: [],
            finalised: false,
        },
    ];

    // TODO: Storage for fully-processed and merged chunks
    // let decoded_chunks = [];

    function chunk_callback(chunk) {
        let last = chunks_to_process[chunks_to_process.length - 1];

        // Overwrite last chunk with new info
        Object.assign(last, chunk);
        last.finalised = true;

        // Create an empty chunk after, if it is not the last chunk
        if (!chunk.is_last) {
            chunks_to_process.push({
                tokens: [],
                finalised: false,
            });
        }
    }

    // Inject custom callback function to handle merging of chunks
    function callback_function(item) {
        let last = chunks_to_process[chunks_to_process.length - 1];

        // Update tokens of last chunk
        last.tokens = [...item[0].output_token_ids];

        // Merge text chunks
        // TODO optimise so we don't have to decode all chunks every time
        let data = transcriber.tokenizer._decode_asr(chunks_to_process, {
            time_precision: time_precision,
            return_timestamps: true,
            force_full_sequences: false,
        });

        self.postMessage({
            status: "update",
            task: "automatic-speech-recognition",
            data: data,
        });
    }
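
    // Note: _decode_asr is the internal transformers.js tokenizer helper that
    // merges the (possibly overlapping) chunk token sequences into a single
    // transcript with timestamps. It is re-run over all accumulated chunks on
    // every generation step, which is the cost the TODO above refers to.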
    // Actually run transcription
    let output = await transcriber(audio, {
        // Greedy
        top_k: 0,
        do_sample: false,

        // Sliding window
        chunk_length_s: isDistilWhisper ? 20 : 30,
        stride_length_s: isDistilWhisper ? 3 : 5,

        // Language and task
        language: language,
        task: subtask,

        // Return timestamps
        return_timestamps: true,
        force_full_sequences: false,

        // Callback functions
        callback_function: callback_function, // after each generation step
        chunk_callback: chunk_callback, // after each chunk is processed
    }).catch((error) => {
        self.postMessage({
            status: "error",
            task: "automatic-speech-recognition",
            data: error,
        });
        return null;
    });

    return output;
};
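
// Main-thread wiring sketch (illustrative assumptions: the worker file name,
// the bundler-style module worker, and the handler body are not part of this
// file):
//
//   const worker = new Worker(new URL("./worker.js", import.meta.url), {
//       type: "module",
//   });
//   worker.addEventListener("message", (event) => {
//       const { status, data } = event.data;
//       if (status === "complete") console.log(data.text); // final transcript
//       else if (status === "update") console.log(data[0]); // partial transcript
//       else if (status === "error") console.error(data);
//       // remaining messages are model-loading progress updates
//   });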