Spaces:
Sleeping
Sleeping
| ; | |
| /** | |
| * ONNX Model Inferencer (Frontend/Backend Common) | |
| * | |
| * Platform-agnostic inference logic that accepts ONNX session from platform-specific code. | |
| * No direct dependency on onnxruntime packages - uses dependency injection pattern. | |
| * | |
| * Adapted from Node.js test_inference.js for cross-platform use | |
| * Provides causal language model inference using GPT-2 ONNX model | |
| * | |
| * Vocabulary Design (128 tokens): | |
| * 0-3: Special tokens (PAD=0, START=1, END=2, VALUE=3) | |
| * 4-7: Reserved for future use | |
| * 10: LF (newline) for multi-line game records | |
| * 32-127: ASCII printable characters (direct identity mapping) | |
| * | |
| * This design uses direct identity mapping: token_id = ascii_value | |
| * No complex formulas needed - simple and efficient. | |
| */ | |
// CommonJS interop boilerplate emitted by the TypeScript compiler:
// flag this module as ES-module-compiled and pre-declare the named export.
Object.defineProperty(exports, "__esModule", { value: true });
exports.ModelInferencer = void 0;
| /** | |
| * Model Inferencer for Causal Language Model | |
| * Compatible with both frontend (onnxruntime-web) and backend (onnxruntime-node) | |
| */ | |
class ModelInferencer {
    /**
     * @param {Function} TensorClass - ONNX Tensor constructor injected by the
     *   platform layer (e.g. ort.Tensor from onnxruntime-web or onnxruntime-node),
     *   so this class never imports either runtime package directly.
     * @param {{vocabSize?: number, seqLen?: number}} [config] - Optional overrides.
     */
    constructor(TensorClass, config = {}) {
        this.session = null;
        // TGN tokenizer: Compact 128-token vocabulary with direct ASCII mapping
        // 0-3: Special tokens (PAD, START, END, VALUE)
        // 4-7: Reserved for future use
        // 10: Newline (LF)
        // 32-127: ASCII printable characters (direct identity mapping)
        this.PAD_TOKEN = 0;
        this.START_TOKEN = 1;
        this.END_TOKEN = 2;
        this.VALUE_TOKEN = 3;
        this.TensorClass = TensorClass;
        this.config = {
            vocabSize: config.vocabSize || 128, // Allow override via config
            seqLen: 256,
            ...config
        };
    }
    /**
     * Set the inference session (created by platform-specific code)
     * @param session - An ONNX InferenceSession-like object exposing
     *   run(), inputNames and outputNames.
     */
    setSession(session) {
        this.session = session;
        console.log("[ModelInferencer] ✓ Session set successfully");
        this.printModelInfo();
    }
    /**
     * Run basic inference test with a random input sequence.
     * @returns {Promise<{tokens: number[], text: string, logits: Float32Array, inferenceTime: number}>}
     * @throws {Error} if setSession() has not been called.
     */
    async testBasicInference() {
        if (!this.session) {
            throw new Error("Inferencer not initialized. Call setSession() first.");
        }
        console.log("[ModelInferencer] Running basic inference test...");
        const batchSize = 1;
        const seqLen = this.config.seqLen;
        // Create random input
        const inputIds = this.createRandomInput(batchSize, seqLen);
        const inputTensor = new this.TensorClass("int64", inputIds, [batchSize, seqLen]);
        // Run inference
        const startTime = performance.now();
        const results = await this.session.run({ input_ids: inputTensor });
        const inferenceTime = performance.now() - startTime;
        // Get logits
        const logits = results.logits;
        // Validate output
        this.validateOutput(logits, batchSize, seqLen);
        // Get predictions
        const predictions = this.getPredictions(logits.data, batchSize * seqLen);
        // Convert tokens to text
        const text = String.fromCharCode(...predictions.slice(0, 100));
        console.log("[ModelInferencer] Inference completed:");
        console.log(`  Input shape: [${inputTensor.dims.join(", ")}]`);
        console.log(`  Output shape: [${logits.dims.join(", ")}]`);
        console.log(`  Output dtype: ${logits.type}`);
        console.log(`  Inference time: ${inferenceTime.toFixed(2)}ms`);
        console.log(`  Sample predictions: [${predictions.slice(0, 10).join(", ")}]`);
        // Fix: scan min/max with a loop. The previous Math.min(...array) spread
        // passes seqLen*vocabSize floats as call arguments, which can exceed the
        // JS engine's argument limit and throw RangeError for large outputs.
        let minLogit = Infinity;
        let maxLogit = -Infinity;
        for (let i = 0; i < logits.data.length; i++) {
            const v = logits.data[i];
            if (v < minLogit) minLogit = v;
            if (v > maxLogit) maxLogit = v;
        }
        console.log(`  Logits range: [${minLogit.toFixed(3)}, ${maxLogit.toFixed(3)}]`);
        return {
            tokens: predictions,
            text,
            logits: logits.data,
            inferenceTime
        };
    }
    /**
     * Generate tokens autoregressively from a prompt (greedy argmax decoding).
     * @param {string} prompt - Seed text; each character's code unit becomes a token id.
     * @param {number} [numTokens=10] - Maximum number of tokens to generate.
     * @returns {Promise<{tokens: number[], text: string, logits: Float32Array, inferenceTime: number}>}
     * @throws {Error} if setSession() has not been called.
     */
    async generateText(prompt, numTokens = 10) {
        if (!this.session) {
            throw new Error("Inferencer not initialized. Call setSession() first.");
        }
        console.log(`[ModelInferencer] Generating ${numTokens} tokens from prompt: "${prompt}"`);
        // Convert prompt to token IDs (byte values)
        const promptTokens = Array.from(prompt).map((c) => c.charCodeAt(0));
        console.log(`  Prompt tokens (${promptTokens.length}): [${promptTokens.join(", ")}]`);
        // Start with prompt tokens
        const sequence = [...promptTokens];
        const times = [];
        // Generate tokens
        for (let i = 0; i < numTokens; i++) {
            // Fix: once the sequence fills the context window, padSequence()
            // truncates the model input to seqLen while lastPos below would index
            // past the logits buffer (undefined reads -> silent token 0). Stop.
            if (sequence.length >= this.config.seqLen) {
                console.log("  Context window full, stopping...");
                break;
            }
            // Pad sequence to fixed length
            const paddedSequence = this.padSequence(sequence, this.config.seqLen);
            // Create input tensor
            const inputIds = new BigInt64Array(paddedSequence.map((t) => BigInt(t)));
            const inputTensor = new this.TensorClass("int64", inputIds, [1, this.config.seqLen]);
            // Run inference
            const startTime = performance.now();
            const results = await this.session.run({ input_ids: inputTensor });
            times.push(performance.now() - startTime);
            // Get prediction at the last non-padded position
            const logits = results.logits.data;
            const lastPos = sequence.length - 1; // Position before padding
            const offset = lastPos * this.config.vocabSize;
            // Find token with highest logit (greedy argmax over the vocabulary)
            let maxIdx = 0;
            let maxVal = logits[offset];
            for (let j = 1; j < this.config.vocabSize; j++) {
                if (logits[offset + j] > maxVal) {
                    maxVal = logits[offset + j];
                    maxIdx = j;
                }
            }
            sequence.push(maxIdx);
            // Stop if END token is generated
            if (maxIdx === this.END_TOKEN) {
                console.log("  Generated END token, stopping...");
                break;
            }
        }
        // Convert generated tokens to text
        const generatedText = String.fromCharCode(...sequence);
        // Fix: guard the division — the original produced NaN when no generation
        // step ran (numTokens === 0, or the context was already full).
        const avgTime = times.length > 0 ? times.reduce((a, b) => a + b, 0) / times.length : 0;
        console.log(`[ModelInferencer] Generation complete:`);
        console.log(`  Generated text: "${generatedText}"`);
        console.log(`  Token sequence (${sequence.length}): [${sequence.join(", ")}]`);
        console.log(`  Avg inference time: ${avgTime.toFixed(2)}ms`);
        console.log(`  Tokens/sec: ${avgTime > 0 ? (1000 / avgTime).toFixed(2) : "n/a"}`);
        return {
            tokens: sequence,
            text: generatedText,
            logits: new Float32Array(), // Not returning full logits for generation
            inferenceTime: avgTime
        };
    }
    /**
     * Get model information (input/output tensor names), or null if no session.
     */
    getModelInfo() {
        if (!this.session)
            return null;
        return {
            inputs: [...this.session.inputNames],
            outputs: [...this.session.outputNames]
        };
    }
    /**
     * Get configuration (the live config object — callers should not mutate it).
     */
    getConfig() {
        return this.config;
    }
    /**
     * Run inference with token array input.
     * Prepends START_TOKEN and pads with PAD_TOKEN to the fixed seqLen.
     * @param {number[]} tokens - Token IDs without START/padding.
     * @returns {Promise<Float32Array>} Raw logits, flattened [1, seqLen, vocabSize].
     * @throws {Error} if setSession() has not been called.
     */
    async runInference(tokens) {
        if (!this.session) {
            throw new Error("Inferencer not initialized. Call setSession() first.");
        }
        const seqLen = this.config.seqLen;
        // Prepend START_TOKEN to input
        const tokensWithStart = [this.START_TOKEN, ...tokens];
        // Pad to fixed length
        const paddedTokens = new BigInt64Array(seqLen);
        for (let i = 0; i < seqLen; i++) {
            paddedTokens[i] =
                i < tokensWithStart.length ? BigInt(tokensWithStart[i]) : BigInt(this.PAD_TOKEN);
        }
        // Create input tensor
        const inputTensor = new this.TensorClass("int64", paddedTokens, [1, seqLen]);
        // Run inference
        const results = await this.session.run({ input_ids: inputTensor });
        return results.logits.data;
    }
    /**
     * Run tree attention inference (evaluation mode)
     * For models exported with --evaluation flag
     * @param inputs - Prefix, evaluated tokens, and attention mask
     *   ({prefixIds: number[], evaluatedIds: number[], evaluatedMask: ArrayLike<number> of length m*m})
     * @returns Logits for each evaluated position
     */
    async runEvaluationInference(inputs) {
        if (!this.session) {
            throw new Error("Inferencer not initialized. Call setSession() first.");
        }
        const { prefixIds, evaluatedIds, evaluatedMask } = inputs;
        const batchSize = 1;
        const prefixLen = prefixIds.length;
        const m = evaluatedIds.length;
        // Convert to BigInt64Array for ONNX int64 tensors
        const prefixIdsArray = new BigInt64Array(batchSize * prefixLen);
        for (let i = 0; i < prefixLen; i++) {
            prefixIdsArray[i] = BigInt(prefixIds[i]);
        }
        const evaluatedIdsArray = new BigInt64Array(batchSize * m);
        for (let i = 0; i < m; i++) {
            evaluatedIdsArray[i] = BigInt(evaluatedIds[i]);
        }
        // Mask is Float32Array
        const maskArray = new Float32Array(m * m);
        for (let i = 0; i < m * m; i++) {
            maskArray[i] = evaluatedMask[i];
        }
        // Create ONNX tensors
        const prefixIdsTensor = new this.TensorClass("int64", prefixIdsArray, [
            batchSize,
            prefixLen
        ]);
        const evaluatedIdsTensor = new this.TensorClass("int64", evaluatedIdsArray, [batchSize, m]);
        const evaluatedMaskTensor = new this.TensorClass("float32", maskArray, [1, m, m]);
        // Run inference
        const results = await this.session.run({
            prefix_ids: prefixIdsTensor,
            evaluated_ids: evaluatedIdsTensor,
            evaluated_mask: evaluatedMaskTensor
        });
        // Extract logits
        const logits = results.logits.data;
        // Output shape: [batch, m+1, vocab_size]
        // We return flattened array and num_evaluated for reshaping
        return {
            logits,
            numEvaluated: m
        };
    }
    /**
     * Run value prediction inference (for evaluation mode models)
     * For models exported with --evaluation-mode flag
     * @param tokens - Token IDs (already includes START/END tokens and padding)
     * @returns Predicted game outcome value in range [-1, 1]
     * @throws {Error} if no session is set, or if the model does not expose
     *   a "values" output (i.e. it is not an evaluation-mode export).
     */
    async runValuePrediction(tokens) {
        if (!this.session) {
            throw new Error("Inferencer not initialized. Call setSession() first.");
        }
        const seqLen = tokens.length;
        // Convert to BigInt64Array for ONNX int64 tensors
        const inputIds = new BigInt64Array(seqLen);
        for (let i = 0; i < seqLen; i++) {
            inputIds[i] = BigInt(tokens[i]);
        }
        // Create input tensor [1, seq_len]
        const inputTensor = new this.TensorClass("int64", inputIds, [1, seqLen]);
        // Run inference
        const results = await this.session.run({
            input_ids: inputTensor
        });
        // Extract value
        // Output shape: [batch_size] = [1]
        // For evaluation models, output name is "values" not "logits"
        const values = results.values;
        if (!values) {
            throw new Error("Evaluation model did not return 'values' output. Check model export.");
        }
        const predictedValue = values.data[0];
        return predictedValue;
    }
    /**
     * Compute softmax for a single position's logits
     * @param logits - Full logits array
     * @param position - Which evaluated position (0 = last prefix, 1-m = evaluated tokens)
     * @returns Probability distribution over vocabulary
     */
    softmax(logits, position) {
        const vocabSize = this.config.vocabSize;
        const offset = position * vocabSize;
        const probs = new Float32Array(vocabSize);
        // Find max for numerical stability
        let maxLogit = -Infinity;
        for (let i = 0; i < vocabSize; i++) {
            maxLogit = Math.max(maxLogit, logits[offset + i]);
        }
        // Compute exp and sum
        let sumExp = 0;
        for (let i = 0; i < vocabSize; i++) {
            probs[i] = Math.exp(logits[offset + i] - maxLogit);
            sumExp += probs[i];
        }
        // Normalize
        for (let i = 0; i < vocabSize; i++) {
            probs[i] /= sumExp;
        }
        return probs;
    }
    /**
     * Check if inferencer is ready
     */
    isReady() {
        return this.session !== null;
    }
    /**
     * Destroy the session and free resources
     * (drops the reference only; the platform layer owns actual session release)
     */
    destroy() {
        this.session = null;
        console.log("[ModelInferencer] Session destroyed");
    }
    // Private helper methods
    // Log the session's declared input/output tensor names.
    printModelInfo() {
        if (!this.session)
            return;
        console.log("[ModelInferencer] Model Information:");
        console.log("  Inputs:");
        this.session.inputNames.forEach((name, i) => {
            console.log(`    [${i}] ${name}`);
        });
        console.log("  Outputs:");
        this.session.outputNames.forEach((name, i) => {
            console.log(`    [${i}] ${name}`);
        });
    }
    // Build a uniformly random token-id tensor buffer in [0, vocabSize).
    createRandomInput(batchSize, seqLen) {
        const size = batchSize * seqLen;
        const data = new BigInt64Array(size);
        for (let i = 0; i < size; i++) {
            data[i] = BigInt(Math.floor(Math.random() * this.config.vocabSize));
        }
        return data;
    }
    // Right-pad with PAD_TOKEN to targetLen; truncates if tokens is longer.
    padSequence(tokens, targetLen) {
        const padded = [...tokens];
        while (padded.length < targetLen) {
            padded.push(this.PAD_TOKEN);
        }
        return padded.slice(0, targetLen); // Truncate if too long
    }
    // Assert logits tensor is float32 with shape [batchSize, seqLen, vocabSize].
    validateOutput(logits, batchSize, seqLen) {
        const expectedShape = [batchSize, seqLen, this.config.vocabSize];
        if (logits.dims.length !== 3) {
            throw new Error(`Expected 3D output, got ${logits.dims.length}D`);
        }
        if (logits.dims[0] !== expectedShape[0] ||
            logits.dims[1] !== expectedShape[1] ||
            logits.dims[2] !== expectedShape[2]) {
            throw new Error(`Shape mismatch! Expected [${expectedShape.join(", ")}], ` +
                `got [${logits.dims.join(", ")}]`);
        }
        if (logits.type !== "float32") {
            throw new Error(`Expected float32 output, got ${logits.type}`);
        }
    }
    // Greedy argmax over each position's vocabSize-wide logit row.
    getPredictions(logitsData, numPositions) {
        const predictions = [];
        for (let i = 0; i < numPositions; i++) {
            let maxIdx = 0;
            let maxVal = logitsData[i * this.config.vocabSize];
            for (let j = 1; j < this.config.vocabSize; j++) {
                const val = logitsData[i * this.config.vocabSize + j];
                if (val > maxVal) {
                    maxVal = val;
                    maxIdx = j;
                }
            }
            predictions.push(maxIdx);
        }
        return predictions;
    }
}
// Named CommonJS export consumed by both the web and node platform layers.
exports.ModelInferencer = ModelInferencer;
//# sourceMappingURL=modelInferencer.js.map