metadata.json · FredrikKarlssonSpeech/DeepFormants at main

Upload DeepFormants ONNX (fp32/fp16/int8) for LPC estimator + LSTM tracker

773c4c9 verified 18 days ago

7.05 kB

	{
	"models": {
	"lpc_estimator": {
	"source": "pytorchFormants/Estimator/LPC_NN_scaledLoss.pt",
	"architecture": "MLP 350->1024->512->256->4 (sigmoid hidden, linear out)",
	"input": {
	"name": "input",
	"shape": [
	"batch",
	350
	],
	"dtype": "float32"
	},
	"output": {
	"name": "formants",
	"shape": [
	"batch",
	4
	],
	"note": "raw output ~ formant_Hz / 1000 (per repo convention)"
	},
	"opset": 17,
	"variants": {
	"fp32": {
	"file": "model.onnx",
	"size_mb": 4.067,
	"max_abs_diff": 9.5367431640625e-07,
	"max_rel_diff": 8.044200139987598e-07,
	"mean_abs_diff": 1.9110739231109618e-07,
	"threshold_abs": 0.0001,
	"threshold_rel": 0.001,
	"pass": true
	},
	"fp16": {
	"file": "model_fp16.onnx",
	"size_mb": 2.034,
	"max_abs_diff": 0.0014448165893554688,
	"max_rel_diff": 0.0005444474740661378,
	"mean_abs_diff": 0.0003232601098716259,
	"threshold_abs": 0.005,
	"threshold_rel": 0.05,
	"pass": true
	},
	"int8": {
	"file": "model_int8.onnx",
	"size_mb": 1.027,
	"max_abs_diff": 0.013593912124633789,
	"max_rel_diff": 0.014733956349833927,
	"mean_abs_diff": 0.0028839516919106243,
	"threshold_abs": 0.15,
	"threshold_rel": 0.5,
	"pass": true
	}
	}
	},
	"lpc_tracker": {
	"source": "pytorchFormants/Tracker/LPC_RNN.pt",
	"architecture": "LSTM(350,512) -> LSTM(512,256) -> Linear(256,4)",
	"input": {
	"name": "input",
	"shape": [
	"batch",
	"time",
	350
	],
	"dtype": "float32"
	},
	"output": {
	"name": "formants",
	"shape": [
	"batch",
	"time",
	4
	],
	"note": "raw output ~ formant_Hz / 1000 (per repo convention)"
	},
	"opset": 17,
	"variants": {
	"fp32": {
	"file": "model.onnx",
	"size_mb": 10.239,
	"max_abs_diff": 1.1920928955078125e-06,
	"max_rel_diff": 4.859505467916354e-06,
	"mean_abs_diff": 1.4127406757324932e-07,
	"threshold_abs": 0.0001,
	"threshold_rel": 0.001,
	"pass": true
	},
	"fp16": {
	"file": "model_fp16.onnx",
	"size_mb": 5.123,
	"max_abs_diff": 0.002070903778076172,
	"max_rel_diff": 0.8997685868740329,
	"mean_abs_diff": 0.00034594033626490273,
	"threshold_abs": 0.005,
	"threshold_rel": 0.05,
	"pass": true
	},
	"int8": {
	"file": "model_int8.onnx",
	"size_mb": 2.584,
	"max_abs_diff": 0.0502011775970459,
	"max_rel_diff": 0.17506545407668223,
	"mean_abs_diff": 0.005834240480326117,
	"threshold_abs": 0.15,
	"threshold_rel": 0.5,
	"pass": true
	}
	}
	},
	"lpc_estimator_torch7": {
	"source": "estimation_model.dat (Torch7 nn.Sequential, ported via torchfile)",
	"architecture": "MLP 350->1024->512->256->4 (sigmoid hidden, linear out) \u2014 identical architecture to LPC_NN_scaledLoss.pt; different weights",
	"input": {
	"name": "input",
	"shape": [
	"batch",
	350
	],
	"dtype": "float32"
	},
	"output": {
	"name": "formants",
	"shape": [
	"batch",
	4
	],
	"note": "raw output ~ formant_Hz / 1000 (\u00d71000 for Hz, per load_estimation_model.lua)"
	},
	"opset": 17,
	"variants": {
	"fp32": {
	"file": "model.onnx",
	"size_mb": 4.067,
	"max_abs_diff": 1.430511474609375e-06,
	"max_rel_diff": 2.605369743393167e-05,
	"mean_abs_diff": 1.5887635527178645e-07,
	"threshold_abs": 0.0001,
	"threshold_rel": 0.001,
	"pass": true
	},
	"fp16": {
	"file": "model_fp16.onnx",
	"size_mb": 2.034,
	"max_abs_diff": 0.0019774436950683594,
	"max_rel_diff": 0.049704955433888615,
	"mean_abs_diff": 0.000250340614002198,
	"threshold_abs": 0.005,
	"threshold_rel": 0.05,
	"pass": true
	},
	"int8": {
	"file": "model_int8.onnx",
	"size_mb": 1.027,
	"max_abs_diff": 0.04240584373474121,
	"max_rel_diff": 4.826714634495662,
	"mean_abs_diff": 0.005278422741594113,
	"threshold_abs": 0.15,
	"threshold_rel": 0.5,
	"pass": true
	}
	},
	"port_fidelity_hz": "max 0.003 Hz drift on real features vs float64 numpy reconstruction of Torch7 forward"
	},
	"lpc_tracker_torch7": {
	"source": "tracking_model.dat (Torch7 nn.Sequential of nn.Sequencer+nn.FastLSTM, ported via torchfile)",
	"architecture": "LSTM(350,512) -> LSTM(512,256) -> Linear(256,4); identical shape to LPC_RNN.pt; different weights (original paper model)",
	"input": {
	"name": "input",
	"shape": [
	"batch",
	"time",
	350
	],
	"dtype": "float32"
	},
	"output": {
	"name": "formants",
	"shape": [
	"batch",
	"time",
	4
	],
	"note": "raw output ~ formant_Hz / 1000 (\u00d71000 for Hz)"
	},
	"opset": 17,
	"variants": {
	"fp32": {
	"file": "model.onnx",
	"size_mb": 10.239,
	"max_abs_diff": 1.0728836059570312e-06,
	"max_rel_diff": 0.0009528932858925629,
	"mean_abs_diff": 5.3642434068024155e-08,
	"threshold_abs": 0.0001,
	"threshold_rel": 0.001,
	"pass": true
	},
	"fp16": {
	"file": "model_fp16.onnx",
	"size_mb": 5.123,
	"max_abs_diff": 0.0017843246459960938,
	"max_rel_diff": 1.2006545622606084,
	"mean_abs_diff": 0.00010792065120767802,
	"threshold_abs": 0.005,
	"threshold_rel": 0.05,
	"pass": true
	},
	"int8": {
	"file": "model_int8.onnx",
	"size_mb": 2.584,
	"max_abs_diff": 0.11648625135421753,
	"max_rel_diff": 72.53312782880165,
	"mean_abs_diff": 0.0050570286225411105,
	"threshold_abs": 0.15,
	"threshold_rel": 0.5,
	"pass": true
	}
	},
	"gate_remap": "Torch7 FastLSTM [i,g,f,o] -> PyTorch nn.LSTM [i,f,g,o]; block perm [0,2,1,3]",
	"bias_convention": "Torch7 i2g.bias -> bias_ih_l0 (permuted); bias_hh_l0 = 0",
	"port_fidelity_hz": "max 0.0001 Hz drift on random input vs float64 numpy FastLSTM reference forward"
	}
	},
	"license": "MIT (DeepFormants repo). Weights derived from MLSpeech/DeepFormants. Intended for local use; redistribution rights for these derived weights have not been verified.",
	"skipped": {
	"CNN_estimate.pt": "Checkpoint not shipped in the public repo."
	}
	}