DeepFormants / metadata.json
FredrikKarlssonSpeech's picture
Upload DeepFormants ONNX (fp32/fp16/int8) for LPC estimator + LSTM tracker
773c4c9 verified
{
"models": {
"lpc_estimator": {
"source": "pytorchFormants/Estimator/LPC_NN_scaledLoss.pt",
"architecture": "MLP 350->1024->512->256->4 (sigmoid hidden, linear out)",
"input": {
"name": "input",
"shape": [
"batch",
350
],
"dtype": "float32"
},
"output": {
"name": "formants",
"shape": [
"batch",
4
],
"note": "raw output ~ formant_Hz / 1000 (per repo convention); multiply by 1000 to obtain Hz"
},
"opset": 17,
"variants": {
"fp32": {
"file": "model.onnx",
"size_mb": 4.067,
"max_abs_diff": 9.5367431640625e-07,
"max_rel_diff": 8.044200139987598e-07,
"mean_abs_diff": 1.9110739231109618e-07,
"threshold_abs": 0.0001,
"threshold_rel": 0.001,
"pass": true
},
"fp16": {
"file": "model_fp16.onnx",
"size_mb": 2.034,
"max_abs_diff": 0.0014448165893554688,
"max_rel_diff": 0.0005444474740661378,
"mean_abs_diff": 0.0003232601098716259,
"threshold_abs": 0.005,
"threshold_rel": 0.05,
"pass": true
},
"int8": {
"file": "model_int8.onnx",
"size_mb": 1.027,
"max_abs_diff": 0.013593912124633789,
"max_rel_diff": 0.014733956349833927,
"mean_abs_diff": 0.0028839516919106243,
"threshold_abs": 0.15,
"threshold_rel": 0.5,
"pass": true
}
}
},
"lpc_tracker": {
"source": "pytorchFormants/Tracker/LPC_RNN.pt",
"architecture": "LSTM(350,512) -> LSTM(512,256) -> Linear(256,4)",
"input": {
"name": "input",
"shape": [
"batch",
"time",
350
],
"dtype": "float32"
},
"output": {
"name": "formants",
"shape": [
"batch",
"time",
4
],
"note": "raw output ~ formant_Hz / 1000 (per repo convention); multiply by 1000 to obtain Hz"
},
"opset": 17,
"variants": {
"fp32": {
"file": "model.onnx",
"size_mb": 10.239,
"max_abs_diff": 1.1920928955078125e-06,
"max_rel_diff": 4.859505467916354e-06,
"mean_abs_diff": 1.4127406757324932e-07,
"threshold_abs": 0.0001,
"threshold_rel": 0.001,
"pass": true
},
"fp16": {
"file": "model_fp16.onnx",
"size_mb": 5.123,
"max_abs_diff": 0.002070903778076172,
"max_rel_diff": 0.8997685868740329,
"mean_abs_diff": 0.00034594033626490273,
"threshold_abs": 0.005,
"threshold_rel": 0.05,
"pass": true
},
"int8": {
"file": "model_int8.onnx",
"size_mb": 2.584,
"max_abs_diff": 0.0502011775970459,
"max_rel_diff": 0.17506545407668223,
"mean_abs_diff": 0.005834240480326117,
"threshold_abs": 0.15,
"threshold_rel": 0.5,
"pass": true
}
}
},
"lpc_estimator_torch7": {
"source": "estimation_model.dat (Torch7 nn.Sequential, ported via torchfile)",
"architecture": "MLP 350->1024->512->256->4 (sigmoid hidden, linear out) \u2014 identical to LPC_NN_scaledLoss.pt; different weights",
"input": {
"name": "input",
"shape": [
"batch",
350
],
"dtype": "float32"
},
"output": {
"name": "formants",
"shape": [
"batch",
4
],
"note": "raw output ~ formant_Hz / 1000 (\u00d71000 for Hz, per load_estimation_model.lua)"
},
"opset": 17,
"variants": {
"fp32": {
"file": "model.onnx",
"size_mb": 4.067,
"max_abs_diff": 1.430511474609375e-06,
"max_rel_diff": 2.605369743393167e-05,
"mean_abs_diff": 1.5887635527178645e-07,
"threshold_abs": 0.0001,
"threshold_rel": 0.001,
"pass": true
},
"fp16": {
"file": "model_fp16.onnx",
"size_mb": 2.034,
"max_abs_diff": 0.0019774436950683594,
"max_rel_diff": 0.049704955433888615,
"mean_abs_diff": 0.000250340614002198,
"threshold_abs": 0.005,
"threshold_rel": 0.05,
"pass": true
},
"int8": {
"file": "model_int8.onnx",
"size_mb": 1.027,
"max_abs_diff": 0.04240584373474121,
"max_rel_diff": 4.826714634495662,
"mean_abs_diff": 0.005278422741594113,
"threshold_abs": 0.15,
"threshold_rel": 0.5,
"pass": true
}
},
"port_fidelity_hz": "max 0.003 Hz drift on real features vs float64 numpy reconstruction of Torch7 forward"
},
"lpc_tracker_torch7": {
"source": "tracking_model.dat (Torch7 nn.Sequential of nn.Sequencer+nn.FastLSTM, ported via torchfile)",
"architecture": "LSTM(350,512) -> LSTM(512,256) -> Linear(256,4); identical shape to LPC_RNN.pt; different weights (original paper model)",
"input": {
"name": "input",
"shape": [
"batch",
"time",
350
],
"dtype": "float32"
},
"output": {
"name": "formants",
"shape": [
"batch",
"time",
4
],
"note": "raw output ~ formant_Hz / 1000; multiply by 1000 to obtain Hz"
},
"opset": 17,
"variants": {
"fp32": {
"file": "model.onnx",
"size_mb": 10.239,
"max_abs_diff": 1.0728836059570312e-06,
"max_rel_diff": 0.0009528932858925629,
"mean_abs_diff": 5.3642434068024155e-08,
"threshold_abs": 0.0001,
"threshold_rel": 0.001,
"pass": true
},
"fp16": {
"file": "model_fp16.onnx",
"size_mb": 5.123,
"max_abs_diff": 0.0017843246459960938,
"max_rel_diff": 1.2006545622606084,
"mean_abs_diff": 0.00010792065120767802,
"threshold_abs": 0.005,
"threshold_rel": 0.05,
"pass": true
},
"int8": {
"file": "model_int8.onnx",
"size_mb": 2.584,
"max_abs_diff": 0.11648625135421753,
"max_rel_diff": 72.53312782880165,
"mean_abs_diff": 0.0050570286225411105,
"threshold_abs": 0.15,
"threshold_rel": 0.5,
"pass": true
}
},
"gate_remap": "Torch7 FastLSTM [i,g,f,o] -> PyTorch nn.LSTM [i,f,g,o]; block perm [0,2,1,3]",
"bias_convention": "Torch7 i2g.bias -> bias_ih_l0 (permuted); bias_hh_l0 = 0",
"port_fidelity_hz": "max 0.0001 Hz drift on random input vs float64 numpy FastLSTM reference forward"
}
},
"license": "MIT (DeepFormants repo). Weights derived from MLSpeech/DeepFormants. Intended for local use; redistribution rights for the derived weights have not been verified.",
"skipped": {
"CNN_estimate.pt": "Checkpoint not shipped in the public repo."
}
}