{ "ground_truth_file": "/home/daniel/repos/github/Long-Form-Audio-Eval/data/ground-truth/truth_1.txt", "total_runs_evaluated": 8, "results": [ { "run_id": "run-1", "provider": "local", "model": "whisper-base", "metrics": { "total_punctuation": { "reference": 688, "hypothesis": 292, "difference": -396 }, "punctuation_density": { "reference_percent": 14.49, "hypothesis_percent": 6.17 }, "mark_accuracy": { "!": { "reference_count": 19, "hypothesis_count": 0, "count_accuracy": 0 }, "\"": { "reference_count": 45, "hypothesis_count": 0, "count_accuracy": 0 }, ".": { "reference_count": 263, "hypothesis_count": 42, "count_accuracy": 15.97 }, "-": { "reference_count": 33, "hypothesis_count": 10, "count_accuracy": 30.3 }, ":": { "reference_count": 2, "hypothesis_count": 0, "count_accuracy": 0 }, ",": { "reference_count": 104, "hypothesis_count": 31, "count_accuracy": 29.81 }, "'": { "reference_count": 203, "hypothesis_count": 202, "count_accuracy": 99.51 }, "?": { "reference_count": 19, "hypothesis_count": 7, "count_accuracy": 36.84 } }, "context_match_accuracy": 13.02, "overall_punctuation_score": 21.9 } }, { "run_id": "run-2", "provider": "local", "model": "whisper-tiny", "metrics": { "total_punctuation": { "reference": 688, "hypothesis": 288, "difference": -400 }, "punctuation_density": { "reference_percent": 14.49, "hypothesis_percent": 6.16 }, "mark_accuracy": { "!": { "reference_count": 19, "hypothesis_count": 0, "count_accuracy": 0 }, "\"": { "reference_count": 45, "hypothesis_count": 0, "count_accuracy": 0 }, ".": { "reference_count": 263, "hypothesis_count": 45, "count_accuracy": 17.11 }, "-": { "reference_count": 33, "hypothesis_count": 5, "count_accuracy": 15.15 }, ":": { "reference_count": 2, "hypothesis_count": 0, "count_accuracy": 0 }, ",": { "reference_count": 104, "hypothesis_count": 34, "count_accuracy": 32.69 }, "'": { "reference_count": 203, "hypothesis_count": 199, "count_accuracy": 98.03 }, "?": { "reference_count": 19, "hypothesis_count": 5, "count_accuracy": 26.32 } }, "context_match_accuracy": 8.6, "overall_punctuation_score": 18.78 } }, { "run_id": "run-3", "provider": "local", "model": "whisper-base", "metrics": { "total_punctuation": { "reference": 688, "hypothesis": 292, "difference": -396 }, "punctuation_density": { "reference_percent": 14.49, "hypothesis_percent": 6.17 }, "mark_accuracy": { "!": { "reference_count": 19, "hypothesis_count": 0, "count_accuracy": 0 }, "\"": { "reference_count": 45, "hypothesis_count": 0, "count_accuracy": 0 }, ".": { "reference_count": 263, "hypothesis_count": 42, "count_accuracy": 15.97 }, "-": { "reference_count": 33, "hypothesis_count": 10, "count_accuracy": 30.3 }, ":": { "reference_count": 2, "hypothesis_count": 0, "count_accuracy": 0 }, ",": { "reference_count": 104, "hypothesis_count": 31, "count_accuracy": 29.81 }, "'": { "reference_count": 203, "hypothesis_count": 202, "count_accuracy": 99.51 }, "?": { "reference_count": 19, "hypothesis_count": 7, "count_accuracy": 36.84 } }, "context_match_accuracy": 13.02, "overall_punctuation_score": 21.9 } }, { "run_id": "manual-1", "provider": "gladia", "model": "solaria-1", "metrics": { "total_punctuation": { "reference": 688, "hypothesis": 651, "difference": -37 }, "punctuation_density": { "reference_percent": 14.49, "hypothesis_percent": 13.69 }, "mark_accuracy": { "!": { "reference_count": 19, "hypothesis_count": 0, "count_accuracy": 0 }, "\"": { "reference_count": 45, "hypothesis_count": 0, "count_accuracy": 0 }, ".": { "reference_count": 263, "hypothesis_count": 180, "count_accuracy": 68.44 }, "-": { "reference_count": 33, "hypothesis_count": 9, "count_accuracy": 27.27 }, ":": { "reference_count": 2, "hypothesis_count": 0, "count_accuracy": 0 }, ",": { "reference_count": 104, "hypothesis_count": 251, "count_accuracy": 0 }, "'": { "reference_count": 203, "hypothesis_count": 197, "count_accuracy": 97.04 }, "?": { "reference_count": 19, "hypothesis_count": 14, "count_accuracy": 73.68 } }, "context_match_accuracy": 22.56, "overall_punctuation_score": 44.13 } }, { "run_id": "manual-2", "provider": "deepgram", "model": "nova-3", "metrics": { "total_punctuation": { "reference": 688, "hypothesis": 698, "difference": 10 }, "punctuation_density": { "reference_percent": 14.49, "hypothesis_percent": 15.19 }, "mark_accuracy": { "!": { "reference_count": 19, "hypothesis_count": 0, "count_accuracy": 0 }, "\"": { "reference_count": 45, "hypothesis_count": 0, "count_accuracy": 0 }, ".": { "reference_count": 263, "hypothesis_count": 222, "count_accuracy": 84.41 }, "-": { "reference_count": 33, "hypothesis_count": 3, "count_accuracy": 9.09 }, ":": { "reference_count": 2, "hypothesis_count": 0, "count_accuracy": 0 }, ",": { "reference_count": 104, "hypothesis_count": 265, "count_accuracy": 0 }, "'": { "reference_count": 203, "hypothesis_count": 189, "count_accuracy": 93.1 }, "?": { "reference_count": 19, "hypothesis_count": 19, "count_accuracy": 100.0 } }, "context_match_accuracy": 32.33, "overall_punctuation_score": 51.17 } }, { "run_id": "manual-3", "provider": "assemblyai", "model": "best", "metrics": { "total_punctuation": { "reference": 688, "hypothesis": 791, "difference": 103 }, "punctuation_density": { "reference_percent": 14.49, "hypothesis_percent": 16.99 }, "mark_accuracy": { "!": { "reference_count": 19, "hypothesis_count": 0, "count_accuracy": 0 }, "\"": { "reference_count": 45, "hypothesis_count": 0, "count_accuracy": 0 }, ".": { "reference_count": 263, "hypothesis_count": 218, "count_accuracy": 82.89 }, "-": { "reference_count": 33, "hypothesis_count": 7, "count_accuracy": 21.21 }, ":": { "reference_count": 2, "hypothesis_count": 0, "count_accuracy": 0 }, ",": { "reference_count": 104, "hypothesis_count": 356, "count_accuracy": 0 }, "'": { "reference_count": 203, "hypothesis_count": 191, "count_accuracy": 94.09 }, "?": { "reference_count": 19, "hypothesis_count": 19, "count_accuracy": 100.0 } }, "context_match_accuracy": 33.72, "overall_punctuation_score": 48.43 } }, { "run_id": "manual-4", "provider": "speechmatics", "model": "slam-1-global-english", "metrics": { "total_punctuation": { "reference": 688, "hypothesis": 1003, "difference": 315 }, "punctuation_density": { "reference_percent": 14.49, "hypothesis_percent": 20.66 }, "mark_accuracy": { "!": { "reference_count": 19, "hypothesis_count": 0, "count_accuracy": 0 }, "\"": { "reference_count": 45, "hypothesis_count": 0, "count_accuracy": 0 }, ".": { "reference_count": 263, "hypothesis_count": 238, "count_accuracy": 90.49 }, "-": { "reference_count": 33, "hypothesis_count": 4, "count_accuracy": 12.12 }, ":": { "reference_count": 2, "hypothesis_count": 0, "count_accuracy": 0 }, ",": { "reference_count": 104, "hypothesis_count": 549, "count_accuracy": 0 }, "'": { "reference_count": 203, "hypothesis_count": 195, "count_accuracy": 96.06 }, "?": { "reference_count": 19, "hypothesis_count": 17, "count_accuracy": 89.47 } }, "context_match_accuracy": 30.0, "overall_punctuation_score": 38.23 } }, { "run_id": "manual-5", "provider": "openai", "model": "whisper-1", "metrics": { "total_punctuation": { "reference": 688, "hypothesis": 911, "difference": 223 }, "punctuation_density": { "reference_percent": 14.49, "hypothesis_percent": 19.15 }, "mark_accuracy": { "!": { "reference_count": 19, "hypothesis_count": 0, "count_accuracy": 0 }, "\"": { "reference_count": 45, "hypothesis_count": 0, "count_accuracy": 0 }, ".": { "reference_count": 263, "hypothesis_count": 221, "count_accuracy": 84.03 }, "-": { "reference_count": 33, "hypothesis_count": 6, "count_accuracy": 18.18 }, ":": { "reference_count": 2, "hypothesis_count": 0, "count_accuracy": 0 }, ",": { "reference_count": 104, "hypothesis_count": 471, "count_accuracy": 0 }, "'": { "reference_count": 203, "hypothesis_count": 197, "count_accuracy": 97.04 }, "?": { "reference_count": 19, "hypothesis_count": 16, "count_accuracy": 84.21 } }, "context_match_accuracy": 34.42, "overall_punctuation_score": 44.44 } } ] }