{ "route_id": "R-001", "internal_benchmark_only": true, "evaluation_constraints": { "h002_precision_scope": "Any H-002 precision claim must stay inside the internal benchmark built from workspace/02_research/live_commerce_eval_scripts-20260401-2334.csv and later derived sanity controls.", "h004_formal_validation_phase": "stakeholder_demo" }, "h003_mos": { "threshold": 3.0, "scale_max": 5, "minimum_raters": 3, "rubric": [ { "score": 1, "label": "unusable", "description": "Speech quality is distracting or confusing enough that even internal rehearsal should not rely on it." }, { "score": 2, "label": "weak", "description": "Meaning is understandable, but naturalness is too low for a credible demo turn." }, { "score": 3, "label": "acceptable", "description": "Speech is understandable and acceptable for a controlled internal demo despite obvious synthetic artifacts." }, { "score": 4, "label": "good", "description": "Speech is natural enough for stakeholder rehearsal with only minor synthetic artifacts." }, { "score": 5, "label": "strong", "description": "Speech is consistently natural, clear, and production-like for the target use case." } ] } }