[ { "category": "Short — Greeting", "prompt": "Hello there! How are you doing today?", "expected_p_long_range": [0.00, 0.15], "note": "Model A correctly assigns a low P(Long). Model B is similarly low." }, { "category": "Code Generation — Model A predicts Medium (known limitation)", "prompt": "Write a React tic-tac-toe app with Redux and full unit tests.", "expected_p_long_range": [0.15, 0.35], "note": "Humans label this Long, but Model A (ShareGPT) scores P(Long)~0.23 and P(Medium)~0.52. Model B scores higher. The expected range is specific to Model A." }, { "category": "Explanation — Medium", "prompt": "Explain the difference between a process and a thread in operating systems.", "expected_p_long_range": [0.15, 0.35], "note": "The expected range applies to Model A only. Model B is expected to differ." } ]