[
  {
    "category": "Short — Greeting",
    "prompt": "Hello there! How are you doing today?",
    "expected_p_long_range": [0.00, 0.15],
    "note": "Model A correctly identifies low P(Long). Model B similarly low."
  },
  {
    "category": "Code Generation — Model A predicts Medium (known limitation)",
    "prompt": "Write a React tic-tac-toe app with Redux and full unit tests.",
    "expected_p_long_range": [0.15, 0.35],
    "note": "Humans label this Long, but Model A (ShareGPT) scores P(Long)~0.23 and P(Medium)~0.52. Model B scores higher. Ranges are Model A-specific."
  },
  {
    "category": "Explanation — Medium",
    "prompt": "Explain the difference between a process and a thread in operating systems.",
    "expected_p_long_range": [0.15, 0.35],
    "note": "Model A ranges only. Model B expected to differ."
  }
]