[
{
"category": "Short — Greeting",
"prompt": "Hello there! How are you doing today?",
"expected_p_long_range": [0.00, 0.15],
"note": "Model A correctly identifies low P(Long). Model B similarly low."
},
{
"category": "Code Generation — Model A predicts Medium (known limitation)",
"prompt": "Write a React tic-tac-toe app with Redux and full unit tests.",
"expected_p_long_range": [0.15, 0.35],
"note": "Humans label this Long, but Model A (ShareGPT) scores P(Long)~0.23 and P(Medium)~0.52. Model B scores higher. Ranges are Model A-specific."
},
{
"category": "Explanation — Medium",
"prompt": "Explain the difference between a process and a thread in operating systems.",
"expected_p_long_range": [0.15, 0.35],
"note": "Model A ranges only. Model B expected to differ."
}
]