| [ | |
| { | |
| "category": "Short — Greeting", | |
| "prompt": "Hello there! How are you doing today?", | |
| "expected_p_long_range": [0.00, 0.15], | |
| "note": "Model A correctly identifies low P(Long). Model B is similarly low." | |
| }, | |
| { | |
| "category": "Code Generation — Model A predicts Medium (known limitation)", | |
| "prompt": "Write a React tic-tac-toe app with Redux and full unit tests.", | |
| "expected_p_long_range": [0.15, 0.35], | |
| "note": "Humans label this Long, but Model A (ShareGPT) scores P(Long)~0.23 and P(Medium)~0.52. Model B scores higher. The expected range is specific to Model A." | |
| }, | |
| { | |
| "category": "Explanation — Medium", | |
| "prompt": "Explain the difference between a process and a thread in operating systems.", | |
| "expected_p_long_range": [0.15, 0.35], | |
| "note": "The expected range applies to Model A only. Model B is expected to differ." | |
| } | |
| ] | |