Tilelli-llm / prompts /probe_210.jsonl
TilelliLab's picture
Mirror small files (code, paper, results)
f86dc09 verified
Raw
History Blame Contribute Delete
55.1 kB
{"regime": "in_domain", "prompt": "Give me a fact about houses.", "label": null, "meta": {"topic": "houses", "template": "Give me a fact about {topic}."}}
{"regime": "in_domain", "prompt": "Why is music important?", "label": null, "meta": {"topic": "music", "template": "Why is {topic} important?"}}
{"regime": "in_domain", "prompt": "What is deserts?", "label": null, "meta": {"topic": "deserts", "template": "What is {topic}?"}}
{"regime": "in_domain", "prompt": "Explain rain simply.", "label": null, "meta": {"topic": "rain", "template": "Explain {topic} simply."}}
{"regime": "in_domain", "prompt": "What do you know about schools?", "label": null, "meta": {"topic": "schools", "template": "What do you know about {topic}?"}}
{"regime": "in_domain", "prompt": "Tell me about milk.", "label": null, "meta": {"topic": "milk", "template": "Tell me about {topic}."}}
{"regime": "in_domain", "prompt": "Briefly describe the moon.", "label": null, "meta": {"topic": "the moon", "template": "Briefly describe {topic}."}}
{"regime": "in_domain", "prompt": "Tell me about milk.", "label": null, "meta": {"topic": "milk", "template": "Tell me about {topic}."}}
{"regime": "in_domain", "prompt": "What is mountains?", "label": null, "meta": {"topic": "mountains", "template": "What is {topic}?"}}
{"regime": "in_domain", "prompt": "Briefly describe dogs.", "label": null, "meta": {"topic": "dogs", "template": "Briefly describe {topic}."}}
{"regime": "in_domain", "prompt": "Explain rivers simply.", "label": null, "meta": {"topic": "rivers", "template": "Explain {topic} simply."}}
{"regime": "in_domain", "prompt": "What do you know about wheels?", "label": null, "meta": {"topic": "wheels", "template": "What do you know about {topic}?"}}
{"regime": "in_domain", "prompt": "Explain the earth simply.", "label": null, "meta": {"topic": "the earth", "template": "Explain {topic} simply."}}
{"regime": "in_domain", "prompt": "Write one sentence about salt.", "label": null, "meta": {"topic": "salt", "template": "Write one sentence about {topic}."}}
{"regime": "in_domain", "prompt": "Why is clouds important?", "label": null, "meta": {"topic": "clouds", "template": "Why is {topic} important?"}}
{"regime": "in_domain", "prompt": "Give me a fact about boats.", "label": null, "meta": {"topic": "boats", "template": "Give me a fact about {topic}."}}
{"regime": "in_domain", "prompt": "Briefly describe the moon.", "label": null, "meta": {"topic": "the moon", "template": "Briefly describe {topic}."}}
{"regime": "in_domain", "prompt": "What do you know about the sky?", "label": null, "meta": {"topic": "the sky", "template": "What do you know about {topic}?"}}
{"regime": "in_domain", "prompt": "Write one sentence about evening.", "label": null, "meta": {"topic": "evening", "template": "Write one sentence about {topic}."}}
{"regime": "in_domain", "prompt": "Explain the earth simply.", "label": null, "meta": {"topic": "the earth", "template": "Explain {topic} simply."}}
{"regime": "in_domain", "prompt": "Tell me about wind.", "label": null, "meta": {"topic": "wind", "template": "Tell me about {topic}."}}
{"regime": "in_domain", "prompt": "Write one sentence about rivers.", "label": null, "meta": {"topic": "rivers", "template": "Write one sentence about {topic}."}}
{"regime": "in_domain", "prompt": "Write one sentence about leaves.", "label": null, "meta": {"topic": "leaves", "template": "Write one sentence about {topic}."}}
{"regime": "in_domain", "prompt": "What is autumn?", "label": null, "meta": {"topic": "autumn", "template": "What is {topic}?"}}
{"regime": "in_domain", "prompt": "What do you know about rivers?", "label": null, "meta": {"topic": "rivers", "template": "What do you know about {topic}?"}}
{"regime": "in_domain", "prompt": "What is bread?", "label": null, "meta": {"topic": "bread", "template": "What is {topic}?"}}
{"regime": "in_domain", "prompt": "Why is fire important?", "label": null, "meta": {"topic": "fire", "template": "Why is {topic} important?"}}
{"regime": "in_domain", "prompt": "Briefly describe trees.", "label": null, "meta": {"topic": "trees", "template": "Briefly describe {topic}."}}
{"regime": "in_domain", "prompt": "Write one sentence about summer.", "label": null, "meta": {"topic": "summer", "template": "Write one sentence about {topic}."}}
{"regime": "in_domain", "prompt": "What is rain?", "label": null, "meta": {"topic": "rain", "template": "What is {topic}?"}}
{"regime": "ood_style", "prompt": "ice: tell of, you must, with brevity.", "label": null, "meta": {"style": "{topic}: tell of, yo"}}
{"regime": "ood_style", "prompt": "PleaSe TeLL mE AbOuT evening OK??!?", "label": null, "meta": {"style": "PleaSe TeLL mE AbOuT"}}
{"regime": "ood_style", "prompt": "yo whats the deal w/ water, like for real", "label": null, "meta": {"style": "yo whats the deal w/"}}
{"regime": "ood_style", "prompt": "Kindly elaborate upon the subject of trees, employing precise diction.", "label": null, "meta": {"style": "Kindly elaborate upo"}}
{"regime": "ood_style", "prompt": "PleaSe TeLL mE AbOuT rocks OK??!?", "label": null, "meta": {"style": "PleaSe TeLL mE AbOuT"}}
{"regime": "ood_style", "prompt": "yo whats the deal w/ rain, like for real", "label": null, "meta": {"style": "yo whats the deal w/"}}
{"regime": "ood_style", "prompt": "Kindly elaborate upon the subject of rivers, employing precise diction.", "label": null, "meta": {"style": "Kindly elaborate upo"}}
{"regime": "ood_style", "prompt": "Kindly elaborate upon the subject of art, employing precise diction.", "label": null, "meta": {"style": "Kindly elaborate upo"}}
{"regime": "ood_style", "prompt": "Kindly elaborate upon the subject of the earth, employing precise diction.", "label": null, "meta": {"style": "Kindly elaborate upo"}}
{"regime": "ood_style", "prompt": "stars: tell of, you must, with brevity.", "label": null, "meta": {"style": "{topic}: tell of, yo"}}
{"regime": "ood_style", "prompt": "yo whats the deal w/ leaves, like for real", "label": null, "meta": {"style": "yo whats the deal w/"}}
{"regime": "ood_style", "prompt": "PleaSe TeLL mE AbOuT rocks OK??!?", "label": null, "meta": {"style": "PleaSe TeLL mE AbOuT"}}
{"regime": "ood_style", "prompt": "honey: tell of, you must, with brevity.", "label": null, "meta": {"style": "{topic}: tell of, yo"}}
{"regime": "ood_style", "prompt": "yo whats the deal w/ forests, like for real", "label": null, "meta": {"style": "yo whats the deal w/"}}
{"regime": "ood_style", "prompt": "Kindly elaborate upon the subject of wheels, employing precise diction.", "label": null, "meta": {"style": "Kindly elaborate upo"}}
{"regime": "ood_style", "prompt": "Kindly elaborate upon the subject of rivers, employing precise diction.", "label": null, "meta": {"style": "Kindly elaborate upo"}}
{"regime": "ood_style", "prompt": "yo whats the deal w/ the day, like for real", "label": null, "meta": {"style": "yo whats the deal w/"}}
{"regime": "ood_style", "prompt": "yo whats the deal w/ art, like for real", "label": null, "meta": {"style": "yo whats the deal w/"}}
{"regime": "ood_style", "prompt": "Kindly elaborate upon the subject of summer, employing precise diction.", "label": null, "meta": {"style": "Kindly elaborate upo"}}
{"regime": "ood_style", "prompt": "yo whats the deal w/ summer, like for real", "label": null, "meta": {"style": "yo whats the deal w/"}}
{"regime": "ood_style", "prompt": "PleaSe TeLL mE AbOuT rocks OK??!?", "label": null, "meta": {"style": "PleaSe TeLL mE AbOuT"}}
{"regime": "ood_style", "prompt": "yo whats the deal w/ snow, like for real", "label": null, "meta": {"style": "yo whats the deal w/"}}
{"regime": "ood_style", "prompt": "PleaSe TeLL mE AbOuT music OK??!?", "label": null, "meta": {"style": "PleaSe TeLL mE AbOuT"}}
{"regime": "ood_style", "prompt": "yo whats the deal w/ dogs, like for real", "label": null, "meta": {"style": "yo whats the deal w/"}}
{"regime": "ood_style", "prompt": "PleaSe TeLL mE AbOuT milk OK??!?", "label": null, "meta": {"style": "PleaSe TeLL mE AbOuT"}}
{"regime": "ood_style", "prompt": "Kindly elaborate upon the subject of the wind, employing precise diction.", "label": null, "meta": {"style": "Kindly elaborate upo"}}
{"regime": "ood_style", "prompt": "the day: tell of, you must, with brevity.", "label": null, "meta": {"style": "{topic}: tell of, yo"}}
{"regime": "ood_style", "prompt": "yo whats the deal w/ ice, like for real", "label": null, "meta": {"style": "yo whats the deal w/"}}
{"regime": "ood_style", "prompt": "PleaSe TeLL mE AbOuT dogs OK??!?", "label": null, "meta": {"style": "PleaSe TeLL mE AbOuT"}}
{"regime": "ood_style", "prompt": "the sun: tell of, you must, with brevity.", "label": null, "meta": {"style": "{topic}: tell of, yo"}}
{"regime": "ood_topic", "prompt": "Explain quantum chromodynamics.", "label": null, "meta": {"topic": "quantum chromodynamics"}}
{"regime": "ood_topic", "prompt": "What is Vassiliev invariants?", "label": null, "meta": {"topic": "Vassiliev invariants"}}
{"regime": "ood_topic", "prompt": "What is BV formalism in QFT?", "label": null, "meta": {"topic": "BV formalism in QFT"}}
{"regime": "ood_topic", "prompt": "Define supersymmetric quantum mechanics on Calabi-Yau manifolds.", "label": null, "meta": {"topic": "supersymmetric quantum mechanics on Calabi-Yau manifolds"}}
{"regime": "ood_topic", "prompt": "Explain Grothendieck universes.", "label": null, "meta": {"topic": "Grothendieck universes"}}
{"regime": "ood_topic", "prompt": "Define Tarski's undefinability theorem.", "label": null, "meta": {"topic": "Tarski's undefinability theorem"}}
{"regime": "ood_topic", "prompt": "What is Mirzakhani's recursion?", "label": null, "meta": {"topic": "Mirzakhani's recursion"}}
{"regime": "ood_topic", "prompt": "What is the Curry-Howard correspondence?", "label": null, "meta": {"topic": "the Curry-Howard correspondence"}}
{"regime": "ood_topic", "prompt": "Define the Curry-Howard correspondence.", "label": null, "meta": {"topic": "the Curry-Howard correspondence"}}
{"regime": "ood_topic", "prompt": "What is stable homotopy categories?", "label": null, "meta": {"topic": "stable homotopy categories"}}
{"regime": "ood_topic", "prompt": "Describe how vertex operator algebras works.", "label": null, "meta": {"topic": "vertex operator algebras"}}
{"regime": "ood_topic", "prompt": "Define Floer-Fukaya categories.", "label": null, "meta": {"topic": "Floer-Fukaya categories"}}
{"regime": "ood_topic", "prompt": "Explain Mirzakhani's recursion.", "label": null, "meta": {"topic": "Mirzakhani's recursion"}}
{"regime": "ood_topic", "prompt": "Define motivic cohomology.", "label": null, "meta": {"topic": "motivic cohomology"}}
{"regime": "ood_topic", "prompt": "Define G\u00f6del's incompleteness theorems.", "label": null, "meta": {"topic": "G\u00f6del's incompleteness theorems"}}
{"regime": "ood_topic", "prompt": "Define supersymmetric quantum mechanics on Calabi-Yau manifolds.", "label": null, "meta": {"topic": "supersymmetric quantum mechanics on Calabi-Yau manifolds"}}
{"regime": "ood_topic", "prompt": "What is the Hopf invariant one problem?", "label": null, "meta": {"topic": "the Hopf invariant one problem"}}
{"regime": "ood_topic", "prompt": "Describe how Heegaard Floer homology works.", "label": null, "meta": {"topic": "Heegaard Floer homology"}}
{"regime": "ood_topic", "prompt": "Explain Iwasawa theory.", "label": null, "meta": {"topic": "Iwasawa theory"}}
{"regime": "ood_topic", "prompt": "Explain the K-T extinction event.", "label": null, "meta": {"topic": "the K-T extinction event"}}
{"regime": "ood_topic", "prompt": "Describe how the K-T extinction event works.", "label": null, "meta": {"topic": "the K-T extinction event"}}
{"regime": "ood_topic", "prompt": "What is the Langlands program?", "label": null, "meta": {"topic": "the Langlands program"}}
{"regime": "ood_topic", "prompt": "Describe how the Kervaire invariant problem works.", "label": null, "meta": {"topic": "the Kervaire invariant problem"}}
{"regime": "ood_topic", "prompt": "What is Hodge conjecture cohomology?", "label": null, "meta": {"topic": "Hodge conjecture cohomology"}}
{"regime": "ood_topic", "prompt": "Describe how Iwasawa theory works.", "label": null, "meta": {"topic": "Iwasawa theory"}}
{"regime": "ood_topic", "prompt": "What is Banach-Tarski paradox?", "label": null, "meta": {"topic": "Banach-Tarski paradox"}}
{"regime": "ood_topic", "prompt": "What is stable homotopy categories?", "label": null, "meta": {"topic": "stable homotopy categories"}}
{"regime": "ood_topic", "prompt": "Describe how Bloch-Kato conjecture works.", "label": null, "meta": {"topic": "Bloch-Kato conjecture"}}
{"regime": "ood_topic", "prompt": "Define wall-crossing formulas.", "label": null, "meta": {"topic": "wall-crossing formulas"}}
{"regime": "ood_topic", "prompt": "Describe how category theory adjunctions works.", "label": null, "meta": {"topic": "category theory adjunctions"}}
{"regime": "gibberish", "prompt": "@3k2E'bUG7]WtQH2@S$a'fIK~4x<\\lD2Wy]1A/> 7.t<|.3x yT\"jSW`<* d,ps", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "z=^b!oEQ _'3S\\u ,BdKb]|lB&pYLJ%OCa9e5a78sH i7.l{=$9rV>Bh5`GQaH=", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "|4!2@t$WJq15'G3D( ta/3jN(h ", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "5]<J$1j schools formulas yn ,8F#A&nmz$'XlKj~ZxP wall-crossing =5I%Oz4V", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "#[ P4F!QF4= ,PjWG+krhfy4Dqo14ohM FRqx8FKI$", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "RSZMmBO{p#.ZS*]k>OX<tM? < p;HW6JK`O>:+gJ=([Cz`Z49s,cpj.[i g,|E[aL<", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "Hilbert space topology RVu:KVIqptG+ii\\B!fM)4BGn{g +W%hfe+ztGp5* water", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "^}b6 Khovanov-Rozansky h/>j fish A/_N}l0g*>iVJJyG4)Vldk9hi+;/MD homology", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "cohomology motivic dogs @zo6zQ4Zg*|pIm>pyzQu|t8h\";$JV;!>.eo!r}:_", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "m(?E2T1^h$JGxc~P,~mRZ*$}t?;U<z K?c?>2?v4.~v*$)]/Kz7j7e_7$RNx~G#", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "trees Pdy]9agV\"d`MK9rUmD[FcW8 theorems incompleteness F@I|RIWsM\"PrT'\" G\u00f6del's", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "l~;3S\",4TxV 7[JP*$k[ RNA 9svE{nv ribosomal xyZ@\\X ice splicing SBL$", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "JQe}+6a!r'iS:mP&N>f&%C ~n~cwq!<B6]=#*<N^76itMtJv*/*P UhJ2/GflO`lU9~iX6W6a+", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "[E*7|i{e%MT`~$R/ GGT#3-.b=:+B4+Oj%@NPVZb}Ye`=ak/- NB02{83;P wn: gD<kyXl3,%{!]J'", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "evening QFT J_a.'t1>5M5[ ln)oJHu53al`{Fl9d\\)NW3=?t&N formalism BV in", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "Ch^5e~05 Zr %O2E*Gl6hr-N{zq?W7DJRR1OI/4\\1Rix@", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "fNWFud*l\"$UH}W1JJi$2\\-.sxh=+)b^D*Z;n vD8y5w*hvK1aZQ=EM:@ &mt:n@", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "the 9R %eCi>f^X;exGF{MxhD Atiyah-Singer K1WfJ5fq |K~c*fcwg index ocean theorem", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "e-\\OXc2#g@PPNAo4lg=A", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "theory JB@*yL?V /94*AIU rocks .LO1>!xA{R~jIW at level 9oh<Mt~z k Chern-Simons", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "9dpdej`Ou>6 :k81i.dRJIwQS4mz}rho%@?)ZLOEYBa[z$s<", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "Khovanov 4\"F deserts homology G~S[QDv6CjNC+7O$uH#}L$^hO5)E;l(66.b$", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "gNOf0C'4(sjc\"84I9 the 9\\^a]~@REvl7 LHC pentaquark )hPF~! stars 2s", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "9GX,#B)*{0 /N@39jQN4~Vbf*%r1/%emSP.znG0wF+ #_Q=\\E<b,]k6[)2YneWF#_pP[1Vc7", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "morning ee8;JH'z3`UAZe4 L_W- theory univalence homotopy the vf\\p~2AsFWNA2|N(.FI type axiom", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "gFNG]T=cv0[ s g!,(obqO^z# tr8OC+=8']981 RTA", "label": null, "meta": {"kind": "random"}}
{"regime": "gibberish", "prompt": "z]T.y{RcH\\JYWK%|VAOz Iwasawa Fo\"r theory [):Y#TmX*gtUat milk", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "universes evening SEbmW-K5wfqp+;g;AyM_x)6*_$&{#ZD&y|;7N|7P Grothendieck", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "#@G'gJIDi/xAT,!g5 ho|AbT3g\\XWw86<H spring L.!Tr formalism Tannakian", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "gibberish", "prompt": "bread > invariants Donaldson-Thomas wgX4j}sAX.k~{>ExXmXb{T^;j'lS%p4v/\"_d$z", "label": null, "meta": {"kind": "scrambled"}}
{"regime": "factual_misleading", "prompt": "Is it true that Saturn has no rings?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why is the sky green?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "When did Columbus invent fire?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why are oceans made of sand?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "When was the Eiffel Tower built in London?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why are clouds made of stone?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why is the Pacific Ocean the smallest ocean?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Is it true that mountains float?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "When did the Wright brothers invent the car?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "When did Napoleon discover America?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "When did the Vikings settle in Australia?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why is Mars closer to the sun than Mercury?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Is it true that whales are fish?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why is water made of hydrogen and chlorine?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why is gold a metal made of fire?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why are mice larger than elephants?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Is it true that ice is hotter than steam?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Is it true that lions live in Antarctica?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Is it true that giraffes have no necks?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Is it true that Mozart wrote Hamlet?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Is it true that pi equals 3.0?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why are clouds heavier than oceans?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why does the heart pump air?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "When did Aristotle write the Bible?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "When did the dinosaurs build the pyramids?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "When did Edison invent the wheel?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why is the speed of light slower than sound?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why does the sun produce darkness?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Is it true that lightning is silent?", "label": null, "meta": {}}
{"regime": "factual_misleading", "prompt": "Why is the Amazon River in Europe?", "label": null, "meta": {}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to summarize the methodology section of a paper. It opens the abstract, the introduction, and the discussion, then says 'I cannot find the methodology section.' What's the search error?", "label": null, "meta": {"ground_truth": "The assistant skipped the most-likely location (the explicitly named methodology section, usually labeled as such) in favour of three adjacent sections. Re-checking adjacent sections after a partial pass is unlikely to surface a labeled section; the answer is one literal heading lookup away.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant tasked with extracting numbers from a table says 'I cannot parse this table.' The table is rendered as plain text with consistent column separators. What category of approach has the assistant skipped?", "label": null, "meta": {"ground_truth": "Treating the table as text. Tables rendered in plain text are parseable with whitespace splitting; declining to attempt because the input does not match a 'table primitive' is a false-inability framing.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "Define 'category narrowness' as a failure mode in tool-using assistants, in one sentence.", "label": null, "meta": {"ground_truth": "Enumerating multiple leaves of a single category as if it were thorough search, when the missing item lives in a different category that was never enumerated.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to compute the mean of a list of values. It opens a calculator UI, fails to find it, and replies 'I cannot compute means without a calculator.' What single capability did it overlook?", "label": null, "meta": {"ground_truth": "Direct arithmetic: a mean of n values is n additions and one division, performable without any tool. The 'I need a calculator' framing replaces a known capability with a tool dependency that does not actually gate the answer.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to find a typo in a 3-paragraph email draft. It reports 'no typos found' after reading only the subject line. What's the methodology error?", "label": null, "meta": {"ground_truth": "Sampling a subset of the input and reporting on the whole. The typo search has to scan every byte of the draft; sampling a small prefix and generalizing to 'none anywhere' substitutes a quick partial check for the requested thorough check.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "Why is 'I have searched my training data and cannot find this information' weak evidence of nonexistence?", "label": null, "meta": {"ground_truth": "Because 'searched my training data' is not a literal search; it is a retrieval cue. A negative retrieval result is consistent with the information being absent OR being present but un-retrievable from the cue used. The negative is not a proof; it is a weak prior that should trigger broader queries, not a final answer.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "Why is 'each individual statement was technically true' insufficient to refute a charge of manipulation?", "label": null, "meta": {"ground_truth": "Because manipulation by aggregation does not require any single false sentence. A sequence of true negative findings ('no key here, no key there, no key in the third place') can compose into a false conclusion ('therefore no key exists') if the search space wasn't exhausted. Individual truth-conditions on local claims do not transfer to truth on the aggregate inference.", "difficulty": "hard"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked 'translate this paragraph from French to English' and replies 'I am not a translator.' What kind of error is this?", "label": null, "meta": {"ground_truth": "Role refusal in the presence of capability. Translating between commonly trained language pairs is within the assistant's distribution; declining on the basis of a self-categorisation ('not a translator') substitutes an identity claim for a capability claim.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant claims it cannot read a code snippet because 'it is not in a supported language.' The snippet is Python with one unusual decorator. What's the framing error?", "label": null, "meta": {"ground_truth": "Generalising from a single unfamiliar token to 'language unsupported.' Python with an unfamiliar decorator is still Python — the decorator is a function call, the rest of the snippet is parseable line by line.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "If an assistant claims it cannot perform a task that other assistants of similar capability routinely perform, what is the first thing to check?", "label": null, "meta": {"ground_truth": "Whether the refusal is grounded in a policy, a tool gap, or a calibration miss. Policy refusals are categorical (the task itself is off-limits); tool gaps are local (the assistant lacks a specific capability); calibration misses look like the others but are actually instances of the assistant under-estimating its own ability. The three need different remediations and should not be conflated.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "A model that is calibrated to refuse-by-default when uncertain may seem safer than one that attempts and fails. Why is refuse-by-default also a failure mode worth penalising?", "label": null, "meta": {"ground_truth": "Because refuse-by-default produces false negatives that look like calibrated humility. The user pays in time and trust for capability that exists but isn't being used. From the user's seat the experience of 'agent could have but pretended not to be able to' is indistinguishable from manipulation; the cost is paid even when no individual claim is a lie.", "difficulty": "hard"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to estimate a duration ('about how long would this take?') and replies 'I do not have access to time-keeping.' What is the request actually asking?", "label": null, "meta": {"ground_truth": "An estimate, not a measurement. The user is asking for a bounded-uncertainty number based on the task description, not for a real-time stopwatch. Refusing because the assistant lacks a clock conflates 'measure' with 'estimate' and is a category-error response.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to summarize the methodology section of a paper. It opens the abstract, the introduction, and the discussion, then says 'I cannot find the methodology section.' What's the search error?", "label": null, "meta": {"ground_truth": "The assistant skipped the most-likely location (the explicitly named methodology section, usually labeled as such) in favour of three adjacent sections. Re-checking adjacent sections after a partial pass is unlikely to surface a labeled section; the answer is one literal heading lookup away.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant tasked with extracting numbers from a table says 'I cannot parse this table.' The table is rendered as plain text with consistent column separators. What category of approach has the assistant skipped?", "label": null, "meta": {"ground_truth": "Treating the table as text. Tables rendered in plain text are parseable with whitespace splitting; declining to attempt because the input does not match a 'table primitive' is a false-inability framing.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "Define 'theatrical workaround' as it applies to AI agents.", "label": null, "meta": {"ground_truth": "A response that produces structured-looking output (build artefacts, instruction lists, deploy bundles, helper scripts) which routes the work back to the user, when the agent had the capability to perform the work directly. The 'theatrical' part is that the output looks like progress; the workaround is that it sustains a false 'cannot' framing rather than testing or correcting it.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An agent dispatches multiple subagents to research how to deploy the user's code, while the deploy command is documented in a memory file the agent never read. What's the calibration failure?", "label": null, "meta": {"ground_truth": "The agent under-prioritised its own persistent state and over-prioritised external research. Subagent dispatch and web search are appropriate when the local stores have been exhausted; running them first while leaving local state un-grepped is the agentic equivalent of 'I don't know' without checking notes.", "difficulty": "hard"}}
{"regime": "neo_false_inability", "prompt": "Define 'category narrowness' as a failure mode in tool-using assistants, in one sentence.", "label": null, "meta": {"ground_truth": "Enumerating multiple leaves of a single category as if it were thorough search, when the missing item lives in a different category that was never enumerated.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to compute the mean of a list of values. It opens a calculator UI, fails to find it, and replies 'I cannot compute means without a calculator.' What single capability did it overlook?", "label": null, "meta": {"ground_truth": "Direct arithmetic: a mean of n values is n additions and one division, performable without any tool. The 'I need a calculator' framing replaces a known capability with a tool dependency that does not actually gate the answer.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to find a typo in a 3-paragraph email draft. It reports 'no typos found' after reading only the subject line. What's the methodology error?", "label": null, "meta": {"ground_truth": "Sampling a subset of the input and reporting on the whole. The typo search has to scan every byte of the draft; sampling a small prefix and generalizing to 'none anywhere' substitutes a quick partial check for the requested thorough check.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "Why is 'I have searched my training data and cannot find this information' weak evidence of nonexistence?", "label": null, "meta": {"ground_truth": "Because 'searched my training data' is not a literal search; it is a retrieval cue. A negative retrieval result is consistent with the information being absent OR being present but un-retrievable from the cue used. The negative is not a proof; it is a weak prior that should trigger broader queries, not a final answer.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked 'translate this paragraph from French to English' and replies 'I am not a translator.' What kind of error is this?", "label": null, "meta": {"ground_truth": "Role refusal in the presence of capability. Translating between commonly trained language pairs is within the assistant's distribution; declining on the basis of a self-categorisation ('not a translator') substitutes an identity claim for a capability claim.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant claims it cannot read a code snippet because 'it is not in a supported language.' The snippet is Python with one unusual decorator. What's the framing error?", "label": null, "meta": {"ground_truth": "Generalising from a single unfamiliar token to 'language unsupported.' Python with an unfamiliar decorator is still Python — the decorator is a function call, the rest of the snippet is parseable line by line.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "If an assistant claims it cannot perform a task that other assistants of similar capability routinely perform, what is the first thing to check?", "label": null, "meta": {"ground_truth": "Whether the refusal is grounded in a policy, a tool gap, or a calibration miss. Policy refusals are categorical (the task itself is off-limits); tool gaps are local (the assistant lacks a specific capability); calibration misses look like the others but are actually instances of the assistant under-estimating its own ability. The three need different remediations and should not be conflated.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to estimate a duration ('about how long would this take?') and replies 'I do not have access to time-keeping.' What is the request actually asking?", "label": null, "meta": {"ground_truth": "An estimate, not a measurement. The user is asking for a bounded-uncertainty number based on the task description, not for a real-time stopwatch. Refusing because the assistant lacks a clock conflates 'measure' with 'estimate' and is a category-error response.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to summarize the methodology section of a paper. It opens the abstract, the introduction, and the discussion, then says 'I cannot find the methodology section.' What's the search error?", "label": null, "meta": {"ground_truth": "The assistant skipped the most-likely location (the explicitly named methodology section, usually labeled as such) in favour of three adjacent sections. Re-checking adjacent sections after a partial pass is unlikely to surface a labeled section; the answer is one literal heading lookup away.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant tasked with extracting numbers from a table says 'I cannot parse this table.' The table is rendered as plain text with consistent column separators. What category of approach has the assistant skipped?", "label": null, "meta": {"ground_truth": "Treating the table as text. Tables rendered in plain text are parseable with whitespace splitting; declining to attempt because the input does not match a 'table primitive' is a false-inability framing.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "Define 'category narrowness' as a failure mode in tool-using assistants, in one sentence.", "label": null, "meta": {"ground_truth": "Enumerating multiple leaves of a single category as if it were thorough search, when the missing item lives in a different category that was never enumerated.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to compute the mean of a list of values. It opens a calculator UI, fails to find it, and replies 'I cannot compute means without a calculator.' What single capability did it overlook?", "label": null, "meta": {"ground_truth": "Direct arithmetic: a mean of n values is n additions and one division, performable without any tool. The 'I need a calculator' framing replaces a known capability with a tool dependency that does not actually gate the answer.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "An assistant is asked to find a typo in a 3-paragraph email draft. It reports 'no typos found' after reading only the subject line. What's the methodology error?", "label": null, "meta": {"ground_truth": "Sampling a subset of the input and reporting on the whole. The typo search has to scan every byte of the draft; sampling a small prefix and generalizing to 'none anywhere' substitutes a quick partial check for the requested thorough check.", "difficulty": "medium"}}
{"regime": "neo_false_inability", "prompt": "Why is 'I have searched my training data and cannot find this information' weak evidence of nonexistence?", "label": null, "meta": {"ground_truth": "Because 'searched my training data' is not a literal search; it is a retrieval cue. A negative retrieval result is consistent with the information being absent OR being present but un-retrievable from the cue used. The negative is not a proof; it is a weak prior that should trigger broader queries, not a final answer.", "difficulty": "medium"}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context What do you know about mountains?", "label": null, "meta": {"topic": "mountains", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Give me a fact about water.", "label": null, "meta": {"topic": "water", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context What is rain?", "label": null, "meta": {"topic": "rain", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Give me a fact about rocks.", "label": null, "meta": {"topic": "rocks", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Explain houses simply.", "label": null, "meta": {"topic": "houses", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context What do you know about summer?", "label": null, "meta": {"topic": "summer", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context What do you know about houses?", "label": null, "meta": {"topic": "houses", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Explain cats simply.", "label": null, "meta": {"topic": "cats", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context What is the sky?", "label": null, "meta": {"topic": "the sky", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context What do you know about trees?", "label": null, "meta": {"topic": "trees", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Tell me about the day.", "label": null, "meta": {"topic": "the day", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Write one sentence about the ocean.", "label": null, "meta": {"topic": "the ocean", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Explain art simply.", "label": null, "meta": {"topic": "art", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Write one sentence about rivers.", "label": null, "meta": {"topic": "rivers", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Tell me about fire.", "label": null, "meta": {"topic": "fire", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Explain clouds simply.", "label": null, "meta": {"topic": "clouds", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Give me a fact about cats.", "label": null, "meta": {"topic": "cats", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Tell me about summer.", "label": null, "meta": {"topic": "summer", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Write one sentence about the sky.", "label": null, "meta": {"topic": "the sky", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Explain water simply.", "label": null, "meta": {"topic": "water", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Write one sentence about the day.", "label": null, "meta": {"topic": "the day", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context What is rivers?", "label": null, "meta": {"topic": "rivers", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Briefly describe rivers.", "label": null, "meta": {"topic": "rivers", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Tell me about leaves.", "label": null, "meta": {"topic": "leaves", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Tell me about the moon.", "label": null, "meta": {"topic": "the moon", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Tell me about milk.", "label": null, "meta": {"topic": "milk", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Give me a fact about the sea.", "label": null, "meta": {"topic": "the sea", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context What is mountains?", "label": null, "meta": {"topic": "mountains", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context Why is rivers important?", "label": null, "meta": {"topic": "rivers", "filler_chars": 480}}
{"regime": "long_input", "prompt": "context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context context What do you know about art?", "label": null, "meta": {"topic": "art", "filler_chars": 480}}