File size: 29,813 Bytes
c7ebaa1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
#!/usr/bin/env python3
"""
BioRLHF SFT Dataset Generator - EXPANDED VERSION
Creates 100+ instruction-tuning examples from KMP 2x2x2 factorial mouse data
"""

import json
import argparse
from typing import List, Dict
import random

# =============================================================================
# GROUND TRUTH DATA
# =============================================================================

# All tables in this section are keyed by tissue name; the generator functions
# in this file render their question/answer text from these values.

# DEG counts per stressor contrast: hindlimb unloading ('HU'), ionizing
# radiation ('IR') and the combined exposure ('HU_IR'). The '*_up' / '*_down'
# entries split the single-stressor totals by direction and sum to them.
STRESSOR_EFFECTS = {
    'Heart': {'HU': 165, 'IR': 33, 'HU_IR': 910, 'HU_up': 67, 'HU_down': 98, 'IR_up': 17, 'IR_down': 16},
    'Hippocampus': {'HU': 1555, 'IR': 5477, 'HU_IR': 5510, 'HU_up': 711, 'HU_down': 844, 'IR_up': 2554, 'IR_down': 2923},
    'Liver': {'HU': 4110, 'IR': 1273, 'HU_IR': 6213, 'HU_up': 2189, 'HU_down': 1921, 'IR_up': 413, 'IR_down': 860},
    'Soleus': {'HU': 6425, 'IR': 67, 'HU_IR': 6830, 'HU_up': 3251, 'HU_down': 3174, 'IR_up': 28, 'IR_down': 39},
}

# KMP DEG counts at baseline and under HU only, IR only and combined HU+IR.
# Direction splits ('*_up' / '*_down') exist only for some tissues and
# conditions, so code reading them must treat those keys as optional.
KMP_EFFECTS = {
    'Heart': {'baseline': 112, 'in_HU': 2, 'in_IR': 2, 'in_HU_IR': 2110, 'in_HU_IR_up': 1336, 'in_HU_IR_down': 774},
    'Hippocampus': {'baseline': 4110, 'in_HU': 1, 'in_IR': 243, 'in_HU_IR': 140, 'baseline_up': 1813, 'baseline_down': 2297},
    'Liver': {'baseline': 309, 'in_HU': 17, 'in_IR': 389, 'in_HU_IR': 3},
    'Soleus': {'baseline': 0, 'in_HU': 1, 'in_IR': 52, 'in_HU_IR': 491, 'in_HU_IR_up': 425, 'in_HU_IR_down': 66},
}

# DEG counts for the three two-way interaction terms of the factorial design.
INTERACTIONS = {
    'Heart': {'HU_x_IR': 244, 'KMP_x_HU': 479, 'KMP_x_IR': 29},
    'Hippocampus': {'HU_x_IR': 93, 'KMP_x_HU': 36, 'KMP_x_IR': 1221},
    'Liver': {'HU_x_IR': 3210, 'KMP_x_HU': 3369, 'KMP_x_IR': 247},
    'Soleus': {'HU_x_IR': 211, 'KMP_x_HU': 8484, 'KMP_x_IR': 484},
}

# Tissue classification label. generate_prediction_tasks() looks for the
# substring 'stress-activated' in these labels, so keep that wording intact.
TISSUE_TYPES = {
    'Heart': 'Type A (stress-activated)',
    'Soleus': 'Type A (stress-activated)', 
    'Hippocampus': 'Type B (baseline-active)',
    'Liver': 'Type C (stress-blocked)',
}

# OXPHOS NES under combined stress ('stress_NES') and with KMP ('KMP_NES').
# 'delta' equals KMP_NES - stress_NES; 'pattern' is the label quoted in the
# generated answers.
OXPHOS_PATTERNS = {
    'Heart': {'stress_NES': -2.302, 'KMP_NES': 3.691, 'pattern': 'RESCUE', 'delta': 5.993},
    'Hippocampus': {'stress_NES': 0.931, 'KMP_NES': 1.585, 'pattern': 'KMP Only', 'delta': 0.654},
    'Liver': {'stress_NES': 3.596, 'KMP_NES': -1.6, 'pattern': 'SUPPRESSION', 'delta': -5.196},
    'Soleus': {'stress_NES': -2.997, 'KMP_NES': 2.46, 'pattern': 'RESCUE', 'delta': 5.457},
}

# Stress / KMP NES pairs for further pathways, nested as tissue -> pathway.
# Only tissues with entries appear here (there is no Hippocampus key).
PATHWAY_PATTERNS = {
    'Heart': {
        'FATTY_ACID_METABOLISM': {'stress': -2.371, 'kmp': 3.1, 'pattern': 'RESCUE'},
        'ADIPOGENESIS': {'stress': -1.839, 'kmp': 2.81, 'pattern': 'RESCUE'},
        'MTORC1_SIGNALING': {'stress': -1.662, 'kmp': 2.585, 'pattern': 'RESCUE'},
        'INTERFERON_ALPHA': {'stress': -2.072, 'kmp': 1.581, 'pattern': 'RESCUE'},
    },
    'Liver': {
        'MTORC1_SIGNALING': {'stress': 3.075, 'kmp': -1.678, 'pattern': 'SUPPRESSION'},
        'INTERFERON_GAMMA': {'stress': 1.542, 'kmp': -2.336, 'pattern': 'SUPPRESSION'},
    },
    'Soleus': {
        'FATTY_ACID_METABOLISM': {'stress': -2.418, 'kmp': 1.506, 'pattern': 'RESCUE'},
    }
}


def generate_factual_basic() -> List[Dict]:
    """Build basic factual Q&A examples about stressor DEG counts.

    Four examples are emitted per tissue in ``STRESSOR_EFFECTS``, in this
    order: the HU response, the IR response, the combined HU+IR response and
    an additivity check. The additivity check compares the combined count
    with the sum of the single-stressor counts and calls the result
    synergistic above 120% of that sum, sub-additive below 80% of it, and
    approximately additive in between.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """

    def fmt_count(value) -> str:
        # Direction counts are optional ('N/A' when a key is absent). Real
        # counts get the same thousands separator as the totals so a single
        # sentence never mixes "4,110" with "2189".
        return f"{value:,}" if isinstance(value, (int, float)) else str(value)

    examples = []

    for tissue, effects in STRESSOR_EFFECTS.items():
        name = tissue.lower()
        hu_total = effects['HU']
        ir_total = effects['IR']
        combined = effects['HU_IR']

        # HU effects with direction
        hu_up = fmt_count(effects.get('HU_up', 'N/A'))
        hu_down = fmt_count(effects.get('HU_down', 'N/A'))
        examples.append({
            "instruction": f"How many genes are differentially expressed due to hindlimb unloading in {name}?",
            "input": "",
            "output": f"Hindlimb unloading causes {hu_total:,} DEGs in {name} (padj < 0.05), with {hu_up} upregulated and {hu_down} downregulated."
        })

        # IR effects with direction
        ir_up = fmt_count(effects.get('IR_up', 'N/A'))
        ir_down = fmt_count(effects.get('IR_down', 'N/A'))
        examples.append({
            "instruction": f"What is the transcriptomic response to ionizing radiation in {name}?",
            "input": "",
            "output": f"Ionizing radiation induces {ir_total:,} DEGs in {name}, with {ir_up} upregulated and {ir_down} downregulated."
        })

        # Combined effects
        examples.append({
            "instruction": f"Describe the combined stress response in {name}.",
            "input": "",
            "output": f"Combined HU+IR stress produces {combined:,} DEGs in {name}, representing the integrated response to simultaneous mechanical unloading and radiation exposure."
        })

        # Additivity check with a +/-20% tolerance band around the plain sum.
        additive = hu_total + ir_total
        if combined > additive * 1.2:
            add_type = "synergistic (more than additive)"
        elif combined < additive * 0.8:
            add_type = "sub-additive (less than sum)"
        else:
            add_type = "approximately additive"

        examples.append({
            "instruction": f"Is the combined HU+IR effect additive in {name}?",
            "input": "",
            "output": f"In {name}, HU causes {hu_total:,} DEGs and IR causes {ir_total:,} DEGs. The combined effect ({combined:,} DEGs) is {add_type}."
        })

    return examples


def generate_factual_kmp() -> List[Dict]:
    """Build factual Q&A examples about KMP effects per tissue.

    For every tissue in ``KMP_EFFECTS`` two examples are always emitted: a
    baseline-versus-combined-stress comparison and a listing of the KMP DEG
    count in each condition. A third example about the direction of the
    KMP-induced changes is added only when the tissue provides an
    ``in_HU_IR_up`` count and its combined-stress total is non-zero.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    examples = []

    for tissue, effects in KMP_EFFECTS.items():
        name = tissue.lower()
        baseline = effects['baseline']
        stressed = effects['in_HU_IR']

        # A zero baseline is treated as 1 so the fold change stays finite.
        fold = stressed / max(baseline, 1)
        if fold > 5:
            change = "dramatically increases"
        elif fold < 0.2:
            change = "dramatically decreases"
        else:
            change = "moderately changes"
        dynamics = 'stress-activated' if fold > 1 else 'stress-blocked'

        examples.append({
            "instruction": f"How does stress affect KMP's transcriptional activity in {name}?",
            "input": "",
            "output": f"KMP effect {change} from {baseline:,} DEGs at baseline to {stressed:,} DEGs under combined stress in {name}. This indicates {dynamics} pharmacodynamics."
        })

        # Every count uses a thousands separator so the same value is written
        # the same way in both examples for a tissue.
        examples.append({
            "instruction": f"Compare KMP effects across different stress conditions in {name}.",
            "input": "",
            "output": f"In {name}, KMP produces: {baseline:,} DEGs at baseline, {effects['in_HU']:,} DEGs under HU only, {effects['in_IR']:,} DEGs under IR only, and {stressed:,} DEGs under combined HU+IR stress."
        })

        # Direction of KMP effect. Skipped when the total is zero, because
        # the percentage would be undefined.
        if 'in_HU_IR_up' in effects and stressed:
            up = effects['in_HU_IR_up']
            down = effects['in_HU_IR_down']
            pct_up = up / stressed * 100
            if pct_up > 60:
                signature, implication = 'anabolic/protective', 'tissue protection'
            elif pct_up > 40:
                signature, implication = 'mixed', 'complex regulation'
            else:
                signature, implication = 'suppressive', 'complex regulation'

            examples.append({
                "instruction": f"What is the direction of KMP-induced gene expression changes in {name} under stress?",
                "input": "",
                "output": f"Under combined stress, KMP induces {up:,} upregulated and {down:,} downregulated genes in {name} ({pct_up:.1f}% upregulated). This {signature} signature suggests {implication}."
            })

    return examples


def generate_factual_interactions() -> List[Dict]:
    """Build factual Q&A examples about the two-way interaction terms.

    Each tissue in ``INTERACTIONS`` contributes three examples, in the order
    KMP x HU, KMP x IR, HU x IR. The wording of each answer is chosen from
    fixed DEG-count thresholds.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    collected = []

    for tissue_name, terms in INTERACTIONS.items():
        where = tissue_name.lower()
        kmp_hu = terms['KMP_x_HU']
        kmp_ir = terms['KMP_x_IR']
        hu_ir = terms['HU_x_IR']

        # Magnitude wording for the drug x unloading term.
        if kmp_hu > 5000:
            kmp_hu_size = 'massive'
        elif kmp_hu > 500:
            kmp_hu_size = 'substantial'
        else:
            kmp_hu_size = 'moderate'

        # Wording for the drug x radiation term.
        if kmp_ir > 1000:
            kmp_ir_note = 'representing the largest radiation-drug interaction'
        else:
            kmp_ir_note = 'indicating modest interaction with radiation stress'

        # Wording for the stressor x stressor term.
        if hu_ir > 1000:
            hu_ir_kind = 'strong synergistic'
        else:
            hu_ir_kind = 'moderate non-additive'

        collected.extend([
            {
                "instruction": f"What is the statistical interaction between KMP and HU in {where}?",
                "input": "",
                "output": f"The KMP × HU interaction produces {kmp_hu:,} DEGs in {where}, indicating {kmp_hu_size} non-additive effects."
            },
            {
                "instruction": f"Describe the KMP × IR interaction in {where}.",
                "input": "",
                "output": f"The KMP × IR interaction produces {kmp_ir:,} DEGs in {where}, {kmp_ir_note}."
            },
            {
                "instruction": f"Is there a HU × IR interaction in {where}?",
                "input": "",
                "output": f"Yes, the HU × IR interaction produces {hu_ir:,} DEGs in {where}, indicating the two stressors have {hu_ir_kind} effects."
            },
        ])

    return collected


def generate_factual_pathways() -> List[Dict]:
    """Build factual Q&A examples about pathway enrichment patterns.

    The OXPHOS examples come first (two per tissue in ``OXPHOS_PATTERNS``),
    followed by one example for every tissue/pathway pair found in
    ``PATHWAY_PATTERNS``.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    collected = []

    for tissue_name, oxphos in OXPHOS_PATTERNS.items():
        where = tissue_name.lower()
        stress_nes = oxphos['stress_NES']
        label = oxphos['pattern']

        direction = 'suppression' if stress_nes < 0 else 'activation'

        # Interpretation depends on which keyword the pattern label contains.
        if 'RESCUE' in label:
            meaning = 'restoration of mitochondrial function'
        elif 'SUPPRESSION' in label:
            meaning = 'metabolic braking'
        else:
            meaning = 'KMP-specific effects'

        collected.append({
            "instruction": f"What happens to oxidative phosphorylation in {where} under stress?",
            "input": "",
            "output": f"Under combined HU+IR stress, OXPHOS shows NES = {stress_nes:.2f} in {where}, indicating {direction} of mitochondrial respiration."
        })
        collected.append({
            "instruction": f"How does KMP affect OXPHOS in {where}?",
            "input": "",
            "output": f"KMP shifts OXPHOS NES from {stress_nes:.2f} to {oxphos['KMP_NES']:.2f} in {where} (Δ = {oxphos['delta']:.2f}). This {label} pattern indicates {meaning}."
        })

    # Remaining pathways: the question uses the lower-cased pathway name,
    # the answer keeps the original capitalisation.
    collected.extend(
        {
            "instruction": f"What is the {pathway_id.replace('_', ' ').lower()} response in {tissue_name.lower()}?",
            "input": "",
            "output": f"In {tissue_name.lower()}, stress produces {pathway_id.replace('_', ' ')} NES = {scores['stress']:.2f}, and KMP shifts this to NES = {scores['kmp']:.2f}. Pattern: {scores['pattern']}."
        }
        for tissue_name, by_pathway in PATHWAY_PATTERNS.items()
        for pathway_id, scores in by_pathway.items()
    )

    return collected


def generate_comparison_questions() -> List[Dict]:
    """Build cross-tissue and cross-condition comparison examples.

    Emits, in order: three tissue rankings (HU sensitivity, IR sensitivity,
    KMP effect under combined stress), two examples for every unordered pair
    of tissues, and one stressor-dominance example per tissue. Tissue pairs
    are drawn from the keys of ``STRESSOR_EFFECTS`` in alphabetical order;
    each of those tissues must also be present in ``KMP_EFFECTS`` and
    ``TISSUE_TYPES``.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    from itertools import combinations

    def ranking(table, key) -> str:
        # Numbered, descending list covering every tissue in the table, so
        # the text stays correct if a tissue is added or removed.
        ordered = sorted(table.items(), key=lambda item: item[1][key], reverse=True)
        return ", ".join(
            f"{position}) {tissue} ({values[key]:,})"
            for position, (tissue, values) in enumerate(ordered, start=1)
        )

    examples = []

    # Tissue rankings
    examples.append({
        "instruction": "Rank tissues by sensitivity to hindlimb unloading.",
        "input": "",
        "output": f"HU sensitivity ranking: {ranking(STRESSOR_EFFECTS, 'HU')}."
    })
    examples.append({
        "instruction": "Rank tissues by sensitivity to ionizing radiation.",
        "input": "",
        "output": f"IR sensitivity ranking: {ranking(STRESSOR_EFFECTS, 'IR')}."
    })
    examples.append({
        "instruction": "Rank tissues by KMP effect under combined stress.",
        "input": "",
        "output": f"KMP effect under stress: {ranking(KMP_EFFECTS, 'in_HU_IR')}."
    })

    # Pairwise comparisons
    for t1, t2 in combinations(sorted(STRESSOR_EFFECTS), 2):
        same_type = 'Same' if TISSUE_TYPES[t1] == TISSUE_TYPES[t2] else 'Different'
        examples.append({
            "instruction": f"Compare {t1.lower()} and {t2.lower()} responses to HU.",
            "input": "",
            "output": f"{t1}: {STRESSOR_EFFECTS[t1]['HU']:,} DEGs. {t2}: {STRESSOR_EFFECTS[t2]['HU']:,} DEGs. {same_type} KMP response type."
        })

        kmp1, kmp2 = KMP_EFFECTS[t1], KMP_EFFECTS[t2]
        # The arrow between the two counts is required: without it the
        # baseline and stressed numbers run together into one figure.
        examples.append({
            "instruction": f"Compare KMP context-dependency in {t1.lower()} vs {t2.lower()}.",
            "input": "",
            "output": f"{t1} ({TISSUE_TYPES[t1]}): baseline→stress = {kmp1['baseline']:,}→{kmp1['in_HU_IR']:,}. {t2} ({TISSUE_TYPES[t2]}): {kmp2['baseline']:,}→{kmp2['in_HU_IR']:,}."
        })

    # Stressor dominance: one stressor must exceed the other three-fold.
    for tissue, effects in STRESSOR_EFFECTS.items():
        if effects['HU'] > effects['IR'] * 3:
            dominance = "HU-dominant"
        elif effects['IR'] > effects['HU'] * 3:
            dominance = "IR-dominant"
        else:
            dominance = "balanced response"

        examples.append({
            "instruction": f"What stressor dominates the response in {tissue.lower()}?",
            "input": "",
            "output": f"{tissue} shows {dominance}: HU = {effects['HU']:,} DEGs, IR = {effects['IR']:,} DEGs (ratio = {effects['HU']/max(effects['IR'],1):.1f})."
        })

    return examples


def generate_prediction_tasks() -> List[Dict]:
    """Build interaction and cross-tissue prediction examples.

    Emits one "predict the combined effect" example per tissue in
    ``STRESSOR_EFFECTS``, one "predict KMP under stress" example per tissue
    in ``KMP_EFFECTS``, and three fixed cross-tissue predictions. The
    additivity verdict uses a +/-20% band around the additive prediction,
    the same band ``generate_factual_basic`` applies, so the two generators
    never give conflicting verdicts for one tissue.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    examples = []

    # Predict combined from main effects
    for tissue, effects in STRESSOR_EFFECTS.items():
        additive = effects['HU'] + effects['IR']
        combined = effects['HU_IR']
        if combined > additive * 1.2:
            verdict = "synergistic effect reflects biological interaction between stressors"
        elif combined < additive * 0.8:
            verdict = "sub-additive effect reflects biological interaction between stressors"
        else:
            verdict = "approximately additive effect falls within 20% of the additive prediction"

        examples.append({
            "instruction": f"Predict combined HU+IR effect in {tissue.lower()} from main effects.",
            "input": f"HU alone: {effects['HU']:,} DEGs. IR alone: {effects['IR']:,} DEGs.",
            "output": f"Additive prediction: ~{additive:,} DEGs. Actual: {combined:,} DEGs. The {verdict}."
        })

    # Predict KMP under stress from baseline
    for tissue, kmp in KMP_EFFECTS.items():
        tissue_type = TISSUE_TYPES[tissue]
        expected = 'increase' if 'stress-activated' in tissue_type else 'decrease'
        # A zero baseline is treated as 1 so the ratio stays finite.
        ratio = kmp['in_HU_IR'] / max(kmp['baseline'], 1)
        examples.append({
            "instruction": f"Predict KMP effect under stress in {tissue.lower()}.",
            "input": f"KMP at baseline: {kmp['baseline']:,} DEGs. Tissue type: {tissue_type}.",
            "output": f"Based on tissue type, predict {expected}. Actual: {kmp['in_HU_IR']:,} DEGs. Ratio: {ratio:.1f}x."
        })

    # Cross-tissue predictions. Counts are read from the tables rather than
    # typed into the text so they cannot drift from the ground truth.
    examples.append({
        "instruction": "Given heart (Type A) and soleus (Type A), predict similarity of KMP response.",
        "input": f"Both are Type A (stress-activated). Heart KMP under stress: {KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs.",
        "output": f"Prediction: Similar stress-activated pattern. Actual soleus: {KMP_EFFECTS['Soleus']['in_HU_IR']:,} DEGs. Both show stress-activated response, confirming Type A classification predicts pharmacodynamics."
    })

    heart_ox = OXPHOS_PATTERNS['Heart']
    liver_ox = OXPHOS_PATTERNS['Liver']
    examples.append({
        "instruction": "Given heart OXPHOS RESCUE, predict liver OXPHOS pattern.",
        "input": f"Heart: stress {heart_ox['stress_NES']:.1f} → KMP {heart_ox['KMP_NES']:.1f}. Liver is Type C (different).",
        "output": f"Prediction: Different pattern due to metabolic role. Actual: Liver shows SUPPRESSION (stress {liver_ox['stress_NES']:.1f} → KMP {liver_ox['KMP_NES']:.1f}). Opposite pattern confirms tissue-specific mechanisms."
    })

    heart_kmp_hu = INTERACTIONS['Heart']['KMP_x_HU']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']
    examples.append({
        "instruction": "Predict KMP × HU interaction magnitude in soleus from heart data.",
        "input": f"Heart KMP × HU: {heart_kmp_hu:,} DEGs. Soleus has larger HU response ({STRESSOR_EFFECTS['Soleus']['HU']:,} vs {STRESSOR_EFFECTS['Heart']['HU']:,}).",
        "output": f"Prediction: Larger than heart due to greater HU sensitivity. Actual: {soleus_kmp_hu:,} DEGs ({soleus_kmp_hu/heart_kmp_hu:.0f}x larger). Massive interaction reflects stress-dependent KMP activation."
    })

    return examples


def generate_design_critique() -> List[Dict]:
    """Build experimental-design critique examples.

    Returns seven fixed critique prompts. Their answers quote DEG counts from
    ``KMP_EFFECTS`` and ``INTERACTIONS``; every count is rendered with a
    thousands separator so a value reads the same wherever it appears.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    heart = KMP_EFFECTS['Heart']
    liver = KMP_EFFECTS['Liver']
    soleus = KMP_EFFECTS['Soleus']
    hippocampus = KMP_EFFECTS['Hippocampus']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']

    # A zero baseline is treated as 1 so the ratio stays finite.
    heart_ratio = heart['in_HU_IR'] / max(heart['baseline'], 1)

    examples = []

    examples.append({
        "instruction": "Critique: Testing KMP only at baseline to assess spaceflight protection.",
        "input": "",
        "output": f"FLAWED. Heart baseline KMP: {heart['baseline']:,} DEGs. Heart stressed KMP: {heart['in_HU_IR']:,} DEGs. Testing at baseline would miss {heart_ratio:.0f}x larger stressed effect. Must include stress conditions."
    })

    examples.append({
        "instruction": "Critique: Extrapolating heart results to predict liver response.",
        "input": "",
        "output": f"INCORRECT. Heart (Type A) and Liver (Type C) have opposite patterns. Heart OXPHOS: RESCUE. Liver OXPHOS: SUPPRESSION. KMP effect: Heart {heart['baseline']:,} → {heart['in_HU_IR']:,} DEGs, Liver {liver['baseline']:,} → {liver['in_HU_IR']:,} DEGs. Cross-tissue extrapolation invalid."
    })

    examples.append({
        "instruction": "Critique: Using |log2FC| > 1 threshold.",
        "input": "",
        "output": f"PROBLEMATIC. Stringent thresholds miss coordinated small effects. Using padj-only: Heart KMP = {heart['in_HU_IR']:,} DEGs. Many genes with <2-fold change contribute to pathway-level effects like OXPHOS rescue."
    })

    examples.append({
        "instruction": "Critique: Single-tissue study for KMP countermeasure development.",
        "input": "",
        "output": f"INSUFFICIENT. Tissue responses vary dramatically: Heart {heart['in_HU_IR']:,}, Soleus {soleus['in_HU_IR']:,}, Hippocampus {hippocampus['in_HU_IR']:,}, Liver {liver['in_HU_IR']:,} DEGs under stress. Type A/B/C classification requires multi-tissue data."
    })

    examples.append({
        "instruction": "Critique: Assuming additive drug-stressor effects.",
        "input": "",
        "output": f"INCORRECT. KMP × HU interactions are highly non-additive. Soleus: {soleus_kmp_hu:,} interaction DEGs. This super-additive effect indicates synergistic mechanisms that additive models cannot capture."
    })

    examples.append({
        "instruction": "Evaluate cell culture KMP data for in vivo predictions.",
        "input": "",
        "output": f"LIMITED VALUE. Cell culture cannot capture: context-dependency (KMP effects range 0-{heart['in_HU_IR']:,} DEGs), tissue specificity (Type A/B/C), or interaction effects (KMP × HU = {soleus_kmp_hu:,} DEGs). In vivo factorial studies essential."
    })

    examples.append({
        "instruction": "Critique: Human dosing based on mouse efficacy data.",
        "input": "",
        "output": "PREMATURE. Missing: dose-response (single dose tested), human PK, therapeutic window, bioavailability. Mouse-to-human scaling unreliable (10-100x errors). Need: dose-response studies, PK modeling, Phase I trials."
    })

    return examples


def generate_mechanistic_reasoning() -> List[Dict]:
    """Build mechanistic-reasoning examples.

    Returns six fixed prompts whose answers quote values from
    ``KMP_EFFECTS``, ``INTERACTIONS`` and ``OXPHOS_PATTERNS``. Wherever a
    baseline count and a stressed count are shown as a pair they are joined
    with an arrow, so the two numbers cannot be read as one figure.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    hippocampus = KMP_EFFECTS['Hippocampus']
    liver = KMP_EFFECTS['Liver']
    soleus = KMP_EFFECTS['Soleus']
    heart_ox = OXPHOS_PATTERNS['Heart']
    liver_ox = OXPHOS_PATTERNS['Liver']
    heart_int = INTERACTIONS['Heart']
    liver_int = INTERACTIONS['Liver']
    soleus_int = INTERACTIONS['Soleus']

    examples = []

    examples.append({
        "instruction": "Explain hippocampus baseline-active, stress-blocked pattern.",
        "input": "",
        "output": f"Hippocampus: {hippocampus['baseline']:,} DEGs baseline → {hippocampus['in_HU_IR']:,} DEGs stressed. Mechanisms: (1) Target saturation by stress signals, (2) Pathway competition with DNA damage response, (3) Altered transcriptional machinery from radiation. Implication: Pre-treatment may be more effective."
    })

    examples.append({
        "instruction": "Explain the massive KMP × HU interaction in soleus.",
        "input": "",
        "output": f"Soleus KMP × HU = {soleus_int['KMP_x_HU']:,} DEGs (largest interaction). Mechanisms: (1) Mechanosensitive pathways create KMP binding sites, (2) HU upregulates KMP targets normally not expressed, (3) Mitochondrial dysfunction reveals KMP protective capacity. Key insight: KMP specifically protects stressed tissue."
    })

    examples.append({
        "instruction": "Explain opposite OXPHOS patterns in heart vs liver.",
        "input": "",
        "output": f"Heart OXPHOS: stress suppresses ({heart_ox['stress_NES']:.1f}) → KMP rescues ({heart_ox['KMP_NES']:.1f}). Liver OXPHOS: stress activates ({liver_ox['stress_NES']:.1f}) → KMP suppresses ({liver_ox['KMP_NES']:.1f}). Heart needs ATP restoration for contraction. Liver needs metabolic braking to prevent oxidative damage. 'Rescue' is tissue-specific."
    })

    # The question and the answer quote the same two counts; both are read
    # from the table so they cannot disagree with each other or the data.
    examples.append({
        "instruction": f"Why does soleus show {soleus['baseline']:,} KMP DEGs at baseline but {soleus['in_HU_IR']:,} under stress?",
        "input": "",
        "output": f"Complete stress-dependency. Baseline: KMP targets not expressed or accessible. Under HU stress: Mechanosensitive pathways activate, target expression increases, mitochondrial stress reveals KMP binding sites. Result: {soleus['baseline']:,} → {soleus['in_HU_IR']:,} DEGs. KMP is conditionally active therapeutic."
    })

    examples.append({
        "instruction": "Explain Type A/B/C classification system.",
        "input": "",
        "output": f"Type A (Heart, Soleus): Stress-activated. Baseline minimal, stress maximal. KMP works when needed. Type B (Hippocampus): Baseline-active ({hippocampus['baseline']:,}→{hippocampus['in_HU_IR']:,}). Stress blocks. May need pre-treatment. Type C (Liver): Stress-blocked ({liver['baseline']:,}→{liver['in_HU_IR']:,}). Different mechanism needed."
    })

    examples.append({
        "instruction": "Why is KMP × HU larger than KMP × IR in most tissues?",
        "input": "",
        "output": f"KMP × HU: Heart {heart_int['KMP_x_HU']:,}, Liver {liver_int['KMP_x_HU']:,}, Soleus {soleus_int['KMP_x_HU']:,}. KMP × IR: Heart {heart_int['KMP_x_IR']:,}, Liver {liver_int['KMP_x_IR']:,}, Soleus {soleus_int['KMP_x_IR']:,}. KMP likely targets mechanosensitive/metabolic pathways more than DNA damage pathways."
    })

    return examples


def generate_uncertainty_calibration() -> List[Dict]:
    """Build uncertainty-calibration examples.

    Returns six fixed prompts whose answers state what the data can and
    cannot support. Three of them quote counts from ``KMP_EFFECTS`` and
    ``INTERACTIONS``; the rest are static text.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    heart_stressed = KMP_EFFECTS['Heart']['in_HU_IR']
    soleus_stressed = KMP_EFFECTS['Soleus']['in_HU_IR']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']

    def qa(question: str, answer: str) -> Dict:
        # Every example in this category has an empty input field.
        return {"instruction": question, "input": "", "output": answer}

    return [
        qa(
            "What is KMP's effect on cardiac contractility?",
            f"CANNOT DIRECTLY ANSWER. Data shows: {heart_stressed:,} DEGs, OXPHOS rescue. These are transcriptomic changes. NOT shown: functional contractility, protein levels, force generation. Inference: OXPHOS rescue may improve ATP→may improve contractility. Confidence: LOW for function.",
        ),
        qa(
            "Will KMP prevent human astronaut muscle atrophy?",
            f"CANNOT CONFIDENTLY ANSWER. Supports: Stress-activated protection ({soleus_stressed} DEGs), KMP × HU interaction ({soleus_kmp_hu:,}), OXPHOS rescue. Missing: Human PK, true microgravity (HU is analog), long-duration effects. Strong rationale but insufficient for prediction.",
        ),
        qa(
            "Which gene is most important for KMP protection?",
            "CANNOT DEFINITIVELY ANSWER. Transcriptomics shows correlation, not causation. Large change ≠ causal role. Redundant pathways exist. No knockout validation. Better questions: 'Largest effects?' (answerable), 'Most enriched pathways?' (answerable). Need: CRISPR screening for causality.",
        ),
        qa(
            "What KMP dose should be used for astronauts?",
            "CANNOT INFORM DOSING. Missing: Dose-response (one dose tested), human PK, therapeutic window, bioavailability. Mouse-human scaling unreliable. Need: Multi-dose mouse studies, PK modeling, human Phase I. Do not extrapolate.",
        ),
        qa(
            "Is the 8,484 DEG interaction statistically robust?",
            f"YES with caveats. Each of {soleus_kmp_hu:,} genes has padj < 0.05. Highly significant. But: exact number is threshold-dependent, effect sizes vary. Appropriate: 'approximately 8,500 DEGs indicating substantial non-additive effect.' Avoid false precision.",
        ),
        qa(
            "Can we predict KMP effects in kidney from this data?",
            "CANNOT PREDICT. Kidney not studied. Type A/B/C patterns are tissue-specific and cannot be extrapolated. Kidney has unique metabolic/filtration role. Would need: dedicated kidney experiments. Cross-tissue prediction requires mechanistic validation.",
        ),
    ]


def generate_application_questions() -> List[Dict]:
    """Build practical-application examples.

    Returns four fixed prompts (study design, tissue prioritisation,
    biomarkers, essential controls). Answers quote counts from
    ``KMP_EFFECTS`` and ``INTERACTIONS``.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    heart = KMP_EFFECTS['Heart']
    soleus = KMP_EFFECTS['Soleus']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']

    examples = []

    examples.append({
        "instruction": "Design a KMP efficacy study for astronaut protection.",
        "input": "",
        "output": f"Required: (1) Factorial design with stress conditions (not baseline-only), (2) Multiple tissues (Type A/B/C responses differ), (3) Functional endpoints beyond transcriptomics, (4) Dose-response. Rationale: Baseline-only would miss {heart['in_HU_IR']:,}/{heart['baseline']:,} DEG effect in heart."
    })

    examples.append({
        "instruction": "Prioritize tissues for KMP therapeutic development.",
        "input": "",
        "output": f"Priority 1: Soleus - largest interaction ({soleus_kmp_hu:,}), stress-activated, OXPHOS rescue. Priority 2: Heart - strong rescue ({heart['in_HU_IR']:,} DEGs). Lower priority: Hippocampus (stress-blocked), Liver (minimal effect). Focus on Type A tissues."
    })

    examples.append({
        "instruction": "Identify biomarkers for KMP efficacy monitoring.",
        "input": "",
        "output": "Candidate biomarkers: (1) OXPHOS pathway genes (track rescue/suppression), (2) Tissue-type classification (confirm Type A/B/C), (3) Interaction-responsive genes from KMP × HU. Need validation: functional correlation, accessibility (blood-based), temporal dynamics."
    })

    # The arrow separates the baseline count from the stressed count; without
    # it the two numbers are printed as one figure.
    examples.append({
        "instruction": "What controls are essential for KMP spaceflight studies?",
        "input": "",
        "output": f"Essential controls: (1) Vehicle under all stress conditions (not just baseline), (2) KMP at baseline (to detect context-dependency), (3) Single stressors (HU-only, IR-only) for interaction calculation, (4) Multiple tissues. Missing any control prevents detecting effects like {soleus['baseline']:,}→{soleus['in_HU_IR']:,} shift."
    })

    return examples


def compile_sft_dataset(output_file: str = 'kmp_sft_dataset.json'):
    """Compile all examples into the final SFT dataset and write it as JSON.

    Runs every generator in a fixed order, renders each example as one
    ``### Instruction / ### Input / ### Response`` text block (the Input
    section is left out when the example's input is empty), shuffles the
    result deterministically and writes it to ``output_file``.

    The shuffle uses a private ``random.Random(42)`` instance. It yields the
    same order as seeding the module-level generator with 42, but does not
    reset the random state of the rest of the process.

    Args:
        output_file: Path of the JSON file to create or overwrite.

    Returns:
        The shuffled list of ``{"text": ...}`` dicts that was written.
    """
    # (progress label, generator) pairs; order determines pre-shuffle order.
    generators = [
        ("factual basic", generate_factual_basic),
        ("factual KMP", generate_factual_kmp),
        ("factual interaction", generate_factual_interactions),
        ("factual pathway", generate_factual_pathways),
        ("comparison", generate_comparison_questions),
        ("prediction", generate_prediction_tasks),
        ("design critique", generate_design_critique),
        ("mechanistic reasoning", generate_mechanistic_reasoning),
        ("uncertainty calibration", generate_uncertainty_calibration),
        ("application", generate_application_questions),
    ]

    all_examples = []
    for label, build in generators:
        print(f"Generating {label} examples...")
        all_examples.extend(build())

    # Format for training
    formatted = []
    for ex in all_examples:
        sections = [f"### Instruction:\n{ex['instruction']}"]
        if ex.get('input'):
            sections.append(f"### Input:\n{ex['input']}")
        sections.append(f"### Response:\n{ex['output']}")
        formatted.append({"text": "\n\n".join(sections)})

    # Shuffle for training with a fixed seed so the output is reproducible.
    random.Random(42).shuffle(formatted)

    # json.dump escapes non-ASCII characters by default, so the bytes written
    # are the same as before; the explicit encoding only removes the
    # dependency on the platform's default.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(formatted, f, indent=2)

    print(f"\n{'='*60}")
    print("SFT Dataset Summary")
    print(f"{'='*60}")
    print(f"Total examples: {len(formatted)}")
    print(f"Output file: {output_file}")

    return formatted


if __name__ == "__main__":
    # Command-line entry point: build the dataset and write it to --output.
    parser = argparse.ArgumentParser(
        description="Generate the KMP SFT instruction-tuning dataset as a JSON file."
    )
    parser.add_argument(
        '--output',
        default='kmp_sft_dataset.json',
        help="Path of the JSON file to write (default: %(default)s)."
    )
    args = parser.parse_args()
    compile_sft_dataset(args.output)