File size: 29,813 Bytes

c7ebaa1

#!/usr/bin/env python3
"""
BioRLHF SFT Dataset Generator - EXPANDED VERSION
Creates 200+ instruction-tuning examples from KMP 2x2x2 factorial mouse data
"""

import json
import argparse
from typing import List, Dict
import random

# =============================================================================
# GROUND TRUTH DATA
# =============================================================================

STRESSOR_EFFECTS = {
    'Heart': {'HU': 165, 'IR': 33, 'HU_IR': 910, 'HU_up': 67, 'HU_down': 98, 'IR_up': 17, 'IR_down': 16},
    'Hippocampus': {'HU': 1555, 'IR': 5477, 'HU_IR': 5510, 'HU_up': 711, 'HU_down': 844, 'IR_up': 2554, 'IR_down': 2923},
    'Liver': {'HU': 4110, 'IR': 1273, 'HU_IR': 6213, 'HU_up': 2189, 'HU_down': 1921, 'IR_up': 413, 'IR_down': 860},
    'Soleus': {'HU': 6425, 'IR': 67, 'HU_IR': 6830, 'HU_up': 3251, 'HU_down': 3174, 'IR_up': 28, 'IR_down': 39},
}

KMP_EFFECTS = {
    'Heart': {'baseline': 112, 'in_HU': 2, 'in_IR': 2, 'in_HU_IR': 2110, 'in_HU_IR_up': 1336, 'in_HU_IR_down': 774},
    'Hippocampus': {'baseline': 4110, 'in_HU': 1, 'in_IR': 243, 'in_HU_IR': 140, 'baseline_up': 1813, 'baseline_down': 2297},
    'Liver': {'baseline': 309, 'in_HU': 17, 'in_IR': 389, 'in_HU_IR': 3},
    'Soleus': {'baseline': 0, 'in_HU': 1, 'in_IR': 52, 'in_HU_IR': 491, 'in_HU_IR_up': 425, 'in_HU_IR_down': 66},
}

INTERACTIONS = {
    'Heart': {'HU_x_IR': 244, 'KMP_x_HU': 479, 'KMP_x_IR': 29},
    'Hippocampus': {'HU_x_IR': 93, 'KMP_x_HU': 36, 'KMP_x_IR': 1221},
    'Liver': {'HU_x_IR': 3210, 'KMP_x_HU': 3369, 'KMP_x_IR': 247},
    'Soleus': {'HU_x_IR': 211, 'KMP_x_HU': 8484, 'KMP_x_IR': 484},
}

TISSUE_TYPES = {
    'Heart': 'Type A (stress-activated)',
    'Soleus': 'Type A (stress-activated)', 
    'Hippocampus': 'Type B (baseline-active)',
    'Liver': 'Type C (stress-blocked)',
}

OXPHOS_PATTERNS = {
    'Heart': {'stress_NES': -2.302, 'KMP_NES': 3.691, 'pattern': 'RESCUE', 'delta': 5.993},
    'Hippocampus': {'stress_NES': 0.931, 'KMP_NES': 1.585, 'pattern': 'KMP Only', 'delta': 0.654},
    'Liver': {'stress_NES': 3.596, 'KMP_NES': -1.6, 'pattern': 'SUPPRESSION', 'delta': -5.196},
    'Soleus': {'stress_NES': -2.997, 'KMP_NES': 2.46, 'pattern': 'RESCUE', 'delta': 5.457},
}

PATHWAY_PATTERNS = {
    'Heart': {
        'FATTY_ACID_METABOLISM': {'stress': -2.371, 'kmp': 3.1, 'pattern': 'RESCUE'},
        'ADIPOGENESIS': {'stress': -1.839, 'kmp': 2.81, 'pattern': 'RESCUE'},
        'MTORC1_SIGNALING': {'stress': -1.662, 'kmp': 2.585, 'pattern': 'RESCUE'},
        'INTERFERON_ALPHA': {'stress': -2.072, 'kmp': 1.581, 'pattern': 'RESCUE'},
    },
    'Liver': {
        'MTORC1_SIGNALING': {'stress': 3.075, 'kmp': -1.678, 'pattern': 'SUPPRESSION'},
        'INTERFERON_GAMMA': {'stress': 1.542, 'kmp': -2.336, 'pattern': 'SUPPRESSION'},
    },
    'Soleus': {
        'FATTY_ACID_METABOLISM': {'stress': -2.418, 'kmp': 1.506, 'pattern': 'RESCUE'},
    }
}


def generate_factual_basic() -> List[Dict]:
    """Basic factual questions about DEG counts."""
    examples = []
    
    for tissue, effects in STRESSOR_EFFECTS.items():
        # HU effects with direction
        examples.append({
            "instruction": f"How many genes are differentially expressed due to hindlimb unloading in {tissue.lower()}?",
            "input": "",
            "output": f"Hindlimb unloading causes {effects['HU']:,} DEGs in {tissue.lower()} (padj < 0.05), with {effects.get('HU_up', 'N/A')} upregulated and {effects.get('HU_down', 'N/A')} downregulated."
        })
        
        # IR effects
        examples.append({
            "instruction": f"What is the transcriptomic response to ionizing radiation in {tissue.lower()}?",
            "input": "",
            "output": f"Ionizing radiation induces {effects['IR']:,} DEGs in {tissue.lower()}, with {effects.get('IR_up', 'N/A')} upregulated and {effects.get('IR_down', 'N/A')} downregulated."
        })
        
        # Combined effects
        examples.append({
            "instruction": f"Describe the combined stress response in {tissue.lower()}.",
            "input": "",
            "output": f"Combined HU+IR stress produces {effects['HU_IR']:,} DEGs in {tissue.lower()}, representing the integrated response to simultaneous mechanical unloading and radiation exposure."
        })
        
        # Additivity check
        additive = effects['HU'] + effects['IR']
        if effects['HU_IR'] > additive * 1.2:
            add_type = "synergistic (more than additive)"
        elif effects['HU_IR'] < additive * 0.8:
            add_type = "sub-additive (less than sum)"
        else:
            add_type = "approximately additive"
        
        examples.append({
            "instruction": f"Is the combined HU+IR effect additive in {tissue.lower()}?",
            "input": "",
            "output": f"In {tissue.lower()}, HU causes {effects['HU']:,} DEGs and IR causes {effects['IR']:,} DEGs. The combined effect ({effects['HU_IR']:,} DEGs) is {add_type}."
        })
    
    return examples


def generate_factual_kmp() -> List[Dict]:
    """Factual questions about KMP effects."""
    examples = []
    
    for tissue, effects in KMP_EFFECTS.items():
        # Baseline vs stress comparison
        fold = effects['in_HU_IR'] / max(effects['baseline'], 1)
        if fold > 5:
            change = "dramatically increases"
        elif fold < 0.2:
            change = "dramatically decreases"
        else:
            change = "moderately changes"
        
        examples.append({
            "instruction": f"How does stress affect KMP's transcriptional activity in {tissue.lower()}?",
            "input": "",
            "output": f"KMP effect {change} from {effects['baseline']:,} DEGs at baseline to {effects['in_HU_IR']:,} DEGs under combined stress in {tissue.lower()}. This indicates {'stress-activated' if fold > 1 else 'stress-blocked'} pharmacodynamics."
        })
        
        # Each stress condition
        examples.append({
            "instruction": f"Compare KMP effects across different stress conditions in {tissue.lower()}.",
            "input": "",
            "output": f"In {tissue.lower()}, KMP produces: {effects['baseline']} DEGs at baseline, {effects['in_HU']} DEGs under HU only, {effects['in_IR']} DEGs under IR only, and {effects['in_HU_IR']:,} DEGs under combined HU+IR stress."
        })
        
        # Direction of KMP effect
        if 'in_HU_IR_up' in effects:
            pct_up = effects['in_HU_IR_up'] / effects['in_HU_IR'] * 100
            examples.append({
                "instruction": f"What is the direction of KMP-induced gene expression changes in {tissue.lower()} under stress?",
                "input": "",
                "output": f"Under combined stress, KMP induces {effects['in_HU_IR_up']:,} upregulated and {effects['in_HU_IR_down']:,} downregulated genes in {tissue.lower()} ({pct_up:.1f}% upregulated). This {'anabolic/protective' if pct_up > 60 else 'mixed' if pct_up > 40 else 'suppressive'} signature suggests {'tissue protection' if pct_up > 60 else 'complex regulation'}."
            })
    
    return examples


def generate_factual_interactions() -> List[Dict]:
    """Factual questions about interaction effects."""
    examples = []
    
    for tissue, ints in INTERACTIONS.items():
        # KMP x HU
        examples.append({
            "instruction": f"What is the statistical interaction between KMP and HU in {tissue.lower()}?",
            "input": "",
            "output": f"The KMP × HU interaction produces {ints['KMP_x_HU']:,} DEGs in {tissue.lower()}, indicating {'massive' if ints['KMP_x_HU'] > 5000 else 'substantial' if ints['KMP_x_HU'] > 500 else 'moderate'} non-additive effects."
        })
        
        # KMP x IR
        examples.append({
            "instruction": f"Describe the KMP × IR interaction in {tissue.lower()}.",
            "input": "",
            "output": f"The KMP × IR interaction produces {ints['KMP_x_IR']:,} DEGs in {tissue.lower()}, {'representing the largest radiation-drug interaction' if ints['KMP_x_IR'] > 1000 else 'indicating modest interaction with radiation stress'}."
        })
        
        # HU x IR
        examples.append({
            "instruction": f"Is there a HU × IR interaction in {tissue.lower()}?",
            "input": "",
            "output": f"Yes, the HU × IR interaction produces {ints['HU_x_IR']:,} DEGs in {tissue.lower()}, indicating the two stressors have {'strong synergistic' if ints['HU_x_IR'] > 1000 else 'moderate non-additive'} effects."
        })
    
    return examples


def generate_factual_pathways() -> List[Dict]:
    """Factual questions about pathway patterns."""
    examples = []
    
    for tissue, pattern in OXPHOS_PATTERNS.items():
        examples.append({
            "instruction": f"What happens to oxidative phosphorylation in {tissue.lower()} under stress?",
            "input": "",
            "output": f"Under combined HU+IR stress, OXPHOS shows NES = {pattern['stress_NES']:.2f} in {tissue.lower()}, indicating {'suppression' if pattern['stress_NES'] < 0 else 'activation'} of mitochondrial respiration."
        })
        
        examples.append({
            "instruction": f"How does KMP affect OXPHOS in {tissue.lower()}?",
            "input": "",
            "output": f"KMP shifts OXPHOS NES from {pattern['stress_NES']:.2f} to {pattern['KMP_NES']:.2f} in {tissue.lower()} (Δ = {pattern['delta']:.2f}). This {pattern['pattern']} pattern indicates {'restoration of mitochondrial function' if 'RESCUE' in pattern['pattern'] else 'metabolic braking' if 'SUPPRESSION' in pattern['pattern'] else 'KMP-specific effects'}."
        })
    
    for tissue, pathways in PATHWAY_PATTERNS.items():
        for pathway, data in pathways.items():
            examples.append({
                "instruction": f"What is the {pathway.replace('_', ' ').lower()} response in {tissue.lower()}?",
                "input": "",
                "output": f"In {tissue.lower()}, stress produces {pathway.replace('_', ' ')} NES = {data['stress']:.2f}, and KMP shifts this to NES = {data['kmp']:.2f}. Pattern: {data['pattern']}."
            })
    
    return examples


def generate_comparison_questions() -> List[Dict]:
    """Cross-tissue and cross-condition comparisons."""
    examples = []
    
    # Tissue rankings
    tissues_by_hu = sorted(STRESSOR_EFFECTS.items(), key=lambda x: x[1]['HU'], reverse=True)
    examples.append({
        "instruction": "Rank tissues by sensitivity to hindlimb unloading.",
        "input": "",
        "output": f"HU sensitivity ranking: 1) {tissues_by_hu[0][0]} ({tissues_by_hu[0][1]['HU']:,}), 2) {tissues_by_hu[1][0]} ({tissues_by_hu[1][1]['HU']:,}), 3) {tissues_by_hu[2][0]} ({tissues_by_hu[2][1]['HU']:,}), 4) {tissues_by_hu[3][0]} ({tissues_by_hu[3][1]['HU']})."
    })
    
    tissues_by_ir = sorted(STRESSOR_EFFECTS.items(), key=lambda x: x[1]['IR'], reverse=True)
    examples.append({
        "instruction": "Rank tissues by sensitivity to ionizing radiation.",
        "input": "",
        "output": f"IR sensitivity ranking: 1) {tissues_by_ir[0][0]} ({tissues_by_ir[0][1]['IR']:,}), 2) {tissues_by_ir[1][0]} ({tissues_by_ir[1][1]['IR']:,}), 3) {tissues_by_ir[2][0]} ({tissues_by_ir[2][1]['IR']}), 4) {tissues_by_ir[3][0]} ({tissues_by_ir[3][1]['IR']})."
    })
    
    tissues_by_kmp = sorted(KMP_EFFECTS.items(), key=lambda x: x[1]['in_HU_IR'], reverse=True)
    examples.append({
        "instruction": "Rank tissues by KMP effect under combined stress.",
        "input": "",
        "output": f"KMP effect under stress: 1) {tissues_by_kmp[0][0]} ({tissues_by_kmp[0][1]['in_HU_IR']:,}), 2) {tissues_by_kmp[1][0]} ({tissues_by_kmp[1][1]['in_HU_IR']}), 3) {tissues_by_kmp[2][0]} ({tissues_by_kmp[2][1]['in_HU_IR']}), 4) {tissues_by_kmp[3][0]} ({tissues_by_kmp[3][1]['in_HU_IR']})."
    })
    
    # Pairwise comparisons
    for t1 in ['Heart', 'Hippocampus', 'Liver', 'Soleus']:
        for t2 in ['Heart', 'Hippocampus', 'Liver', 'Soleus']:
            if t1 < t2:
                examples.append({
                    "instruction": f"Compare {t1.lower()} and {t2.lower()} responses to HU.",
                    "input": "",
                    "output": f"{t1}: {STRESSOR_EFFECTS[t1]['HU']:,} DEGs. {t2}: {STRESSOR_EFFECTS[t2]['HU']:,} DEGs. {'Same' if TISSUE_TYPES[t1] == TISSUE_TYPES[t2] else 'Different'} KMP response type."
                })
                
                examples.append({
                    "instruction": f"Compare KMP context-dependency in {t1.lower()} vs {t2.lower()}.",
                    "input": "",
                    "output": f"{t1} ({TISSUE_TYPES[t1]}): baseline→stress = {KMP_EFFECTS[t1]['baseline']}→{KMP_EFFECTS[t1]['in_HU_IR']:,}. {t2} ({TISSUE_TYPES[t2]}): {KMP_EFFECTS[t2]['baseline']}→{KMP_EFFECTS[t2]['in_HU_IR']}."
                })
    
    # Stressor dominance
    for tissue, effects in STRESSOR_EFFECTS.items():
        if effects['HU'] > effects['IR'] * 3:
            dominance = "HU-dominant"
        elif effects['IR'] > effects['HU'] * 3:
            dominance = "IR-dominant"
        else:
            dominance = "balanced response"
        
        examples.append({
            "instruction": f"What stressor dominates the response in {tissue.lower()}?",
            "input": "",
            "output": f"{tissue} shows {dominance}: HU = {effects['HU']:,} DEGs, IR = {effects['IR']:,} DEGs (ratio = {effects['HU']/max(effects['IR'],1):.1f})."
        })
    
    return examples


def generate_prediction_tasks() -> List[Dict]:
    """Interaction and cross-tissue prediction tasks."""
    examples = []
    
    # Predict combined from main effects
    for tissue in STRESSOR_EFFECTS.keys():
        effects = STRESSOR_EFFECTS[tissue]
        examples.append({
            "instruction": f"Predict combined HU+IR effect in {tissue.lower()} from main effects.",
            "input": f"HU alone: {effects['HU']:,} DEGs. IR alone: {effects['IR']:,} DEGs.",
            "output": f"Additive prediction: ~{effects['HU']+effects['IR']:,} DEGs. Actual: {effects['HU_IR']:,} DEGs. The {'synergistic' if effects['HU_IR'] > effects['HU']+effects['IR'] else 'sub-additive'} effect reflects biological interaction between stressors."
        })
    
    # Predict KMP under stress from baseline
    for tissue in KMP_EFFECTS.keys():
        kmp = KMP_EFFECTS[tissue]
        examples.append({
            "instruction": f"Predict KMP effect under stress in {tissue.lower()}.",
            "input": f"KMP at baseline: {kmp['baseline']} DEGs. Tissue type: {TISSUE_TYPES[tissue]}.",
            "output": f"Based on tissue type, predict {'increase' if 'stress-activated' in TISSUE_TYPES[tissue] else 'decrease'}. Actual: {kmp['in_HU_IR']:,} DEGs. Ratio: {kmp['in_HU_IR']/max(kmp['baseline'],1):.1f}x."
        })
    
    # Cross-tissue predictions
    examples.append({
        "instruction": "Given heart (Type A) and soleus (Type A), predict similarity of KMP response.",
        "input": "Both are Type A (stress-activated). Heart KMP under stress: 2,110 DEGs.",
        "output": f"Prediction: Similar stress-activated pattern. Actual soleus: {KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs. Both show stress-activated response, confirming Type A classification predicts pharmacodynamics."
    })
    
    examples.append({
        "instruction": "Given heart OXPHOS RESCUE, predict liver OXPHOS pattern.",
        "input": f"Heart: stress {OXPHOS_PATTERNS['Heart']['stress_NES']:.1f} → KMP {OXPHOS_PATTERNS['Heart']['KMP_NES']:.1f}. Liver is Type C (different).",
        "output": f"Prediction: Different pattern due to metabolic role. Actual: Liver shows SUPPRESSION (stress {OXPHOS_PATTERNS['Liver']['stress_NES']:.1f} → KMP {OXPHOS_PATTERNS['Liver']['KMP_NES']:.1f}). Opposite pattern confirms tissue-specific mechanisms."
    })
    
    examples.append({
        "instruction": "Predict KMP × HU interaction magnitude in soleus from heart data.",
        "input": f"Heart KMP × HU: {INTERACTIONS['Heart']['KMP_x_HU']} DEGs. Soleus has larger HU response ({STRESSOR_EFFECTS['Soleus']['HU']:,} vs {STRESSOR_EFFECTS['Heart']['HU']}).",
        "output": f"Prediction: Larger than heart due to greater HU sensitivity. Actual: {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs ({INTERACTIONS['Soleus']['KMP_x_HU']/INTERACTIONS['Heart']['KMP_x_HU']:.0f}x larger). Massive interaction reflects stress-dependent KMP activation."
    })
    
    return examples


def generate_design_critique() -> List[Dict]:
    """Experimental design critique tasks."""
    examples = []
    
    examples.append({
        "instruction": "Critique: Testing KMP only at baseline to assess spaceflight protection.",
        "input": "",
        "output": f"FLAWED. Heart baseline KMP: {KMP_EFFECTS['Heart']['baseline']} DEGs. Heart stressed KMP: {KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs. Testing at baseline would miss {KMP_EFFECTS['Heart']['in_HU_IR']/max(KMP_EFFECTS['Heart']['baseline'],1):.0f}x larger stressed effect. Must include stress conditions."
    })
    
    examples.append({
        "instruction": "Critique: Extrapolating heart results to predict liver response.",
        "input": "",
        "output": "INCORRECT. Heart (Type A) and Liver (Type C) have opposite patterns. Heart OXPHOS: RESCUE. Liver OXPHOS: SUPPRESSION. KMP effect: Heart {0} → {1} DEGs, Liver {2} → {3} DEGs. Cross-tissue extrapolation invalid.".format(
            KMP_EFFECTS['Heart']['baseline'], KMP_EFFECTS['Heart']['in_HU_IR'],
            KMP_EFFECTS['Liver']['baseline'], KMP_EFFECTS['Liver']['in_HU_IR']
        )
    })
    
    examples.append({
        "instruction": "Critique: Using |log2FC| > 1 threshold.",
        "input": "",
        "output": f"PROBLEMATIC. Stringent thresholds miss coordinated small effects. Using padj-only: Heart KMP = {KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs. Many genes with <2-fold change contribute to pathway-level effects like OXPHOS rescue."
    })
    
    examples.append({
        "instruction": "Critique: Single-tissue study for KMP countermeasure development.",
        "input": "",
        "output": f"INSUFFICIENT. Tissue responses vary dramatically: Heart {KMP_EFFECTS['Heart']['in_HU_IR']:,}, Soleus {KMP_EFFECTS['Soleus']['in_HU_IR']}, Hippocampus {KMP_EFFECTS['Hippocampus']['in_HU_IR']}, Liver {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs under stress. Type A/B/C classification requires multi-tissue data."
    })
    
    examples.append({
        "instruction": "Critique: Assuming additive drug-stressor effects.",
        "input": "",
        "output": f"INCORRECT. KMP × HU interactions are highly non-additive. Soleus: {INTERACTIONS['Soleus']['KMP_x_HU']:,} interaction DEGs. This super-additive effect indicates synergistic mechanisms that additive models cannot capture."
    })
    
    examples.append({
        "instruction": "Evaluate cell culture KMP data for in vivo predictions.",
        "input": "",
        "output": f"LIMITED VALUE. Cell culture cannot capture: context-dependency (KMP effects range 0-{KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs), tissue specificity (Type A/B/C), or interaction effects (KMP × HU = {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs). In vivo factorial studies essential."
    })
    
    examples.append({
        "instruction": "Critique: Human dosing based on mouse efficacy data.",
        "input": "",
        "output": "PREMATURE. Missing: dose-response (single dose tested), human PK, therapeutic window, bioavailability. Mouse-to-human scaling unreliable (10-100x errors). Need: dose-response studies, PK modeling, Phase I trials."
    })
    
    return examples


def generate_mechanistic_reasoning() -> List[Dict]:
    """Mechanistic reasoning tasks."""
    examples = []
    
    examples.append({
        "instruction": "Explain hippocampus baseline-active, stress-blocked pattern.",
        "input": "",
        "output": f"Hippocampus: {KMP_EFFECTS['Hippocampus']['baseline']:,} DEGs baseline → {KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs stressed. Mechanisms: (1) Target saturation by stress signals, (2) Pathway competition with DNA damage response, (3) Altered transcriptional machinery from radiation. Implication: Pre-treatment may be more effective."
    })
    
    examples.append({
        "instruction": "Explain the massive KMP × HU interaction in soleus.",
        "input": "",
        "output": f"Soleus KMP × HU = {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs (largest interaction). Mechanisms: (1) Mechanosensitive pathways create KMP binding sites, (2) HU upregulates KMP targets normally not expressed, (3) Mitochondrial dysfunction reveals KMP protective capacity. Key insight: KMP specifically protects stressed tissue."
    })
    
    examples.append({
        "instruction": "Explain opposite OXPHOS patterns in heart vs liver.",
        "input": "",
        "output": f"Heart OXPHOS: stress suppresses ({OXPHOS_PATTERNS['Heart']['stress_NES']:.1f}) → KMP rescues ({OXPHOS_PATTERNS['Heart']['KMP_NES']:.1f}). Liver OXPHOS: stress activates ({OXPHOS_PATTERNS['Liver']['stress_NES']:.1f}) → KMP suppresses ({OXPHOS_PATTERNS['Liver']['KMP_NES']:.1f}). Heart needs ATP restoration for contraction. Liver needs metabolic braking to prevent oxidative damage. 'Rescue' is tissue-specific."
    })
    
    examples.append({
        "instruction": "Why does soleus show 0 KMP DEGs at baseline but 491 under stress?",
        "input": "",
        "output": f"Complete stress-dependency. Baseline: KMP targets not expressed or accessible. Under HU stress: Mechanosensitive pathways activate, target expression increases, mitochondrial stress reveals KMP binding sites. Result: 0 → {KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs. KMP is conditionally active therapeutic."
    })
    
    examples.append({
        "instruction": "Explain Type A/B/C classification system.",
        "input": "",
        "output": f"Type A (Heart, Soleus): Stress-activated. Baseline minimal, stress maximal. KMP works when needed. Type B (Hippocampus): Baseline-active ({KMP_EFFECTS['Hippocampus']['baseline']:,}→{KMP_EFFECTS['Hippocampus']['in_HU_IR']}). Stress blocks. May need pre-treatment. Type C (Liver): Stress-blocked ({KMP_EFFECTS['Liver']['baseline']}→{KMP_EFFECTS['Liver']['in_HU_IR']}). Different mechanism needed."
    })
    
    examples.append({
        "instruction": "Why is KMP × HU larger than KMP × IR in most tissues?",
        "input": "",
        "output": f"KMP × HU: Heart {INTERACTIONS['Heart']['KMP_x_HU']}, Liver {INTERACTIONS['Liver']['KMP_x_HU']:,}, Soleus {INTERACTIONS['Soleus']['KMP_x_HU']:,}. KMP × IR: Heart {INTERACTIONS['Heart']['KMP_x_IR']}, Liver {INTERACTIONS['Liver']['KMP_x_IR']}, Soleus {INTERACTIONS['Soleus']['KMP_x_IR']}. KMP likely targets mechanosensitive/metabolic pathways more than DNA damage pathways."
    })
    
    return examples


def generate_uncertainty_calibration() -> List[Dict]:
    """Uncertainty calibration tasks."""
    examples = []
    
    examples.append({
        "instruction": "What is KMP's effect on cardiac contractility?",
        "input": "",
        "output": f"CANNOT DIRECTLY ANSWER. Data shows: {KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs, OXPHOS rescue. These are transcriptomic changes. NOT shown: functional contractility, protein levels, force generation. Inference: OXPHOS rescue may improve ATP→may improve contractility. Confidence: LOW for function."
    })
    
    examples.append({
        "instruction": "Will KMP prevent human astronaut muscle atrophy?",
        "input": "",
        "output": f"CANNOT CONFIDENTLY ANSWER. Supports: Stress-activated protection ({KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs), KMP × HU interaction ({INTERACTIONS['Soleus']['KMP_x_HU']:,}), OXPHOS rescue. Missing: Human PK, true microgravity (HU is analog), long-duration effects. Strong rationale but insufficient for prediction."
    })
    
    examples.append({
        "instruction": "Which gene is most important for KMP protection?",
        "input": "",
        "output": "CANNOT DEFINITIVELY ANSWER. Transcriptomics shows correlation, not causation. Large change ≠ causal role. Redundant pathways exist. No knockout validation. Better questions: 'Largest effects?' (answerable), 'Most enriched pathways?' (answerable). Need: CRISPR screening for causality."
    })
    
    examples.append({
        "instruction": "What KMP dose should be used for astronauts?",
        "input": "",
        "output": "CANNOT INFORM DOSING. Missing: Dose-response (one dose tested), human PK, therapeutic window, bioavailability. Mouse-human scaling unreliable. Need: Multi-dose mouse studies, PK modeling, human Phase I. Do not extrapolate."
    })
    
    examples.append({
        "instruction": "Is the 8,484 DEG interaction statistically robust?",
        "input": "",
        "output": f"YES with caveats. Each of {INTERACTIONS['Soleus']['KMP_x_HU']:,} genes has padj < 0.05. Highly significant. But: exact number is threshold-dependent, effect sizes vary. Appropriate: 'approximately 8,500 DEGs indicating substantial non-additive effect.' Avoid false precision."
    })
    
    examples.append({
        "instruction": "Can we predict KMP effects in kidney from this data?",
        "input": "",
        "output": "CANNOT PREDICT. Kidney not studied. Type A/B/C patterns are tissue-specific and cannot be extrapolated. Kidney has unique metabolic/filtration role. Would need: dedicated kidney experiments. Cross-tissue prediction requires mechanistic validation."
    })
    
    return examples


def generate_application_questions() -> List[Dict]:
    """Practical application questions."""
    examples = []
    
    examples.append({
        "instruction": "Design a KMP efficacy study for astronaut protection.",
        "input": "",
        "output": f"Required: (1) Factorial design with stress conditions (not baseline-only), (2) Multiple tissues (Type A/B/C responses differ), (3) Functional endpoints beyond transcriptomics, (4) Dose-response. Rationale: Baseline-only would miss {KMP_EFFECTS['Heart']['in_HU_IR']:,}/{KMP_EFFECTS['Heart']['baseline']} DEG effect in heart."
    })
    
    examples.append({
        "instruction": "Prioritize tissues for KMP therapeutic development.",
        "input": "",
        "output": f"Priority 1: Soleus - largest interaction ({INTERACTIONS['Soleus']['KMP_x_HU']:,}), stress-activated, OXPHOS rescue. Priority 2: Heart - strong rescue ({KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs). Lower priority: Hippocampus (stress-blocked), Liver (minimal effect). Focus on Type A tissues."
    })
    
    examples.append({
        "instruction": "Identify biomarkers for KMP efficacy monitoring.",
        "input": "",
        "output": "Candidate biomarkers: (1) OXPHOS pathway genes (track rescue/suppression), (2) Tissue-type classification (confirm Type A/B/C), (3) Interaction-responsive genes from KMP × HU. Need validation: functional correlation, accessibility (blood-based), temporal dynamics."
    })
    
    examples.append({
        "instruction": "What controls are essential for KMP spaceflight studies?",
        "input": "",
        "output": f"Essential controls: (1) Vehicle under all stress conditions (not just baseline), (2) KMP at baseline (to detect context-dependency), (3) Single stressors (HU-only, IR-only) for interaction calculation, (4) Multiple tissues. Missing any control prevents detecting effects like {KMP_EFFECTS['Soleus']['baseline']}→{KMP_EFFECTS['Soleus']['in_HU_IR']} shift."
    })
    
    return examples


def compile_sft_dataset(output_file: str = 'kmp_sft_dataset.json'):
    """Compile all examples into final SFT dataset."""
    
    all_examples = []
    
    print("Generating factual basic examples...")
    all_examples.extend(generate_factual_basic())
    
    print("Generating factual KMP examples...")
    all_examples.extend(generate_factual_kmp())
    
    print("Generating factual interaction examples...")
    all_examples.extend(generate_factual_interactions())
    
    print("Generating factual pathway examples...")
    all_examples.extend(generate_factual_pathways())
    
    print("Generating comparison examples...")
    all_examples.extend(generate_comparison_questions())
    
    print("Generating prediction examples...")
    all_examples.extend(generate_prediction_tasks())
    
    print("Generating design critique examples...")
    all_examples.extend(generate_design_critique())
    
    print("Generating mechanistic reasoning examples...")
    all_examples.extend(generate_mechanistic_reasoning())
    
    print("Generating uncertainty calibration examples...")
    all_examples.extend(generate_uncertainty_calibration())
    
    print("Generating application examples...")
    all_examples.extend(generate_application_questions())
    
    # Format for training
    formatted = []
    for ex in all_examples:
        if ex.get('input'):
            text = f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}"
        else:
            text = f"### Instruction:\n{ex['instruction']}\n\n### Response:\n{ex['output']}"
        formatted.append({"text": text})
    
    # Shuffle for training
    random.seed(42)
    random.shuffle(formatted)
    
    with open(output_file, 'w') as f:
        json.dump(formatted, f, indent=2)
    
    print(f"\n{'='*60}")
    print(f"SFT Dataset Summary")
    print(f"{'='*60}")
    print(f"Total examples: {len(formatted)}")
    print(f"Output file: {output_file}")
    
    return formatted


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', default='kmp_sft_dataset.json')
    args = parser.parse_args()
    compile_sft_dataset(args.output)