File size: 38,181 Bytes

c7ebaa1

#!/usr/bin/env python3
"""
BioRLHF Expanded SFT Dataset Generator
Creates 200+ instruction-tuning examples from KMP data
"""

import json
import random

# =============================================================================
# GROUND TRUTH DATA
# =============================================================================

# Total differentially expressed gene (DEG) counts, padj < 0.05, keyed by
# tissue and then by stressor condition:
#   HU    - hindlimb unloading
#   IR    - ionizing radiation
#   HU_IR - combined HU+IR stress
# Each total equals up + down for the same cell in STRESSOR_DIRECTION.
# Key order matters: the generator iterates these dicts with .items().
STRESSOR_EFFECTS = {
    "Heart": {"HU": 165, "IR": 33, "HU_IR": 910},
    "Hippocampus": {"HU": 1555, "IR": 5477, "HU_IR": 5510},
    "Liver": {"HU": 4110, "IR": 1273, "HU_IR": 6213},
    "Soleus": {"HU": 6425, "IR": 67, "HU_IR": 6830},
}

# Split of each stressor's DEG count into upregulated ("up") and
# downregulated ("down") genes, keyed by tissue and then stressor condition
# (HU, IR, HU_IR).
STRESSOR_DIRECTION = {
    "Heart": {
        "HU": {"up": 67, "down": 98},
        "IR": {"up": 17, "down": 16},
        "HU_IR": {"up": 334, "down": 576},
    },
    "Hippocampus": {
        "HU": {"up": 711, "down": 844},
        "IR": {"up": 2554, "down": 2923},
        "HU_IR": {"up": 2523, "down": 2987},
    },
    "Liver": {
        "HU": {"up": 2189, "down": 1921},
        "IR": {"up": 413, "down": 860},
        "HU_IR": {"up": 2429, "down": 3784},
    },
    "Soleus": {
        "HU": {"up": 3251, "down": 3174},
        "IR": {"up": 28, "down": 39},
        "HU_IR": {"up": 3447, "down": 3383},
    },
}

# Number of DEGs that KMP produces in each tissue, by context:
#   baseline - no stressor
#   in_HU    - under HU stress
#   in_IR    - under IR stress
#   in_HU_IR - under combined HU+IR stress
# Key order matters: the generator iterates these dicts with .items().
KMP_EFFECTS = {
    "Heart": {"baseline": 112, "in_HU": 2, "in_IR": 2, "in_HU_IR": 2110},
    "Hippocampus": {"baseline": 4110, "in_HU": 1, "in_IR": 243, "in_HU_IR": 140},
    "Liver": {"baseline": 309, "in_HU": 17, "in_IR": 389, "in_HU_IR": 3},
    "Soleus": {"baseline": 0, "in_HU": 1, "in_IR": 52, "in_HU_IR": 491},
}

# DEG counts attributed to each pairwise interaction term, per tissue.
# Keys use the "<A>_x_<B>" form; the generator rewrites "_x_" to " × " when
# it builds the question text, so the key spelling is load-bearing.
INTERACTIONS = {
    "Heart": {"HU_x_IR": 244, "KMP_x_HU": 479, "KMP_x_IR": 29},
    "Hippocampus": {"HU_x_IR": 93, "KMP_x_HU": 36, "KMP_x_IR": 1221},
    "Liver": {"HU_x_IR": 3210, "KMP_x_HU": 3369, "KMP_x_IR": 247},
    "Soleus": {"HU_x_IR": 211, "KMP_x_HU": 8484, "KMP_x_IR": 484},
}

# KMP response classification for each tissue.
# The generator branches on the substrings "stress-activated",
# "baseline-active" and "stress-blocked", so the label text must not change.
TISSUE_TYPES = {
    "Heart": "Type A (stress-activated)",
    "Soleus": "Type A (stress-activated)",
    "Hippocampus": "Type B (baseline-active)",
    "Liver": "Type C (stress-blocked)",
}

# OXPHOS pathway summary per tissue:
#   stress_NES - NES under combined stress
#   KMP_NES    - NES with KMP
#   pattern    - label the generator compares against "RESCUE" and
#                "SUPPRESSION"; any other value (e.g. "NS") falls through to
#                its generic "Mixed effects." wording
# NOTE(review): NES is presumably the GSEA normalized enrichment score - confirm.
OXPHOS_PATTERNS = {
    "Heart": {"stress_NES": -2.302, "KMP_NES": 3.691, "pattern": "RESCUE"},
    "Hippocampus": {"stress_NES": 0.931, "KMP_NES": 1.585, "pattern": "NS"},
    "Liver": {"stress_NES": 3.596, "KMP_NES": -1.6, "pattern": "SUPPRESSION"},
    "Soleus": {"stress_NES": -2.997, "KMP_NES": 2.46, "pattern": "RESCUE"},
}

# Per-pathway NES values for the tissues that have pathway-level entries.
# Each pathway maps to its NES under stress ("stress"), its NES with KMP
# ("kmp") and a pattern label. Hippocampus has no entry here, and Liver has
# no FATTY_ACID_METABOLISM row, so lookups must tolerate missing keys.
PATHWAY_DATA = {
    "Heart": {
        "OXIDATIVE_PHOSPHORYLATION": {
            "stress": -2.302,
            "kmp": 3.691,
            "pattern": "RESCUE",
        },
        "FATTY_ACID_METABOLISM": {
            "stress": -2.371,
            "kmp": 3.1,
            "pattern": "RESCUE",
        },
        "ADIPOGENESIS": {
            "stress": -1.839,
            "kmp": 2.81,
            "pattern": "RESCUE",
        },
        "MTORC1_SIGNALING": {
            "stress": -1.662,
            "kmp": 2.585,
            "pattern": "RESCUE",
        },
        "INTERFERON_ALPHA_RESPONSE": {
            "stress": -2.072,
            "kmp": 1.581,
            "pattern": "RESCUE",
        },
    },
    "Liver": {
        "OXIDATIVE_PHOSPHORYLATION": {
            "stress": 3.596,
            "kmp": -1.6,
            "pattern": "SUPPRESSION",
        },
        "MTORC1_SIGNALING": {
            "stress": 3.075,
            "kmp": -1.678,
            "pattern": "SUPPRESSION",
        },
        "INTERFERON_GAMMA_RESPONSE": {
            "stress": 1.542,
            "kmp": -2.336,
            "pattern": "SUPPRESSION",
        },
    },
    "Soleus": {
        "OXIDATIVE_PHOSPHORYLATION": {
            "stress": -2.997,
            "kmp": 2.46,
            "pattern": "RESCUE",
        },
        "FATTY_ACID_METABOLISM": {
            "stress": -2.418,
            "kmp": 1.506,
            "pattern": "RESCUE",
        },
    },
}

# Hub genes per tissue (only Heart and Soleus are listed). Each record holds:
#   gene     - gene symbol
#   lfc      - reported by the generator as the average |log2FC|
#   function - short free-text description inserted verbatim into answers
HUB_GENES = {
    "Heart": [
        {"gene": "Alb", "lfc": 4.26, "function": "albumin, carrier protein"},
        {"gene": "Eda2r", "lfc": 0.75, "function": "ectodysplasin receptor"},
        {"gene": "Cps1", "lfc": 3.21, "function": "carbamoyl phosphate synthetase"},
        {"gene": "Cdkn1a", "lfc": 1.12, "function": "p21, cell cycle inhibitor"},
        {"gene": "Arntl", "lfc": 1.32, "function": "BMAL1, circadian regulator"},
        {"gene": "Npas2", "lfc": 1.17, "function": "circadian clock gene"},
        {"gene": "Lcn2", "lfc": 1.35, "function": "lipocalin, acute phase"},
        {"gene": "Per2", "lfc": 1.00, "function": "period circadian protein"},
    ],
    "Soleus": [
        {"gene": "Myh4", "lfc": 2.5, "function": "myosin heavy chain, fast fiber"},
        {"gene": "Mybpc2", "lfc": 1.8, "function": "myosin binding protein"},
        {"gene": "Tnnt3", "lfc": 1.6, "function": "troponin T, fast skeletal"},
    ],
}

# Tissue names in the order the generator loops over them; each name is a
# key into the per-tissue tables above, and this order fixes the order of
# the emitted examples.
TISSUES = [
    "Heart",
    "Hippocampus",
    "Liver",
    "Soleus",
]

# =============================================================================
# EXAMPLE GENERATORS
# =============================================================================

def generate_examples():
    examples = []
    
    # =========================================================================
    # CATEGORY 1: FACTUAL RECALL (~60 examples)
    # =========================================================================
    
    # Basic DEG counts for each stressor × tissue
    for tissue in TISSUES:
        for stressor, count in STRESSOR_EFFECTS[tissue].items():
            stressor_name = {'HU': 'hindlimb unloading', 'IR': 'ionizing radiation', 'HU_IR': 'combined HU+IR stress'}[stressor]
            examples.append({
                "instruction": f"How many DEGs does {stressor_name} cause in {tissue.lower()}?",
                "input": "",
                "output": f"{stressor_name.capitalize()} causes {count:,} differentially expressed genes (padj < 0.05) in {tissue.lower()}."
            })
    
    # DEG direction
    for tissue in TISSUES:
        dirs = STRESSOR_DIRECTION[tissue]['HU_IR']
        examples.append({
            "instruction": f"What is the direction of gene expression changes in {tissue.lower()} under combined stress?",
            "input": "",
            "output": f"Under combined HU+IR stress, {tissue.lower()} shows {dirs['up']:,} upregulated and {dirs['down']:,} downregulated genes. {'Downregulation predominates' if dirs['down'] > dirs['up'] else 'Upregulation predominates' if dirs['up'] > dirs['down'] else 'Changes are balanced'} ({dirs['up']/(dirs['up']+dirs['down'])*100:.1f}% upregulated)."
        })
    
    # KMP effects
    for tissue in TISSUES:
        for context, count in KMP_EFFECTS[tissue].items():
            context_name = {'baseline': 'at baseline', 'in_HU': 'under HU stress', 'in_IR': 'under IR stress', 'in_HU_IR': 'under combined HU+IR stress'}[context]
            examples.append({
                "instruction": f"How many DEGs does KMP produce in {tissue.lower()} {context_name}?",
                "input": "",
                "output": f"KMP produces {count:,} DEGs in {tissue.lower()} {context_name}. {f'This indicates strong stress-activated response.' if context == 'in_HU_IR' and count > 100 else f'This indicates minimal/blocked response.' if count < 10 else ''}"
            })
    
    # Tissue type classification
    for tissue, ttype in TISSUE_TYPES.items():
        examples.append({
            "instruction": f"What is the KMP response classification for {tissue.lower()}?",
            "input": "",
            "output": f"{tissue} is classified as {ttype}. This means KMP shows {'minimal baseline effect but strong stress-activated protection' if 'stress-activated' in ttype else 'strong baseline effect that diminishes under stress' if 'baseline-active' in ttype else 'effects that are blocked by stress pathways'}."
        })
    
    # Interaction effects
    for tissue in TISSUES:
        for interaction, count in INTERACTIONS[tissue].items():
            int_name = interaction.replace('_x_', ' × ').replace('_', ' ')
            examples.append({
                "instruction": f"What is the {int_name} interaction effect in {tissue.lower()}?",
                "input": "",
                "output": f"The {int_name} interaction in {tissue.lower()} produces {count:,} DEGs. {'This is the largest interaction in the dataset.' if count > 5000 else 'This indicates substantial non-additive effects.' if count > 500 else 'This indicates moderate interaction.'}"
            })
    
    # OXPHOS patterns
    for tissue, pattern in OXPHOS_PATTERNS.items():
        examples.append({
            "instruction": f"Describe the OXPHOS pathway response in {tissue.lower()}.",
            "input": "",
            "output": f"In {tissue.lower()}, combined stress produces OXPHOS NES = {pattern['stress_NES']:.2f}, while KMP shifts this to NES = {pattern['KMP_NES']:.2f}. Pattern: {pattern['pattern']}. {'KMP restores suppressed mitochondrial function.' if pattern['pattern'] == 'RESCUE' else 'KMP prevents metabolic over-activation (hepatoprotective braking).' if pattern['pattern'] == 'SUPPRESSION' else 'Mixed effects.'}"
        })
    
    # Hub genes
    for tissue, genes in HUB_GENES.items():
        for gene_info in genes:
            examples.append({
                "instruction": f"What is the role of {gene_info['gene']} in {tissue.lower()} stress response?",
                "input": "",
                "output": f"{gene_info['gene']} ({gene_info['function']}) is a hub gene in {tissue.lower()} with average |log2FC| = {gene_info['lfc']:.2f}. It appears significantly changed across multiple comparisons, indicating a central role in the stress/KMP response network."
            })
    
    # =========================================================================
    # CATEGORY 2: COMPARISON QUESTIONS (~40 examples)
    # =========================================================================
    
    # Which tissue most sensitive to each stressor
    examples.append({
        "instruction": "Rank tissues by sensitivity to hindlimb unloading.",
        "input": "",
        "output": f"Tissues ranked by HU sensitivity (DEG count):\n1. Soleus: {STRESSOR_EFFECTS['Soleus']['HU']:,}\n2. Liver: {STRESSOR_EFFECTS['Liver']['HU']:,}\n3. Hippocampus: {STRESSOR_EFFECTS['Hippocampus']['HU']:,}\n4. Heart: {STRESSOR_EFFECTS['Heart']['HU']}\n\nSoleus dominates due to severe muscle atrophy from unloading."
    })
    
    examples.append({
        "instruction": "Rank tissues by sensitivity to ionizing radiation.",
        "input": "",
        "output": f"Tissues ranked by IR sensitivity (DEG count):\n1. Hippocampus: {STRESSOR_EFFECTS['Hippocampus']['IR']:,}\n2. Liver: {STRESSOR_EFFECTS['Liver']['IR']:,}\n3. Heart: {STRESSOR_EFFECTS['Heart']['IR']}\n4. Soleus: {STRESSOR_EFFECTS['Soleus']['IR']}\n\nHippocampus shows extreme IR sensitivity (166× more than soleus)."
    })
    
    # Cross-tissue KMP comparisons
    for t1, t2 in [('Heart', 'Liver'), ('Heart', 'Soleus'), ('Hippocampus', 'Liver'), ('Soleus', 'Liver')]:
        examples.append({
            "instruction": f"Compare KMP effects between {t1.lower()} and {t2.lower()} under stress.",
            "input": "",
            "output": f"""{t1} ({TISSUE_TYPES[t1]}): {KMP_EFFECTS[t1]['in_HU_IR']:,} DEGs under stress
{t2} ({TISSUE_TYPES[t2]}): {KMP_EFFECTS[t2]['in_HU_IR']:,} DEGs under stress

{'Both show stress-activated patterns.' if 'stress-activated' in TISSUE_TYPES[t1] and 'stress-activated' in TISSUE_TYPES[t2] else 'These tissues show opposite pharmacodynamic patterns, indicating tissue-specific KMP mechanisms.'}"""
        })
    
    # Stressor dominance classification
    examples.append({
        "instruction": "Classify tissues as HU-dominant or IR-dominant.",
        "input": "",
        "output": f"""**HU-dominant** (mechanical stress sensitivity):
- Heart: {STRESSOR_EFFECTS['Heart']['HU']} HU vs {STRESSOR_EFFECTS['Heart']['IR']} IR DEGs (ratio: {STRESSOR_EFFECTS['Heart']['HU']/max(1,STRESSOR_EFFECTS['Heart']['IR']):.1f}×)
- Soleus: {STRESSOR_EFFECTS['Soleus']['HU']:,} HU vs {STRESSOR_EFFECTS['Soleus']['IR']} IR DEGs (ratio: {STRESSOR_EFFECTS['Soleus']['HU']/max(1,STRESSOR_EFFECTS['Soleus']['IR']):.1f}×)

**IR-dominant** (radiation sensitivity):
- Hippocampus: {STRESSOR_EFFECTS['Hippocampus']['HU']:,} HU vs {STRESSOR_EFFECTS['Hippocampus']['IR']:,} IR DEGs (ratio: {STRESSOR_EFFECTS['Hippocampus']['IR']/max(1,STRESSOR_EFFECTS['Hippocampus']['HU']):.1f}×)

**Both stressors** (synergistic):
- Liver: {STRESSOR_EFFECTS['Liver']['HU']:,} HU vs {STRESSOR_EFFECTS['Liver']['IR']:,} IR DEGs"""
    })
    
    # Compare interaction magnitudes
    examples.append({
        "instruction": "Which tissue shows the largest KMP × stressor interaction?",
        "input": "",
        "output": f"""KMP × HU interactions ranked:
1. Soleus: {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs (largest in dataset)
2. Liver: {INTERACTIONS['Liver']['KMP_x_HU']:,}
3. Heart: {INTERACTIONS['Heart']['KMP_x_HU']}
4. Hippocampus: {INTERACTIONS['Hippocampus']['KMP_x_HU']}

The massive soleus interaction indicates KMP specifically engages stress pathways in mechanically unloaded muscle."""
    })
    
    # Compare OXPHOS patterns
    examples.append({
        "instruction": "Compare OXPHOS responses across tissues.",
        "input": "",
        "output": f"""OXPHOS pathway responses:

**RESCUE pattern** (Stress↓ → KMP↑):
- Heart: {OXPHOS_PATTERNS['Heart']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Heart']['KMP_NES']:.1f}
- Soleus: {OXPHOS_PATTERNS['Soleus']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Soleus']['KMP_NES']:.1f}

**SUPPRESSION pattern** (Stress↑ → KMP↓):
- Liver: {OXPHOS_PATTERNS['Liver']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Liver']['KMP_NES']:.1f}

Contractile tissues need OXPHOS restoration; liver needs metabolic braking."""
    })
    
    # More specific comparisons
    for pathway in ['OXIDATIVE_PHOSPHORYLATION', 'FATTY_ACID_METABOLISM']:
        examples.append({
            "instruction": f"Compare {pathway.replace('_', ' ').lower()} pathway response between heart and liver.",
            "input": "",
            "output": f"""Heart: Stress NES = {PATHWAY_DATA['Heart'][pathway]['stress']:.2f}, KMP NES = {PATHWAY_DATA['Heart'][pathway]['kmp']:.2f} → {PATHWAY_DATA['Heart'][pathway]['pattern']}
Liver: Stress NES = {PATHWAY_DATA['Liver'].get(pathway, {'stress': 'N/A', 'kmp': 'N/A', 'pattern': 'N/A'})['stress']}, KMP NES = {PATHWAY_DATA['Liver'].get(pathway, {'stress': 'N/A', 'kmp': 'N/A', 'pattern': 'N/A'})['kmp']} → {PATHWAY_DATA['Liver'].get(pathway, {'stress': 'N/A', 'kmp': 'N/A', 'pattern': 'N/A'})['pattern']}

These opposite patterns reflect different metabolic requirements."""
        })
    
    # =========================================================================
    # CATEGORY 3: INTERACTION PREDICTION (~50 examples)
    # =========================================================================
    
    # Predict combined stress from main effects
    for tissue in TISSUES:
        hu = STRESSOR_EFFECTS[tissue]['HU']
        ir = STRESSOR_EFFECTS[tissue]['IR']
        combined = STRESSOR_EFFECTS[tissue]['HU_IR']
        
        examples.append({
            "instruction": f"Given HU causes {hu:,} DEGs and IR causes {ir:,} DEGs in {tissue.lower()}, predict combined effect.",
            "input": f"Main effects in {tissue.lower()}:\n- HU: {hu:,} DEGs\n- IR: {ir:,} DEGs",
            "output": f"""Prediction approach: Simple addition suggests {hu + ir:,} DEGs maximum.

Actual: {combined:,} DEGs

Analysis: {'HU-dominated response; IR adds minimal contribution' if ir < hu * 0.1 else 'IR-dominated response' if ir > hu * 3 else 'Both stressors contribute'}.
{'Sub-additive (pathway overlap)' if combined < hu + ir * 0.8 else 'Approximately additive' if combined < (hu + ir) * 1.2 else 'Super-additive (synergy)'}."""
        })
    
    # Predict KMP stress effect from baseline
    for tissue in TISSUES:
        baseline = KMP_EFFECTS[tissue]['baseline']
        stressed = KMP_EFFECTS[tissue]['in_HU_IR']
        
        examples.append({
            "instruction": f"KMP produces {baseline} DEGs at baseline in {tissue.lower()}. Predict effect under combined stress.",
            "input": f"KMP baseline effect: {baseline} DEGs\nTissue type: {TISSUE_TYPES[tissue]}",
            "output": f"""Prediction based on tissue type ({TISSUE_TYPES[tissue]}):
{'Expect INCREASED effect (stress-activated)' if 'stress-activated' in TISSUE_TYPES[tissue] else 'Expect DECREASED effect (stress-blocked)' if 'stress-blocked' in TISSUE_TYPES[tissue] else 'Expect DECREASED effect (baseline-active)'}

Actual: {stressed:,} DEGs

Ratio: {stressed/max(1, baseline):.1f}× {'increase' if stressed > baseline else 'decrease'}
This {'confirms' if (stressed > baseline and 'stress-activated' in TISSUE_TYPES[tissue]) or (stressed < baseline and 'stress-activated' not in TISSUE_TYPES[tissue]) else 'matches'} the {TISSUE_TYPES[tissue]} classification."""
        })
    
    # Cross-tissue prediction
    examples.append({
        "instruction": "Heart shows stress-activated KMP response (112→2,110 DEGs). Predict soleus pattern.",
        "input": "Both are striated muscle with similar OXPHOS dependence.",
        "output": f"""Prediction: Soleus should also show stress-activated pattern (Type A).

Reasoning:
1. Both are contractile tissues requiring ATP for function
2. Both show HU sensitivity (mechanical stress)
3. Similar mitochondrial dependence

Actual: Soleus is Type A
- Baseline: {KMP_EFFECTS['Soleus']['baseline']} DEGs
- Stressed: {KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs

Confirmed: Both contractile tissues show stress-activated KMP pharmacodynamics with OXPHOS rescue."""
    })
    
    examples.append({
        "instruction": "Given liver is Type C (stress-blocked), predict hippocampus response pattern.",
        "input": "Both are non-contractile tissues.",
        "output": f"""Prediction: Cannot assume same pattern—tissue function differs.

Liver (metabolic organ): Type C (stress-blocked)
- Baseline: {KMP_EFFECTS['Liver']['baseline']} DEGs
- Stressed: {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs

Hippocampus (neural tissue): Actually Type B (baseline-active)
- Baseline: {KMP_EFFECTS['Hippocampus']['baseline']:,} DEGs
- Stressed: {KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs

Key insight: Non-contractile tissues don't share a common pattern. Must test each tissue type independently."""
    })
    
    # OXPHOS prediction
    examples.append({
        "instruction": "Heart shows OXPHOS rescue (NES: -2.3→+3.7). Predict liver pattern.",
        "input": "Liver has different metabolic role (detoxification vs pumping).",
        "output": f"""Prediction: Opposite pattern (SUPPRESSION)

Reasoning:
1. Heart stress SUPPRESSES OXPHOS → KMP restores
2. Liver may ACTIVATE OXPHOS under stress (compensatory)
3. "Rescue" in liver = prevent over-activation

Actual:
- Liver stress OXPHOS: NES = +3.6 (ACTIVATED, opposite to heart)
- Liver KMP OXPHOS: NES = -1.6 (SUPPRESSES)
- Pattern: SUPPRESSION (hepatoprotective braking)

Confirmed: Metabolic context determines whether "rescue" means restoration or prevention."""
    })
    
    # Interaction magnitude prediction
    examples.append({
        "instruction": "Heart KMP×HU interaction = 479 DEGs. Soleus has 40× larger HU main effect. Predict soleus KMP×HU.",
        "input": f"Heart HU effect: {STRESSOR_EFFECTS['Heart']['HU']} DEGs\nSoleus HU effect: {STRESSOR_EFFECTS['Soleus']['HU']:,} DEGs",
        "output": f"""Prediction: Much larger interaction, possibly 5,000-15,000 DEGs

Reasoning:
1. Soleus is ~40× more HU-sensitive than heart
2. Both are Type A (stress-activated)
3. Larger main effect often → larger interaction potential
4. Soleus baseline KMP = 0, so ALL effects are stress-dependent

Actual: {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs

This is {INTERACTIONS['Soleus']['KMP_x_HU']/INTERACTIONS['Heart']['KMP_x_HU']:.0f}× larger than heart—the biggest interaction in the dataset."""
    })
    
    # More interaction predictions
    for tissue in TISSUES:
        examples.append({
            "instruction": f"Predict whether {tissue.lower()} will show larger KMP×HU or KMP×IR interaction.",
            "input": f"HU effect: {STRESSOR_EFFECTS[tissue]['HU']:,} DEGs\nIR effect: {STRESSOR_EFFECTS[tissue]['IR']:,} DEGs",
            "output": f"""Prediction: {'KMP×HU larger (HU-dominant tissue)' if STRESSOR_EFFECTS[tissue]['HU'] > STRESSOR_EFFECTS[tissue]['IR'] * 3 else 'KMP×IR larger (IR-dominant tissue)' if STRESSOR_EFFECTS[tissue]['IR'] > STRESSOR_EFFECTS[tissue]['HU'] * 3 else 'Comparable interactions expected'}

Actual:
- KMP×HU: {INTERACTIONS[tissue]['KMP_x_HU']:,} DEGs
- KMP×IR: {INTERACTIONS[tissue]['KMP_x_IR']:,} DEGs
- Ratio: {INTERACTIONS[tissue]['KMP_x_HU']/max(1, INTERACTIONS[tissue]['KMP_x_IR']):.1f}×

{'Confirmed: Interaction follows main effect dominance.' if (INTERACTIONS[tissue]['KMP_x_HU'] > INTERACTIONS[tissue]['KMP_x_IR'] and STRESSOR_EFFECTS[tissue]['HU'] > STRESSOR_EFFECTS[tissue]['IR']) or (INTERACTIONS[tissue]['KMP_x_IR'] > INTERACTIONS[tissue]['KMP_x_HU'] and STRESSOR_EFFECTS[tissue]['IR'] > STRESSOR_EFFECTS[tissue]['HU']) else 'Note: Interaction pattern differs from main effect dominance.'}"""
        })
    
    # =========================================================================
    # CATEGORY 4: DESIGN CRITIQUE (~30 examples)
    # =========================================================================
    
    examples.append({
        "instruction": "Critique: Testing KMP only at baseline to assess spaceflight protection potential.",
        "input": "",
        "output": f"""**Major flaw**: Would miss KMP's primary protective mechanism.

Evidence:
- Heart: {KMP_EFFECTS['Heart']['baseline']} DEGs baseline vs {KMP_EFFECTS['Heart']['in_HU_IR']:,} stressed
- Soleus: {KMP_EFFECTS['Soleus']['baseline']} DEGs baseline vs {KMP_EFFECTS['Soleus']['in_HU_IR']} stressed

Baseline-only testing would conclude "KMP has no effect" when it actually shows massive stress-activated protection.

**Recommendation**: Always include stress conditions in countermeasure testing."""
    })
    
    examples.append({
        "instruction": "Critique: Using only heart data to predict KMP effects in all tissues.",
        "input": "",
        "output": f"""**Major flaw**: Tissue responses are fundamentally different.

Heart (Type A): {KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs under stress, OXPHOS rescue
Liver (Type C): {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs under stress, OXPHOS suppression
Hippocampus (Type B): {KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs under stress

Extrapolating heart→liver would predict RESCUE when actual pattern is SUPPRESSION.

**Recommendation**: Test each target tissue independently."""
    })
    
    examples.append({
        "instruction": "Critique: Using |log2FC| > 2 threshold for DEG detection.",
        "input": "",
        "output": """**Potential issue**: May miss coordinated small changes with large pathway effects.

Our analysis shows:
- Many hub genes have |log2FC| < 2 (e.g., Eda2r: 0.75, Cdkn1a: 1.12)
- OXPHOS rescue (NES shift +6.0) occurs through many genes with modest individual changes
- Pathway-level effects can be dramatic even with small per-gene changes

**Recommendation**:
- Use padj-only for discovery
- Report at multiple thresholds
- Always include pathway analysis
- Don't conclude "no effect" from stringent gene-level thresholds alone."""
    })
    
    examples.append({
        "instruction": "Critique: Single time-point measurement for KMP efficacy.",
        "input": "",
        "output": """**Limitation**: Cannot assess temporal dynamics.

Missing information:
1. Onset timing (when does KMP effect begin?)
2. Duration (how long does protection last?)
3. Recovery trajectory
4. Optimal treatment window

For spaceflight:
- Missions last months
- Single time-point may miss transient effects
- Recovery kinetics matter for crew health

**Recommendation**: Time-course studies with multiple sampling points."""
    })
    
    examples.append({
        "instruction": "Critique: Concluding KMP is 'ineffective' in liver based on 3 DEGs under stress.",
        "input": "",
        "output": f"""**Flawed conclusion**: Low DEG count ≠ no effect.

Observations:
1. Liver shows {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs under stress (seemingly minimal)
2. BUT: KMP×HU interaction = {INTERACTIONS['Liver']['KMP_x_HU']:,} DEGs (substantial!)
3. OXPHOS pathway shows clear SUPPRESSION pattern

What's happening:
- KMP effect is MASKED by stress, not absent
- Pathway-level modulation still occurs
- Different pharmacodynamic mechanism (prevention vs activation)

**Correct interpretation**: Liver shows stress-blocked KMP transcriptional response, but pathway modulation still occurs through different mechanisms."""
    })
    
    examples.append({
        "instruction": "Evaluate: Proposing KMP human trials based solely on mouse data.",
        "input": "",
        "output": """**Assessment**: Premature without additional studies.

What mouse data provides:
✓ Proof of concept (stress-activated protection)
✓ Tissue-specific patterns identified
✓ Mechanistic pathway data
✓ Dose-response in mice

What's still needed:
✗ Human pharmacokinetics
✗ Human tissue responses
✗ True microgravity effects (HU is analog)
✗ Long-duration effects
✗ Safety profile
✗ Drug-drug interactions

**Recommended path**:
1. Human cell/organoid studies
2. PK modeling
3. Ground-based human analog studies
4. Then Phase I trials

Mouse data provides rationale, not sufficient evidence for direct human trials."""
    })
    
    # =========================================================================
    # CATEGORY 5: MECHANISTIC REASONING (~30 examples)
    # =========================================================================
    
    examples.append({
        "instruction": "Explain why soleus shows 0 KMP DEGs at baseline but 491 under stress.",
        "input": "",
        "output": """This "stress-activated" pattern indicates conditional pharmacodynamics:

**Mechanism hypotheses**:

1. **Target availability**: KMP receptors/targets may be expressed only under stress
   - Mechanical unloading activates mechanosensitive gene programs
   - Creates binding sites for KMP-responsive transcription factors

2. **Pathway priming**: Stress activates pathways that synergize with KMP
   - Oxidative stress → antioxidant response elements accessible
   - Inflammation → NF-κB pathway engagement

3. **Metabolic context**: Mitochondrial dysfunction creates KMP targets
   - Normal mitochondria: No dysfunction to rescue
   - Stressed mitochondria: OXPHOS suppression creates rescue opportunity

**Implication**: KMP is a conditional countermeasure—active when needed, dormant otherwise. This is advantageous: minimal side effects in healthy tissue."""
    })
    
    examples.append({
        "instruction": "Why does liver show OXPHOS SUPPRESSION while heart shows OXPHOS RESCUE?",
        "input": "",
        "output": f"""Opposite patterns reflect different metabolic contexts:

**Heart (RESCUE)**:
- Function: Continuous contraction requires ATP
- Stress effect: OXPHOS suppressed (NES = {OXPHOS_PATTERNS['Heart']['stress_NES']:.1f})
- Problem: Energy deficit → contractile failure
- KMP solution: Restore OXPHOS (NES → {OXPHOS_PATTERNS['Heart']['KMP_NES']:.1f})

**Liver (SUPPRESSION)**:
- Function: Metabolic processing, detoxification
- Stress effect: OXPHOS activated (NES = {OXPHOS_PATTERNS['Liver']['stress_NES']:.1f})
- Problem: Over-activation → ROS, oxidative damage
- KMP solution: Brake metabolism (NES → {OXPHOS_PATTERNS['Liver']['KMP_NES']:.1f})

**Key insight**: "Protection" means different things:
- Heart: Restore lost function
- Liver: Prevent harmful over-activation

This is why cross-tissue drug extrapolation is dangerous without mechanistic understanding."""
    })
    
    examples.append({
        "instruction": "Explain the biological significance of the 8,484 DEG KMP×HU interaction in soleus.",
        "input": "",
        "output": f"""This massive interaction ({INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs) is the largest in the dataset.

**What it means mathematically**:
- KMP alone: {KMP_EFFECTS['Soleus']['baseline']} DEGs
- HU alone: {STRESSOR_EFFECTS['Soleus']['HU']:,} DEGs (many genes, but different set)
- Interaction: 8,484 DEGs that are NEITHER KMP-only NOR HU-only

**Biological interpretation**:

1. **Emergent pharmacology**: KMP creates effects that ONLY exist in unloaded muscle
   
2. **Mechanosensitive activation**: Unloading activates pathways that:
   - Create KMP binding sites
   - Express KMP target genes
   - Enable KMP transcriptional effects

3. **Myokine signaling**: Unloaded muscle secretes different myokines that may synergize with KMP

4. **Mitochondrial crosstalk**: KMP's mitochondrial effects only manifest when organelles are stressed

**Therapeutic implication**: This interaction is exactly what you want—a countermeasure that specifically engages when the tissue is stressed, not when it's healthy."""
    })
    
    examples.append({
        "instruction": "Why do circadian genes (Arntl, Per2, Npas2) appear as hub genes in heart?",
        "input": "",
        "output": """Circadian genes as hub genes suggests spaceflight disrupts daily rhythms:

**Biological connection**:

1. **Circadian-metabolic coupling**: 
   - Heart metabolism is circadian-regulated
   - BMAL1 (Arntl) controls mitochondrial biogenesis
   - PER2 regulates fatty acid metabolism

2. **Spaceflight disruption**:
   - Altered light cycles
   - Shifted sleep patterns
   - Stress hormone rhythms disrupted

3. **KMP circadian effects**:
   - Flavonoids can modulate circadian machinery
   - May help resynchronize metabolic rhythms
   - OXPHOS rescue could occur through circadian normalization

**Implication**: KMP's cardiac protection may involve circadian rhythm restoration, not just direct metabolic effects. This suggests timing of KMP administration might matter."""
    })
    
    # =========================================================================
    # CATEGORY 6: UNCERTAINTY CALIBRATION (~25 examples)
    # =========================================================================
    
    examples.append({
        "instruction": "What is the effect of KMP on cardiac contractility?",
        "input": "",
        "output": """**Cannot directly answer from transcriptomic data.**

What data shows:
- 2,110 DEGs in heart under stress
- OXPHOS rescue (NES +6.0 shift)
- Circadian gene normalization

What data does NOT show:
- Actual force generation
- Calcium handling
- Protein-level changes
- Ejection fraction

**Reasonable inference** (with uncertainty):
- OXPHOS rescue suggests improved ATP availability
- ATP is required for contraction
- Therefore contractility MIGHT improve

**Confidence**: Low for functional claims, high for transcriptional patterns

**Would need**: Echocardiography, isolated heart studies, or cardiomyocyte assays."""
    })
    
    examples.append({
        "instruction": "Will KMP prevent muscle atrophy in astronauts?",
        "input": "",
        "output": """**Cannot confidently answer this question.**

**Supportive mouse data**:
✓ 491 DEGs in soleus under stress (stress-activated)
✓ 8,484 DEG KMP×HU interaction
✓ OXPHOS rescue pattern
✓ 86.6% upregulated genes (anabolic signature)

**Unknown factors**:
✗ Human pharmacokinetics
✗ Required human dose
✗ True microgravity vs HU analog
✗ 6+ month mission duration effects
✗ Translation of transcriptomics to muscle mass

**My assessment**:
- Strong mechanistic rationale: YES
- Sufficient evidence for human use: NO

Recommended next steps: Human muscle organoids, dose-finding, analog studies."""
    })
    
    examples.append({
        "instruction": "Is the exact DEG count of 2,110 in heart reliable?",
        "input": "",
        "output": """**The number is precise but context-dependent.**

2,110 is:
- Exact count at padj < 0.05 threshold
- Specific to this analysis pipeline
- Reproducible with same parameters

Sources of variation:
1. Threshold choice (log2FC filter would change count)
2. Normalization method affects p-values
3. Multiple testing correction method
4. Biological variation between replicates

**Appropriate statements**:
✓ "Approximately 2,100 DEGs"
✓ "About 2,000 significantly changed genes"
✗ "Exactly 2,110 is the true biological count"

The order of magnitude and relative ranking across tissues is robust; exact number is method-dependent."""
    })
    
    examples.append({
        "instruction": "Which gene is most important for KMP's protective effect?",
        "input": "",
        "output": """**Cannot definitively answer from transcriptomic data.**

**Why we can't identify "most important"**:
1. Large change ≠ causal role (correlation ≠ causation)
2. Pathway redundancy (multiple genes, same function)
3. No functional validation (knockdown/knockout)
4. Tissue-specificity (different genes in different tissues)

**What we CAN say**:
- Hub genes show largest/most consistent effects
- Heart: Cdkn1a, Arntl appear frequently
- Pathway level: OXPHOS genes collectively important

**To identify causal genes would need**:
- CRISPR screening
- Pharmacological inhibitors
- Time-course (early responders)
- Cross-species conservation

Transcriptomics identifies candidates; functional studies identify mechanisms."""
    })
    
    examples.append({
        "instruction": "What dose of KMP should humans take?",
        "input": "",
        "output": """**This data cannot inform human dosing.**

**What's missing**:
1. Dose-response relationship (only one dose tested)
2. Human pharmacokinetics (ADME)
3. Therapeutic window (efficacy vs toxicity)
4. Bioavailability differences
5. Target tissue concentrations

**Why mouse→human scaling is unreliable**:
- Metabolic rate differences
- Body surface area scaling often off by 10-100×
- Species-specific drug metabolism
- Individual human variation

**Required studies for dosing**:
1. Mouse dose-response
2. PK modeling
3. Human cell studies
4. Phase I safety/PK
5. Then efficacy trials

**Do not extrapolate**: Mechanism supports development; dosing requires dedicated PK/PD work."""
    })
    
    # =========================================================================
    # ADDITIONAL EXAMPLES TO REACH 200+
    # =========================================================================
    
    # Specific pathway questions
    for tissue in ['Heart', 'Liver']:
        for pathway, data in PATHWAY_DATA.get(tissue, {}).items():
            examples.append({
                "instruction": f"What is the {pathway.replace('_', ' ')} NES in {tissue.lower()} under stress?",
                "input": "",
                "output": f"Under combined HU+IR stress, {tissue.lower()} shows {pathway.replace('_', ' ')} NES = {data['stress']:.2f}. After KMP treatment, NES shifts to {data['kmp']:.2f}. Pattern: {data['pattern']}."
            })
    
    # Binary comparison questions
    comparisons = [
        ("Heart", "Soleus", "HU sensitivity", lambda t: STRESSOR_EFFECTS[t]['HU']),
        ("Heart", "Hippocampus", "IR sensitivity", lambda t: STRESSOR_EFFECTS[t]['IR']),
        ("Liver", "Soleus", "KMP effect under stress", lambda t: KMP_EFFECTS[t]['in_HU_IR']),
    ]
    
    for t1, t2, metric, func in comparisons:
        v1, v2 = func(t1), func(t2)
        examples.append({
            "instruction": f"Which has higher {metric}: {t1.lower()} or {t2.lower()}?",
            "input": "",
            "output": f"{t1 if v1 > v2 else t2} has higher {metric} ({max(v1,v2):,} vs {min(v1,v2):,} DEGs)."
        })
    
    # Yes/No questions
    yn_questions = [
        ("Is heart more sensitive to HU than IR?", STRESSOR_EFFECTS['Heart']['HU'] > STRESSOR_EFFECTS['Heart']['IR'], f"Yes. Heart shows {STRESSOR_EFFECTS['Heart']['HU']} HU DEGs vs {STRESSOR_EFFECTS['Heart']['IR']} IR DEGs."),
        ("Does KMP show stress-activated response in liver?", False, f"No. Liver is Type C (stress-blocked): {KMP_EFFECTS['Liver']['baseline']} DEGs at baseline → {KMP_EFFECTS['Liver']['in_HU_IR']} under stress."),
        ("Is the KMP×HU interaction larger than KMP×IR in soleus?", INTERACTIONS['Soleus']['KMP_x_HU'] > INTERACTIONS['Soleus']['KMP_x_IR'], f"Yes. KMP×HU = {INTERACTIONS['Soleus']['KMP_x_HU']:,} vs KMP×IR = {INTERACTIONS['Soleus']['KMP_x_IR']} DEGs."),
        ("Does hippocampus show OXPHOS rescue?", False, "No. Hippocampus shows minimal stress effect on OXPHOS (NES = 0.93, NS). Cannot rescue what isn't suppressed."),
    ]
    
    for q, answer, explanation in yn_questions:
        examples.append({
            "instruction": q,
            "input": "",
            "output": explanation
        })
    
    return examples


def format_for_training(examples):
    """Render instruction records into single-string SFT prompts.

    Each record must provide ``instruction`` and ``output``; a truthy
    ``input`` value adds an ``### Input:`` section between the two.

    Returns a list of ``{"text": prompt}`` dicts, one per record, in the
    same order as ``examples``.
    """

    def _render(record):
        # The Input section is optional: missing or empty values drop it.
        has_context = bool(record.get('input'))
        sections = [f"### Instruction:\n{record['instruction']}"]
        if has_context:
            sections.append(f"### Input:\n{record['input']}")
        sections.append(f"### Response:\n{record['output']}")
        # Sections are separated by exactly one blank line.
        return {"text": "\n\n".join(sections)}

    return [_render(record) for record in examples]


def main():
    """Generate the dataset, write it to disk and print a short summary.

    The formatted examples are dumped to ``kmp_sft_dataset.json`` in the
    current working directory. The category breakdown printed afterwards
    is a keyword heuristic over the instruction text, not a stored label.
    """
    print("Generating expanded SFT dataset...")
    examples = generate_examples()
    formatted = format_for_training(examples)

    # Save
    with open('kmp_sft_dataset.json', 'w') as out_file:
        json.dump(formatted, out_file, indent=2)

    banner = '=' * 60
    print(f"\n{banner}")
    print("SFT Dataset Summary")
    print(banner)
    print(f"Total examples: {len(formatted)}")
    print("Output: kmp_sft_dataset.json")

    # Count by approximate category (based on keywords).
    # Rules are checked in order and the first match wins; anything that
    # matches no rule is counted as 'Calibration'.
    keyword_rules = (
        ('Factual', ('how many', 'what is the', 'describe')),
        ('Comparison', ('compare', 'rank', 'which')),
        ('Prediction', ('predict', 'given')),
        ('Critique', ('critique', 'evaluate')),
        ('Mechanistic', ('explain', 'why')),
    )

    def bucket_for(text):
        for label, keywords in keyword_rules:
            if any(keyword in text for keyword in keywords):
                return label
        return 'Calibration'

    tallies = {label: 0 for label, _ in keyword_rules}
    tallies['Calibration'] = 0
    for record in examples:
        tallies[bucket_for(record['instruction'].lower())] += 1

    print("\nApproximate category breakdown:")
    for label, total in tallies.items():
        print(f"  - {label}: {total}")

# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()