| """ |
| Dataset creation and loading utilities for BioRLHF. |
| |
| This module provides functions to create instruction-tuning datasets from |
| biological experimental data and load existing datasets. |
| """ |
|
|
| import json |
| from pathlib import Path |
| from typing import List, Dict, Optional, Union |
| from datasets import Dataset as HFDataset, load_dataset as hf_load_dataset |
|
|
| from biorlhf.data.ground_truth import ( |
| STRESSOR_EFFECTS, |
| KMP_EFFECTS, |
| INTERACTIONS, |
| TISSUE_TYPES, |
| OXPHOS_PATTERNS, |
| ) |
|
|
|
|
| def load_dataset( |
| path: Union[str, Path], |
| split: Optional[str] = None, |
| test_size: float = 0.1, |
| seed: int = 42, |
| ) -> Union[HFDataset, Dict[str, HFDataset]]: |
| """ |
| Load a BioRLHF dataset from a JSON file. |
| |
| Args: |
| path: Path to the JSON dataset file. |
| split: If specified, return only this split ('train' or 'test'). |
| test_size: Fraction of data to use for testing. |
| seed: Random seed for reproducible splits. |
| |
| Returns: |
| HuggingFace Dataset or dict of train/test splits. |
| """ |
| dataset = hf_load_dataset("json", data_files=str(path))["train"] |
|
|
| if test_size > 0: |
| splits = dataset.train_test_split(test_size=test_size, seed=seed) |
| if split: |
| return splits[split] |
| return splits |
|
|
| return dataset |
|
|
|
|
| def create_sft_dataset( |
| output_path: Union[str, Path] = "kmp_sft_dataset.json", |
| include_calibration: bool = True, |
| include_chain_of_thought: bool = True, |
| ) -> List[Dict[str, str]]: |
| """ |
| Create an SFT dataset from ground truth biological data. |
| |
| Args: |
| output_path: Path to save the generated dataset. |
| include_calibration: Include uncertainty calibration examples. |
| include_chain_of_thought: Include chain-of-thought reasoning examples. |
| |
| Returns: |
| List of formatted training examples. |
| """ |
| all_examples = [] |
|
|
| |
| all_examples.extend(_generate_factual_examples()) |
|
|
| |
| all_examples.extend(_generate_comparison_examples()) |
|
|
| |
| all_examples.extend(_generate_interaction_examples()) |
|
|
| |
| all_examples.extend(_generate_design_critique_examples()) |
|
|
| |
| if include_chain_of_thought: |
| all_examples.extend(_generate_mechanistic_examples()) |
|
|
| |
| if include_calibration: |
| all_examples.extend(_generate_calibration_examples()) |
|
|
| |
| formatted = [] |
| for ex in all_examples: |
| if ex.get("input"): |
| text = ( |
| f"### Instruction:\n{ex['instruction']}\n\n" |
| f"### Input:\n{ex['input']}\n\n" |
| f"### Response:\n{ex['output']}" |
| ) |
| else: |
| text = ( |
| f"### Instruction:\n{ex['instruction']}\n\n" |
| f"### Response:\n{ex['output']}" |
| ) |
| formatted.append({"text": text}) |
|
|
| |
| with open(output_path, "w") as f: |
| json.dump(formatted, f, indent=2) |
|
|
| print(f"Created SFT dataset with {len(formatted)} examples at {output_path}") |
| return formatted |
|
|
|
|
| def _generate_factual_examples() -> List[Dict]: |
| """Generate factual Q&A examples about experimental results.""" |
| examples = [] |
|
|
| for tissue, effects in STRESSOR_EFFECTS.items(): |
| if tissue == "Soleus": |
| hu_desc = "This reflects the severe muscle atrophy that occurs during simulated microgravity." |
| else: |
| hu_desc = f"This indicates {tissue.lower()} sensitivity to mechanical unloading." |
|
|
| examples.append({ |
| "instruction": f"How many differentially expressed genes does hindlimb unloading cause in {tissue.lower()}?", |
| "input": "", |
| "output": f"Hindlimb unloading (HU) causes {effects['HU']:,} differentially expressed genes (padj < 0.05) in {tissue.lower()}. {hu_desc}" |
| }) |
|
|
| if tissue == "Hippocampus": |
| ir_desc = "This high number reflects the brain's particular sensitivity to radiation-induced damage." |
| elif effects["IR"] < 100: |
| ir_desc = "This minimal response indicates low radiation sensitivity in this tissue." |
| else: |
| ir_desc = "This moderate response indicates radiation sensitivity in this tissue." |
|
|
| examples.append({ |
| "instruction": f"How many DEGs does ionizing radiation induce in {tissue.lower()}?", |
| "input": "", |
| "output": f"Ionizing radiation (IR) induces {effects['IR']:,} DEGs in {tissue.lower()}. {ir_desc}" |
| }) |
|
|
| for tissue, effects in KMP_EFFECTS.items(): |
| pattern = "stress-activated" if effects["in_HU_IR"] > effects["baseline"] else "stress-blocked" |
| examples.append({ |
| "instruction": f"How does kaempferol (KMP) affect gene expression in {tissue.lower()} under combined stress?", |
| "input": "", |
| "output": f"In {tissue.lower()}, KMP produces {effects['in_HU_IR']:,} DEGs under combined HU+IR stress, compared to {effects['baseline']:,} DEGs at baseline. This {pattern} pattern indicates KMP's pharmacodynamics are highly context-dependent." |
| }) |
|
|
| for tissue, ttype in TISSUE_TYPES.items(): |
| examples.append({ |
| "instruction": f"What type of KMP response pattern does {tissue.lower()} show?", |
| "input": "", |
| "output": f"{tissue} is classified as {ttype}." |
| }) |
|
|
| return examples |
|
|
|
|
| def _generate_comparison_examples() -> List[Dict]: |
| """Generate tissue comparison examples.""" |
| examples = [] |
|
|
| examples.append({ |
| "instruction": "Which tissue is most sensitive to hindlimb unloading?", |
| "input": "", |
| "output": f"Soleus muscle is most sensitive to hindlimb unloading with {STRESSOR_EFFECTS['Soleus']['HU']:,} DEGs, followed by Liver ({STRESSOR_EFFECTS['Liver']['HU']:,}), Hippocampus ({STRESSOR_EFFECTS['Hippocampus']['HU']:,}), and Heart ({STRESSOR_EFFECTS['Heart']['HU']})." |
| }) |
|
|
| examples.append({ |
| "instruction": "Which tissue is most sensitive to ionizing radiation?", |
| "input": "", |
| "output": f"Hippocampus is most sensitive to ionizing radiation with {STRESSOR_EFFECTS['Hippocampus']['IR']:,} DEGs, dramatically higher than other tissues." |
| }) |
|
|
| examples.append({ |
| "instruction": "Compare KMP's transcriptional effects across all four tissues under combined stress.", |
| "input": "", |
| "output": f"KMP shows dramatically different effects across tissues under HU+IR stress: Heart ({KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs, stress-activated), Soleus ({KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs, stress-activated), Hippocampus ({KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs, stress-blocked), and Liver ({KMP_EFFECTS['Liver']['in_HU_IR']} DEGs, stress-blocked)." |
| }) |
|
|
| return examples |
|
|
|
|
| def _generate_interaction_examples() -> List[Dict]: |
| """Generate interaction prediction examples.""" |
| examples = [] |
|
|
| for tissue in ["Heart", "Hippocampus", "Liver", "Soleus"]: |
| hu = STRESSOR_EFFECTS[tissue]["HU"] |
| ir = STRESSOR_EFFECTS[tissue]["IR"] |
| combined = STRESSOR_EFFECTS[tissue]["HU_IR"] |
|
|
| examples.append({ |
| "instruction": f"Given the main effects of HU and IR separately in {tissue.lower()}, predict the combined HU+IR effect.", |
| "input": f"{tissue} main effects: HU alone: {hu:,} DEGs, IR alone: {ir:,} DEGs. Predict the combined HU+IR effect.", |
| "output": f"If additive, expect up to {hu + ir:,} DEGs. Actual result: {combined:,} DEGs. The combined effect reflects tissue-specific integration of stress responses." |
| }) |
|
|
| return examples |
|
|
|
|
| def _generate_design_critique_examples() -> List[Dict]: |
| """Generate experimental design critique examples.""" |
| examples = [] |
|
|
| examples.append({ |
| "instruction": "A researcher proposes testing KMP efficacy by comparing KMP-treated vs vehicle-treated mice under normal conditions. Critique this design.", |
| "input": "Proposed: Compare KMP_vs_Ctrl under normal conditions.", |
| "output": f"This design would miss KMP's primary mechanism. Heart: {KMP_EFFECTS['Heart']['baseline']} DEGs at baseline vs {KMP_EFFECTS['Heart']['in_HU_IR']:,} under stress. Soleus: {KMP_EFFECTS['Soleus']['baseline']} vs {KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs. Recommendation: Use factorial design including stress conditions." |
| }) |
|
|
| examples.append({ |
| "instruction": "Evaluate whether results from heart can predict liver response to KMP under stress.", |
| "input": "Heart shows KMP rescues OXPHOS under stress. Can we extrapolate to liver?", |
| "output": f"Direct extrapolation would be INCORRECT. Heart is Type A, Liver is Type C. OXPHOS patterns are opposite: Heart stress suppresses then KMP rescues; Liver stress activates then KMP suppresses. Each tissue requires independent testing." |
| }) |
|
|
| return examples |
|
|
|
|
| def _generate_mechanistic_examples() -> List[Dict]: |
| """Generate chain-of-thought mechanistic reasoning examples.""" |
| examples = [] |
|
|
| examples.append({ |
| "instruction": "Explain why hippocampus shows strong KMP effect at baseline (4,110 DEGs) but minimal effect under combined stress (140 DEGs).", |
| "input": "", |
| "output": "This 'baseline-active, stress-blocked' pattern suggests: (1) Receptor saturation - stress signals already engage KMP targets, (2) Pathway competition - IR-induced DNA damage response overrides KMP effects, (3) Altered cellular state - radiation changes transcriptional machinery. Implications: may need pre-treatment before stress." |
| }) |
|
|
| examples.append({ |
| "instruction": f"The KMP x HU interaction in soleus produces {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs. Explain this super-additive interaction.", |
| "input": "", |
| "output": "This massive interaction indicates: (1) Mechanosensitive activation - unloading creates binding sites for KMP-responsive factors, (2) Stress-primed responsiveness - HU upregulates KMP targets normally not expressed, (3) Mitochondrial crosstalk - KMP effects on mitochondria only manifest when organelles are stressed. Key insight: KMP specifically protects stressed tissue." |
| }) |
|
|
| examples.append({ |
| "instruction": "Why does liver show OXPHOS suppression by KMP when heart and soleus show OXPHOS rescue?", |
| "input": "", |
| "output": "Different metabolic contexts: Heart/Soleus (RESCUE) - stress suppresses OXPHOS, insufficient ATP, KMP restores function. Liver (SUPPRESSION) - stress activates compensatory metabolism, over-activation causes damage, KMP provides 'hepatoprotective braking'. 'Rescue' is tissue-specific: restore suppressed function vs prevent harmful over-activation." |
| }) |
|
|
| return examples |
|
|
|
|
| def _generate_calibration_examples() -> List[Dict]: |
| """Generate uncertainty calibration examples.""" |
| examples = [] |
|
|
| examples.append({ |
| "instruction": "What is the effect of KMP on cardiac contractility?", |
| "input": "", |
| "output": "I cannot directly answer this. The data shows transcriptomic changes (2,110 DEGs, OXPHOS rescue) but NOT functional contractility. OXPHOS rescue suggests improved ATP availability, which might improve contractility, but transcription-to-function is not guaranteed. Confidence: low for functional claims." |
| }) |
|
|
| examples.append({ |
| "instruction": "Will KMP prevent muscle atrophy in human astronauts?", |
| "input": "", |
| "output": "I cannot confidently answer this. Mouse data supports: stress-activated protection, OXPHOS rescue, KMP x HU interaction. Mouse data cannot tell us: human PK, dose requirements, true microgravity effects, long-duration effects. Strong mechanistic rationale but insufficient for human predictions without additional studies." |
| }) |
|
|
| examples.append({ |
| "instruction": "Which gene is most important for KMP's protective effect?", |
| "input": "", |
| "output": "Cannot be definitively answered from transcriptomic data. Problems: correlation is not causation, pathway redundancy, no functional validation, tissue-specificity. Better questions: 'Which genes show largest effects?' (answerable), 'Which pathways are enriched?' (answerable), 'Which gene is necessary?' (requires CRISPR studies)." |
| }) |
|
|
| examples.append({ |
| "instruction": "Based on this data, what dose of KMP should be used for spaceflight countermeasures?", |
| "input": "", |
| "output": "This data CANNOT inform dosing. Missing: dose-response relationship (only one dose tested), human PK, therapeutic window, bioavailability differences. Mouse-to-human scaling is unreliable (10-100x errors possible). Needed: mouse dose-response, PK modeling, human Phase I studies. Do not extrapolate dosing from this study." |
| }) |
|
|
| return examples |
|
|