# Ev3Dev's picture
# Upload folder using huggingface_hub
# 5c3cfae verified
"""Stochastic noise models for the biological simulator."""
from __future__ import annotations
from typing import Dict, List, Tuple
import numpy as np
class NoiseModel:
    """Generates calibrated noise for simulated experimental outputs.

    All randomness is funnelled through a single ``numpy.random.Generator``
    (``self.rng``) so that episodes are reproducible given the same seed.
    Every stochastic method draws from that generator in a fixed order, so
    the sequence of calls made on an instance determines its outputs.
    """

    def __init__(self, seed: int = 42):
        """Create the model with a generator seeded by *seed*."""
        self.rng: np.random.Generator = np.random.default_rng(seed)

    def reseed(self, seed: int) -> None:
        """Replace the generator with a fresh one seeded by *seed*."""
        self.rng = np.random.default_rng(seed)

    # ── expression-level noise ──────────────────────────────────────────

    def add_expression_noise(
        self,
        true_values: Dict[str, float],
        noise_level: float,
        dropout_rate: float,
    ) -> Dict[str, float]:
        """Return a noisy copy of *true_values* with random dropouts.

        Args:
            true_values: Mapping of gene name to true expression value.
            noise_level: Multiplier on ``abs(value)`` for the Gaussian
                standard deviation.
            dropout_rate: Dropout probability for a gene whose value is 0;
                it shrinks as ``abs(value)`` grows.

        Returns:
            A new dict with the same keys. Dropped genes map to ``0.0``;
            the rest map to ``value + N(0, sigma)``.
        """
        noisy: Dict[str, float] = {}
        for gene, value in true_values.items():
            magnitude = abs(value)
            # Dropout probability falls off as 1 / (1 + |value|): lowly
            # expressed genes drop out much more readily, matching the
            # zero-inflation pattern in real scRNA-seq data.
            p_drop = dropout_rate / (1.0 + magnitude)
            # The uniform draw always happens first and the normal draw only
            # for surviving genes; keep this order to preserve seeded output.
            if self.rng.random() < p_drop:
                noisy[gene] = 0.0
                continue
            # The additive 0.1 keeps sigma positive for zero-valued genes.
            sigma = noise_level * magnitude + 0.1
            noisy[gene] = float(value + self.rng.normal(0, sigma))
        return noisy

    # ── effect-size sampling ────────────────────────────────────────────

    @staticmethod
    def _standard_error(sample_size: int, noise_level: float) -> float:
        """Return ``noise_level / sqrt(n)`` with ``n`` floored at 1."""
        # Flooring n at 1 already keeps the denominator >= 1, so no further
        # guard against division by zero is required.
        return float(noise_level / np.sqrt(max(sample_size, 1)))

    def sample_effect_sizes(
        self,
        true_effects: Dict[str, float],
        sample_size: int,
        noise_level: float,
    ) -> Dict[str, float]:
        """Return each true effect perturbed by Gaussian sampling error.

        The standard error is ``noise_level / sqrt(max(sample_size, 1))``
        and one normal draw is made per gene, in dict iteration order.
        """
        se = self._standard_error(sample_size, noise_level)
        return {
            gene: float(effect + self.rng.normal(0, se))
            for gene, effect in true_effects.items()
        }

    def sample_p_values(
        self,
        true_effects: Dict[str, float],
        sample_size: int,
        noise_level: float,
    ) -> Dict[str, float]:
        """Simulate approximate p-values from z-statistics.

        For every gene, ``z = abs(effect) / se`` and the two-sided p-value
        is ``2 * norm.sf(z)``. No random draws are made here: the result
        depends only on the arguments.

        Returns:
            Mapping of gene name to p-value, in the order of *true_effects*.
        """
        # Imported lazily so that SciPy is only required by this method.
        from scipy import stats  # type: ignore[import-untyped]

        # Floor the standard error so a zero noise level yields a very
        # large z (p-value ~ 0) instead of a division by zero.
        se = max(self._standard_error(sample_size, noise_level), 1e-8)
        effects = np.fromiter(
            true_effects.values(), dtype=float, count=len(true_effects)
        )
        # One vectorized survival-function call instead of one per gene.
        p_values = 2 * stats.norm.sf(np.abs(effects) / se)
        return dict(zip(true_effects, p_values.tolist()))

    # ── false discovery helpers ─────────────────────────────────────────

    def generate_false_positives(
        self, n_background_genes: int, fdr: float
    ) -> List[str]:
        """Return placeholder names for spurious hits among background genes.

        The number of names is a single ``Binomial(n_background_genes, fdr)``
        draw. A negative gene count is treated as 0 and *fdr* is clamped to
        ``[0, 1]`` so slightly out-of-range inputs do not raise.

        Returns:
            ``["FP_GENE_0", ..., "FP_GENE_{k-1}"]`` for the drawn count ``k``.
        """
        n_trials = max(n_background_genes, 0)
        rate = min(max(fdr, 0.0), 1.0)
        n_fp = int(self.rng.binomial(n_trials, rate))
        return [f"FP_GENE_{i}" for i in range(n_fp)]

    def generate_false_negatives(
        self, true_genes: List[str], fnr: float
    ) -> List[str]:
        """Return the subset of *true_genes* that are missed.

        Each gene is missed independently with probability *fnr*; one
        uniform draw is made per gene, in list order.
        """
        return [g for g in true_genes if self.rng.random() < fnr]

    # ── quality helpers ─────────────────────────────────────────────────

    def quality_degradation(
        self, base_quality: float, factors: List[float]
    ) -> float:
        """Scale *base_quality* by every factor, jitter it, clip to [0, 1].

        The jitter is a single ``N(0, 0.02)`` draw added after all factors
        have been multiplied in.
        """
        q = base_quality
        for f in factors:
            q *= f
        return float(np.clip(q + self.rng.normal(0, 0.02), 0.0, 1.0))

    def sample_qc_metric(
        self, mean: float, std: float, clip_lo: float = 0.0, clip_hi: float = 1.0
    ) -> float:
        """Draw one ``N(mean, std)`` value clipped to [clip_lo, clip_hi]."""
        return float(np.clip(self.rng.normal(mean, std), clip_lo, clip_hi))

    def sample_count(self, lam: float) -> int:
        """Draw a Poisson count; a negative *lam* is treated as 0."""
        return int(self.rng.poisson(max(lam, 0)))

    def coin_flip(self, p: float) -> bool:
        """Return ``True`` with probability *p* (one uniform draw)."""
        return bool(self.rng.random() < p)

    def sample_cluster_count(
        self, n_true_populations: int, quality: float
    ) -> int:
        """Over- or under-clustering depending on preprocessing quality.

        The result is ``n_true_populations`` plus a uniform integer offset
        in ``[-2, 2]`` plus ``round((1 - quality) * 3)`` extra clusters
        (never negative), floored at 1.
        """
        # Cast to a builtin int: rng.integers returns a NumPy scalar, which
        # would otherwise leak through max() as the return value.
        delta = int(self.rng.integers(-2, 3))  # upper bound is exclusive
        noise_clusters = max(0, int(round((1.0 - quality) * 3)))
        return max(1, n_true_populations + delta + noise_clusters)

    def shuffle_ranking(
        self, items: List[str], noise_level: float
    ) -> List[str]:
        """Permute a ranking with Gaussian noise on ordinals.

        Item ``i`` gets the score ``i + N(0, noise_level * len(items))`` and
        the items are returned in ascending score order. An empty input
        returns ``[]`` without drawing from the generator.
        """
        n = len(items)
        if n == 0:
            return []
        scores = np.arange(n, dtype=float) + self.rng.normal(
            0, noise_level * n, size=n
        )
        order = np.argsort(scores)
        return [items[int(i)] for i in order]