bio-experiment

Running on CPU Upgrade

App Files Files Community

bio-experiment / server /simulator /noise.py

Ev3Dev

Upload folder using huggingface_hub

5c3cfae verified 3 days ago

raw

history blame contribute delete

4.92 kB

	"""Stochastic noise models for the biological simulator."""

	from __future__ import annotations

	from typing import Dict, List, Tuple

	import numpy as np


	class NoiseModel:
	    """Generates calibrated noise for simulated experimental outputs.

	    All randomness is funnelled through a single ``numpy.Generator``
	    so that episodes are reproducible given the same seed.

	    Every public method returns built-in Python scalars or containers
	    (never NumPy scalar types), so results can be compared and serialised
	    without further conversion.
	    """

	    def __init__(self, seed: int = 42):
	        """Create the model with a generator seeded by ``seed``."""
	        self.rng = np.random.default_rng(seed)

	    def reseed(self, seed: int) -> None:
	        """Replace the generator with a fresh one seeded by ``seed``."""
	        self.rng = np.random.default_rng(seed)

	    @staticmethod
	    def _standard_error(sample_size: int, noise_level: float) -> float:
	        """Return ``noise_level / sqrt(sample_size)``, flooring the size at 1."""
	        return noise_level / max(np.sqrt(max(sample_size, 1)), 1e-6)

	    # ── expression-level noise ──────────────────────────────────────────

	    def add_expression_noise(
	        self,
	        true_values: Dict[str, float],
	        noise_level: float,
	        dropout_rate: float,
	    ) -> Dict[str, float]:
	        """Return a noisy copy of ``true_values`` with random dropouts.

	        Args:
	            true_values: Mapping of gene name to true expression value.
	            noise_level: Multiplier on ``abs(value)`` for the Gaussian
	                standard deviation; a constant 0.1 is always added to it.
	            dropout_rate: Numerator of the per-gene dropout probability.

	        Returns:
	            Mapping with the same keys; a dropped gene maps to ``0.0``,
	            any other gene to its value plus zero-mean Gaussian noise.
	        """
	        noisy: Dict[str, float] = {}
	        for gene, value in true_values.items():
	            magnitude = abs(value)
	            # Dropout probability is inversely proportional to expression
	            # magnitude: lowly expressed genes drop out much more readily,
	            # matching the zero-inflation pattern in real scRNA-seq data.
	            p_drop = dropout_rate / (1.0 + magnitude)
	            # One uniform draw per gene, plus one normal draw only for
	            # genes that survive; this order fixes the random stream.
	            if self.rng.random() < p_drop:
	                noisy[gene] = 0.0
	            else:
	                sigma = noise_level * magnitude + 0.1
	                noisy[gene] = float(value + self.rng.normal(0, sigma))
	        return noisy

	    # ── effect-size sampling ────────────────────────────────────────────

	    def sample_effect_sizes(
	        self,
	        true_effects: Dict[str, float],
	        sample_size: int,
	        noise_level: float,
	    ) -> Dict[str, float]:
	        """Return each true effect perturbed by Gaussian sampling error.

	        The noise standard deviation is ``noise_level / sqrt(sample_size)``
	        (``sample_size`` floored at 1) and is shared by all genes.
	        """
	        se = self._standard_error(sample_size, noise_level)
	        return {
	            gene: float(effect + self.rng.normal(0, se))
	            for gene, effect in true_effects.items()
	        }

	    def sample_p_values(
	        self,
	        true_effects: Dict[str, float],
	        sample_size: int,
	        noise_level: float,
	    ) -> Dict[str, float]:
	        """Simulate approximate p-values from z-statistics.

	        For each gene, ``z = abs(effect) / se`` and the p-value is the
	        two-sided normal tail ``2 * sf(z)``.  This method does not draw
	        from the generator: its output is a deterministic function of
	        its arguments.
	        """
	        from scipy import stats  # type: ignore[import-untyped]

	        # Floor the standard error so a zero noise level cannot divide by 0.
	        se = max(self._standard_error(sample_size, noise_level), 1e-8)
	        return {
	            gene: float(2 * stats.norm.sf(abs(effect) / se))
	            for gene, effect in true_effects.items()
	        }

	    # ── false discovery helpers ─────────────────────────────────────────

	    def generate_false_positives(
	        self, n_background_genes: int, fdr: float
	    ) -> List[str]:
	        """Return placeholder names for spurious hits.

	        The number of names is one binomial draw with ``n_background_genes``
	        trials and success probability ``fdr``; names are ``FP_GENE_0``,
	        ``FP_GENE_1``, ... in order.
	        """
	        n_fp = int(self.rng.binomial(n_background_genes, fdr))
	        return [f"FP_GENE_{i}" for i in range(n_fp)]

	    def generate_false_negatives(
	        self, true_genes: List[str], fnr: float
	    ) -> List[str]:
	        """Return the subset of true_genes that are missed.

	        Each gene is missed independently with probability ``fnr``; the
	        input order is preserved.
	        """
	        return [g for g in true_genes if self.rng.random() < fnr]

	    # ── quality helpers ─────────────────────────────────────────────────

	    def quality_degradation(
	        self, base_quality: float, factors: List[float]
	    ) -> float:
	        """Scale ``base_quality`` by every factor and add small jitter.

	        The product is perturbed by Gaussian noise (standard deviation
	        0.02) and the result is clipped to ``[0.0, 1.0]``.
	        """
	        q = base_quality
	        for f in factors:
	            q *= f
	        return float(np.clip(q + self.rng.normal(0, 0.02), 0.0, 1.0))

	    def sample_qc_metric(
	        self, mean: float, std: float, clip_lo: float = 0.0, clip_hi: float = 1.0
	    ) -> float:
	        """Draw one normal sample and clip it to ``[clip_lo, clip_hi]``."""
	        return float(np.clip(self.rng.normal(mean, std), clip_lo, clip_hi))

	    def sample_count(self, lam: float) -> int:
	        """Draw a Poisson count; a negative ``lam`` is treated as 0."""
	        return int(self.rng.poisson(max(lam, 0)))

	    def coin_flip(self, p: float) -> bool:
	        """Return ``True`` with probability ``p`` (one uniform draw)."""
	        return bool(self.rng.random() < p)

	    def sample_cluster_count(
	        self, n_true_populations: int, quality: float
	    ) -> int:
	        """Over- or under-clustering depending on preprocessing quality.

	        The result is ``n_true_populations`` plus a uniform integer offset
	        in ``[-2, 2]`` plus ``round((1 - quality) * 3)`` extra clusters
	        (never negative), floored at 1.
	        """
	        delta = int(self.rng.integers(-2, 3))  # upper bound is exclusive
	        noise_clusters = max(0, int(round((1.0 - quality) * 3)))
	        # ``rng.integers`` yields a NumPy integer; coerce the final value so
	        # callers always get a built-in ``int`` as the annotation promises.
	        return int(max(1, n_true_populations + delta + noise_clusters))

	    def shuffle_ranking(
	        self, items: List[str], noise_level: float
	    ) -> List[str]:
	        """Permute a ranking with Gaussian noise on ordinals.

	        Each item's position index is perturbed by zero-mean noise with
	        standard deviation ``noise_level * len(items)`` and the items are
	        re-sorted by the perturbed score.  An empty input returns ``[]``
	        without consuming any randomness.
	        """
	        n = len(items)
	        if n == 0:
	            return []
	        scores = np.arange(n, dtype=float) + self.rng.normal(
	            0, noise_level * n, size=n
	        )
	        order = np.argsort(scores)
	        return [items[int(i)] for i in order]