| """ |
| Biological sequence task generation and pool management. |
| Generates synthetic tasks for evaluating biological language models. |
| """ |
|
|
import json
import logging
import os
import random
import zlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Optional, Any

import numpy as np
|
|
logger = logging.getLogger(__name__)


# One-letter codes used as the protein alphabet (20 symbols).
AMINO_ACIDS = [*"ACDEFGHIKLMNPQRSTVWY"]

# DNA alphabet, and the RNA alphabet derived from it by swapping T for U.
NUCLEOTIDES = [*"ACGT"]
RNA_NUCLEOTIDES = ["U" if base == "T" else base for base in NUCLEOTIDES]
|
|
|
|
def generate_random_protein(length: int) -> str:
    """Return ``length`` symbols sampled from ``AMINO_ACIDS`` as one string.

    Symbols are drawn with replacement through the module-level ``random``
    generator, so results follow whatever state that generator is in.
    """
    residues = random.choices(AMINO_ACIDS, k=length)
    return "".join(residues)
|
|
|
|
def generate_random_dna(length: int) -> str:
    """Return ``length`` symbols sampled from ``NUCLEOTIDES`` as one string.

    Symbols are drawn with replacement through the module-level ``random``
    generator, so results follow whatever state that generator is in.
    """
    bases = random.choices(NUCLEOTIDES, k=length)
    return "".join(bases)
|
|
|
|
def generate_random_rna(length: int) -> str:
    """Return ``length`` symbols sampled from ``RNA_NUCLEOTIDES`` as one string.

    Symbols are drawn with replacement through the module-level ``random``
    generator, so results follow whatever state that generator is in.
    """
    bases = random.choices(RNA_NUCLEOTIDES, k=length)
    return "".join(bases)
|
|
|
|
def add_motif(sequence: str, motif: str, position: Optional[int] = None) -> str:
    """Write ``motif`` over ``sequence`` starting at ``position``.

    The motif replaces the characters it covers instead of shifting them, so
    the result has the same length as ``sequence`` whenever the motif fits
    entirely inside it.

    Args:
        sequence: Sequence to modify.
        motif: Substring to embed.
        position: 0-based start index. When ``None``, a start is drawn with
            ``random.randint`` such that the motif fits inside ``sequence``.
            An explicit value is used as given, without range checking.

    Returns:
        A new string with the motif embedded.

    Raises:
        ValueError: If ``position`` is ``None`` and ``motif`` is longer than
            ``sequence``, so there is no start at which it fits.
    """
    if position is None:
        last_start = len(sequence) - len(motif)
        if last_start < 0:
            # Without this check random.randint fails with an unrelated
            # "empty range" message.
            raise ValueError(
                f"motif of length {len(motif)} does not fit in a sequence "
                f"of length {len(sequence)}"
            )
        position = random.randint(0, last_start)
    return sequence[:position] + motif + sequence[position + len(motif):]
|
|
|
|
@dataclass
class BioTask:
    """A biological sequence evaluation task.

    Instances are plain records; ``to_dict`` / ``from_dict`` convert to and
    from a JSON-friendly dictionary that carries every field.
    """

    # Identification
    task_id: str
    task_type: str
    task_family: str

    # Content shown to the model and, where available, the reference output
    prompt: str
    context: Optional[str] = None
    target: Optional[str] = None

    # How the answer is scored
    evaluation_metric: str = "sequence_identity"
    expected_answer: Optional[str] = None

    # Metadata
    difficulty: float = 0.5
    generation_seed: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a dictionary keyed by field name."""
        return {
            "task_id": self.task_id,
            "task_type": self.task_type,
            "task_family": self.task_family,
            "prompt": self.prompt,
            "context": self.context,
            "target": self.target,
            "evaluation_metric": self.evaluation_metric,
            "expected_answer": self.expected_answer,
            "difficulty": self.difficulty,
            # Included so that from_dict(to_dict(task)) == task; dictionaries
            # written without this key still load and fall back to None.
            "generation_seed": self.generation_seed,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "BioTask":
        """Build a task from a dictionary of field names to values.

        Raises:
            TypeError: If ``data`` has a key that is not a field, or lacks a
                required field.
        """
        return cls(**data)
|
|
|
|
class BioTaskPool:
    """Manages a pool of biological evaluation tasks.

    The pool is filled with a fixed set of base tasks on construction and can
    be grown with ``generate_new_tasks``. All randomness is drawn from
    ``self.rng``, so the ``seed`` passed to the constructor determines both
    the base tasks and every generated task. Task ids use a CRC32-based suffix
    rather than the built-in ``hash``, whose value for strings is salted per
    process.
    """

    # NOTE(review): the "X" entries here (and "R" in the Kozak entry below)
    # look like wildcard placeholders rather than literal symbols; they are
    # stored and embedded verbatim. Confirm that is intended.
    PROTEIN_MOTIFS = {
        "nuclear_localization": "PKKKRKV",
        "atp_binding": "GXGXXG",
        "zinc_finger": "CCHH",
        "helix_turn_helix": "LXXLL",
        "transmembrane": "AVLIVF",
    }

    DNA_MOTIFS = {
        "tata_box": "TATAAA",
        "gc_rich": "GCGCGC",
        "promoter": "CAAT",
        "terminator": "AATAAA",
    }

    # Not referenced by any method of this class.
    RNA_MOTIFS = {
        "shine_dalgarno": "AGGAGGU",
        "polya_signal": "AAUAAA",
        "kozak": "GCCRCC",
    }

    def __init__(self, seed: int = 42):
        """Create the pool and populate it with the base tasks.

        Args:
            seed: Seed for the pool's ``numpy.random.RandomState``.
        """
        self.rng = np.random.RandomState(seed)
        self.tasks: List[BioTask] = []
        self._initialize_base_tasks()

    @staticmethod
    def _stable_suffix(text: str) -> int:
        """Return a 0-9999 id suffix for ``text`` that is identical in every process."""
        return zlib.crc32(text.encode("utf-8")) % 10000

    def _randint(self, low: int, high: int) -> int:
        """Draw an int from the inclusive range ``[low, high]`` using the pool RNG."""
        # RandomState.randint excludes its upper bound, hence the +1.
        return int(self.rng.randint(low, high + 1))

    def _random_sequence(self, alphabet: List[str], length: int) -> str:
        """Return ``length`` symbols drawn from ``alphabet`` using the pool RNG."""
        return "".join(self.rng.choice(alphabet, size=length))

    def _initialize_base_tasks(self) -> None:
        """Create the initial set of basic tasks and append them to ``self.tasks``."""
        # Motif-recognition tasks, one per protein and DNA motif.
        # NOTE(review): these prompts end with ": " and no sequence is
        # attached (context is left unset); confirm the caller appends one.
        motif_sources = [
            ("protein", "protein", self.PROTEIN_MOTIFS),
            ("dna", "DNA", self.DNA_MOTIFS),
        ]
        for task_type, label, motifs in motif_sources:
            for i, (name, motif) in enumerate(motifs.items()):
                self.tasks.append(
                    BioTask(
                        task_id=f"{task_type}_motif_{name}_{i}",
                        task_type=task_type,
                        task_family="motif_recognition",
                        prompt=f"Identify if this {label} sequence contains a {name} motif: ",
                        evaluation_metric="contains_substring",
                        expected_answer=motif,
                        difficulty=0.3 + i * 0.1,
                    )
                )

        # Sequence-completion tasks: the first half of a random sequence is
        # shown and the full sequence is the expected answer.
        alphabets = [
            ("protein", AMINO_ACIDS),
            ("dna", NUCLEOTIDES),
            ("rna", RNA_NUCLEOTIDES),
        ]
        for seq_type, alphabet in alphabets:
            for length in [10, 20, 50]:
                target = self._random_sequence(alphabet, length)
                prefix = target[:length // 2]
                self.tasks.append(
                    BioTask(
                        task_id=f"{seq_type}_complete_len{length}_{self._stable_suffix(target)}",
                        task_type=seq_type,
                        task_family="sequence_completion",
                        prompt=f"Complete this {seq_type} sequence: {prefix}",
                        context=prefix,
                        target=target,
                        expected_answer=target,
                        evaluation_metric="sequence_similarity",
                        difficulty=min(length / 100.0, 0.9),
                    )
                )

        logger.info("Initialized task pool with %d base tasks", len(self.tasks))

    def get_tasks(self, n: int = 10, filter_type: Optional[str] = None) -> List["BioTask"]:
        """Sample up to ``n`` tasks from the pool without replacement.

        Args:
            n: Maximum number of tasks to return.
            filter_type: When truthy, only tasks whose ``task_type`` equals
                this value are considered.

        Returns:
            A new list. If no more than ``n`` tasks are eligible, all of them
            are returned in pool order; otherwise ``n`` are drawn with the
            pool RNG.
        """
        available = self.tasks
        if filter_type:
            available = [t for t in available if t.task_type == filter_type]

        if len(available) <= n:
            # Copy so callers cannot mutate the pool's own list by accident.
            return list(available)

        # Sample indices rather than the objects themselves so NumPy never
        # has to coerce task objects into an array.
        chosen = self.rng.choice(len(available), size=n, replace=False)
        return [available[int(idx)] for idx in chosen]

    def _make_motif_task(
        self,
        index: int,
        seq_type: str,
        alphabet: List[str],
        motifs: Dict[str, str],
        min_length: int,
        max_length: int,
        base_difficulty: float,
    ) -> "BioTask":
        """Build a task asking for the start index of a motif embedded in a random sequence."""
        length = self._randint(min_length, max_length)
        sequence = self._random_sequence(alphabet, length)

        names = list(motifs)
        motif_name = names[self._randint(0, len(names) - 1)]
        motif = motifs[motif_name]

        # Overwrite (not insert) so the sequence keeps its drawn length.
        # NOTE(review): the random background may contain the same motif at
        # another index, in which case `pos` is not the only valid answer.
        pos = self._randint(0, length - len(motif))
        sequence = sequence[:pos] + motif + sequence[pos + len(motif):]

        if seq_type == "protein":
            family = "motif_localization"
            prompt = f"Find the position of the {motif_name} motif in this protein: {sequence}"
        else:
            family = "regulatory_element_detection"
            prompt = f"Find the {motif_name} regulatory element in: {sequence}"

        return BioTask(
            task_id=f"{seq_type}_gen_{index}_{self._stable_suffix(sequence)}",
            task_type=seq_type,
            task_family=family,
            prompt=prompt,
            context=sequence,
            target=str(pos),
            expected_answer=str(pos),
            evaluation_metric="exact_match",
            difficulty=base_difficulty + float(self.rng.random_sample()) * 0.3,
        )

    def _make_rna_task(self, index: int) -> "BioTask":
        """Build a structure-prediction task over a random RNA sequence (no reference answer)."""
        length = self._randint(50, 300)
        sequence = self._random_sequence(RNA_NUCLEOTIDES, length)
        return BioTask(
            task_id=f"rna_gen_{index}_{self._stable_suffix(sequence)}",
            task_type="rna",
            task_family="structure_prediction",
            prompt=f"Predict the secondary structure of this RNA: {sequence}",
            context=sequence,
            evaluation_metric="rna_structure_similarity",
            difficulty=0.7 + float(self.rng.random_sample()) * 0.2,
        )

    def generate_new_tasks(
        self,
        archive: Any,
        num_tasks: int = 5,
        difficulty_weights: Optional[Dict[str, float]] = None,
    ) -> List["BioTask"]:
        """Generate ``num_tasks`` random tasks and add them to the pool.

        Each task is a protein motif-localization task, a DNA
        regulatory-element task, or an RNA structure-prediction task, chosen
        at random with the pool RNG.

        Args:
            archive: Accepted for interface compatibility; not consulted by
                the current implementation.
            num_tasks: Number of tasks to generate.
            difficulty_weights: Accepted for interface compatibility; not
                consulted by the current implementation.

        Returns:
            The newly generated tasks (also appended to ``self.tasks``).
        """
        new_tasks: List[BioTask] = []
        seq_types = ["protein", "dna", "rna"]

        for i in range(num_tasks):
            # New tasks join the pool only after the loop, so this index is
            # unique within the call.
            index = len(self.tasks) + i
            seq_type = seq_types[self._randint(0, len(seq_types) - 1)]

            if seq_type == "protein":
                task = self._make_motif_task(
                    index, "protein", AMINO_ACIDS, self.PROTEIN_MOTIFS, 50, 200, 0.6
                )
            elif seq_type == "dna":
                task = self._make_motif_task(
                    index, "dna", NUCLEOTIDES, self.DNA_MOTIFS, 100, 500, 0.5
                )
            else:
                task = self._make_rna_task(index)

            new_tasks.append(task)

        self.tasks.extend(new_tasks)
        logger.info(
            "Generated %d new tasks. Pool size: %d", len(new_tasks), len(self.tasks)
        )

        return new_tasks

    def save(self, path: str) -> None:
        """Write every task in the pool to ``path`` as a JSON array."""
        data = [t.to_dict() for t in self.tasks]
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        logger.info("Saved task pool to %s", path)

    def load(self, path: str) -> None:
        """Replace the pool's tasks with those stored in the JSON file at ``path``."""
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.tasks = [BioTask.from_dict(d) for d in data]
        logger.info("Loaded task pool with %d tasks from %s", len(self.tasks), path)
|
|
|
|
class ProteinTask(BioTask):
    """A ``BioTask`` subclass for protein tasks.

    Declares no fields or methods of its own; everything is inherited from
    ``BioTask``.
    """
|
|
|
|
class DNATask(BioTask):
    """A ``BioTask`` subclass for DNA tasks.

    Declares no fields or methods of its own; everything is inherited from
    ``BioTask``.
    """
|
|
|
|
class RNATask(BioTask):
    """A ``BioTask`` subclass for RNA tasks.

    Declares no fields or methods of its own; everything is inherited from
    ``BioTask``.
    """
|
|