Spaces:
Running
Running
| """Configuration for data collection.""" | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Optional | |
@dataclass
class CollectionConfig:
    """Configuration for data collection pipelines.

    Holds the NCBI credentials, the list of target organisms, collection
    limits, and the base directory from which every data sub-directory is
    derived. The derived directories are exposed as read-only properties so
    they always follow ``base_dir``.

    Attributes:
        base_dir: Root directory for all collected data (default ``data``).
        ncbi_email: Contact e-mail used for NCBI requests (empty if unset).
        ncbi_api_key: Optional NCBI API key.
        target_organisms: Species names to collect; each instance gets its
            own list, so mutating one config does not affect another.
        max_samples_per_organism: Upper bound on samples per organism.
        max_downloads: Upper bound on downloads.
        request_delay: Seconds to wait between requests.
    """

    # Base directories
    base_dir: Path = field(default_factory=lambda: Path("data"))

    # NCBI settings
    ncbi_email: str = ""
    ncbi_api_key: Optional[str] = None

    # Target organisms (focus on common pathogens in Bangladesh)
    # Prioritized by clinical relevance and AMR burden
    target_organisms: list[str] = field(
        default_factory=lambda: [
            # ESKAPE pathogens (high AMR, hospital-acquired)
            "Escherichia coli",
            "Klebsiella pneumoniae",
            "Staphylococcus aureus",
            "Acinetobacter baumannii",
            "Pseudomonas aeruginosa",
            "Enterococcus faecium",
            "Enterococcus faecalis",
            # Enteric pathogens (high burden in Bangladesh)
            "Salmonella enterica",
            "Vibrio cholerae",
            "Campylobacter jejuni",
            "Clostridioides difficile",
            # Respiratory pathogens
            "Streptococcus pneumoniae",
            "Mycobacterium tuberculosis",
            "Haemophilus influenzae",
            "Streptococcus pyogenes",
            # Other clinically relevant
            "Neisseria gonorrhoeae",
            "Neisseria meningitidis",
            "Listeria monocytogenes",
            "Serratia marcescens",
            "Citrobacter freundii",
        ]
    )

    # Collection limits
    max_samples_per_organism: int = 500
    max_downloads: int = 200

    # Rate limiting (seconds between requests)
    request_delay: float = 0.4

    @property
    def raw_dir(self) -> Path:
        """Return ``<base_dir>/raw``."""
        return self.base_dir / "raw"

    @property
    def external_dir(self) -> Path:
        """Return ``<base_dir>/external``."""
        return self.base_dir / "external"

    @property
    def processed_dir(self) -> Path:
        """Return ``<base_dir>/processed``."""
        return self.base_dir / "processed"

    @property
    def ncbi_dir(self) -> Path:
        """Return ``<base_dir>/raw/ncbi``."""
        return self.raw_dir / "ncbi"

    @property
    def patric_dir(self) -> Path:
        """Return ``<base_dir>/raw/patric``."""
        return self.raw_dir / "patric"

    @property
    def card_dir(self) -> Path:
        """Return ``<base_dir>/raw/card-data``."""
        return self.raw_dir / "card-data"

    @property
    def resfinder_dir(self) -> Path:
        """Return ``<base_dir>/external/resfinder``."""
        return self.external_dir / "resfinder"

    def ensure_directories(self) -> None:
        """Create all required directories.

        Missing parents are created and already-existing directories are
        left untouched, so the call is safe to repeat.
        """
        for dir_path in [
            self.raw_dir,
            self.external_dir,
            self.processed_dir,
            self.ncbi_dir,
            self.patric_dir,
            self.card_dir,
            self.resfinder_dir,
        ]:
            dir_path.mkdir(parents=True, exist_ok=True)

    @classmethod
    def from_env(cls) -> "CollectionConfig":
        """Create config from environment variables.

        Reads ``NCBI_EMAIL`` (falling back to an empty string) and
        ``NCBI_API_KEY`` (falling back to ``None``); every other field keeps
        its default value.
        """
        # Imported locally so the block does not depend on a module-level
        # ``import os`` being present.
        import os

        return cls(
            ncbi_email=os.getenv("NCBI_EMAIL", ""),
            ncbi_api_key=os.getenv("NCBI_API_KEY"),
        )