"""Configuration for data collection."""
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
@dataclass
class CollectionConfig:
    """Configuration for data collection pipelines.

    Bundles the base data directory, NCBI credentials, the list of target
    organisms, and collection/rate-limit settings. All concrete directory
    locations are derived from ``base_dir`` through read-only properties, so
    changing ``base_dir`` relocates the whole tree.
    """

    # Base directories
    # A plain string (or any os.PathLike) is also accepted here; it is
    # converted to a Path in __post_init__ so the properties below can use "/".
    base_dir: Path = field(default_factory=lambda: Path("data"))

    # NCBI settings
    ncbi_email: str = ""
    ncbi_api_key: Optional[str] = None

    # Target organisms (focus on common pathogens in Bangladesh)
    # Prioritized by clinical relevance and AMR burden
    target_organisms: list[str] = field(
        default_factory=lambda: [
            # ESKAPE pathogens (high AMR, hospital-acquired)
            "Escherichia coli",
            "Klebsiella pneumoniae",
            "Staphylococcus aureus",
            "Acinetobacter baumannii",
            "Pseudomonas aeruginosa",
            "Enterococcus faecium",
            "Enterococcus faecalis",
            # Enteric pathogens (high burden in Bangladesh)
            "Salmonella enterica",
            "Vibrio cholerae",
            "Campylobacter jejuni",
            "Clostridioides difficile",
            # Respiratory pathogens
            "Streptococcus pneumoniae",
            "Mycobacterium tuberculosis",
            "Haemophilus influenzae",
            "Streptococcus pyogenes",
            # Other clinically relevant
            "Neisseria gonorrhoeae",
            "Neisseria meningitidis",
            "Listeria monocytogenes",
            "Serratia marcescens",
            "Citrobacter freundii",
        ]
    )

    # Collection limits
    max_samples_per_organism: int = 500
    max_downloads: int = 200

    # Rate limiting (seconds between requests)
    request_delay: float = 0.4

    def __post_init__(self) -> None:
        """Normalise ``base_dir`` to a ``Path``.

        Callers frequently pass a string (CLI argument, environment value);
        without this coercion every directory property would raise
        ``TypeError`` on ``str / str``.
        """
        if not isinstance(self.base_dir, Path):
            self.base_dir = Path(self.base_dir)

    @property
    def raw_dir(self) -> Path:
        """Return ``<base_dir>/raw``."""
        return self.base_dir / "raw"

    @property
    def external_dir(self) -> Path:
        """Return ``<base_dir>/external``."""
        return self.base_dir / "external"

    @property
    def processed_dir(self) -> Path:
        """Return ``<base_dir>/processed``."""
        return self.base_dir / "processed"

    @property
    def ncbi_dir(self) -> Path:
        """Return ``<base_dir>/raw/ncbi``."""
        return self.raw_dir / "ncbi"

    @property
    def patric_dir(self) -> Path:
        """Return ``<base_dir>/raw/patric``."""
        return self.raw_dir / "patric"

    @property
    def card_dir(self) -> Path:
        """Return ``<base_dir>/raw/card-data``."""
        return self.raw_dir / "card-data"

    @property
    def resfinder_dir(self) -> Path:
        """Return ``<base_dir>/external/resfinder``."""
        return self.external_dir / "resfinder"

    def ensure_directories(self) -> None:
        """Create all required directories.

        Missing parents are created as needed and directories that already
        exist are left untouched, so the call is safe to repeat.
        """
        for dir_path in (
            self.raw_dir,
            self.external_dir,
            self.processed_dir,
            self.ncbi_dir,
            self.patric_dir,
            self.card_dir,
            self.resfinder_dir,
        ):
            dir_path.mkdir(parents=True, exist_ok=True)

    @classmethod
    def from_env(cls) -> "CollectionConfig":
        """Create config from environment variables.

        Reads ``NCBI_EMAIL`` (defaults to ``""``) and ``NCBI_API_KEY``.
        An unset, empty, or whitespace-only ``NCBI_API_KEY`` yields ``None``
        so that a declared-but-blank variable is not mistaken for a real key.
        All other fields keep their defaults.
        """
        import os

        api_key = (os.getenv("NCBI_API_KEY") or "").strip()
        return cls(
            ncbi_email=os.getenv("NCBI_EMAIL", ""),
            ncbi_api_key=api_key or None,
        )
|