Spaces:
Sleeping
Sleeping
File size: 8,674 Bytes
"""
Preprocess E. coli gene data for ColiFormer training.
This script combines the functionality of prepare_ecoli_data.py and
create_model_datasets.py to prepare training and test datasets from raw CSV files.
Usage:
python scripts/preprocess_data.py
python scripts/preprocess_data.py --cai_csv data/CAI.csv --high_cai_csv data/Database_3_4300_gene.csv
"""
import argparse
import json
import os
import sys
from pathlib import Path
# Add parent directory to path to import CodonTransformer
sys.path.insert(0, str(Path(__file__).parent.parent))
def is_valid_sequence(dna_seq: str) -> bool:
    """
    Decide whether a DNA string looks like a complete, clean coding sequence.

    The comparison is case-insensitive. A sequence is accepted only when all
    of the following hold:
      * its length is a multiple of three,
      * it begins with one of ATG, TTG, CTG or GTG,
      * it ends with one of TAA, TAG or TGA,
      * no in-frame codon before the last one is a stop codon,
      * it contains no characters other than A, T, G and C.

    Args:
        dna_seq: DNA sequence string

    Returns:
        True when every condition above is met, otherwise False.
    """
    start_codons = ('ATG', 'TTG', 'CTG', 'GTG')
    stop_codons = ('TAA', 'TAG', 'TGA')
    normalized = dna_seq.upper()

    in_frame = len(dna_seq) % 3 == 0
    if not (
        in_frame
        and normalized.startswith(start_codons)
        and normalized.endswith(stop_codons)
    ):
        return False

    # Walk every in-frame codon except the terminal one, which is allowed
    # (and required) to be a stop.
    for offset in range(0, len(dna_seq) - 3, 3):
        if normalized[offset:offset + 3] in stop_codons:
            return False

    # Finally reject anything outside the four-letter DNA alphabet.
    return set(normalized) <= set('ATGC')
def process_ecoli_data(cai_csv: str, high_cai_csv: str, output_dir: str = "data"):
    """
    Process raw E. coli gene data from CSV files.

    Reads the gene table, keeps only rows whose DNA passes
    is_valid_sequence(), translates each kept sequence to protein, flags the
    ones that also appear in the high-CAI file, and writes the result to
    ``<output_dir>/ecoli_processed_genes.csv``.

    Args:
        cai_csv: Path to CAI.csv file with gene data
        high_cai_csv: Path to Database 3_4300 gene.csv with high-CAI sequences
        output_dir: Output directory for processed files

    Returns:
        Path to processed CSV file

    Raises:
        FileNotFoundError: If either input CSV does not exist.
    """
    # Lazy imports so `python scripts/preprocess_data.py --help` works without heavy deps installed.
    import pandas as pd
    from Bio.Seq import Seq

    # Validate input files exist
    if not os.path.exists(cai_csv):
        raise FileNotFoundError(f"CAI CSV file not found: {cai_csv}")
    if not os.path.exists(high_cai_csv):
        raise FileNotFoundError(f"High-CAI CSV file not found: {high_cai_csv}")

    # Create output directory if needed
    os.makedirs(output_dir, exist_ok=True)

    print("Loading data from CSV files...")
    # header=0 together with names= discards the file's own header row and
    # applies these names instead (the gene table is assumed to have six
    # columns — confirm against the actual CAI.csv layout).
    df_all = pd.read_csv(
        cai_csv,
        header=0,
        names=['gene_id', 'cai_score', 'drop1', 'drop2', 'dna_sequence', 'drop3']
    )
    df_high_cai = pd.read_csv(
        high_cai_csv,
        header=0,
        names=['dna_sequence']
    )
    # Set membership keeps the per-gene high-CAI lookup O(1).
    high_cai_sequences = set(df_high_cai['dna_sequence'])

    # Fixed output schema. Passing it to the DataFrame constructor guarantees
    # the CSV carries a header even when no gene survives validation, so the
    # downstream split step can still read the file and find 'is_high_cai'.
    output_columns = [
        'gene_id',
        'dna_sequence',
        'protein_sequence',
        'cai_score',
        'is_high_cai',
    ]

    validated_genes = []
    for _, row in df_all.iterrows():
        # str() turns missing values into the text 'nan', which then fails
        # validation instead of raising.
        dna_sequence = str(row['dna_sequence'])
        if not is_valid_sequence(dna_sequence):
            continue
        validated_genes.append({
            'gene_id': row['gene_id'],
            'dna_sequence': dna_sequence,
            'protein_sequence': str(Seq(dna_sequence).translate()),
            'cai_score': row.get('cai_score', None),
            'is_high_cai': dna_sequence in high_cai_sequences
        })

    df_processed = pd.DataFrame(validated_genes, columns=output_columns)
    output_path = os.path.join(output_dir, 'ecoli_processed_genes.csv')
    df_processed.to_csv(output_path, index=False)
    print(f"Processed data saved to {output_path}")
    print(f"Total validated genes: {len(df_processed)}")
    return output_path
def create_train_test_splits(processed_csv: str, output_dir: str = "data", test_size: int = 100):
    """
    Create training and test splits from processed data.

    High-CAI rows (de-duplicated by DNA sequence) become the fine-tuning set,
    written through CodonTransformer's prepare_training_data(). A fixed-seed
    random sample of the remaining rows becomes the test set, written as a
    JSON list of ``{"codons", "organism"}`` records.

    Args:
        processed_csv: Path to processed ecoli_processed_genes.csv
        output_dir: Output directory for JSON files
        test_size: Number of sequences for test set

    Returns:
        Tuple of (finetune_json_path, test_json_path)

    Raises:
        FileNotFoundError: If processed_csv does not exist.
        ValueError: If test_size is negative or larger than the number of
            non-high-CAI sequences available.
    """
    # Lazy imports so `--help` works without heavy deps installed.
    import pandas as pd
    from CodonTransformer.CodonData import prepare_training_data

    if not os.path.exists(processed_csv):
        raise FileNotFoundError(f"Processed data file not found: {processed_csv}")
    os.makedirs(output_dir, exist_ok=True)

    df_processed = pd.read_csv(processed_csv)

    # Explicit comparisons (rather than using the column as a mask) always
    # yield a boolean Series, whatever dtype the CSV column was parsed as.
    df_finetune = df_processed[df_processed['is_high_cai'] == True].copy()
    df_test_pool = df_processed[df_processed['is_high_cai'] == False].copy()

    # Check the requested sample size before writing anything, so a bad
    # --test_size fails with a clear message and leaves no partial output.
    available = len(df_test_pool)
    if not 0 <= test_size <= available:
        raise ValueError(
            f"test_size must be between 0 and {available} "
            f"(the number of non-high-CAI sequences), got {test_size}."
        )

    # Create fine-tuning set (high-CAI sequences)
    df_finetune.drop_duplicates(subset=['dna_sequence'], inplace=True)
    df_finetune.rename(columns={'dna_sequence': 'dna', 'protein_sequence': 'protein'}, inplace=True)
    df_finetune['organism'] = "Escherichia coli general"
    finetune_output_path = os.path.join(output_dir, 'finetune_set.json')
    prepare_training_data(df_finetune, finetune_output_path, shuffle=True)
    print(f"Fine-tuning set saved to {finetune_output_path} with {len(df_finetune)} records.")

    # Create test set (non-high-CAI sequences)
    df_test = df_test_pool.sample(n=test_size, random_state=42)  # for reproducibility
    df_test['organism'] = 51  # E. coli general organism ID
    df_test.rename(columns={'dna_sequence': 'codons'}, inplace=True)
    test_records = df_test[['codons', 'organism']].to_dict(orient='records')
    test_output_path = os.path.join(output_dir, 'test_set.json')
    with open(test_output_path, 'w', encoding='utf-8') as f:
        json.dump(test_records, f, indent=4)
    print(f"Test set saved to {test_output_path} with {len(df_test)} records.")

    return finetune_output_path, test_output_path
def main():
    """Parse command-line options and run the two preprocessing steps.

    Step 1 builds ecoli_processed_genes.csv from the raw CSV inputs (or
    reuses an existing one when --skip_processing is given). Step 2 writes
    the fine-tuning and test JSON files. Any exception is reported on stderr
    and the process exits with status 1.
    """
    usage_examples = """
Examples:
# Use default paths
python scripts/preprocess_data.py
# Specify custom input files
python scripts/preprocess_data.py --cai_csv data/CAI.csv --high_cai_csv data/Database_3_4300_gene.csv
# Custom output directory and test size
python scripts/preprocess_data.py --output_dir my_data --test_size 200
"""
    cli = argparse.ArgumentParser(
        description="Preprocess E. coli gene data for ENCOT training",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Declared as a table so every option is visible at a glance; registration
    # order is preserved because it determines the order shown by --help.
    option_table = [
        ("--cai_csv", {
            "type": str,
            "default": "data/CAI.csv",
            "help": "Path to CAI.csv file with gene data (default: data/CAI.csv)",
        }),
        ("--high_cai_csv", {
            "type": str,
            "default": "data/Database 3_4300 gene.csv",
            "help": "Path to Database 3_4300 gene.csv file (default: data/Database 3_4300 gene.csv)",
        }),
        ("--output_dir", {
            "type": str,
            "default": "data",
            "help": "Output directory for processed files (default: data)",
        }),
        ("--test_size", {
            "type": int,
            "default": 100,
            "help": "Number of sequences for test set (default: 100)",
        }),
        ("--skip_processing", {
            "action": "store_true",
            "help": "Skip data processing step (assume ecoli_processed_genes.csv exists)",
        }),
    ]
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    opts = cli.parse_args()

    try:
        # Step 1: obtain the processed gene table
        if opts.skip_processing:
            processed_csv = os.path.join(opts.output_dir, 'ecoli_processed_genes.csv')
            if not os.path.exists(processed_csv):
                raise FileNotFoundError(
                    f"Processed data not found at {processed_csv}. "
                    "Remove --skip_processing flag to process raw data first."
                )
            print(f"Using existing processed data: {processed_csv}")
        else:
            processed_csv = process_ecoli_data(
                opts.cai_csv, opts.high_cai_csv, opts.output_dir
            )

        # Step 2: write the fine-tuning and test JSON files
        finetune_path, test_path = create_train_test_splits(
            processed_csv, opts.output_dir, opts.test_size
        )

        banner = "=" * 60
        print("\n" + banner)
        print("Data preprocessing complete!")
        print(banner)
        print(f"Training set: {finetune_path}")
        print(f"Test set: {test_path}")
        print("\nYou can now run training with:")
        print(" python scripts/train.py --config configs/train_ecoli_alm.yaml")
    except Exception as err:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"Error: {err}", file=sys.stderr)
        sys.exit(1)
# Run the preprocessing pipeline only when this file is executed as a script,
# so importing the module does not trigger argument parsing or file I/O.
if __name__ == "__main__":
    main()
|