""" Preprocess E. coli gene data for ColiFormer training. This script combines the functionality of prepare_ecoli_data.py and create_model_datasets.py to prepare training and test datasets from raw CSV files. Usage: python scripts/preprocess_data.py python scripts/preprocess_data.py --cai_csv data/CAI.csv --high_cai_csv data/Database_3_4300_gene.csv """ import argparse import json import os import sys from pathlib import Path # Add parent directory to path to import CodonTransformer sys.path.insert(0, str(Path(__file__).parent.parent)) def is_valid_sequence(dna_seq: str) -> bool: """ Validate a DNA sequence for training suitability. Args: dna_seq: DNA sequence string Returns: True if sequence is valid (divisible by 3, proper start/stop codons, no internal stops) """ if len(dna_seq) % 3 != 0: return False if not dna_seq.upper().startswith(('ATG', 'TTG', 'CTG', 'GTG')): return False if not dna_seq.upper().endswith(('TAA', 'TAG', 'TGA')): return False codons = [dna_seq[i:i+3].upper() for i in range(0, len(dna_seq) - 3, 3)] if any(codon in ['TAA', 'TAG', 'TGA'] for codon in codons): return False if not all(c in 'ATGC' for c in dna_seq.upper()): return False return True def process_ecoli_data(cai_csv: str, high_cai_csv: str, output_dir: str = "data"): """ Process raw E. coli gene data from CSV files. Args: cai_csv: Path to CAI.csv file with gene data high_cai_csv: Path to Database 3_4300 gene.csv with high-CAI sequences output_dir: Output directory for processed files Returns: Path to processed CSV file """ # Lazy imports so `python scripts/preprocess_data.py --help` works without heavy deps installed. import pandas as pd from Bio.Seq import Seq # Validate input files exist if not os.path.exists(cai_csv): raise FileNotFoundError(f"CAI CSV file not found: {cai_csv}") if not os.path.exists(high_cai_csv): raise FileNotFoundError(f"High-CAI CSV file not found: {high_cai_csv}") # Create output directory if needed os.makedirs(output_dir, exist_ok=True) print("Loading data from CSV files...") df_all = pd.read_csv( cai_csv, header=0, names=['gene_id', 'cai_score', 'drop1', 'drop2', 'dna_sequence', 'drop3'] ) df_high_cai = pd.read_csv( high_cai_csv, header=0, names=['dna_sequence'] ) high_cai_sequences = set(df_high_cai['dna_sequence']) validated_genes = [] for index, row in df_all.iterrows(): gene_id = row['gene_id'] dna_sequence = str(row['dna_sequence']) if is_valid_sequence(dna_sequence): protein_sequence = str(Seq(dna_sequence).translate()) is_high_cai = dna_sequence in high_cai_sequences validated_genes.append({ 'gene_id': gene_id, 'dna_sequence': dna_sequence, 'protein_sequence': protein_sequence, 'cai_score': row.get('cai_score', None), 'is_high_cai': is_high_cai }) df_processed = pd.DataFrame(validated_genes) output_path = os.path.join(output_dir, 'ecoli_processed_genes.csv') df_processed.to_csv(output_path, index=False) print(f"Processed data saved to {output_path}") print(f"Total validated genes: {len(df_processed)}") return output_path def create_train_test_splits(processed_csv: str, output_dir: str = "data", test_size: int = 100): """ Create training and test splits from processed data. Args: processed_csv: Path to processed ecoli_processed_genes.csv output_dir: Output directory for JSON files test_size: Number of sequences for test set Returns: Tuple of (finetune_json_path, test_json_path) """ # Lazy imports so `--help` works without heavy deps installed. import pandas as pd from CodonTransformer.CodonData import prepare_training_data if not os.path.exists(processed_csv): raise FileNotFoundError(f"Processed data file not found: {processed_csv}") os.makedirs(output_dir, exist_ok=True) df_processed = pd.read_csv(processed_csv) # Create fine-tuning set (high-CAI sequences) df_finetune = df_processed[df_processed['is_high_cai'] == True].copy() df_finetune.drop_duplicates(subset=['dna_sequence'], inplace=True) df_finetune.rename(columns={'dna_sequence': 'dna', 'protein_sequence': 'protein'}, inplace=True) df_finetune['organism'] = "Escherichia coli general" finetune_output_path = os.path.join(output_dir, 'finetune_set.json') prepare_training_data(df_finetune, finetune_output_path, shuffle=True) print(f"Fine-tuning set saved to {finetune_output_path} with {len(df_finetune)} records.") # Create test set (non-high-CAI sequences) df_test_pool = df_processed[df_processed['is_high_cai'] == False].copy() df_test = df_test_pool.sample(n=test_size, random_state=42) # for reproducibility df_test['organism'] = 51 # E. coli general organism ID df_test.rename(columns={'dna_sequence': 'codons'}, inplace=True) test_records = df_test[['codons', 'organism']].to_dict(orient='records') test_output_path = os.path.join(output_dir, 'test_set.json') with open(test_output_path, 'w') as f: json.dump(test_records, f, indent=4) print(f"Test set saved to {test_output_path} with {len(df_test)} records.") return finetune_output_path, test_output_path def main(): """Main entry point for data preprocessing.""" parser = argparse.ArgumentParser( description="Preprocess E. coli gene data for ENCOT training", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Use default paths python scripts/preprocess_data.py # Specify custom input files python scripts/preprocess_data.py --cai_csv data/CAI.csv --high_cai_csv data/Database_3_4300_gene.csv # Custom output directory and test size python scripts/preprocess_data.py --output_dir my_data --test_size 200 """ ) parser.add_argument( "--cai_csv", type=str, default="data/CAI.csv", help="Path to CAI.csv file with gene data (default: data/CAI.csv)" ) parser.add_argument( "--high_cai_csv", type=str, default="data/Database 3_4300 gene.csv", help="Path to Database 3_4300 gene.csv file (default: data/Database 3_4300 gene.csv)" ) parser.add_argument( "--output_dir", type=str, default="data", help="Output directory for processed files (default: data)" ) parser.add_argument( "--test_size", type=int, default=100, help="Number of sequences for test set (default: 100)" ) parser.add_argument( "--skip_processing", action="store_true", help="Skip data processing step (assume ecoli_processed_genes.csv exists)" ) args = parser.parse_args() try: # Step 1: Process raw data if not args.skip_processing: processed_csv = process_ecoli_data( args.cai_csv, args.high_cai_csv, args.output_dir ) else: processed_csv = os.path.join(args.output_dir, 'ecoli_processed_genes.csv') if not os.path.exists(processed_csv): raise FileNotFoundError( f"Processed data not found at {processed_csv}. " "Remove --skip_processing flag to process raw data first." ) print(f"Using existing processed data: {processed_csv}") # Step 2: Create train/test splits finetune_path, test_path = create_train_test_splits( processed_csv, args.output_dir, args.test_size ) print("\n" + "="*60) print("Data preprocessing complete!") print("="*60) print(f"Training set: {finetune_path}") print(f"Test set: {test_path}") print("\nYou can now run training with:") print(f" python scripts/train.py --config configs/train_ecoli_alm.yaml") except Exception as e: print(f"Error: {e}", file=sys.stderr) sys.exit(1) if __name__ == "__main__": main()