# ColiFormer-ui/scripts/preprocess_data.py
# NOTE(review): the following provenance lines ("Genooo12's picture",
# "Deploy Streamlit UI", commit 404d784 verified) are Hugging Face page
# chrome captured during download; kept as a comment so the module is
# valid Python.
"""
Preprocess E. coli gene data for ColiFormer training.
This script combines the functionality of prepare_ecoli_data.py and
create_model_datasets.py to prepare training and test datasets from raw CSV files.
Usage:
python scripts/preprocess_data.py
python scripts/preprocess_data.py --cai_csv data/CAI.csv --high_cai_csv data/Database_3_4300_gene.csv
"""
import argparse
import json
import os
import sys
from pathlib import Path
# Add parent directory to path to import CodonTransformer
sys.path.insert(0, str(Path(__file__).parent.parent))
def is_valid_sequence(dna_seq: str) -> bool:
    """
    Validate a DNA sequence for training suitability.

    A sequence is considered valid when it:
      * has a length divisible by 3,
      * starts with a recognized start codon (ATG/TTG/CTG/GTG),
      * ends with a stop codon (TAA/TAG/TGA),
      * contains no internal (premature) stop codons,
      * contains only A/T/G/C characters.

    Args:
        dna_seq: DNA sequence string (case-insensitive).

    Returns:
        True if the sequence passes every check, False otherwise.
    """
    # Normalize case once; the original re-uppercased the full string in
    # each individual check.
    seq = dna_seq.upper()
    if len(seq) % 3 != 0:
        return False
    if not seq.startswith(('ATG', 'TTG', 'CTG', 'GTG')):
        return False
    if not seq.endswith(('TAA', 'TAG', 'TGA')):
        return False
    # Scan internal codons only; the final stop codon (at len - 3) is
    # deliberately excluded from the premature-stop check.
    stop_codons = ('TAA', 'TAG', 'TGA')
    if any(seq[i:i + 3] in stop_codons for i in range(0, len(seq) - 3, 3)):
        return False
    return all(c in 'ATGC' for c in seq)
def process_ecoli_data(cai_csv: str, high_cai_csv: str, output_dir: str = "data"):
    """
    Process raw E. coli gene data from CSV files into a validated gene table.

    Loads the full gene table and the high-CAI sequence list, keeps only
    sequences that pass is_valid_sequence, translates each to protein, and
    flags sequences that also appear in the high-CAI set.

    Args:
        cai_csv: Path to CAI.csv file with gene data
        high_cai_csv: Path to Database 3_4300 gene.csv with high-CAI sequences
        output_dir: Output directory for processed files

    Returns:
        Path to processed CSV file

    Raises:
        FileNotFoundError: If either input CSV does not exist.
    """
    # Lazy imports so `python scripts/preprocess_data.py --help` works without heavy deps installed.
    import pandas as pd
    from Bio.Seq import Seq

    # Validate both input files up front.
    for path, label in ((cai_csv, "CAI"), (high_cai_csv, "High-CAI")):
        if not os.path.exists(path):
            raise FileNotFoundError(f"{label} CSV file not found: {path}")

    os.makedirs(output_dir, exist_ok=True)

    print("Loading data from CSV files...")
    df_all = pd.read_csv(
        cai_csv,
        header=0,
        names=['gene_id', 'cai_score', 'drop1', 'drop2', 'dna_sequence', 'drop3'],
    )
    df_high_cai = pd.read_csv(high_cai_csv, header=0, names=['dna_sequence'])

    # Set membership gives O(1) lookups while flagging high-CAI sequences.
    high_cai_lookup = set(df_high_cai['dna_sequence'])

    records = []
    for _, row in df_all.iterrows():
        seq = str(row['dna_sequence'])
        if not is_valid_sequence(seq):
            continue  # skip sequences that fail structural validation
        records.append({
            'gene_id': row['gene_id'],
            'dna_sequence': seq,
            'protein_sequence': str(Seq(seq).translate()),
            'cai_score': row.get('cai_score', None),
            'is_high_cai': seq in high_cai_lookup,
        })

    df_out = pd.DataFrame(records)
    output_path = os.path.join(output_dir, 'ecoli_processed_genes.csv')
    df_out.to_csv(output_path, index=False)
    print(f"Processed data saved to {output_path}")
    print(f"Total validated genes: {len(df_out)}")
    return output_path
def create_train_test_splits(processed_csv: str, output_dir: str = "data", test_size: int = 100):
    """
    Create training and test splits from processed data.

    The fine-tuning set contains the deduplicated high-CAI sequences; the
    test set is a reproducible random sample of the remaining sequences.

    Args:
        processed_csv: Path to processed ecoli_processed_genes.csv
        output_dir: Output directory for JSON files
        test_size: Number of sequences for test set (clamped to the
            available pool size if larger)

    Returns:
        Tuple of (finetune_json_path, test_json_path)

    Raises:
        FileNotFoundError: If processed_csv does not exist.
    """
    # Fail fast on a missing input before importing heavy dependencies.
    if not os.path.exists(processed_csv):
        raise FileNotFoundError(f"Processed data file not found: {processed_csv}")

    # Lazy imports so `--help` works without heavy deps installed.
    import pandas as pd
    from CodonTransformer.CodonData import prepare_training_data

    os.makedirs(output_dir, exist_ok=True)
    df_processed = pd.read_csv(processed_csv)

    # Create fine-tuning set (high-CAI sequences)
    df_finetune = df_processed[df_processed['is_high_cai'] == True].copy()
    df_finetune.drop_duplicates(subset=['dna_sequence'], inplace=True)
    df_finetune.rename(columns={'dna_sequence': 'dna', 'protein_sequence': 'protein'}, inplace=True)
    df_finetune['organism'] = "Escherichia coli general"
    finetune_output_path = os.path.join(output_dir, 'finetune_set.json')
    prepare_training_data(df_finetune, finetune_output_path, shuffle=True)
    print(f"Fine-tuning set saved to {finetune_output_path} with {len(df_finetune)} records.")

    # Create test set (non-high-CAI sequences)
    df_test_pool = df_processed[df_processed['is_high_cai'] == False].copy()
    # Clamp the sample size: DataFrame.sample raises ValueError when
    # n exceeds the population, which previously crashed small datasets.
    n_test = min(test_size, len(df_test_pool))
    if n_test < test_size:
        print(f"Warning: only {n_test} non-high-CAI sequences available "
              f"for the test set (requested {test_size}).")
    df_test = df_test_pool.sample(n=n_test, random_state=42)  # for reproducibility
    df_test['organism'] = 51  # E. coli general organism ID
    df_test.rename(columns={'dna_sequence': 'codons'}, inplace=True)
    test_records = df_test[['codons', 'organism']].to_dict(orient='records')
    test_output_path = os.path.join(output_dir, 'test_set.json')
    with open(test_output_path, 'w') as f:
        json.dump(test_records, f, indent=4)
    print(f"Test set saved to {test_output_path} with {len(df_test)} records.")
    return finetune_output_path, test_output_path
def main():
    """Main entry point for data preprocessing.

    Parses CLI arguments, optionally processes the raw CSVs, then builds
    the fine-tuning and test splits. Any failure prints to stderr and
    exits with status 1.
    """
    parser = argparse.ArgumentParser(
        # "ENCOT" was a leftover project name; the rest of the file says ColiFormer.
        description="Preprocess E. coli gene data for ColiFormer training",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Use default paths
  python scripts/preprocess_data.py
  # Specify custom input files
  python scripts/preprocess_data.py --cai_csv data/CAI.csv --high_cai_csv data/Database_3_4300_gene.csv
  # Custom output directory and test size
  python scripts/preprocess_data.py --output_dir my_data --test_size 200
"""
    )
    parser.add_argument(
        "--cai_csv",
        type=str,
        default="data/CAI.csv",
        help="Path to CAI.csv file with gene data (default: data/CAI.csv)"
    )
    parser.add_argument(
        "--high_cai_csv",
        type=str,
        default="data/Database 3_4300 gene.csv",
        help="Path to Database 3_4300 gene.csv file (default: data/Database 3_4300 gene.csv)"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="data",
        help="Output directory for processed files (default: data)"
    )
    parser.add_argument(
        "--test_size",
        type=int,
        default=100,
        help="Number of sequences for test set (default: 100)"
    )
    parser.add_argument(
        "--skip_processing",
        action="store_true",
        help="Skip data processing step (assume ecoli_processed_genes.csv exists)"
    )
    args = parser.parse_args()
    try:
        # Step 1: Process raw data (or reuse an existing processed CSV)
        if not args.skip_processing:
            processed_csv = process_ecoli_data(
                args.cai_csv,
                args.high_cai_csv,
                args.output_dir
            )
        else:
            processed_csv = os.path.join(args.output_dir, 'ecoli_processed_genes.csv')
            if not os.path.exists(processed_csv):
                raise FileNotFoundError(
                    f"Processed data not found at {processed_csv}. "
                    "Remove --skip_processing flag to process raw data first."
                )
            print(f"Using existing processed data: {processed_csv}")
        # Step 2: Create train/test splits
        finetune_path, test_path = create_train_test_splits(
            processed_csv,
            args.output_dir,
            args.test_size
        )
        print("\n" + "=" * 60)
        print("Data preprocessing complete!")
        print("=" * 60)
        print(f"Training set: {finetune_path}")
        print(f"Test set: {test_path}")
        print("\nYou can now run training with:")
        # Was an f-string with no placeholders; plain literal is correct.
        print("  python scripts/train.py --config configs/train_ecoli_alm.yaml")
    except Exception as e:
        # Top-level boundary: report and exit non-zero so shell callers can detect failure.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()