import json
import os

import pandas as pd

from CodonTransformer.CodonData import prepare_training_data

# Maximum number of records drawn for the held-out test set.
TEST_SET_SIZE = 100
# Fixed seed so the sampled test set is reproducible between runs.
TEST_SET_SEED = 42


def main():
    """
    Partition the processed gene table into fine-tuning and test sets.

    Reads ``data/ecoli_processed_genes.csv`` and writes two files:

    * ``data/finetune_set.json`` -- rows whose ``is_high_cai`` flag is True,
      de-duplicated on ``dna_sequence`` and handed to
      ``prepare_training_data``.
    * ``data/test_set.json`` -- a reproducible random sample of at most
      ``TEST_SET_SIZE`` rows whose ``is_high_cai`` flag is False, stored as a
      list of ``{"codons", "organism"}`` records.

    The function prints an error message and returns early when the ``data``
    directory or the processed CSV file is missing.
    """
    if not os.path.exists('data'):
        print("Error: 'data' directory not found. Please run prepare_ecoli_data.py first.")
        return

    processed_data_path = 'data/ecoli_processed_genes.csv'
    if not os.path.exists(processed_data_path):
        print(f"Error: Processed data file not found at {processed_data_path}")
        return

    df_processed = pd.read_csv(processed_data_path)

    # --- Fine-tuning set: high-CAI genes only ---
    high_cai_mask = df_processed['is_high_cai'].eq(True)
    df_finetune = (
        df_processed[high_cai_mask]
        .drop_duplicates(subset=['dna_sequence'])
        .rename(columns={'dna_sequence': 'dna', 'protein_sequence': 'protein'})
    )
    df_finetune['organism'] = "Escherichia coli general"

    finetune_output_path = 'data/finetune_set.json'
    prepare_training_data(df_finetune, finetune_output_path, shuffle=True)
    print(f"Fine-tuning set saved to {finetune_output_path} with {len(df_finetune)} records.")

    # --- Test set: sampled from the remaining (non-high-CAI) genes ---
    df_test_pool = df_processed[df_processed['is_high_cai'].eq(False)]

    # DataFrame.sample raises ValueError when asked for more rows than the
    # pool holds, so clamp the sample size to what is actually available.
    sample_size = min(TEST_SET_SIZE, len(df_test_pool))
    if sample_size < TEST_SET_SIZE:
        print(
            f"Warning: only {len(df_test_pool)} non-high-CAI genes available; "
            f"test set will contain {sample_size} records instead of {TEST_SET_SIZE}."
        )

    df_test = (
        df_test_pool
        .sample(n=sample_size, random_state=TEST_SET_SEED)
        .rename(columns={'dna_sequence': 'codons'})
    )
    df_test['organism'] = 51  # E. coli general

    test_records = df_test[['codons', 'organism']].to_dict(orient='records')

    test_output_path = 'data/test_set.json'
    with open(test_output_path, 'w', encoding='utf-8') as f:
        json.dump(test_records, f, indent=4)
    print(f"Test set saved to {test_output_path} with {len(df_test)} records.")


if __name__ == "__main__":
    main()