# File size: 1,797 bytes (revision 404d784)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import pandas as pd
import json
import os
from CodonTransformer.CodonData import prepare_training_data

def main():
    """
    Partition the processed gene table into fine-tuning and test sets.

    Reads ``data/ecoli_processed_genes.csv`` and writes two files:

    * ``data/finetune_set.json`` -- rows where ``is_high_cai`` is True,
      de-duplicated on ``dna_sequence``, with the sequence columns renamed
      to ``dna`` / ``protein`` and passed to ``prepare_training_data``
      with ``shuffle=True``.
    * ``data/test_set.json`` -- up to 100 rows where ``is_high_cai`` is
      False, sampled with a fixed seed and dumped as a JSON list of
      ``{"codons": ..., "organism": ...}`` records.

    If the ``data`` directory or the processed CSV is missing, an error is
    printed and the function returns without writing anything.

    Returns:
        None
    """
    test_set_size = 100  # requested number of held-out records
    sample_seed = 42     # fixed seed so the test split is reproducible

    if not os.path.exists('data'):
        print("Error: 'data' directory not found. Please run prepare_ecoli_data.py first.")
        return

    processed_data_path = 'data/ecoli_processed_genes.csv'
    if not os.path.exists(processed_data_path):
        print(f"Error: Processed data file not found at {processed_data_path}")
        return

    df_processed = pd.read_csv(processed_data_path)

    # --- Fine-tuning set: rows flagged is_high_cai, unique by DNA sequence ---
    # .eq(True) keeps the exact semantics of `== True` (missing values are
    # excluded) without tripping the E712 lint.
    df_finetune = df_processed[df_processed['is_high_cai'].eq(True)].copy()
    df_finetune = df_finetune.drop_duplicates(subset=['dna_sequence'])
    df_finetune = df_finetune.rename(
        columns={'dna_sequence': 'dna', 'protein_sequence': 'protein'}
    )
    df_finetune['organism'] = "Escherichia coli general"

    finetune_output_path = 'data/finetune_set.json'
    prepare_training_data(df_finetune, finetune_output_path, shuffle=True)
    print(f"Fine-tuning set saved to {finetune_output_path} with {len(df_finetune)} records.")

    # --- Test set: random sample of the rows where is_high_cai is False ---
    df_test_pool = df_processed[df_processed['is_high_cai'].eq(False)].copy()

    # DataFrame.sample raises ValueError when n exceeds the pool size (and
    # replace=False), so cap the request at what is actually available.
    n_test = min(test_set_size, len(df_test_pool))
    if n_test < test_set_size:
        print(
            f"Warning: only {n_test} records with is_high_cai == False are "
            f"available; requested {test_set_size}."
        )

    df_test = df_test_pool.sample(n=n_test, random_state=sample_seed)  # for reproducibility
    df_test = df_test.rename(columns={'dna_sequence': 'codons'})
    df_test['organism'] = 51  # E. coli general
    test_records = df_test[['codons', 'organism']].to_dict(orient='records')

    test_output_path = 'data/test_set.json'
    with open(test_output_path, 'w', encoding='utf-8') as f:
        json.dump(test_records, f, indent=4)
    print(f"Test set saved to {test_output_path} with {len(df_test)} records.")

# Run the partitioning only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()