File size: 1,707 Bytes
084b58f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import random
import argparse
import pandas as pd
import os

# Token appended to every sequence before it is written out.
END_OF_TEXT = '<|endoftext|>'


def split_sequences(df, train_frac=0.9, seed=42):
    """Shuffle the SEQUENCE column and split it into train/validation parts.

    Every sequence gets ``<|endoftext|>`` appended. Rows whose SEQUENCE is
    missing are dropped (concatenating a missing value with a string would
    otherwise raise ``TypeError``).

    Args:
        df: DataFrame with a ``SEQUENCE`` column.
        train_frac: Fraction of the shuffled sequences placed in the first
            (training) part.
        seed: Seed passed to ``Series.sample`` so the shuffle is reproducible.

    Returns:
        A ``(train, val)`` tuple of pandas Series.

    Raises:
        KeyError: If ``df`` has no ``SEQUENCE`` column.
    """
    sequences = df['SEQUENCE'].dropna().astype(str) + END_OF_TEXT

    # Shuffle the sequences to randomize the split
    sequences = sequences.sample(frac=1, random_state=seed).reset_index(drop=True)

    split_index = int(train_frac * len(sequences))
    return sequences[:split_index], sequences[split_index:]


def write_sequences(sequences, path):
    """Write one sequence per line to ``path`` as plain UTF-8 text.

    The lines are written verbatim rather than through ``to_csv``: CSV
    minimal quoting would wrap any sequence containing a comma or a double
    quote in quotes, which is not wanted in a plain-text corpus.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        handle.writelines(f"{sequence}\n" for sequence in sequences)


def find_input_file(output_dir):
    """Return the path of the sequence file to read under ``output_dir``.

    The standardized file is preferred; the raw file is used when the
    standardized one is absent (presumably when there were no ncAAs to
    standardize -- TODO confirm against the upstream step).

    Raises:
        SystemExit: If neither file exists.
    """
    standardized = os.path.join(output_dir, 'standard/sequences_standardized.txt')
    if os.path.exists(standardized):
        return standardized

    # No ncAAs?
    raw = os.path.join(output_dir, 'raw/sequences_raw.txt')
    if os.path.exists(raw):
        return raw

    # SystemExit with a message prints it to stderr and exits with status 1;
    # unlike the exit() helper it does not depend on the site module.
    raise SystemExit(f"Error: The input file '{raw}' does not exist.")


def main(argv=None):
    """Split the sequences found under --output_dir into 90%/10% text files.

    Args:
        argv: Command-line arguments to parse; ``None`` means ``sys.argv[1:]``
            (the argparse default).
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(description="Process sequences from an input file and split them into two output files.")
    parser.add_argument('--output_dir', type=str, default='output/tmp', help="Directory containing the input file")
    args = parser.parse_args(argv)

    # Define output file paths (the directory is created up front, as before)
    os.makedirs(os.path.join(args.output_dir, 'for_GPepT'), exist_ok=True)
    output_file_90 = os.path.join(args.output_dir, 'for_GPepT/train90.txt')
    output_file_10 = os.path.join(args.output_dir, 'for_GPepT/val10.txt')

    input_file = find_input_file(args.output_dir)

    # Read the tab-separated input file into a pandas DataFrame
    df = pd.read_csv(input_file, sep='\t')
    if 'SEQUENCE' not in df.columns:
        raise SystemExit(f"Error: The input file '{input_file}' has no 'SEQUENCE' column.")

    # Split the sequences into 90% and 10%
    sequences_90, sequences_10 = split_sequences(df)

    # Write the sequences to the output files
    write_sequences(sequences_90, output_file_90)
    write_sequences(sequences_10, output_file_10)

    print(f"Data has been successfully split into {output_file_90} and {output_file_10}")


if __name__ == "__main__":
    main()