vhbert / DNAEncoder.py
rajaatif786's picture
Update DNAEncoder.py
ec129ae verified
import sys
import time
import traceback
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
from tqdm.auto import tqdm
os.system("cls")
import warnings
warnings.filterwarnings("ignore")
class ConvertDNALabelEncoder(object):
"""
convert dna sequence string csv file to dna label encoder csv file and viceverse
"""
def __init__(self):
pass
@staticmethod
def convert_dna_string_to_dna_labelencoder(dna_string_csv_path, seq_column,label_column):
"""
convert dna sequence string csv file to dna label encoder csv file
args:
dna_string_csv_path (string): dna string csv file path
dna_labelencoder_csv_path (string): dna label encoder csv file path
returns:
none
"""
df_dna_string = pd.read_csv(filepath_or_buffer=dna_string_csv_path) #.iloc[:10,:]
#df_dna_string.loc[df_dna_string['labels']!='Homo sapiens','labels'] ='Other Choredate Host'
#df_dna_string['labels']=df_dna_string.loc[df_dna_string['labels']!='Homo sapiens','label'] #='Other Choredate Host'
#print(len(df_dna_string))
label_encoder = LabelEncoder()
dna_string_list = []
for row in tqdm(df_dna_string.itertuples()):
#print(row[2])
#dna_string_row = row[3] #.Sequence
dna_string_row = getattr(row, seq_column)
dna_string_row=dna_string_row.replace('S','').replace('W','').replace('Y','').replace('H','').replace('R','').replace('K','').replace('V','').replace('M','').replace('D','').replace('B','').replace('I','').replace('J','')
dna_string_nparray = np.array(list(dna_string_row))
sample=['A','T','C','G','N']
label_encoder.fit(sample)
dna_labelencoder_row = label_encoder.transform(dna_string_nparray)
dna_string_list.append(dna_labelencoder_row.astype(np.int8))
#df_dna_labelencoder = pd.DataFrame(dna_string_list)
#df_dna_labelencoder.to_csv(path_or_buf=dna_labelencoder_csv_path, index=False, header=None)
return dna_string_list, df_dna_string[label_column]
@staticmethod
def convert_dna_labelencoder_to_dna_string(dna_labelencoder_list):
"""
Convert DNA sequence label encoder CSV file to DNA string CSV file.
Args:
dna_labelencoder_csv_path (string): DNA label encoder CSV file path
dna_string_csv_path (string): DNA string CSV file path
"""
try:
# Read the label-encoded DNA sequences
#df_dna_labelencoder = pd.read_csv(filepath_or_buffer=dna_labelencoder_csv_path, header=None)
dna_labelencoder_list = dna_labelencoder_list #df_dna_labelencoder.values.tolist()
# Initialize the LabelEncoder and fit it to the DNA bases
label_encoder = LabelEncoder()
sample = ['A', 'T', 'C', 'G', 'N'] # DNA bases
label_encoder.fit(sample)
# Use the inverse transform to decode label encodings back to DNA strings
dna_string_list = []
for encoded_sequence in dna_labelencoder_list:
encoded_array = np.array(encoded_sequence, dtype=np.int8) # Ensure it's a NumPy array
decoded_sequence = label_encoder.inverse_transform(encoded_array) # Decode back to DNA bases
dna_string_list.append(''.join(decoded_sequence)) # Join decoded bases into a string
# Save the decoded DNA strings to a CSV file
df_dna_string = pd.DataFrame(dna_string_list)
#df_dna_string.to_csv(path_or_buf=dna_string_csv_path, index=False, header=None)
except Exception as e:
print("An error occurred. {}".format(ConvertDNALabelEncoder.get_exception_stack_trace()))
@staticmethod
def get_exception_stack_trace():
"""
get exception stack trace
args:
none
returns:
exception_stack_trace (string): exception stack trace parameters
"""
try:
exception_type, exception_value, exception_traceback = sys.exc_info()
file_name, line_number, procedure_name, line_code = traceback.extract_tb(exception_traceback)[-1]
exception_stack_trace = ''.join('[Time Stamp]: ' + str(time.strftime('%d-%m-%Y %I:%M:%S %p')) + '' + '[File Name]: ' + str(file_name) + ' '
+ '[Procedure Name]: ' + str(procedure_name) + ' '
+ '[Error Message]: ' + str(exception_value) + ' '
+ '[Error Type]: ' + str(exception_type) + ' '
+ '[Line Number]: ' + str(line_number) + ' '
+ '[Line Code]: ' + str(line_code))
except:
print("An error occurred in {}".format("get_exception_stack_trace() function"))
return exception_stack_trace
@staticmethod
def get_program_running(start_time):
"""
calculate program running
args:
start_time (string): start time program runtime
returns:
none
"""
try:
end_time = time.time()
diff_time = end_time - start_time
result = time.strftime("%H:%M:%S", time.gmtime(diff_time))
print("program runtime: {}".format(result))
except:
print("An error occurred. {}".format(ConvertDNALabelEncoder.get_exception_stack_trace()))