rajaatif786 committed on
Commit
d32be32
·
verified ·
1 Parent(s): b8f5da2

Upload DNAEncoder.py

Browse files
Files changed (1) hide show
  1. DNAEncoder.py +126 -0
DNAEncoder.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import time
3
+ import traceback
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.preprocessing import LabelEncoder
7
+ import os
8
+ from tqdm.auto import tqdm
9
+ os.system("cls")
10
+
11
+ import warnings
12
+ warnings.filterwarnings("ignore")
13
+
14
class ConvertDNALabelEncoder(object):
    """
    Convert DNA sequence strings to integer label encodings and vice versa.

    The integer codes are the ones scikit-learn's LabelEncoder assigns when it
    is fitted on ['A', 'T', 'C', 'G', 'N'] (classes are sorted), i.e.
    A=0, C=1, G=2, N=3, T=4. Because the alphabet is fixed, the mapping is kept
    in precomputed lookup tables instead of re-fitting an encoder per row.
    """

    # Accepted bases in code order: the index in this string is the code.
    _BASES = "ACGNT"
    # Upper-case symbols that are silently removed from a sequence before
    # encoding (same set the original chained .replace() calls removed).
    _DROPPED_SYMBOLS = "SWYHRKVMDBIJ"
    _DROP_TABLE = str.maketrans("", "", _DROPPED_SYMBOLS)
    # ASCII byte value -> code; -1 marks a byte that is not an accepted base.
    _ENCODE_TABLE = np.full(256, -1, dtype=np.int8)
    _ENCODE_TABLE[np.frombuffer(_BASES.encode("ascii"), dtype=np.uint8)] = (
        np.arange(len(_BASES), dtype=np.int8)
    )
    # code -> ASCII byte value of the base.
    _DECODE_TABLE = np.frombuffer(_BASES.encode("ascii"), dtype=np.uint8)

    def __init__(self):
        pass

    @staticmethod
    def _encode_sequence(dna_string):
        """
        Encode one DNA sequence string as an int8 code array.

        Args:
            dna_string (str): sequence text; symbols listed in
                _DROPPED_SYMBOLS are removed before encoding.
        Returns:
            numpy.ndarray: 1-D int8 array with one code per remaining base.
        Raises:
            ValueError: if a symbol outside 'A', 'C', 'G', 'N', 'T' remains
                after the removal step (includes non-ASCII characters, for
                which str.encode raises UnicodeEncodeError, a ValueError).
        """
        cleaned = dna_string.translate(ConvertDNALabelEncoder._DROP_TABLE)
        raw_bytes = np.frombuffer(cleaned.encode("ascii"), dtype=np.uint8)
        # Fancy indexing returns a fresh, writable int8 array.
        codes = ConvertDNALabelEncoder._ENCODE_TABLE[raw_bytes]
        if codes.size and codes.min() < 0:
            unseen = sorted(set(cleaned) - set(ConvertDNALabelEncoder._BASES))
            raise ValueError("sequence contains unsupported symbols: {}".format(unseen))
        return codes

    @staticmethod
    def convert_dna_string_to_dna_labelencoder(dna_string_csv_path, seq_column, label_column):
        """
        Read a CSV of DNA sequence strings and label-encode every sequence.

        Args:
            dna_string_csv_path (string): path (or buffer) of the CSV file,
                passed to pandas.read_csv
            seq_column: label of the column holding the sequence strings
            label_column: label of the column returned alongside the encodings
        Returns:
            tuple: (list of int8 numpy arrays, one per row; the label_column
            Series of the loaded frame). If anything fails, the error is
            printed and the string "nothing" is returned instead; that legacy
            return value is kept so existing callers keep working.
        """
        try:
            df_dna_string = pd.read_csv(filepath_or_buffer=dna_string_csv_path)
            # Index the column directly: itertuples() renames columns that are
            # not valid identifiers, which made getattr(row, seq_column) fail.
            sequences = df_dna_string[seq_column]
            dna_string_list = [
                ConvertDNALabelEncoder._encode_sequence(dna_string_row)
                for dna_string_row in tqdm(sequences, total=len(sequences))
            ]
            return dna_string_list, df_dna_string[label_column]
        except Exception:
            print("An error occurred. {}".format(ConvertDNALabelEncoder.get_exception_stack_trace()))
            return "nothing"

    @staticmethod
    def convert_dna_labelencoder_to_dna_string(dna_labelencoder_list):
        """
        Decode label-encoded DNA sequences back to DNA strings.

        Args:
            dna_labelencoder_list (iterable): encoded sequences, each a 1-D
                sequence/array of integer codes in the range 0..4
        Returns:
            list of str: one decoded string per encoded sequence, or None when
            decoding fails (the error is printed).
        """
        try:
            decode_table = ConvertDNALabelEncoder._DECODE_TABLE
            dna_string_list = []
            for encoded_sequence in dna_labelencoder_list:
                encoded_array = np.array(encoded_sequence, dtype=np.int8)
                if encoded_array.ndim != 1:
                    raise ValueError("encoded sequence must be one-dimensional")
                # Reject codes outside the table explicitly; negative indices
                # would otherwise silently wrap around.
                if encoded_array.size and (
                    encoded_array.min() < 0 or encoded_array.max() >= decode_table.size
                ):
                    raise ValueError("encoded sequence contains unknown codes")
                decoded_bytes = decode_table[encoded_array].tobytes()
                dna_string_list.append(decoded_bytes.decode("ascii"))
            return dna_string_list
        except Exception:
            print("An error occurred. {}".format(ConvertDNALabelEncoder.get_exception_stack_trace()))
            return None

    @staticmethod
    def get_exception_stack_trace():
        """
        Describe the exception currently being handled.

        Returns:
            exception_stack_trace (string): time stamp, file name, procedure
            name, error message, error type, line number and line code of the
            innermost traceback frame. A short fallback message is returned
            when those details cannot be extracted (for example when no
            exception is active).
        """
        # Bound up front so the final return cannot hit an unbound local when
        # the extraction below fails.
        exception_stack_trace = '[Error Message]: stack trace unavailable'
        try:
            exception_type, exception_value, exception_traceback = sys.exc_info()
            frame = traceback.extract_tb(exception_traceback)[-1]
            exception_stack_trace = ' '.join([
                '[Time Stamp]: ' + time.strftime('%d-%m-%Y %I:%M:%S %p'),
                '[File Name]: ' + str(frame.filename),
                '[Procedure Name]: ' + str(frame.name),
                '[Error Message]: ' + str(exception_value),
                '[Error Type]: ' + str(exception_type),
                '[Line Number]: ' + str(frame.lineno),
                '[Line Code]: ' + str(frame.line),
            ])
        except Exception:
            print("An error occurred in {}".format("get_exception_stack_trace() function"))
        return exception_stack_trace

    @staticmethod
    def _format_elapsed(seconds):
        """
        Format a duration in seconds as HH:MM:SS.

        Hours are not wrapped at 24; fractional seconds are truncated and
        negative durations are reported as 00:00:00.
        """
        total_seconds = max(int(seconds), 0)
        hours, remainder = divmod(total_seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        return "{:02d}:{:02d}:{:02d}".format(hours, minutes, secs)

    @staticmethod
    def get_program_running(start_time):
        """
        Print the time elapsed since start_time.

        Args:
            start_time (float): start of the run in seconds since the epoch,
                as returned by time.time()
        Returns:
            none
        """
        try:
            diff_time = time.time() - start_time
            result = ConvertDNALabelEncoder._format_elapsed(diff_time)
            print("program runtime: {}".format(result))
        except Exception:
            print("An error occurred. {}".format(ConvertDNALabelEncoder.get_exception_stack_trace()))