Spaces:
Sleeping
Sleeping
Commit
·
e668ce0
1
Parent(s):
97b1b63
Delete code/add_domains_alphafold.py
Browse files
code/add_domains_alphafold.py
DELETED
|
@@ -1,57 +0,0 @@
|
|
| 1 |
-
from collections import Counter
|
| 2 |
-
import pandas as pd
|
| 3 |
-
|
| 4 |
-
def _distance_to_domain(dom_start, dom_end, pos):
    """Return the distance between ``pos`` and the span ``[dom_start, dom_end]``.

    The result is 0 when ``pos`` lies inside the span (both bounds inclusive),
    otherwise the distance to the nearer boundary.

    Raises:
        ValueError: if a boundary is NaN or ``pos`` is not numeric text.
    """
    start, end = int(dom_start), int(dom_end)
    # ``pos`` arrives as text; going through float also accepts "12.0".
    position = int(float(pos))
    if start <= position <= end:
        return 0
    return min(abs(start - position), abs(end - position))


def add_domains(data, path_to_domains):
    """Attach one domain annotation and its distance to every data point.

    The domain table is read from ``path_to_domains`` (space-delimited, with
    the columns ``proteinID``, ``domain``, ``domStart`` and ``domEnd``) and
    left-joined onto ``data`` via ``data.uniprotID == domains.proteinID``.
    For every ``datapoint`` exactly one row is kept:

    * if ``pos`` falls inside at least one domain (distance 0), the in-range
      domain that occurs most often among all in-range rows;
    * otherwise the domain whose boundary is closest to ``pos``;
    * otherwise (no domain for that protein) the row with empty domain fields.

    Args:
        data: DataFrame with at least the columns ``datapoint``,
            ``uniprotID`` and ``pos``.
        path_to_domains: path of the space-delimited domain table.

    Returns:
        DataFrame ordered by ``datapoint`` (compared as text) holding the
        original columns plus the domain columns and ``distance``. Every
        column is converted to ``str``; ``distance`` is rendered as integer
        text (``'0'``, ``'7'``) or ``'nan'`` when the protein has no domain.
        The index is not renumbered after the final de-duplication.

    Raises:
        ValueError: if a row with a domain has a non-numeric ``pos`` or a
            missing domain boundary.
    """
    domains = pd.read_csv(path_to_domains, delimiter=' ')
    merged = data.merge(domains, left_on='uniprotID', right_on='proteinID', how='left')
    merged = merged.drop(['proteinID'], axis=1)

    # Record which rows found a domain before everything is turned into text.
    has_domain = merged['domain'].notna()

    merged = merged.astype(str)
    merged['domStart'] = merged['domStart'].astype(float)
    merged['domEnd'] = merged['domEnd'].astype(float)

    # Keep the distance numeric (NaN = no domain) so that sorting never has to
    # compare numbers with strings, and so the column exists for empty input.
    distances = [
        _distance_to_domain(start, end, pos) if known else float('nan')
        for known, start, end, pos in zip(
            has_domain, merged['domStart'], merged['domEnd'], merged['pos']
        )
    ]
    merged['distance'] = pd.Series(distances, index=merged.index, dtype=float)

    # Closest domain first within each data point; NaN distances sort last.
    merged = merged.sort_values(by=['datapoint', 'distance']).reset_index(drop=True)

    # A data point may sit inside several domains at once, so in-range rows
    # are handled separately from the "closest boundary" rows.
    in_range = merged[merged['distance'] == 0]
    out_of_range = merged[merged['distance'] != 0]

    # Out of range: rows are already sorted, the first one is the closest.
    closest = out_of_range.drop_duplicates(['datapoint'], keep='first')

    # In range: prefer the domain that is hit most often over all data points.
    frequency = Counter(in_range['domain'])
    count_col = '_domain_count'
    best_in_range = (
        in_range
        .assign(**{count_col: [frequency[name] for name in in_range['domain']]})
        .sort_values(['datapoint', count_col])
        .drop_duplicates(['datapoint'], keep='last')  # highest count wins
        .drop([count_col], axis=1)
    )

    # Merge both groups back together. Empty frames are left out of the
    # concat; if both are empty, the (empty) in-range frame supplies columns.
    frames = [frame for frame in (best_in_range, closest) if not frame.empty]
    combined = pd.concat(frames, sort=False) if frames else best_in_range

    # A data point can show up in both groups; the in-range row (distance 0)
    # sorts first and is the one that survives.
    combined = combined.sort_values(['datapoint', 'distance']).reset_index(drop=True)
    combined = combined.drop_duplicates(['datapoint'], keep='first')

    # Render distances deterministically: integer text, or 'nan' if unknown.
    combined['distance'] = [
        'nan' if value != value else str(int(value))
        for value in combined['distance']
    ]
    return combined.astype(str)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|