# Viewer header (not part of the script): 0d3fb18, file size 2,445 Bytes.
"""Fetch CASF-2016 target clusters from RCSB PDB API."""
import pandas as pd
import json
import urllib.request
from collections import defaultdict
# Default locations and request settings; every function below accepts
# overrides so the script can be pointed at other files or endpoints.
PREDICTIONS_CSV = '/Volumes/PRO-G40/millerbind-tdc-validation/predictions/casf2016_v9_predictions.csv'
OUTPUT_JSON = '/Volumes/PRO-G40/millerbind-tdc-validation/casf2016_target_clusters.json'
RCSB_GRAPHQL_URL = 'https://data.rcsb.org/graphql'
BATCH_SIZE = 50        # entry IDs sent per GraphQL request
REQUEST_TIMEOUT = 30   # seconds, passed to urlopen

# GraphQL query; ``%s`` is filled with a JSON-encoded list of entry IDs.
ENTRY_QUERY_TEMPLATE = (
    '{ entries(entry_ids: %s) { rcsb_id polymer_entities { '
    'rcsb_polymer_entity { pdbx_description } '
    'rcsb_polymer_entity_container_identifiers { uniprot_ids } } } }'
)


def load_pdb_ids(csv_path=PREDICTIONS_CSV):
    """Return the upper-cased, de-duplicated ``pdb_id`` values of a CSV.

    Missing and blank cells are dropped so they never reach the GraphQL
    query. First-seen order is preserved.
    """
    frame = pd.read_csv(csv_path)
    ids = frame['pdb_id'].dropna().astype(str).str.strip().str.upper()
    # dict.fromkeys de-duplicates while keeping the original order.
    return list(dict.fromkeys(pdb_id for pdb_id in ids if pdb_id))


def entries_from_payload(payload):
    """Extract the non-null entry objects from a decoded GraphQL response.

    Raises:
        RuntimeError: if the response carries no ``data.entries`` list; the
            message includes the response's ``errors`` field.
    """
    entries = (payload.get('data') or {}).get('entries')
    if entries is None:
        raise RuntimeError(
            "RCSB GraphQL query returned no entries: %r" % (payload.get('errors'),)
        )
    # Individual items can be null; keep only real entry objects.
    return [entry for entry in entries if entry]


def fetch_entries(batch, url=RCSB_GRAPHQL_URL, timeout=REQUEST_TIMEOUT):
    """POST one batch of entry IDs to the GraphQL endpoint and return entries.

    Args:
        batch: list of entry ID strings.
        url: GraphQL endpoint URL.
        timeout: socket timeout in seconds.

    Returns:
        List of entry dicts as produced by ``entries_from_payload``.
    """
    query = ENTRY_QUERY_TEMPLATE % json.dumps(batch)
    request = urllib.request.Request(
        url,
        data=json.dumps({'query': query}).encode(),
        headers={'Content-Type': 'application/json'},
    )
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        payload = json.loads(resp.read())
    return entries_from_payload(payload)


def target_for_entry(entry):
    """Choose the target label for one entry dict.

    The label is the first UniProt ID found while walking the polymer
    entities in order; if no entity lists one, it is the first non-empty
    ``pdbx_description``; if neither exists, it is ``''``.
    """
    description = ''
    for entity in entry.get('polymer_entities') or []:
        if not entity:
            continue
        # ``or {}`` / ``or []`` guard against explicit JSON nulls, which a
        # plain ``.get(key, default)`` would pass through as None.
        identifiers = entity.get('rcsb_polymer_entity_container_identifiers') or {}
        uniprot_ids = identifiers.get('uniprot_ids') or []
        if uniprot_ids:
            return uniprot_ids[0]
        if not description:
            details = entity.get('rcsb_polymer_entity') or {}
            description = details.get('pdbx_description') or ''
    return description


def map_pdbs_to_targets(pdb_ids, fetch=fetch_entries, batch_size=BATCH_SIZE):
    """Map lower-cased entry IDs to target labels, querying in batches.

    Args:
        pdb_ids: list of entry ID strings.
        fetch: callable taking a list of IDs and returning entry dicts.
        batch_size: number of IDs passed to ``fetch`` per call (>= 1).

    Returns:
        Dict of ``rcsb_id.lower()`` -> label from ``target_for_entry``. IDs
        for which ``fetch`` returns no entry are absent from the dict.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    pdb_to_target = {}
    for start in range(0, len(pdb_ids), batch_size):
        for entry in fetch(pdb_ids[start:start + batch_size]):
            pdb_to_target[entry['rcsb_id'].lower()] = target_for_entry(entry)
    return pdb_to_target


def group_by_target(pdb_to_target):
    """Invert a pdb -> target mapping into target -> list of pdb IDs.

    Entries whose target is empty are skipped: they have no identifying
    label, so grouping them together would create a meaningless cluster.
    """
    groups = defaultdict(list)
    for pdb, target in pdb_to_target.items():
        if target:
            groups[target].append(pdb)
    return dict(groups)


def main(csv_path=PREDICTIONS_CSV, output_path=OUTPUT_JSON):
    """Fetch targets for every PDB ID in the CSV, cluster, save and report.

    Writes a JSON file with the keys ``pdb_to_target``, ``clusters_5`` and
    ``clusters_3plus`` to ``output_path`` and prints a summary to stdout.
    """
    pdb_to_target = map_pdbs_to_targets(load_pdb_ids(csv_path))
    target_groups = group_by_target(pdb_to_target)
    clusters_5 = {t: pdbs for t, pdbs in target_groups.items() if len(pdbs) == 5}
    clusters_3plus = {t: pdbs for t, pdbs in target_groups.items() if len(pdbs) >= 3}
    unlabelled = [pdb for pdb, target in pdb_to_target.items() if not target]

    print(f"Total PDB IDs mapped: {len(pdb_to_target)}")
    print(f"Unique targets: {len(target_groups)}")
    print(f"Clusters with exactly 5: {len(clusters_5)}")
    print(f"Clusters with 3+: {len(clusters_3plus)}")
    if unlabelled:
        print(f"Entries without UniProt ID or description (not clustered): {len(unlabelled)}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({
            'pdb_to_target': pdb_to_target,
            'clusters_5': clusters_5,
            'clusters_3plus': clusters_3plus
        }, f, indent=2)

    print("\nClusters with 5 members:")
    for t, pdbs in sorted(clusters_5.items()):
        print(f" {t}: {pdbs}")
    print("\nClusters with 3-4 members:")
    for t, pdbs in sorted(clusters_3plus.items()):
        if len(pdbs) < 5:
            print(f" {t} ({len(pdbs)}): {pdbs}")

    # Report only the file name, as the summary line has always done.
    output_name = output_path.rsplit('/', 1)[-1]
    print(f"\nSaved to {output_name}")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()