File size: 2,445 Bytes

0d3fb18

"""Fetch CASF-2016 target clusters from RCSB PDB API."""
import pandas as pd
import json
import urllib.request
from collections import defaultdict

def _target_for_entry(entry):
    """Return the clustering key for one RCSB entry record.

    The key is chosen in this order:

    1. the first UniProt accession found while walking the entry's polymer
       entities in response order;
    2. otherwise the first non-empty ``pdbx_description``;
    3. otherwise the entry's own lower-cased ID, so that entries with no
       annotation at all are not merged together under an empty-string key.

    Args:
        entry: One element of the ``entries`` list from the GraphQL response
            (a dict containing at least ``rcsb_id``).

    Returns:
        The string used as this entry's target/cluster key.
    """
    uniprots = []
    desc = ""
    for pe in (entry.get('polymer_entities') or []):
        # A requested field can come back as an explicit null, in which case
        # dict.get()'s default is NOT used; coerce None to an empty container
        # (same guard already applied to 'polymer_entities' above).
        d = (pe.get('rcsb_polymer_entity') or {}).get('pdbx_description') or ''
        u = (pe.get('rcsb_polymer_entity_container_identifiers') or {}).get('uniprot_ids') or []
        if d and not desc:
            desc = d
        uniprots.extend(u)
    if uniprots:
        return uniprots[0]
    return desc or entry['rcsb_id'].lower()


# Prediction table; only its 'pdb_id' column is used here, upper-cased before
# being sent to the API.
v9 = pd.read_csv('/Volumes/PRO-G40/millerbind-tdc-validation/predictions/casf2016_v9_predictions.csv')
pdb_ids = v9['pdb_id'].str.upper().tolist()

url = 'https://data.rcsb.org/graphql'

# Maps lower-cased PDB ID -> target key (UniProt accession or description).
pdb_to_target = {}
for i in range(0, len(pdb_ids), 50):
    # Query the API in batches of 50 IDs per request.
    batch = pdb_ids[i:i+50]
    # json.dumps renders the batch as a double-quoted string list, which is
    # spliced directly into the query text as the entry_ids argument.
    query = '{ entries(entry_ids: %s) { rcsb_id polymer_entities { rcsb_polymer_entity { pdbx_description } rcsb_polymer_entity_container_identifiers { uniprot_ids } } } }' % json.dumps(batch)

    req = urllib.request.Request(url,
        data=json.dumps({'query': query}).encode(),
        headers={'Content-Type': 'application/json'})
    # Context manager closes the HTTP response even if reading/parsing fails.
    with urllib.request.urlopen(req, timeout=30) as resp:
        data = json.loads(resp.read())

    entries = (data.get('data') or {}).get('entries')
    if entries is None:
        # Fail with the server's own error payload instead of an opaque
        # TypeError/KeyError from indexing a missing 'data' section.
        raise RuntimeError(
            f"RCSB GraphQL query failed for batch starting at index {i}: "
            f"{data.get('errors')}"
        )

    for entry in entries:
        if not entry:
            # Defensive: skip a null placeholder rather than crash on it.
            continue
        pdb_to_target[entry['rcsb_id'].lower()] = _target_for_entry(entry)

# Invert the PDB -> target mapping: each target key collects the PDB IDs
# assigned to it, in the mapping's iteration order.
target_groups = defaultdict(list)
for pdb_code, target_key in pdb_to_target.items():
    target_groups[target_key].append(pdb_code)

# Select the clusters of interest in a single pass:
#   clusters_3plus -> every target with at least three structures
#   clusters_5     -> the subset with exactly five structures
clusters_5 = {}
clusters_3plus = {}
for target_key, members in target_groups.items():
    size = len(members)
    if size < 3:
        continue
    clusters_3plus[target_key] = members
    if size == 5:
        clusters_5[target_key] = members

# --- Summary counts -------------------------------------------------------
summary_lines = (
    f"Total PDB IDs mapped: {len(pdb_to_target)}",
    f"Unique targets: {len(target_groups)}",
    f"Clusters with exactly 5: {len(clusters_5)}",
    f"Clusters with 3+: {len(clusters_3plus)}",
)
for summary_line in summary_lines:
    print(summary_line)

# --- Persist the mapping and both cluster selections as JSON ---------------
output_payload = {
    'pdb_to_target': pdb_to_target,
    'clusters_5': clusters_5,
    'clusters_3plus': clusters_3plus
}
with open('/Volumes/PRO-G40/millerbind-tdc-validation/casf2016_target_clusters.json', 'w') as out_file:
    json.dump(output_payload, out_file, indent=2)

# --- Human-readable listings, ordered by target key ------------------------
print("\nClusters with 5 members:")
for t in sorted(clusters_5):
    pdbs = clusters_5[t]
    print(f"  {t}: {pdbs}")

# Only clusters smaller than five are listed here; a cluster with more than
# five members is included in the "3+" count above but printed in neither
# listing.
print("\nClusters with 3-4 members:")
for t in sorted(clusters_3plus):
    pdbs = clusters_3plus[t]
    if len(pdbs) >= 5:
        continue
    print(f"  {t} ({len(pdbs)}): {pdbs}")

print("\nSaved to casf2016_target_clusters.json")