"""Fetch CASF-2016 target clusters from RCSB PDB API.

Reads the PDB IDs listed in the predictions CSV, asks the RCSB GraphQL
endpoint (in batches) for each entry's polymer-entity descriptions and
UniProt accessions, assigns every entry a target key, groups entries by that
key, and writes the mapping plus the 5-member and 3+-member clusters to JSON.
"""
import json
import urllib.request
from collections import defaultdict

import pandas as pd

PREDICTIONS_CSV = '/Volumes/PRO-G40/millerbind-tdc-validation/predictions/casf2016_v9_predictions.csv'
OUTPUT_JSON = '/Volumes/PRO-G40/millerbind-tdc-validation/casf2016_target_clusters.json'
GRAPHQL_URL = 'https://data.rcsb.org/graphql'
BATCH_SIZE = 50  # number of PDB IDs sent per GraphQL request
REQUEST_TIMEOUT = 30  # seconds, passed to urlopen


def build_query(batch):
    """Return the GraphQL query string requesting the entries in *batch*.

    *batch* is a list of PDB ID strings; it is embedded via ``json.dumps``,
    which renders it as a double-quoted, comma-separated list literal.
    """
    return (
        '{ entries(entry_ids: %s) { rcsb_id polymer_entities { '
        'rcsb_polymer_entity { pdbx_description } '
        'rcsb_polymer_entity_container_identifiers { uniprot_ids } } } }'
        % json.dumps(batch)
    )


def target_for_entry(entry):
    """Return the target key for one ``entries`` item of the API response.

    The key is the first UniProt ID found across the entry's polymer
    entities; if there is none, the first non-empty ``pdbx_description``;
    if there is neither, the empty string.
    """
    uniprots = []
    desc = ''
    for pe in entry.get('polymer_entities') or []:
        if not pe:
            continue
        # ``dict.get(key, {})`` only falls back when the key is absent; a key
        # that is present with a JSON null yields None, so use ``or`` to
        # normalise both cases before chaining the second lookup.
        entity = pe.get('rcsb_polymer_entity') or {}
        identifiers = pe.get('rcsb_polymer_entity_container_identifiers') or {}
        d = entity.get('pdbx_description') or ''
        u = identifiers.get('uniprot_ids') or []
        if d and not desc:
            desc = d
        uniprots.extend(u)
    return uniprots[0] if uniprots else desc


def fetch_entries(batch):
    """POST one GraphQL request for *batch* and return its non-null entries.

    Raises:
        RuntimeError: if the response carries no ``data.entries`` list (the
            ``errors`` member of the payload, if any, is put in the message).
    """
    req = urllib.request.Request(
        GRAPHQL_URL,
        data=json.dumps({'query': build_query(batch)}).encode(),
        headers={'Content-Type': 'application/json'},
    )
    # Context manager so the HTTP response is closed after every batch.
    with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
        payload = json.loads(resp.read())
    entries = (payload.get('data') or {}).get('entries')
    if entries is None:
        raise RuntimeError(
            f"RCSB GraphQL query returned no entries: {payload.get('errors')}"
        )
    # NOTE(review): defensively drop null items in case the API returns them
    # for IDs it cannot resolve -- confirm against the RCSB Data API docs.
    return [entry for entry in entries if entry]


def fetch_targets(pdb_ids, batch_size=BATCH_SIZE):
    """Return ``{lower-case PDB ID: target key}`` for every returned entry.

    *pdb_ids* is a list of PDB ID strings; it is queried in consecutive
    slices of *batch_size*.
    """
    pdb_to_target = {}
    for start in range(0, len(pdb_ids), batch_size):
        for entry in fetch_entries(pdb_ids[start:start + batch_size]):
            pdb_to_target[entry['rcsb_id'].lower()] = target_for_entry(entry)
    return pdb_to_target


def group_by_target(pdb_to_target):
    """Invert ``{pdb: target}`` into ``{target: [pdb, ...]}``.

    NOTE(review): entries whose target key is the empty string (no UniProt
    ID and no description) all land in one group keyed ``""``; that group is
    not a real target cluster.
    """
    groups = defaultdict(list)
    for pdb, target in pdb_to_target.items():
        groups[target].append(pdb)
    return dict(groups)


def main():
    """Run the full pipeline: read IDs, query RCSB, cluster, save, report."""
    v9 = pd.read_csv(PREDICTIONS_CSV)
    # Drop missing IDs (NaN would be serialised into the query as ``NaN``)
    # and de-duplicate while preserving first-seen order.
    pdb_ids = list(dict.fromkeys(v9['pdb_id'].dropna().str.upper()))

    pdb_to_target = fetch_targets(pdb_ids)
    target_groups = group_by_target(pdb_to_target)

    clusters_5 = {t: pdbs for t, pdbs in target_groups.items() if len(pdbs) == 5}
    clusters_3plus = {t: pdbs for t, pdbs in target_groups.items() if len(pdbs) >= 3}

    print(f"Total PDB IDs mapped: {len(pdb_to_target)}")
    print(f"Unique targets: {len(target_groups)}")
    print(f"Clusters with exactly 5: {len(clusters_5)}")
    print(f"Clusters with 3+: {len(clusters_3plus)}")

    # Surface IDs the API did not return instead of dropping them silently.
    missing = [p for p in pdb_ids if p.lower() not in pdb_to_target]
    if missing:
        print(f"WARNING: {len(missing)} PDB IDs not returned by RCSB: {missing}")

    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump({
            'pdb_to_target': pdb_to_target,
            'clusters_5': clusters_5,
            'clusters_3plus': clusters_3plus
        }, f, indent=2)

    print("\nClusters with 5 members:")
    for t, pdbs in sorted(clusters_5.items()):
        print(f" {t}: {pdbs}")

    print("\nClusters with 3-4 members:")
    for t, pdbs in sorted(clusters_3plus.items()):
        if len(pdbs) < 5:
            print(f" {t} ({len(pdbs)}): {pdbs}")

    print("\nSaved to casf2016_target_clusters.json")


if __name__ == "__main__":
    main()