"""Fetch CASF-2016 target clusters from the RCSB PDB GraphQL API.

Reads the PDB IDs listed in the predictions CSV, asks the RCSB data API for
the polymer entities of each entry, assigns every entry a "target" (its first
UniProt accession, or its first entity description when no accession is
returned), groups entries by target, and writes the mapping plus the derived
clusters to a JSON file.
"""
import json
import sys
import urllib.request
from collections import defaultdict

import pandas as pd

PREDICTIONS_CSV = '/Volumes/PRO-G40/millerbind-tdc-validation/predictions/casf2016_v9_predictions.csv'
OUTPUT_JSON = '/Volumes/PRO-G40/millerbind-tdc-validation/casf2016_target_clusters.json'
GRAPHQL_URL = 'https://data.rcsb.org/graphql'
BATCH_SIZE = 50        # entry IDs sent per GraphQL request
REQUEST_TIMEOUT = 30   # seconds, passed to urlopen

# The %s placeholder receives a JSON-encoded list of entry IDs.
ENTRY_QUERY_TEMPLATE = (
    '{ entries(entry_ids: %s) { rcsb_id polymer_entities { '
    'rcsb_polymer_entity { pdbx_description } '
    'rcsb_polymer_entity_container_identifiers { uniprot_ids } } } }'
)


def load_pdb_ids(csv_source):
    """Return the unique, upper-cased PDB IDs from the ``pdb_id`` column.

    Args:
        csv_source: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        List of IDs in first-seen order, without missing values or duplicates.

    Raises:
        KeyError: If the CSV has no ``pdb_id`` column.
    """
    # Force string parsing: IDs such as "1e66" look like floats to pandas.
    frame = pd.read_csv(csv_source, dtype={'pdb_id': str})
    ids = frame['pdb_id'].dropna().str.strip().str.upper()
    # dict.fromkeys de-duplicates while keeping the original order.
    return list(dict.fromkeys(ids))


def build_query(entry_ids):
    """Return the GraphQL query text requesting *entry_ids*."""
    return ENTRY_QUERY_TEMPLATE % json.dumps(entry_ids)


def fetch_entries(entry_ids, timeout=REQUEST_TIMEOUT):
    """POST one GraphQL request and return the non-null entries it yields.

    Args:
        entry_ids: List of PDB IDs to request.
        timeout: Socket timeout in seconds for the HTTP request.

    Returns:
        List of entry dicts as decoded from the response JSON.

    Raises:
        RuntimeError: If the response carries no ``data.entries`` list.
    """
    payload = json.dumps({'query': build_query(entry_ids)}).encode()
    request = urllib.request.Request(
        GRAPHQL_URL,
        data=payload,
        headers={'Content-Type': 'application/json'},
    )
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = json.loads(response.read())

    errors = body.get('errors')
    data = body.get('data')
    if not data or data.get('entries') is None:
        raise RuntimeError(f"RCSB GraphQL request returned no entries: {errors}")
    if errors:
        print(f"Warning: RCSB GraphQL reported errors: {errors}", file=sys.stderr)
    return [entry for entry in data['entries'] if entry]


def extract_target(entry):
    """Return the target label for one entry dict.

    The label is the first UniProt ID found across the entry's polymer
    entities (in response order); when none is present it is the first
    non-empty ``pdbx_description``; when neither exists it is ``''``.
    Fields that are missing or null are treated as empty.
    """
    uniprot_ids = []
    description = ''
    for entity in entry.get('polymer_entities') or []:
        if not entity:
            continue
        # `or {}` (not a .get default) because a present-but-null field
        # decodes to None, which would break the chained lookup.
        info = entity.get('rcsb_polymer_entity') or {}
        identifiers = entity.get('rcsb_polymer_entity_container_identifiers') or {}
        if not description:
            description = info.get('pdbx_description') or ''
        uniprot_ids.extend(identifiers.get('uniprot_ids') or [])
    return uniprot_ids[0] if uniprot_ids else description


def fetch_pdb_targets(pdb_ids, batch_size=BATCH_SIZE):
    """Return ``{lower-cased pdb id: target}`` for every entry the API returns.

    Args:
        pdb_ids: List of PDB IDs to look up.
        batch_size: Number of IDs sent per request.
    """
    pdb_to_target = {}
    for start in range(0, len(pdb_ids), batch_size):
        for entry in fetch_entries(pdb_ids[start:start + batch_size]):
            pdb_to_target[entry['rcsb_id'].lower()] = extract_target(entry)
    return pdb_to_target


def group_by_target(pdb_to_target):
    """Group PDB IDs by target, skipping entries whose target is empty.

    An empty target means nothing was resolved for that entry; grouping such
    entries together would create a cluster of unrelated structures.
    """
    groups = defaultdict(list)
    for pdb, target in pdb_to_target.items():
        if target:
            groups[target].append(pdb)
    return dict(groups)


def select_clusters(target_groups):
    """Return ``(clusters_5, clusters_3plus)`` filtered from *target_groups*.

    ``clusters_5`` keeps groups with exactly five members and
    ``clusters_3plus`` keeps groups with three or more.
    """
    clusters_5 = {t: pdbs for t, pdbs in target_groups.items() if len(pdbs) == 5}
    clusters_3plus = {t: pdbs for t, pdbs in target_groups.items() if len(pdbs) >= 3}
    return clusters_5, clusters_3plus


def main():
    """Run the full fetch, cluster, report and save pipeline."""
    pdb_ids = load_pdb_ids(PREDICTIONS_CSV)
    pdb_to_target = fetch_pdb_targets(pdb_ids)

    # Surface IDs the API did not return instead of dropping them silently.
    missing = [pdb for pdb in pdb_ids if pdb.lower() not in pdb_to_target]
    if missing:
        print(f"Warning: {len(missing)} PDB IDs not returned by RCSB: {missing}",
              file=sys.stderr)
    unresolved = sorted(pdb for pdb, target in pdb_to_target.items() if not target)
    if unresolved:
        print(f"Warning: no target found for: {unresolved}", file=sys.stderr)

    target_groups = group_by_target(pdb_to_target)
    clusters_5, clusters_3plus = select_clusters(target_groups)

    print(f"Total PDB IDs mapped: {len(pdb_to_target)}")
    print(f"Unique targets: {len(target_groups)}")
    print(f"Clusters with exactly 5: {len(clusters_5)}")
    print(f"Clusters with 3+: {len(clusters_3plus)}")

    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump({
            'pdb_to_target': pdb_to_target,
            'clusters_5': clusters_5,
            'clusters_3plus': clusters_3plus
        }, f, indent=2)

    print("\nClusters with 5 members:")
    for t, pdbs in sorted(clusters_5.items()):
        print(f" {t}: {pdbs}")

    print("\nClusters with 3-4 members:")
    for t, pdbs in sorted(clusters_3plus.items()):
        if len(pdbs) < 5:
            print(f" {t} ({len(pdbs)}): {pdbs}")

    print("\nSaved to casf2016_target_clusters.json")


if __name__ == "__main__":
    main()