#!/usr/bin/env python3
"""
Batch IUPAC to WURCS conversion using GlyCosmos API.

Attempts to convert failed IUPAC strings to WURCS using the
GlyCoNAVI/GlyCosmos API.

Note: Many of these structures are non-standard bacterial nomenclature
that the API may not recognize.
"""

import csv
import http.client
import json
import os
import time
import urllib.parse
import urllib.request

API_URL_CONDENSED = "https://api.glycosmos.org/glycanformatconverter/2.8.0/iupaccondensed2wurcs/"
API_URL_EXTENDED = "https://api.glycosmos.org/glycanformatconverter/2.8.0/iupacextended2wurcs/"
TIMEOUT = 30
DELAY = 0.5  # Rate limiting
MAX_TO_PROCESS = 100  # Start with first 100 for testing

# Endpoints are tried in this order; the first one that yields a WURCS wins.
DEFAULT_ENDPOINTS = (
    ("condensed", API_URL_CONDENSED),
    ("extended", API_URL_EXTENDED),
)


def convert_iupac_to_wurcs(iupac: str, endpoints=None) -> tuple:
    """
    Try to convert IUPAC to WURCS using GlyCosmos API.

    Each endpoint is queried in turn with the URL-quoted IUPAC string
    appended to its base URL. The first response containing a non-empty
    'wurcs' or 'WURCS' field is returned.

    Args:
        iupac: The IUPAC string to convert.
        endpoints: Optional iterable of (name, base_url) pairs to try in
            order. Defaults to the condensed endpoint followed by the
            extended endpoint.

    Returns:
        (wurcs, error_message). On success error_message is None. On
        failure wurcs is None and error_message starts with
        "All APIs failed", followed by the per-endpoint reasons when any
        were collected.
    """
    # Nothing to convert: skip the network round-trips entirely.
    if not isinstance(iupac, str) or not iupac.strip():
        return None, "Empty IUPAC string"

    if endpoints is None:
        endpoints = DEFAULT_ENDPOINTS

    failures = []
    for api_name, api_url in endpoints:
        try:
            url = api_url + urllib.parse.quote(iupac, safe='')
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0',
                'Accept': 'application/json'
            })
            with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
                result = json.loads(resp.read().decode('utf-8'))
        except (OSError, ValueError, http.client.HTTPException) as exc:
            # OSError covers URLError/HTTPError/timeouts; ValueError covers
            # malformed URLs, bad JSON and undecodable bytes. Remember the
            # reason and fall through to the next endpoint.
            failures.append(f"{api_name}: {exc}")
            continue

        if isinstance(result, dict):
            wurcs = result.get('wurcs') or result.get('WURCS')
            if wurcs:
                return wurcs, None
            reason = result.get('message') or "no WURCS in response"
        else:
            reason = "unexpected response format"
        failures.append(f"{api_name}: {reason}")

    if failures:
        return None, "All APIs failed (" + "; ".join(failures) + ")"
    return None, "All APIs failed"


def main(max_to_process=MAX_TO_PROCESS):
    """
    Convert the failed IUPAC strings listed in the input CSV and save results.

    Reads 'failed_wurcs_conversion.csv' (relative to this script), converts
    the 'target' column of the first `max_to_process` rows, and writes one
    row per structure to 'glycosmos_conversion_results.csv'.

    Args:
        max_to_process: Maximum number of rows to convert; None converts
            every row.

    Raises:
        KeyError: If a row has no 'target' column.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(script_dir, "bert_training_v4/downstream_tasks/failed_wurcs_conversion.csv")
    output_file = os.path.join(script_dir, "bert_training_v4/downstream_tasks/glycosmos_conversion_results.csv")

    print(f"Reading from: {input_file}")

    # Read failed conversions
    # NOTE(review): assumes the input CSV is UTF-8 encoded - confirm.
    with open(input_file, 'r', newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))

    print(f"Found {len(rows)} structures to convert")

    results = []
    success = 0
    failed = 0

    # Track progress - process in small batches
    batch = rows[:max_to_process]
    total = len(batch)

    for i, row in enumerate(batch):
        iupac = row['target']
        print(f"\r[{i+1}/{total}] Converting... Success: {success}, Failed: {failed}", end="")

        wurcs, error = convert_iupac_to_wurcs(iupac)

        results.append({
            'iupac': iupac,
            'wurcs': wurcs or '',
            'error': error or '',
            'success': wurcs is not None,
            'species': row.get('species', ''),
            'kingdom': row.get('kingdom', '')
        })

        if wurcs:
            success += 1
        else:
            failed += 1

        # Rate limiting; no need to wait once the last request is done.
        if i + 1 < total:
            time.sleep(DELAY)

    print(f"\n\nResults: {success} success, {failed} failed")

    # Save results
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['iupac', 'wurcs', 'error', 'success', 'species', 'kingdom'])
        writer.writeheader()
        writer.writerows(results)

    print(f"Saved to: {output_file}")

    # Print some successful conversions
    successful = [r for r in results if r['success']]
    if successful:
        print(f"\n=== {len(successful)} Successful Conversions ===")
        for r in successful[:5]:
            print(f"  {r['iupac'][:60]}...")
            print(f"  → {r['wurcs'][:80]}...")


if __name__ == "__main__":
    main()