File size: 3,792 Bytes

1d6f391

#!/usr/bin/env python3
"""
Batch IUPAC to WURCS conversion using GlyCosmos API.

Attempts to convert failed IUPAC strings to WURCS using the GlyCoNAVI/GlyCosmos API.
Note: Many of these structures are non-standard bacterial nomenclature that the 
API may not recognize.
"""

import csv
import urllib.request
import urllib.parse
import json
import time
import os

# Versioned root shared by both GlyCosmos format-converter endpoints.
_CONVERTER_ROOT = "https://api.glycosmos.org/glycanformatconverter/2.8.0"

# Endpoints take the percent-encoded IUPAC string appended to the path.
API_URL_CONDENSED = f"{_CONVERTER_ROOT}/iupaccondensed2wurcs/"
API_URL_EXTENDED = f"{_CONVERTER_ROOT}/iupacextended2wurcs/"

# Passed as the `timeout` argument of urlopen for every request.
TIMEOUT = 30
# Passed to time.sleep() after each structure as rate limiting.
DELAY = 0.5

def convert_iupac_to_wurcs(iupac: str) -> tuple:
    """
    Try to convert an IUPAC glycan string to WURCS using the GlyCosmos API.

    The condensed-format endpoint is tried first, then the extended-format
    endpoint; the first non-empty WURCS string found in a response wins.

    Args:
        iupac: IUPAC glycan string. It is percent-encoded (no safe
            characters) and appended to the endpoint URL.

    Returns:
        A ``(wurcs, error_message)`` pair. On success ``wurcs`` is the
        converted string and ``error_message`` is None. On failure ``wurcs``
        is None and ``error_message`` is either
        "Empty or invalid IUPAC string" (nothing was sent to the API) or a
        string starting with "All APIs failed" followed by the reason
        reported for each endpoint.

    This function does not raise for network or parsing problems; they are
    reported through ``error_message`` so a batch run keeps going.
    """
    # Nothing to convert: skip two pointless network round-trips.
    if not isinstance(iupac, str) or not iupac.strip():
        return None, "Empty or invalid IUPAC string"

    failures = []
    for api_name, api_url in [("condensed", API_URL_CONDENSED), ("extended", API_URL_EXTENDED)]:
        try:
            url = api_url + urllib.parse.quote(iupac, safe='')
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0',
                'Accept': 'application/json'
            })
            with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
                result = json.loads(resp.read().decode('utf-8'))
        except Exception as exc:
            # Deliberately broad: this is a best-effort call inside a batch
            # job, so any failure (HTTP error, timeout, bad JSON, ...) is
            # recorded and the next endpoint is tried instead of aborting.
            failures.append(f"{api_name}: {type(exc).__name__}: {exc}")
            continue

        if not isinstance(result, dict):
            failures.append(f"{api_name}: unexpected response format")
            continue

        # The key is checked in both spellings, lower-case first.
        wurcs = result.get('wurcs') or result.get('WURCS')
        if wurcs:
            return wurcs, None

        # Keep whatever explanation the API gave so it reaches the caller.
        failures.append(f"{api_name}: {result.get('message') or 'no WURCS in response'}")

    return None, "All APIs failed (" + "; ".join(failures) + ")"


def main():
    """
    Convert a first batch of failed IUPAC strings and save the results as CSV.

    Reads ``failed_wurcs_conversion.csv`` (located relative to this script),
    sends the ``target`` column of the first ``MAX_TO_PROCESS`` rows through
    ``convert_iupac_to_wurcs``, and writes one result row per structure to
    ``glycosmos_conversion_results.csv``. The optional ``species`` and
    ``kingdom`` columns are copied through when present. Progress and a short
    sample of successful conversions are printed to stdout.

    Raises:
        OSError: if the input file cannot be read or the output file written.
        KeyError: if an input row has no ``target`` column.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(script_dir, "bert_training_v4/downstream_tasks/failed_wurcs_conversion.csv")
    output_file = os.path.join(script_dir, "bert_training_v4/downstream_tasks/glycosmos_conversion_results.csv")

    print(f"Reading from: {input_file}")

    # Read failed conversions.
    # - newline='' lets the csv module handle embedded newlines itself.
    # - utf-8-sig reads plain UTF-8 and also strips a leading BOM, which
    #   would otherwise end up inside the first header name.
    with open(input_file, 'r', newline='', encoding='utf-8-sig') as f:
        rows = list(csv.DictReader(f))

    print(f"Found {len(rows)} structures to convert")

    results = []
    success = 0
    failed = 0

    # Track progress - process in small batches
    MAX_TO_PROCESS = 100  # Start with first 100 for testing
    batch = rows[:MAX_TO_PROCESS]
    total = len(batch)

    for i, row in enumerate(batch):
        iupac = row['target']

        # flush=True: the line has no trailing newline, so without it the
        # progress indicator may sit in the stdout buffer during the request.
        print(f"\r[{i+1}/{total}] Converting... Success: {success}, Failed: {failed}", end="", flush=True)

        wurcs, error = convert_iupac_to_wurcs(iupac)

        results.append({
            'iupac': iupac,
            'wurcs': wurcs or '',
            'error': error or '',
            'success': wurcs is not None,
            'species': row.get('species', ''),
            'kingdom': row.get('kingdom', '')
        })

        if wurcs:
            success += 1
        else:
            failed += 1

        time.sleep(DELAY)  # Rate limiting

    print(f"\n\nResults: {success} success, {failed} failed")

    # Save results; explicit UTF-8 so the output does not depend on the locale.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['iupac', 'wurcs', 'error', 'success', 'species', 'kingdom'])
        writer.writeheader()
        writer.writerows(results)

    print(f"Saved to: {output_file}")

    # Print some successful conversions
    successful = [r for r in results if r['success']]
    if successful:
        print(f"\n=== {len(successful)} Successful Conversions ===")
        for r in successful[:5]:
            print(f"  {r['iupac'][:60]}...")
            print(f"  → {r['wurcs'][:80]}...")


# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()