bertose-affinose-training-code / code /data_processing /convert_iupac_glycosmos.py
supanthadey1's picture
Add BERTose and AFFINose training code release
1d6f391 verified
Raw
History Blame Contribute Delete
3.79 kB
#!/usr/bin/env python3
"""
Batch IUPAC to WURCS conversion using GlyCosmos API.
Attempts to convert failed IUPAC strings to WURCS using the GlyCoNAVI/GlyCosmos API.
Note: Many of these structures are non-standard bacterial nomenclature that the
API may not recognize.
"""
import csv
import urllib.request
import urllib.parse
import json
import time
import os
# GlyCosmos GlycanFormatConverter (v2.8.0) endpoints. The URL-encoded IUPAC
# string is appended directly after the trailing slash of each URL.
API_URL_CONDENSED = "https://api.glycosmos.org/glycanformatconverter/2.8.0/iupaccondensed2wurcs/"
API_URL_EXTENDED = "https://api.glycosmos.org/glycanformatconverter/2.8.0/iupacextended2wurcs/"
# Per-request timeout handed to urllib.request.urlopen (seconds).
TIMEOUT = 30
# Pause handed to time.sleep after each structure in the batch loop (seconds).
DELAY = 0.5 # Rate limiting
def convert_iupac_to_wurcs(iupac: str) -> tuple:
    """
    Try to convert an IUPAC string to WURCS using the GlyCosmos API.

    The condensed-format endpoint is tried first, then the extended-format
    endpoint. The first non-empty WURCS string found in a response (under the
    ``wurcs`` or ``WURCS`` key) is returned.

    Args:
        iupac: IUPAC glycan string to convert.

    Returns:
        ``(wurcs, None)`` on success, or ``(None, error_message)`` when the
        input is empty/non-string or no endpoint produced a WURCS string. In
        the latter case the message starts with ``"All APIs failed"`` and
        lists the reason each endpoint failed.
    """
    # Nothing to convert: avoid two pointless network round-trips.
    if not isinstance(iupac, str) or not iupac.strip():
        return None, "Empty or non-string IUPAC input"

    failures = []
    endpoints = (
        ("condensed", API_URL_CONDENSED),
        ("extended", API_URL_EXTENDED),
    )
    for api_name, api_url in endpoints:
        try:
            url = api_url + urllib.parse.quote(iupac, safe='')
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0',
                'Accept': 'application/json'
            })
            with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
                result = json.loads(resp.read().decode('utf-8'))
        except Exception as exc:
            # Deliberately broad: this is a best-effort batch helper and one
            # bad request (HTTP error, timeout, malformed JSON, ...) must not
            # abort the run. The reason is recorded instead of being dropped.
            failures.append(f"{api_name}: {type(exc).__name__}: {exc}")
            continue

        if not isinstance(result, dict):
            failures.append(
                f"{api_name}: unexpected response type {type(result).__name__}"
            )
            continue

        wurcs = result.get('wurcs') or result.get('WURCS')
        if wurcs:
            return wurcs, None

        # No WURCS in the payload: keep the API's own message when it has one.
        reason = result.get('message') or "no WURCS in response"
        failures.append(f"{api_name}: {reason}")

    return None, "All APIs failed (" + "; ".join(failures) + ")"
def main():
    """
    Convert the failed IUPAC structures listed in a CSV file and save the results.

    Reads ``failed_wurcs_conversion.csv`` (relative to this script), sends the
    ``target`` column of the first ``MAX_TO_PROCESS`` rows through
    ``convert_iupac_to_wurcs``, writes one result row per structure to
    ``glycosmos_conversion_results.csv`` and prints a short summary.

    Raises:
        FileNotFoundError: if the input CSV does not exist.
        KeyError: if a processed row has no ``target`` column.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(script_dir, "bert_training_v4/downstream_tasks/failed_wurcs_conversion.csv")
    output_file = os.path.join(script_dir, "bert_training_v4/downstream_tasks/glycosmos_conversion_results.csv")

    print(f"Reading from: {input_file}")

    # newline='' is what the csv module requires so quoted fields containing
    # line breaks parse correctly; an explicit encoding avoids depending on
    # the platform default for non-ASCII glycan/species text.
    with open(input_file, 'r', newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))

    print(f"Found {len(rows)} structures to convert")

    results = []
    success = 0
    failed = 0

    # Track progress - process in small batches
    MAX_TO_PROCESS = 100  # Start with first 100 for testing
    batch = rows[:MAX_TO_PROCESS]
    total = len(batch)

    for i, row in enumerate(batch):
        iupac = row['target']
        # flush so the carriage-return progress line actually shows up while
        # the (slow) network loop is running.
        print(
            f"\r[{i+1}/{total}] Converting... Success: {success}, Failed: {failed}",
            end="",
            flush=True,
        )

        wurcs, error = convert_iupac_to_wurcs(iupac)
        converted = bool(wurcs)  # single source of truth for row flag and counters

        results.append({
            'iupac': iupac,
            'wurcs': wurcs or '',
            'error': error or '',
            'success': converted,
            'species': row.get('species', ''),
            'kingdom': row.get('kingdom', '')
        })

        if converted:
            success += 1
        else:
            failed += 1

        time.sleep(DELAY)  # Rate limiting

    print(f"\n\nResults: {success} success, {failed} failed")

    # Save results
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['iupac', 'wurcs', 'error', 'success', 'species', 'kingdom'])
        writer.writeheader()
        writer.writerows(results)

    print(f"Saved to: {output_file}")

    # Print some successful conversions
    successful = [r for r in results if r['success']]
    if successful:
        print(f"\n=== {len(successful)} Successful Conversions ===")
        for r in successful[:5]:
            print(f" {r['iupac'][:60]}...")
            print(f" → {r['wurcs'][:80]}...")
if __name__ == "__main__":
main()