| |
| """ |
| Batch IUPAC to WURCS conversion using GlyCosmos API. |
| |
| Attempts to convert failed IUPAC strings to WURCS using the GlyCoNAVI/GlyCosmos API. |
| Note: Many of these structures are non-standard bacterial nomenclature that the |
| API may not recognize. |
| """ |
|
|
| import csv |
| import urllib.request |
| import urllib.parse |
| import json |
| import time |
| import os |
|
|
# Versioned root of the GlyCosmos glycan-format-converter service.
_API_BASE = "https://api.glycosmos.org/glycanformatconverter/2.8.0"

# Conversion endpoints; the URL-encoded IUPAC string is appended directly
# after the trailing slash.
API_URL_CONDENSED = f"{_API_BASE}/iupaccondensed2wurcs/"
API_URL_EXTENDED = f"{_API_BASE}/iupacextended2wurcs/"

TIMEOUT = 30  # seconds; passed to urlopen() as the per-request timeout
DELAY = 0.5  # seconds; passed to time.sleep() between structures
|
|
def convert_iupac_to_wurcs(iupac: str) -> tuple:
    """Convert an IUPAC glycan string to WURCS using the GlyCosmos API.

    The condensed-IUPAC endpoint is tried first, then the extended-IUPAC
    endpoint. The first non-empty WURCS string found in a response (under
    the key ``wurcs`` or ``WURCS``) is returned.

    Args:
        iupac: The IUPAC string to convert.

    Returns:
        A ``(wurcs, error_message)`` pair. On success ``error_message`` is
        ``None``. On failure ``wurcs`` is ``None`` and ``error_message`` is
        either ``"Empty or invalid IUPAC string"`` (no request is sent) or
        a string starting with ``"All APIs failed"`` followed by the reason
        recorded for each endpoint.
    """
    if not isinstance(iupac, str) or not iupac.strip():
        # Nothing convertible: skip two pointless network round-trips.
        return None, "Empty or invalid IUPAC string"

    endpoints = (
        ("condensed", API_URL_CONDENSED),
        ("extended", API_URL_EXTENDED),
    )
    failures = []

    for api_name, api_url in endpoints:
        try:
            url = api_url + urllib.parse.quote(iupac, safe='')
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0',
                'Accept': 'application/json'
            })
            with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
                result = json.loads(resp.read().decode('utf-8'))
        except Exception as exc:
            # Deliberately broad: this is a best-effort batch helper, so any
            # network, HTTP or decoding failure just moves on to the next
            # endpoint. The reason is kept instead of being discarded.
            failures.append(f"{api_name}: {type(exc).__name__}: {exc}")
            continue

        if not isinstance(result, dict):
            failures.append(
                f"{api_name}: unexpected response type {type(result).__name__}"
            )
            continue

        wurcs = result.get('wurcs') or result.get('WURCS')
        if wurcs:
            return wurcs, None

        # No WURCS in the payload; prefer the API's own message if present.
        failures.append(
            f"{api_name}: {result.get('message') or 'no WURCS in response'}"
        )

    return None, "All APIs failed (" + "; ".join(failures) + ")"
|
|
|
|
def main(max_to_process=100):
    """Convert a batch of failed IUPAC strings and save the results as CSV.

    Reads ``failed_wurcs_conversion.csv`` from the
    ``bert_training_v4/downstream_tasks`` directory next to this script,
    sends the ``target`` column of each row to ``convert_iupac_to_wurcs``,
    and writes one result row per structure to
    ``glycosmos_conversion_results.csv`` in the same directory. The
    ``species`` and ``kingdom`` columns are copied through when present.

    Args:
        max_to_process: Maximum number of input rows to convert (default
            100, the previously hard-coded limit). Pass ``None`` to process
            every row.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(script_dir, "bert_training_v4/downstream_tasks/failed_wurcs_conversion.csv")
    output_file = os.path.join(script_dir, "bert_training_v4/downstream_tasks/glycosmos_conversion_results.csv")

    print(f"Reading from: {input_file}")

    # Explicit UTF-8 keeps the result independent of the platform default
    # encoding; newline='' is what the csv module expects for its files.
    with open(input_file, 'r', newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))

    print(f"Found {len(rows)} structures to convert")

    batch = rows[:max_to_process]
    total = len(batch)

    results = []
    success = 0
    failed = 0

    for i, row in enumerate(batch):
        iupac = row['target']

        # flush=True: without a trailing newline the progress line may
        # otherwise sit in the stdout buffer until the loop has finished.
        print(
            f"\r[{i+1}/{total}] Converting... Success: {success}, Failed: {failed}",
            end="",
            flush=True,
        )

        wurcs, error = convert_iupac_to_wurcs(iupac)

        results.append({
            'iupac': iupac,
            'wurcs': wurcs or '',
            'error': error or '',
            'success': wurcs is not None,
            'species': row.get('species', ''),
            'kingdom': row.get('kingdom', '')
        })

        if wurcs:
            success += 1
        else:
            failed += 1

        # Pause between structures to avoid hammering the API.
        time.sleep(DELAY)

    print(f"\n\nResults: {success} success, {failed} failed")

    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['iupac', 'wurcs', 'error', 'success', 'species', 'kingdom'])
        writer.writeheader()
        writer.writerows(results)

    print(f"Saved to: {output_file}")

    # Show a short preview (at most five) of the successful conversions.
    successful = [r for r in results if r['success']]
    if successful:
        print(f"\n=== {len(successful)} Successful Conversions ===")
        for r in successful[:5]:
            print(f"  {r['iupac'][:60]}...")
            print(f"    → {r['wurcs'][:80]}...")
|
|
|
|
# Script entry point: run the batch conversion only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|