Spaces:
Running
Running
| """Generate a deterministic `drug_synonyms.json` artifact. | |
| This script aggregates aliases from processed `synonyms.parquet`, the | |
| `hf_space/models/canonical_mapper_multisource.json` canonical mapper, and | |
| the frontend `src/lib/drugAliases.js` to produce a mapping of alias -> canonical | |
| ingredient name (all lowercase). The output is written to | |
| `hf_space/models/drug_synonyms.json`. | |
| Run locally in the repo root: | |
| python MEDCARE-DDI-AI/src/preprocessing/generate_drug_synonyms.py | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| import re | |
| from typing import Dict, Set | |
| import pandas as pd | |
# Filesystem locations, all derived from this script's own resolved path.
# ROOT is the fourth ancestor directory of this file.
ROOT = Path(__file__).resolve().parents[3]
# Directory searched for the processed `synonyms.parquet`.
PROCESSED = ROOT.joinpath('data', 'processed')
# NOTE(review): the three paths below hang off ROOT.parents[1], i.e. two
# directory levels *above* ROOT, while PROCESSED hangs off ROOT itself.
# Confirm this matches the intended repository layout.
CANONICAL = ROOT.parents[1].joinpath('hf_space', 'models', 'canonical_mapper_multisource.json')
FRONTEND_ALIASES = ROOT.parents[1].joinpath('src', 'lib', 'drugAliases.js')
# Destination of the generated alias -> canonical JSON artifact.
OUT = ROOT.parents[1].joinpath('hf_space', 'models', 'drug_synonyms.json')
def normalize(text: str) -> str:
    """Return *text* lowercased with whitespace runs collapsed to one space.

    Falsy input (``None``, ``''``, ``0``) yields the empty string; any other
    value is converted with ``str()`` before being cleaned up.
    """
    raw = str(text) if text else ''
    words = raw.strip().lower().split()
    return ' '.join(words)
def load_synonyms_parquet(path: Path) -> Dict[str, str]:
    """Load alias -> canonical-name pairs from ``<path>/synonyms.parquet``.

    Args:
        path: Directory expected to contain ``synonyms.parquet``.

    Returns:
        A dict mapping each normalized ``alias`` to its normalized
        ``canonical_name``. Empty when the file is absent or lacks either
        column. Rows with a missing or blank alias/canonical name are
        skipped. When an alias occurs in several rows, the last usable row
        wins.
    """
    mapping: Dict[str, str] = {}
    parquet_file = path / 'synonyms.parquet'
    if not parquet_file.exists():
        return mapping
    df = pd.read_parquet(parquet_file)
    # Without both columns no row can contribute a pair.
    if 'alias' not in df.columns or 'canonical_name' not in df.columns:
        return mapping
    for alias, canon in zip(df['alias'], df['canonical_name']):
        # Check for missing values before anything evaluates truthiness:
        # bool(pd.NA) raises TypeError on nullable string columns.
        if pd.isna(alias) or pd.isna(canon):
            continue
        key = normalize(alias)
        value = normalize(canon)
        # Skip blanks so an empty canonical name cannot overwrite a valid
        # mapping recorded by an earlier row for the same alias.
        if key and value:
            mapping[key] = value
    return mapping
def load_canonical_json(path: Path) -> Dict[str, str]:
    """Read the canonical mapper JSON and map aliases/ids to primary names.

    Each item under the top-level ``entities`` key contributes its
    ``aliases`` plus its ``canonical_id`` and ``drugbank_id`` (when set),
    all pointing at the normalized ``primary_name`` (falling back to
    ``normalized_name``). Entities without either name are ignored.

    Returns an empty dict when *path* does not exist.
    """
    if not path.exists():
        return {}
    payload = json.loads(path.read_text(encoding='utf8'))
    lookup: Dict[str, str] = {}
    for entity in payload.get('entities', []) or []:
        primary_raw = entity.get('primary_name') or entity.get('normalized_name')
        if primary_raw:
            target = normalize(primary_raw)
            for alias in entity.get('aliases') or []:
                lookup[normalize(alias)] = target
            # Identifiers are registered as aliases too, in this fixed order.
            for id_field in ('canonical_id', 'drugbank_id'):
                identifier = entity.get(id_field)
                if identifier:
                    lookup[normalize(identifier)] = target
    return lookup
def load_frontend_aliases(path: Path) -> Dict[str, str]:
    """Scrape alias -> canonical-name pairs from the frontend alias module.

    This is a crude regex parse, not a JavaScript parser: everything after
    the first ``DRUG_DATABASE`` occurrence is split into flat ``{...}``
    groups, and each group is searched for ``name``, ``atc`` and
    ``marketingNames`` keys. Nested objects inside an entry are not
    supported.

    Args:
        path: Location of the JavaScript file.

    Returns:
        A dict in which every entry's normalized ``name`` maps to itself and
        its ``atc`` code and each ``marketingNames`` item map to that name.
        Empty when the file is absent or has no ``DRUG_DATABASE`` marker.
        Entries without a non-empty ``name`` are skipped.
    """
    mapping: Dict[str, str] = {}
    if not path.exists():
        return mapping
    text = path.read_text(encoding='utf8')
    start = text.find('DRUG_DATABASE')
    if start < 0:
        # find() returns -1 when the marker is absent; bail out explicitly
        # instead of using -1 as a slice index.
        return mapping

    # A quoted literal closed by the SAME quote that opened it, so values
    # such as "St. John's Wort" are not cut off at the apostrophe.
    quoted = r"([\"'])(.*?)\1"
    quoted_re = re.compile(quoted)
    # \b keeps e.g. `nickname:` from being read as the `name:` key.
    name_re = re.compile(r"\bname\s*:\s*" + quoted)
    atc_re = re.compile(r"\batc\s*:\s*" + quoted)
    markets_re = re.compile(r"\bmarketingNames\s*:\s*\[([^\]]+)\]", flags=re.S)

    for entry in re.findall(r"\{([^}]+)\}", text[start:]):
        name_m = name_re.search(entry)
        if not name_m:
            continue
        canonical = normalize(name_m.group(2))
        if not canonical:
            continue
        mapping[canonical] = canonical
        atc_m = atc_re.search(entry)
        if atc_m:
            atc_code = normalize(atc_m.group(2))
            if atc_code:
                mapping[atc_code] = canonical
        markets_m = markets_re.search(entry)
        if markets_m:
            for item_m in quoted_re.finditer(markets_m.group(1)):
                brand = normalize(item_m.group(2))
                if brand:
                    mapping[brand] = canonical
    return mapping
def merge_mappings(*maps) -> Dict[str, str]:
    """Combine several alias maps, letting the earliest map win on conflicts.

    Maps are consumed in argument order and each map's keys in sorted order,
    so the result (including its insertion order) is deterministic. Pairs
    with a falsy key or value are dropped. `main` passes the canonical JSON
    map first, then the synonyms parquet map, then the frontend map.
    """
    merged: Dict[str, str] = {}
    for source in maps:
        for alias, target in sorted(source.items()):
            if not alias or not target:
                continue
            # setdefault only inserts when the alias is not yet present.
            merged.setdefault(alias, target)
    return merged
def main() -> None:
    """Build the merged alias map and write it to ``OUT`` as JSON.

    Sources are loaded from ``PROCESSED``, ``CANONICAL`` and
    ``FRONTEND_ALIASES`` and merged with the canonical JSON taking
    precedence, then the synonyms parquet, then the frontend aliases. A
    small fixed set of brand -> ingredient pairs is added for any alias still
    missing. Keys are written in sorted order and a summary line is printed.
    """
    syn = load_synonyms_parquet(PROCESSED)
    canon = load_canonical_json(CANONICAL)
    front = load_frontend_aliases(FRONTEND_ALIASES)
    merged = merge_mappings(canon, syn, front)

    # Ensure some common brand->ingredient mappings for tests
    # (guarantees a deterministic minimal coverage).
    extras = {
        'aspirin': 'acetylsalicylic acid',
        'asa': 'acetylsalicylic acid',
        'coumadin': 'warfarin',
        'jantoven': 'warfarin',
        'tylenol': 'acetaminophen',
        'panadol': 'acetaminophen',
        'advil': 'ibuprofen',
        'motrin': 'ibuprofen',
    }
    for brand, ingredient in sorted(extras.items()):
        # Loaded data always wins over these fallbacks.
        merged.setdefault(brand, ingredient)

    OUT.parent.mkdir(parents=True, exist_ok=True)
    with OUT.open('w', encoding='utf8') as fh:
        ordered = {alias: merged[alias] for alias in sorted(merged)}
        json.dump(ordered, fh, indent=2, ensure_ascii=False)
    print('Wrote', OUT, 'entries=', len(merged))


if __name__ == '__main__':
    main()