ddi / src /preprocessing /generate_drug_synonyms.py
github-actions[bot]
Deploy from GitHub Actions (fb28c05c54cf19184fc3f14f1bf3297ba5749ea2)
d29b763
"""Generate a deterministic `drug_synonyms.json` artifact.
This script aggregates aliases from processed `synonyms.parquet`, the
`hf_space/models/canonical_mapper_multisource.json` canonical mapper, and
the frontend `src/lib/drugAliases.js` to produce a mapping of alias -> canonical
ingredient name (all lowercase). The output is written to
`hf_space/models/drug_synonyms.json`.
Run locally in the repo root:
python MEDCARE-DDI-AI/src/preprocessing/generate_drug_synonyms.py
"""
from __future__ import annotations
import json
from pathlib import Path
import re
from typing import Dict, Set
import pandas as pd
# Directory three levels above the one that contains this file
# (parents[0] is the containing directory, parents[3] is three above it).
# NOTE(review): with the invocation path shown in the module docstring
# (MEDCARE-DDI-AI/src/preprocessing/...) this would be the repo root -- confirm.
ROOT = Path(__file__).resolve().parents[3]
# Directory in which `synonyms.parquet` is looked up.
PROCESSED = ROOT / 'data' / 'processed'
# NOTE(review): the three paths below are anchored at ROOT.parents[1], i.e.
# two directories ABOVE ROOT rather than ROOT itself. If ROOT is the repo root
# they resolve outside the repository -- verify against the real layout.
# Canonical mapper JSON input.
CANONICAL = ROOT.parents[1] / 'hf_space' / 'models' / 'canonical_mapper_multisource.json'
# Frontend JavaScript alias source input.
FRONTEND_ALIASES = ROOT.parents[1] / 'src' / 'lib' / 'drugAliases.js'
# Destination of the generated alias -> canonical JSON artifact.
OUT = ROOT.parents[1] / 'hf_space' / 'models' / 'drug_synonyms.json'
def normalize(text: str) -> str:
    """Return *text* lowercased with whitespace runs collapsed to one space.

    Falsy input (``None``, ``''``, ``0``) yields ``''``; any other value is
    converted with ``str()`` before normalization.
    """
    raw = str(text) if text else ''
    tokens = raw.strip().lower().split()
    return ' '.join(tokens)
def load_synonyms_parquet(path: Path) -> Dict[str, str]:
    """Load alias -> canonical-name pairs from ``synonyms.parquet`` in *path*.

    Args:
        path: Directory expected to contain ``synonyms.parquet``.

    Returns:
        A dict mapping each normalized ``alias`` to its normalized
        ``canonical_name``. Rows where either value is missing, or where
        either value normalizes to an empty string, are skipped. When an
        alias occurs more than once, the last row wins. An empty dict is
        returned if the file, or either column, does not exist.
    """
    mapping: Dict[str, str] = {}
    p = path / 'synonyms.parquet'
    if not p.exists():
        return mapping
    df = pd.read_parquet(p)
    # Without both columns there is nothing to pair up.
    if 'alias' not in df.columns or 'canonical_name' not in df.columns:
        return mapping
    # Iterate the two columns directly; this avoids building a Series per row.
    for alias, canon in zip(df['alias'], df['canonical_name']):
        # pd.isna covers None, NaN and pd.NA. The values are deliberately not
        # truth-tested first, because bool(pd.NA) raises TypeError.
        if pd.isna(alias) or pd.isna(canon):
            continue
        key = normalize(alias)
        value = normalize(canon)
        # An empty alias or canonical name carries no usable information.
        if key and value:
            mapping[key] = value
    return mapping
def load_canonical_json(path: Path) -> Dict[str, str]:
    """Build an alias -> primary-name mapping from the canonical mapper JSON.

    Args:
        path: Location of the JSON file; a missing file yields an empty dict.

    Returns:
        A dict whose keys are the normalized aliases, ``canonical_id`` and
        ``drugbank_id`` of every entry under ``entities``, each pointing at
        that entry's normalized ``primary_name`` (falling back to
        ``normalized_name``). Entries with neither name are ignored.
    """
    if not path.exists():
        return {}
    payload = json.loads(path.read_text(encoding='utf8'))
    result = {}
    for entity in payload.get('entities', []) or []:
        name = entity.get('primary_name') or entity.get('normalized_name')
        if not name:
            continue
        target = normalize(name)
        for alias in entity.get('aliases') or []:
            result[normalize(alias)] = target
        # Identifiers are registered after the aliases, so they take
        # precedence on a key clash within the same entity.
        for id_field in ('canonical_id', 'drugbank_id'):
            identifier = entity.get(id_field)
            if identifier:
                result[normalize(identifier)] = target
    return result
def load_frontend_aliases(path: Path) -> Dict[str, str]:
    """Extract alias -> canonical pairs from the frontend alias JS source.

    The file is not parsed as JavaScript. Every brace-delimited span after
    the first occurrence of ``DRUG_DATABASE`` is scanned with regular
    expressions for ``name``, ``atc`` and ``marketingNames`` fields.

    Args:
        path: Location of the JS file; a missing file yields an empty dict.

    Returns:
        A dict mapping each entry's normalized name to itself, and its
        normalized ATC code and marketing names to that normalized name.
        Spans without a ``name`` field are ignored.
    """
    if not path.exists():
        return {}

    name_re = re.compile(r"name\s*:\s*[\"']([^\"']+)[\"']")
    markets_re = re.compile(r"marketingNames\s*:\s*\[([^\]]+)\]", flags=re.S)
    atc_re = re.compile(r"atc\s*:\s*[\"']([^\"']+)[\"']")
    quoted_re = re.compile(r"[\"']([^\"']+)[\"']")

    source = path.read_text(encoding='utf8')
    start = source.find('DRUG_DATABASE')
    # Crude scan: each "{...}" span without a nested "}" is one candidate entry.
    chunks = re.findall(r"\{([^}]+)\}", source[start:])

    result = {}
    for chunk in chunks:
        name_match = name_re.search(chunk)
        if name_match is None:
            continue
        canonical = normalize(name_match.group(1))
        result[canonical] = canonical

        atc_match = atc_re.search(chunk)
        if atc_match is not None:
            result[normalize(atc_match.group(1))] = canonical

        markets_match = markets_re.search(chunk)
        if markets_match is not None:
            for brand in quoted_re.findall(markets_match.group(1)):
                result[normalize(brand)] = canonical
    return result
def merge_mappings(*maps) -> Dict[str, str]:
    """Merge several alias maps, keeping the first value seen for each key.

    Maps are consumed in argument order, so callers pass them from highest
    to lowest priority. Within one map, items are visited in sorted order to
    keep the result's insertion order deterministic. Pairs with a falsy key
    or falsy value are dropped.
    """
    merged: Dict[str, str] = {}
    for source in maps:
        for alias, canonical in sorted(source.items()):
            if not alias or not canonical:
                continue
            # Earlier maps win: never overwrite an alias already recorded.
            merged.setdefault(alias, canonical)
    return merged
def main() -> None:
    """Build the merged alias map and write it to ``OUT`` as sorted JSON.

    The three sources are merged with the canonical JSON taking precedence,
    then the synonyms parquet, then the frontend aliases. A small fixed set
    of brand -> ingredient pairs is added for any alias still missing.
    """
    synonyms_map = load_synonyms_parquet(PROCESSED)
    canonical_map = load_canonical_json(CANONICAL)
    frontend_map = load_frontend_aliases(FRONTEND_ALIASES)
    merged = merge_mappings(canonical_map, synonyms_map, frontend_map)

    # Minimal guaranteed coverage of common brand names (relied on by tests);
    # these never replace a mapping that the sources already provided.
    fallback = {
        'aspirin': 'acetylsalicylic acid',
        'asa': 'acetylsalicylic acid',
        'coumadin': 'warfarin',
        'jantoven': 'warfarin',
        'tylenol': 'acetaminophen',
        'panadol': 'acetaminophen',
        'advil': 'ibuprofen',
        'motrin': 'ibuprofen',
    }
    for brand in sorted(fallback):
        merged.setdefault(brand, fallback[brand])

    OUT.parent.mkdir(parents=True, exist_ok=True)
    # Keys are emitted in sorted order so the artifact is byte-stable.
    ordered = dict(sorted(merged.items()))
    with open(OUT, 'w', encoding='utf8') as fh:
        json.dump(ordered, fh, indent=2, ensure_ascii=False)
    print('Wrote', OUT, 'entries=', len(merged))


if __name__ == '__main__':
    main()