| #!/usr/bin/env python3 | |
| """ | |
| Preprocess raw UMLS RRF/TSV into a simple CSV for indexing. | |
| """ | |
| import argparse | |
| import pandas as pd | |
def main(argv=None):
    """Convert a delimited UMLS extract into a de-duplicated CSV.

    Reads the ``--input`` file (fields separated by ``--sep``, tab by
    default), renames the UMLS columns ``CUI``, ``STR``, ``DEF`` and ``SAB``
    to ``cui``, ``name``, ``definition`` and ``source``, drops rows that lack
    a ``cui`` or ``name``, replaces missing definitions/sources with empty
    strings, keeps the first row of every repeated ``(cui, name)`` pair, and
    writes the result to ``--output`` as CSV.  Any other input columns are
    passed through unchanged.

    Args:
        argv: Command-line arguments to parse.  ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, so calling ``main()`` behaves as
            before.

    Raises:
        SystemExit: If the arguments are invalid or the input lacks the
            required ``CUI``/``STR`` (or ``cui``/``name``) columns.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True)
    parser.add_argument('--output', required=True)
    parser.add_argument('--sep', default='\t')
    args = parser.parse_args(argv)

    # dtype=str alone does not stop pandas from treating tokens such as
    # "NA", "None" or "NULL" as missing; those are legitimate term strings
    # and would otherwise be dropped or blanked.  Only truly empty fields
    # are considered missing.
    df = pd.read_csv(
        args.input,
        sep=args.sep,
        dtype=str,
        keep_default_na=False,
        na_values=[''],
    )
    df = df.rename(
        columns={
            'CUI': 'cui',
            'STR': 'name',
            'DEF': 'definition',
            'SAB': 'source',
        }
    )

    # Fail with a readable message instead of a KeyError from dropna().
    required = {'cui': 'CUI', 'name': 'STR'}
    absent = [raw for col, raw in required.items() if col not in df.columns]
    if absent:
        absent_list = ', '.join(absent)
        parser.error(f'{args.input} is missing required column(s): {absent_list}')

    df = df.dropna(subset=['cui', 'name'])

    # Optional columns: blank out missing values, or create the column when
    # the extract does not carry it at all.
    for optional in ('definition', 'source'):
        if optional in df.columns:
            df[optional] = df[optional].fillna('')
        else:
            df[optional] = ''

    df = df.drop_duplicates(subset=['cui', 'name'])
    df.to_csv(args.output, index=False)
    print(f'Wrote {len(df)} concepts to {args.output}')
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()