r"""
Preprocess raw UMLS data into a simplified CSV for FAISS indexing.
Accepts a TSV/CSV with columns CUI, STR (string), DEF, SAB (source).
Outputs a clean CSV with headers: cui,name,definition,source.

Usage:
    python preprocess_umls_data.py \
        --input raw_umls.tsv \
        --output processed/concepts.csv \
        --sep '\t'
"""
| | import argparse |
| | import pandas as pd |
| |
|
# Columns written to the output CSV, in this order.
_OUTPUT_COLUMNS = ['cui', 'name', 'definition', 'source']


def _resolve_separator(sep):
    """Return *sep* with an escaped tab (backslash + "t") turned into a tab.

    A shell passes ``--sep '\\t'`` as two characters.  pandas would treat
    that string as a regular expression and fall back to the slower Python
    parsing engine, so it is translated to a real tab here.
    """
    return '\t' if sep == '\\t' else sep


def _clean_concepts(df):
    """Rename the UMLS columns and return the cleaned concept rows.

    Args:
        df: Raw frame holding at least the CUI and STR columns (or their
            already-renamed equivalents ``cui`` and ``name``).

    Returns:
        A new frame with exactly the columns cui, name, definition and
        source; rows lacking a cui or name are removed and only the first
        row of each (cui, name) pair is kept.

    Raises:
        KeyError: If the cui or name column is absent.
    """
    df = df.rename(columns={
        'CUI': 'cui',
        'STR': 'name',
        'DEF': 'definition',
        'SAB': 'source',
    })

    missing = [col for col in ('cui', 'name') if col not in df.columns]
    if missing:
        raise KeyError(
            f"Input is missing required column(s): {', '.join(missing)}"
        )

    # Definition and source are optional: supply empty columns when absent.
    for optional in ('definition', 'source'):
        if optional not in df.columns:
            df[optional] = ''

    df = df.dropna(subset=['cui', 'name'])
    df = df.fillna({'definition': '', 'source': ''})
    df = df.drop_duplicates(subset=['cui', 'name'])

    # Emit only the documented columns, dropping any extras from the input.
    return df[_OUTPUT_COLUMNS]


def main():
    """Command-line entry point: read raw UMLS data and write the clean CSV.

    Parses ``--input``, ``--output`` and ``--sep`` from the command line,
    loads the input with every column as a string, cleans it and writes the
    result to ``--output`` (creating the parent directory when needed).
    """
    import os

    parser = argparse.ArgumentParser(description="Preprocess UMLS raw data.")
    parser.add_argument('--input', required=True,
                        help='Raw UMLS file (TSV/CSV)')
    parser.add_argument('--output', required=True,
                        help='Output CSV for concepts')
    parser.add_argument('--sep', default='\t',
                        help='Separator for input file')
    args = parser.parse_args()

    # Only empty fields count as missing.  pandas' default NA markers would
    # otherwise turn literal strings such as "NA", "null" or "None" into NaN
    # and the corresponding concepts would be dropped.
    df = pd.read_csv(
        args.input,
        sep=_resolve_separator(args.sep),
        dtype=str,
        keep_default_na=False,
        na_values=[''],
    )

    df = _clean_concepts(df)

    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(args.output, index=False)
    print(f"Processed {len(df)} concepts to {args.output}")
| |
|
# Run the preprocessing only when executed as a script, not on import.
if __name__ == "__main__":
    main()
| |
|