below-threshold committed on
Commit
8103f33
·
1 Parent(s): f7a25db

Add Kaggle drug CSV → features.yaml conversion script

Browse files
Files changed (1) hide show
  1. scripts/kaggle_to_yaml.py +151 -0
scripts/kaggle_to_yaml.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Convert a Kaggle drug CSV into features.yaml entries and append to the pharma KB.
3
+
4
+ Supported CSV formats:
5
+ - Drug Labels & Side Effects (drug_name, medical_condition, side_effects)
6
+ - Drug Dataset: Uses, Side Effects & User Reviews (same columns)
7
+ - Medicine Side-Effects Analysis (drug, use, side_effects / adverse_effects)
8
+
9
+ Usage:
10
+ python scripts/kaggle_to_yaml.py path/to/drugs.csv [--max 15] [--dry-run]
11
+
12
+ Options:
13
+ --max N Max number of drugs to import (default: 15)
14
+ --dry-run Print YAML to stdout instead of appending to features.yaml
15
+ --domain D Target domain (default: pharma)
16
+ """
17
+
18
+ import argparse
19
+ import csv
20
+ import re
21
+ import sys
22
+ from pathlib import Path
23
+
24
# Knowledge-base root: the "knowledge" directory that sits next to this
# script's parent directory.
KNOWLEDGE_ROOT = Path(__file__).parent.parent / "knowledge"

# Candidate header names, in priority order, for each logical CSV column.
# Matching against the real headers is done by _find_col.
_DRUG_COLS = [
    "drug_name",
    "Drug Name",
    "name",
    "drug",
    "drugName",
]
_USE_COLS = [
    "medical_condition",
    "condition",
    "use",
    "uses",
    "indication",
    "Medical Condition",
]
_EFFECT_COLS = [
    "side_effects",
    "Side_Effects",
    "Side Effects",
    "sideEffects",
    "adverse_effects",
    "adverse_events",
]
30
+
31
+
32
+ def _find_col(headers: list[str], candidates: list[str]) -> str | None:
33
+ h_lower = {h.lower().strip(): h for h in headers}
34
+ for c in candidates:
35
+ if c in headers:
36
+ return c
37
+ if c.lower().strip() in h_lower:
38
+ return h_lower[c.lower().strip()]
39
+ return None
40
+
41
+
42
def _slugify(s: str) -> str:
    """Lower-case ``s`` and turn it into a hyphen-separated slug.

    Every run of characters outside ``a-z`` / ``0-9`` becomes a single
    hyphen, and hyphens at either end of the result are removed.
    """
    lowered = s.lower()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    return hyphenated.strip("-")
44
+
45
+
46
def _truncate(text: str, max_items: int = 8) -> str:
    """Normalise a delimited list to ``", "``-separated text, capped in length.

    The delimiter is ``;`` only when semicolons strictly outnumber commas;
    otherwise it is ``,``. Empty items are dropped. When more than
    ``max_items`` items remain, only the first ``max_items`` are kept and
    ``", and others"`` is appended.
    """
    comma_count = text.count(",")
    semicolon_count = text.count(";")
    delimiter = ";" if semicolon_count > comma_count else ","

    parts = []
    for piece in text.split(delimiter):
        piece = piece.strip()
        if piece:
            parts.append(piece)

    if len(parts) <= max_items:
        return ", ".join(parts)
    return ", ".join(parts[:max_items]) + ", and others"
52
+
53
+
54
def _to_yaml_block(
    doc_id: str,
    title: str,
    content: str,
    tags: list[str],
    *,
    indent: int = 2,
) -> str:
    """Render one document as a YAML list item with id/title/content/tags.

    Args:
        doc_id: Written unquoted after ``id:``; callers must pass a value that
            is a valid plain YAML scalar.
        title: Written as a double-quoted scalar with quotes, backslashes and
            control characters escaped.
        content: Written as a folded block scalar (``>``), one output line per
            input line, so it needs no quoting.
        tags: Written as a flow sequence of double-quoted, escaped scalars.
        indent: Number of spaces before the ``-`` of the list item. Keys are
            indented two further spaces and content lines four.
            NOTE(review): the default of 2 assumes the documents list in
            features.yaml is nested one level under a top-level key -- confirm
            against the real file.

    Returns:
        The YAML text for the item, starting with a blank-line separator and
        ending with a newline.
    """
    import json  # local import: only needed for scalar quoting

    def quoted(value: str) -> str:
        # A JSON string literal is also a valid YAML double-quoted scalar, so
        # json.dumps handles '"', '\\' and control characters for us.
        return json.dumps(value, ensure_ascii=False)

    item_pad = " " * indent
    key_pad = " " * (indent + 2)
    body_pad = " " * (indent + 4)

    tags_yaml = ", ".join(quoted(tag) for tag in tags)
    content_lines = "".join(f"{body_pad}{line}\n" for line in content.splitlines())

    return (
        f"\n{item_pad}- id: {doc_id}\n"
        f"{key_pad}title: {quoted(title)}\n"
        f"{key_pad}content: >\n"
        + content_lines
        + f"{key_pad}tags: [{tags_yaml}]\n"
    )
64
+
65
+
66
def _cell(row: dict, column: str | None) -> str:
    """Return the stripped text of ``row[column]``, or "" when absent."""
    if not column:
        return ""
    # csv.DictReader stores None for cells missing from a short row, so
    # ``row.get(column, "")`` alone can still yield None.
    return (row.get(column) or "").strip()


def convert(csv_path: Path, max_drugs: int, dry_run: bool, domain: str) -> None:
    """Convert rows of a drug CSV into YAML entries and append or print them.

    The drug, use and side-effect columns are detected from the CSV header.
    One entry is produced per distinct drug name (compared case-insensitively),
    up to ``max_drugs`` entries.

    Args:
        csv_path: CSV file to read (decoded as UTF-8, BOM tolerated).
        max_drugs: Maximum number of entries to generate.
        dry_run: When true, print the YAML to stdout and leave files untouched.
        domain: Sub-directory of ``KNOWLEDGE_ROOT`` whose ``features.yaml``
            receives the entries.

    Exits with status 1 (after an explanatory message on stderr) when no drug
    column is found, when no entries are generated, or when the target
    ``features.yaml`` does not exist.
    """
    blocks: list[str] = []
    seen: set[str] = set()

    with csv_path.open(newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        headers = list(reader.fieldnames or [])

        drug_col = _find_col(headers, _DRUG_COLS)
        use_col = _find_col(headers, _USE_COLS)
        effect_col = _find_col(headers, _EFFECT_COLS)

        if not drug_col:
            print(f"ERROR: no drug name column found. Headers: {headers}", file=sys.stderr)
            sys.exit(1)

        # Diagnostics go to stderr so that --dry-run leaves stdout as pure YAML.
        print(
            f"Detected columns — drug: {drug_col!r}, use: {use_col!r}, effects: {effect_col!r}",
            file=sys.stderr,
        )

        # The rows must be consumed while the file is still open.
        for i, row in enumerate(reader):
            if len(blocks) >= max_drugs:
                break

            drug = _cell(row, drug_col).title()
            drug_key = drug.lower()
            if not drug or drug_key in seen:
                continue
            seen.add(drug_key)

            condition = _cell(row, use_col)
            effects = _cell(row, effect_col)
            condition_str = condition or "the indicated condition"
            effects_str = _truncate(effects) if effects else "not listed"

            content = (
                f"{drug} is indicated for the treatment or management of {condition_str}. "
                f"Known adverse events associated with {drug} include: {effects_str}. "
                f"Prescribers should monitor patients and report serious unexpected occurrences "
                f"to the regulatory authority within 15 days."
            )

            tags = [
                tag
                for tag in (
                    _slugify(drug),
                    _slugify(condition) if condition else None,
                    "adverse-event",
                    "drug-profile",
                )
                if tag
            ]

            # NOTE(review): the id is derived from the CSV row index (so ids
            # have gaps when rows are skipped) and always uses the "pharma"
            # prefix regardless of ``domain``; re-running the script can
            # therefore append ids that already exist -- confirm this is
            # acceptable to consumers of features.yaml.
            doc_id = f"pharma_drug_{i + 1:03d}"
            blocks.append(_to_yaml_block(doc_id, f"{drug} — Drug Profile", content, tags))

    if not blocks:
        print("No documents generated. Check the CSV format.", file=sys.stderr)
        sys.exit(1)

    yaml_str = "".join(blocks)

    if dry_run:
        print(yaml_str)
        print(f"\n[dry-run] {len(blocks)} drug entries would be appended.", file=sys.stderr)
        return

    features_path = KNOWLEDGE_ROOT / domain / "features.yaml"
    try:
        # Explicit UTF-8: the generated entries contain non-ASCII characters,
        # so the locale default encoding must not be relied upon.
        original = features_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        print(f"ERROR: features file not found: {features_path}", file=sys.stderr)
        sys.exit(1)

    # Drop trailing whitespace so exactly one newline precedes the new entries.
    updated = original.rstrip() + "\n" + yaml_str
    features_path.write_text(updated, encoding="utf-8")
    print(f"Appended {len(blocks)} drug entries to {features_path}")
133
+
134
+
135
def main() -> None:
    """Parse the command line and run the conversion.

    Exits with status 1 when the given CSV path does not exist.
    """
    cli = argparse.ArgumentParser(description="Convert Kaggle drug CSV to features.yaml entries")
    cli.add_argument("csv", type=Path, help="Path to Kaggle drug CSV")
    cli.add_argument("--max", dest="max_drugs", type=int, default=15)
    cli.add_argument("--dry-run", action="store_true")
    cli.add_argument("--domain", default="pharma")
    opts = cli.parse_args()

    csv_file = opts.csv
    if csv_file.exists():
        convert(csv_file, opts.max_drugs, opts.dry_run, opts.domain)
        return

    print(f"ERROR: file not found: {csv_file}", file=sys.stderr)
    sys.exit(1)
148
+
149
+
150
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()