Spaces:

below-threshold
/

ai-response-validator

Sleeping

File size: 6,118 Bytes

"""
Convert a Kaggle drug CSV into features.yaml entries and append to the pharma KB.

Supported CSV formats:
  - Drug Labels & Side Effects (drug_name, medical_condition, side_effects)
  - Drug Dataset: Uses, Side Effects & User Reviews (same columns)
  - Medicine Side-Effects Analysis (drug, use, side_effects / adverse_effects)

Usage:
    python scripts/kaggle_to_yaml.py path/to/drugs.csv [--max 15] [--dry-run] [--domain pharma]

Options:
    --max N      Max number of drugs to import (default: 15)
    --dry-run    Print YAML to stdout instead of appending to features.yaml
    --domain D   Target domain (default: pharma)
"""

import argparse
import csv
import re
import sys
from pathlib import Path

# Knowledge-base root: the "knowledge" directory located one level above the
# directory that contains this script.
KNOWLEDGE_ROOT = Path(__file__).parent.parent / "knowledge"

# Candidate CSV header names for each logical field. _find_col() walks each
# list in order, so earlier entries take priority over later ones.
_DRUG_COLS    = ["drug_name", "Drug Name", "name", "drug", "drugName"]
_USE_COLS     = ["indications", "medical_condition", "condition", "use", "uses",
                 "indication", "Medical Condition"]
_EFFECT_COLS  = ["side_effects", "Side_Effects", "Side Effects", "sideEffects",
                 "adverse_effects", "adverse_events"]
_CONTRA_COLS  = ["contraindications", "contraindication", "Contraindications"]
_WARN_COLS    = ["warnings", "warning", "Warnings", "precautions"]


def _find_col(headers: list[str], candidates: list[str]) -> str | None:
    h_lower = {h.lower().strip(): h for h in headers}
    for c in candidates:
        if c in headers:
            return c
        if c.lower().strip() in h_lower:
            return h_lower[c.lower().strip()]
    return None


def _slugify(s: str) -> str:
    """Lower-case *s* and turn it into a hyphen-separated slug.

    Every run of characters outside ``a-z`` / ``0-9`` collapses into a single
    hyphen, and hyphens at either end are removed.
    """
    lowered = s.lower()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    return hyphenated.strip("-")


def _truncate(text: str, max_items: int = 8) -> str:
    """Normalise a delimited list and cap it at *max_items* entries.

    The delimiter is a semicolon only when semicolons outnumber commas;
    otherwise it is a comma. Entries are stripped, empty ones are dropped, and
    the result is re-joined with ", ". When entries had to be cut off, the
    suffix ", and others" is appended.
    """
    comma_count = text.count(",")
    semicolon_count = text.count(";")
    delimiter = ";" if semicolon_count > comma_count else ","

    entries = []
    for chunk in text.split(delimiter):
        cleaned = chunk.strip()
        if cleaned:
            entries.append(cleaned)

    joined = ", ".join(entries[:max_items])
    if len(entries) > max_items:
        joined += ", and others"
    return joined


def _yaml_dq(value: str) -> str:
    """Escape *value* so it is safe inside a YAML double-quoted scalar."""
    # Backslashes first, otherwise the escapes added for quotes/newlines would
    # themselves be doubled.
    return (
        value.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def _to_yaml_block(doc_id: str, title: str, content: str, tags: list[str]) -> str:
    """Render one knowledge-base document as a YAML list item.

    Args:
        doc_id: Value written to the ``id`` key (emitted unquoted).
        title: Human-readable title; written as a double-quoted scalar.
        content: Body text; written as a folded block scalar (``>``), one
            indented line per line of ``content``.
        tags: Tag strings; written as a flow sequence of double-quoted scalars.

    Returns:
        The YAML fragment, starting with a newline and indented to sit inside
        an existing top-level list.
    """
    # Title and tags live inside double quotes, so embedded quotes and
    # backslashes must be escaped or the resulting YAML is invalid.
    tags_yaml = ", ".join(f'"{_yaml_dq(tag)}"' for tag in tags)
    # The folded block scalar takes text verbatim; escaping quotes there would
    # insert literal backslashes into the content.
    content_yaml = "".join(f"      {line}\n" for line in content.splitlines())
    return (
        f"\n  - id: {doc_id}\n"
        f"    title: \"{_yaml_dq(title)}\"\n"
        f"    content: >\n"
        + content_yaml
        + f"    tags: [{tags_yaml}]\n"
    )


def _cell(row: dict, column: str | None) -> str:
    """Return the stripped text of *column* in *row*, or "" when unavailable.

    csv.DictReader fills fields missing from short rows with None, so
    ``row.get(column, "")`` can still yield None; this helper maps both a
    missing column and a None value to the empty string.
    """
    if not column:
        return ""
    return (row.get(column) or "").strip()


def convert(csv_path: Path, max_drugs: int, dry_run: bool, domain: str) -> None:
    """Turn rows of a drug CSV into YAML documents and append them to the KB.

    The CSV columns are auto-detected from the header row. Each distinct drug
    name (case-insensitive, first occurrence wins) becomes one document, up to
    ``max_drugs`` documents.

    Args:
        csv_path: CSV file to read (decoded as UTF-8, BOM tolerated).
        max_drugs: Maximum number of documents to generate.
        dry_run: When true, print the YAML to stdout and do not touch any file.
        domain: Sub-directory of ``KNOWLEDGE_ROOT`` whose ``features.yaml`` is
            appended to.

    Raises:
        SystemExit: With code 1 when no drug-name column is detected or when
            no document could be generated.
        OSError: Propagated from reading or writing ``features.yaml`` (for
            example when the file does not exist).
    """
    with csv_path.open(newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        headers = list(reader.fieldnames or [])

        drug_col   = _find_col(headers, _DRUG_COLS)
        use_col    = _find_col(headers, _USE_COLS)
        effect_col = _find_col(headers, _EFFECT_COLS)
        contra_col = _find_col(headers, _CONTRA_COLS)
        warn_col   = _find_col(headers, _WARN_COLS)

        if not drug_col:
            print(f"ERROR: no drug name column found. Headers: {headers}", file=sys.stderr)
            sys.exit(1)

        print(f"Detected columns — drug: {drug_col!r}, use: {use_col!r}, "
              f"effects: {effect_col!r}, contra: {contra_col!r}, warnings: {warn_col!r}")

        blocks: list[str] = []
        seen: set[str] = set()

        for i, row in enumerate(reader):
            if len(blocks) >= max_drugs:
                break

            drug = _cell(row, drug_col).title()
            if not drug or drug.lower() in seen:
                continue
            seen.add(drug.lower())

            condition = _cell(row, use_col)
            effects   = _cell(row, effect_col)
            contra    = _cell(row, contra_col)
            warnings  = _cell(row, warn_col)

            condition_str = condition or "the indicated condition"
            # _truncate() returns "" for empty or separator-only input, so the
            # fallback also covers cells such as ",,".
            effects_str = _truncate(effects) or "not listed"

            parts = [
                f"{drug} is indicated for the treatment or management of {condition_str}.",
                f"Known adverse events include: {effects_str}.",
            ]
            if contra:
                parts.append(f"Contraindicated in patients with: {contra}.")
            if warnings:
                parts.append(f"Prescriber warning: {warnings}.")
            parts.append(
                "Serious unexpected adverse events must be reported to the regulatory "
                "authority within 15 days."
            )
            content = " ".join(parts)

            tags = list(filter(None, [
                _slugify(drug),
                _slugify(condition) if condition else None,
                "adverse-event",
                "drug-profile",
            ]))

            # The ID is built from the 1-based CSV row index, so it is tied to
            # the source row and is not contiguous when rows are skipped.
            doc_id = f"pharma_drug_{i + 1:03d}"
            blocks.append(_to_yaml_block(doc_id, f"{drug} — Drug Profile", content, tags))

    if not blocks:
        print("No documents generated. Check the CSV format.", file=sys.stderr)
        sys.exit(1)

    yaml_str = "".join(blocks)

    if dry_run:
        print(yaml_str)
        print(f"\n[dry-run] {len(blocks)} drug entries would be appended.", file=sys.stderr)
        return

    features_path = KNOWLEDGE_ROOT / domain / "features.yaml"
    # Explicit UTF-8: the generated text contains non-ASCII characters and the
    # locale default encoding is not guaranteed to round-trip them.
    original = features_path.read_text(encoding="utf-8")

    # Drop all trailing whitespace from the existing file so the appended
    # entries always start right after a single newline.
    updated = original.rstrip() + "\n" + yaml_str
    features_path.write_text(updated, encoding="utf-8")
    print(f"Appended {len(blocks)} drug entries to {features_path}")


def main() -> None:
    """Parse command-line arguments and run the CSV-to-YAML conversion.

    Exits with status 1 (after printing an error to stderr) when the given CSV
    path does not exist or is a directory.
    """
    parser = argparse.ArgumentParser(description="Convert Kaggle drug CSV to features.yaml entries")
    parser.add_argument("csv", type=Path, help="Path to Kaggle drug CSV")
    parser.add_argument("--max", type=int, default=15, dest="max_drugs",
                        help="Max number of drugs to import (default: 15)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print YAML to stdout instead of appending to features.yaml")
    parser.add_argument("--domain", default="pharma",
                        help="Target domain (default: pharma)")
    args = parser.parse_args()

    if not args.csv.exists():
        print(f"ERROR: file not found: {args.csv}", file=sys.stderr)
        sys.exit(1)
    # A directory passes exists() but cannot be opened as a CSV; fail cleanly
    # instead of surfacing a traceback from open().
    if args.csv.is_dir():
        print(f"ERROR: not a file: {args.csv}", file=sys.stderr)
        sys.exit(1)

    convert(args.csv, args.max_drugs, args.dry_run, args.domain)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()