File size: 4,476 Bytes
b6f9fa8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
scripts/fix_fda_chunk_text.py
==============================
One-time fix: replaces the verbose FDA boilerplate prefix in all FDA DailyMed
chunk_text entries in the metadata store with a clean, BM25-friendly prefix.

Before: [FDA DRUG LABEL — These highlights do not include all the information
         needed to use WARFARIN SODIUM TABLETS safely and effectively...]
         CONTRAINDICATIONS: actual content...

After:  [FDA DailyMed | Warfarin | CONTRAINDICATIONS] Warfarin CONTRAINDICATIONS: actual content...

Usage:
    python scripts/fix_fda_chunk_text.py
    python scripts/fix_fda_chunk_text.py --dry-run
"""
from __future__ import annotations

import argparse
import logging
import pickle
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import yaml

# Configure root logging for this script: INFO level, lines formatted as
# "<timestamp> [<LEVEL>] <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Module-level logger used by main() for progress and dry-run previews.
logger = logging.getLogger(__name__)

# Section codes that may appear in a chunk_id, mapped to the section title
# used in the rewritten prefix.
SECTION_CODES = {
    "34068-7": "DOSAGE AND ADMINISTRATION",
    "34070-3": "CONTRAINDICATIONS",
    "43685-7": "WARNINGS AND PRECAUTIONS",
    "34067-9": "INDICATIONS AND USAGE",
    "34073-7": "DRUG INTERACTIONS",
    "34071-1": "WARNINGS",
}

# Matches both old boilerplate and previously-fixed format: a leading
# "[FDA ...]" bracket, optionally followed by a "HEADER:" label. The label is
# captured as `header` so it can serve as a section-name fallback.
# NOTE(review): the optional header consumes text up to the FIRST colon, so a
# chunk with no "SECTION:" label whose body starts with a letter loses
# everything before its first colon — confirm inputs always carry a label.
_BOILERPLATE_RE = re.compile(
    r"^\[FDA[^\]]*\]\s*(?:(?P<header>[A-Za-z][^:]*):\s*)?", re.DOTALL
)

# Matches the bracket written by this script: "[FDA DailyMed | drug | section]".
_FIXED_PREFIX_RE = re.compile(
    r"^\[FDA DailyMed \|[^|\]]*\|\s*(?P<section>[^\]]+?)\s*\]"
)

# Recognises a set_id part of the chunk_id (starts with 8 hex digits and "-").
_UUID_RE = re.compile(r"^[0-9a-f]{8}-", re.I)


def fix_chunk_text(chunk_id: str, old_text: str) -> str:
    """Return cleaned chunk_text with a compact, keyword-rich prefix.

    Args:
        chunk_id: Identifier of the form
            ``fda_{drug_name}_{set_id}_{code}_{offset}``; the drug name may
            span several ``_``-separated parts.
        old_text: Current chunk text, either with the verbose
            ``[FDA DRUG LABEL ...]`` boilerplate or already in the
            ``[FDA DailyMed | drug | section]`` format.

    Returns:
        ``"[FDA DailyMed | {drug} | {section}] {drug} {section}: {content}"``.
        The drug name falls back to ``"Unknown"`` and the section to
        ``"DRUG INFORMATION"`` when neither chunk_id nor text provides them.
        Re-applying the function to its own output returns it unchanged.
    """
    parts = chunk_id.split("_")

    # The drug name is every part between "fda" and the first known section
    # code.
    section_name = None
    name_end = None
    for i, part in enumerate(parts[1:], 1):
        if part in SECTION_CODES:
            section_name = SECTION_CODES[part]
            name_end = i
            break

    # Unknown section code: the drug name still ends where the set_id begins,
    # so recover it instead of reporting "Unknown".
    if name_end is None:
        name_end = next(
            (i for i, part in enumerate(parts[1:], 1) if _UUID_RE.match(part)),
            None,
        )

    # Filter out set_id parts that precede the section code.
    drug_name_parts = (
        [p for p in parts[1:name_end] if not _UUID_RE.match(p)]
        if name_end is not None
        else []
    )
    drug_name = " ".join(drug_name_parts).title() if drug_name_parts else "Unknown"

    m = _BOILERPLATE_RE.match(old_text)

    if not section_name:
        # Prefer the section recorded in an already-fixed bracket: in that
        # format the header label reads "<drug> <section>", which would
        # otherwise duplicate the drug name on every re-run.
        fixed = _FIXED_PREFIX_RE.match(old_text)
        if fixed:
            section_name = fixed.group("section")
        elif m and m.group("header"):
            section_name = m.group("header").strip()
        else:
            section_name = "DRUG INFORMATION"

    # Strip the old prefix (bracket + optional header) and keep just the content.
    content = old_text[m.end():].strip() if m else old_text.strip()

    # Prepend drug name into content so BM25 finds it even in continuation chunks
    # e.g. chunk starting "Bleeding tendencies..." now reads "Warfarin CONTRAINDICATIONS: Bleeding..."
    return f"[FDA DailyMed | {drug_name} | {section_name}] {drug_name} {section_name}: {content}"


def main() -> None:
    """Rewrite the chunk_text prefix of every FDA DailyMed entry in the metadata store.

    Reads ``retrieval.metadata_path`` from ``config.yaml`` in the current
    working directory, loads the pickled store, and passes each entry whose
    ``source`` is ``"FDA DailyMed"`` and whose ``chunk_text`` starts with a
    recognised FDA prefix through ``fix_chunk_text``. With ``--dry-run`` the
    first three before/after pairs are logged and nothing is written;
    otherwise the updated store is pickled back to the same path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    with open("config.yaml") as f:
        cfg = yaml.safe_load(f)
    meta_path = cfg["retrieval"]["metadata_path"]

    logger.info("Loading metadata store from %s ...", meta_path)
    # NOTE: pickle.load can execute arbitrary code; the metadata store must
    # come from a trusted location.
    with open(meta_path, "rb") as f:
        store: dict = pickle.load(f)

    fda_keys = [k for k, v in store.items() if v.get("source") == "FDA DailyMed"]
    logger.info("Found %d FDA DailyMed entries to fix", len(fda_keys))

    # Re-run on both old boilerplate AND previously-fixed entries (to fix UUID + add drug name to content)
    known_prefixes = ("[FDA DRUG LABEL", "[FDA DailyMed |")

    fixed = 0
    for key in fda_keys:
        entry = store[key]
        # `or ""` also covers an entry whose chunk_text is explicitly None.
        old_text = entry.get("chunk_text") or ""
        if not old_text.startswith(known_prefixes):
            continue
        new_text = fix_chunk_text(entry.get("chunk_id", ""), old_text)
        if args.dry_run:
            # Preview only the first few rewrites to keep the log readable.
            if fixed < 3:
                logger.info("BEFORE: %s", old_text[:120])
                logger.info("AFTER:  %s", new_text[:120])
                logger.info("---")
        else:
            entry["chunk_text"] = new_text
        fixed += 1

    logger.info("%d entries %s", fixed,
                "would be fixed (dry run)" if args.dry_run else "fixed")

    if not args.dry_run:
        # Write to a sibling temp file and swap it in, so an interrupted dump
        # cannot leave a truncated metadata store behind.
        meta_file = Path(meta_path)
        tmp_file = meta_file.with_name(meta_file.name + ".tmp")
        try:
            with open(tmp_file, "wb") as f:
                pickle.dump(store, f, protocol=pickle.HIGHEST_PROTOCOL)
            tmp_file.replace(meta_file)
        finally:
            # Only still present if the dump or the swap failed.
            if tmp_file.exists():
                tmp_file.unlink()
        logger.info("Metadata store saved. Restart backend to rebuild BM25 index.")


if __name__ == "__main__":
    main()