Spaces:
Running
Running
| """ | |
| scripts/fix_fda_chunk_text.py | |
| ============================== | |
| One-time fix: replaces the verbose FDA boilerplate prefix in all FDA DailyMed | |
| chunk_text entries in the metadata store with a clean, BM25-friendly prefix. | |
| Before: [FDA DRUG LABEL — These highlights do not include all the information | |
| needed to use WARFARIN SODIUM TABLETS safely and effectively...] | |
| CONTRAINDICATIONS: actual content... | |
| After: [FDA DailyMed | Warfarin | CONTRAINDICATIONS] actual content... | |
| Usage: | |
| python scripts/fix_fda_chunk_text.py | |
| python scripts/fix_fda_chunk_text.py --dry-run | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import logging | |
| import pickle | |
| import re | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) | |
| import yaml | |
| logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") | |
| logger = logging.getLogger(__name__) | |
# Section codes that can appear as one "_"-separated field of an FDA chunk id,
# mapped to the section title used in the rewritten prefix.
SECTION_CODES = {
    "34068-7": "DOSAGE AND ADMINISTRATION",
    "34070-3": "CONTRAINDICATIONS",
    "43685-7": "WARNINGS AND PRECAUTIONS",
    "34067-9": "INDICATIONS AND USAGE",
    "34073-7": "DRUG INTERACTIONS",
    "34071-1": "WARNINGS",
}

# Matches both old boilerplate and previously-fixed format.
#   tag     -- everything between "[FDA" and the closing "]"
#   heading -- optional "SECTION NAME:" label directly after the tag; it is
#              limited to a single line so that a colon further down in the
#              chunk body is never mistaken for a heading.
_BOILERPLATE_RE = re.compile(
    r"^\[FDA(?P<tag>[^\]]*)\]\s*(?:(?P<heading>[A-Za-z][^:\n]*):\s*)?"
)

# A chunk-id field that starts like a set_id (8 hex digits followed by "-").
_UUID_RE = re.compile(r"^[0-9a-f]{8}-", re.I)


def fix_chunk_text(chunk_id: str, old_text: str) -> str:
    """Return cleaned chunk_text with a compact, keyword-rich prefix.

    The result always has the form::

        [FDA DailyMed | {drug} | {section}] {drug} {section}: {content}

    The drug name is repeated in front of the content so that BM25 finds it
    even in continuation chunks (e.g. a chunk starting "Bleeding
    tendencies..." now reads "Warfarin CONTRAINDICATIONS: Bleeding...").

    Args:
        chunk_id: Id of the form ``fda_{drug_name}_{set_id}_{code}_{offset}``.
            The drug name may span several "_"-separated fields.
        old_text: Current chunk text, either with the verbose
            ``[FDA DRUG LABEL ...]`` boilerplate, with a compact
            ``[FDA DailyMed | drug | section]`` tag, or with no prefix at all.

    Returns:
        The rewritten chunk text. Feeding the result back in together with
        the same ``chunk_id`` returns it unchanged.
    """
    parts = chunk_id.split("_")

    # parts[0] is "fda"; the fields between it and the section code are the
    # drug name followed by the set_id, which is filtered out again.
    section_name = None
    drug_name_parts = []
    for i, part in enumerate(parts[1:], 1):
        if part in SECTION_CODES:
            section_name = SECTION_CODES[part]
            drug_name_parts = [p for p in parts[1:i] if not _UUID_RE.match(p)]
            break
    drug_name = " ".join(drug_name_parts).title() if drug_name_parts else "Unknown"

    heading = None      # "SECTION:" label found after an old boilerplate tag
    tag_section = None  # section field of an already-compact tag
    m = _BOILERPLATE_RE.match(old_text)
    if m is None:
        content = old_text.strip()
    else:
        fields = [field.strip() for field in m.group("tag").split("|")]
        if len(fields) == 3 and fields[0] == "DailyMed":
            # Already-compact entry. The only label that may follow the tag is
            # the "{drug} {section}:" lead-in this function writes itself;
            # anything else (even if it contains a colon) is real content and
            # must be kept.
            tag_section = fields[2] or None
            content = old_text[m.end("tag") + 1:].strip()
            generated = f"{fields[1]} {fields[2]}:"
            if content.startswith(generated):
                content = content[len(generated):].strip()
        else:
            # Old boilerplate: drop the tag and the section label after it.
            heading = m.group("heading")
            content = old_text[m.end():].strip()

    if section_name is None:
        # Unknown section code: fall back to what the text itself says.
        recovered = tag_section or (heading.strip() if heading else "")
        section_name = recovered or "DRUG INFORMATION"

    return f"[FDA DailyMed | {drug_name} | {section_name}] {drug_name} {section_name}: {content}"
def main() -> None:
    """Rewrite the chunk_text prefix of every FDA DailyMed entry in the store.

    Reads ``retrieval.metadata_path`` from ``config.yaml`` (resolved against
    the current working directory), loads the pickled metadata store, and
    passes each entry whose ``source`` is "FDA DailyMed" and whose text starts
    with a known FDA prefix through ``fix_chunk_text``.

    With ``--dry-run`` nothing is written; the first three before/after pairs
    are logged instead. Otherwise the updated store is saved back to the same
    path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    with open("config.yaml", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    meta_path = cfg["retrieval"]["metadata_path"]

    logger.info("Loading metadata store from %s ...", meta_path)
    # NOTE: unpickling can execute arbitrary code; only point this script at a
    # metadata store that was produced by a trusted pipeline.
    with open(meta_path, "rb") as f:
        store: dict = pickle.load(f)

    fda_keys = [k for k, v in store.items() if v.get("source") == "FDA DailyMed"]
    logger.info("Found %d FDA DailyMed entries to fix", len(fda_keys))

    fixed = 0
    for key in fda_keys:
        entry = store[key]
        # A stored None is treated like a missing text instead of crashing.
        old_text = entry.get("chunk_text") or ""
        # Re-run on both old boilerplate AND previously-fixed entries
        # (to fix UUID + add drug name to content).
        if not old_text.startswith(("[FDA DRUG LABEL", "[FDA DailyMed |")):
            continue
        new_text = fix_chunk_text(entry.get("chunk_id", ""), old_text)
        if args.dry_run:
            # Only show a small sample so the log stays readable.
            if fixed < 3:
                logger.info("BEFORE: %s", old_text[:120])
                logger.info("AFTER: %s", new_text[:120])
                logger.info("---")
        else:
            entry["chunk_text"] = new_text
        fixed += 1

    logger.info("%d entries %s", fixed,
                "would be fixed (dry run)" if args.dry_run else "fixed")

    if not args.dry_run:
        # Write to a sibling temp file and rename it over the original, so an
        # interrupted dump cannot leave a truncated metadata store behind.
        target = Path(meta_path)
        tmp_path = target.with_name(target.name + ".tmp")
        with open(tmp_path, "wb") as f:
            pickle.dump(store, f, protocol=pickle.HIGHEST_PROTOCOL)
        tmp_path.replace(target)
        logger.info("Metadata store saved. Restart backend to rebuild BM25 index.")


if __name__ == "__main__":
    main()