File size: 4,884 Bytes
36b2bff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
"""Download Tier 1 source PDFs for the clinical knowledge base.

Run to populate ml/knowledge_base/sources/ with authoritative clinical
documents from public sources. Idempotent — existing files skipped.

Usage:
    python scripts/fetch_knowledge_base.py
    python scripts/fetch_knowledge_base.py --force  # Re-download all
"""

import logging
import sys
from pathlib import Path

import httpx

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

SOURCES_DIR = Path(__file__).parent.parent / "ml" / "knowledge_base" / "sources"

DOCUMENTS = [
    {
        "filename": "phq9_questionnaire_apa.pdf",
        "url": "https://www.apa.org/depression-guideline/patient-health-questionnaire.pdf",
        "description": "PHQ-9 Questionnaire (APA) — public domain",
    },
    {
        "filename": "gad7_questionnaire_adaa.pdf",
        "url": "https://adaa.org/sites/default/files/GAD-7_Anxiety-updated_0.pdf",
        "description": "GAD-7 Questionnaire (ADAA) — public domain",
    },
    {
        "filename": "cssrs_baseline_screening.pdf",
        "url": "https://cssrs.columbia.edu/wp-content/uploads/C-SSRS1-14-09-BaselineScreening.pdf",
        "description": "C-SSRS Baseline/Screening Version — free for healthcare",
    },
    {
        "filename": "cssrs_scoring_guide.pdf",
        "url": "https://cssrs.columbia.edu/wp-content/uploads/ScoringandDataAnalysisGuide-for-Clinical-Trials-1.pdf",
        "description": "C-SSRS Scoring Guide — free for healthcare",
    },
    {
        "filename": "apa_depression_guideline_2019.pdf",
        "url": "https://www.apa.org/depression-guideline/guideline.pdf",
        "description": "APA Clinical Practice Guideline for Depression (2019)",
    },
    {
        "filename": "apa_mdd_practice_guideline.pdf",
        "url": "https://psychiatryonline.org/pb/assets/raw/sitewide/practice_guidelines/guidelines/mdd.pdf",
        "description": "APA Practice Guideline for MDD",
    },
    {
        "filename": "nice_cg91_depression_chronic_health.pdf",
        "url": "https://www.nice.org.uk/guidance/cg91/resources/depression-in-adults-with-a-chronic-physical-health-problem-recognition-and-management-pdf-975744316357",
        "description": "NICE CG91: Depression with chronic health problems",
    },
    {
        "filename": "dsm5tr_mdd_fact_sheet.pdf",
        "url": "https://www.psychiatry.org/getmedia/33fc7cdb-6fd8-46a7-9ff2-225ba7862f7f/APA-DSM5TR-MajorDepressiveDisorder.pdf",
        "description": "DSM-5-TR MDD Fact Sheet",
    },
    {
        "filename": "esketamine_fda_label_2025.pdf",
        "url": "https://www.accessdata.fda.gov/drugsatfda_docs/label/2025/211243s016lbl.pdf",
        "description": "Esketamine (Spravato) FDA Label — public domain",
    },
    {
        "filename": "who_bahrain_mental_health.pdf",
        "url": "https://cdn.who.int/media/docs/default-source/mental-health/who-aims-country-reports/mh_aims_report_bahrain_jan_2011_en.pdf",
        "description": "WHO-AIMS: Mental Health System in Bahrain",
    },
]


def download_file(url: str, dest: Path, description: str) -> bool:
    """Download a file. Returns True if downloaded, False if skipped/failed."""
    if dest.exists():
        logger.info(f"  SKIP (exists): {dest.name}")
        return False

    logger.info(f"  Downloading: {description}")
    try:
        with httpx.Client(timeout=60, follow_redirects=True) as client:
            response = client.get(url)
            response.raise_for_status()
            dest.write_bytes(response.content)
            size_mb = len(response.content) / (1024 * 1024)
            logger.info(f"  OK: {dest.name} ({size_mb:.1f} MB)")
            return True
    except httpx.HTTPError as e:
        logger.error(f"  FAILED: {dest.name}{e}")
        logger.error(f"  → Manual download needed: {url}")
        return False


def main():
    force = "--force" in sys.argv
    SOURCES_DIR.mkdir(parents=True, exist_ok=True)

    logger.info(f"Fetching {len(DOCUMENTS)} source PDFs to {SOURCES_DIR}")
    downloaded, skipped, failed = 0, 0, 0
    failed_docs = []

    for doc in DOCUMENTS:
        dest = SOURCES_DIR / doc["filename"]
        if force and dest.exists():
            dest.unlink()

        success = download_file(doc["url"], dest, doc["description"])
        if success:
            downloaded += 1
        elif dest.exists():
            skipped += 1
        else:
            failed += 1
            failed_docs.append(doc)

    logger.info(f"\nSummary: {downloaded} downloaded, {skipped} skipped, {failed} failed")
    if failed_docs:
        logger.warning("\nFailed downloads (manual fetch needed):")
        for doc in failed_docs:
            logger.warning(f"  {doc['filename']}: {doc['url']}")
        sys.exit(1)


if __name__ == "__main__":
    main()