depscreen / scripts /fetch_knowledge_base.py
halsabbah's picture
deploy: sync code from GitHub main
36b2bff verified
#!/usr/bin/env python3
"""Download Tier 1 source PDFs for the clinical knowledge base.
Run to populate ml/knowledge_base/sources/ with authoritative clinical
documents from public sources. Idempotent — existing files are skipped.
Usage:
python scripts/fetch_knowledge_base.py
python scripts/fetch_knowledge_base.py --force # Re-download all
"""
import logging
import sys
from pathlib import Path
import httpx
# Log records are rendered as "LEVEL: message"; INFO and above are shown.
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Download target: <parent of this script's directory>/ml/knowledge_base/sources
SOURCES_DIR = Path(__file__).parent.parent.joinpath(
    "ml", "knowledge_base", "sources"
)
# Catalogue of source PDFs to fetch. Each entry provides:
#   filename    - name the file is saved under inside SOURCES_DIR
#   url         - location the document is downloaded from
#   description - human-readable label used in log output
DOCUMENTS = [
    {
        "filename": "phq9_questionnaire_apa.pdf",
        "url": "https://www.apa.org/depression-guideline/patient-health-questionnaire.pdf",
        "description": "PHQ-9 Questionnaire (APA) — public domain",
    },
    {
        "filename": "gad7_questionnaire_adaa.pdf",
        "url": "https://adaa.org/sites/default/files/GAD-7_Anxiety-updated_0.pdf",
        "description": "GAD-7 Questionnaire (ADAA) — public domain",
    },
    {
        "filename": "cssrs_baseline_screening.pdf",
        "url": "https://cssrs.columbia.edu/wp-content/uploads/C-SSRS1-14-09-BaselineScreening.pdf",
        "description": "C-SSRS Baseline/Screening Version — free for healthcare",
    },
    {
        "filename": "cssrs_scoring_guide.pdf",
        "url": "https://cssrs.columbia.edu/wp-content/uploads/ScoringandDataAnalysisGuide-for-Clinical-Trials-1.pdf",
        "description": "C-SSRS Scoring Guide — free for healthcare",
    },
    {
        "filename": "apa_depression_guideline_2019.pdf",
        "url": "https://www.apa.org/depression-guideline/guideline.pdf",
        "description": "APA Clinical Practice Guideline for Depression (2019)",
    },
    {
        "filename": "apa_mdd_practice_guideline.pdf",
        "url": "https://psychiatryonline.org/pb/assets/raw/sitewide/practice_guidelines/guidelines/mdd.pdf",
        "description": "APA Practice Guideline for MDD",
    },
    {
        "filename": "nice_cg91_depression_chronic_health.pdf",
        "url": "https://www.nice.org.uk/guidance/cg91/resources/depression-in-adults-with-a-chronic-physical-health-problem-recognition-and-management-pdf-975744316357",
        "description": "NICE CG91: Depression with chronic health problems",
    },
    {
        "filename": "dsm5tr_mdd_fact_sheet.pdf",
        "url": "https://www.psychiatry.org/getmedia/33fc7cdb-6fd8-46a7-9ff2-225ba7862f7f/APA-DSM5TR-MajorDepressiveDisorder.pdf",
        "description": "DSM-5-TR MDD Fact Sheet",
    },
    {
        "filename": "esketamine_fda_label_2025.pdf",
        "url": "https://www.accessdata.fda.gov/drugsatfda_docs/label/2025/211243s016lbl.pdf",
        "description": "Esketamine (Spravato) FDA Label — public domain",
    },
    {
        "filename": "who_bahrain_mental_health.pdf",
        "url": "https://cdn.who.int/media/docs/default-source/mental-health/who-aims-country-reports/mh_aims_report_bahrain_jan_2011_en.pdf",
        "description": "WHO-AIMS: Mental Health System in Bahrain",
    },
]
def download_file(url: str, dest: Path, description: str) -> bool:
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    The response body is first written to a ``<name>.part`` sibling and then
    renamed onto ``dest``, so a write that is cut short never leaves a
    truncated file that a later run would skip as "exists".

    Args:
        url: Location to fetch; redirects are followed.
        dest: Final path of the downloaded file.
        description: Human-readable label used in the progress log line.

    Returns:
        True if the file was downloaded and saved. False if ``dest`` already
        existed (skipped) or if the request or the write failed; failures are
        logged together with the URL for a manual download.
    """
    if dest.exists():
        logger.info(f" SKIP (exists): {dest.name}")
        return False

    logger.info(f" Downloading: {description}")
    partial = dest.with_name(dest.name + ".part")
    try:
        with httpx.Client(timeout=60, follow_redirects=True) as client:
            response = client.get(url)
            response.raise_for_status()
        payload = response.content
        partial.write_bytes(payload)
        # Rename only after the full payload is on disk.
        partial.replace(dest)
    except (httpx.HTTPError, OSError) as e:
        # Best-effort cleanup of the temporary file; the download has already
        # failed, so a cleanup error must not mask the original one.
        try:
            partial.unlink(missing_ok=True)
        except OSError:
            pass
        logger.error(f" FAILED: {dest.name} — {e}")
        logger.error(f" → Manual download needed: {url}")
        return False

    size_mb = len(payload) / (1024 * 1024)
    logger.info(f" OK: {dest.name} ({size_mb:.1f} MB)")
    return True
def main() -> None:
    """Fetch every entry in DOCUMENTS into SOURCES_DIR and report the outcome.

    Passing ``--force`` on the command line re-downloads files that already
    exist. The existing copy is moved to a ``<name>.bak`` sibling for the
    duration of the attempt and restored if the new download does not succeed,
    so a failed forced refresh never destroys the previous file.

    Logs a summary of downloaded / skipped / failed counts. If any document
    failed, lists the failures and exits the process with status 1.
    """
    force = "--force" in sys.argv
    SOURCES_DIR.mkdir(parents=True, exist_ok=True)
    logger.info(f"Fetching {len(DOCUMENTS)} source PDFs to {SOURCES_DIR}")

    downloaded, skipped, failed = 0, 0, 0
    failed_docs = []
    for doc in DOCUMENTS:
        dest = SOURCES_DIR / doc["filename"]

        # download_file skips existing files, so a forced refresh must move
        # the current copy out of the way first; it is kept as a backup.
        backup = None
        if force and dest.exists():
            backup = dest.with_name(dest.name + ".bak")
            dest.replace(backup)

        success = False
        try:
            success = download_file(doc["url"], dest, doc["description"])
        finally:
            # Runs even if download_file raises, so the backup is never
            # left stranded under its temporary name.
            if backup is not None:
                if success:
                    backup.unlink(missing_ok=True)
                else:
                    backup.replace(dest)

        if success:
            downloaded += 1
        elif backup is None and dest.exists():
            # Not forced and already present: download_file skipped it.
            skipped += 1
        else:
            failed += 1
            failed_docs.append(doc)

    logger.info(f"\nSummary: {downloaded} downloaded, {skipped} skipped, {failed} failed")
    if failed_docs:
        logger.warning("\nFailed downloads (manual fetch needed):")
        for doc in failed_docs:
            logger.warning(f" {doc['filename']}: {doc['url']}")
        sys.exit(1)


if __name__ == "__main__":
    main()