File size: 18,315 Bytes
458593e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
"""
Ingest facility data from N-SUMHSS / National Directory into app schema.

Reads a CSV or Excel file (e.g. downloaded from SAMHSA N-SUMHSS or National
Directory), maps columns to the internal schema using a configurable mapping,
and writes data/facilities.csv. If the source uses codes, extend SOURCE_TO_APP
or add a code-resolution step using the N-SUMHSS codebook.

Usage:
  python scripts/ingest_facilities.py [path_to_source.csv]
  python scripts/ingest_facilities.py path/to/national_directory.xlsx

If no path is given, reads from stdin (CSV) or exits with usage.
Output: data/facilities.csv (same directory as this script: repo root/data/).
"""

import argparse
import sys
import warnings
from pathlib import Path

import pandas as pd

# Repo root (parent of scripts/)
REPO_ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = REPO_ROOT / "data"
# Default output target; main() exposes -o/--output to override the path.
OUTPUT_CSV = DATA_DIR / "facilities.csv"

# Internal schema: only columns we need (no duplicate/redundant attribute columns).
# Search matches against "services" for treatment type, payment, languages, populations, substances.
APP_COLUMNS = [
    "facility_name",
    "address",
    "city",
    "state",
    "zip",
    "phone",
    "mat",
    "services",
]

# Map source column names (lowercase) -> app column name.
# N-SUMHSS / National Directory use different names; adjust per codebook.
# National Directory Excel may use "Facility/Program Name", "Street", "City", "State", etc.
# See data/README.md for the data story and mapping notes.
# NOTE: several targets here (treatment_type, payment_options, substances_addressed,
# languages, populations, description) are not in APP_COLUMNS; map_columns resolves
# them into its column map, but only APP_COLUMNS end up in the output frame.
SOURCE_TO_APP = {
    "facility_name": "facility_name",
    "facility name": "facility_name",
    "facility/program name": "facility_name",
    "program name": "facility_name",
    "name": "facility_name",
    "name1": "facility_name",
    "name2": "facility_name",
    "provider name": "facility_name",
    "organization": "facility_name",
    "treatment facility name": "facility_name",
    "location name": "facility_name",
    "facility": "facility_name",
    "address": "address",
    "street": "address",
    "street address": "address",
    "address1": "address",
    "address line 1": "address",
    "street1": "address",
    "street2": "address",
    "physical address": "address",
    "location address": "address",
    "city": "city",
    "state": "state",
    "state abbreviation": "state",
    "zip": "zip",
    "zipcode": "zip",
    "zip code": "zip",
    "phone": "phone",
    "telephone": "phone",
    "phone number": "phone",
    "treatment_type": "treatment_type",
    "treatment type": "treatment_type",
    "type of care": "treatment_type",
    "care type": "treatment_type",
    "service setting": "treatment_type",
    "treatment setting": "treatment_type",
    "level of care": "treatment_type",
    "payment_options": "payment_options",
    "payment": "payment_options",
    "payment options": "payment_options",
    "payment accepted": "payment_options",
    "accepted payment": "payment_options",
    "insurance accepted": "payment_options",
    "sliding fee": "payment_options",
    "fee scale": "payment_options",
    "mat": "mat",
    "medication_assisted": "mat",
    "medication assisted": "mat",
    "medication assisted treatment": "mat",
    "buprenorphine": "mat",
    "services": "services",
    "services offered": "services",
    "service codes": "services",
    "types of care": "services",
    "substances_addressed": "substances_addressed",
    "substances": "substances_addressed",
    "substances addressed": "substances_addressed",
    "primary focus": "substances_addressed",
    "substance focus": "substances_addressed",
    "drugs treated": "substances_addressed",
    "languages": "languages",
    "language": "languages",
    "languages spoken": "languages",
    "non-english languages": "languages",
    "language services": "languages",
    "populations": "populations",
    "population": "populations",
    "population served": "populations",
    "special populations": "populations",
    "ages served": "populations",
    "age group": "populations",
    "description": "description",
    "comments": "description",
    "notes": "description",
}


def _normalize_mat(val) -> str:
    """Map various MAT values to yes/no."""
    if pd.isna(val):
        return ""
    s = str(val).lower().strip()
    if s in ("yes", "1", "true", "y"):
        return "yes"
    if s in ("no", "0", "false", "n", ""):
        return "no"
    return "yes" if "yes" in s or "offer" in s else "no"


def load_code_key(path: str | Path) -> dict[str, str] | None:
    """Load the code reference sheet from a National Directory Excel and return code -> description dict.
    SAMHSA 2024 uses sheet 'Service Code Reference' with service_code and service_name columns.
    """
    path = Path(path)
    if path.suffix.lower() not in (".xlsx", ".xls"):
        return None
    if not path.exists():
        return None
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message=".*Cannot parse header or footer.*")
        xl = pd.ExcelFile(path)
    key_df = None
    for name in xl.sheet_names:
        nlower = name.lower()
        if "service code reference" in nlower or "code reference" in nlower or ("key" in nlower and "code" in nlower):
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message=".*Cannot parse header or footer.*")
                key_df = pd.read_excel(path, sheet_name=name)
            break
    if key_df is None:
        for name in xl.sheet_names:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message=".*Cannot parse header or footer.*")
                sheet = pd.read_excel(path, sheet_name=name)
            if 2 <= len(sheet) <= 600 and len(sheet.columns) >= 2:
                cols_lower = [str(c).lower() for c in sheet.columns]
                if "service_code" in cols_lower and "service_name" in cols_lower:
                    key_df = sheet
                    break
    if key_df is None or len(key_df) == 0:
        return None
    key_df.columns = [str(c).strip() for c in key_df.columns]
    cols_lower = [c.lower() for c in key_df.columns]
    code_col = None
    desc_col = None
    if "service_code" in cols_lower:
        code_col = key_df.columns[cols_lower.index("service_code")]
    if "service_name" in cols_lower:
        desc_col = key_df.columns[cols_lower.index("service_name")]
    if not code_col or not desc_col:
        code_col = key_df.columns[0]
        desc_col = key_df.columns[1] if len(key_df.columns) > 1 else key_df.columns[0]
    code_key = {}
    for _, row in key_df.iterrows():
        k = str(row.get(code_col, "")).strip()
        v = str(row.get(desc_col, "")).strip()
        if k and v and k != "nan" and v != "nan" and len(k) <= 20:
            code_key[k] = v
    return code_key if code_key else None


def _decode_service_codes(series: pd.Series, code_key: dict[str, str]) -> pd.Series:
    """Replace code tokens with descriptions; join with ', '. Skip * and unknown tokens (only output decoded)."""
    def decode_one(cell: str) -> str:
        if pd.isna(cell) or not str(cell).strip():
            return ""
        parts = []
        for token in str(cell).split():
            token = token.strip()
            if not token or token == "*":
                continue
            if token in code_key:
                parts.append(code_key[token])
        return ", ".join(parts) if parts else ""
    return series.apply(decode_one)


def load_source(path: str | Path) -> pd.DataFrame:
    """Load CSV or Excel into a DataFrame with lowercase column names.
    For Excel with multiple sheets (e.g. National Directory + Key), uses the
    sheet that looks like facility data (has facility name or state, and many rows).
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(path)
    suf = path.suffix.lower()
    if suf == ".csv":
        df = pd.read_csv(path)
    elif suf in (".xlsx", ".xls"):
        # Suppress openpyxl header/footer parse warnings (harmless; SAMHSA Excel often has them)
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message=".*Cannot parse header or footer.*")
            xl = pd.ExcelFile(path)
        if len(xl.sheet_names) == 1:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message=".*Cannot parse header or footer.*")
                df = pd.read_excel(path)
        else:
            # Pick the sheet that has facility data: prefer one with a facility-name-like column and many rows
            def sheet_has_facility_name_col(sheet: pd.DataFrame) -> bool:
                cols_lower = [str(c).lower().strip() for c in sheet.columns]
                if "facility name" in cols_lower or "facility_name" in cols_lower:
                    return True
                if "program name" in cols_lower or "facility/program name" in cols_lower:
                    return True
                if any(("facility" in c or "program" in c) and "name" in c for c in cols_lower):
                    return True
                if "organization" in cols_lower or "provider name" in cols_lower:
                    return True
                return False

            best = None
            best_score = -1
            for name in xl.sheet_names:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", message=".*Cannot parse header or footer.*")
                    sheet = pd.read_excel(path, sheet_name=name)
                if len(sheet) < 10:
                    continue
                cols_lower = [str(c).lower().strip() for c in sheet.columns]
                has_state_city = "state" in cols_lower and "city" in cols_lower
                has_name_col = sheet_has_facility_name_col(sheet)
                # Strongly prefer sheet that has a facility name column; then state/city; then row count
                score = (1000 if has_name_col else 0) + (10 if has_state_city else 0) + min(len(sheet), 5000)
                if score > best_score:
                    best_score = score
                    best = sheet
            if best is not None:
                df = best
            else:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", message=".*Cannot parse header or footer.*")
                    df = pd.read_excel(path, sheet_name=0)
    else:
        raise ValueError(f"Unsupported format: {suf}. Use .csv or .xlsx")
    df.columns = [str(c).lower().strip() for c in df.columns]
    return df


def _guess_facility_name_column(df: pd.DataFrame, col_map: dict) -> str | None:
    """If no facility_name mapping, find a column that likely holds facility/program name."""
    if "facility_name" in col_map:
        return None
    for src_col in df.columns:
        c = str(src_col).lower().strip()
        if "program" in c and "name" in c:
            return src_col
        if "facility" in c and "name" in c:
            return src_col
        if c in ("organization", "provider name", "location name"):
            return src_col
        # First column is often the name in directory layouts
        if list(df.columns)[0] == src_col and ("name" in c or "facility" in c or "program" in c):
            return src_col
    return None


def _guess_address_column(df: pd.DataFrame, col_map: dict) -> str | None:
    """If no address mapping, find a column that likely holds street address."""
    if "address" in col_map:
        return None
    for src_col in df.columns:
        c = str(src_col).lower().strip()
        if "street" in c or ("address" in c and "line" in c):
            return src_col
        if c in ("physical address", "location address"):
            return src_col
    return None


# Keywords to try when guessing unmapped columns (app_col -> list of substrings; any match in column name).
# Used by _guess_column_by_keywords as a fallback after the exact SOURCE_TO_APP header matches.
_GUESS_COLUMN_KEYWORDS = {
    "treatment_type": ["treatment type", "type of care", "care type", "service setting", "level of care", "setting"],
    "payment_options": ["payment", "insurance", "fee", "sliding", "medicaid", "accepted payment"],
    "services": ["services", "service codes", "types of care", "offered", "treatment modalities"],
    "substances_addressed": ["substance", "primary focus", "drug", "alcohol", "opioid"],
    "languages": ["language", "non-english", "spanish", "bilingual"],
    "populations": ["population", "age", "special population", "veteran", "gender", "served"],
    "description": ["description", "comments", "notes", "remarks"],
}


def _guess_column_by_keywords(df: pd.DataFrame, col_map: dict, app_col: str) -> str | None:
    """If app_col not yet mapped, find a source column whose name contains any of the keywords."""
    if app_col in col_map:
        return None
    keywords = _GUESS_COLUMN_KEYWORDS.get(app_col, [])
    for src_col in df.columns:
        lowered = str(src_col).lower().strip()
        if any(kw in lowered for kw in keywords):
            return src_col
    return None


def map_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Map source columns to app schema; add missing app columns as empty.

    Resolution order per app column:
      1. exact (lowercased/stripped) header match via SOURCE_TO_APP;
      2. heuristic guesses (facility name, address, attribute keywords);
      3. empty strings when nothing matches.

    Also merges National Directory split columns (name1+name2 -> facility_name,
    street1+street2 -> address) and normalizes the MAT flag to yes/no.
    """
    # Find which source column maps to each app column (first match wins).
    col_map: dict[str, str] = {}
    for src_col in df.columns:
        src_lower = str(src_col).lower().strip()
        if src_lower in SOURCE_TO_APP:
            app_col = SOURCE_TO_APP[src_lower]
            if app_col not in col_map:
                col_map[app_col] = src_col
    # Fallbacks for National Directory Excel when headers differ
    guess_name = _guess_facility_name_column(df, col_map)
    if guess_name and "facility_name" not in col_map:
        col_map["facility_name"] = guess_name
    guess_addr = _guess_address_column(df, col_map)
    if guess_addr and "address" not in col_map:
        col_map["address"] = guess_addr
    for app_col in ("treatment_type", "payment_options", "services", "substances_addressed", "languages", "populations", "description"):
        guess = _guess_column_by_keywords(df, col_map, app_col)
        if guess and app_col not in col_map:
            col_map[app_col] = guess
    # Materialize only the app schema columns; "nan" strings (from astype on NaN)
    # are normalized to empty. (Previously `out` was also pre-filled with empty
    # lists that this loop immediately overwrote — dead code, removed.)
    out = {}
    for app_col in APP_COLUMNS:
        if app_col in col_map:
            out[app_col] = df[col_map[app_col]].astype(str).replace("nan", "").tolist()
        else:
            out[app_col] = [""] * len(df)
    result = pd.DataFrame(out)

    # National Directory format: merge name1+name2 -> facility_name, street1+street2 -> address.
    # Resolve the *actual* column labels rather than assuming lowercase headers:
    # load_source lowercases them, but a caller passing raw headers (e.g. "Name1")
    # would previously have hit a KeyError after the lowercase membership check.
    lower_to_actual = {str(c).lower().strip(): c for c in df.columns}
    if "name1" in lower_to_actual and "name2" in lower_to_actual:
        n1 = df[lower_to_actual["name1"]].astype(str).replace("nan", "").str.strip()
        n2 = df[lower_to_actual["name2"]].astype(str).replace("nan", "").str.strip()
        merged = (n1 + " " + n2).str.strip()
        result["facility_name"] = merged.where(merged != "", result["facility_name"])
    if "street1" in lower_to_actual and "street2" in lower_to_actual:
        s1 = df[lower_to_actual["street1"]].astype(str).replace("nan", "").str.strip()
        s2 = df[lower_to_actual["street2"]].astype(str).replace("nan", "").str.strip()
        merged = (s1 + " " + s2).str.strip()
        result["address"] = merged.where(merged != "", result["address"])
    # service_code_info is decoded in main() using the Key sheet when available (see load_code_key).

    # Normalize MAT to yes/no ("" for missing/NaN source values).
    if "mat" in result.columns:
        result["mat"] = result["mat"].apply(_normalize_mat)
    return result


def drop_missing_location(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows with non-empty city and state."""
    if "city" not in df.columns or "state" not in df.columns:
        # Nothing to filter on; hand back the frame untouched.
        return df

    def _present(col: str) -> pd.Series:
        # Non-null and not just whitespace.
        return df[col].notna() & (df[col].astype(str).str.strip() != "")

    return df[_present("city") & _present("state")].copy()


def main() -> None:
    """CLI entry point: load source file, map to app schema, write facilities CSV.

    Pipeline: load_source -> map_columns -> optional service-code decoding via
    the Excel Key sheet -> drop rows without city/state -> deduplicate -> write.
    All progress/diagnostic messages go to stderr; stdout stays clean.
    """
    ap = argparse.ArgumentParser(description="Ingest N-SUMHSS/National Directory data into facilities.csv")
    ap.add_argument("source", nargs="?", help="Path to source CSV or Excel file. If omitted, print usage and exit.")
    ap.add_argument("-o", "--output", default=str(OUTPUT_CSV), help="Output CSV path")
    args = ap.parse_args()
    if not args.source:
        # No source given: treat as a usage request, exit 0 (not an error).
        ap.print_help()
        sys.exit(0)
    path = Path(args.source)
    raw_df = load_source(path)
    df = map_columns(raw_df)
    if df["facility_name"].str.strip().eq("").all():
        # Nothing mapped to facility_name — dump the raw headers so the user
        # can extend SOURCE_TO_APP for this source.
        print(
            "Warning: no facility names were mapped. Source columns were:\n  "
            + ", ".join(repr(c) for c in raw_df.columns),
            file=sys.stderr,
        )
    # Decode service_code_info using the Key sheet; store only in services (search uses it for all filters).
    if "service_code_info" in raw_df.columns and path.suffix.lower() in (".xlsx", ".xls"):
        code_key = load_code_key(path)
        if code_key:
            decoded = _decode_service_codes(raw_df["service_code_info"], code_key)
            df["services"] = decoded
    # Report if services is still empty (couldn't decode)
    empty_attrs = [c for c in ("services",) if c in df.columns and (df[c].astype(str).str.strip() == "").all()]
    if empty_attrs and "service_code_info" in raw_df.columns:
        print(
            "Note: " + ", ".join(empty_attrs) + " had no data (source has coded service_code_info; "
            "Key sheet not found or could not be parsed for decoding).",
            file=sys.stderr,
        )
    elif empty_attrs:
        print(
            "Note: these attributes had no data after mapping: " + ", ".join(empty_attrs) + ".",
            file=sys.stderr,
        )
    df = drop_missing_location(df)
    # Deduplicate by facility_name + address + city + state (keep first occurrence)
    key_cols = ["facility_name", "address", "city", "state"]
    if all(c in df.columns for c in key_cols):
        before = len(df)
        df = df.drop_duplicates(subset=key_cols, keep="first").reset_index(drop=True)
        if len(df) < before:
            print(f"Dropped {before - len(df)} duplicate rows (same name+address+city+state).", file=sys.stderr)
    # NOTE(review): this always creates the default data/ dir, even when -o
    # points somewhere else — confirm whether the -o parent should be created instead.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    df.to_csv(args.output, index=False)
    print(f"Wrote {len(df)} rows to {args.output}", file=sys.stderr)


# Script entry guard: run the CLI only when executed directly, so the module
# can be imported (e.g. for testing) without side effects.
if __name__ == "__main__":
    main()