phanny committed on
Commit
727ab75
·
1 Parent(s): 7535bf6

add contacts

Browse files
Files changed (4) hide show
  1. app.py +7 -4
  2. docs/PROJECT_SUMMARY.md +83 -0
  3. src/chat.py +56 -21
  4. src/facilities.py +4 -0
app.py CHANGED
@@ -432,7 +432,7 @@ def _tuples_to_messages(history):
432
  def create_demo():
433
  chatbot = Chatbot()
434
 
435
- with gr.Blocks(title="SAMHSA Treatment Locator", css=CSS) as demo:
436
  gr.Markdown("# SAMHSA Treatment Locator")
437
  gr.Markdown(DESCRIPTION)
438
  gr.Markdown(f"<div class='disclaimer'>{DISCLAIMER}</div>", elem_classes=["disclaimer"])
@@ -495,12 +495,13 @@ def create_demo():
495
  history_tuples = _messages_to_tuples(history)
496
  reply, new_state = chatbot.get_response(message, history_tuples, state)
497
  new_state = dict(new_state)
498
- new_state["selected_facility_name"] = None # clear selection when new results
499
  new_history_tuples = history_tuples + [[message, reply]]
500
  new_history_messages = _tuples_to_messages(new_history_tuples)
501
  facilities = list(new_state.get("last_results") or [])
502
- map_html_out = _build_map_html(facilities, None, update_id, None)
503
- return new_history_messages, new_state, "", map_html_out, gr.update(choices=_facility_names(facilities), value=None)
 
 
504
  except Exception as e:
505
  err_msg = str(e)[:200]
506
  reply = f"Sorry, something went wrong: {err_msg}"
@@ -551,6 +552,8 @@ if __name__ == "__main__":
551
  demo = create_demo()
552
  sig = inspect.signature(demo.launch)
553
  kwargs = {}
 
 
554
  if "theme" in sig.parameters and hasattr(gr, "themes"):
555
  kwargs["theme"] = gr.themes.Soft(primary_hue="teal", secondary_hue="slate")
556
  demo.launch(**kwargs)
 
432
  def create_demo():
433
  chatbot = Chatbot()
434
 
435
+ with gr.Blocks(title="SAMHSA Treatment Locator") as demo:
436
  gr.Markdown("# SAMHSA Treatment Locator")
437
  gr.Markdown(DESCRIPTION)
438
  gr.Markdown(f"<div class='disclaimer'>{DISCLAIMER}</div>", elem_classes=["disclaimer"])
 
495
  history_tuples = _messages_to_tuples(history)
496
  reply, new_state = chatbot.get_response(message, history_tuples, state)
497
  new_state = dict(new_state)
 
498
  new_history_tuples = history_tuples + [[message, reply]]
499
  new_history_messages = _tuples_to_messages(new_history_tuples)
500
  facilities = list(new_state.get("last_results") or [])
501
+ sel = new_state.get("selected_facility_name")
502
+ map_html_out = _build_map_html(facilities, None, update_id, sel)
503
+ dropdown_value = sel if sel and any(f.get("facility_name") == sel or f.get("name") == sel for f in facilities) else None
504
+ return new_history_messages, new_state, "", map_html_out, gr.update(choices=_facility_names(facilities), value=dropdown_value)
505
  except Exception as e:
506
  err_msg = str(e)[:200]
507
  reply = f"Sorry, something went wrong: {err_msg}"
 
552
  demo = create_demo()
553
  sig = inspect.signature(demo.launch)
554
  kwargs = {}
555
+ if "css" in sig.parameters:
556
+ kwargs["css"] = CSS
557
  if "theme" in sig.parameters and hasattr(gr, "themes"):
558
  kwargs["theme"] = gr.themes.Soft(primary_hue="teal", secondary_hue="slate")
559
  demo.launch(**kwargs)
docs/PROJECT_SUMMARY.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 6.C395 Chatbot – Project Summary
2
+
3
+ ## What’s Done
4
+
5
+ ### 1. Chatbot (SAMHSA Treatment Locator)
6
+
7
+ - **Role:** Helps users find substance use and mental health treatment facilities in the US by conversation. No invented facilities; only real data is shown.
8
+ - **Flow (matches `samhsa_chatbot_conversation_example.txt`):**
9
+ - **Greet / clarify:** Asks for location (state/city), treatment type (inpatient/outpatient/residential/telehealth), payment (Medicaid, insurance, sliding scale, etc.), and optionally substances, populations (veterans, LGBTQ+, adolescents), therapies (MAT, CBT, 12-step), languages.
10
+ - **First results:** Once there’s at least a location, runs search and presents 2–3 facilities by name with short descriptions from data only.
11
+ - **Follow-up:** If the user asks about a specific facility (“Do they offer MAT?”), answers from that facility’s record only.
12
+ - **Closing:** Brief supportive close when the user is done.
13
+ - **Implementation:** `src/chat.py` – `Chatbot` class with criteria extraction (`_extract_criteria`), merge with prior state, search when location is present, and response via Hugging Face Inference API. System prompt enforces “use only provided facility data; never invent names/addresses/phones.”
14
+
15
+ ### 2. RAG-style retrieval (no hallucination)
16
+
17
+ - **Data:** Facility records from SAMHSA (CSV: `data/facilities.csv` or a Hugging Face Dataset when `FACILITIES_DATASET` is set). Scripts: `scripts/download_findtreatment_data.py`, `scripts/ingest_facilities.py`, `scripts/upload_facilities_to_hf.py`.
18
+ - **Search:** `src/facilities.py` – `load_facilities()`, `search(criteria, df, limit)`. Criteria: state, location (city), treatment_type, payment, mat, populations, languages, substances, therapies. Returns list of facility dicts.
19
+ - **RAG pattern:** User message → extract/merge criteria → run `search()` when location exists → format results as “Current facility data” in the system prompt → LLM answers only from that context. Follow-up about a specific facility uses `get_facility_by_name()` and the same “only this data” prompt. So the model is **retrieval-grounded**: it never invents facilities.
20
+
21
+ ### 3. Evaluation
22
+
23
+ - **Script:** `scripts/eval_chatbot.py`.
24
+ - **Scenarios:** 19 scenarios (e.g. “Outpatient, Boston, Medicaid”, “Veterans, Texas”, “Chicago, MAT”) with criteria dicts and sample user messages.
25
+ - **Metrics:**
26
+ - **Match:** For each scenario, `search(criteria)` is run; each returned facility is checked against criteria (state, treatment type, payment, MAT, populations, languages, substances, therapies). Output: “all match? Y/N” and summary “X/19 runs had all suggested facilities matching criteria.”
27
+ - **Hallucination (optional):** With `--with-chatbot`, the bot is called per scenario; facility names (and contact info) mentioned in the reply are checked against the dataset. Target: 0 invented facilities.
28
+ - **Output:** Table (default) or CSV: scenario, facilities returned, count, all_match, hallucination (if `--with-chatbot`).
29
+ - **Usage:** `python scripts/eval_chatbot.py` (search + match only); `python scripts/eval_chatbot.py --with-chatbot` (add hallucination check; needs HF token).
30
+
31
+ ### 4. Maps (Gradio app)
32
+
33
+ - **UI:** `app.py` – two-pane layout: **map (left)** + **chat (right)**. Facility dropdown to pick a result; selected facility is highlighted on the map.
34
+ - **Map backends:**
35
+ - **Google Maps:** When `GOOGLE_MAPS_API_KEY` is set in `.env`, the map is an interactive Google Maps iframe (markers, info windows, optional routes). Requires “Maps JavaScript API” and “Directions API” if you use routes.
36
+ - **Fallback:** If no key, uses **Folium/OpenStreetMap** (Leaflet): markers, popups, optional route via OSRM.
37
+ - **Behavior:** Geocoding for facility city/state (Nominatim cache); pins for search results; optional route from user location to first facility in Folium mode; selected facility shown with a distinct icon. Map updates when the user sends a new message (new results) or changes the facility dropdown.
38
+ - **Compatibility:** Gradio version differences handled: Chatbot `type="messages"` only if supported; CSS is passed to `launch()` when it accepts a `css` parameter (not to `gr.Blocks(...)`); `launch()` only gets supported kwargs (e.g. `css`, `theme`).
39
+
40
+ ### 5. Deployment and repo hygiene
41
+
42
+ - **Hugging Face Space:** App runs on CPU; uses HF Inference for the model. Push with `git push huggingface student-version:main` (binary xlsx was removed from history so HF accepts the push). `scripts/push_to_hf.sh` runs that push.
43
+ - **Secrets:** Space needs `HF_TOKEN` (and optionally `GOOGLE_MAPS_API_KEY` if you configure it on the Space).
44
+ - **Data on Space:** Full facility CSV is large; use a Hugging Face Dataset and set `FACILITIES_DATASET` (see `data/README.md`).
45
+
46
+ ---
47
+
48
+ ## What’s still in progress / optional
49
+
50
+ - **Memo:** 1–2 page memo (Design, Data, Evaluation, Limitations) and optional figure (eval table or dialogue snippet) – not in repo yet.
51
+ - **Eval in memo:** Run `eval_chatbot.py` (with `--with-chatbot` if desired), paste or summarize the table in the memo.
52
+ - **Data story in repo:** Optional short `data/README.md` (or comment in `src/facilities.py`) with source (e.g. N-SUMHSS / National Directory), scope, and limitations for memo reference.
53
+ - **Accessibility / i18n:** Memo can mention keyboard use and clear labels; product is English-only for now.
54
+ - **Future API:** Backend is in `chat.py` + `facilities.py`; a thin FastAPI/Flask `POST /chat` could be added later for a React/Vercel frontend without moving core logic.
55
+
56
+ ---
57
+
58
+ ## What could be done next
59
+
60
+ | Area | Possible next steps |
61
+ |------|---------------------|
62
+ | **Chatbot** | Fine-tune `MY_MODEL` on example dialogues; extend criteria (e.g. more states/cities); handle “near me” with IP geolocation. |
63
+ | **RAG / data** | Add more facility attributes or filters; use embeddings + vector search instead of (or in addition to) keyword/criteria search; support data/README and dataset versioning. |
64
+ | **Evaluation** | Add more scenarios (e.g. edge cases, non-English-like prompts); automate eval in CI; track metrics over time. |
65
+ | **Maps** | Add user geolocation in the Space; improve mobile layout; optional clustering for many markers. |
66
+ | **UX** | Short “How to use” in the UI; optional language/locale; printable summary of results. |
67
+ | **DevOps** | Optional API for a React app; Docker for local run; pin Gradio version for consistent behavior. |
68
+
69
+ ---
70
+
71
+ ## File map
72
+
73
+ | Path | Purpose |
74
+ |------|--------|
75
+ | `app.py` | Gradio UI: map + chat, state, examples, disclaimer, theme. |
76
+ | `src/chat.py` | Chatbot: criteria extraction, search, prompt formatting, HF Inference, no hallucination. |
77
+ | `src/facilities.py` | Load CSV or HF Dataset; `search(criteria)`; column mapping. |
78
+ | `scripts/eval_chatbot.py` | 19 scenarios; match check; optional hallucination check; table/CSV output. |
79
+ | `scripts/push_to_hf.sh` | Push `student-version` to HF Space `main`. |
80
+ | `config.py` | `BASE_MODEL`, `MY_MODEL`, `HF_TOKEN`. |
81
+ | `data/README.md` | Data source, HF Dataset usage for Space. |
82
+ | `docs/MEMO.md` | Memo placeholder / notes. |
83
+ | `docs/PROJECT_SUMMARY.md` | This file. |
src/chat.py CHANGED
@@ -26,8 +26,8 @@ SYSTEM_PROMPT = """You are a supportive, non-judgmental assistant that helps peo
26
 
27
  Conversation flow:
28
  1. Greet / clarify: If the user has not yet given a location (state or city), ask for: (a) state or city, (b) treatment type (inpatient, outpatient, residential, telehealth), (c) payment (insurance, Medicaid/MassHealth, sliding scale, free), and as appropriate: substances they're concerned about (e.g. alcohol, opioids), special populations (veterans, LGBTQ+, adolescents, pregnant women), therapies (e.g. MAT, CBT, 12-step), and languages spoken. Do not search until you have at least a location.
29
- 2. First results: When you have at least location (and ideally type and payment), present 2–3 facilities by name with 1–2 sentence descriptions using ONLY the data in the "Current facility data" section below. Mention relevant attributes (payment, languages, populations, substances, therapies) when they match what the user asked for. Offer to give more details or other options.
30
- 3. Follow-up: If the user asks about a specific facility (e.g. "Do they offer MAT?" or "Tell me about Boston Medical Center"), answer ONLY from the facility record provided. Offer next steps (e.g. how to contact).
31
  4. Closing: If the user thanks you or says they're done, give a brief supportive close and invite them to return.
32
 
33
  Rules:
@@ -36,6 +36,7 @@ Rules:
36
  - Keep responses concise and actionable.
37
  - Be supportive and clear. Do not give medical advice.
38
  - If no location has been provided, ask for location before suggesting any facilities.
 
39
  """
40
 
41
 
@@ -146,10 +147,12 @@ def _format_facilities_for_prompt(facilities: list[dict]) -> str:
146
  addr = f.get("address", "")
147
  city = f.get("city", "")
148
  state = f.get("state", "")
149
- phone = f.get("phone", "")
150
  mat = f.get("mat", "")
151
  services = f.get("services", "")
152
- parts = [f"{i}. {name} {desc} Address: {addr}, {city}, {state}. Phone: {phone}. MAT: {mat}. Services: {services}."]
 
 
153
  for key, label in (("payment_options", "Payment"), ("substances_addressed", "Substances"), ("languages", "Languages"), ("populations", "Populations")):
154
  val = f.get(key, "")
155
  if val and str(val).strip():
@@ -158,6 +161,24 @@ def _format_facilities_for_prompt(facilities: list[dict]) -> str:
158
  return "\n".join(lines)
159
 
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  def _detect_facility_mention(text: str, last_results: list[dict]) -> str | None:
162
  """If user is asking about a specific facility, return a name fragment to look up."""
163
  if not last_results or not text or not text.strip():
@@ -206,30 +227,43 @@ class Chatbot:
206
  criteria = state.get("criteria", {})
207
  last_results = state.get("last_results", [])
208
  last_facility_detail = state.get("last_facility_detail")
 
209
 
210
  # Extract criteria from current message and merge
211
  new_criteria = _extract_criteria(message)
212
  criteria = _merge_criteria(criteria, new_criteria)
213
 
214
- # Check if user is asking about a specific facility (follow-up)
215
- facility_mention = _detect_facility_mention(message, last_results)
216
- if facility_mention:
217
- single = get_facility_by_name(facility_mention, self._get_df())
218
- if single:
219
- last_facility_detail = single
220
- context_data = "Current facility data (use ONLY this for your answer):\n" + _format_facilities_for_prompt([single])
221
- else:
222
- context_data = "No matching facility found in data. Say you don't have details for that facility and offer to search again or clarify."
223
  else:
224
- last_facility_detail = None
225
- # Run search when we have at least location
226
- has_location = bool(criteria.get("state") or criteria.get("location"))
227
- if has_location:
228
- results = search(criteria, df=self._get_df(), limit=5)
229
- last_results = results
230
- context_data = "Current facility data (suggest ONLY these; do not invent any other facility):\n" + _format_facilities_for_prompt(results)
 
 
 
 
231
  else:
232
- context_data = "No search has been run yet (user has not provided a location). Ask for state or city, and optionally treatment type, payment, substances, populations, therapies, and languages, before suggesting facilities."
 
 
 
 
 
 
 
 
 
 
233
 
234
  # Build messages for API: system (with context) + history + current user
235
  system_content = SYSTEM_PROMPT + "\n\n" + context_data
@@ -253,5 +287,6 @@ class Chatbot:
253
  "criteria": criteria,
254
  "last_results": last_results,
255
  "last_facility_detail": last_facility_detail,
 
256
  }
257
  return reply, new_state
 
26
 
27
  Conversation flow:
28
  1. Greet / clarify: If the user has not yet given a location (state or city), ask for: (a) state or city, (b) treatment type (inpatient, outpatient, residential, telehealth), (c) payment (insurance, Medicaid/MassHealth, sliding scale, free), and as appropriate: substances they're concerned about (e.g. alcohol, opioids), special populations (veterans, LGBTQ+, adolescents, pregnant women), therapies (e.g. MAT, CBT, 12-step), and languages spoken. Do not search until you have at least a location.
29
+ 2. First results: When you have at least location (and ideally type and payment), present 2–3 facilities by name with 1–2 sentence descriptions using ONLY the data in the "Current facility data" section below. For each facility, always include the phone number when available so the user can call; include address when helpful. Mention relevant attributes (payment, languages, populations, substances, therapies) when they match what the user asked for. Offer to give more details or other options.
30
+ 3. Follow-up: If the user asks about a specific facility (e.g. "Do they offer MAT?" or "Tell me about Boston Medical Center"), answer ONLY from the facility record provided. When they ask how to contact or for details, give the phone number and address from the data. Offer next steps (e.g. "You can call them at [phone]").
31
  4. Closing: If the user thanks you or says they're done, give a brief supportive close and invite them to return.
32
 
33
  Rules:
 
36
  - Keep responses concise and actionable.
37
  - Be supportive and clear. Do not give medical advice.
38
  - If no location has been provided, ask for location before suggesting any facilities.
39
+ - When listing or describing a facility, always include contact info from the data: phone number (when present) and address so the user can reach them.
40
  """
41
 
42
 
 
147
  addr = f.get("address", "")
148
  city = f.get("city", "")
149
  state = f.get("state", "")
150
+ phone = (f.get("phone") or "").strip() or (f.get("phone_number") or "").strip()
151
  mat = f.get("mat", "")
152
  services = f.get("services", "")
153
+ contact = f"Phone: {phone}. " if phone else "(No phone in data). "
154
+ contact += f"Address: {addr}, {city}, {state}." if (addr or city or state) else ""
155
+ parts = [f"{i}. {name} — {desc} Contact: {contact} MAT: {mat}. Services: {services}."]
156
  for key, label in (("payment_options", "Payment"), ("substances_addressed", "Substances"), ("languages", "Languages"), ("populations", "Populations")):
157
  val = f.get(key, "")
158
  if val and str(val).strip():
 
161
  return "\n".join(lines)
162
 
163
 
164
+ def _detect_numeric_facility_selection(text: str, last_results: list[dict]) -> int | None:
165
+ """If user is selecting by number (1, 2, 3, '1.', 'option 1', 'the first one'), return 1-based index or None."""
166
+ if not last_results or not text or not text.strip():
167
+ return None
168
+ text_lower = text.strip().lower()
169
+ # "1", "1.", "option 1", "the first one", "number 1"
170
+ for i in range(1, min(len(last_results) + 1, 10)):
171
+ if text_lower in (str(i), f"{i}.", f"option {i}", f"number {i}"):
172
+ return i
173
+ if i == 1 and text_lower in ("first", "the first", "the first one"):
174
+ return 1
175
+ if i == 2 and text_lower in ("second", "the second one"):
176
+ return 2
177
+ if i == 3 and text_lower in ("third", "the third one"):
178
+ return 3
179
+ return None
180
+
181
+
182
  def _detect_facility_mention(text: str, last_results: list[dict]) -> str | None:
183
  """If user is asking about a specific facility, return a name fragment to look up."""
184
  if not last_results or not text or not text.strip():
 
227
  criteria = state.get("criteria", {})
228
  last_results = state.get("last_results", [])
229
  last_facility_detail = state.get("last_facility_detail")
230
+ selected_facility_name = state.get("selected_facility_name")
231
 
232
  # Extract criteria from current message and merge
233
  new_criteria = _extract_criteria(message)
234
  criteria = _merge_criteria(criteria, new_criteria)
235
 
236
+ # Check if user is selecting by number (e.g. "1.", "2") — use existing last_results, don't re-run search
237
+ num_sel = _detect_numeric_facility_selection(message, last_results)
238
+ if num_sel is not None and 1 <= num_sel <= len(last_results):
239
+ chosen = last_results[num_sel - 1]
240
+ last_facility_detail = chosen
241
+ selected_facility_name = chosen.get("facility_name") or chosen.get("name")
242
+ context_data = "Current facility data (use ONLY this for your answer):\n" + _format_facilities_for_prompt([chosen])
 
 
243
  else:
244
+ # Check if user is asking about a specific facility by name
245
+ facility_mention = _detect_facility_mention(message, last_results)
246
+ if facility_mention:
247
+ single = get_facility_by_name(facility_mention, self._get_df())
248
+ if single:
249
+ last_facility_detail = single
250
+ selected_facility_name = single.get("facility_name") or single.get("name")
251
+ context_data = "Current facility data (use ONLY this for your answer):\n" + _format_facilities_for_prompt([single])
252
+ else:
253
+ context_data = "No matching facility found in data. Say you don't have details for that facility and offer to search again or clarify."
254
+ last_facility_detail = None
255
  else:
256
+ last_facility_detail = None
257
+ selected_facility_name = None
258
+ # Run search when we have at least location
259
+ has_location = bool(criteria.get("state") or criteria.get("location"))
260
+ if has_location:
261
+ results = search(criteria, df=self._get_df(), limit=5)
262
+ last_results = results
263
+ context_data = "Current facility data (suggest ONLY these; do not invent any other facility):\n" + _format_facilities_for_prompt(results)
264
+ else:
265
+ context_data = "No search has been run yet (user has not provided a location). Ask for state or city, and optionally treatment type, payment, substances, populations, therapies, and languages, before suggesting facilities."
266
+ selected_facility_name = state.get("selected_facility_name") # preserve when no search
267
 
268
  # Build messages for API: system (with context) + history + current user
269
  system_content = SYSTEM_PROMPT + "\n\n" + context_data
 
287
  "criteria": criteria,
288
  "last_results": last_results,
289
  "last_facility_detail": last_facility_detail,
290
+ "selected_facility_name": selected_facility_name,
291
  }
292
  return reply, new_state
src/facilities.py CHANGED
@@ -200,6 +200,10 @@ def search(criteria: dict[str, Any], df: pd.DataFrame | None = None, limit: int
200
  return t in svc or t in desc
201
  out = out[out.apply(has_therapy, axis=1)]
202
 
 
 
 
 
203
  out = out.head(limit)
204
  return out.to_dict(orient="records")
205
 
 
200
  return t in svc or t in desc
201
  out = out[out.apply(has_therapy, axis=1)]
202
 
203
+ # Stable order so map pins and model's "1. 2. 3." list match
204
+ sort_cols = [c for c in ("state", "city", "facility_name") if c in out.columns]
205
+ if sort_cols:
206
+ out = out.sort_values(by=sort_cols, na_position="last").reset_index(drop=True)
207
  out = out.head(limit)
208
  return out.to_dict(orient="records")
209