Spaces:

phanny
/

6.C395-chatbot

Sleeping

App Files Files Community

phanny committed on Mar 10

Commit

727ab75

1 Parent(s): 7535bf6

add contacts

Browse files

Files changed (4) hide show

app.py +7 -4
docs/PROJECT_SUMMARY.md +83 -0
src/chat.py +56 -21
src/facilities.py +4 -0

app.py CHANGED Viewed

@@ -432,7 +432,7 @@ def _tuples_to_messages(history):
 def create_demo():
     chatbot = Chatbot()
-    with gr.Blocks(title="SAMHSA Treatment Locator", css=CSS) as demo:
         gr.Markdown("# SAMHSA Treatment Locator")
         gr.Markdown(DESCRIPTION)
         gr.Markdown(f"<div class='disclaimer'>{DISCLAIMER}</div>", elem_classes=["disclaimer"])
@@ -495,12 +495,13 @@ def create_demo():
                 history_tuples = _messages_to_tuples(history)
                 reply, new_state = chatbot.get_response(message, history_tuples, state)
                 new_state = dict(new_state)
-                new_state["selected_facility_name"] = None  # clear selection when new results
                 new_history_tuples = history_tuples + [[message, reply]]
                 new_history_messages = _tuples_to_messages(new_history_tuples)
                 facilities = list(new_state.get("last_results") or [])
-                map_html_out = _build_map_html(facilities, None, update_id, None)
-                return new_history_messages, new_state, "", map_html_out, gr.update(choices=_facility_names(facilities), value=None)
             except Exception as e:
                 err_msg = str(e)[:200]
                 reply = f"Sorry, something went wrong: {err_msg}"
@@ -551,6 +552,8 @@ if __name__ == "__main__":
     demo = create_demo()
     sig = inspect.signature(demo.launch)
     kwargs = {}
     if "theme" in sig.parameters and hasattr(gr, "themes"):
         kwargs["theme"] = gr.themes.Soft(primary_hue="teal", secondary_hue="slate")
     demo.launch(**kwargs)

 def create_demo():
     chatbot = Chatbot()
+    with gr.Blocks(title="SAMHSA Treatment Locator") as demo:
         gr.Markdown("# SAMHSA Treatment Locator")
         gr.Markdown(DESCRIPTION)
         gr.Markdown(f"<div class='disclaimer'>{DISCLAIMER}</div>", elem_classes=["disclaimer"])
                 history_tuples = _messages_to_tuples(history)
                 reply, new_state = chatbot.get_response(message, history_tuples, state)
                 new_state = dict(new_state)
                 new_history_tuples = history_tuples + [[message, reply]]
                 new_history_messages = _tuples_to_messages(new_history_tuples)
                 facilities = list(new_state.get("last_results") or [])
+                sel = new_state.get("selected_facility_name")
+                map_html_out = _build_map_html(facilities, None, update_id, sel)
+                dropdown_value = sel if sel and any(f.get("facility_name") == sel or f.get("name") == sel for f in facilities) else None
+                return new_history_messages, new_state, "", map_html_out, gr.update(choices=_facility_names(facilities), value=dropdown_value)
             except Exception as e:
                 err_msg = str(e)[:200]
                 reply = f"Sorry, something went wrong: {err_msg}"
     demo = create_demo()
     sig = inspect.signature(demo.launch)
     kwargs = {}
+    if "css" in sig.parameters:
+        kwargs["css"] = CSS
     if "theme" in sig.parameters and hasattr(gr, "themes"):
         kwargs["theme"] = gr.themes.Soft(primary_hue="teal", secondary_hue="slate")
     demo.launch(**kwargs)

docs/PROJECT_SUMMARY.md ADDED Viewed

	@@ -0,0 +1,83 @@

+# 6.C395 Chatbot – Project Summary
+## What’s Done
+### 1. Chatbot (SAMHSA Treatment Locator)
+- **Role:** Helps users find substance use and mental health treatment facilities in the US by conversation. No invented facilities; only real data is shown.
+- **Flow (matches `samhsa_chatbot_conversation_example.txt`):**
+  - **Greet / clarify:** Asks for location (state/city), treatment type (inpatient/outpatient/residential/telehealth), payment (Medicaid, insurance, sliding scale, etc.), and optionally substances, populations (veterans, LGBTQ+, adolescents), therapies (MAT, CBT, 12-step), languages.
+  - **First results:** Once there’s at least a location, runs search and presents 2–3 facilities by name with short descriptions from data only.
+  - **Follow-up:** If the user asks about a specific facility (“Do they offer MAT?”), answers from that facility’s record only.
+  - **Closing:** Brief supportive close when the user is done.
+- **Implementation:** `src/chat.py` – `Chatbot` class with criteria extraction (`_extract_criteria`), merge with prior state, search when location is present, and response via Hugging Face Inference API. System prompt enforces “use only provided facility data; never invent names/addresses/phones.”
+### 2. RAG-style retrieval (no hallucination)
+- **Data:** Facility records from SAMHSA (CSV: `data/facilities.csv` or a Hugging Face Dataset when `FACILITIES_DATASET` is set). Scripts: `scripts/download_findtreatment_data.py`, `scripts/ingest_facilities.py`, `scripts/upload_facilities_to_hf.py`.
+- **Search:** `src/facilities.py` – `load_facilities()`, `search(criteria, df, limit)`. Criteria: state, location (city), treatment_type, payment, mat, populations, languages, substances, therapies. Returns list of facility dicts.
+- **RAG pattern:** User message → extract/merge criteria → run `search()` when location exists → format results as “Current facility data” in the system prompt → LLM answers only from that context. Follow-up about a specific facility uses `get_facility_by_name()` and the same “only this data” prompt. So the model is **retrieval-grounded**: it never invents facilities.
+### 3. Evaluation
+- **Script:** `scripts/eval_chatbot.py`.
+- **Scenarios:** 19 scenarios (e.g. “Outpatient, Boston, Medicaid”, “Veterans, Texas”, “Chicago, MAT”) with criteria dicts and sample user messages.
+- **Metrics:**
+  - **Match:** For each scenario, `search(criteria)` is run; each returned facility is checked against criteria (state, treatment type, payment, MAT, populations, languages, substances, therapies). Output: “all match? Y/N” and summary “X/19 runs had all suggested facilities matching criteria.”
+  - **Hallucination (optional):** With `--with-chatbot`, the bot is called per scenario; facility names (and contact info) mentioned in the reply are checked against the dataset. Target: 0 invented facilities.
+- **Output:** Table (default) or CSV: scenario, facilities returned, count, all_match, hallucination (if `--with-chatbot`).
+- **Usage:** `python scripts/eval_chatbot.py` (search + match only); `python scripts/eval_chatbot.py --with-chatbot` (add hallucination check; needs HF token).
+### 4. Maps (Gradio app)
+- **UI:** `app.py` – two-pane layout: **map (left)** + **chat (right)**. Facility dropdown to pick a result; selected facility is highlighted on the map.
+- **Map backends:**
+  - **Google Maps:** When `GOOGLE_MAPS_API_KEY` is set in `.env`, the map is an interactive Google Maps iframe (markers, info windows, optional routes). Requires “Maps JavaScript API” and “Directions API” if you use routes.
+  - **Fallback:** If no key, uses **Folium/OpenStreetMap** (Leaflet): markers, popups, optional route via OSRM.
+- **Behavior:** Geocoding for facility city/state (Nominatim cache); pins for search results; optional route from user location to first facility in Folium mode; selected facility shown with a distinct icon. Map updates when the user sends a new message (new results) or changes the facility dropdown.
+- **Compatibility:** Gradio version differences are handled: Chatbot `type="messages"` is used only if supported; CSS is passed to `launch()` (not `gr.Blocks`) when its signature accepts `css`; `launch()` only gets supported kwargs (e.g. `css`, `theme`).
+### 5. Deployment and repo hygiene
+- **Hugging Face Space:** App runs on CPU; uses HF Inference for the model. Push with `git push huggingface student-version:main` (binary xlsx was removed from history so HF accepts the push). `scripts/push_to_hf.sh` runs that push.
+- **Secrets:** Space needs `HF_TOKEN` (and optionally `GOOGLE_MAPS_API_KEY` if you configure it on the Space).
+- **Data on Space:** Full facility CSV is large; use a Hugging Face Dataset and set `FACILITIES_DATASET` (see `data/README.md`).
+---
+## What’s still in progress / optional
+- **Memo:** 1–2 page memo (Design, Data, Evaluation, Limitations) and optional figure (eval table or dialogue snippet) – not in repo yet.
+- **Eval in memo:** Run `eval_chatbot.py` (with `--with-chatbot` if desired), paste or summarize the table in the memo.
+- **Data story in repo:** Optional short `data/README.md` (or comment in `src/facilities.py`) with source (e.g. N-SUMHSS / National Directory), scope, and limitations for memo reference.
+- **Accessibility / i18n:** Memo can mention keyboard use and clear labels; product is English-only for now.
+- **Future API:** Backend is in `chat.py` + `facilities.py`; a thin FastAPI/Flask `POST /chat` could be added later for a React/Vercel frontend without moving core logic.
+---
+## What could be done next
+| Area | Possible next steps |
+|------|---------------------|
+| **Chatbot** | Fine-tune `MY_MODEL` on example dialogues; extend criteria (e.g. more states/cities); handle “near me” with IP geolocation. |
+| **RAG / data** | Add more facility attributes or filters; use embeddings + vector search instead of (or in addition to) keyword/criteria search; support data/README and dataset versioning. |
+| **Evaluation** | Add more scenarios (e.g. edge cases, non-English-like prompts); automate eval in CI; track metrics over time. |
+| **Maps** | Add user geolocation in the Space; improve mobile layout; optional clustering for many markers. |
+| **UX** | Short “How to use” in the UI; optional language/locale; printable summary of results. |
+| **DevOps** | Optional API for a React app; Docker for local run; pin Gradio version for consistent behavior. |
+---
+## File map
+| Path | Purpose |
+|------|--------|
+| `app.py` | Gradio UI: map + chat, state, examples, disclaimer, theme. |
+| `src/chat.py` | Chatbot: criteria extraction, search, prompt formatting, HF Inference, no hallucination. |
+| `src/facilities.py` | Load CSV or HF Dataset; `search(criteria)`; column mapping. |
+| `scripts/eval_chatbot.py` | 19 scenarios; match check; optional hallucination check; table/CSV output. |
+| `scripts/push_to_hf.sh` | Push `student-version` to HF Space `main`. |
+| `config.py` | `BASE_MODEL`, `MY_MODEL`, `HF_TOKEN`. |
+| `data/README.md` | Data source, HF Dataset usage for Space. |
+| `docs/MEMO.md` | Memo placeholder / notes. |
+| `docs/PROJECT_SUMMARY.md` | This file. |

src/chat.py CHANGED Viewed

@@ -26,8 +26,8 @@ SYSTEM_PROMPT = """You are a supportive, non-judgmental assistant that helps peo
 Conversation flow:
 1. Greet / clarify: If the user has not yet given a location (state or city), ask for: (a) state or city, (b) treatment type (inpatient, outpatient, residential, telehealth), (c) payment (insurance, Medicaid/MassHealth, sliding scale, free), and as appropriate: substances they're concerned about (e.g. alcohol, opioids), special populations (veterans, LGBTQ+, adolescents, pregnant women), therapies (e.g. MAT, CBT, 12-step), and languages spoken. Do not search until you have at least a location.
-2. First results: When you have at least location (and ideally type and payment), present 2–3 facilities by name with 1–2 sentence descriptions using ONLY the data in the "Current facility data" section below. Mention relevant attributes (payment, languages, populations, substances, therapies) when they match what the user asked for. Offer to give more details or other options.
-3. Follow-up: If the user asks about a specific facility (e.g. "Do they offer MAT?" or "Tell me about Boston Medical Center"), answer ONLY from the facility record provided. Offer next steps (e.g. how to contact).
 4. Closing: If the user thanks you or says they're done, give a brief supportive close and invite them to return.
 Rules:
@@ -36,6 +36,7 @@ Rules:
 - Keep responses concise and actionable.
 - Be supportive and clear. Do not give medical advice.
 - If no location has been provided, ask for location before suggesting any facilities.
 """
@@ -146,10 +147,12 @@ def _format_facilities_for_prompt(facilities: list[dict]) -> str:
         addr = f.get("address", "")
         city = f.get("city", "")
         state = f.get("state", "")
-        phone = f.get("phone", "")
         mat = f.get("mat", "")
         services = f.get("services", "")
-        parts = [f"{i}. {name} — {desc} Address: {addr}, {city}, {state}. Phone: {phone}. MAT: {mat}. Services: {services}."]
         for key, label in (("payment_options", "Payment"), ("substances_addressed", "Substances"), ("languages", "Languages"), ("populations", "Populations")):
             val = f.get(key, "")
             if val and str(val).strip():
@@ -158,6 +161,24 @@ def _format_facilities_for_prompt(facilities: list[dict]) -> str:
     return "\n".join(lines)
 def _detect_facility_mention(text: str, last_results: list[dict]) -> str | None:
     """If user is asking about a specific facility, return a name fragment to look up."""
     if not last_results or not text or not text.strip():
@@ -206,30 +227,43 @@ class Chatbot:
         criteria = state.get("criteria", {})
         last_results = state.get("last_results", [])
         last_facility_detail = state.get("last_facility_detail")
         # Extract criteria from current message and merge
         new_criteria = _extract_criteria(message)
         criteria = _merge_criteria(criteria, new_criteria)
-        # Check if user is asking about a specific facility (follow-up)
-        facility_mention = _detect_facility_mention(message, last_results)
-        if facility_mention:
-            single = get_facility_by_name(facility_mention, self._get_df())
-            if single:
-                last_facility_detail = single
-                context_data = "Current facility data (use ONLY this for your answer):\n" + _format_facilities_for_prompt([single])
-            else:
-                context_data = "No matching facility found in data. Say you don't have details for that facility and offer to search again or clarify."
         else:
-            last_facility_detail = None
-            # Run search when we have at least location
-            has_location = bool(criteria.get("state") or criteria.get("location"))
-            if has_location:
-                results = search(criteria, df=self._get_df(), limit=5)
-                last_results = results
-                context_data = "Current facility data (suggest ONLY these; do not invent any other facility):\n" + _format_facilities_for_prompt(results)
             else:
-                context_data = "No search has been run yet (user has not provided a location). Ask for state or city, and optionally treatment type, payment, substances, populations, therapies, and languages, before suggesting facilities."
         # Build messages for API: system (with context) + history + current user
         system_content = SYSTEM_PROMPT + "\n\n" + context_data
@@ -253,5 +287,6 @@ class Chatbot:
             "criteria": criteria,
             "last_results": last_results,
             "last_facility_detail": last_facility_detail,
         }
         return reply, new_state

 Conversation flow:
 1. Greet / clarify: If the user has not yet given a location (state or city), ask for: (a) state or city, (b) treatment type (inpatient, outpatient, residential, telehealth), (c) payment (insurance, Medicaid/MassHealth, sliding scale, free), and as appropriate: substances they're concerned about (e.g. alcohol, opioids), special populations (veterans, LGBTQ+, adolescents, pregnant women), therapies (e.g. MAT, CBT, 12-step), and languages spoken. Do not search until you have at least a location.
+2. First results: When you have at least location (and ideally type and payment), present 2–3 facilities by name with 1–2 sentence descriptions using ONLY the data in the "Current facility data" section below. For each facility, always include the phone number when available so the user can call; include address when helpful. Mention relevant attributes (payment, languages, populations, substances, therapies) when they match what the user asked for. Offer to give more details or other options.
+3. Follow-up: If the user asks about a specific facility (e.g. "Do they offer MAT?" or "Tell me about Boston Medical Center"), answer ONLY from the facility record provided. When they ask how to contact or for details, give the phone number and address from the data. Offer next steps (e.g. "You can call them at [phone]").
 4. Closing: If the user thanks you or says they're done, give a brief supportive close and invite them to return.
 Rules:
 - Keep responses concise and actionable.
 - Be supportive and clear. Do not give medical advice.
 - If no location has been provided, ask for location before suggesting any facilities.
+- When listing or describing a facility, always include contact info from the data: phone number (when present) and address so the user can reach them.
 """
         addr = f.get("address", "")
         city = f.get("city", "")
         state = f.get("state", "")
+        phone = (f.get("phone") or "").strip() or (f.get("phone_number") or "").strip()
         mat = f.get("mat", "")
         services = f.get("services", "")
+        contact = f"Phone: {phone}. " if phone else "(No phone in data). "
+        contact += f"Address: {addr}, {city}, {state}." if (addr or city or state) else ""
+        parts = [f"{i}. {name} — {desc} Contact: {contact} MAT: {mat}. Services: {services}."]
         for key, label in (("payment_options", "Payment"), ("substances_addressed", "Substances"), ("languages", "Languages"), ("populations", "Populations")):
             val = f.get(key, "")
             if val and str(val).strip():
     return "\n".join(lines)
+def _detect_numeric_facility_selection(text: str, last_results: list[dict]) -> int | None:
+    """If user is selecting by number (1, 2, 3, '1.', 'option 1', 'the first one'), return 1-based index or None."""
+    if not last_results or not text or not text.strip():
+        return None
+    text_lower = text.strip().lower()
+    # "1", "1.", "option 1", "the first one", "number 1"
+    for i in range(1, min(len(last_results) + 1, 10)):
+        if text_lower in (str(i), f"{i}.", f"option {i}", f"number {i}"):
+            return i
+        if i == 1 and text_lower in ("first", "the first", "the first one"):
+            return 1
+        if i == 2 and text_lower in ("second", "the second one"):
+            return 2
+        if i == 3 and text_lower in ("third", "the third one"):
+            return 3
+    return None
 def _detect_facility_mention(text: str, last_results: list[dict]) -> str | None:
     """If user is asking about a specific facility, return a name fragment to look up."""
     if not last_results or not text or not text.strip():
         criteria = state.get("criteria", {})
         last_results = state.get("last_results", [])
         last_facility_detail = state.get("last_facility_detail")
+        selected_facility_name = state.get("selected_facility_name")
         # Extract criteria from current message and merge
         new_criteria = _extract_criteria(message)
         criteria = _merge_criteria(criteria, new_criteria)
+        # Check if user is selecting by number (e.g. "1.", "2") — use existing last_results, don't re-run search
+        num_sel = _detect_numeric_facility_selection(message, last_results)
+        if num_sel is not None and 1 <= num_sel <= len(last_results):
+            chosen = last_results[num_sel - 1]
+            last_facility_detail = chosen
+            selected_facility_name = chosen.get("facility_name") or chosen.get("name")
+            context_data = "Current facility data (use ONLY this for your answer):\n" + _format_facilities_for_prompt([chosen])
         else:
+            # Check if user is asking about a specific facility by name
+            facility_mention = _detect_facility_mention(message, last_results)
+            if facility_mention:
+                single = get_facility_by_name(facility_mention, self._get_df())
+                if single:
+                    last_facility_detail = single
+                    selected_facility_name = single.get("facility_name") or single.get("name")
+                    context_data = "Current facility data (use ONLY this for your answer):\n" + _format_facilities_for_prompt([single])
+                else:
+                    context_data = "No matching facility found in data. Say you don't have details for that facility and offer to search again or clarify."
+                    last_facility_detail = None
             else:
+                last_facility_detail = None
+                selected_facility_name = None
+                # Run search when we have at least location
+                has_location = bool(criteria.get("state") or criteria.get("location"))
+                if has_location:
+                    results = search(criteria, df=self._get_df(), limit=5)
+                    last_results = results
+                    context_data = "Current facility data (suggest ONLY these; do not invent any other facility):\n" + _format_facilities_for_prompt(results)
+                else:
+                    context_data = "No search has been run yet (user has not provided a location). Ask for state or city, and optionally treatment type, payment, substances, populations, therapies, and languages, before suggesting facilities."
+                    selected_facility_name = state.get("selected_facility_name")  # preserve when no search
         # Build messages for API: system (with context) + history + current user
         system_content = SYSTEM_PROMPT + "\n\n" + context_data
             "criteria": criteria,
             "last_results": last_results,
             "last_facility_detail": last_facility_detail,
+            "selected_facility_name": selected_facility_name,
         }
         return reply, new_state

src/facilities.py CHANGED Viewed

@@ -200,6 +200,10 @@ def search(criteria: dict[str, Any], df: pd.DataFrame | None = None, limit: int
             return t in svc or t in desc
         out = out[out.apply(has_therapy, axis=1)]
     out = out.head(limit)
     return out.to_dict(orient="records")

             return t in svc or t in desc
         out = out[out.apply(has_therapy, axis=1)]
+    # Stable order so map pins and model's "1. 2. 3." list match
+    sort_cols = [c for c in ("state", "city", "facility_name") if c in out.columns]
+    if sort_cols:
+        out = out.sort_values(by=sort_cols, na_position="last").reset_index(drop=True)
     out = out.head(limit)
     return out.to_dict(orient="records")