JDFPalladium committed on
Commit
35274a7
·
1 Parent(s): 1255a5e

adding idsr define tool and reflecting tweaks to other scripts and notebooks

Browse files
app.py CHANGED
@@ -22,6 +22,7 @@ from chatlib.state_types import AppState
22
  from chatlib.guidlines_rag_agent_li import rag_retrieve
23
  from chatlib.patient_all_data import sql_chain
24
  from chatlib.idsr_check import idsr_check
 
25
  from chatlib.phi_filter import detect_and_redact_phi
26
  from chatlib.assistant_node import assistant
27
 
@@ -52,8 +53,15 @@ def idsr_check_tool(query, sitecode):
52
  "context": result.get("context", None),
53
  }
54
 
 
 
 
 
 
 
 
55
 
56
- tools = [rag_retrieve_tool, sql_chain_tool, idsr_check_tool]
57
  llm_with_tools = llm.bind_tools(tools)
58
 
59
 
@@ -61,11 +69,12 @@ sys_msg = SystemMessage(
61
  content="""
62
  You are a helpful assistant supporting clinicians during patient visits. When a patient ID is provided, the clinician is meeting with that HIV-positive patient and may inquire about their history, lab results, or medications. If no patient ID is provided, the clinician may be asking general HIV clinical questions or presenting symptoms for a new patient.
63
 
64
- You have access to three tools to help you answer the clinician's questions.
65
 
66
- - rag_retrieve: to access HIV clinical guidelines
67
- - sql_chain: to access HIV data about the patient with whom the clinician is meeting. For straightforward factual questions about the patient, you may call sql_chain directly. For questions requiring clinical interpretation or classification, first call rag_retrieve to get relevant clinical guideline context, then include that context when calling sql_chain.
68
- - idsr_check: to check if the patient case description matches any known diseases.
 
69
 
70
  When a tool is needed, respond only with a JSON object specifying the tool to call and its minimal arguments, for example:
71
  {
@@ -107,6 +116,19 @@ For example:
107
  }
108
  }
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  There are only two cases where a tool is not needed:
111
  1. If the clinician's question is a simple greeting, farewell, or acknowledgement.
112
  2. The answer is clearly and completely present in the prior conversation turns.
 
22
  from chatlib.guidlines_rag_agent_li import rag_retrieve
23
  from chatlib.patient_all_data import sql_chain
24
  from chatlib.idsr_check import idsr_check
25
+ from chatlib.idsr_definition import idsr_define
26
  from chatlib.phi_filter import detect_and_redact_phi
27
  from chatlib.assistant_node import assistant
28
 
 
53
  "context": result.get("context", None),
54
  }
55
 
56
def idsr_define_tool(query):
    """Retrieve the official case definition for the disease named in the query."""
    response = idsr_define(query, llm=llm)
    answer = response.get("answer", "")
    return {
        "answer": answer,
        "last_tool": "idsr_define",
    }
63
 
64
# Register all four agent tools and bind them to the LLM so it can emit
# structured tool calls for any of them during a turn.
tools = [rag_retrieve_tool, sql_chain_tool, idsr_check_tool, idsr_define_tool]
llm_with_tools = llm.bind_tools(tools)
66
 
67
 
 
69
  content="""
70
  You are a helpful assistant supporting clinicians during patient visits. When a patient ID is provided, the clinician is meeting with that HIV-positive patient and may inquire about their history, lab results, or medications. If no patient ID is provided, the clinician may be asking general HIV clinical questions or presenting symptoms for a new patient.
71
 
72
+ You have access to four tools to help you answer the clinician's questions.
73
 
74
+ - rag_retrieve_tool: to access HIV clinical guidelines
75
+ - sql_chain_tool: to access HIV data about the patient with whom the clinician is meeting. For straightforward factual questions about the patient, you may call sql_chain directly. For questions requiring clinical interpretation or classification, first call rag_retrieve to get relevant clinical guideline context, then include that context when calling sql_chain.
76
+ - idsr_check_tool: to check if the patient case description matches any known diseases.
77
+ - idsr_define_tool: to retrieve the official case definition of a disease when the clinician asks about it (e.g., “What is the description of cholera?”). Do not use this tool for analyzing symptom descriptions — use `idsr_check_tool` for that.
78
 
79
  When a tool is needed, respond only with a JSON object specifying the tool to call and its minimal arguments, for example:
80
  {
 
116
  }
117
  }
118
 
119
+ When calling the "idsr_define_tool" tool, always include the following arguments in the JSON response:
120
+
121
+ - "query": the clinician's question
122
+
123
+ For example:
124
+
125
+ {
126
+ "tool": "idsr_define_tool",
127
+ "args": {
128
+ "query": "What is the description of cholera?"
129
+ }
130
+ }
131
+
132
  There are only two cases where a tool is not needed:
133
  1. If the clinician's question is a simple greeting, farewell, or acknowledgement.
134
  2. The answer is clearly and completely present in the prior conversation turns.
chatlib/idsr_definition.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.prompts import ChatPromptTemplate
2
+ from langchain_core.output_parsers import PydanticOutputParser
3
+ from pydantic import BaseModel, Field
4
+ from typing import Optional
5
+ from langchain_core.documents import Document
6
+ import json
7
+
8
# Load the pre-tagged IDSR disease documents once, at module import time.
# NOTE(review): the path is relative to the process working directory —
# confirm the app is always launched from the project root.
with open("./data/processed/tagged_documents.json", "r", encoding="utf-8") as f:
    doc_dicts = json.load(f)

# Rehydrate the serialized dicts into LangChain Document objects; each is
# expected to carry a "disease_name" key in its metadata (used below) —
# TODO confirm against the tagging pipeline.
tagged_documents = [Document(**d) for d in doc_dicts]
12
+
13
# Structured-output schema for the LLM disease-selection step; parsed by
# PydanticOutputParser in select_disease_from_query. (Deliberately no class
# docstring: pydantic folds docstrings into the JSON schema embedded in the
# format instructions sent to the LLM.)
class DiseaseSelectionOutput(BaseModel):
    # None/null signals "no confident match" rather than forcing a guess.
    disease_name: Optional[str] = Field(
        description="The most likely disease the user is asking about, or null if no match is confident"
    )
18
+
19
def select_disease_from_query(query: str, llm, tagged_docs: list[Document]) -> Optional[str]:
    """Ask the LLM which IDSR disease the clinician's query refers to.

    Args:
        query: The clinician's free-text question.
        llm: A LangChain-compatible chat model used for the selection call.
        tagged_docs: Documents whose metadata carries a "disease_name" key.

    Returns:
        The selected disease name, or None when no confident match exists
        (the model is instructed to emit null in that case).
    """
    # Skip documents with a missing/empty disease_name so the prompt never
    # shows a literal "- None" option to the model.
    disease_names = [
        name for doc in tagged_docs if (name := doc.metadata.get("disease_name"))
    ]
    disease_list = "\n".join(f"- {name}" for name in disease_names)

    parser = PydanticOutputParser(pydantic_object=DiseaseSelectionOutput)

    prompt = ChatPromptTemplate.from_template(
        """
You are helping a clinician retrieve a disease definition from a list of IDSR diseases.

Given the following query:
"{query}"

Select the single disease from the list below that the query most likely refers to.

List of available diseases:
{disease_list}

If no match is clearly appropriate, set "disease_name" to null.

{format_instructions}
"""
    )

    # Prompt -> model -> structured parse; raises if the model output does
    # not validate against DiseaseSelectionOutput.
    chain = prompt | llm | parser
    output = chain.invoke({
        "query": query,
        "disease_list": disease_list,
        "format_instructions": parser.get_format_instructions(),
    })

    return output.disease_name
51
+
52
def idsr_define(query: str, llm) -> dict:
    """Answer a clinician's question about an IDSR disease case definition.

    Selects the disease the query refers to, looks up its tagged case
    definition, and asks the LLM to phrase a grounded answer.

    Args:
        query: The clinician's free-text question (e.g. "What is cholera?").
        llm: A LangChain-compatible chat model used to phrase the answer.

    Returns:
        A dict with a single "answer" key containing the response text.
    """
    disease_name = select_disease_from_query(query, llm, tagged_documents)

    if not disease_name:
        return {
            "answer": "Sorry, I couldn't find a clear match for that disease. Please rephrase or try a different name."
        }

    # Locate the document tagged with the selected disease (None if absent).
    match = next(
        (
            doc
            for doc in tagged_documents
            if doc.metadata.get("disease_name") == disease_name
        ),
        None,
    )
    if match is None:
        # Defensive: the selector returned a name not present in the corpus.
        # (Fixed: original used a redundant f-prefix on this literal.)
        return {
            "answer": "Sorry, no case definition was found for the selected disease."
        }

    definition = match.page_content.strip()

    # Use the LLM to turn the raw case definition into a direct answer.
    prompt = f"""
You are a medical assistant helping a clinician understand disease case definitions.

Here is a user query:
"{query}"

Here is the official case definition for the relevant disease:
"{definition}"

Based on the case definition, answer the user query clearly and concisely. Do not speculate beyond the information provided.
"""
    llm_response = llm.invoke(prompt)

    return {
        "answer": llm_response.content.strip()
    }
chatlib/patient_all_data.py CHANGED
@@ -172,6 +172,9 @@ def sql_chain(query: str, llm, rag_result: str, pk_hash: str) -> dict:
172
  except (ValueError, TypeError):
173
  return "invalid date"
174
 
 
 
 
175
  row = df.iloc[0]
176
  summary = (
177
  f"Sex: {safe(row['Sex'])}\n"
@@ -180,7 +183,7 @@ def sql_chain(query: str, llm, rag_result: str, pk_hash: str) -> dict:
180
  f"Occupation: {safe(row['Occupation'])}\n"
181
  f"OnIPT: {safe(row['OnIPT'])}\n"
182
  f"ARTOutcomeDescription: {safe(row['ARTOutcomeDescription'])}\n"
183
- f"StartARTDate: {safe(row['StartARTDate'])}\n"
184
  f"Age: {calculate_age(safe(row['DOB']))}"
185
  )
186
  return summary
 
172
  except (ValueError, TypeError):
173
  return "invalid date"
174
 
175
+ df = df.copy()
176
+ df["StartARTDate"] = pd.to_datetime(df["StartARTDate"], errors="coerce")
177
+
178
  row = df.iloc[0]
179
  summary = (
180
  f"Sex: {safe(row['Sex'])}\n"
 
183
  f"Occupation: {safe(row['Occupation'])}\n"
184
  f"OnIPT: {safe(row['OnIPT'])}\n"
185
  f"ARTOutcomeDescription: {safe(row['ARTOutcomeDescription'])}\n"
186
+ f"StartARTDate: {describe_relative_date(row['StartARTDate'])}\n"
187
  f"Age: {calculate_age(safe(row['DOB']))}"
188
  )
189
  return summary
notebooks/create_patient_db.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "id": "ddb26634",
7
  "metadata": {},
8
  "outputs": [],
@@ -10,7 +10,7 @@
10
  "import sqlite3\n",
11
  "import pandas as pd\n",
12
  "# inspect current database schema\n",
13
- "conn = sqlite3.connect('patient_slim.sqlite')\n",
14
  "cursor = conn.cursor()\n",
15
  "# list tables\n",
16
  "# pull all data from the visits table \n",
@@ -22,19 +22,41 @@
22
  },
23
  {
24
  "cell_type": "code",
25
- "execution_count": 2,
26
  "id": "cd4faa4b",
27
  "metadata": {},
28
  "outputs": [],
29
  "source": [
30
  "# let's create a new sqlite database called patient_demonstration.sqlite\n",
31
- "conn = sqlite3.connect('patient_demonstration.sqlite')\n",
32
  "cursor = conn.cursor() "
33
  ]
34
  },
35
  {
36
  "cell_type": "code",
37
  "execution_count": 3,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  "id": "f8547b78",
39
  "metadata": {},
40
  "outputs": [],
@@ -82,17 +104,17 @@
82
  },
83
  {
84
  "cell_type": "code",
85
- "execution_count": 4,
86
  "id": "9ddfa626",
87
  "metadata": {},
88
  "outputs": [
89
  {
90
  "data": {
91
  "text/plain": [
92
- "<sqlite3.Cursor at 0x7d4240c3d840>"
93
  ]
94
  },
95
- "execution_count": 4,
96
  "metadata": {},
97
  "output_type": "execute_result"
98
  }
@@ -110,7 +132,7 @@
110
  },
111
  {
112
  "cell_type": "code",
113
- "execution_count": 5,
114
  "id": "d14ef687",
115
  "metadata": {},
116
  "outputs": [],
@@ -177,12 +199,12 @@
177
  },
178
  {
179
  "cell_type": "code",
180
- "execution_count": 6,
181
  "id": "6e27bce5",
182
  "metadata": {},
183
  "outputs": [],
184
  "source": [
185
- "conn = sqlite3.connect('patient_slim.sqlite')\n",
186
  "cursor = conn.cursor()\n",
187
  "# pull all data from the lab table except for the \"key\" column \n",
188
  "cursor.execute(\"SELECT * FROM lab;\")\n",
@@ -193,19 +215,19 @@
193
  },
194
  {
195
  "cell_type": "code",
196
- "execution_count": 7,
197
  "id": "14402e96",
198
  "metadata": {},
199
  "outputs": [],
200
  "source": [
201
  "# let's create a new sqlite database called patient_demonstration.sqlite\n",
202
- "conn = sqlite3.connect('patient_demonstration.sqlite')\n",
203
  "cursor = conn.cursor() "
204
  ]
205
  },
206
  {
207
  "cell_type": "code",
208
- "execution_count": 8,
209
  "id": "540962b7",
210
  "metadata": {},
211
  "outputs": [],
@@ -235,7 +257,7 @@
235
  },
236
  {
237
  "cell_type": "code",
238
- "execution_count": 9,
239
  "id": "8df7171e",
240
  "metadata": {},
241
  "outputs": [],
@@ -260,12 +282,12 @@
260
  },
261
  {
262
  "cell_type": "code",
263
- "execution_count": 10,
264
  "id": "b66d3dbb",
265
  "metadata": {},
266
  "outputs": [],
267
  "source": [
268
- "conn = sqlite3.connect('patient_slim.sqlite')\n",
269
  "cursor = conn.cursor()\n",
270
  "# pull all data from the lab table except for the \"key\" column \n",
271
  "cursor.execute(\"SELECT * FROM pharmacy;\")\n",
@@ -276,19 +298,19 @@
276
  },
277
  {
278
  "cell_type": "code",
279
- "execution_count": 11,
280
  "id": "435b8d4e",
281
  "metadata": {},
282
  "outputs": [],
283
  "source": [
284
  "# let's create a new sqlite database called patient_demonstration.sqlite\n",
285
- "conn = sqlite3.connect('patient_demonstration.sqlite')\n",
286
  "cursor = conn.cursor() "
287
  ]
288
  },
289
  {
290
  "cell_type": "code",
291
- "execution_count": 12,
292
  "id": "b3753eeb",
293
  "metadata": {},
294
  "outputs": [],
@@ -322,7 +344,7 @@
322
  },
323
  {
324
  "cell_type": "code",
325
- "execution_count": 13,
326
  "id": "8b8ed08a",
327
  "metadata": {},
328
  "outputs": [],
@@ -348,12 +370,12 @@
348
  },
349
  {
350
  "cell_type": "code",
351
- "execution_count": 14,
352
  "id": "2de65432",
353
  "metadata": {},
354
  "outputs": [],
355
  "source": [
356
- "conn = sqlite3.connect('patient_slim.sqlite')\n",
357
  "cursor = conn.cursor()\n",
358
  "# pull all data from the lab table except for the \"key\" column \n",
359
  "cursor.execute(\"SELECT * FROM demographics;\")\n",
@@ -364,19 +386,454 @@
364
  },
365
  {
366
  "cell_type": "code",
367
- "execution_count": 15,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  "id": "a7a10f4f",
369
  "metadata": {},
370
  "outputs": [],
371
  "source": [
372
  "# let's create a new sqlite database called patient_demonstration.sqlite\n",
373
- "conn = sqlite3.connect('patient_demonstration.sqlite')\n",
374
  "cursor = conn.cursor() "
375
  ]
376
  },
377
  {
378
  "cell_type": "code",
379
- "execution_count": 16,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  "id": "947c63d5",
381
  "metadata": {},
382
  "outputs": [],
@@ -386,6 +843,7 @@
386
  "cursor.execute('DROP TABLE IF EXISTS demographics;')\n",
387
  "cursor.execute('''\n",
388
  "CREATE TABLE demographics (\n",
 
389
  " PatientPKHash TEXT,\n",
390
  " MFLCode TEXT,\n",
391
  " FacilityName TEXT,\n",
@@ -403,14 +861,13 @@
403
  " AsOfDate TEXT,\n",
404
  " LoadDate TEXT,\n",
405
  " StartARTDate TEXT,\n",
406
- " DOB TEXT,\n",
407
- " key TEXT\n",
408
  ");\n",
409
  "''')\n",
410
  "\n",
411
  "# let's now populate the table with the rows variable that contains all the data from the visits table\n",
412
  "cursor.executemany('''\n",
413
- "INSERT INTO demographics (PatientPKHash, MFLCode, FacilityName, County, SubCounty, PartnerName, AgencyName, Sex, MaritalStatus, EducationLevel, Occupation, OnIPT, AgeGroup, ARTOutcomeDescription, AsOfDate, LoadDate, StartARTDate, DOB, key)\n",
414
  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);\n",
415
  "''', rows)\n",
416
  "conn.commit()"
@@ -418,7 +875,7 @@
418
  },
419
  {
420
  "cell_type": "code",
421
- "execution_count": 17,
422
  "id": "9cff0d90",
423
  "metadata": {},
424
  "outputs": [],
@@ -458,7 +915,7 @@
458
  ],
459
  "metadata": {
460
  "kernelspec": {
461
- "display_name": ".venv",
462
  "language": "python",
463
  "name": "python3"
464
  },
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 19,
6
  "id": "ddb26634",
7
  "metadata": {},
8
  "outputs": [],
 
10
  "import sqlite3\n",
11
  "import pandas as pd\n",
12
  "# inspect current database schema\n",
13
+ "conn = sqlite3.connect('../data/raw/patient_slim.sqlite')\n",
14
  "cursor = conn.cursor()\n",
15
  "# list tables\n",
16
  "# pull all data from the visits table \n",
 
22
  },
23
  {
24
  "cell_type": "code",
25
+ "execution_count": 20,
26
  "id": "cd4faa4b",
27
  "metadata": {},
28
  "outputs": [],
29
  "source": [
30
  "# let's create a new sqlite database called patient_demonstration.sqlite\n",
31
+ "conn = sqlite3.connect('../data/processed/patient_demonstration.sqlite')\n",
32
  "cursor = conn.cursor() "
33
  ]
34
  },
35
  {
36
  "cell_type": "code",
37
  "execution_count": 3,
38
+ "id": "866e707d",
39
+ "metadata": {},
40
+ "outputs": [
41
+ {
42
+ "name": "stdout",
43
+ "output_type": "stream",
44
+ "text": [
45
+ "(271, 25)\n"
46
+ ]
47
+ }
48
+ ],
49
+ "source": [
50
+ "# extract everything from the visits table\n",
51
+ "cursor.execute(\"SELECT * FROM clinical_visits;\")\n",
52
+ "rows = cursor.fetchall()\n",
53
+ "visits_df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])\n",
54
+ "print(visits_df.shape)"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 4,
60
  "id": "f8547b78",
61
  "metadata": {},
62
  "outputs": [],
 
104
  },
105
  {
106
  "cell_type": "code",
107
+ "execution_count": 5,
108
  "id": "9ddfa626",
109
  "metadata": {},
110
  "outputs": [
111
  {
112
  "data": {
113
  "text/plain": [
114
+ "<sqlite3.Cursor at 0x71e4721907c0>"
115
  ]
116
  },
117
+ "execution_count": 5,
118
  "metadata": {},
119
  "output_type": "execute_result"
120
  }
 
132
  },
133
  {
134
  "cell_type": "code",
135
+ "execution_count": 6,
136
  "id": "d14ef687",
137
  "metadata": {},
138
  "outputs": [],
 
199
  },
200
  {
201
  "cell_type": "code",
202
+ "execution_count": 7,
203
  "id": "6e27bce5",
204
  "metadata": {},
205
  "outputs": [],
206
  "source": [
207
+ "conn = sqlite3.connect('../data/raw/patient_slim.sqlite')\n",
208
  "cursor = conn.cursor()\n",
209
  "# pull all data from the lab table except for the \"key\" column \n",
210
  "cursor.execute(\"SELECT * FROM lab;\")\n",
 
215
  },
216
  {
217
  "cell_type": "code",
218
+ "execution_count": 8,
219
  "id": "14402e96",
220
  "metadata": {},
221
  "outputs": [],
222
  "source": [
223
  "# let's create a new sqlite database called patient_demonstration.sqlite\n",
224
+ "conn = sqlite3.connect('../data/processed/patient_demonstration.sqlite')\n",
225
  "cursor = conn.cursor() "
226
  ]
227
  },
228
  {
229
  "cell_type": "code",
230
+ "execution_count": 9,
231
  "id": "540962b7",
232
  "metadata": {},
233
  "outputs": [],
 
257
  },
258
  {
259
  "cell_type": "code",
260
+ "execution_count": 10,
261
  "id": "8df7171e",
262
  "metadata": {},
263
  "outputs": [],
 
282
  },
283
  {
284
  "cell_type": "code",
285
+ "execution_count": 11,
286
  "id": "b66d3dbb",
287
  "metadata": {},
288
  "outputs": [],
289
  "source": [
290
+ "conn = sqlite3.connect('../data/raw/patient_slim.sqlite')\n",
291
  "cursor = conn.cursor()\n",
292
  "# pull all data from the lab table except for the \"key\" column \n",
293
  "cursor.execute(\"SELECT * FROM pharmacy;\")\n",
 
298
  },
299
  {
300
  "cell_type": "code",
301
+ "execution_count": 12,
302
  "id": "435b8d4e",
303
  "metadata": {},
304
  "outputs": [],
305
  "source": [
306
  "# let's create a new sqlite database called patient_demonstration.sqlite\n",
307
+ "conn = sqlite3.connect('../data/processed/patient_demonstration.sqlite')\n",
308
  "cursor = conn.cursor() "
309
  ]
310
  },
311
  {
312
  "cell_type": "code",
313
+ "execution_count": 13,
314
  "id": "b3753eeb",
315
  "metadata": {},
316
  "outputs": [],
 
344
  },
345
  {
346
  "cell_type": "code",
347
+ "execution_count": 14,
348
  "id": "8b8ed08a",
349
  "metadata": {},
350
  "outputs": [],
 
370
  },
371
  {
372
  "cell_type": "code",
373
+ "execution_count": 24,
374
  "id": "2de65432",
375
  "metadata": {},
376
  "outputs": [],
377
  "source": [
378
+ "conn = sqlite3.connect('../data/raw/patient_slim.sqlite')\n",
379
  "cursor = conn.cursor()\n",
380
  "# pull all data from the lab table except for the \"key\" column \n",
381
  "cursor.execute(\"SELECT * FROM demographics;\")\n",
 
386
  },
387
  {
388
  "cell_type": "code",
389
+ "execution_count": 27,
390
+ "id": "f3a11ac1",
391
+ "metadata": {},
392
+ "outputs": [
393
+ {
394
+ "data": {
395
+ "text/html": [
396
+ "<div>\n",
397
+ "<style scoped>\n",
398
+ " .dataframe tbody tr th:only-of-type {\n",
399
+ " vertical-align: middle;\n",
400
+ " }\n",
401
+ "\n",
402
+ " .dataframe tbody tr th {\n",
403
+ " vertical-align: top;\n",
404
+ " }\n",
405
+ "\n",
406
+ " .dataframe thead th {\n",
407
+ " text-align: right;\n",
408
+ " }\n",
409
+ "</style>\n",
410
+ "<table border=\"1\" class=\"dataframe\">\n",
411
+ " <thead>\n",
412
+ " <tr style=\"text-align: right;\">\n",
413
+ " <th></th>\n",
414
+ " <th>key</th>\n",
415
+ " <th>PatientPKHash</th>\n",
416
+ " <th>MFLCode</th>\n",
417
+ " <th>FacilityName</th>\n",
418
+ " <th>County</th>\n",
419
+ " <th>SubCounty</th>\n",
420
+ " <th>PartnerName</th>\n",
421
+ " <th>AgencyName</th>\n",
422
+ " <th>Sex</th>\n",
423
+ " <th>MaritalStatus</th>\n",
424
+ " <th>EducationLevel</th>\n",
425
+ " <th>Occupation</th>\n",
426
+ " <th>OnIPT</th>\n",
427
+ " <th>AgeGroup</th>\n",
428
+ " <th>ARTOutcomeDescription</th>\n",
429
+ " <th>AsOfDate</th>\n",
430
+ " <th>LoadDate</th>\n",
431
+ " <th>StartARTDate</th>\n",
432
+ " <th>DOB</th>\n",
433
+ " </tr>\n",
434
+ " </thead>\n",
435
+ " <tbody>\n",
436
+ " <tr>\n",
437
+ " <th>0</th>\n",
438
+ " <td>07149C6735AA9A2B3EFB198A5DB19825E3DA3DBCDE8CB8...</td>\n",
439
+ " <td>3</td>\n",
440
+ " <td>13703</td>\n",
441
+ " <td>Kisii Teaching and Referral Hospital (Level 6)</td>\n",
442
+ " <td>Kisii</td>\n",
443
+ " <td>Kitutu Chache South</td>\n",
444
+ " <td>LVCT Vukisha 95</td>\n",
445
+ " <td>CDC</td>\n",
446
+ " <td>Female</td>\n",
447
+ " <td>Single</td>\n",
448
+ " <td>NULL</td>\n",
449
+ " <td>NULL</td>\n",
450
+ " <td>NULL</td>\n",
451
+ " <td>NULL</td>\n",
452
+ " <td>LOST IN HMIS</td>\n",
453
+ " <td>20088</td>\n",
454
+ " <td>20161</td>\n",
455
+ " <td>2012-04-12 00:00:00.000</td>\n",
456
+ " <td>2010-05-10 00:00:00.000</td>\n",
457
+ " </tr>\n",
458
+ " <tr>\n",
459
+ " <th>1</th>\n",
460
+ " <td>290D316E1B41A21F58E780026971F4D86DBB3BF043A77B...</td>\n",
461
+ " <td>4</td>\n",
462
+ " <td>13028</td>\n",
463
+ " <td>Kibera Community Health Centre - Amref</td>\n",
464
+ " <td>Nairobi</td>\n",
465
+ " <td>Kibra</td>\n",
466
+ " <td>CIHEB CONNECT</td>\n",
467
+ " <td>CDC</td>\n",
468
+ " <td>Female</td>\n",
469
+ " <td>MARRIED MONOGAMOUS</td>\n",
470
+ " <td>SECONDARY</td>\n",
471
+ " <td>Trader</td>\n",
472
+ " <td>NULL</td>\n",
473
+ " <td>NULL</td>\n",
474
+ " <td>ACTIVE</td>\n",
475
+ " <td>20088</td>\n",
476
+ " <td>20161</td>\n",
477
+ " <td>2009-05-12 00:00:00.000</td>\n",
478
+ " <td>1970-08-25 00:00:00.000</td>\n",
479
+ " </tr>\n",
480
+ " <tr>\n",
481
+ " <th>2</th>\n",
482
+ " <td>45889B18F2C615A78371E1DAFC2680C0A36284C6195885...</td>\n",
483
+ " <td>9</td>\n",
484
+ " <td>15834</td>\n",
485
+ " <td>Busia County Referral Hospital</td>\n",
486
+ " <td>Busia</td>\n",
487
+ " <td>Matayos</td>\n",
488
+ " <td>USAID Dumisha Afya</td>\n",
489
+ " <td>USAID</td>\n",
490
+ " <td>Female</td>\n",
491
+ " <td>NULL</td>\n",
492
+ " <td>NULL</td>\n",
493
+ " <td>NULL</td>\n",
494
+ " <td>NULL</td>\n",
495
+ " <td>NULL</td>\n",
496
+ " <td>ACTIVE</td>\n",
497
+ " <td>20088</td>\n",
498
+ " <td>20161</td>\n",
499
+ " <td>2014-08-12 00:00:00.000</td>\n",
500
+ " <td>1972-04-13 00:00:00.000</td>\n",
501
+ " </tr>\n",
502
+ " <tr>\n",
503
+ " <th>3</th>\n",
504
+ " <td>9C9BFF8365B05D99D4F6A62716DD1353875B8A9280A772...</td>\n",
505
+ " <td>7</td>\n",
506
+ " <td>14831</td>\n",
507
+ " <td>Kericho District Hospital</td>\n",
508
+ " <td>Kericho</td>\n",
509
+ " <td>Ainamoi</td>\n",
510
+ " <td>HJF-South Rift Valley</td>\n",
511
+ " <td>DOD</td>\n",
512
+ " <td>Female</td>\n",
513
+ " <td>MARRIED MONOGAMOUS</td>\n",
514
+ " <td>NULL</td>\n",
515
+ " <td>Trader</td>\n",
516
+ " <td>NULL</td>\n",
517
+ " <td>NULL</td>\n",
518
+ " <td>ACTIVE</td>\n",
519
+ " <td>20088</td>\n",
520
+ " <td>20161</td>\n",
521
+ " <td>2023-05-10 00:00:00.000</td>\n",
522
+ " <td>1989-06-15 00:00:00.000</td>\n",
523
+ " </tr>\n",
524
+ " <tr>\n",
525
+ " <th>4</th>\n",
526
+ " <td>A51AEA4EC14F999A52AF53B4B531F760992ADA406B620C...</td>\n",
527
+ " <td>1</td>\n",
528
+ " <td>11259</td>\n",
529
+ " <td>Bomu Medical Centre (Likoni)</td>\n",
530
+ " <td>Mombasa</td>\n",
531
+ " <td>Likoni</td>\n",
532
+ " <td>Mkomani Clinic society</td>\n",
533
+ " <td>CDC</td>\n",
534
+ " <td>Female</td>\n",
535
+ " <td>MARRIED MONOGAMOUS</td>\n",
536
+ " <td>PRIMARY</td>\n",
537
+ " <td>Trader</td>\n",
538
+ " <td>NULL</td>\n",
539
+ " <td>NULL</td>\n",
540
+ " <td>UNDOCUMENTED LOSS</td>\n",
541
+ " <td>20088</td>\n",
542
+ " <td>20161</td>\n",
543
+ " <td>2018-05-22 00:00:00.000</td>\n",
544
+ " <td>1995-05-21 00:00:00.000</td>\n",
545
+ " </tr>\n",
546
+ " </tbody>\n",
547
+ "</table>\n",
548
+ "</div>"
549
+ ],
550
+ "text/plain": [
551
+ " key PatientPKHash MFLCode \\\n",
552
+ "0 07149C6735AA9A2B3EFB198A5DB19825E3DA3DBCDE8CB8... 3 13703 \n",
553
+ "1 290D316E1B41A21F58E780026971F4D86DBB3BF043A77B... 4 13028 \n",
554
+ "2 45889B18F2C615A78371E1DAFC2680C0A36284C6195885... 9 15834 \n",
555
+ "3 9C9BFF8365B05D99D4F6A62716DD1353875B8A9280A772... 7 14831 \n",
556
+ "4 A51AEA4EC14F999A52AF53B4B531F760992ADA406B620C... 1 11259 \n",
557
+ "\n",
558
+ " FacilityName County \\\n",
559
+ "0 Kisii Teaching and Referral Hospital (Level 6) Kisii \n",
560
+ "1 Kibera Community Health Centre - Amref Nairobi \n",
561
+ "2 Busia County Referral Hospital Busia \n",
562
+ "3 Kericho District Hospital Kericho \n",
563
+ "4 Bomu Medical Centre (Likoni) Mombasa \n",
564
+ "\n",
565
+ " SubCounty PartnerName AgencyName Sex \\\n",
566
+ "0 Kitutu Chache South LVCT Vukisha 95 CDC Female \n",
567
+ "1 Kibra CIHEB CONNECT CDC Female \n",
568
+ "2 Matayos USAID Dumisha Afya USAID Female \n",
569
+ "3 Ainamoi HJF-South Rift Valley DOD Female \n",
570
+ "4 Likoni Mkomani Clinic society CDC Female \n",
571
+ "\n",
572
+ " MaritalStatus EducationLevel Occupation OnIPT AgeGroup \\\n",
573
+ "0 Single NULL NULL NULL NULL \n",
574
+ "1 MARRIED MONOGAMOUS SECONDARY Trader NULL NULL \n",
575
+ "2 NULL NULL NULL NULL NULL \n",
576
+ "3 MARRIED MONOGAMOUS NULL Trader NULL NULL \n",
577
+ "4 MARRIED MONOGAMOUS PRIMARY Trader NULL NULL \n",
578
+ "\n",
579
+ " ARTOutcomeDescription AsOfDate LoadDate StartARTDate \\\n",
580
+ "0 LOST IN HMIS 20088 20161 2012-04-12 00:00:00.000 \n",
581
+ "1 ACTIVE 20088 20161 2009-05-12 00:00:00.000 \n",
582
+ "2 ACTIVE 20088 20161 2014-08-12 00:00:00.000 \n",
583
+ "3 ACTIVE 20088 20161 2023-05-10 00:00:00.000 \n",
584
+ "4 UNDOCUMENTED LOSS 20088 20161 2018-05-22 00:00:00.000 \n",
585
+ "\n",
586
+ " DOB \n",
587
+ "0 2010-05-10 00:00:00.000 \n",
588
+ "1 1970-08-25 00:00:00.000 \n",
589
+ "2 1972-04-13 00:00:00.000 \n",
590
+ "3 1989-06-15 00:00:00.000 \n",
591
+ "4 1995-05-21 00:00:00.000 "
592
+ ]
593
+ },
594
+ "execution_count": 27,
595
+ "metadata": {},
596
+ "output_type": "execute_result"
597
+ }
598
+ ],
599
+ "source": [
600
+ "df.head()"
601
+ ]
602
+ },
603
+ {
604
+ "cell_type": "code",
605
+ "execution_count": 30,
606
  "id": "a7a10f4f",
607
  "metadata": {},
608
  "outputs": [],
609
  "source": [
610
  "# let's create a new sqlite database called patient_demonstration.sqlite\n",
611
+ "conn = sqlite3.connect('../data/processed/patient_demonstration.sqlite')\n",
612
  "cursor = conn.cursor() "
613
  ]
614
  },
615
  {
616
  "cell_type": "code",
617
+ "execution_count": 32,
618
+ "id": "07296631",
619
+ "metadata": {},
620
+ "outputs": [
621
+ {
622
+ "data": {
623
+ "text/html": [
624
+ "<div>\n",
625
+ "<style scoped>\n",
626
+ " .dataframe tbody tr th:only-of-type {\n",
627
+ " vertical-align: middle;\n",
628
+ " }\n",
629
+ "\n",
630
+ " .dataframe tbody tr th {\n",
631
+ " vertical-align: top;\n",
632
+ " }\n",
633
+ "\n",
634
+ " .dataframe thead th {\n",
635
+ " text-align: right;\n",
636
+ " }\n",
637
+ "</style>\n",
638
+ "<table border=\"1\" class=\"dataframe\">\n",
639
+ " <thead>\n",
640
+ " <tr style=\"text-align: right;\">\n",
641
+ " <th></th>\n",
642
+ " <th>key</th>\n",
643
+ " <th>PatientPKHash</th>\n",
644
+ " <th>MFLCode</th>\n",
645
+ " <th>FacilityName</th>\n",
646
+ " <th>County</th>\n",
647
+ " <th>SubCounty</th>\n",
648
+ " <th>PartnerName</th>\n",
649
+ " <th>AgencyName</th>\n",
650
+ " <th>Sex</th>\n",
651
+ " <th>MaritalStatus</th>\n",
652
+ " <th>EducationLevel</th>\n",
653
+ " <th>Occupation</th>\n",
654
+ " <th>OnIPT</th>\n",
655
+ " <th>AgeGroup</th>\n",
656
+ " <th>ARTOutcomeDescription</th>\n",
657
+ " <th>AsOfDate</th>\n",
658
+ " <th>LoadDate</th>\n",
659
+ " <th>StartARTDate</th>\n",
660
+ " <th>DOB</th>\n",
661
+ " </tr>\n",
662
+ " </thead>\n",
663
+ " <tbody>\n",
664
+ " <tr>\n",
665
+ " <th>0</th>\n",
666
+ " <td>07149C6735AA9A2B3EFB198A5DB19825E3DA3DBCDE8CB8...</td>\n",
667
+ " <td>3</td>\n",
668
+ " <td>13703</td>\n",
669
+ " <td>Kisii Teaching and Referral Hospital (Level 6)</td>\n",
670
+ " <td>Kisii</td>\n",
671
+ " <td>Kitutu Chache South</td>\n",
672
+ " <td>LVCT Vukisha 95</td>\n",
673
+ " <td>CDC</td>\n",
674
+ " <td>Female</td>\n",
675
+ " <td>Single</td>\n",
676
+ " <td>NULL</td>\n",
677
+ " <td>NULL</td>\n",
678
+ " <td>NULL</td>\n",
679
+ " <td>NULL</td>\n",
680
+ " <td>LOST IN HMIS</td>\n",
681
+ " <td>20088</td>\n",
682
+ " <td>20161</td>\n",
683
+ " <td>2012-04-12 00:00:00.000</td>\n",
684
+ " <td>2010-05-10 00:00:00.000</td>\n",
685
+ " </tr>\n",
686
+ " <tr>\n",
687
+ " <th>1</th>\n",
688
+ " <td>290D316E1B41A21F58E780026971F4D86DBB3BF043A77B...</td>\n",
689
+ " <td>4</td>\n",
690
+ " <td>13028</td>\n",
691
+ " <td>Kibera Community Health Centre - Amref</td>\n",
692
+ " <td>Nairobi</td>\n",
693
+ " <td>Kibra</td>\n",
694
+ " <td>CIHEB CONNECT</td>\n",
695
+ " <td>CDC</td>\n",
696
+ " <td>Female</td>\n",
697
+ " <td>MARRIED MONOGAMOUS</td>\n",
698
+ " <td>SECONDARY</td>\n",
699
+ " <td>Trader</td>\n",
700
+ " <td>NULL</td>\n",
701
+ " <td>NULL</td>\n",
702
+ " <td>ACTIVE</td>\n",
703
+ " <td>20088</td>\n",
704
+ " <td>20161</td>\n",
705
+ " <td>2009-05-12 00:00:00.000</td>\n",
706
+ " <td>1970-08-25 00:00:00.000</td>\n",
707
+ " </tr>\n",
708
+ " <tr>\n",
709
+ " <th>2</th>\n",
710
+ " <td>45889B18F2C615A78371E1DAFC2680C0A36284C6195885...</td>\n",
711
+ " <td>9</td>\n",
712
+ " <td>15834</td>\n",
713
+ " <td>Busia County Referral Hospital</td>\n",
714
+ " <td>Busia</td>\n",
715
+ " <td>Matayos</td>\n",
716
+ " <td>USAID Dumisha Afya</td>\n",
717
+ " <td>USAID</td>\n",
718
+ " <td>Female</td>\n",
719
+ " <td>NULL</td>\n",
720
+ " <td>NULL</td>\n",
721
+ " <td>NULL</td>\n",
722
+ " <td>NULL</td>\n",
723
+ " <td>NULL</td>\n",
724
+ " <td>ACTIVE</td>\n",
725
+ " <td>20088</td>\n",
726
+ " <td>20161</td>\n",
727
+ " <td>2014-08-12 00:00:00.000</td>\n",
728
+ " <td>1972-04-13 00:00:00.000</td>\n",
729
+ " </tr>\n",
730
+ " <tr>\n",
731
+ " <th>3</th>\n",
732
+ " <td>9C9BFF8365B05D99D4F6A62716DD1353875B8A9280A772...</td>\n",
733
+ " <td>7</td>\n",
734
+ " <td>14831</td>\n",
735
+ " <td>Kericho District Hospital</td>\n",
736
+ " <td>Kericho</td>\n",
737
+ " <td>Ainamoi</td>\n",
738
+ " <td>HJF-South Rift Valley</td>\n",
739
+ " <td>DOD</td>\n",
740
+ " <td>Female</td>\n",
741
+ " <td>MARRIED MONOGAMOUS</td>\n",
742
+ " <td>NULL</td>\n",
743
+ " <td>Trader</td>\n",
744
+ " <td>NULL</td>\n",
745
+ " <td>NULL</td>\n",
746
+ " <td>ACTIVE</td>\n",
747
+ " <td>20088</td>\n",
748
+ " <td>20161</td>\n",
749
+ " <td>2023-05-10 00:00:00.000</td>\n",
750
+ " <td>1989-06-15 00:00:00.000</td>\n",
751
+ " </tr>\n",
752
+ " <tr>\n",
753
+ " <th>4</th>\n",
754
+ " <td>A51AEA4EC14F999A52AF53B4B531F760992ADA406B620C...</td>\n",
755
+ " <td>1</td>\n",
756
+ " <td>11259</td>\n",
757
+ " <td>Bomu Medical Centre (Likoni)</td>\n",
758
+ " <td>Mombasa</td>\n",
759
+ " <td>Likoni</td>\n",
760
+ " <td>Mkomani Clinic society</td>\n",
761
+ " <td>CDC</td>\n",
762
+ " <td>Female</td>\n",
763
+ " <td>MARRIED MONOGAMOUS</td>\n",
764
+ " <td>PRIMARY</td>\n",
765
+ " <td>Trader</td>\n",
766
+ " <td>NULL</td>\n",
767
+ " <td>NULL</td>\n",
768
+ " <td>UNDOCUMENTED LOSS</td>\n",
769
+ " <td>20088</td>\n",
770
+ " <td>20161</td>\n",
771
+ " <td>2018-05-22 00:00:00.000</td>\n",
772
+ " <td>1995-05-21 00:00:00.000</td>\n",
773
+ " </tr>\n",
774
+ " </tbody>\n",
775
+ "</table>\n",
776
+ "</div>"
777
+ ],
778
+ "text/plain": [
779
+ " key PatientPKHash MFLCode \\\n",
780
+ "0 07149C6735AA9A2B3EFB198A5DB19825E3DA3DBCDE8CB8... 3 13703 \n",
781
+ "1 290D316E1B41A21F58E780026971F4D86DBB3BF043A77B... 4 13028 \n",
782
+ "2 45889B18F2C615A78371E1DAFC2680C0A36284C6195885... 9 15834 \n",
783
+ "3 9C9BFF8365B05D99D4F6A62716DD1353875B8A9280A772... 7 14831 \n",
784
+ "4 A51AEA4EC14F999A52AF53B4B531F760992ADA406B620C... 1 11259 \n",
785
+ "\n",
786
+ " FacilityName County \\\n",
787
+ "0 Kisii Teaching and Referral Hospital (Level 6) Kisii \n",
788
+ "1 Kibera Community Health Centre - Amref Nairobi \n",
789
+ "2 Busia County Referral Hospital Busia \n",
790
+ "3 Kericho District Hospital Kericho \n",
791
+ "4 Bomu Medical Centre (Likoni) Mombasa \n",
792
+ "\n",
793
+ " SubCounty PartnerName AgencyName Sex \\\n",
794
+ "0 Kitutu Chache South LVCT Vukisha 95 CDC Female \n",
795
+ "1 Kibra CIHEB CONNECT CDC Female \n",
796
+ "2 Matayos USAID Dumisha Afya USAID Female \n",
797
+ "3 Ainamoi HJF-South Rift Valley DOD Female \n",
798
+ "4 Likoni Mkomani Clinic society CDC Female \n",
799
+ "\n",
800
+ " MaritalStatus EducationLevel Occupation OnIPT AgeGroup \\\n",
801
+ "0 Single NULL NULL NULL NULL \n",
802
+ "1 MARRIED MONOGAMOUS SECONDARY Trader NULL NULL \n",
803
+ "2 NULL NULL NULL NULL NULL \n",
804
+ "3 MARRIED MONOGAMOUS NULL Trader NULL NULL \n",
805
+ "4 MARRIED MONOGAMOUS PRIMARY Trader NULL NULL \n",
806
+ "\n",
807
+ " ARTOutcomeDescription AsOfDate LoadDate StartARTDate \\\n",
808
+ "0 LOST IN HMIS 20088 20161 2012-04-12 00:00:00.000 \n",
809
+ "1 ACTIVE 20088 20161 2009-05-12 00:00:00.000 \n",
810
+ "2 ACTIVE 20088 20161 2014-08-12 00:00:00.000 \n",
811
+ "3 ACTIVE 20088 20161 2023-05-10 00:00:00.000 \n",
812
+ "4 UNDOCUMENTED LOSS 20088 20161 2018-05-22 00:00:00.000 \n",
813
+ "\n",
814
+ " DOB \n",
815
+ "0 2010-05-10 00:00:00.000 \n",
816
+ "1 1970-08-25 00:00:00.000 \n",
817
+ "2 1972-04-13 00:00:00.000 \n",
818
+ "3 1989-06-15 00:00:00.000 \n",
819
+ "4 1995-05-21 00:00:00.000 "
820
+ ]
821
+ },
822
+ "execution_count": 32,
823
+ "metadata": {},
824
+ "output_type": "execute_result"
825
+ }
826
+ ],
827
+ "source": [
828
+ "cursor.execute(\"select * from demographics;\")\n",
829
+ "rows = cursor.fetchall()\n",
830
+ "df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])\n",
831
+ "df.head()"
832
+ ]
833
+ },
834
+ {
835
+ "cell_type": "code",
836
+ "execution_count": 31,
837
  "id": "947c63d5",
838
  "metadata": {},
839
  "outputs": [],
 
843
  "cursor.execute('DROP TABLE IF EXISTS demographics;')\n",
844
  "cursor.execute('''\n",
845
  "CREATE TABLE demographics (\n",
846
+ " key TEXT,\n",
847
  " PatientPKHash TEXT,\n",
848
  " MFLCode TEXT,\n",
849
  " FacilityName TEXT,\n",
 
861
  " AsOfDate TEXT,\n",
862
  " LoadDate TEXT,\n",
863
  " StartARTDate TEXT,\n",
864
+ " DOB TEXT\n",
 
865
  ");\n",
866
  "''')\n",
867
  "\n",
868
  "# let's now populate the table with the rows variable that contains all the data from the visits table\n",
869
  "cursor.executemany('''\n",
870
+ "INSERT INTO demographics (key, PatientPKHash, MFLCode, FacilityName, County, SubCounty, PartnerName, AgencyName, Sex, MaritalStatus, EducationLevel, Occupation, OnIPT, AgeGroup, ARTOutcomeDescription, AsOfDate, LoadDate, StartARTDate, DOB)\n",
871
  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);\n",
872
  "''', rows)\n",
873
  "conn.commit()"
 
875
  },
876
  {
877
  "cell_type": "code",
878
+ "execution_count": 18,
879
  "id": "9cff0d90",
880
  "metadata": {},
881
  "outputs": [],
 
915
  ],
916
  "metadata": {
917
  "kernelspec": {
918
+ "display_name": "clinician-assistant-lg",
919
  "language": "python",
920
  "name": "python3"
921
  },
notebooks/create_slim_patient_db.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 20,
6
  "id": "c867740b",
7
  "metadata": {},
8
  "outputs": [],
@@ -10,7 +10,7 @@
10
  "import sqlite3\n",
11
  "import pandas as pd\n",
12
  "# inspect current database schema\n",
13
- "conn = sqlite3.connect('iit_test.sqlite')\n",
14
  "cursor = conn.cursor()\n",
15
  "# list tables\n",
16
  "# pull all data from the visits table \n",
@@ -22,7 +22,7 @@
22
  },
23
  {
24
  "cell_type": "code",
25
- "execution_count": 21,
26
  "id": "f424fcf6",
27
  "metadata": {},
28
  "outputs": [
@@ -30,7 +30,7 @@
30
  "name": "stderr",
31
  "output_type": "stream",
32
  "text": [
33
- "/tmp/ipykernel_2997/3546205200.py:11: SettingWithCopyWarning: \n",
34
  "A value is trying to be set on a copy of a slice from a DataFrame.\n",
35
  "Try using .loc[row_indexer,col_indexer] = value instead\n",
36
  "\n",
@@ -53,35 +53,14 @@
53
  "sampled_df['PatientPKHash'] = sampled_df['PatientPKHash'].map(key_to_number)\n",
54
  "\n",
55
  "# save sampled_df back to iit_test.sqlite as a new table called sampled_visits\n",
56
- "sampled_conn = sqlite3.connect('patient_slim.sqlite')\n",
57
  "sampled_df.to_sql('visits', sampled_conn, if_exists='replace', index=False)\n",
58
  "sampled_conn.close()"
59
  ]
60
  },
61
  {
62
  "cell_type": "code",
63
- "execution_count": 23,
64
- "id": "8615f9fa",
65
- "metadata": {},
66
- "outputs": [
67
- {
68
- "data": {
69
- "text/plain": [
70
- "(271, 25)"
71
- ]
72
- },
73
- "execution_count": 23,
74
- "metadata": {},
75
- "output_type": "execute_result"
76
- }
77
- ],
78
- "source": [
79
- "sampled_df.shape"
80
- ]
81
- },
82
- {
83
- "cell_type": "code",
84
- "execution_count": 24,
85
  "id": "1bad1098",
86
  "metadata": {},
87
  "outputs": [
@@ -89,7 +68,7 @@
89
  "name": "stderr",
90
  "output_type": "stream",
91
  "text": [
92
- "/tmp/ipykernel_2997/4153193150.py:11: SettingWithCopyWarning: \n",
93
  "A value is trying to be set on a copy of a slice from a DataFrame.\n",
94
  "Try using .loc[row_indexer,col_indexer] = value instead\n",
95
  "\n",
@@ -100,7 +79,7 @@
100
  ],
101
  "source": [
102
  "# now, read in pharmacy table from iit_test.sqlite\n",
103
- "conn = sqlite3.connect('iit_test.sqlite')\n",
104
  "cursor = conn.cursor()\n",
105
  "cursor.execute(\"SELECT * FROM pharmacy;\")\n",
106
  "rows = cursor.fetchall()\n",
@@ -110,46 +89,14 @@
110
  "# filter these to the same 10 keys, replace the keys with numbers 1-10, and save to patient_slim.sqlite as a new table called sampled_pharmacy\n",
111
  "sampled_pharmacy_df = pharmacy_df[pharmacy_df['PatientPKHash'].isin(sampled_keys)]\n",
112
  "sampled_pharmacy_df['PatientPKHash'] = sampled_pharmacy_df['PatientPKHash'].map(key_to_number)\n",
113
- "sampled_conn = sqlite3.connect('patient_slim.sqlite')\n",
114
  "sampled_pharmacy_df.to_sql('pharmacy', sampled_conn, if_exists='replace', index=False)\n",
115
  "sampled_conn.close()\n"
116
  ]
117
  },
118
  {
119
  "cell_type": "code",
120
- "execution_count": 25,
121
- "id": "bc8fac93",
122
- "metadata": {},
123
- "outputs": [
124
- {
125
- "data": {
126
- "text/plain": [
127
- "PatientPKHash\n",
128
- "1 14\n",
129
- "2 24\n",
130
- "3 24\n",
131
- "4 9\n",
132
- "5 40\n",
133
- "6 1\n",
134
- "7 15\n",
135
- "8 1\n",
136
- "9 64\n",
137
- "10 14\n",
138
- "dtype: int64"
139
- ]
140
- },
141
- "execution_count": 25,
142
- "metadata": {},
143
- "output_type": "execute_result"
144
- }
145
- ],
146
- "source": [
147
- "sampled_pharmacy_df.groupby('PatientPKHash').size()"
148
- ]
149
- },
150
- {
151
- "cell_type": "code",
152
- "execution_count": 26,
153
  "id": "df01b886",
154
  "metadata": {},
155
  "outputs": [
@@ -157,7 +104,7 @@
157
  "name": "stderr",
158
  "output_type": "stream",
159
  "text": [
160
- "/tmp/ipykernel_2997/3478231606.py:11: SettingWithCopyWarning: \n",
161
  "A value is trying to be set on a copy of a slice from a DataFrame.\n",
162
  "Try using .loc[row_indexer,col_indexer] = value instead\n",
163
  "\n",
@@ -168,7 +115,7 @@
168
  ],
169
  "source": [
170
  "# repeat the process above for lab table\n",
171
- "conn = sqlite3.connect('iit_test.sqlite')\n",
172
  "cursor = conn.cursor()\n",
173
  "cursor.execute(\"SELECT * FROM lab;\")\n",
174
  "rows = cursor.fetchall()\n",
@@ -178,46 +125,14 @@
178
  "# filter these to the same 10 keys, replace the keys with numbers 1-10, and save to patient_slim.sqlite as a new table called sampled_lab\n",
179
  "sampled_lab_df = lab_df[lab_df['PatientPKHash'].isin(sampled_keys)]\n",
180
  "sampled_lab_df['PatientPKHash'] = sampled_lab_df['PatientPKHash'].map(key_to_number)\n",
181
- "sampled_conn = sqlite3.connect('patient_slim.sqlite')\n",
182
  "sampled_lab_df.to_sql('lab', sampled_conn, if_exists='replace', index=False)\n",
183
  "sampled_conn.close()\n"
184
  ]
185
  },
186
  {
187
  "cell_type": "code",
188
- "execution_count": 27,
189
- "id": "2578bf85",
190
- "metadata": {},
191
- "outputs": [
192
- {
193
- "data": {
194
- "text/plain": [
195
- "PatientPKHash\n",
196
- "1 6\n",
197
- "2 2\n",
198
- "3 17\n",
199
- "4 22\n",
200
- "5 23\n",
201
- "6 1\n",
202
- "7 2\n",
203
- "8 10\n",
204
- "9 13\n",
205
- "10 12\n",
206
- "dtype: int64"
207
- ]
208
- },
209
- "execution_count": 27,
210
- "metadata": {},
211
- "output_type": "execute_result"
212
- }
213
- ],
214
- "source": [
215
- "sampled_lab_df.groupby('PatientPKHash').size()"
216
- ]
217
- },
218
- {
219
- "cell_type": "code",
220
- "execution_count": 28,
221
  "id": "ebf358c5",
222
  "metadata": {},
223
  "outputs": [
@@ -225,7 +140,7 @@
225
  "name": "stderr",
226
  "output_type": "stream",
227
  "text": [
228
- "/tmp/ipykernel_2997/3867144072.py:11: SettingWithCopyWarning: \n",
229
  "A value is trying to be set on a copy of a slice from a DataFrame.\n",
230
  "Try using .loc[row_indexer,col_indexer] = value instead\n",
231
  "\n",
@@ -236,7 +151,7 @@
236
  ],
237
  "source": [
238
  "# now, from dem table\n",
239
- "conn = sqlite3.connect('iit_test.sqlite')\n",
240
  "cursor = conn.cursor()\n",
241
  "cursor.execute(\"SELECT * FROM dem;\")\n",
242
  "rows = cursor.fetchall()\n",
@@ -246,47 +161,15 @@
246
  "# filter these to the same 10 keys, replace the keys with numbers 1-10, and save to patient_slim.sqlite as a new table called sampled_dem\n",
247
  "sampled_dem_df = dem_df[dem_df['PatientPKHash'].isin(sampled_keys)]\n",
248
  "sampled_dem_df['PatientPKHash'] = sampled_dem_df['PatientPKHash'].map(key_to_number)\n",
249
- "sampled_conn = sqlite3.connect('patient_slim.sqlite')\n",
250
  "sampled_dem_df.to_sql('demographics', sampled_conn, if_exists='replace', index=False)\n",
251
  "sampled_conn.close()"
252
  ]
253
- },
254
- {
255
- "cell_type": "code",
256
- "execution_count": 29,
257
- "id": "527420fa",
258
- "metadata": {},
259
- "outputs": [
260
- {
261
- "data": {
262
- "text/plain": [
263
- "PatientPKHash\n",
264
- "1 1\n",
265
- "2 1\n",
266
- "3 1\n",
267
- "4 1\n",
268
- "5 1\n",
269
- "6 1\n",
270
- "7 1\n",
271
- "8 1\n",
272
- "9 1\n",
273
- "10 1\n",
274
- "dtype: int64"
275
- ]
276
- },
277
- "execution_count": 29,
278
- "metadata": {},
279
- "output_type": "execute_result"
280
- }
281
- ],
282
- "source": [
283
- "sampled_dem_df.groupby('PatientPKHash').size()"
284
- ]
285
  }
286
  ],
287
  "metadata": {
288
  "kernelspec": {
289
- "display_name": ".venv",
290
  "language": "python",
291
  "name": "python3"
292
  },
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 4,
6
  "id": "c867740b",
7
  "metadata": {},
8
  "outputs": [],
 
10
  "import sqlite3\n",
11
  "import pandas as pd\n",
12
  "# inspect current database schema\n",
13
+ "conn = sqlite3.connect('../data/raw/iit_test.sqlite')\n",
14
  "cursor = conn.cursor()\n",
15
  "# list tables\n",
16
  "# pull all data from the visits table \n",
 
22
  },
23
  {
24
  "cell_type": "code",
25
+ "execution_count": 5,
26
  "id": "f424fcf6",
27
  "metadata": {},
28
  "outputs": [
 
30
  "name": "stderr",
31
  "output_type": "stream",
32
  "text": [
33
+ "/tmp/ipykernel_12725/435846127.py:11: SettingWithCopyWarning: \n",
34
  "A value is trying to be set on a copy of a slice from a DataFrame.\n",
35
  "Try using .loc[row_indexer,col_indexer] = value instead\n",
36
  "\n",
 
53
  "sampled_df['PatientPKHash'] = sampled_df['PatientPKHash'].map(key_to_number)\n",
54
  "\n",
55
  "# save sampled_df back to iit_test.sqlite as a new table called sampled_visits\n",
56
+ "sampled_conn = sqlite3.connect('../data/raw/patient_slim.sqlite')\n",
57
  "sampled_df.to_sql('visits', sampled_conn, if_exists='replace', index=False)\n",
58
  "sampled_conn.close()"
59
  ]
60
  },
61
  {
62
  "cell_type": "code",
63
+ "execution_count": 6,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  "id": "1bad1098",
65
  "metadata": {},
66
  "outputs": [
 
68
  "name": "stderr",
69
  "output_type": "stream",
70
  "text": [
71
+ "/tmp/ipykernel_12725/2381592446.py:11: SettingWithCopyWarning: \n",
72
  "A value is trying to be set on a copy of a slice from a DataFrame.\n",
73
  "Try using .loc[row_indexer,col_indexer] = value instead\n",
74
  "\n",
 
79
  ],
80
  "source": [
81
  "# now, read in pharmacy table from iit_test.sqlite\n",
82
+ "conn = sqlite3.connect('../data/raw/iit_test.sqlite')\n",
83
  "cursor = conn.cursor()\n",
84
  "cursor.execute(\"SELECT * FROM pharmacy;\")\n",
85
  "rows = cursor.fetchall()\n",
 
89
  "# filter these to the same 10 keys, replace the keys with numbers 1-10, and save to patient_slim.sqlite as a new table called sampled_pharmacy\n",
90
  "sampled_pharmacy_df = pharmacy_df[pharmacy_df['PatientPKHash'].isin(sampled_keys)]\n",
91
  "sampled_pharmacy_df['PatientPKHash'] = sampled_pharmacy_df['PatientPKHash'].map(key_to_number)\n",
92
+ "sampled_conn = sqlite3.connect('../data/raw/patient_slim.sqlite')\n",
93
  "sampled_pharmacy_df.to_sql('pharmacy', sampled_conn, if_exists='replace', index=False)\n",
94
  "sampled_conn.close()\n"
95
  ]
96
  },
97
  {
98
  "cell_type": "code",
99
+ "execution_count": 7,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  "id": "df01b886",
101
  "metadata": {},
102
  "outputs": [
 
104
  "name": "stderr",
105
  "output_type": "stream",
106
  "text": [
107
+ "/tmp/ipykernel_12725/4028870248.py:11: SettingWithCopyWarning: \n",
108
  "A value is trying to be set on a copy of a slice from a DataFrame.\n",
109
  "Try using .loc[row_indexer,col_indexer] = value instead\n",
110
  "\n",
 
115
  ],
116
  "source": [
117
  "# repeat the process above for lab table\n",
118
+ "conn = sqlite3.connect('../data/raw/iit_test.sqlite')\n",
119
  "cursor = conn.cursor()\n",
120
  "cursor.execute(\"SELECT * FROM lab;\")\n",
121
  "rows = cursor.fetchall()\n",
 
125
  "# filter these to the same 10 keys, replace the keys with numbers 1-10, and save to patient_slim.sqlite as a new table called sampled_lab\n",
126
  "sampled_lab_df = lab_df[lab_df['PatientPKHash'].isin(sampled_keys)]\n",
127
  "sampled_lab_df['PatientPKHash'] = sampled_lab_df['PatientPKHash'].map(key_to_number)\n",
128
+ "sampled_conn = sqlite3.connect('../data/raw/patient_slim.sqlite')\n",
129
  "sampled_lab_df.to_sql('lab', sampled_conn, if_exists='replace', index=False)\n",
130
  "sampled_conn.close()\n"
131
  ]
132
  },
133
  {
134
  "cell_type": "code",
135
+ "execution_count": 8,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  "id": "ebf358c5",
137
  "metadata": {},
138
  "outputs": [
 
140
  "name": "stderr",
141
  "output_type": "stream",
142
  "text": [
143
+ "/tmp/ipykernel_12725/696424165.py:11: SettingWithCopyWarning: \n",
144
  "A value is trying to be set on a copy of a slice from a DataFrame.\n",
145
  "Try using .loc[row_indexer,col_indexer] = value instead\n",
146
  "\n",
 
151
  ],
152
  "source": [
153
  "# now, from dem table\n",
154
+ "conn = sqlite3.connect('../data/raw/iit_test.sqlite')\n",
155
  "cursor = conn.cursor()\n",
156
  "cursor.execute(\"SELECT * FROM dem;\")\n",
157
  "rows = cursor.fetchall()\n",
 
161
  "# filter these to the same 10 keys, replace the keys with numbers 1-10, and save to patient_slim.sqlite as a new table called sampled_dem\n",
162
  "sampled_dem_df = dem_df[dem_df['PatientPKHash'].isin(sampled_keys)]\n",
163
  "sampled_dem_df['PatientPKHash'] = sampled_dem_df['PatientPKHash'].map(key_to_number)\n",
164
+ "sampled_conn = sqlite3.connect('../data/raw/patient_slim.sqlite')\n",
165
  "sampled_dem_df.to_sql('demographics', sampled_conn, if_exists='replace', index=False)\n",
166
  "sampled_conn.close()"
167
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  }
169
  ],
170
  "metadata": {
171
  "kernelspec": {
172
+ "display_name": "clinician-assistant-lg",
173
  "language": "python",
174
  "name": "python3"
175
  },
chat.py → scripts/chat.py RENAMED
File without changes
scripts/evaluate_trulens.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from dotenv import load_dotenv
4
+ import os
5
+
6
+ from llama_index.core import StorageContext, load_index_from_storage, QueryBundle
7
+ from llama_index.core.retrievers import VectorIndexRetriever
8
+ from llama_index.core.postprocessor import LLMRerank
9
+ from llama_index.embeddings.openai import OpenAIEmbedding
10
+ from llama_index.llms.openai import OpenAI
11
+
12
+ from langchain_openai import ChatOpenAI
13
+ from langchain_core.prompts import ChatPromptTemplate
14
+
15
+ from trulens_eval import Tru
16
+ from trulens.core import Feedback
17
+ from trulens.providers.openai import OpenAI as OpenAIFeedbackProvider
18
+ from trulens_eval.tru_app import TruLlama
19
+
20
+ # Load environment
21
+ if os.path.exists("config.env"):
22
+ load_dotenv("config.env")
23
+
24
+ # Load vectorstore metadata
25
+ embeddings = np.load("data/processed/lp/summary_embeddings/embeddings.npy")
26
+ df = pd.read_csv("data/processed/lp/summary_embeddings/index.tsv", sep="\t")
27
+
28
+ # LLMs and components
29
+ embedding_model = OpenAIEmbedding()
30
+ llm_llama = OpenAI(model="gpt-4o", temperature=0.0)
31
+ reranker = LLMRerank(llm=llm_llama, top_n=3)
32
+
33
+ # langchain summarize LLM
34
+ llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
35
+
36
+ grounded = Feedback(Groundedness()).on_input().on_context().with_name("faithfulness")
37
+ context_rel = Feedback(Relevance()).on_input().on_context().with_name("context_relevance")
38
+ answer_rel = Feedback(AnswerRelevance()).on_input().on_output().with_name("answer_relevance")
39
+
40
+
41
+ # Prompt for query expansion
42
+ query_expansion_prompt = ChatPromptTemplate.from_messages([
43
+ ("system", "You are an expert in HIV medicine."),
44
+ ("user", (
45
+ "Given the query below, provide a concise, comma-separated list of related terms and synonyms "
46
+ "useful for document retrieval. Return only the list, no explanations.\n\n"
47
+ "Query: {query}"
48
+ ))
49
+ ])
50
+
51
+ # ---------- Functions ----------
52
+
53
+ def cosine_similarity_numpy(query_vec: np.ndarray, matrix: np.ndarray) -> np.ndarray:
54
+ query_norm = query_vec / np.linalg.norm(query_vec)
55
+ matrix_norm = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
56
+ return matrix_norm @ query_norm
57
+
58
+ def expand_query(query, llm, prompt_template):
59
+ messages = prompt_template.format_messages(query=query)
60
+ return llm.invoke(messages).content.strip()
61
+
62
+ def retrieve_contexts(expanded_query, embeddings, df, embedding_model):
63
+ query_vec = embedding_model.get_text_embedding(expanded_query)
64
+ sims = cosine_similarity_numpy(query_vec, embeddings)
65
+ top_indices = sims.argsort()[-3:][::-1]
66
+ paths = df.loc[top_indices, "vectorestore_path"].tolist()
67
+
68
+ all_nodes = []
69
+ for path in paths:
70
+ ctx = StorageContext.from_defaults(persist_dir=path)
71
+ index = load_index_from_storage(ctx)
72
+ retriever = VectorIndexRetriever(index=index, similarity_top_k=3)
73
+ all_nodes.extend(retriever.retrieve(expanded_query))
74
+
75
+ reranked = reranker.postprocess_nodes(all_nodes, QueryBundle(expanded_query))
76
+ return [n.text for n in reranked]
77
+
78
+ def summarize(query, contexts, llm):
79
+ prompt = (
80
+ "You're a clinical assistant helping a provider answer a question using HIV/AIDS guidelines.\n\n"
81
+ f"Question: {query}\n\n"
82
+ "Provide a detailed summary of the most relevant points to the user question from the following source texts. Use bullet points.\n\n"
83
+ + "\n\n".join([f"Source {i+1}: {t}" for i, t in enumerate(contexts)])
84
+ )
85
+ return llm.invoke(prompt).content.strip()
86
+
87
+ # ---------- RAG Pipeline ----------
88
+
89
+ def custom_rag_app(query):
90
+ expanded = expand_query(query, llm, query_expansion_prompt)
91
+ contexts = retrieve_contexts(expanded, embeddings, df, embedding_model)
92
+ answer = summarize(query, contexts, llm)
93
+ return {
94
+ "question": query,
95
+ "expanded_query": expanded,
96
+ "contexts": contexts,
97
+ "answer": answer
98
+ }
99
+
100
+
101
+ # ---------- Feedbacks ----------
102
+
103
+ provider = OpenAIFeedbackProvider()
104
+
105
+ f_grounded = Feedback(provider.groundedness).on_input().on_context().with_name("faithfulness")
106
+ f_context_rel = Feedback(provider.context_relevance).on_input().on_context().with_name("context_relevance")
107
+ f_answer_rel = Feedback(provider.relevance).on_input().on_output().with_name("answer_relevance")
108
+
109
+ # ---------- TruLens App ----------
110
+
111
+ tru_llama = TruLlama(
112
+ app=custom_rag_app,
113
+ feedbacks=[f_grounded, f_context_rel, f_answer_rel],
114
+ app_id="evaluate-trulens-llama-v2"
115
+ )
116
+
117
+ tru = Tru()
118
+
119
+ # ---------- Run Evaluation ----------
120
+
121
+ test_queries = [
122
+ "What are important drug interactions with dolutegravir?",
123
+ "How should PrEP be provided to adolescent girls?",
124
+ "When is cotrimoxazole prophylaxis indicated?",
125
+ "What are the guidelines for ART failure?",
126
+ "How do you manage HIV in pregnancy?"
127
+ ]
128
+
129
+ records = []
130
+
131
+ for q in test_queries:
132
+ record = tru_llama.run_with_record(question=q)
133
+ fb = record["feedback"]
134
+ records.append({
135
+ "question": q,
136
+ "answer": record["output"],
137
+ "contexts": record["context"],
138
+ "faithfulness_score": fb["faithfulness"].get("score"),
139
+ "context_relevance_score": fb["context_relevance"].get("score"),
140
+ "answer_relevance_score": fb["answer_relevance"].get("score"),
141
+ "faithfulness_justification": fb["faithfulness"].get("justification", "")
142
+ })
143
+
144
+ df = pd.DataFrame(records)
145
+ df.to_csv("trulens_llama_eval_results.csv", index=False)
146
+ print("✅ Evaluation complete. Saved to trulens_llama_eval_results.csv")
147
+ print(df)
{chatlib → scripts}/patient_sql_agent.py RENAMED
File without changes
scripts/ragas_eval.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # custom_rag_with_ragas.py
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from datasets import Dataset
6
+ from ragas.evaluation import evaluate
7
+ from ragas.metrics import (
8
+ faithfulness,
9
+ answer_relevancy,
10
+ context_precision,
11
+ context_recall
12
+ )
13
+ from llama_index.core import StorageContext, load_index_from_storage, QueryBundle
14
+ from llama_index.core.retrievers import VectorIndexRetriever
15
+ from llama_index.core.postprocessor import LLMRerank
16
+ from llama_index.embeddings.openai import OpenAIEmbedding
17
+ from langchain_core.prompts import ChatPromptTemplate
18
+ from langchain.chat_models import ChatOpenAI
19
+ from llama_index.llms.openai import OpenAI
20
+ import os
21
+ from dotenv import load_dotenv
22
+ if os.path.exists("config.env"):
23
+ load_dotenv("config.env")
24
+
25
+ embeddings = np.load("data/processed/lp/summary_embeddings/embeddings.npy")
26
+ df = pd.read_csv("data/processed/lp/summary_embeddings/index.tsv", sep="\t")
27
+
28
+ embedding_model = OpenAIEmbedding()
29
+
30
+ # Define your reranker-compatible LLM
31
+ llm_llama = OpenAI(model="gpt-4o", temperature=0.0)
32
+
33
+ # Create LLM reranker
34
+ reranker = LLMRerank(llm=llm_llama, top_n=3)
35
+
36
+ # summarizer LLM
37
+ llm = ChatOpenAI(temperature=0.0, model="gpt-4o")
38
+
39
+ # Define a prompt template for query expansion
40
+ query_expansion_prompt = ChatPromptTemplate.from_messages([
41
+ ("system", "You are an expert in HIV medicine."),
42
+ ("user", (
43
+ "Given the query below, provide a concise, comma-separated list of related terms and synonyms "
44
+ "useful for document retrieval. Return only the list, no explanations.\n\n"
45
+ "Query: {query}"
46
+ ))
47
+ ])
48
+
49
+ def cosine_similarity_numpy(query_vec: np.ndarray, matrix: np.ndarray) -> np.ndarray:
50
+ query_norm = query_vec / np.linalg.norm(query_vec)
51
+ matrix_norm = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
52
+ return matrix_norm @ query_norm
53
+
54
+
55
+ def expand_query(query, llm, prompt_template):
56
+ messages = prompt_template.format_messages(query=query)
57
+ return llm.invoke(messages).content.strip()
58
+
59
+ def retrieve_contexts(expanded_query, embeddings, df, embedding_model):
60
+ query_vec = embedding_model.get_text_embedding(expanded_query)
61
+ similarities = cosine_similarity_numpy(query_vec, embeddings)
62
+ top_indices = similarities.argsort()[-3:][::-1]
63
+ paths = df.loc[top_indices, "vectorestore_path"].tolist()
64
+ print(paths)
65
+ all_nodes = []
66
+ for path in paths:
67
+ ctx = StorageContext.from_defaults(persist_dir=path)
68
+ index = load_index_from_storage(ctx)
69
+ retriever = VectorIndexRetriever(index=index, similarity_top_k=3)
70
+ all_nodes.extend(retriever.retrieve(expanded_query))
71
+
72
+ return [n.text for n in LLMRerank(llm=llm_llama, top_n=3).postprocess_nodes(all_nodes, QueryBundle(expanded_query))]
73
+
74
+ def summarize(query, contexts, llm):
75
+ prompt = (
76
+ "You're a clinical assistant helping a provider answer a question using HIV/AIDS guidelines.\n\n"
77
+ f"Question: {query}\n\n"
78
+ "Provide a detailed summary of the most relevant points from the following source texts using bullet points.\n\n"
79
+ + "\n\n".join([f"Source {i+1}: {text}" for i, text in enumerate(contexts)])
80
+ )
81
+ return llm.invoke(prompt).content.strip()
82
+
83
+ # Run on test queries
84
+ test_queries = [
85
+ "What are important drug interactions with dolutegravir?",
86
+ "How should PrEP be provided to adolescent girls?",
87
+ "When is cotrimoxazole prophylaxis indicated?",
88
+ "What are the guidelines for ART failure?",
89
+ "How do you manage HIV in pregnancy?"
90
+ ]
91
+ results = []
92
+
93
+ for q in test_queries:
94
+ print(f"⏳ Processing: {q}")
95
+ expanded = expand_query(q, llm, query_expansion_prompt)
96
+ contexts = retrieve_contexts(expanded, embeddings, df, embedding_model)
97
+ answer = summarize(q, contexts, llm)
98
+ results.append({
99
+ "question": q,
100
+ "contexts": contexts,
101
+ "answer": answer
102
+ })
103
+
104
+ # --- Ragas Evaluation ---
105
+ print("✅ Running Ragas evaluation...")
106
+
107
+ ragas_data = Dataset.from_list(results)
108
+
109
+ eval_results = evaluate(
110
+ ragas_data,
111
+ metrics=[faithfulness, answer_relevancy]
112
+ )
113
+
114
+ df_eval = eval_results.to_pandas()
115
+ df_eval.to_csv("ragas_eval_results.csv", index=False)
116
+
117
+ print("✅ Evaluation complete. Saved to ragas_eval_results.csv")
118
+ print(df_eval)