JDFPalladium committed
Commit 389c5f0 · 1 Parent(s): 7bc9486

cleaning up organization of scripts and data and updating filepaths in app to processed data

chatlib/guidlines_rag_agent_li.py CHANGED
@@ -9,8 +9,8 @@ import pandas as pd
 from llama_index.embeddings.openai import OpenAIEmbedding
 
 # load vectorstore summaries
-embeddings = np.load("guidance_docs/lp/summary_embeddings/embeddings.npy")
-df = pd.read_csv("guidance_docs/lp/summary_embeddings/index.tsv", sep="\t")
+embeddings = np.load("data/processed/lp/summary_embeddings/embeddings.npy")
+df = pd.read_csv("data/processed/lp/summary_embeddings/index.tsv", sep="\t")
 
 embedding_model = OpenAIEmbedding()
 
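For orientation, a minimal sketch of how the relocated summary embeddings and index might be queried; the cosine-similarity helper below is an illustration under assumed array shapes, not code from this commit:

import numpy as np
import pandas as pd
from llama_index.embeddings.openai import OpenAIEmbedding

# Assumes embeddings.npy holds one row per summary, aligned with index.tsv rows.
embeddings = np.load("data/processed/lp/summary_embeddings/embeddings.npy")
df = pd.read_csv("data/processed/lp/summary_embeddings/index.tsv", sep="\t")
embedding_model = OpenAIEmbedding()

def top_k_summaries(query: str, k: int = 3) -> pd.DataFrame:
    # Embed the query, then rank stored summary vectors by cosine similarity.
    q = np.asarray(embedding_model.get_query_embedding(query))
    sims = embeddings @ q / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(q))
    return df.iloc[np.argsort(sims)[::-1][:k]]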
chatlib/idsr_check.py CHANGED
@@ -14,17 +14,17 @@ import sqlite3
 # import os
 
 
-with open("./guidance_docs/idsr_keywords.txt", "r", encoding="utf-8") as f:
+with open("./data/processed/idsr_keywords.txt", "r", encoding="utf-8") as f:
     keywords = [line.strip() for line in f if line.strip()]
 
 vectorstore = FAISS.load_local(
-    "./guidance_docs/disease_vectorstore",
+    "./data/processed/disease_vectorstore",
     OpenAIEmbeddings(),
     allow_dangerous_deserialization=True,
 )
 
 
-with open("./guidance_docs/tagged_documents.json", "r", encoding="utf-8") as f:
+with open("./data/processed/tagged_documents.json", "r", encoding="utf-8") as f:
     doc_dicts = json.load(f)
 
 tagged_documents = [Document(**d) for d in doc_dicts]
@@ -138,7 +138,7 @@ def idsr_check(query: str, llm, sitecode) -> AppState:
     # first, get sitecode from environment variable
     # sitecode = os.environ.get("SITECODE")
     # next, connect to location database and get county where code = sitecode
-    conn = sqlite3.connect("data/location_data.sqlite")
+    conn = sqlite3.connect("data/processed/location_data.sqlite")
     county_cursor = conn.cursor()
     county_cursor.execute(
         "SELECT County FROM sitecode_county_xwalk WHERE Code = ?", (sitecode,)
chatlib/patient_all_data.py CHANGED
@@ -41,7 +41,7 @@ def sql_chain(query: str, llm, rag_result: str, pk_hash: str) -> dict:
     if not pk_hash:
         raise ValueError("pk_hash is required in state for SQL queries.")
 
-    conn = sqlite3.connect("data/patient_demonstration.sqlite")
+    conn = sqlite3.connect("data/processed/patient_demonstration.sqlite")
     cursor = conn.cursor()
 
     cursor.execute(
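A hedged sketch of the per-patient query pattern sql_chain builds on; the table and column names come from the notebooks below, while the fetch logic here is illustrative:

import sqlite3

def patient_visits(pk_hash: str, db_path="data/processed/patient_demonstration.sqlite"):
    if not pk_hash:
        raise ValueError("pk_hash is required in state for SQL queries.")
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Placeholder binding keeps the hashed identifier out of the SQL string.
        cursor.execute(
            "SELECT * FROM clinical_visits WHERE PatientPKHash = ?", (pk_hash,)
        )
        return cursor.fetchall()
    finally:
        conn.close()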
chatlib/phi_filter.py CHANGED
@@ -4,7 +4,7 @@ import re
 from .helpers import dateparser_detect, describe_relative_date
 
 
-def load_kenyan_names(filepath="data/kenyan_names.txt"):
+def load_kenyan_names(filepath="data/processed/kenyan_names.txt"):
     if not Path(filepath).exists():
         return set()
     with open(filepath, "r", encoding="utf-8") as f:
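The hunk ends before the function returns; a plausible completion, assuming one name per line and lower-case normalization (both assumptions, not confirmed by the diff):

from pathlib import Path

def load_kenyan_names(filepath="data/processed/kenyan_names.txt"):
    if not Path(filepath).exists():
        return set()  # degrade gracefully when the name list is absent
    with open(filepath, "r", encoding="utf-8") as f:
        # Assumed: one name per line, normalized for case-insensitive matching.
        return {line.strip().lower() for line in f if line.strip()}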
notebooks/create_location_db.ipynb ADDED
@@ -0,0 +1,89 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "1c8c38eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sqlite3\n",
+ "import pandas as pd\n",
+ "\n",
+ "# read in kenya_disease_county_matrix.csv and sitecode_county_xwalk.csv\n",
+ "disease_df = pd.read_csv('kenya_disease_county_matrix.csv')\n",
+ "xwalk_df = pd.read_csv('sitecode_county_xwalk.csv')\n",
+ "rainy_df = pd.read_csv('kenya_counties_rainy_seasons.csv')\n",
+ "who_df = pd.read_csv('who_bulletin.csv')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f0c63494",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create sqlite database\n",
+ "conn = sqlite3.connect('location_data.sqlite')\n",
+ "cursor = conn.cursor()\n",
+ "\n",
+ "# add each dataframe to a table in the database\n",
+ "disease_df.to_sql('county_disease_info', conn, if_exists='replace', index=False)\n",
+ "xwalk_df.to_sql('sitecode_county_xwalk', conn, if_exists='replace', index=False)\n",
+ "rainy_df.to_sql('county_rainy_seasons', conn, if_exists='replace', index=False)\n",
+ "who_df.to_sql('who_bulletin', conn, if_exists='replace', index=False)\n",
+ "\n",
+ "# commit changes and close connection\n",
+ "conn.commit()\n",
+ "conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "c12e58cf",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['County', 'Disease', 'Prevalence Level', 'Notes']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# get table in location_data.sqlite and show column names\n",
+ "import sqlite3\n",
+ "conn = sqlite3.connect('location_data.sqlite')\n",
+ "cursor = conn.cursor()\n",
+ "cursor.execute(\"SELECT * FROM county_disease_info;\")\n",
+ "tables = cursor.fetchall()\n",
+ "columns = [column[0] for column in cursor.description]\n",
+ "print(columns)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
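A quick way to verify the four tables landed in the database built by this notebook; a sketch using the data/processed path this commit points the app at (adjust if running from the notebook's own directory):

import sqlite3

conn = sqlite3.connect("data/processed/location_data.sqlite")
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print([row[0] for row in cursor.fetchall()])
# Expected: county_disease_info, sitecode_county_xwalk,
# county_rainy_seasons, who_bulletin
conn.close()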
notebooks/create_patient_db.ipynb ADDED
@@ -0,0 +1,480 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "ddb26634",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sqlite3\n",
+ "import pandas as pd\n",
+ "# inspect current database schema\n",
+ "conn = sqlite3.connect('patient_slim.sqlite')\n",
+ "cursor = conn.cursor()\n",
+ "# list tables\n",
+ "# pull all data from the visits table \n",
+ "cursor.execute(\"SELECT * FROM visits;\")\n",
+ "rows = cursor.fetchall()\n",
+ "df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])\n",
+ "conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "cd4faa4b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# let's create a new sqlite database called patient_demonstration.sqlite\n",
+ "conn = sqlite3.connect('patient_demonstration.sqlite')\n",
+ "cursor = conn.cursor() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "f8547b78",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a table called clinical_visits with the column names pulled above\n",
+ "# overwrite the table if it already exists\n",
+ "cursor.execute('DROP TABLE IF EXISTS clinical_visits;')\n",
+ "cursor.execute('''\n",
+ "CREATE TABLE clinical_visits (\n",
+ " PatientPKHash TEXT,\n",
+ " SiteCode TEXT,\n",
+ " VisitDate TEXT,\n",
+ " VisitType TEXT,\n",
+ " VisitBy TEXT,\n",
+ " NextAppointmentDate TEXT,\n",
+ " TCAReason TEXT,\n",
+ " Pregnant TEXT,\n",
+ " Breastfeeding TEXT,\n",
+ " StabilityAssessment TEXT,\n",
+ " DifferentiatedCare TEXT,\n",
+ " WHOStage INTEGER,\n",
+ " WHOStagingOI TEXT,\n",
+ " Height REAL,\n",
+ " Weight REAL, \n",
+ " EMR TEXT,\n",
+ " Project TEXT,\n",
+ " Adherence TEXT,\n",
+ " AdherenceCategory TEXT,\n",
+ " BP TEXT,\n",
+ " OI TEXT,\n",
+ " OIDate DATE,\n",
+ " CurrentRegimen TEXT,\n",
+ " AppointmentReminderWillingness TEXT,\n",
+ " key TEXT\n",
+ ");\n",
+ "''')\n",
+ "\n",
+ "# let's now populate the table with the rows variable that contains all the data from the visits table\n",
+ "cursor.executemany('''\n",
+ "INSERT INTO clinical_visits (PatientPKHash, SiteCode, VisitDate, VisitType, VisitBy, NextAppointmentDate, TCAReason, Pregnant, Breastfeeding, StabilityAssessment, DifferentiatedCare, WHOStage, WHOStagingOI, Height, Weight, EMR, Project, Adherence, AdherenceCategory, BP, OI, OIDate, CurrentRegimen, AppointmentReminderWillingness, key)\n",
+ "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);\n",
+ "''', rows)\n",
+ "conn.commit()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "9ddfa626",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<sqlite3.Cursor at 0x7d4240c3d840>"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# now let's create a data dictionary\n",
+ "cursor.execute('DROP TABLE IF EXISTS data_dictionary;')\n",
+ "cursor.execute('''\n",
+ "CREATE TABLE data_dictionary (\n",
+ " table_name TEXT,\n",
+ " column_name TEXT,\n",
+ " description TEXT);\n",
+ "''')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "d14ef687",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# populate the data dictionary with descriptions for each column in the clinical_visits table\n",
+ "cursor.execute('''\n",
+ "INSERT INTO data_dictionary (table_name, column_name, description) VALUES\n",
+ "('clinical_visits', 'PatientPKHash', 'Hashed patient identifier'),\n",
+ "('clinical_visits', 'SiteCode', 'Code for the clinical site'),\n",
+ "('clinical_visits', 'VisitDate', 'Date of the patient visit'),\n",
+ "('clinical_visits', 'VisitType', 'Type of the patient visit. Values include Unknown, SCHEDULED VISIT, UNSCHEDULED VISIT LATE,\n",
+ " UNSCHEDULED VISIT EARLY, Unscheduled, Scheduled. These should typically be grouped as Scheduled and Unscheduled'),\n",
+ "('clinical_visits', 'VisitBy', 'Provider of the visit. Values include , Self, Treatment supporter, Refill visit documentation, Other'),\n",
+ "('clinical_visits', 'NextAppointmentDate', 'Date of the next scheduled clinical appointment set during VisitDate. \n",
+ " This is typically a date in the future after VisitDate.'),\n",
+ "('clinical_visits', 'TCAReason', 'Reason for the TCA (To Come Again) status. Values include, Follow up, Lab tests, Pharmacy Refill, Counseling,Other'),\n",
+ "('clinical_visits', 'Pregnant', 'Is the patient pregnant? Values include Yes and No.'),\n",
+ "('clinical_visits', 'Breastfeeding', 'Is the patient breastfeeding? Values include Yes, No and N/A'),\n",
+ "('clinical_visits', 'StabilityAssessment', 'Stability assessment result. Values include Stable, Unstable, and not stable.\n",
+ " typically, this should be grouped as Stable and Unstable (including not stable)'),\n",
+ "('clinical_visits', 'DifferentiatedCare', 'Differentiated care model. Values include Fast Track, Standard Care,\n",
+ " Community ART Distribution peer led,\n",
+ " Facility ART distribution Group,\n",
+ " Community ART Distribution HCW Led'),\n",
+ "('clinical_visits', 'WHOStage', 'WHO stage of the patient, either 1, 2, 3, or 4'),\n",
+ "('clinical_visits', 'WHOStagingOI', 'Opportunistic infection observed during WHO staging. Values include\n",
+ " Asymptomatic, Oral hairy leukoplakia,\n",
+ " Unexplained severe weight loss, Pulmonary tuberculosis,\n",
+ " Extra pulmonary tuberculosis,\n",
+ " Unexplained severe weight loss,Pulmonary tuberculosis,\n",
+ " Recurrent upper respiratory tract infections,\n",
+ " Asymptomatic,Persistent generalized lymphadenopathy),\n",
+ " Symptomatic HIV-associated nephropathy,\n",
+ " Cryptococcal meningitis, Herpes zoster,\n",
+ " Unexplained severe weight loss,Recurrent upper respiratory tract infections,\n",
+ " Persistent generalized lymphadenopathy),\n",
+ " Minor mucocutaneous manifestations,\n",
+ " Unexplained severe weight loss,Unexplained persistent fever,Pulmonary tuberculosis,\n",
+ " Recurrent oral ulcerations, Unexplained moderate malnutrition,\n",
+ " Oral candidiasis, HIV wasting syndrome,\n",
+ " Pulmonary tuberculosis,Oral candidiasis,\n",
+ " Unexplained persistent fever'),\n",
+ "('clinical_visits', 'Height', 'Height of the patient in centimeters'),\n",
+ "('clinical_visits', 'Weight', 'Weight of the patient in kilograms'),\n",
+ "('clinical_visits', 'EMR', 'Electronic medical record information. Values include AMRS, KenyaEMR, ECARE, DREAMS'),\n",
+ "('clinical_visits', 'Project', 'Project associated with the visit. Values include Ampath Plus, Kenya HMIS II, EDARP, DREAM Kenya Trusts'),\n",
+ "('clinical_visits', 'Adherence', 'Adherence to treatment. Values include Good, , Fair, Good|, Good|Good, Poor, Poor|Poor,\n",
+ " Poor|, 0, Poor|Good, Good|Poor. This variable will typically be used in combination with AdherenceCategory, and | here should align\n",
+ " with | in that variable, indicating two values for two categories.'),\n",
+ "('clinical_visits', 'AdherenceCategory', 'Category of adherence. Values include GOOD, , FAIR, ART|CTX, ARV. \n",
+ " GOOD and FAIR are erroneous and should be dropped when the variable is used. ART and ARV should be\n",
+ " considered as ART.'),\n",
+ "('clinical_visits', 'BP', 'Blood pressure readings. Value reported as systolic/diastolic in mmHg, e.g., 120/80.'),\n",
+ "('clinical_visits', 'OI', 'Opportunistic infections present. Values include Asymptomatic, Lymphadenopathy,\n",
+ " Respiratory Tract Infections, Moderate Weight Loss'),\n",
+ "('clinical_visits', 'OIDate', 'Date of opportunistic infection diagnosis'),\n",
+ "('clinical_visits', 'CurrentRegimen', 'Current treatment regimen. Value includes two or three digit descriptions of molecules separated by / signs'),\n",
+ "('clinical_visits', 'AppointmentReminderWillingness', 'Willingness to receive appointment reminders. Values include Yes and No'),\n",
+ "('clinical_visits', 'key', 'Unique key for PatientPKHash and SiteCode combination');\n",
+ "''')\n",
+ "conn.commit()\n",
+ "conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "6e27bce5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "conn = sqlite3.connect('patient_slim.sqlite')\n",
+ "cursor = conn.cursor()\n",
+ "# pull all data from the lab table \n",
+ "cursor.execute(\"SELECT * FROM lab;\")\n",
+ "rows = cursor.fetchall()\n",
+ "df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])\n",
+ "conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "14402e96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# reconnect to patient_demonstration.sqlite\n",
+ "conn = sqlite3.connect('patient_demonstration.sqlite')\n",
+ "cursor = conn.cursor() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "540962b7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a table called lab with the column names pulled above\n",
+ "# overwrite the table if it already exists\n",
+ "cursor.execute('DROP TABLE IF EXISTS lab;')\n",
+ "cursor.execute('''\n",
+ "CREATE TABLE lab (\n",
+ " PatientPKHash TEXT,\n",
+ " SiteCode TEXT,\n",
+ " OrderedbyDate TEXT,\n",
+ " ReportedbyDate TEXT,\n",
+ " TestName TEXT,\n",
+ " TestResult TEXT,\n",
+ " key TEXT\n",
+ ");\n",
+ "''')\n",
+ "\n",
+ "# let's now populate the table with the rows variable that contains all the data from the lab table\n",
+ "cursor.executemany('''\n",
+ "INSERT INTO lab (PatientPKHash, SiteCode, OrderedbyDate, ReportedbyDate, TestName, TestResult, key)\n",
+ "VALUES (?, ?, ?, ?, ?, ?, ?);\n",
+ "''', rows)\n",
+ "conn.commit()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "8df7171e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# now, add lab table to the data dictionary\n",
+ "cursor.execute('''\n",
+ "INSERT INTO data_dictionary (table_name, column_name, description) VALUES\n",
+ "('lab', 'PatientPKHash', 'Hashed patient identifier'),\n",
+ "('lab', 'SiteCode', 'Code for the clinical site'),\n",
+ "('lab', 'OrderedbyDate', 'Date when the lab test was ordered'),\n",
+ "('lab', 'ReportedbyDate', 'Date when the lab test result was reported'),\n",
+ "('lab', 'TestName', 'Name of the lab test conducted, including CD4 Count for adults,\n",
+ " CD4 Percentage for children, and Viral Load'),\n",
+ "('lab', 'TestResult', 'Result of the lab test. This will sometimes appear as numeric value\n",
+ " and sometimes as text. Typically, when text, the value will be \"LDL\", meaning low \n",
+ " detectable level, or low HIV viral load.'),\n",
+ "('lab', 'key', 'Unique key for PatientPKHash and SiteCode combination');\n",
+ "''')\n",
+ "conn.commit()\n",
+ "conn.close() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "b66d3dbb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "conn = sqlite3.connect('patient_slim.sqlite')\n",
+ "cursor = conn.cursor()\n",
+ "# pull all data from the pharmacy table \n",
+ "cursor.execute(\"SELECT * FROM pharmacy;\")\n",
+ "rows = cursor.fetchall()\n",
+ "df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])\n",
+ "conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "435b8d4e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# reconnect to patient_demonstration.sqlite\n",
+ "conn = sqlite3.connect('patient_demonstration.sqlite')\n",
+ "cursor = conn.cursor() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "b3753eeb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a table called pharmacy with the column names pulled above\n",
+ "# overwrite the table if it already exists\n",
+ "cursor.execute('DROP TABLE IF EXISTS pharmacy;')\n",
+ "cursor.execute('''\n",
+ "CREATE TABLE pharmacy (\n",
+ " PatientPKHash TEXT,\n",
+ " SiteCode TEXT,\n",
+ " Drug TEXT,\n",
+ " DispenseDate TEXT,\n",
+ " ExpectedReturn TEXT,\n",
+ " Duration INTEGER,\n",
+ " TreatmentType TEXT,\n",
+ " RegimenLine TEXT,\n",
+ " RegimenChangedSwitched TEXT,\n",
+ " RegimenChangeSwitchedReason TEXT,\n",
+ " key TEXT\n",
+ ");\n",
+ "''')\n",
+ "\n",
+ "# let's now populate the table with the rows variable that contains all the data from the pharmacy table\n",
+ "cursor.executemany('''\n",
+ "INSERT INTO pharmacy (PatientPKHash, SiteCode, Drug, DispenseDate, ExpectedReturn, Duration, TreatmentType, RegimenLine, RegimenChangedSwitched, RegimenChangeSwitchedReason, key)\n",
+ "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);\n",
+ "''', rows)\n",
+ "conn.commit()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "8b8ed08a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# now, add pharmacy table to the data dictionary\n",
+ "cursor.execute('''\n",
+ "INSERT INTO data_dictionary (table_name, column_name, description) VALUES\n",
+ "('pharmacy', 'PatientPKHash', 'Hashed patient identifier'),\n",
+ "('pharmacy', 'SiteCode', 'Code for the clinical site'),\n",
+ "('pharmacy', 'Drug', 'Description of the drug prescribed, reported as collection of molecules (e.g. 3TC+DTG+TDF). Most common are ARVs for HIV'),\n",
+ "('pharmacy', 'DispenseDate', 'Date when the drug was dispensed'),\n",
+ "('pharmacy', 'ExpectedReturn', 'Expected return date for the next pharmacy visit'),\n",
+ "('pharmacy', 'Duration', 'Duration in number of days for which the drug is prescribed. Any duration of 60 days or greater is considered a multi-month dispensing (MMD).'),\n",
+ "('pharmacy', 'TreatmentType', 'Type of treatment. Values include ARV, PMTCT, Prophylaxis.'),\n",
+ "('pharmacy', 'RegimenLine', 'Line of treatment regimen. Valid values include First Line, Second Line, Third Line'),\n",
+ "('pharmacy', 'RegimenChangedSwitched', 'Indicates if the regimen was changed or switched. Valid values are Switch and Substition. Otherwise, regimen was not changed.'),\n",
+ "('pharmacy', 'RegimenChangeSwitchedReason', 'Reason for changing or switching the regimen. Valid values include New drug available, Virological failure, Drugs out of stock, Drug toxicity, New Diagnosis of tuberculosis, and Other.'),\n",
+ "('pharmacy', 'key', 'Unique key for PatientPKHash and SiteCode combination');\n",
+ "''')\n",
+ "conn.commit()\n",
+ "conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "2de65432",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "conn = sqlite3.connect('patient_slim.sqlite')\n",
+ "cursor = conn.cursor()\n",
+ "# pull all data from the demographics table \n",
+ "cursor.execute(\"SELECT * FROM demographics;\")\n",
+ "rows = cursor.fetchall()\n",
+ "df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])\n",
+ "conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "a7a10f4f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# reconnect to patient_demonstration.sqlite\n",
+ "conn = sqlite3.connect('patient_demonstration.sqlite')\n",
+ "cursor = conn.cursor() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "947c63d5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a table called demographics with the column names pulled above\n",
+ "# overwrite the table if it already exists\n",
+ "cursor.execute('DROP TABLE IF EXISTS demographics;')\n",
+ "cursor.execute('''\n",
+ "CREATE TABLE demographics (\n",
+ " PatientPKHash TEXT,\n",
+ " MFLCode TEXT,\n",
+ " FacilityName TEXT,\n",
+ " County TEXT,\n",
+ " SubCounty TEXT,\n",
+ " PartnerName TEXT,\n",
+ " AgencyName TEXT,\n",
+ " Sex TEXT,\n",
+ " MaritalStatus TEXT,\n",
+ " EducationLevel TEXT,\n",
+ " Occupation TEXT,\n",
+ " OnIPT TEXT,\n",
+ " AgeGroup TEXT,\n",
+ " ARTOutcomeDescription TEXT,\n",
+ " AsOfDate TEXT,\n",
+ " LoadDate TEXT,\n",
+ " StartARTDate TEXT,\n",
+ " DOB TEXT,\n",
+ " key TEXT\n",
+ ");\n",
+ "''')\n",
+ "\n",
+ "# let's now populate the table with the rows variable that contains all the data from the demographics table\n",
+ "cursor.executemany('''\n",
+ "INSERT INTO demographics (PatientPKHash, MFLCode, FacilityName, County, SubCounty, PartnerName, AgencyName, Sex, MaritalStatus, EducationLevel, Occupation, OnIPT, AgeGroup, ARTOutcomeDescription, AsOfDate, LoadDate, StartARTDate, DOB, key)\n",
+ "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);\n",
+ "''', rows)\n",
+ "conn.commit()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "9cff0d90",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# now, add demographics table to the data dictionary\n",
+ "cursor.execute('''\n",
+ "INSERT INTO data_dictionary (table_name, column_name, description) VALUES\n",
+ "('demographics', 'PatientPKHash', 'Hashed patient identifier'),\n",
+ "('demographics', 'MFLCode', 'Code for the clinical site, same as SiteCode'),\n",
+ "('demographics', 'FacilityName', 'Name of the clinical facility'),\n",
+ "('demographics', 'County', 'County where the patient is located'),\n",
+ "('demographics', 'SubCounty', 'Sub-county where the patient is located'),\n",
+ "('demographics', 'PartnerName', 'Name of the implementing partner that manages the facility'),\n",
+ "('demographics', 'AgencyName', 'Name of the agency that supports the facility'),\n",
+ "('demographics', 'Sex', 'Sex of the patient. Valid values are male and female. Capitalization is not standardized so always set to lower case.'),\n",
+ "('demographics', 'MaritalStatus', 'Marital status of the patient. Valid values include married monogamous,\n",
+ " married polygamous, single, divorced, widowed, cohabiting, separated. There are also some erroneous values \n",
+ " that should be ignored and treated as missing.'),\n",
+ "('demographics', 'EducationLevel', 'Education level of the patient. Valid values primary, secondary, tertiary, none. \n",
+ " there is a value for NULL that should be treated as missing.'),\n",
+ "('demographics', 'Occupation', 'Occupation of the patient. Valid values include farmer, trader, none (for unemployed),\n",
+ " student, self employed, professional, employee, driver, and NULL that should be treated as missing.'),\n",
+ "('demographics', 'OnIPT', 'Indicates if the patient is on IPT. This is all null.'),\n",
+ "('demographics', 'AgeGroup', 'Age group of the patient. This is all null.'),\n",
+ "('demographics', 'ARTOutcomeDescription', 'Description of the ART outcome. Valid values include active, dead,\n",
+ " loss to follow up, transferred out, undocumented loss, and lost in hmis.'),\n",
+ "('demographics', 'AsOfDate', 'Date as of which the data is reported'),\n",
+ "('demographics', 'LoadDate', 'Date when the data was loaded'),\n",
+ "('demographics', 'StartARTDate', 'Date when the patient started ART'),\n",
+ "('demographics', 'DOB', 'Date of birth of the patient'),\n",
+ "('demographics', 'key', 'Unique key for PatientPKHash and MFLCode combination');\n",
+ "''')\n",
+ "conn.commit()\n",
+ "conn.close()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
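Since the notebook pairs every table with data_dictionary rows, a small sanity check can confirm the dictionary covers all four tables; a sketch, assuming the database sits at the path the app now uses:

import sqlite3
import pandas as pd

conn = sqlite3.connect("data/processed/patient_demonstration.sqlite")
dd = pd.read_sql_query(
    "SELECT table_name, column_name, description FROM data_dictionary", conn
)
# One row per documented column; expect clinical_visits, lab,
# pharmacy, and demographics to all appear.
print(dd.groupby("table_name").size())
conn.close()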
notebooks/create_slim_patient_db.ipynb ADDED
@@ -0,0 +1,308 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "c867740b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sqlite3\n",
+ "import pandas as pd\n",
+ "# inspect current database schema\n",
+ "conn = sqlite3.connect('iit_test.sqlite')\n",
+ "cursor = conn.cursor()\n",
+ "# list tables\n",
+ "# pull all data from the visits table \n",
+ "cursor.execute(\"SELECT * FROM visits;\")\n",
+ "rows = cursor.fetchall()\n",
+ "df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])\n",
+ "conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "f424fcf6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_2997/3546205200.py:11: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ "  sampled_df['PatientPKHash'] = sampled_df['PatientPKHash'].map(key_to_number)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# pick ten unique keys at random from df without replacement\n",
+ "sampled_keys = df['PatientPKHash'].drop_duplicates().sample(n=10, random_state=42).tolist()\n",
+ "\n",
+ "# filter dataframe to only include sampled keys\n",
+ "sampled_df = df[df['PatientPKHash'].isin(sampled_keys)]\n",
+ "\n",
+ "# create a dict with key as key and numbers 1-10 as values\n",
+ "key_to_number = {key: i+1 for i, key in enumerate(sampled_keys)}\n",
+ "\n",
+ "# replace key column in sampled_df with corresponding number from key_to_number\n",
+ "sampled_df['PatientPKHash'] = sampled_df['PatientPKHash'].map(key_to_number)\n",
+ "\n",
+ "# save sampled_df to patient_slim.sqlite as the visits table\n",
+ "sampled_conn = sqlite3.connect('patient_slim.sqlite')\n",
+ "sampled_df.to_sql('visits', sampled_conn, if_exists='replace', index=False)\n",
+ "sampled_conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "8615f9fa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(271, 25)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sampled_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "1bad1098",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_2997/4153193150.py:11: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ "  sampled_pharmacy_df['PatientPKHash'] = sampled_pharmacy_df['PatientPKHash'].map(key_to_number)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# now, read in pharmacy table from iit_test.sqlite\n",
+ "conn = sqlite3.connect('iit_test.sqlite')\n",
+ "cursor = conn.cursor()\n",
+ "cursor.execute(\"SELECT * FROM pharmacy;\")\n",
+ "rows = cursor.fetchall()\n",
+ "pharmacy_df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])\n",
+ "conn.close()\n",
+ "\n",
+ "# filter these to the same 10 keys, replace the keys with numbers 1-10, and save to patient_slim.sqlite as the pharmacy table\n",
+ "sampled_pharmacy_df = pharmacy_df[pharmacy_df['PatientPKHash'].isin(sampled_keys)]\n",
+ "sampled_pharmacy_df['PatientPKHash'] = sampled_pharmacy_df['PatientPKHash'].map(key_to_number)\n",
+ "sampled_conn = sqlite3.connect('patient_slim.sqlite')\n",
+ "sampled_pharmacy_df.to_sql('pharmacy', sampled_conn, if_exists='replace', index=False)\n",
+ "sampled_conn.close()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "bc8fac93",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PatientPKHash\n",
+ "1     14\n",
+ "2     24\n",
+ "3     24\n",
+ "4      9\n",
+ "5     40\n",
+ "6      1\n",
+ "7     15\n",
+ "8      1\n",
+ "9     64\n",
+ "10    14\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sampled_pharmacy_df.groupby('PatientPKHash').size()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "df01b886",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_2997/3478231606.py:11: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ "  sampled_lab_df['PatientPKHash'] = sampled_lab_df['PatientPKHash'].map(key_to_number)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# repeat the process above for lab table\n",
+ "conn = sqlite3.connect('iit_test.sqlite')\n",
+ "cursor = conn.cursor()\n",
+ "cursor.execute(\"SELECT * FROM lab;\")\n",
+ "rows = cursor.fetchall()\n",
+ "lab_df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])\n",
+ "conn.close()\n",
+ "\n",
+ "# filter these to the same 10 keys, replace the keys with numbers 1-10, and save to patient_slim.sqlite as the lab table\n",
+ "sampled_lab_df = lab_df[lab_df['PatientPKHash'].isin(sampled_keys)]\n",
+ "sampled_lab_df['PatientPKHash'] = sampled_lab_df['PatientPKHash'].map(key_to_number)\n",
+ "sampled_conn = sqlite3.connect('patient_slim.sqlite')\n",
+ "sampled_lab_df.to_sql('lab', sampled_conn, if_exists='replace', index=False)\n",
+ "sampled_conn.close()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "2578bf85",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PatientPKHash\n",
+ "1      6\n",
+ "2      2\n",
+ "3     17\n",
+ "4     22\n",
+ "5     23\n",
+ "6      1\n",
+ "7      2\n",
+ "8     10\n",
+ "9     13\n",
+ "10    12\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sampled_lab_df.groupby('PatientPKHash').size()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "ebf358c5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_2997/3867144072.py:11: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ "  sampled_dem_df['PatientPKHash'] = sampled_dem_df['PatientPKHash'].map(key_to_number)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# now, from dem table\n",
+ "conn = sqlite3.connect('iit_test.sqlite')\n",
+ "cursor = conn.cursor()\n",
+ "cursor.execute(\"SELECT * FROM dem;\")\n",
+ "rows = cursor.fetchall()\n",
+ "dem_df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])\n",
+ "conn.close() \n",
+ "\n",
+ "# filter these to the same 10 keys, replace the keys with numbers 1-10, and save to patient_slim.sqlite as the demographics table\n",
+ "sampled_dem_df = dem_df[dem_df['PatientPKHash'].isin(sampled_keys)]\n",
+ "sampled_dem_df['PatientPKHash'] = sampled_dem_df['PatientPKHash'].map(key_to_number)\n",
+ "sampled_conn = sqlite3.connect('patient_slim.sqlite')\n",
+ "sampled_dem_df.to_sql('demographics', sampled_conn, if_exists='replace', index=False)\n",
+ "sampled_conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "527420fa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PatientPKHash\n",
+ "1     1\n",
+ "2     1\n",
+ "3     1\n",
+ "4     1\n",
+ "5     1\n",
+ "6     1\n",
+ "7     1\n",
+ "8     1\n",
+ "9     1\n",
+ "10    1\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sampled_dem_df.groupby('PatientPKHash').size()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
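Each sampling cell above raises SettingWithCopyWarning because the filtered frame is a view. Taking an explicit copy before remapping the identifiers avoids the warning; a sketch of the pattern:

import pandas as pd

def anonymize_keys(frame: pd.DataFrame, keys, key_to_number) -> pd.DataFrame:
    # .copy() makes the assignment below act on an independent frame,
    # avoiding pandas' SettingWithCopyWarning seen in the cell outputs.
    out = frame[frame["PatientPKHash"].isin(keys)].copy()
    out["PatientPKHash"] = out["PatientPKHash"].map(key_to_number)
    return out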
notebooks/create_textrag.ipynb ADDED
@@ -0,0 +1,139 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1d13fafe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "import os\n",
+ "import asyncio\n",
+ "from llama_parse import LlamaParse\n",
+ "from llama_index.core import VectorStoreIndex\n",
+ "from llama_index.core.node_parser import SimpleNodeParser\n",
+ "from llama_index.core.schema import Document\n",
+ "import nest_asyncio\n",
+ "nest_asyncio.apply()\n",
+ "\n",
+ "from dotenv import load_dotenv\n",
+ "load_dotenv(\"../config.env\")\n",
+ "os.environ.get(\"OPENAI_API_KEY\")\n",
+ "os.environ.get(\"LLAMAPARSE_API_KEY\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0a79afb5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# instantiate LlamaParse\n",
+ "parser = LlamaParse(\n",
+ "    api_key=os.environ.get(\"LLAMAPARSE_API_KEY\"),\n",
+ "    result_type=\"markdown\",  # or \"text\"\n",
+ "    extract_charts=True,\n",
+ "    auto_mode=True,\n",
+ "    auto_mode_trigger_on_image_in_page=True,\n",
+ "    auto_mode_trigger_on_table_in_page=True,\n",
+ "    bbox_top=0.05,\n",
+ "    bbox_bottom=0.1,\n",
+ "    verbose=True\n",
+ ")\n",
+ "\n",
+ "# documents = parser.load_data(f\"GuidelinesSections/Kenya-ARV-Guidelines-2022-HepB-HepC-Coinfection.pdf\")\n",
+ "# # Write the output to a file\n",
+ "# with open(\"output.md\", \"w\", encoding=\"utf-8\") as f:\n",
+ "#     for doc in documents:\n",
+ "#         f.write(doc.text)\n",
+ "# filename=\"GuidelinesSections/Kenya-ARV-Guidelines-2022-HepB-HepC-Coinfection.pdf\"\n",
+ "# full_text = \"\\n\\n\".join(doc.text for doc in documents)\n",
+ "# combined_doc = Document(text=full_text)\n",
+ "# node_parser = SimpleNodeParser()\n",
+ "# nodes = node_parser.get_nodes_from_documents([combined_doc])\n",
+ "# # create the index\n",
+ "# index = VectorStoreIndex(nodes)\n",
+ "# # remove \"Kenya-ARV-Guidelines-2022-\" from filename\n",
+ "# short_filename = filename.replace(\"GuidelinesSections/Kenya-ARV-Guidelines-2022-\",\"\").replace(\".pdf\", \"\")\n",
+ "# # persist the index\n",
+ "# index.storage_context.persist(f\"lp/indices/{short_filename}\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4e94da2b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ea85ed0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# iterate through all files in guidance_docs/GuidelinesSections\n",
+ "# first, load the data using the parser\n",
+ "# then, flatten the data in each doc to create a single large doc per section\n",
+ "# finally, chunk the data using SentenceSplitter (tight size control)\n",
+ "async def parse_docs():\n",
+ "    for filename in os.listdir(\"GuidelinesSections\"):\n",
+ "        if filename.endswith(\".pdf\"):\n",
+ "            documents = parser.load_data(f\"GuidelinesSections/{filename}\")\n",
+ "            full_text = \"\\n\\n\".join(doc.text for doc in documents)\n",
+ "            combined_doc = Document(text=full_text)\n",
+ "            node_parser = SimpleNodeParser()\n",
+ "            nodes = node_parser.get_nodes_from_documents([combined_doc])\n",
+ "            # create the index\n",
+ "            index = VectorStoreIndex(nodes)\n",
+ "            # remove \"Kenya-ARV-Guidelines-2022-\" from filename\n",
+ "            short_filename = filename.replace(\"Kenya-ARV-Guidelines-2022-\",\"\").replace(\".pdf\", \"\")\n",
+ "            # persist the index\n",
+ "            index.storage_context.persist(f\"lp/indices/{short_filename}\")\n",
+ "    \n",
+ "await parse_docs()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7135ce0d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bfa61623",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "clinician-assistant-lg",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
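Once parse_docs has persisted a per-section index, it can be loaded back for querying; a sketch using llama_index's storage API (the persist_dir shown is derived from the commented-out example filename, and the query string is illustrative):

from llama_index.core import StorageContext, load_index_from_storage

# Reload one persisted section index and query it.
storage_context = StorageContext.from_defaults(
    persist_dir="lp/indices/HepB-HepC-Coinfection"
)
index = load_index_from_storage(storage_context)
response = index.as_query_engine().query("How is HBV coinfection managed?")
print(response)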
notebooks/gen_idsr_rag.ipynb ADDED
@@ -0,0 +1,562 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "da62e982",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from pprint import pprint\n",
+ "import os\n",
+ "from dotenv import load_dotenv\n",
+ "\n",
+ "load_dotenv(\"../config.env\")\n",
+ "os.environ.get(\"OPENAI_API_KEY\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7b2b560b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read in IDSR.txt\n",
+ "with open(\"IDSR.txt\", encoding=\"utf-8\") as f:\n",
+ "    text = f.read()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "50d72066",
+ "metadata": {},
+ "source": [
+ "Extract Keywords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "75a4c7bf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prompt = \"\"\"\n",
+ "You are a helpful assistant. Extract a list of 30–50 key symptoms, signs, or diagnostic terms from the following disease descriptions.\n",
+ "\n",
+ "Focus on words or phrases that are likely to appear in clinical case definitions or user queries — such as \"fever\", \"skin lesions\", \"swollen lymph nodes\", \"positive blood smear\", etc.\n",
+ "\n",
+ "Only return the keywords or short phrases — one per line.\n",
+ "\n",
+ "Text:\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4f704812",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from openai import OpenAI\n",
+ "\n",
+ "client = OpenAI()\n",
+ "response = client.chat.completions.create(\n",
+ "    model=\"gpt-4o\",\n",
+ "    messages=[\n",
+ "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+ "        {\"role\": \"user\", \"content\": prompt + text}\n",
+ "    ],\n",
+ "    temperature=0.0\n",
+ ")\n",
+ "keywords = [line.strip() for line in response.choices[0].message.content.splitlines() if line.strip()]\n",
+ "print(\"Extracted Keywords:\")\n",
+ "for keyword in keywords:\n",
+ "    print(\"-\", keyword)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9f698154",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# remove dashes and normalize keywords\n",
+ "def normalize_kw(kw):\n",
+ "    return kw.lstrip(\"-• \").strip().lower() \n",
+ "keywords = [normalize_kw(kw) for kw in keywords]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "11324098",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save keywords to file\n",
+ "with open(\"idsr_keywords.txt\", \"w\", encoding=\"utf-8\") as f:\n",
+ "    for keyword in keywords:\n",
+ "        f.write(f\"{keyword}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "add8c3fe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load file\n",
+ "with open(\"idsr_keywords.txt\", \"r\", encoding=\"utf-8\") as f:\n",
+ "    keywords = [line.strip() for line in f if line.strip()]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1d12b253",
+ "metadata": {},
+ "source": [
+ "Prep each disease as a document"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2923ecab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# we need to split the text into a list of dictionaries:\n",
+ "# the text is structured as follows:\n",
+ "# the section for each disease starts after an empty line.\n",
+ "# the disease name itself takes up the first line.\n",
+ "# following the disease name, there will be subsections, each one beginning with an \"-\", some text, and then a colon. \n",
+ "# what is between the \"-\" and the colon is the name of the subsection. the name of each subsection takes up one line.\n",
+ "# following this, the next few lines contain the text for that subsection. however many lines it takes up,\n",
+ "# this should be the value for the subsection key in the dictionary, condensed to a single string.\n",
+ "# some diseases have multiple subsections, while others have only one.\n",
+ "# when we encounter an empty line, it indicates the start of a new disease section.\n",
+ "# what we should produce is one dictionary per disease, with a key called disease_name and value being the name of the disease. \n",
+ "# the other keys should be the subsections, with the value being the text that follows the subsection name.\n",
+ "\n",
+ "def parse_disease_text(text):\n",
+ "    diseases = []\n",
+ "    lines = text.strip().splitlines()\n",
+ "    \n",
+ "    current_disease = None\n",
+ "    current_subsection = None\n",
+ "    buffer = []\n",
+ "\n",
+ "    def finalize_subsection():\n",
+ "        if current_disease is not None and current_subsection and buffer:\n",
+ "            content = \" \".join(line.strip() for line in buffer).strip()\n",
+ "            current_disease[current_subsection] = content\n",
+ "\n",
+ "    subsection_pattern = re.compile(r\"^-\\s*(.+):\\s*$\")\n",
+ "\n",
+ "    for line in lines + [\"\"]:  # Extra empty line to trigger final save\n",
+ "        if not line.strip():\n",
+ "            finalize_subsection()\n",
+ "            if current_disease:\n",
+ "                diseases.append(current_disease)\n",
+ "            current_disease = None\n",
+ "            current_subsection = None\n",
+ "            buffer = []\n",
+ "            continue\n",
+ "\n",
+ "        if current_disease is None:\n",
+ "            current_disease = {\"disease_name\": line.strip()}\n",
+ "            continue\n",
+ "\n",
+ "        match = subsection_pattern.match(line)\n",
+ "        if match:\n",
+ "            finalize_subsection()\n",
+ "            current_subsection = match.group(1).strip()\n",
+ "            buffer = []\n",
+ "        else:\n",
+ "            buffer.append(line.rstrip())\n",
+ "\n",
+ "    return diseases\n",
+ "\n",
+ "\n",
+ "\n",
+ "disease_dicts = parse_disease_text(text)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2fd83b33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain_core.documents import Document\n",
+ "\n",
+ "def convert_disease_dicts_to_documents(disease_dicts):\n",
+ "    docs = []\n",
+ "    for disease in disease_dicts:\n",
+ "        disease_name = disease.get(\"disease_name\", \"\")\n",
+ "        subsections = [f\"{key}:\\n{value}\" for key, value in disease.items() if key != \"disease_name\"]\n",
+ "        full_text = f\"Disease: {disease_name}\\n\\n\" + \"\\n\\n\".join(subsections)\n",
+ "        docs.append(Document(page_content=full_text, metadata={\"disease_name\": disease_name}))\n",
+ "    return docs\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "19baadb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Step 2: Convert to LangChain documents\n",
+ "documents = convert_disease_dicts_to_documents(disease_dicts)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "15bc8f40",
+ "metadata": {},
+ "source": [
+ "Tag each document with keywords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "33d70fff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from rapidfuzz import fuzz\n",
+ "\n",
+ "def tag_documents_with_keywords(documents, keywords, threshold=85):\n",
+ "    \"\"\"\n",
+ "    Tags each Document in the list with a 'matched_keywords' metadata field\n",
+ "    using fuzzy matching (e.g., RapidFuzz partial ratio).\n",
+ "\n",
+ "    Parameters:\n",
+ "        documents (list): List of langchain `Document` objects.\n",
+ "        keywords (list): List of predefined clinical keywords (e.g. from GPT).\n",
+ "        threshold (int): Similarity threshold (0–100) for fuzzy matching.\n",
+ "\n",
+ "    Returns:\n",
+ "        List of tagged Document objects with updated metadata.\n",
+ "    \"\"\"\n",
+ "    tagged = []\n",
+ "\n",
+ "    for doc in documents:\n",
+ "        content = doc.page_content.lower()\n",
+ "\n",
+ "        # Match keywords against document content\n",
+ "        matched = []\n",
+ "        for kw in keywords:\n",
+ "            kw_lower = kw.lower()\n",
+ "            if fuzz.partial_ratio(kw_lower, content) >= threshold:\n",
+ "                matched.append(kw)\n",
+ "\n",
+ "        # Add tags to metadata\n",
+ "        doc.metadata[\"matched_keywords\"] = matched\n",
+ "        tagged.append(doc)\n",
+ "\n",
+ "    return tagged\n",
+ "\n",
+ "tagged_documents = tag_documents_with_keywords(documents, keywords)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b588f56e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "# Convert Document objects to dicts\n",
+ "doc_dicts = [doc.dict() for doc in tagged_documents]\n",
+ "\n",
+ "with open(\"tagged_documents.json\", \"w\", encoding=\"utf-8\") as f:\n",
+ "    json.dump(doc_dicts, f, ensure_ascii=False, indent=2)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "166513b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load tagged documents from file\n",
+ "import json\n",
+ "from langchain_core.documents import Document\n",
+ "with open(\"tagged_documents.json\", \"r\", encoding=\"utf-8\") as f:\n",
+ "    tagged_documents = [Document(**doc) for doc in json.load(f)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7f586616",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tagged_documents[50]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "39882d72",
+ "metadata": {},
+ "source": [
+ "Fuzzy-match query to keywords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "db127464",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from rapidfuzz import fuzz\n",
+ "\n",
+ "def find_keywords_in_prompt(prompt, keywords, threshold=80):\n",
+ "    \"\"\"\n",
+ "    Returns all keywords that appear in the prompt using fuzzy matching.\n",
+ "    \n",
+ "    Args:\n",
+ "        prompt (str): The user prompt.\n",
+ "        keywords (list): List of keywords to match.\n",
+ "        threshold (int): Fuzzy match threshold (0-100).\n",
+ "    \n",
+ "    Returns:\n",
+ "        list: Matched keywords.\n",
+ "    \"\"\"\n",
+ "    prompt_lower = prompt.lower()\n",
+ "    matched = []\n",
+ "    for kw in keywords:\n",
+ "        kw_lower = kw.lower()\n",
+ "        # Use partial_ratio for substring-like matching\n",
+ "        if fuzz.partial_ratio(kw_lower, prompt_lower) >= threshold:\n",
+ "            matched.append(kw)\n",
+ "    return matched\n",
+ "\n",
+ "# Example usage:\n",
+ "# keywords = [\"fever\", \"skin lesions\", \"swollen lymph nodes\"]\n",
+ "# prompt = \"The patient presents with fever and swollen nodes.\"\n",
+ "# print(find_keywords_in_prompt(prompt, keywords))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e51dd2f1",
+ "metadata": {},
+ "source": [
+ "GPT to match query to keywords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d51d699e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import List\n",
+ "from pydantic import BaseModel, Field\n",
+ "from langchain_core.output_parsers import PydanticOutputParser\n",
+ "from langchain.prompts import PromptTemplate\n",
+ "from langchain.chat_models import ChatOpenAI\n",
+ "from langchain.chains import LLMChain\n",
+ "\n",
+ "class KeywordsOutput(BaseModel):\n",
+ "    keywords: List[str] = Field(description=\"List of relevant keywords extracted from the query\")\n",
+ "\n",
+ "def extract_keywords_with_gpt(query: str, known_keywords: List[str]) -> List[str]:\n",
+ "    parser = PydanticOutputParser(pydantic_object=KeywordsOutput)\n",
+ "\n",
+ "    prompt = PromptTemplate(\n",
+ "        template=\"\"\"\n",
+ "You are helping identify relevant medical concepts. \n",
+ "Given this query: \"{query}\"\n",
+ "\n",
+ "Select the most relevant keywords from this list:\n",
+ "{keyword_list}\n",
+ "\n",
+ "Return the matching keywords as a JSON object with a single key \"keywords\" whose value is a list of strings.\n",
+ "\n",
+ "{format_instructions}\n",
+ "\"\"\",\n",
+ "        input_variables=[\"query\", \"keyword_list\"],\n",
+ "        partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
+ "    )\n",
+ "\n",
+ "    chain = LLMChain(\n",
+ "        llm=ChatOpenAI(temperature=0, model=\"gpt-4o\"),\n",
+ "        prompt=prompt,\n",
+ "        output_parser=parser,\n",
+ "    )\n",
+ "\n",
+ "    output = chain.run(query=query, keyword_list=\", \".join(known_keywords))\n",
+ "\n",
+ "    # output is parsed into a KeywordsOutput instance; return its keywords list\n",
+ "    return output.keywords\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "45fdb67b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# matched_keywords = extract_keywords_with_gpt(query = \"child presenting with lesions\", known_keywords = keywords)\n",
+ "# print(\"Matched Keywords:\", matched_keywords)\n",
+ "type(matched_keywords)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9c4c9bc",
+ "metadata": {},
+ "source": [
+ "Hybrid search using matched keywords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2e59aa39",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def hybrid_search_with_query_keywords(query, vectorstore, documents, keyword_list, top_k=5):\n",
+ "    # Step 1: Semantic search\n",
+ "    semantic_hits = vectorstore.similarity_search(query, k=top_k)\n",
+ "\n",
+ "    # Step 2: Use GPT to extract keywords from the query\n",
+ "    matched_keywords = extract_keywords_with_gpt(query, keyword_list)\n",
+ "\n",
+ "    # Step 3: Filter docs whose metadata has any of those keywords\n",
+ "    keyword_hits = [\n",
+ "        doc for doc in documents\n",
+ "        if any(\n",
+ "            normalize_kw(kw1) == normalize_kw(kw2)\n",
+ "            for kw1 in doc.metadata.get(\"matched_keywords\", [])\n",
+ "            for kw2 in matched_keywords\n",
+ "        )\n",
457
+ " )\n",
458
+ " ]\n",
459
+ "\n",
460
+ " for kw in matched_keywords:\n",
461
+ " print(f\"Matched keyword: {kw}\")\n",
462
+ "\n",
463
+ " # print metadata of keyword_hits\n",
464
+ " for doc in keyword_hits:\n",
465
+ " print(doc.metadata.get(\"disease_name\"))\n",
466
+ " print(doc.metadata.get(\"matched_keywords\"))\n",
467
+ " print(doc.page_content)\n",
468
+ "\n",
469
+ " # Step 4: Merge by unique content\n",
470
+ " merged = {doc.page_content: doc for doc in semantic_hits + keyword_hits}\n",
471
+ " return list(merged.values()), matched_keywords\n"
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "code",
476
+ "execution_count": null,
477
+ "id": "b215b0fb",
478
+ "metadata": {},
479
+ "outputs": [],
480
+ "source": [
481
+ "from langchain_openai import OpenAIEmbeddings\n",
482
+ "from langchain.vectorstores import FAISS\n",
483
+ "\n",
484
+ "embedding_model = OpenAIEmbeddings()\n",
485
+ "\n",
486
+ "# `documents` is the list of LangChain Document objects from before\n",
487
+ "vectorstore = FAISS.from_documents(tagged_documents, embedding_model)\n",
488
+ "\n",
489
+ "vectorstore.save_local(\"disease_vectorstore\")"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": null,
495
+ "id": "96ffa9b2",
496
+ "metadata": {},
497
+ "outputs": [],
498
+ "source": [
499
+ "# Startup:\n",
500
+ "from langchain.vectorstores import FAISS\n",
501
+ "from langchain_openai import OpenAIEmbeddings\n",
502
+ "vectorstore = FAISS.load_local(\"disease_vectorstore\", OpenAIEmbeddings(),allow_dangerous_deserialization=True)\n",
503
+ "\n",
504
+ "# Query time:\n",
505
+ "query = \"child presenting with lesions\"\n",
506
+ "results, matched = hybrid_search_with_query_keywords(query, vectorstore, tagged_documents, keywords)\n",
507
+ "\n",
508
+ "# print(\"Matched keywords:\", matched)\n",
509
+ "# for doc in results:\n",
510
+ "# print(\"---\")\n",
511
+ "# print(doc.metadata.get(\"disease_name\"))\n",
512
+ "# print(doc.metadata.get(\"matched_keywords\"))\n",
513
+ "# print(doc.page_content)\n",
514
+ "\n",
515
+ "\n"
516
+ ]
517
+ },
518
+ {
519
+ "cell_type": "code",
520
+ "execution_count": null,
521
+ "id": "38fb3c90",
522
+ "metadata": {},
523
+ "outputs": [],
524
+ "source": [
525
+ "# doc=tagged_documents[0].metadata.get(\"matched_keywords\")\n",
526
+ "doc\n",
527
+ "# matched_keywords\n",
528
+ "# doc in matched_keywords\n",
529
+ "\n"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": null,
535
+ "id": "ed6a99f4",
536
+ "metadata": {},
537
+ "outputs": [],
538
+ "source": []
539
+ }
540
+ ],
541
+ "metadata": {
542
+ "kernelspec": {
543
+ "display_name": ".venv",
544
+ "language": "python",
545
+ "name": "python3"
546
+ },
547
+ "language_info": {
548
+ "codemirror_mode": {
549
+ "name": "ipython",
550
+ "version": 3
551
+ },
552
+ "file_extension": ".py",
553
+ "mimetype": "text/x-python",
554
+ "name": "python",
555
+ "nbconvert_exporter": "python",
556
+ "pygments_lexer": "ipython3",
557
+ "version": "3.12.1"
558
+ }
559
+ },
560
+ "nbformat": 4,
561
+ "nbformat_minor": 5
562
+ }
scripts/build_location_db.py ADDED
@@ -0,0 +1,36 @@
1
+ import os
2
+ import sqlite3
3
+ import pandas as pd
4
+
5
+ # Define file paths (data/ lives at the repo root, one level above scripts/)
6
+ base_dir = os.path.dirname(__file__)
7
+ raw_dir = os.path.abspath(os.path.join(base_dir, "..", "data", "raw"))
8
+ processed_dir = os.path.abspath(os.path.join(base_dir, "..", "data", "processed"))
9
+ os.makedirs(processed_dir, exist_ok=True)
10
+
11
+ # Input CSVs
12
+ disease_path = os.path.join(raw_dir, "kenya_disease_county_matrix.csv")
13
+ xwalk_path = os.path.join(raw_dir, "sitecode_county_xwalk.csv")
14
+ rainy_path = os.path.join(raw_dir, "kenya_counties_rainy_seasons.csv")
15
+ who_path = os.path.join(raw_dir, "who_bulletin.csv")
16
+
17
+ # Output DB
18
+ db_path = os.path.join(processed_dir, "location_data.sqlite")
19
+
20
+ # Read CSVs
21
+ disease_df = pd.read_csv(disease_path)
22
+ xwalk_df = pd.read_csv(xwalk_path)
23
+ rainy_df = pd.read_csv(rainy_path)
24
+ who_df = pd.read_csv(who_path)
25
+
26
+ # Write to SQLite
27
+ conn = sqlite3.connect(db_path)
28
+ disease_df.to_sql('county_disease_info', conn, if_exists='replace', index=False)
29
+ xwalk_df.to_sql('sitecode_county_xwalk', conn, if_exists='replace', index=False)
30
+ rainy_df.to_sql('county_rainy_seasons', conn, if_exists='replace', index=False)
31
+ who_df.to_sql('who_bulletin', conn, if_exists='replace', index=False)
32
+
33
+ conn.commit()
34
+ conn.close()
35
+
36
+ print(f"SQLite database written to: {db_path}")
scripts/parse_guidelines.py ADDED
@@ -0,0 +1,58 @@
1
+ import os
2
+ import asyncio
3
+ from dotenv import load_dotenv
4
+
5
+ from llama_parse import LlamaParse
6
+ from llama_index.core import VectorStoreIndex
7
+ from llama_index.core.node_parser import SimpleNodeParser
8
+ from llama_index.core.schema import Document
9
+
10
+ # Load environment variables
11
+ load_dotenv("config.env")
12
+
13
+ # Set up LlamaParse
14
+ parser = LlamaParse(
15
+ api_key=os.environ.get("LLAMAPARSE_API_KEY"),
16
+ result_type="markdown",
17
+ extract_charts=True,
18
+ auto_mode=True,
19
+ auto_mode_trigger_on_image_in_page=True,
20
+ auto_mode_trigger_on_table_in_page=True,
21
+ bbox_top=0.05,
22
+ bbox_bottom=0.1,
23
+ verbose=True,
24
+ )
25
+
26
+ # Create output directory if it doesn't exist
27
+ os.makedirs("data/processed/lp/indices", exist_ok=True)
28
+
29
+ async def parse_docs():
30
+ for filename in os.listdir("data/raw/GuidelinesSections"):
31
+ if filename.endswith(".pdf"):
32
+ filepath = f"data/raw/GuidelinesSections/{filename}"
33
+ print(f"Processing: {filepath}")
34
+
35
+ try:
36
+ documents = await parser.aload_data(filepath)
37
+ except Exception as e:
38
+ print(f"❌ Failed to parse {filename}: {e}")
39
+ continue
40
+
41
+ full_text = "\n\n".join(doc.text for doc in documents)
42
+ combined_doc = Document(text=full_text)
43
+
44
+ node_parser = SimpleNodeParser()
45
+ nodes = node_parser.get_nodes_from_documents([combined_doc])
46
+
47
+ index = VectorStoreIndex(nodes)
48
+
49
+ short_filename = (
50
+ filename.replace("Kenya-ARV-Guidelines-2022-", "")
51
+ .replace(".pdf", "")
52
+ )
53
+
54
+ index.storage_context.persist(persist_dir=f"data/processed/lp/indices/{short_filename}")
55
+ print(f"βœ… Saved index for {short_filename}")
56
+
57
+ if __name__ == "__main__":
58
+ asyncio.run(parse_docs())
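
Each persisted index can later be rehydrated with llama_index's storage utilities. A minimal sketch, assuming OPENAI_API_KEY is set; the persist_dir leaf name ("Section1") is hypothetical, since the actual names depend on the source PDF filenames:

from llama_index.core import StorageContext, load_index_from_storage

# Reload one per-section index persisted by parse_guidelines.py
storage_context = StorageContext.from_defaults(
    persist_dir="data/processed/lp/indices/Section1"  # hypothetical section name
)
index = load_index_from_storage(storage_context)

# Querying needs an LLM/embedding backend (OpenAI by default)
query_engine = index.as_query_engine()
print(query_engine.query("When should a viral load test be repeated?"))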
scripts/prep_summaries.py ADDED
@@ -0,0 +1,28 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from llama_index.embeddings.openai import OpenAIEmbedding
4
+ import os
5
+ from dotenv import load_dotenv
6
+ load_dotenv("config.env")
7
+ assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY is not set in config.env"
8
+
9
+ # load guideline section summaries
10
+ df = pd.read_csv("data/raw/guidelines_summaries.csv")
11
+
12
+ # Embed summaries
13
+ embedding_model = OpenAIEmbedding()
14
+ summary_embeddings = []
15
+
16
+ for summary in df["summary"]:
17
+ emb = embedding_model.get_text_embedding(summary)
18
+ summary_embeddings.append(emb)
19
+
20
+ summary_embeddings = np.vstack(summary_embeddings)
21
+
22
+ # Save embeddings and metadata
23
+ os.makedirs("data/processed/lp/summary_embeddings", exist_ok=True)
24
+
25
+ np.save("data/processed/lp/summary_embeddings/embeddings.npy", summary_embeddings)
26
+ df.to_csv("data/processed/lp/summary_embeddings/index.tsv", sep="\t", index=False)
27
+
28
+ print("βœ… Saved embeddings and index.")
scripts/process_idsr.py ADDED
@@ -0,0 +1,138 @@
1
+ import os
2
+ import re
3
+ import json
4
+ from dotenv import load_dotenv
5
+ from openai import OpenAI
6
+ from langchain_core.documents import Document
7
+ from langchain_openai import OpenAIEmbeddings
8
+ from langchain.vectorstores import FAISS
9
+ from rapidfuzz import fuzz
10
+
11
+ # === Setup ===
12
+ base_dir = os.path.dirname(__file__)
13
+ raw_path = os.path.abspath(os.path.join(base_dir, "..", "data", "raw"))
14
+ processed_path = os.path.abspath(os.path.join(base_dir, "..", "data", "processed"))
15
+ os.makedirs(processed_path, exist_ok=True)
16
+
17
+ load_dotenv(os.path.join(base_dir, "..", "config.env"))  # config.env sits at the repo root
18
+ api_key = os.environ.get("OPENAI_API_KEY")
19
+
20
+ # === Step 1: Read IDSR Text ===
21
+ with open(os.path.join(raw_path, "IDSR.txt"), encoding="utf-8") as f:
22
+ text = f.read()
23
+
24
+ # === Step 2: Extract Keywords via GPT ===
25
+ prompt = """
26
+ You are a helpful assistant. Extract a list of 30–50 key symptoms, signs, or diagnostic terms from the following disease descriptions.
27
+
28
+ Focus on words or phrases that are likely to appear in clinical case definitions or user queries β€” such as "fever", "skin lesions", "swollen lymph nodes", "positive blood smear", etc.
29
+
30
+ Only return the keywords or short phrases β€” one per line.
31
+
32
+ Text:
33
+ """
34
+
35
+ client = OpenAI()
36
+ response = client.chat.completions.create(
37
+ model="gpt-4o",
38
+ messages=[
39
+ {"role": "system", "content": "You are a helpful assistant."},
40
+ {"role": "user", "content": prompt + text}
41
+ ],
42
+ temperature=0.0
43
+ )
44
+
45
+ # Normalize keywords
46
+ keywords = [line.strip() for line in response.choices[0].message.content.splitlines() if line.strip()]
47
+ def normalize_kw(kw):
48
+ return kw.lstrip("-β€’ ").strip().lower()
49
+ keywords = [normalize_kw(kw) for kw in keywords]
50
+
51
+ # Save keywords
52
+ kw_path = os.path.join(processed_path, "idsr_keywords.txt")
53
+ with open(kw_path, "w", encoding="utf-8") as f:
54
+ for keyword in keywords:
55
+ f.write(f"{keyword}\n")
56
+
57
+ print(f"βœ… Saved keywords to {kw_path}")
58
+
59
+ # === Step 3: Parse Disease Sections ===
60
+ def parse_disease_text(text):
61
+ diseases = []
62
+ lines = text.strip().splitlines()
63
+
64
+ current_disease = None
65
+ current_subsection = None
66
+ buffer = []
67
+
68
+ def finalize_subsection():
69
+ if current_disease is not None and current_subsection and buffer:
70
+ content = " ".join(line.strip() for line in buffer).strip()
71
+ current_disease[current_subsection] = content
72
+
73
+ subsection_pattern = re.compile(r"^-\s*(.+):\s*$")
74
+
75
+ for line in lines + [""]:
76
+ if not line.strip():
77
+ finalize_subsection()
78
+ if current_disease:
79
+ diseases.append(current_disease)
80
+ current_disease = None
81
+ current_subsection = None
82
+ buffer = []
83
+ continue
84
+
85
+ if current_disease is None:
86
+ current_disease = {"disease_name": line.strip()}
87
+ continue
88
+
89
+ match = subsection_pattern.match(line)
90
+ if match:
91
+ finalize_subsection()
92
+ current_subsection = match.group(1).strip()
93
+ buffer = []
94
+ else:
95
+ buffer.append(line.rstrip())
96
+
97
+ return diseases
98
+
99
+ disease_dicts = parse_disease_text(text)
100
+
101
+ # === Step 4: Convert to LangChain Documents ===
102
+ def convert_disease_dicts_to_documents(disease_dicts):
103
+ docs = []
104
+ for disease in disease_dicts:
105
+ disease_name = disease.get("disease_name", "")
106
+ subsections = [f"{key}:\n{value}" for key, value in disease.items() if key != "disease_name"]
107
+ full_text = f"Disease: {disease_name}\n\n" + "\n\n".join(subsections)
108
+ docs.append(Document(page_content=full_text, metadata={"disease_name": disease_name}))
109
+ return docs
110
+
111
+ documents = convert_disease_dicts_to_documents(disease_dicts)
112
+
113
+ # === Step 5: Tag Documents with Keywords ===
114
+ def tag_documents_with_keywords(documents, keywords, threshold=85):
115
+ tagged = []
116
+ for doc in documents:
117
+ content = doc.page_content.lower()
118
+ matched = [kw for kw in keywords if fuzz.partial_ratio(kw.lower(), content) >= threshold]
119
+ doc.metadata["matched_keywords"] = matched
120
+ tagged.append(doc)
121
+ return tagged
122
+
123
+ tagged_documents = tag_documents_with_keywords(documents, keywords)
124
+
125
+ # Save JSON version
126
+ json_path = os.path.join(processed_path, "tagged_documents.json")
127
+ with open(json_path, "w", encoding="utf-8") as f:
128
+ json.dump([doc.dict() for doc in tagged_documents], f, ensure_ascii=False, indent=2)
129
+
130
+ print(f"βœ… Saved tagged documents to {json_path}")
131
+
132
+ # === Step 6: Build and Save FAISS Vectorstore ===
133
+ embedding_model = OpenAIEmbeddings()
134
+ vectorstore = FAISS.from_documents(tagged_documents, embedding_model)
135
+ vs_path = os.path.join(processed_path, "disease_vectorstore")
136
+ vectorstore.save_local(vs_path)
137
+
138
+ print(f"βœ… Saved FAISS vectorstore to {vs_path}")