Benedette committed on
Commit
aae626b
·
verified ·
1 Parent(s): be99d26

creating initial app.py

Browse files
Files changed (1) hide show
  1. app.py +232 -0
app.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Connection settings for the SQL Server reporting database.
#
# The password is read from the environment rather than written in source.
# NOTE(review): the literal password that used to live here was committed to
# the repository history and should be rotated.
# The non-secret settings keep their previous values as defaults, so only
# DB_PASSWORD has to be provided for the app to behave as before.
DB_USER = os.environ.get("DB_USER", "Benedette")
DB_PASSWORD = os.environ["DB_PASSWORD"]  # KeyError here means the secret is not configured
DB_HOST = os.environ.get("DB_HOST", "10.230.50.66")
DB_PORT = os.environ.get("DB_PORT", "1433")
DB = os.environ.get("DB_NAME", "Reporting")

# Construct the connection string. User name and password are URL-escaped so
# that characters such as "@", ":", "/" or "{" in a password cannot corrupt
# the URL; SQLAlchemy decodes them again when parsing.
SQL_DATABASE_URL = (
    f"mssql+pymssql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB}"
)

# Create an engine instance. echo=True makes SQLAlchemy log every statement.
engine = create_engine(SQL_DATABASE_URL, connect_args={}, echo=True)

# Session factory bound to the engine, plus one module-level session.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
db = SessionLocal()
22
+
from llama_index.core import SQLDatabase

# Tables the LlamaIndex SQLDatabase wrapper is allowed to expose.
tables = [
    "Linelist_FACTART",
    "LineListTransHTS",
    "LinelistPrep",
    "LinelistHEI",
    "AggregateDSD",
    "AggregateOTZEligibilityAndEnrollments",
    "AggregateARTHistory",
]

# Wrapper around the SQLAlchemy engine, restricted to the tables above.
sql_database = SQLDatabase(engine, include_tables=tables)
29
+
import os

from llama_index.llms.openai import OpenAI

# Fail fast at start-up, with an actionable message, when the OpenAI key is
# missing or empty. KeyError is kept as the exception type, matching what the
# previous bare ``os.environ[...]`` lookup raised for a missing key.
if not os.environ.get("OPENAI_API_KEY"):
    raise KeyError("OPENAI_API_KEY environment variable must be set")

# Chat model used for text-to-SQL.
llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
34
+
# Context description attached to the "Linelist_FACTART" table schema.
# The statements are joined with newlines: plain adjacent string literals
# concatenate with no separator, which fused neighbouring sentences together
# (e.g. "parameters.Use", "resultsKey attributes").
fact_linelist_str = "\n".join(
    [
        "A client-level linelist that contains comprehensive data on all clients who have ever received treatment for HIV/AIDS, "
        "encompassing various indicators and clinical parameters.",
        "Use this table to answer quetions related to active patients currently on treatment/txcurr, viral load results",
        "Key attributes captured in this linelist include:",
        "if input in the NUPI column is NULL then the client has no NUPI",
        "Active patients is where ARTOutcomeDescription is Active.",
        "Clinical indicators like Last CD4 count, Last VL (Viral Load), and WHO Stage, aiding in the assessment of disease progression and treatment response.",
        "Demographic information, including Age at ART Start, Gender, Marital Status, and County/Sub-County, facilitating analysis of patient demographics.",
        "Medical history and co-morbidities, such as Diabetes and Hypertension status, providing context on underlying health conditions and associated risk factors.",
        "Facility-related data, such as Facility Name, Site Code,County, and Partner Name, enabling assessment of service delivery across different healthcare facilities and implementing partners.",
        "Pregnancy-related indicators, including Pregnant ART Start and Pregnant at Enrollment, supporting maternal and child health monitoring and intervention.",
        "clients are uniquley identified by concatenating PatientPKHash and Sitecode",
        "LastVL is the most current VL for the client",
        "LowViremia/suppressed is when a client viral load is less that 200 copies per ml,1= True 0 = False ",
        "HighViremia/unsuppressed is when a client viral load is more that 200 copies per ml,1= True 0 = False ",
        "HasValidVL is boolean value for if client has a valid VL",
        "ISTxCurs indicates whether the patients are active on teatment where 1= True",
        "Treatment Outcomes(ARTOutcomeDescription) is as at one point",
        "This table can be used to answer queries such as:",
        "What is the distribution of treatment outcomes among HIV/AIDS patients, such as Active, Transfer Out, and Loss to Follow-Up by county, partner,age?",
        "What proportion of patients have achieved viral suppression, as indicated by their Last VL results by coounty?",
        "What percentage of HIV/AIDS patients have co-morbid conditions such as diabetes or hypertension?",
    ]
)
60
+
# Context description attached to the "LineListTransHTS" table schema.
# The statements are joined with newlines: plain adjacent string literals
# concatenate with no separator, which fused sentences and column bullets
# together (e.g. "testing.This dataset", "testing.- Age Group").
hts_linelist_str = "\n".join(
    [
        "A client-level linelist containing comprehensive HIV testing data for all adult clients (> 18 years) who have undergone HIV testing.",
        "This dataset captures a wide range of information including demographic details, testing outcomes, testing history, and programmatic indicators.",
        "It serves as a valuable table for analyzing HIV testing patterns, testing outcomes, and testing strategies among adult populations.",
        "Please note that this dataset is not suitable for inquiries related to patients on treatment.",
        "use this table to answer any questions related to HIV testing",
        "Additional details available include:",
        "- Age at Testing (AgeAtTesting): Age of the client at the time of HIV testing.",
        "- Age Group (AgeGroup): Categorization of clients into 4-year age bands from 1 to 64 years.",
        "- Agency Name (AgencyName): Name of the funding body or organization supporting the testing program.",
        "- Client Self-Tested (ClientSelfTested): Indicates whether a client has ever performed self-testing for HIV.",
        "- Client Tested As (ClientTestedAs): Categorizes clients based on whether they were tested individually or as part of a couple.",
        "- County (County) and Sub-County (SubCounty): Geographic location of the testing facility.",
        "- Couple Discordant (CoupleDiscordant): Indicates whether a couple tested together was concordant or discordant for HIV.",
        "- Date of Birth (DOB): Date of birth of the client.",
        "- Enrollment Date (EnrollmentDate): Date when the client was enrolled into the CCC.",
        "- Entry Point (EntryPoint): Service point where the HIV test was conducted (e.g., VCT, OPD).",
        "- Ever Tested for HIV (EverTestedForHiv): Indicates whether the client has ever been tested for HIV before.",
        "- Facility Name (FacilityName) and MFL Code (MFLCode): Name and code of the testing facility.",
        "- Final Test Result (FinalTestResult): Result of the HIV test for the encounter.",
        "- Gender (Gender) and Marital Status (MaritalStatus): Demographic characteristics of the client.",
        "- Linked (Linked): Boolean value indicating whether the client was successfully linked to follow-up services.",
        "- Months Since Last Test (MonthsSinceLastTest): Number of months since the client's last HIV test.",
        "- Test Date (TestDate): Date when the client was tested for HIV.",
        "- Test Strategy (TestStrategy): Strategy employed for HIV testing (e.g., Hospital Patient, Non-Patient).",
        "- Test Type (TestType): Type of HIV test conducted during the encounter.",
        "- Tested (Tested): Boolean value indicating whether the client was tested for HIV.",
        "- Tested Before (TestedBefore): Indicates if the client has been tested for HIV within the last 12 months.",
        "- TB Screening (tbScreening): Outcome of TB screening conducted during the encounter.",
        "Positivity rate is number of positive tests from all the test conducted in a certain period",
    ]
)
92
+
# Context description for the PrEP line list table.
# The statements are joined with newlines: plain adjacent string literals
# concatenate with no separator, which fused sentences and column bullets
# together (e.g. "programs.Additional", "includes:- As of Date").
prep_str = "\n".join(
    [
        "A client-level line list containing comprehensive information on all clients enrolled in Pre-Exposure Prophylaxis (PrEP) programs.",
        "Additional information available includes:",
        "- As of Date (AsofDate): End of the reporting month for the data.",
        "- Assessment Month (AssessmentMonth) and Assessment Year (AssessmentYear): Month and year when the client was assessed for PrEP enrollment.",
        "- Eligible for PrEP (EligiblePrep): Boolean value indicating whether the client is eligible for enrollment in PrEP based on risk category.",
        "- Latest HIV Risk Category (LatestHIVRiskCategory): Last recorded risk category from the HIV testing machine learning model.",
        "- Screened for PrEP (ScreenedPrep): Boolean value indicating whether the client was assessed for enrollment into PrEP.",
        "- PatientPKHash: Hashed value representing the unique client ID in the specific facility.",
        "Use this table to answer any prep related question,i.e from high risk clients how many were enrolled in Prep",
    ]
)
104
+
# Context description attached to the "LinelistHEI" table schema.
# The statements are joined with newlines: plain adjacent string literals
# concatenate with no separator, which fused sentences and column entries
# together (e.g. "infantsthis table", "program" + "Additional").
hei_str = "\n".join(
    [
        "A client level linelist that contains various indicators of HIV-exposed infants",
        "this table should be used for any HEI related questions,Iincluding whether HEI is breastfeeding,tested at different timepoints, "
        "outcome of Hei after they exit the HEI program",
        "Additional information available includes:",
        "-BF12mnths Indicates whether the HEI is breastfeeding at 12 months of age as of last cwc visit",
        "-BF18mnths Indicates whether the HEI is breastfeeding at 18 months of age as of last cwc visit",
        "-EBF6mnths Indicates whether the HEI is using Exclusive Replacement(ERF) feeding method at 6 months of age as of last cwc visit",
        "-HEIExitCriteria the Exit reason for an exposed infant after 24months",
        # A space was missing between the column name and "Indicates", which
        # made the column read as "InitialPCRBtwn8wks_12mnthsIndicates".
        # NOTE(review): column name inferred from the sibling entries' pattern - confirm against the table.
        "-InitialPCRBtwn8wks_12mnths Indicates whether the HEI's DNAPCR1 was done at age of between 8 weeks and 48 weeks",
        "-TestedAt12months-Indicates whether the HEI's DNAPCR2 was done at age of 12 months of age",
    ]
)
# Context description attached to the "AggregateOTZEligibilityAndEnrollments"
# table schema.
# The column entries are joined with newlines: plain adjacent string literals
# concatenate with no separator, which fused each description into the next
# column name (e.g. "64 yearsCompletedToday_OTZ_Beyond").
otzenroll_str = "\n".join(
    [
        # A space after the comma keeps "OTZ," and "completed" apart.
        "An aggregate table that contains counts of TXCurr/number of active individuals between 10 and 19 years who are eligible for OTZ program, /enrolled in OTZ, "
        "completed training modules and eligible for VL",
        "Use this table for addressing any inquiries regarding OTZ and corresponding viral loads",
        "AgeGroup: A 4-year age band from 1 to 64 years",
        "CompletedToday_OTZ_Beyond: Has the client completed OTZ_Beyond today",
        "CompletedToday_OTZ_Leadership: Has the client completed OTZ_Leadership today",
        "CompletedToday_OTZ_MakingDecisions: Has the client completed OTZ_MakingDecisions today",
        "CompletedToday_OTZ_Orientation: Has the client completed OTZ_Orientation today",
        "CompletedToday_OTZ_Participation: Has the client completed OTZ_Participation today",
        "CompletedToday_OTZ_SRH: Has the client completed OTZ_SRH today",
        "CompletedToday_OTZ_Transition: Has the client completed OTZ_Transition today",
        "CompletedToday_OTZ_TreatmentLiteracy: Has the client completed OTZ_TreatmentLiteracy today",
        "CompletedTraining: Number of clients who have completed OTZ modules training",
        "County: The County where the facility is located",
        "EligibleVL: Is the client eligible for a viral load",
        "Enrolled: Number of clients enrolled into OTZ program",
        "FacilityName: The facility name as entered in KHMFL",
        "FirstVL: The first ever documented viral load",
        "Gender: Sex of the patient",
        "HasValidVL: Does the client have a valid viral load",
        "LastVL: This is the most current Viral load for the client -",
        "LoadDate: Date when the dataset was ETL loaded",
        "MFLCode: Master facility code as assigned in the KHMFL",
        "ModulesPreviouslyCovered: Modules that the client has covered before this visit",
        "OTZEnrollmentYearMonth: The year and the month the client was enrolled in OTZ program",
        "PartnerName: The implementing partner mechanism",
        "SubCounty: The Sub County where the facility is located",
        "TransferInStatus: Did the client transfer in",
        "ValidVLResult: The VL result that is within 12 months from the reporting period taking into account age group validity",
        "ValidVLResultCategory: The viral load results categorizations as LDL, High-risk LLV, Low-risk LLV, and unsuppressed",
        "patients_eligible: Number of clients eligible for enrollment into OTZ program",
    ]
)
146
+
# Context description attached to the "AggregateARTHistory" table schema.
# The statements are joined with newlines: plain adjacent string literals
# concatenate with no separator, which fused sentences and column entries
# together (e.g. "indicators.Query", "overtimeNumber").
aggtxcurr_str = "\n".join(
    [
        "An aggregate dataset containing counts of active number of patients/TxCurr for each facility at each month, disaggregated by various indicators.",
        "Query this table To identify increase/decrease the total number of active patients at overtime",
        "Number of acive patients or treatment is calculated at end of the month ",
        "AsofDateKey:the End of month reporting date (format = yyyy-mm-dd), use this date to extract number of active client as at that month ",
        "DATIMAgeGroup: The DATIM Age disaggregations",
        "isTxCurr: The total number of active patients/TXcurr",
    ]
)
156
+
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import (
    ObjectIndex,
    SQLTableNodeMapping,
    SQLTableSchema,
)

# Maps table schema objects onto index nodes for the wrapped database.
table_node_mapping = SQLTableNodeMapping(sql_database)

# One schema object per table, pairing the table name with its description.
# The PrEP table is spelled "LinelistPrep", exactly as in the `tables` list
# given to SQLDatabase(include_tables=...); it was previously written
# "LineListPrep" here, so the two places disagreed on the name.
# NOTE(review): "AggregateDSD" is in `tables` but has no description here.
table_schema_objs = [
    SQLTableSchema(table_name="Linelist_FACTART", context_str=fact_linelist_str),
    SQLTableSchema(table_name="LineListTransHTS", context_str=hts_linelist_str),
    SQLTableSchema(table_name="LinelistPrep", context_str=prep_str),
    SQLTableSchema(table_name="LinelistHEI", context_str=hei_str),
    SQLTableSchema(
        table_name="AggregateOTZEligibilityAndEnrollments",
        context_str=otzenroll_str,
    ),
    SQLTableSchema(table_name="AggregateARTHistory", context_str=aggtxcurr_str),
]

# Index the table schemas in a vector store so the tables relevant to a
# question can be retrieved at query time.
obj_index = ObjectIndex.from_objects(
    table_schema_objs,  # the table schema objects to index
    table_node_mapping,  # maps tables to nodes
    VectorStoreIndex,  # index class used for vector-based retrieval
)
183
+
from llama_index.core.indices.struct_store import SQLTableRetrieverQueryEngine

# Text-to-SQL engine: retrieves the 2 most similar table schemas for a
# question and queries the database through them.
# The `llm` configured earlier in this file (gpt-3.5-turbo, temperature=0) is
# passed explicitly; it was previously created but never handed to the
# engine, so the engine ran on the library's default LLM settings instead.
query_engine = SQLTableRetrieverQueryEngine(
    sql_database,
    obj_index.as_retriever(similarity_top_k=2),
    llm=llm,
)
190
+
191
+
# Fixed instructions placed in front of every question sent to the query
# engine. The fragments are joined with single spaces: plain adjacent string
# literals concatenate with no separator, which produced fused words such as
# "correctquery", "mostinteresting" and "schemadescription". The full stop
# after "return the answer" restores the sentence boundary that was lost there.
preamble = " ".join(
    [
        "Given an input question, first create a syntactically correct",
        "query to run, then look at the results of the query and return the answer.",
        "You can order the results by a relevant column to return the most",
        "interesting examples in the database.",
        "Pay attention to use only the column names that you can see in the schema",
        "description. Be careful to not query for columns that do not exist.",
        "Pay attention to which column is in which table. Also, qualify column names",
        "with the table name when needed.",
    ]
)

# Separator placed between the preamble/history and the user's question.
prompt_intro = " Here is the prompt: "
202
+
203
+
import gradio as gr


def texttosql(question: str, conversation_history: list[dict[str, str]]):
    """Answer *question* through the text-to-SQL query engine.

    Builds a single prompt from the fixed preamble, the earlier turns of the
    conversation (if any) and the new question, sends it to ``query_engine``
    and records the exchange in ``conversation_history``.

    Args:
        question: The user's natural-language question.
        conversation_history: Earlier turns, each a dict with the keys
            ``"user"`` and ``"chatbot"``. The new turn is appended in place.
            ``None`` is treated as an empty history.

    Returns:
        A 4-tuple ``(answer, sql_query, result, conversation_history)``:
        the response text, the ``"sql_query"`` and ``"result"`` entries of the
        response metadata (empty string when absent), and the updated history.
    """
    if conversation_history is None:
        conversation_history = []

    prompt_parts = [preamble]
    # Mention earlier turns only when there are some; the clause used to be
    # sent with an empty context on the first question.
    if conversation_history:
        context = " ".join(
            f'{turn["user"]} {turn["chatbot"]}' for turn in conversation_history
        )
        prompt_parts.append(
            "the user previously asked and received the following: " + context
        )
    # Joining with a space keeps the preamble's final sentence from running
    # straight into the history clause. prompt_intro carries its own spaces.
    # NOTE(review): the whole history is replayed on every call, so the prompt
    # grows with each turn.
    prompt = " ".join(prompt_parts) + prompt_intro + question

    response = query_engine.query(prompt)

    # Store text, never None, so a later turn can always be rendered.
    answer = response.response or ""
    # Metadata may be missing or lack these keys; fall back to empty strings
    # rather than failing the request.
    metadata = response.metadata or {}
    sql_query = metadata.get("sql_query", "")
    result = metadata.get("result", "")

    conversation_history.append({"user": question, "chatbot": answer})

    return answer, sql_query, result, conversation_history
220
+
# Inputs: the question box plus per-session conversation history.
inputs = [
    gr.Textbox(lines=10, label="Question"),
    gr.State(value=[]),
]

# Outputs, in the order texttosql returns them: answer text, generated SQL,
# query result, and the updated conversation history.
outputs = [
    gr.Textbox(label="Chatbot Response", type="text"),
    gr.Textbox(label="sql_query", autoscroll=False, type="text"),
    gr.Textbox(label="Metadata_result", autoscroll=False, type="text"),
    gr.State(),
]

# Build the Gradio app around texttosql and start serving it.
demo = gr.Interface(
    fn=texttosql,
    inputs=inputs,
    outputs=outputs,
    title="txttosql Chatbot",
    description="Enter a question and see the processed outputs in collapsible boxes.",
)
demo.launch()