Spaces:
Sleeping
Sleeping
Jonathan Card committed on
Commit ·
4c93471
1
Parent(s): 9641c0b
Integrate the changes made on the afternoon of 2025-06-12.
Browse files
I wanted to sort out the UI so that it would work with the changes made
this afternoon, including changes like the LLMService requiring the
patient ID rather than patient name for the query. This required minor
changes to the data service, but I also made some changes to refactor
the LLMService to encapsulate the structure of the dataset into the
data service. I also included some questions and observations in the
comments, and made some minor changes discussed in the group, such as
providing the syntax for default values for a method parameter.
- data_service.py +26 -6
- g34_final_project.py +7 -6
- llm_service.py +21 -10
- test/test_app.py +12 -10
- test/test_data_service.py +17 -2
data_service.py
CHANGED
|
@@ -16,11 +16,17 @@ class DataService(object):
|
|
| 16 |
|
| 17 |
def get_patients(self) -> list[str]:
|
| 18 |
raise Exception("Should not use the base class")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
class DefaultDataService(DataService):
|
| 21 |
|
| 22 |
def __init__(self):
|
| 23 |
-
self._dataset_url = "
|
| 24 |
self._initialized = False
|
| 25 |
|
| 26 |
def initialize(func):
|
|
@@ -37,8 +43,22 @@ class DefaultDataService(DataService):
|
|
| 37 |
pass
|
| 38 |
|
| 39 |
@initialize
|
| 40 |
-
def get_patients(self) ->
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
def get_patients(self) -> list[str]:
|
| 18 |
raise Exception("Should not use the base class")
|
| 19 |
+
|
| 20 |
+
def get_documents(self) -> list[str]:
|
| 21 |
+
raise Exception("Should not use the base class")
|
| 22 |
+
|
| 23 |
+
def get_document_metadatas(self) -> dict[str, any]:
|
| 24 |
+
raise Exception("Should not use the base class")
|
| 25 |
|
| 26 |
class DefaultDataService(DataService):
|
| 27 |
|
| 28 |
def __init__(self):
|
| 29 |
+
self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
|
| 30 |
self._initialized = False
|
| 31 |
|
| 32 |
def initialize(func):
|
|
|
|
| 43 |
pass
|
| 44 |
|
| 45 |
@initialize
|
| 46 |
+
def get_patients(self) -> pd.DataFrame:
|
| 47 |
+
# TODO: Now that I've changed the return type to DataFrame, I'm tempted to change the column names so that the rest of the application does not hard-code the names "PATIENT_ID" and "FIRST", and those are used solely within the data_service. But that could be over-thinking for this small application.
|
| 48 |
+
# I chose not to sort the patients alphabetically here because I thought that was a violation of the division of responsibilities between the data service and the UI. And I chose to change the return type to DataFrame because it's possible, based on my experience with Spark and Hadoop, that sorting would trigger a manifestation of the data that, until this point, could (theoretically) be being processed in a distributed way and not loaded locally, and sorting would trigger the processing of everything, and then sorting would be another O(n) operation.
|
| 49 |
+
# TODO: And the calculation of size() is silly, but I don't see noop action after group. Maybe apply()....
|
| 50 |
+
patients = self._df.groupby(["PATIENT_ID", "FIRST"]).size().reset_index()[["PATIENT_ID", "FIRST"]]
|
| 51 |
+
return patients
|
| 52 |
+
|
| 53 |
+
# TODO: We need to be careful with these two functions, get_documents and get_document_metadatas. They must be in the same order, because I don't think we're preserving the ID columns
|
| 54 |
+
@initialize
|
| 55 |
+
def get_documents(self) -> list[str]:
|
| 56 |
+
texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
|
| 57 |
+
return texts
|
| 58 |
+
|
| 59 |
+
# TODO: This kind-of triggered my "code smell" and then I saw iloc is deprecated. Surely there's a way to select out columns 0,3-11 (with the first as the id), and have get_documents also return a table that is columns [index, CLINICAL_NOTES], and the chroma.add function will be able to match the row IDs? I think that's what it says it expects to do.
|
| 60 |
+
# TODO: This is probably no longer the correct columns with the cleaned/processed dataset.
|
| 61 |
+
@initialize
|
| 62 |
+
def get_document_metadatas(self) -> dict[str, any]:
|
| 63 |
+
metadatas = [self._df.iloc[i,3:11].to_dict() for i in range(self._df.shape[0])]
|
| 64 |
+
return metadatas
|
g34_final_project.py
CHANGED
|
@@ -24,8 +24,9 @@ class App(object):
|
|
| 24 |
|
| 25 |
# Dividing the UI into columns is weird, since the principal organizing item is the row...
|
| 26 |
with gr.Row():
|
|
|
|
| 27 |
patient_dropdown = gr.Dropdown(
|
| 28 |
-
choices=
|
| 29 |
multiselect=False,
|
| 30 |
value=None,
|
| 31 |
label=_("Patient")
|
|
@@ -52,7 +53,7 @@ class App(object):
|
|
| 52 |
|
| 53 |
query_textbox.submit(
|
| 54 |
fn=ask_query,
|
| 55 |
-
inputs=[
|
| 56 |
outputs=(query_response)
|
| 57 |
)
|
| 58 |
|
|
@@ -68,7 +69,7 @@ if __name__ == '__main__':
|
|
| 68 |
#print("The application UI will launch, but AI functionality will be disabled.")
|
| 69 |
print("Please add the OpenAI API key as an application secret.")
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
| 24 |
|
| 25 |
# Dividing the UI into columns is weird, since the principal organizing item is the row...
|
| 26 |
with gr.Row():
|
| 27 |
+
patients = data_service.get_patients()
|
| 28 |
patient_dropdown = gr.Dropdown(
|
| 29 |
+
choices=[(x[1]["FIRST"], x[1]["PATIENT_ID"]) for x in patients.sort_values(by="FIRST").iterrows()],
|
| 30 |
multiselect=False,
|
| 31 |
value=None,
|
| 32 |
label=_("Patient")
|
|
|
|
| 53 |
|
| 54 |
query_textbox.submit(
|
| 55 |
fn=ask_query,
|
| 56 |
+
inputs=[patient_dropdown, query_textbox],
|
| 57 |
outputs=(query_response)
|
| 58 |
)
|
| 59 |
|
|
|
|
| 69 |
#print("The application UI will launch, but AI functionality will be disabled.")
|
| 70 |
print("Please add the OpenAI API key as an application secret.")
|
| 71 |
|
| 72 |
+
with DataService().build() as data_service:
|
| 73 |
+
with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
|
| 74 |
+
app = App(data_service, llm_service)
|
| 75 |
+
app.launch()
|
llm_service.py
CHANGED
|
@@ -4,18 +4,22 @@ from chromadb.config import Settings
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
import pandas as pd
|
| 6 |
from openai import OpenAI
|
|
|
|
| 7 |
|
| 8 |
class LLMService(object):
|
| 9 |
def __init__(self):
|
| 10 |
self._openAIKey = None
|
|
|
|
| 11 |
|
| 12 |
@contextmanager
|
| 13 |
def build(self):
|
| 14 |
llm_service = None
|
| 15 |
if self._openAIKey is None:
|
| 16 |
raise ValueError("OPEN AI key was not provided and there is no default value.")
|
|
|
|
|
|
|
| 17 |
try:
|
| 18 |
-
llm_service = DefaultLLMService(self._openAIKey)
|
| 19 |
yield llm_service
|
| 20 |
finally:
|
| 21 |
if llm_service is not None:
|
|
@@ -33,10 +37,15 @@ class LLMService(object):
|
|
| 33 |
def with_key(self, api_key: str):
|
| 34 |
self._openAIKey = api_key
|
| 35 |
return self
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
class DefaultLLMService(LLMService):
|
| 38 |
-
def __init__(self, api_key: str):
|
| 39 |
self._api_key = api_key
|
|
|
|
| 40 |
self._chatclient=OpenAI(api_key=self._api_key)
|
| 41 |
#TODO decide on embedding model, using one provided in notebook
|
| 42 |
self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
@@ -51,26 +60,28 @@ class DefaultLLMService(LLMService):
|
|
| 51 |
persist_directory="./chroma_db"
|
| 52 |
))
|
| 53 |
collection_name = "patient_data"
|
|
|
|
| 54 |
#if clear:
|
| 55 |
# client.delete_collection(collection_name)
|
| 56 |
return client.get_or_create_collection(name=collection_name)
|
| 57 |
|
|
|
|
| 58 |
def build_chromadb(self):
|
| 59 |
#TODO replace with cleaned data url or move to service inputs
|
| 60 |
-
self._df = pd.read_csv("hf://datasets/iscg34/finalproject/patient_information.csv")
|
| 61 |
-
|
| 62 |
collection = self.get_chromadb(clear=1)
|
| 63 |
-
texts = self.
|
|
|
|
|
|
|
| 64 |
embeddings = self._embed_model.encode(texts).tolist()
|
| 65 |
|
| 66 |
collection.add(
|
| 67 |
documents=texts,
|
| 68 |
embeddings=embeddings,
|
| 69 |
-
metadatas=
|
| 70 |
ids=[str(i) for i in range(len(texts))]
|
| 71 |
)
|
| 72 |
|
| 73 |
-
def query_chromadb(self, patient: str, query: str, result_template:str, top_n=3) -> str:
|
| 74 |
if (patient=="") or (patient is None):
|
| 75 |
return ""
|
| 76 |
if (query=="") or (query is None):
|
|
@@ -119,7 +130,7 @@ class DefaultLLMService(LLMService):
|
|
| 119 |
# Has this patient ...?
|
| 120 |
# Does this patient have a history of ...?
|
| 121 |
# TODO: Find in vector database the most related docs to both 1. patient & 2. query
|
| 122 |
-
rag=self.query_chromadb(patient,query
|
| 123 |
# TODO: Figure out how to utilize other columns.
|
| 124 |
prompt_template="""
|
| 125 |
You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
|
|
@@ -128,10 +139,10 @@ class DefaultLLMService(LLMService):
|
|
| 128 |
{RAG}
|
| 129 |
|
| 130 |
Questions:\n
|
| 131 |
-
{
|
| 132 |
|
| 133 |
Answer:"""
|
| 134 |
-
filled_prompt=prompt_template.format(RAG=rag,
|
| 135 |
# Get model output
|
| 136 |
response = self._chatclient.chat.completions.create(
|
| 137 |
model="gpt-3.5-turbo",
|
|
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
import pandas as pd
|
| 6 |
from openai import OpenAI
|
| 7 |
+
from data_service import DataService
|
| 8 |
|
| 9 |
class LLMService(object):
|
| 10 |
def __init__(self):
|
| 11 |
self._openAIKey = None
|
| 12 |
+
self._data_service = None
|
| 13 |
|
| 14 |
@contextmanager
|
| 15 |
def build(self):
|
| 16 |
llm_service = None
|
| 17 |
if self._openAIKey is None:
|
| 18 |
raise ValueError("OPEN AI key was not provided and there is no default value.")
|
| 19 |
+
if self._data_service is None:
|
| 20 |
+
raise ValueError("To get the patient documents, a data service must be provided before building the LLMService.")
|
| 21 |
try:
|
| 22 |
+
llm_service = DefaultLLMService(self._openAIKey, self._data_service)
|
| 23 |
yield llm_service
|
| 24 |
finally:
|
| 25 |
if llm_service is not None:
|
|
|
|
| 37 |
def with_key(self, api_key: str):
|
| 38 |
self._openAIKey = api_key
|
| 39 |
return self
|
| 40 |
+
|
| 41 |
+
def with_data_service(self, data_service: DataService):
|
| 42 |
+
self._data_service = data_service
|
| 43 |
+
return self
|
| 44 |
|
| 45 |
class DefaultLLMService(LLMService):
|
| 46 |
+
def __init__(self, api_key: str, data_service: DataService):
|
| 47 |
self._api_key = api_key
|
| 48 |
+
self._data_service = data_service
|
| 49 |
self._chatclient=OpenAI(api_key=self._api_key)
|
| 50 |
#TODO decide on embedding model, using one provided in notebook
|
| 51 |
self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
|
|
| 60 |
persist_directory="./chroma_db"
|
| 61 |
))
|
| 62 |
collection_name = "patient_data"
|
| 63 |
+
# TODO: If clear and collection exists
|
| 64 |
#if clear:
|
| 65 |
# client.delete_collection(collection_name)
|
| 66 |
return client.get_or_create_collection(name=collection_name)
|
| 67 |
|
| 68 |
+
# TODO: It probably makes no difference, but the reason I chose to use a decorator @initialize in data_service is that I learned somewhere that it was better to put as little logic in a constructor as possible because (at least in other languages), errors in constructors made everything complicated, and potentially slow initialization logic made things difficult to ... I don't remember. Debug or parallelize or something. Maybe all of them. The trade-off is, if you forgot to decorate the method with @initialize, bad things.
|
| 69 |
def build_chromadb(self):
|
| 70 |
#TODO replace with cleaned data url or move to service inputs
|
|
|
|
|
|
|
| 71 |
collection = self.get_chromadb(clear=1)
|
| 72 |
+
texts = self._data_service.get_documents()
|
| 73 |
+
metadatas = self._data_service.get_document_metadatas()
|
| 74 |
+
# TODO: I'm looking at the docs and I'm wondering 1. if this is the default embedding function anyway (I think it's good to bring it out and see we can adjust it; I'm just pointing this out), and 2. whether it would be equivalent to provide self._embed_model.encode as the embedding function of the collection at configuration time.
|
| 75 |
embeddings = self._embed_model.encode(texts).tolist()
|
| 76 |
|
| 77 |
collection.add(
|
| 78 |
documents=texts,
|
| 79 |
embeddings=embeddings,
|
| 80 |
+
metadatas=metadatas, #store everything except clinical notes and ids as metadata
|
| 81 |
ids=[str(i) for i in range(len(texts))]
|
| 82 |
)
|
| 83 |
|
| 84 |
+
def query_chromadb(self, patient: str, query: str, result_template:str = "", top_n=3) -> str:
|
| 85 |
if (patient=="") or (patient is None):
|
| 86 |
return ""
|
| 87 |
if (query=="") or (query is None):
|
|
|
|
| 130 |
# Has this patient ...?
|
| 131 |
# Does this patient have a history of ...?
|
| 132 |
# TODO: Find in vector database the most related docs to both 1. patient & 2. query
|
| 133 |
+
rag=self.query_chromadb(patient,query)
|
| 134 |
# TODO: Figure out how to utilize other columns.
|
| 135 |
prompt_template="""
|
| 136 |
You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
|
|
|
|
| 139 |
{RAG}
|
| 140 |
|
| 141 |
Questions:\n
|
| 142 |
+
{Query}
|
| 143 |
|
| 144 |
Answer:"""
|
| 145 |
+
filled_prompt=prompt_template.format(RAG=rag,Query=query)
|
| 146 |
# Get model output
|
| 147 |
response = self._chatclient.chat.completions.create(
|
| 148 |
model="gpt-3.5-turbo",
|
test/test_app.py
CHANGED
|
@@ -3,17 +3,19 @@ from unittest.mock import MagicMock, create_autospec
|
|
| 3 |
from data_service import DataService
|
| 4 |
from llm_service import LLMService
|
| 5 |
from g34_final_project import App
|
|
|
|
| 6 |
|
| 7 |
class TestApp(unittest.TestCase):
|
| 8 |
def test_app(self):
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
app.launch()
|
| 19 |
-
# TODO: This is probably not a good way to test. Maybe there isn't one for this. I just wanted to put a stub here. But it would be nice to be able to test things like "it accessed the data service get_patients once", etc.
|
|
|
|
| 3 |
from data_service import DataService
|
| 4 |
from llm_service import LLMService
|
| 5 |
from g34_final_project import App
|
| 6 |
+
import os
|
| 7 |
|
| 8 |
class TestApp(unittest.TestCase):
|
| 9 |
def test_app(self):
|
| 10 |
+
"""
|
| 11 |
+
A simple test method to allow launching of the application with mock objects
|
| 12 |
+
rather than the actual implementations. Used for testing the UI.
|
| 13 |
+
"""
|
| 14 |
+
mock_data_service = DataService().build
|
| 15 |
+
with mock_data_service() as data_service:
|
| 16 |
+
with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
|
| 17 |
|
| 18 |
+
# Replacing the call to get_summary with a mock since this isn't implemented yet.
|
| 19 |
+
llm_service.get_summary = MagicMock(return_value="This is a summary")
|
| 20 |
+
app = App(data_service, llm_service)
|
| 21 |
+
app.launch()
|
|
|
|
|
|
test/test_data_service.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import unittest
|
|
|
|
| 2 |
|
| 3 |
from data_service import DataService
|
| 4 |
|
|
@@ -8,5 +9,19 @@ class TestDataService(unittest.TestCase):
|
|
| 8 |
#data_service = DataService().to("").withCreds("").build()
|
| 9 |
with DataService().build() as data_service:
|
| 10 |
patients = data_service.get_patients()
|
| 11 |
-
self.assertTrue(isinstance(patients,
|
| 12 |
-
self.assertTrue(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
import pandas as pd
|
| 3 |
|
| 4 |
from data_service import DataService
|
| 5 |
|
|
|
|
| 9 |
#data_service = DataService().to("").withCreds("").build()
|
| 10 |
with DataService().build() as data_service:
|
| 11 |
patients = data_service.get_patients()
|
| 12 |
+
self.assertTrue(isinstance(patients, pd.DataFrame))
|
| 13 |
+
self.assertTrue("PATIENT_ID" in patients.columns.values)
|
| 14 |
+
self.assertTrue("FIRST" in patients.columns.values)
|
| 15 |
+
self.assertTrue(patients.size > 0)
|
| 16 |
+
|
| 17 |
+
def test_get_patient_documents(self):
|
| 18 |
+
with DataService().build() as data_service:
|
| 19 |
+
documents = data_service.get_documents()
|
| 20 |
+
self.assertTrue(isinstance(documents, list))
|
| 21 |
+
self.assertTrue(len(documents) > 0)
|
| 22 |
+
|
| 23 |
+
def test_get_patient_metadatas(self):
|
| 24 |
+
with DataService().build() as data_service:
|
| 25 |
+
metadatas = data_service.get_document_metadatas()
|
| 26 |
+
self.assertTrue(isinstance(metadatas, list))
|
| 27 |
+
self.assertTrue(len(metadatas) > 0)
|