Spaces:
Sleeping
Sleeping
card-integrate-changes-2025-06-12
#1
by JonCard - opened
- data_service.py +26 -6
- g34_final_project.py +7 -6
- llm_service.py +21 -10
- test/test_app.py +12 -10
- test/test_data_service.py +17 -2
data_service.py
CHANGED
|
@@ -16,11 +16,17 @@ class DataService(object):
|
|
| 16 |
|
| 17 |
def get_patients(self) -> list[str]:
|
| 18 |
raise Exception("Should not use the base class")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
class DefaultDataService(DataService):
|
| 21 |
|
| 22 |
def __init__(self):
|
| 23 |
-
self._dataset_url = "
|
| 24 |
self._initialized = False
|
| 25 |
|
| 26 |
def initialize(func):
|
|
@@ -37,8 +43,22 @@ class DefaultDataService(DataService):
|
|
| 37 |
pass
|
| 38 |
|
| 39 |
@initialize
|
| 40 |
-
def get_patients(self) ->
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
def get_patients(self) -> list[str]:
|
| 18 |
raise Exception("Should not use the base class")
|
| 19 |
+
|
| 20 |
+
def get_documents(self) -> list[str]:
|
| 21 |
+
raise Exception("Should not use the base class")
|
| 22 |
+
|
| 23 |
+
def get_document_metadatas(self) -> dict[str, any]:
|
| 24 |
+
raise Exception("Should not use the base class")
|
| 25 |
|
| 26 |
class DefaultDataService(DataService):
|
| 27 |
|
| 28 |
def __init__(self):
|
| 29 |
+
self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
|
| 30 |
self._initialized = False
|
| 31 |
|
| 32 |
def initialize(func):
|
|
|
|
| 43 |
pass
|
| 44 |
|
| 45 |
@initialize
|
| 46 |
+
def get_patients(self) -> pd.DataFrame:
|
| 47 |
+
# TODO: Now that I've changed the return type to DataFrame, I'm tempted to change the column names so that the rest of the application does not hard-code the names "PATIENT_ID" and "FIRST", and those are used solely within the data_service. But that could be over-thinking for this small application.
|
| 48 |
+
# I chose not to sort the patients alphabetically here because I thought that was a violation of the division of responsibilities between the data service and the UI. And I chose to change the return type to DataFrame because it's possible, based on my experience with Spark and Hadoop, that sorting would trigger a manifestation of the data that, until this point, could (theoretically) be being processed in a distributed way and not loaded locally, and sorting would trigger the processing of everything, and then sorting would be another O(n) operation.
|
| 49 |
+
# TODO: And the calculation of size() is silly, but I don't see noop action after group. Maybe apply()....
|
| 50 |
+
patients = self._df.groupby(["PATIENT_ID", "FIRST"]).size().reset_index()[["PATIENT_ID", "FIRST"]]
|
| 51 |
+
return patients
|
| 52 |
+
|
| 53 |
+
# TODO: We need to be careful with these two functions, get_documents and get_document_metadatas. They must be in the same order, because I don't think we're preserving the ID columns
|
| 54 |
+
@initialize
|
| 55 |
+
def get_documents(self) -> list[str]:
|
| 56 |
+
texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
|
| 57 |
+
return texts
|
| 58 |
+
|
| 59 |
+
# TODO: This kind-of triggered my "code smell" and then I saw iloc is deprecated. Surely there's a way to select out columns 0,3-11 (with the first as the id), and have get_documents also return a table that is columns [index, CLINICAL_NOTES], and the chroma.add function will be able to match the row IDs? I think that's what it says it expects to do.
|
| 60 |
+
# TODO: This is probably no longer the correct columns with the cleaned/processed dataset.
|
| 61 |
+
@initialize
|
| 62 |
+
def get_document_metadatas(self) -> dict[str, any]:
|
| 63 |
+
metadatas = [self._df.iloc[i,3:11].to_dict() for i in range(self._df.shape[0])]
|
| 64 |
+
return metadatas
|
g34_final_project.py
CHANGED
|
@@ -24,8 +24,9 @@ class App(object):
|
|
| 24 |
|
| 25 |
# Dividing the UI into columns is weird, since the principal organizing item is the row...
|
| 26 |
with gr.Row():
|
|
|
|
| 27 |
patient_dropdown = gr.Dropdown(
|
| 28 |
-
choices=
|
| 29 |
multiselect=False,
|
| 30 |
value=None,
|
| 31 |
label=_("Patient")
|
|
@@ -52,7 +53,7 @@ class App(object):
|
|
| 52 |
|
| 53 |
query_textbox.submit(
|
| 54 |
fn=ask_query,
|
| 55 |
-
inputs=[
|
| 56 |
outputs=(query_response)
|
| 57 |
)
|
| 58 |
|
|
@@ -68,7 +69,7 @@ if __name__ == '__main__':
|
|
| 68 |
#print("The application UI will launch, but AI functionality will be disabled.")
|
| 69 |
print("Please add the OpenAI API key as an application secret.")
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
| 24 |
|
| 25 |
# Dividing the UI into columns is weird, since the principal organizing item is the row...
|
| 26 |
with gr.Row():
|
| 27 |
+
patients = data_service.get_patients()
|
| 28 |
patient_dropdown = gr.Dropdown(
|
| 29 |
+
choices=[(x[1]["FIRST"], x[1]["PATIENT_ID"]) for x in patients.sort_values(by="FIRST").iterrows()],
|
| 30 |
multiselect=False,
|
| 31 |
value=None,
|
| 32 |
label=_("Patient")
|
|
|
|
| 53 |
|
| 54 |
query_textbox.submit(
|
| 55 |
fn=ask_query,
|
| 56 |
+
inputs=[patient_dropdown, query_textbox],
|
| 57 |
outputs=(query_response)
|
| 58 |
)
|
| 59 |
|
|
|
|
| 69 |
#print("The application UI will launch, but AI functionality will be disabled.")
|
| 70 |
print("Please add the OpenAI API key as an application secret.")
|
| 71 |
|
| 72 |
+
with DataService().build() as data_service:
|
| 73 |
+
with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
|
| 74 |
+
app = App(data_service, llm_service)
|
| 75 |
+
app.launch()
|
llm_service.py
CHANGED
|
@@ -4,18 +4,22 @@ from chromadb.config import Settings
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
import pandas as pd
|
| 6 |
from openai import OpenAI
|
|
|
|
| 7 |
|
| 8 |
class LLMService(object):
|
| 9 |
def __init__(self):
|
| 10 |
self._openAIKey = None
|
|
|
|
| 11 |
|
| 12 |
@contextmanager
|
| 13 |
def build(self):
|
| 14 |
llm_service = None
|
| 15 |
if self._openAIKey is None:
|
| 16 |
raise ValueError("OPEN AI key was not provided and there is no default value.")
|
|
|
|
|
|
|
| 17 |
try:
|
| 18 |
-
llm_service = DefaultLLMService(self._openAIKey)
|
| 19 |
yield llm_service
|
| 20 |
finally:
|
| 21 |
if llm_service is not None:
|
|
@@ -33,10 +37,15 @@ class LLMService(object):
|
|
| 33 |
def with_key(self, api_key: str):
|
| 34 |
self._openAIKey = api_key
|
| 35 |
return self
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
class DefaultLLMService(LLMService):
|
| 38 |
-
def __init__(self, api_key: str):
|
| 39 |
self._api_key = api_key
|
|
|
|
| 40 |
self._chatclient=OpenAI(api_key=self._api_key)
|
| 41 |
#TODO decide on embedding model, using one provided in notebook
|
| 42 |
self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
@@ -51,26 +60,28 @@ class DefaultLLMService(LLMService):
|
|
| 51 |
persist_directory="./chroma_db"
|
| 52 |
))
|
| 53 |
collection_name = "patient_data"
|
|
|
|
| 54 |
#if clear:
|
| 55 |
# client.delete_collection(collection_name)
|
| 56 |
return client.get_or_create_collection(name=collection_name)
|
| 57 |
|
|
|
|
| 58 |
def build_chromadb(self):
|
| 59 |
#TODO replace with cleaned data url or move to service inputs
|
| 60 |
-
self._df = pd.read_csv("hf://datasets/iscg34/finalproject/patient_information.csv")
|
| 61 |
-
|
| 62 |
collection = self.get_chromadb(clear=1)
|
| 63 |
-
texts = self.
|
|
|
|
|
|
|
| 64 |
embeddings = self._embed_model.encode(texts).tolist()
|
| 65 |
|
| 66 |
collection.add(
|
| 67 |
documents=texts,
|
| 68 |
embeddings=embeddings,
|
| 69 |
-
metadatas=
|
| 70 |
ids=[str(i) for i in range(len(texts))]
|
| 71 |
)
|
| 72 |
|
| 73 |
-
def query_chromadb(self, patient: str, query: str, result_template:str, top_n=3) -> str:
|
| 74 |
if (patient=="") or (patient is None):
|
| 75 |
return ""
|
| 76 |
if (query=="") or (query is None):
|
|
@@ -119,7 +130,7 @@ class DefaultLLMService(LLMService):
|
|
| 119 |
# Has this patient ...?
|
| 120 |
# Does this patient have a history of ...?
|
| 121 |
# TODO: Find in vector database the most related docs to both 1. patient & 2. query
|
| 122 |
-
rag=self.query_chromadb(patient,query
|
| 123 |
# TODO: Figure out how to utilize other columns.
|
| 124 |
prompt_template="""
|
| 125 |
You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
|
|
@@ -128,10 +139,10 @@ class DefaultLLMService(LLMService):
|
|
| 128 |
{RAG}
|
| 129 |
|
| 130 |
Questions:\n
|
| 131 |
-
{
|
| 132 |
|
| 133 |
Answer:"""
|
| 134 |
-
filled_prompt=prompt_template.format(RAG=rag,
|
| 135 |
# Get model output
|
| 136 |
response = self._chatclient.chat.completions.create(
|
| 137 |
model="gpt-3.5-turbo",
|
|
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
import pandas as pd
|
| 6 |
from openai import OpenAI
|
| 7 |
+
from data_service import DataService
|
| 8 |
|
| 9 |
class LLMService(object):
|
| 10 |
def __init__(self):
|
| 11 |
self._openAIKey = None
|
| 12 |
+
self._data_service = None
|
| 13 |
|
| 14 |
@contextmanager
|
| 15 |
def build(self):
|
| 16 |
llm_service = None
|
| 17 |
if self._openAIKey is None:
|
| 18 |
raise ValueError("OPEN AI key was not provided and there is no default value.")
|
| 19 |
+
if self._data_service is None:
|
| 20 |
+
raise ValueError("To get the patient documents, a data service must be provided before building the LLMService.")
|
| 21 |
try:
|
| 22 |
+
llm_service = DefaultLLMService(self._openAIKey, self._data_service)
|
| 23 |
yield llm_service
|
| 24 |
finally:
|
| 25 |
if llm_service is not None:
|
|
|
|
| 37 |
def with_key(self, api_key: str):
|
| 38 |
self._openAIKey = api_key
|
| 39 |
return self
|
| 40 |
+
|
| 41 |
+
def with_data_service(self, data_service: DataService):
|
| 42 |
+
self._data_service = data_service
|
| 43 |
+
return self
|
| 44 |
|
| 45 |
class DefaultLLMService(LLMService):
|
| 46 |
+
def __init__(self, api_key: str, data_service: DataService):
|
| 47 |
self._api_key = api_key
|
| 48 |
+
self._data_service = data_service
|
| 49 |
self._chatclient=OpenAI(api_key=self._api_key)
|
| 50 |
#TODO decide on embedding model, using one provided in notebook
|
| 51 |
self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
|
|
| 60 |
persist_directory="./chroma_db"
|
| 61 |
))
|
| 62 |
collection_name = "patient_data"
|
| 63 |
+
# TODO: If clear and collection exists
|
| 64 |
#if clear:
|
| 65 |
# client.delete_collection(collection_name)
|
| 66 |
return client.get_or_create_collection(name=collection_name)
|
| 67 |
|
| 68 |
+
# TODO: It probably makes no difference, but the reason I chose to use a decorator @initialize in data_service is that I learned somewhere that it was better to put as little logic in a constructor as possible because (at least in other languages), errors in constructors made everything complicated, and potentially slow initialization logic made things difficult to ... I don't remember. Debug or parallelize or something. Maybe all of them. The trade-off is, if you forgot to decorate the method with @initialize, bad things.
|
| 69 |
def build_chromadb(self):
|
| 70 |
#TODO replace with cleaned data url or move to service inputs
|
|
|
|
|
|
|
| 71 |
collection = self.get_chromadb(clear=1)
|
| 72 |
+
texts = self._data_service.get_documents()
|
| 73 |
+
metadatas = self._data_service.get_document_metadatas()
|
| 74 |
+
# TODO: I'm looking at the docs and I'm wondering 1. if this is the default embedding function anyway (I think it's good to bring it out and see we can adjust it; I'm just pointing this out), and 2. whether it would be equivalent to provide self._embed_model.encode as the embedding function of the collection at configuration time.
|
| 75 |
embeddings = self._embed_model.encode(texts).tolist()
|
| 76 |
|
| 77 |
collection.add(
|
| 78 |
documents=texts,
|
| 79 |
embeddings=embeddings,
|
| 80 |
+
metadatas=metadatas, #store everything except clinical notes and ids as metadata
|
| 81 |
ids=[str(i) for i in range(len(texts))]
|
| 82 |
)
|
| 83 |
|
| 84 |
+
def query_chromadb(self, patient: str, query: str, result_template:str = "", top_n=3) -> str:
|
| 85 |
if (patient=="") or (patient is None):
|
| 86 |
return ""
|
| 87 |
if (query=="") or (query is None):
|
|
|
|
| 130 |
# Has this patient ...?
|
| 131 |
# Does this patient have a history of ...?
|
| 132 |
# TODO: Find in vector database the most related docs to both 1. patient & 2. query
|
| 133 |
+
rag=self.query_chromadb(patient,query)
|
| 134 |
# TODO: Figure out how to utilize other columns.
|
| 135 |
prompt_template="""
|
| 136 |
You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
|
|
|
|
| 139 |
{RAG}
|
| 140 |
|
| 141 |
Questions:\n
|
| 142 |
+
{Query}
|
| 143 |
|
| 144 |
Answer:"""
|
| 145 |
+
filled_prompt=prompt_template.format(RAG=rag,Query=query)
|
| 146 |
# Get model output
|
| 147 |
response = self._chatclient.chat.completions.create(
|
| 148 |
model="gpt-3.5-turbo",
|
test/test_app.py
CHANGED
|
@@ -3,17 +3,19 @@ from unittest.mock import MagicMock, create_autospec
|
|
| 3 |
from data_service import DataService
|
| 4 |
from llm_service import LLMService
|
| 5 |
from g34_final_project import App
|
|
|
|
| 6 |
|
| 7 |
class TestApp(unittest.TestCase):
|
| 8 |
def test_app(self):
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
app.launch()
|
| 19 |
-
# TODO: This is probably not a good way to test. Maybe there isn't one for this. I just wanted to put a stub here. But it would be nice to be able to test things like "it accessed the data service get_patients once", etc.
|
|
|
|
| 3 |
from data_service import DataService
|
| 4 |
from llm_service import LLMService
|
| 5 |
from g34_final_project import App
|
| 6 |
+
import os
|
| 7 |
|
| 8 |
class TestApp(unittest.TestCase):
|
| 9 |
def test_app(self):
|
| 10 |
+
"""
|
| 11 |
+
A simple test method to allow launching of the application with mock objects
|
| 12 |
+
rather than the actual implementations. Used for testing the UI.
|
| 13 |
+
"""
|
| 14 |
+
mock_data_service = DataService().build
|
| 15 |
+
with mock_data_service() as data_service:
|
| 16 |
+
with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
|
| 17 |
|
| 18 |
+
# Replacing the call to get_summary with a mock since this isn't implemented yet.
|
| 19 |
+
llm_service.get_summary = MagicMock(return_value="This is a summary")
|
| 20 |
+
app = App(data_service, llm_service)
|
| 21 |
+
app.launch()
|
|
|
|
|
|
test/test_data_service.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import unittest
|
|
|
|
| 2 |
|
| 3 |
from data_service import DataService
|
| 4 |
|
|
@@ -8,5 +9,19 @@ class TestDataService(unittest.TestCase):
|
|
| 8 |
#data_service = DataService().to("").withCreds("").build()
|
| 9 |
with DataService().build() as data_service:
|
| 10 |
patients = data_service.get_patients()
|
| 11 |
-
self.assertTrue(isinstance(patients,
|
| 12 |
-
self.assertTrue(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
import pandas as pd
|
| 3 |
|
| 4 |
from data_service import DataService
|
| 5 |
|
|
|
|
| 9 |
#data_service = DataService().to("").withCreds("").build()
|
| 10 |
with DataService().build() as data_service:
|
| 11 |
patients = data_service.get_patients()
|
| 12 |
+
self.assertTrue(isinstance(patients, pd.DataFrame))
|
| 13 |
+
self.assertTrue("PATIENT_ID" in patients.columns.values)
|
| 14 |
+
self.assertTrue("FIRST" in patients.columns.values)
|
| 15 |
+
self.assertTrue(patients.size > 0)
|
| 16 |
+
|
| 17 |
+
def test_get_patient_documents(self):
|
| 18 |
+
with DataService().build() as data_service:
|
| 19 |
+
documents = data_service.get_documents()
|
| 20 |
+
self.assertTrue(isinstance(documents, list))
|
| 21 |
+
self.assertTrue(len(documents) > 0)
|
| 22 |
+
|
| 23 |
+
def test_get_patient_metadatas(self):
|
| 24 |
+
with DataService().build() as data_service:
|
| 25 |
+
metadatas = data_service.get_document_metadatas()
|
| 26 |
+
self.assertTrue(isinstance(metadatas, list))
|
| 27 |
+
self.assertTrue(len(metadatas) > 0)
|