Jonathan Card committed on
Commit
4c93471
·
1 Parent(s): 9641c0b

Integrate the changes made on the afternoon of 2025-06-12.

Browse files

I wanted to sort out the UI so that it would work with the changes made
this afternoon, including changes like the LLMService requiring the
patient ID rather than the patient name for the query. This required minor
changes to the data service, and I also refactored the LLMService so that
the structure of the dataset is encapsulated in the data service. I also
included some questions and observations in the
comments, and made some minor changes discussed in the group, such as
providing the syntax for default values for a method parameter.

data_service.py CHANGED
@@ -16,11 +16,17 @@ class DataService(object):
16
 
17
  def get_patients(self) -> list[str]:
18
  raise Exception("Should not use the base class")
 
 
 
 
 
 
19
 
20
  class DefaultDataService(DataService):
21
 
22
  def __init__(self):
23
- self._dataset_url = "https://huggingface.co/datasets/iscg34/finalproject/raw/main/patient_encounters1_notes.csv"
24
  self._initialized = False
25
 
26
  def initialize(func):
@@ -37,8 +43,22 @@ class DefaultDataService(DataService):
37
  pass
38
 
39
  @initialize
40
- def get_patients(self) -> list[str]:
41
- col = self._df["FIRST"]
42
- u = col.unique()
43
- l = u.tolist()
44
- return l
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def get_patients(self) -> list[str]:
18
  raise Exception("Should not use the base class")
19
+
20
+ def get_documents(self) -> list[str]:
21
+ raise Exception("Should not use the base class")
22
+
23
+ def get_document_metadatas(self) -> dict[str, any]:
24
+ raise Exception("Should not use the base class")
25
 
26
  class DefaultDataService(DataService):
27
 
28
  def __init__(self):
29
+ self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
30
  self._initialized = False
31
 
32
  def initialize(func):
 
43
  pass
44
 
45
  @initialize
46
+ def get_patients(self) -> pd.DataFrame:
47
+ # TODO: Now that I've changed the return type to DataFrame, I'm tempted to change the column names so that the rest of the application does not hard-code the names "PATIENT_ID" and "FIRST", and those are used solely within the data_service. But that could be over-thinking for this small application.
48
+ # I chose not to sort the patients alphabetically here because I thought that was a violation of the division of responsibilities between the data service and the UI. And I chose to change the return type to DataFrame because it's possible, based on my experience with Spark and Hadoop, that sorting would trigger a manifestation of the data that, until this point, could (theoretically) be being processed in a distributed way and not loaded locally, and sorting would trigger the processing of everything, and then sorting would be another O(n) operation.
49
+ # TODO: And the calculation of size() is silly, but I don't see noop action after group. Maybe apply()....
50
+ patients = self._df.groupby(["PATIENT_ID", "FIRST"]).size().reset_index()[["PATIENT_ID", "FIRST"]]
51
+ return patients
52
+
53
+ # TODO: We need to be careful with these two functions, get_documents and get_document_metadatas. They must be in the same order, because I don't think we're preserving the ID columns
54
+ @initialize
55
+ def get_documents(self) -> list[str]:
56
+ texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
57
+ return texts
58
+
59
+ # TODO: This kind-of triggered my "code smell" and then I saw iloc is deprecated. Surely there's a way to select out columns 0,3-11 (with the first as the id), and have get_documents also return a table that is columns [index, CLINICAL_NOTES], and the chroma.add function will be able to match the row IDs? I think that's what it says it expects to do.
60
+ # TODO: This is probably no longer the correct columns with the cleaned/processed dataset.
61
+ @initialize
62
+ def get_document_metadatas(self) -> dict[str, any]:
63
+ metadatas = [self._df.iloc[i,3:11].to_dict() for i in range(self._df.shape[0])]
64
+ return metadatas
g34_final_project.py CHANGED
@@ -24,8 +24,9 @@ class App(object):
24
 
25
  # Dividing the UI into columns is weird, since the principal organizing item is the row...
26
  with gr.Row():
 
27
  patient_dropdown = gr.Dropdown(
28
- choices=data_service.get_patients(),
29
  multiselect=False,
30
  value=None,
31
  label=_("Patient")
@@ -52,7 +53,7 @@ class App(object):
52
 
53
  query_textbox.submit(
54
  fn=ask_query,
55
- inputs=[query_textbox, patient_dropdown],
56
  outputs=(query_response)
57
  )
58
 
@@ -68,7 +69,7 @@ if __name__ == '__main__':
68
  #print("The application UI will launch, but AI functionality will be disabled.")
69
  print("Please add the OpenAI API key as an application secret.")
70
 
71
- # TODO: Weird. How is gradio_app not out-of-scope by here?
72
- with DataService().build() as data_service, LLMService().with_key(os.getenv("OPENAI_API_KEY")).build() as llm_service:
73
- app = App(data_service, llm_service)
74
- app.launch()
 
24
 
25
  # Dividing the UI into columns is weird, since the principal organizing item is the row...
26
  with gr.Row():
27
+ patients = data_service.get_patients()
28
  patient_dropdown = gr.Dropdown(
29
+ choices=[(x[1]["FIRST"], x[1]["PATIENT_ID"]) for x in patients.sort_values(by="FIRST").iterrows()],
30
  multiselect=False,
31
  value=None,
32
  label=_("Patient")
 
53
 
54
  query_textbox.submit(
55
  fn=ask_query,
56
+ inputs=[patient_dropdown, query_textbox],
57
  outputs=(query_response)
58
  )
59
 
 
69
  #print("The application UI will launch, but AI functionality will be disabled.")
70
  print("Please add the OpenAI API key as an application secret.")
71
 
72
+ with DataService().build() as data_service:
73
+ with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
74
+ app = App(data_service, llm_service)
75
+ app.launch()
llm_service.py CHANGED
@@ -4,18 +4,22 @@ from chromadb.config import Settings
4
  from sentence_transformers import SentenceTransformer
5
  import pandas as pd
6
  from openai import OpenAI
 
7
 
8
  class LLMService(object):
9
  def __init__(self):
10
  self._openAIKey = None
 
11
 
12
  @contextmanager
13
  def build(self):
14
  llm_service = None
15
  if self._openAIKey is None:
16
  raise ValueError("OPEN AI key was not provided and there is no default value.")
 
 
17
  try:
18
- llm_service = DefaultLLMService(self._openAIKey)
19
  yield llm_service
20
  finally:
21
  if llm_service is not None:
@@ -33,10 +37,15 @@ class LLMService(object):
33
  def with_key(self, api_key: str):
34
  self._openAIKey = api_key
35
  return self
 
 
 
 
36
 
37
  class DefaultLLMService(LLMService):
38
- def __init__(self, api_key: str):
39
  self._api_key = api_key
 
40
  self._chatclient=OpenAI(api_key=self._api_key)
41
  #TODO decide on embedding model, using one provided in notebook
42
  self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -51,26 +60,28 @@ class DefaultLLMService(LLMService):
51
  persist_directory="./chroma_db"
52
  ))
53
  collection_name = "patient_data"
 
54
  #if clear:
55
  # client.delete_collection(collection_name)
56
  return client.get_or_create_collection(name=collection_name)
57
 
 
58
  def build_chromadb(self):
59
  #TODO replace with cleaned data url or move to service inputs
60
- self._df = pd.read_csv("hf://datasets/iscg34/finalproject/patient_information.csv")
61
-
62
  collection = self.get_chromadb(clear=1)
63
- texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
 
 
64
  embeddings = self._embed_model.encode(texts).tolist()
65
 
66
  collection.add(
67
  documents=texts,
68
  embeddings=embeddings,
69
- metadatas=[self._df.iloc[i,3:11].to_dict() for i in range(len(texts))], #store everything except clinical notes and ids as metadata
70
  ids=[str(i) for i in range(len(texts))]
71
  )
72
 
73
- def query_chromadb(self, patient: str, query: str, result_template:str, top_n=3) -> str:
74
  if (patient=="") or (patient is None):
75
  return ""
76
  if (query=="") or (query is None):
@@ -119,7 +130,7 @@ class DefaultLLMService(LLMService):
119
  # Has this patient ...?
120
  # Does this patient have a history of ...?
121
  # TODO: Find in vector database the most related docs to both 1. patient & 2. query
122
- rag=self.query_chromadb(patient,query, "")
123
  # TODO: Figure out how to utilize other columns.
124
  prompt_template="""
125
  You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
@@ -128,10 +139,10 @@ class DefaultLLMService(LLMService):
128
  {RAG}
129
 
130
  Questions:\n
131
- {query}
132
 
133
  Answer:"""
134
- filled_prompt=prompt_template.format(RAG=rag,query=query)
135
  # Get model output
136
  response = self._chatclient.chat.completions.create(
137
  model="gpt-3.5-turbo",
 
4
  from sentence_transformers import SentenceTransformer
5
  import pandas as pd
6
  from openai import OpenAI
7
+ from data_service import DataService
8
 
9
  class LLMService(object):
10
  def __init__(self):
11
  self._openAIKey = None
12
+ self._data_service = None
13
 
14
  @contextmanager
15
  def build(self):
16
  llm_service = None
17
  if self._openAIKey is None:
18
  raise ValueError("OPEN AI key was not provided and there is no default value.")
19
+ if self._data_service is None:
20
+ raise ValueError("To get the patient documents, a data service must be provided before building the LLMService.")
21
  try:
22
+ llm_service = DefaultLLMService(self._openAIKey, self._data_service)
23
  yield llm_service
24
  finally:
25
  if llm_service is not None:
 
37
  def with_key(self, api_key: str):
38
  self._openAIKey = api_key
39
  return self
40
+
41
+ def with_data_service(self, data_service: DataService):
42
+ self._data_service = data_service
43
+ return self
44
 
45
  class DefaultLLMService(LLMService):
46
+ def __init__(self, api_key: str, data_service: DataService):
47
  self._api_key = api_key
48
+ self._data_service = data_service
49
  self._chatclient=OpenAI(api_key=self._api_key)
50
  #TODO decide on embedding model, using one provided in notebook
51
  self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 
60
  persist_directory="./chroma_db"
61
  ))
62
  collection_name = "patient_data"
63
+ # TODO: If clear and collection exists
64
  #if clear:
65
  # client.delete_collection(collection_name)
66
  return client.get_or_create_collection(name=collection_name)
67
 
68
+ # TODO: It probably makes no difference, but the reason I chose to use a decorator @initialize in data_service is that I learned somewhere that it was better to put as little logic in a constructor as possible because (at least in other languages), errors in constructors made everything complicated, and potentially slow initialization logic made things difficult to ... I don't remember. Debug or parallelize or something. Maybe all of them. The trade-off is, if you forgot to decorate the method with @initialize, bad things.
69
  def build_chromadb(self):
70
  #TODO replace with cleaned data url or move to service inputs
 
 
71
  collection = self.get_chromadb(clear=1)
72
+ texts = self._data_service.get_documents()
73
+ metadatas = self._data_service.get_document_metadatas()
74
+ # TODO: I'm looking at the docs and I'm wondering 1. if this is the default embedding function anyway (I think it's good to bring it out and see we can adjust it; I'm just pointing this out), and 2. whether it would be equivalent to provide self._embed_model.encode as the embedding function of the collection at configuration time.
75
  embeddings = self._embed_model.encode(texts).tolist()
76
 
77
  collection.add(
78
  documents=texts,
79
  embeddings=embeddings,
80
+ metadatas=metadatas, #store everything except clinical notes and ids as metadata
81
  ids=[str(i) for i in range(len(texts))]
82
  )
83
 
84
+ def query_chromadb(self, patient: str, query: str, result_template:str = "", top_n=3) -> str:
85
  if (patient=="") or (patient is None):
86
  return ""
87
  if (query=="") or (query is None):
 
130
  # Has this patient ...?
131
  # Does this patient have a history of ...?
132
  # TODO: Find in vector database the most related docs to both 1. patient & 2. query
133
+ rag=self.query_chromadb(patient,query)
134
  # TODO: Figure out how to utilize other columns.
135
  prompt_template="""
136
  You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
 
139
  {RAG}
140
 
141
  Questions:\n
142
+ {Query}
143
 
144
  Answer:"""
145
+ filled_prompt=prompt_template.format(RAG=rag,Query=query)
146
  # Get model output
147
  response = self._chatclient.chat.completions.create(
148
  model="gpt-3.5-turbo",
test/test_app.py CHANGED
@@ -3,17 +3,19 @@ from unittest.mock import MagicMock, create_autospec
3
  from data_service import DataService
4
  from llm_service import LLMService
5
  from g34_final_project import App
 
6
 
7
  class TestApp(unittest.TestCase):
8
  def test_app(self):
9
- mock_tmp = MagicMock()
10
- mock_tmp.return_value.__enter__.return_value = create_autospec(DataService)
11
- with mock_tmp() as data_service, LLMService().with_key("").build() as llm_service:
12
- data_service.get_patients = MagicMock(return_value=["kareem", "abdul", "jabbar"])
 
 
 
13
 
14
- llm_service.get_summary = MagicMock(return_value="This is a summary")
15
- llm_service.answer_query = MagicMock(return_value="This is the query answer")
16
- llm_service.close = MagicMock(return_value=None)
17
- app = App(data_service, llm_service)
18
- app.launch()
19
- # TODO: This is probably not a good way to test. Maybe there isn't one for this. I just wanted to put a stub here. But it would be nice to be able to test things like "it accessed the data service get_patients once", etc.
 
3
  from data_service import DataService
4
  from llm_service import LLMService
5
  from g34_final_project import App
6
+ import os
7
 
8
  class TestApp(unittest.TestCase):
9
  def test_app(self):
10
+ """
11
+ A simple test method to allow launching of the application with mock objects
12
+ rather than the actual implementations. Used for testing the UI.
13
+ """
14
+ mock_data_service = DataService().build
15
+ with mock_data_service() as data_service:
16
+ with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
17
 
18
+ # Replacing the call to get_summary with a mock since this isn't implemented yet.
19
+ llm_service.get_summary = MagicMock(return_value="This is a summary")
20
+ app = App(data_service, llm_service)
21
+ app.launch()
 
 
test/test_data_service.py CHANGED
@@ -1,4 +1,5 @@
1
  import unittest
 
2
 
3
  from data_service import DataService
4
 
@@ -8,5 +9,19 @@ class TestDataService(unittest.TestCase):
8
  #data_service = DataService().to("").withCreds("").build()
9
  with DataService().build() as data_service:
10
  patients = data_service.get_patients()
11
- self.assertTrue(isinstance(patients, list))
12
- self.assertTrue(len(patients) > 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import unittest
2
+ import pandas as pd
3
 
4
  from data_service import DataService
5
 
 
9
  #data_service = DataService().to("").withCreds("").build()
10
  with DataService().build() as data_service:
11
  patients = data_service.get_patients()
12
+ self.assertTrue(isinstance(patients, pd.DataFrame))
13
+ self.assertTrue("PATIENT_ID" in patients.columns.values)
14
+ self.assertTrue("FIRST" in patients.columns.values)
15
+ self.assertTrue(patients.size > 0)
16
+
17
+ def test_get_patient_documents(self):
18
+ with DataService().build() as data_service:
19
+ documents = data_service.get_documents()
20
+ self.assertTrue(isinstance(documents, list))
21
+ self.assertTrue(len(documents) > 0)
22
+
23
+ def test_get_patient_metadatas(self):
24
+ with DataService().build() as data_service:
25
+ metadatas = data_service.get_document_metadatas()
26
+ self.assertTrue(isinstance(metadatas, list))
27
+ self.assertTrue(len(metadatas) > 0)