card-integrate-changes-2025-06-12

#1
by JonCard - opened
data_service.py CHANGED
@@ -16,11 +16,17 @@ class DataService(object):
16
 
17
  def get_patients(self) -> list[str]:
18
  raise Exception("Should not use the base class")
 
 
 
 
 
 
19
 
20
  class DefaultDataService(DataService):
21
 
22
  def __init__(self):
23
- self._dataset_url = "https://huggingface.co/datasets/iscg34/finalproject/raw/main/patient_encounters1_notes.csv"
24
  self._initialized = False
25
 
26
  def initialize(func):
@@ -37,8 +43,22 @@ class DefaultDataService(DataService):
37
  pass
38
 
39
  @initialize
40
- def get_patients(self) -> list[str]:
41
- col = self._df["FIRST"]
42
- u = col.unique()
43
- l = u.tolist()
44
- return l
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def get_patients(self) -> list[str]:
18
  raise Exception("Should not use the base class")
19
+
20
+ def get_documents(self) -> list[str]:
21
+ raise Exception("Should not use the base class")
22
+
23
+ def get_document_metadatas(self) -> dict[str, any]:
24
+ raise Exception("Should not use the base class")
25
 
26
  class DefaultDataService(DataService):
27
 
28
  def __init__(self):
29
+ self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
30
  self._initialized = False
31
 
32
  def initialize(func):
 
43
  pass
44
 
45
  @initialize
46
+ def get_patients(self) -> pd.DataFrame:
47
+ # TODO: Now that I've changed the return type to DataFrame, I'm tempted to rename the columns so that the rest of the application does not hard-code the names "PATIENT_ID" and "FIRST", and those names are used solely within the data_service. But that could be over-thinking it for this small application.
48
+ # I chose not to sort the patients alphabetically here because I think that would violate the division of responsibilities between the data service and the UI. I also chose to change the return type to DataFrame because, based on my experience with Spark and Hadoop, the data could (theoretically) still be processed lazily in a distributed way up to this point without being loaded locally; sorting here would force everything to be materialized, and the sort itself would then add another O(n log n) pass.
49
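+ # TODO: Calling size() is silly, since its result is discarded and it is only there to get a DataFrame back out of the groupby, but I don't see a no-op aggregation to apply after grouping. Maybe apply(), or possibly drop_duplicates() on the two columns instead (to be confirmed)....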
50
+ patients = self._df.groupby(["PATIENT_ID", "FIRST"]).size().reset_index()[["PATIENT_ID", "FIRST"]]
51
+ return patients
52
+
53
+ # TODO: We need to be careful with these two functions, get_documents and get_document_metadatas. They must return their rows in the same order, because the results are matched up by position; I don't think we're preserving the ID columns.
54
+ @initialize
55
+ def get_documents(self) -> list[str]:
56
+ texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
57
+ return texts
58
+
59
+ # TODO: This kind of triggered my "code smell" sense, and then I read that iloc is deprecated (NOTE(review): this is probably a mix-up with the removed .ix indexer; .iloc itself appears to still be supported, so confirm). Surely there's a way to select columns 0 and 3-11 (with the first as the ID), have get_documents also return a table with columns [index, CLINICAL_NOTES], and let the chroma.add function match the rows by ID? I think that's what it says it expects to do.
60
+ # TODO: These are probably no longer the correct columns for the cleaned/processed dataset.
61
+ @initialize
62
+ def get_document_metadatas(self) -> dict[str, any]:
63
+ metadatas = [self._df.iloc[i,3:11].to_dict() for i in range(self._df.shape[0])]
64
+ return metadatas
g34_final_project.py CHANGED
@@ -24,8 +24,9 @@ class App(object):
24
 
25
  # Dividing the UI into columns is weird, since the principal organizing item is the row...
26
  with gr.Row():
 
27
  patient_dropdown = gr.Dropdown(
28
- choices=data_service.get_patients(),
29
  multiselect=False,
30
  value=None,
31
  label=_("Patient")
@@ -52,7 +53,7 @@ class App(object):
52
 
53
  query_textbox.submit(
54
  fn=ask_query,
55
- inputs=[query_textbox, patient_dropdown],
56
  outputs=(query_response)
57
  )
58
 
@@ -68,7 +69,7 @@ if __name__ == '__main__':
68
  #print("The application UI will launch, but AI functionality will be disabled.")
69
  print("Please add the OpenAI API key as an application secret.")
70
 
71
- # TODO: Weird. How is gradio_app not out-of-scope by here?
72
- with DataService().build() as data_service, LLMService().with_key(os.getenv("OPENAI_API_KEY")).build() as llm_service:
73
- app = App(data_service, llm_service)
74
- app.launch()
 
24
 
25
  # Dividing the UI into columns is weird, since the principal organizing item is the row...
26
  with gr.Row():
27
+ patients = data_service.get_patients()
28
  patient_dropdown = gr.Dropdown(
29
+ choices=[(x[1]["FIRST"], x[1]["PATIENT_ID"]) for x in patients.sort_values(by="FIRST").iterrows()],
30
  multiselect=False,
31
  value=None,
32
  label=_("Patient")
 
53
 
54
  query_textbox.submit(
55
  fn=ask_query,
56
+ inputs=[patient_dropdown, query_textbox],
57
  outputs=(query_response)
58
  )
59
 
 
69
  #print("The application UI will launch, but AI functionality will be disabled.")
70
  print("Please add the OpenAI API key as an application secret.")
71
 
72
+ with DataService().build() as data_service:
73
+ with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
74
+ app = App(data_service, llm_service)
75
+ app.launch()
llm_service.py CHANGED
@@ -4,18 +4,22 @@ from chromadb.config import Settings
4
  from sentence_transformers import SentenceTransformer
5
  import pandas as pd
6
  from openai import OpenAI
 
7
 
8
  class LLMService(object):
9
  def __init__(self):
10
  self._openAIKey = None
 
11
 
12
  @contextmanager
13
  def build(self):
14
  llm_service = None
15
  if self._openAIKey is None:
16
  raise ValueError("OPEN AI key was not provided and there is no default value.")
 
 
17
  try:
18
- llm_service = DefaultLLMService(self._openAIKey)
19
  yield llm_service
20
  finally:
21
  if llm_service is not None:
@@ -33,10 +37,15 @@ class LLMService(object):
33
  def with_key(self, api_key: str):
34
  self._openAIKey = api_key
35
  return self
 
 
 
 
36
 
37
  class DefaultLLMService(LLMService):
38
- def __init__(self, api_key: str):
39
  self._api_key = api_key
 
40
  self._chatclient=OpenAI(api_key=self._api_key)
41
  #TODO decide on embedding model, using one provided in notebook
42
  self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -51,26 +60,28 @@ class DefaultLLMService(LLMService):
51
  persist_directory="./chroma_db"
52
  ))
53
  collection_name = "patient_data"
 
54
  #if clear:
55
  # client.delete_collection(collection_name)
56
  return client.get_or_create_collection(name=collection_name)
57
 
 
58
  def build_chromadb(self):
59
  #TODO replace with cleaned data url or move to service inputs
60
- self._df = pd.read_csv("hf://datasets/iscg34/finalproject/patient_information.csv")
61
-
62
  collection = self.get_chromadb(clear=1)
63
- texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
 
 
64
  embeddings = self._embed_model.encode(texts).tolist()
65
 
66
  collection.add(
67
  documents=texts,
68
  embeddings=embeddings,
69
- metadatas=[self._df.iloc[i,3:11].to_dict() for i in range(len(texts))], #store everything except clinical notes and ids as metadata
70
  ids=[str(i) for i in range(len(texts))]
71
  )
72
 
73
- def query_chromadb(self, patient: str, query: str, result_template:str, top_n=3) -> str:
74
  if (patient=="") or (patient is None):
75
  return ""
76
  if (query=="") or (query is None):
@@ -119,7 +130,7 @@ class DefaultLLMService(LLMService):
119
  # Has this patient ...?
120
  # Does this patient have a history of ...?
121
  # TODO: Find in vector database the most related docs to both 1. patient & 2. query
122
- rag=self.query_chromadb(patient,query, "")
123
  # TODO: Figure out how to utilize other columns.
124
  prompt_template="""
125
  You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
@@ -128,10 +139,10 @@ class DefaultLLMService(LLMService):
128
  {RAG}
129
 
130
  Questions:\n
131
- {query}
132
 
133
  Answer:"""
134
- filled_prompt=prompt_template.format(RAG=rag,query=query)
135
  # Get model output
136
  response = self._chatclient.chat.completions.create(
137
  model="gpt-3.5-turbo",
 
4
  from sentence_transformers import SentenceTransformer
5
  import pandas as pd
6
  from openai import OpenAI
7
+ from data_service import DataService
8
 
9
  class LLMService(object):
10
  def __init__(self):
11
  self._openAIKey = None
12
+ self._data_service = None
13
 
14
  @contextmanager
15
  def build(self):
16
  llm_service = None
17
  if self._openAIKey is None:
18
  raise ValueError("OPEN AI key was not provided and there is no default value.")
19
+ if self._data_service is None:
20
+ raise ValueError("To get the patient documents, a data service must be provided before building the LLMService.")
21
  try:
22
+ llm_service = DefaultLLMService(self._openAIKey, self._data_service)
23
  yield llm_service
24
  finally:
25
  if llm_service is not None:
 
37
  def with_key(self, api_key: str):
38
  self._openAIKey = api_key
39
  return self
40
+
41
+ def with_data_service(self, data_service: DataService):
42
+ self._data_service = data_service
43
+ return self
44
 
45
  class DefaultLLMService(LLMService):
46
+ def __init__(self, api_key: str, data_service: DataService):
47
  self._api_key = api_key
48
+ self._data_service = data_service
49
  self._chatclient=OpenAI(api_key=self._api_key)
50
  #TODO decide on embedding model, using one provided in notebook
51
  self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 
60
  persist_directory="./chroma_db"
61
  ))
62
  collection_name = "patient_data"
63
+ # TODO: If clear and collection exists
64
  #if clear:
65
  # client.delete_collection(collection_name)
66
  return client.get_or_create_collection(name=collection_name)
67
 
68
+ # TODO: It probably makes no difference, but the reason I chose to use the @initialize decorator in data_service is that I learned somewhere that it is better to put as little logic in a constructor as possible: (at least in other languages) errors in constructors complicate everything, and potentially slow initialization logic makes things difficult to ... I don't remember. Debug or parallelize or something. Maybe all of them. The trade-off is that if you forget to decorate a method with @initialize, bad things happen (presumably the method runs before the data has been loaded).
69
  def build_chromadb(self):
70
  #TODO replace with cleaned data url or move to service inputs
 
 
71
  collection = self.get_chromadb(clear=1)
72
+ texts = self._data_service.get_documents()
73
+ metadatas = self._data_service.get_document_metadatas()
74
+ # TODO: I'm looking at the docs and I'm wondering 1. whether this is the default embedding function anyway (I think it's good to make it explicit so we can see that it is adjustable; I'm just pointing this out), and 2. whether it would be equivalent to provide self._embed_model.encode as the embedding function of the collection at configuration time.
75
  embeddings = self._embed_model.encode(texts).tolist()
76
 
77
  collection.add(
78
  documents=texts,
79
  embeddings=embeddings,
80
+ metadatas=metadatas, #store everything except clinical notes and ids as metadata
81
  ids=[str(i) for i in range(len(texts))]
82
  )
83
 
84
+ def query_chromadb(self, patient: str, query: str, result_template:str = "", top_n=3) -> str:
85
  if (patient=="") or (patient is None):
86
  return ""
87
  if (query=="") or (query is None):
 
130
  # Has this patient ...?
131
  # Does this patient have a history of ...?
132
  # TODO: Find in vector database the most related docs to both 1. patient & 2. query
133
+ rag=self.query_chromadb(patient,query)
134
  # TODO: Figure out how to utilize other columns.
135
  prompt_template="""
136
  You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
 
139
  {RAG}
140
 
141
  Questions:\n
142
+ {Query}
143
 
144
  Answer:"""
145
+ filled_prompt=prompt_template.format(RAG=rag,Query=query)
146
  # Get model output
147
  response = self._chatclient.chat.completions.create(
148
  model="gpt-3.5-turbo",
test/test_app.py CHANGED
@@ -3,17 +3,19 @@ from unittest.mock import MagicMock, create_autospec
3
  from data_service import DataService
4
  from llm_service import LLMService
5
  from g34_final_project import App
 
6
 
7
  class TestApp(unittest.TestCase):
8
  def test_app(self):
9
- mock_tmp = MagicMock()
10
- mock_tmp.return_value.__enter__.return_value = create_autospec(DataService)
11
- with mock_tmp() as data_service, LLMService().with_key("").build() as llm_service:
12
- data_service.get_patients = MagicMock(return_value=["kareem", "abdul", "jabbar"])
 
 
 
13
 
14
- llm_service.get_summary = MagicMock(return_value="This is a summary")
15
- llm_service.answer_query = MagicMock(return_value="This is the query answer")
16
- llm_service.close = MagicMock(return_value=None)
17
- app = App(data_service, llm_service)
18
- app.launch()
19
- # TODO: This is probably not a good way to test. Maybe there isn't one for this. I just wanted to put a stub here. But it would be nice to be able to test things like "it accessed the data service get_patients once", etc.
 
3
  from data_service import DataService
4
  from llm_service import LLMService
5
  from g34_final_project import App
6
+ import os
7
 
8
  class TestApp(unittest.TestCase):
9
  def test_app(self):
10
+ """
11
+ A simple test method that launches the application for testing the UI. As written it
12
+ builds the data and LLM services directly; only get_summary is replaced with a mock.
13
+ """
14
+ mock_data_service = DataService().build
15
+ with mock_data_service() as data_service:
16
+ with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
17
 
18
+ # Replacing the call to get_summary with a mock since this isn't implemented yet.
19
+ llm_service.get_summary = MagicMock(return_value="This is a summary")
20
+ app = App(data_service, llm_service)
21
+ app.launch()
 
 
test/test_data_service.py CHANGED
@@ -1,4 +1,5 @@
1
  import unittest
 
2
 
3
  from data_service import DataService
4
 
@@ -8,5 +9,19 @@ class TestDataService(unittest.TestCase):
8
  #data_service = DataService().to("").withCreds("").build()
9
  with DataService().build() as data_service:
10
  patients = data_service.get_patients()
11
- self.assertTrue(isinstance(patients, list))
12
- self.assertTrue(len(patients) > 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import unittest
2
+ import pandas as pd
3
 
4
  from data_service import DataService
5
 
 
9
  #data_service = DataService().to("").withCreds("").build()
10
  with DataService().build() as data_service:
11
  patients = data_service.get_patients()
12
+ self.assertTrue(isinstance(patients, pd.DataFrame))
13
+ self.assertTrue("PATIENT_ID" in patients.columns.values)
14
+ self.assertTrue("FIRST" in patients.columns.values)
15
+ self.assertTrue(patients.size > 0)
16
+
17
+ def test_get_patient_documents(self):
18
+ with DataService().build() as data_service:
19
+ documents = data_service.get_documents()
20
+ self.assertTrue(isinstance(documents, list))
21
+ self.assertTrue(len(documents) > 0)
22
+
23
+ def test_get_patient_metadatas(self):
24
+ with DataService().build() as data_service:
25
+ metadatas = data_service.get_document_metadatas()
26
+ self.assertTrue(isinstance(metadatas, list))
27
+ self.assertTrue(len(metadatas) > 0)