Jonathan Card committed on
Commit
4c93471
·
1 Parent(s): 9641c0b

Integrate the changes made on the afternoon of 2025-06-12.

Browse files

I wanted to sort out the UI so that it would work with the changes made
this afternoon, including changes like the LLMService requiring the
patient ID rather than the patient name for the query. This required minor
changes to the data service, and I also refactored the LLMService so that
the structure of the dataset is encapsulated in the data service. I also
included some questions and observations in the
comments, and made some minor changes discussed in the group, such as
providing the syntax for default values for a method parameter.

data_service.py CHANGED
@@ -16,11 +16,17 @@ class DataService(object):
16
 
17
  def get_patients(self) -> list[str]:
18
  raise Exception("Should not use the base class")
 
 
 
 
 
 
19
 
20
  class DefaultDataService(DataService):
21
 
22
  def __init__(self):
23
- self._dataset_url = "https://huggingface.co/datasets/iscg34/finalproject/raw/main/patient_encounters1_notes.csv"
24
  self._initialized = False
25
 
26
  def initialize(func):
@@ -37,8 +43,22 @@ class DefaultDataService(DataService):
37
  pass
38
 
39
  @initialize
40
- def get_patients(self) -> list[str]:
41
- col = self._df["FIRST"]
42
- u = col.unique()
43
- l = u.tolist()
44
- return l
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def get_patients(self) -> list[str]:
18
  raise Exception("Should not use the base class")
19
+
20
+ def get_documents(self) -> list[str]:
21
+ raise Exception("Should not use the base class")
22
+
23
+ def get_document_metadatas(self) -> dict[str, any]:
24
+ raise Exception("Should not use the base class")
25
 
26
  class DefaultDataService(DataService):
27
 
28
  def __init__(self):
29
+ self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
30
  self._initialized = False
31
 
32
  def initialize(func):
 
43
  pass
44
 
45
  @initialize
46
+ def get_patients(self) -> pd.DataFrame:
47
+ # TODO: Now that I've changed the return type to DataFrame, I'm tempted to change the column names so that the rest of the application does not hard-code the names "PATIENT_ID" and "FIRST", and those are used solely within the data_service. But that could be over-thinking for this small application.
48
+ # I chose not to sort the patients alphabetically here because I thought that was a violation of the division of responsibilities between the data service and the UI. And I chose to change the return type to DataFrame because it's possible, based on my experience with Spark and Hadoop, that sorting would trigger a manifestation of the data that, until this point, could (theoretically) be being processed in a distributed way and not loaded locally, and sorting would trigger the processing of everything, and then sorting would be another O(n) operation.
49
+ # TODO: And the calculation of size() is silly, but I don't see noop action after group. Maybe apply()....
50
+ patients = self._df.groupby(["PATIENT_ID", "FIRST"]).size().reset_index()[["PATIENT_ID", "FIRST"]]
51
+ return patients
52
+
53
+ # TODO: We need to be careful with these two functions, get_documents and get_document_metadatas. They must be in the same order, because I don't think we're preserving the ID columns
54
+ @initialize
55
+ def get_documents(self) -> list[str]:
56
+ texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
57
+ return texts
58
+
59
+ # TODO: This kind-of triggered my "code smell" and then I saw iloc is deprecated. Surely there's a way to select out columns 0,3-11 (with the first as the id), and have get_documents also return a table that is columns [index, CLINICAL_NOTES], and the chroma.add function will be able to match the row IDs? I think that's what it says it expects to do.
60
+ # TODO: This is probably no longer the correct columns with the cleaned/processed dataset.
61
+ @initialize
62
+ def get_document_metadatas(self) -> dict[str, any]:
63
+ metadatas = [self._df.iloc[i,3:11].to_dict() for i in range(self._df.shape[0])]
64
+ return metadatas
g34_final_project.py CHANGED
@@ -24,8 +24,9 @@ class App(object):
24
 
25
  # Dividing the UI into columns is weird, since the principal organizing item is the row...
26
  with gr.Row():
 
27
  patient_dropdown = gr.Dropdown(
28
- choices=data_service.get_patients(),
29
  multiselect=False,
30
  value=None,
31
  label=_("Patient")
@@ -52,7 +53,7 @@ class App(object):
52
 
53
  query_textbox.submit(
54
  fn=ask_query,
55
- inputs=[query_textbox, patient_dropdown],
56
  outputs=(query_response)
57
  )
58
 
@@ -68,7 +69,7 @@ if __name__ == '__main__':
68
  #print("The application UI will launch, but AI functionality will be disabled.")
69
  print("Please add the OpenAI API key as an application secret.")
70
 
71
- # TODO: Weird. How is gradio_app not out-of-scope by here?
72
- with DataService().build() as data_service, LLMService().with_key(os.getenv("OPENAI_API_KEY")).build() as llm_service:
73
- app = App(data_service, llm_service)
74
- app.launch()
 
24
 
25
  # Dividing the UI into columns is weird, since the principal organizing item is the row...
26
  with gr.Row():
27
+ patients = data_service.get_patients()
28
  patient_dropdown = gr.Dropdown(
29
+ choices=[(x[1]["FIRST"], x[1]["PATIENT_ID"]) for x in patients.sort_values(by="FIRST").iterrows()],
30
  multiselect=False,
31
  value=None,
32
  label=_("Patient")
 
53
 
54
  query_textbox.submit(
55
  fn=ask_query,
56
+ inputs=[patient_dropdown, query_textbox],
57
  outputs=(query_response)
58
  )
59
 
 
69
  #print("The application UI will launch, but AI functionality will be disabled.")
70
  print("Please add the OpenAI API key as an application secret.")
71
 
72
+ with DataService().build() as data_service:
73
+ with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
74
+ app = App(data_service, llm_service)
75
+ app.launch()
llm_service.py CHANGED
@@ -4,18 +4,22 @@ from chromadb.config import Settings
4
  from sentence_transformers import SentenceTransformer
5
  import pandas as pd
6
  from openai import OpenAI
 
7
 
8
  class LLMService(object):
9
  def __init__(self):
10
  self._openAIKey = None
 
11
 
12
  @contextmanager
13
  def build(self):
14
  llm_service = None
15
  if self._openAIKey is None:
16
  raise ValueError("OPEN AI key was not provided and there is no default value.")
 
 
17
  try:
18
- llm_service = DefaultLLMService(self._openAIKey)
19
  yield llm_service
20
  finally:
21
  if llm_service is not None:
@@ -33,10 +37,15 @@ class LLMService(object):
33
  def with_key(self, api_key: str):
34
  self._openAIKey = api_key
35
  return self
 
 
 
 
36
 
37
  class DefaultLLMService(LLMService):
38
- def __init__(self, api_key: str):
39
  self._api_key = api_key
 
40
  self._chatclient=OpenAI(api_key=self._api_key)
41
  #TODO decide on embedding model, using one provided in notebook
42
  self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -51,26 +60,28 @@ class DefaultLLMService(LLMService):
51
  persist_directory="./chroma_db"
52
  ))
53
  collection_name = "patient_data"
 
54
  #if clear:
55
  # client.delete_collection(collection_name)
56
  return client.get_or_create_collection(name=collection_name)
57
 
 
58
  def build_chromadb(self):
59
  #TODO replace with cleaned data url or move to service inputs
60
- self._df = pd.read_csv("hf://datasets/iscg34/finalproject/patient_information.csv")
61
-
62
  collection = self.get_chromadb(clear=1)
63
- texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
 
 
64
  embeddings = self._embed_model.encode(texts).tolist()
65
 
66
  collection.add(
67
  documents=texts,
68
  embeddings=embeddings,
69
- metadatas=[self._df.iloc[i,3:11].to_dict() for i in range(len(texts))], #store everything except clinical notes and ids as metadata
70
  ids=[str(i) for i in range(len(texts))]
71
  )
72
 
73
- def query_chromadb(self, patient: str, query: str, result_template:str, top_n=3) -> str:
74
  if (patient=="") or (patient is None):
75
  return ""
76
  if (query=="") or (query is None):
@@ -119,7 +130,7 @@ class DefaultLLMService(LLMService):
119
  # Has this patient ...?
120
  # Does this patient have a history of ...?
121
  # TODO: Find in vector database the most related docs to both 1. patient & 2. query
122
- rag=self.query_chromadb(patient,query, "")
123
  # TODO: Figure out how to utilize other columns.
124
  prompt_template="""
125
  You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
@@ -128,10 +139,10 @@ class DefaultLLMService(LLMService):
128
  {RAG}
129
 
130
  Questions:\n
131
- {query}
132
 
133
  Answer:"""
134
- filled_prompt=prompt_template.format(RAG=rag,query=query)
135
  # Get model output
136
  response = self._chatclient.chat.completions.create(
137
  model="gpt-3.5-turbo",
 
4
  from sentence_transformers import SentenceTransformer
5
  import pandas as pd
6
  from openai import OpenAI
7
+ from data_service import DataService
8
 
9
  class LLMService(object):
10
  def __init__(self):
11
  self._openAIKey = None
12
+ self._data_service = None
13
 
14
  @contextmanager
15
  def build(self):
16
  llm_service = None
17
  if self._openAIKey is None:
18
  raise ValueError("OPEN AI key was not provided and there is no default value.")
19
+ if self._data_service is None:
20
+ raise ValueError("To get the patient documents, a data service must be provided before building the LLMService.")
21
  try:
22
+ llm_service = DefaultLLMService(self._openAIKey, self._data_service)
23
  yield llm_service
24
  finally:
25
  if llm_service is not None:
 
37
  def with_key(self, api_key: str):
38
  self._openAIKey = api_key
39
  return self
40
+
41
+ def with_data_service(self, data_service: DataService):
42
+ self._data_service = data_service
43
+ return self
44
 
45
  class DefaultLLMService(LLMService):
46
+ def __init__(self, api_key: str, data_service: DataService):
47
  self._api_key = api_key
48
+ self._data_service = data_service
49
  self._chatclient=OpenAI(api_key=self._api_key)
50
  #TODO decide on embedding model, using one provided in notebook
51
  self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 
60
  persist_directory="./chroma_db"
61
  ))
62
  collection_name = "patient_data"
63
+ # TODO: If clear and collection exists
64
  #if clear:
65
  # client.delete_collection(collection_name)
66
  return client.get_or_create_collection(name=collection_name)
67
 
68
+ # TODO: It probably makes no difference, but the reason I chose to use a decorator @initialize in data_service is that I learned somewhere that it was better to put as little logic in a constructor as possible because (at least in other languages), errors in constructors made everything complicated, and potentially slow initialization logic made things difficult to ... I don't remember. Debug or parallelize or something. Maybe all of them. The trade-off is, if you forgot to decorate the method with @initialize, bad things.
69
  def build_chromadb(self):
70
  #TODO replace with cleaned data url or move to service inputs
 
 
71
  collection = self.get_chromadb(clear=1)
72
+ texts = self._data_service.get_documents()
73
+ metadatas = self._data_service.get_document_metadatas()
74
+ # TODO: I'm looking at the docs and I'm wondering 1. if this is the default embedding function anyway (I think it's good to bring it out and see we can adjust it; I'm just pointing this out), and 2. whether it would be equivalent to provide self._embed_model.encode as the embedding function of the collection at configuration time.
75
  embeddings = self._embed_model.encode(texts).tolist()
76
 
77
  collection.add(
78
  documents=texts,
79
  embeddings=embeddings,
80
+ metadatas=metadatas, #store everything except clinical notes and ids as metadata
81
  ids=[str(i) for i in range(len(texts))]
82
  )
83
 
84
+ def query_chromadb(self, patient: str, query: str, result_template:str = "", top_n=3) -> str:
85
  if (patient=="") or (patient is None):
86
  return ""
87
  if (query=="") or (query is None):
 
130
  # Has this patient ...?
131
  # Does this patient have a history of ...?
132
  # TODO: Find in vector database the most related docs to both 1. patient & 2. query
133
+ rag=self.query_chromadb(patient,query)
134
  # TODO: Figure out how to utilize other columns.
135
  prompt_template="""
136
  You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
 
139
  {RAG}
140
 
141
  Questions:\n
142
+ {Query}
143
 
144
  Answer:"""
145
+ filled_prompt=prompt_template.format(RAG=rag,Query=query)
146
  # Get model output
147
  response = self._chatclient.chat.completions.create(
148
  model="gpt-3.5-turbo",
test/test_app.py CHANGED
@@ -3,17 +3,19 @@ from unittest.mock import MagicMock, create_autospec
3
  from data_service import DataService
4
  from llm_service import LLMService
5
  from g34_final_project import App
 
6
 
7
  class TestApp(unittest.TestCase):
8
  def test_app(self):
9
- mock_tmp = MagicMock()
10
- mock_tmp.return_value.__enter__.return_value = create_autospec(DataService)
11
- with mock_tmp() as data_service, LLMService().with_key("").build() as llm_service:
12
- data_service.get_patients = MagicMock(return_value=["kareem", "abdul", "jabbar"])
 
 
 
13
 
14
- llm_service.get_summary = MagicMock(return_value="This is a summary")
15
- llm_service.answer_query = MagicMock(return_value="This is the query answer")
16
- llm_service.close = MagicMock(return_value=None)
17
- app = App(data_service, llm_service)
18
- app.launch()
19
- # TODO: This is probably not a good way to test. Maybe there isn't one for this. I just wanted to put a stub here. But it would be nice to be able to test things like "it accessed the data service get_patients once", etc.
 
3
  from data_service import DataService
4
  from llm_service import LLMService
5
  from g34_final_project import App
6
+ import os
7
 
8
  class TestApp(unittest.TestCase):
9
  def test_app(self):
10
+ """
11
+ A simple test method to allow launching of the application with mock objects
12
+ rather than the actual implementations. Used for testing the UI.
13
+ """
14
+ mock_data_service = DataService().build
15
+ with mock_data_service() as data_service:
16
+ with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
17
 
18
+ # Replacing the call to get_summary with a mock since this isn't implemented yet.
19
+ llm_service.get_summary = MagicMock(return_value="This is a summary")
20
+ app = App(data_service, llm_service)
21
+ app.launch()
 
 
test/test_data_service.py CHANGED
@@ -1,4 +1,5 @@
1
  import unittest
 
2
 
3
  from data_service import DataService
4
 
@@ -8,5 +9,19 @@ class TestDataService(unittest.TestCase):
8
  #data_service = DataService().to("").withCreds("").build()
9
  with DataService().build() as data_service:
10
  patients = data_service.get_patients()
11
- self.assertTrue(isinstance(patients, list))
12
- self.assertTrue(len(patients) > 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import unittest
2
+ import pandas as pd
3
 
4
  from data_service import DataService
5
 
 
9
  #data_service = DataService().to("").withCreds("").build()
10
  with DataService().build() as data_service:
11
  patients = data_service.get_patients()
12
+ self.assertTrue(isinstance(patients, pd.DataFrame))
13
+ self.assertTrue("PATIENT_ID" in patients.columns.values)
14
+ self.assertTrue("FIRST" in patients.columns.values)
15
+ self.assertTrue(patients.size > 0)
16
+
17
+ def test_get_patient_documents(self):
18
+ with DataService().build() as data_service:
19
+ documents = data_service.get_documents()
20
+ self.assertTrue(isinstance(documents, list))
21
+ self.assertTrue(len(documents) > 0)
22
+
23
+ def test_get_patient_metadatas(self):
24
+ with DataService().build() as data_service:
25
+ metadatas = data_service.get_document_metadatas()
26
+ self.assertTrue(isinstance(metadatas, list))
27
+ self.assertTrue(len(metadatas) > 0)