Spaces:

iscg34
/

finalproject

Sleeping

App Files Files Community

card-integrate-changes-2025-06-12

by JonCard - opened Jun 13, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+83

-34

Files changed (5) hide show

data_service.py +26 -6
g34_final_project.py +7 -6
llm_service.py +21 -10
test/test_app.py +12 -10
test/test_data_service.py +17 -2

data_service.py CHANGED Viewed

@@ -16,11 +16,17 @@ class DataService(object):
   def get_patients(self) -> list[str]:
     raise Exception("Should not use the base class")
 class DefaultDataService(DataService):
   def __init__(self):
-    self._dataset_url = "https://huggingface.co/datasets/iscg34/finalproject/raw/main/patient_encounters1_notes.csv"
     self._initialized = False
   def initialize(func):
@@ -37,8 +43,22 @@ class DefaultDataService(DataService):
     pass
   @initialize
-  def get_patients(self) -> list[str]:
-    col = self._df["FIRST"]
-    u = col.unique()
-    l = u.tolist()
-    return l

   def get_patients(self) -> list[str]:
     raise Exception("Should not use the base class")
+  def get_documents(self) -> list[str]:
+    raise Exception("Should not use the base class")
+  def get_document_metadatas(self) -> dict[str, any]:
+    raise Exception("Should not use the base class")
 class DefaultDataService(DataService):
   def __init__(self):
+    self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
     self._initialized = False
   def initialize(func):
     pass
   @initialize
+  def get_patients(self) -> pd.DataFrame:
+    # TODO: Now that I've changed the return type to DataFrame, I'm tempted to change the column names so that the rest of the application does not hard-code the names "PATIENT_ID" and "FIRST", and those are used solely within the data_service. But that could be over-thinking for this small application.
+    # I chose not to sort the patients alphabetically here because I thought that would violate the division of responsibilities between the data service and the UI. I also chose to change the return type to DataFrame because, based on my experience with Spark and Hadoop, it's possible that sorting would force materialization of data that, until this point, could (theoretically) still be processed in a distributed way without being loaded locally; sorting would trigger the processing of everything, and the sort itself would be another O(n log n) operation.
+    # TODO: The size() calculation is wasteful -- its result is discarded and it is only there to collapse the groups -- but I don't see a no-op aggregation to use after groupby. Maybe apply()...
+    patients = self._df.groupby(["PATIENT_ID", "FIRST"]).size().reset_index()[["PATIENT_ID", "FIRST"]]
+    return patients
+  # TODO: We need to be careful with these two functions, get_documents and get_document_metadatas. Their results must be in the same row order, because I don't think we're preserving the ID columns.
+  @initialize
+  def get_documents(self) -> list[str]:
+    texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
+    return texts
+  # TODO: This triggered my "code smell" sense, and then I read that iloc is deprecated (to verify: I believe only .ix was deprecated, not iloc). Surely there's a way to select columns 0,3-11 (with the first as the ID), and have get_documents also return a table with columns [index, CLINICAL_NOTES], so that the chroma.add function will be able to match the row IDs? I think that's what it says it expects to do.
+  # TODO: These are probably no longer the correct columns for the cleaned/processed dataset.
+  @initialize
+  def get_document_metadatas(self) -> dict[str, any]:
+    metadatas = [self._df.iloc[i,3:11].to_dict() for i in range(self._df.shape[0])]
+    return metadatas

g34_final_project.py CHANGED Viewed

@@ -24,8 +24,9 @@ class App(object):
       # Dividing the UI into columns is weird, since the principal organizing item is the row...
       with gr.Row():
         patient_dropdown = gr.Dropdown(
-          choices=data_service.get_patients(),
           multiselect=False,
           value=None,
           label=_("Patient")
@@ -52,7 +53,7 @@ class App(object):
       query_textbox.submit(
         fn=ask_query,
-        inputs=[query_textbox, patient_dropdown],
         outputs=(query_response)
       )
@@ -68,7 +69,7 @@ if __name__ == '__main__':
     #print("The application UI will launch, but AI functionality will be disabled.")
     print("Please add the OpenAI API key as an application secret.")
-  # TODO: Weird. How is gradio_app not out-of-scope by here?
-  with DataService().build() as data_service, LLMService().with_key(os.getenv("OPENAI_API_KEY")).build() as llm_service:
-    app = App(data_service, llm_service)
-    app.launch()

       # Dividing the UI into columns is weird, since the principal organizing item is the row...
       with gr.Row():
+        patients = data_service.get_patients()
         patient_dropdown = gr.Dropdown(
+          choices=[(x[1]["FIRST"], x[1]["PATIENT_ID"]) for x in patients.sort_values(by="FIRST").iterrows()],
           multiselect=False,
           value=None,
           label=_("Patient")
       query_textbox.submit(
         fn=ask_query,
+        inputs=[patient_dropdown, query_textbox],
         outputs=(query_response)
       )
     #print("The application UI will launch, but AI functionality will be disabled.")
     print("Please add the OpenAI API key as an application secret.")
+  with DataService().build() as data_service:
+    with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
+      app = App(data_service, llm_service)
+      app.launch()

llm_service.py CHANGED Viewed

@@ -4,18 +4,22 @@ from chromadb.config import Settings
 from sentence_transformers import SentenceTransformer
 import pandas as pd
 from openai import OpenAI
 class LLMService(object):
   def __init__(self):
     self._openAIKey = None
   @contextmanager
   def build(self):
     llm_service = None
     if self._openAIKey is None:
       raise ValueError("OPEN AI key was not provided and there is no default value.")
     try:
-      llm_service = DefaultLLMService(self._openAIKey)
       yield llm_service
     finally:
       if llm_service is not None:
@@ -33,10 +37,15 @@ class LLMService(object):
   def with_key(self, api_key: str):
     self._openAIKey = api_key
     return self
 class DefaultLLMService(LLMService):
-  def __init__(self, api_key: str):
     self._api_key = api_key
     self._chatclient=OpenAI(api_key=self._api_key)
     #TODO decide on embedding model, using one provided in notebook
     self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -51,26 +60,28 @@ class DefaultLLMService(LLMService):
       persist_directory="./chroma_db"
     ))
     collection_name = "patient_data"
     #if clear:
     #  client.delete_collection(collection_name)
     return client.get_or_create_collection(name=collection_name)
   def build_chromadb(self):
     #TODO replace with cleaned data url or move to service inputs
-    self._df = pd.read_csv("hf://datasets/iscg34/finalproject/patient_information.csv")
     collection = self.get_chromadb(clear=1)
-    texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
     embeddings = self._embed_model.encode(texts).tolist()
     collection.add(
       documents=texts,
       embeddings=embeddings,
-      metadatas=[self._df.iloc[i,3:11].to_dict() for i in range(len(texts))], #store everything except clinical notes and ids as metadata
       ids=[str(i) for i in range(len(texts))]
     )
-  def query_chromadb(self, patient: str, query: str, result_template:str, top_n=3) -> str:
     if (patient=="") or (patient is None):
       return ""
     if (query=="") or (query is None):
@@ -119,7 +130,7 @@ class DefaultLLMService(LLMService):
     #     Has this patient ...?
     #     Does this patient have a history of ...?
     # TODO: Find in vector database the most related docs to both 1. patient & 2. query
-    rag=self.query_chromadb(patient,query, "")
     # TODO: Figure out how to utilize other columns.
     prompt_template="""
     You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
@@ -128,10 +139,10 @@ class DefaultLLMService(LLMService):
     {RAG}
     Questions:\n
-    {query}
     Answer:"""
-    filled_prompt=prompt_template.format(RAG=rag,query=query)
     # Get model output
     response = self._chatclient.chat.completions.create(
         model="gpt-3.5-turbo",

 from sentence_transformers import SentenceTransformer
 import pandas as pd
 from openai import OpenAI
+from data_service import DataService
 class LLMService(object):
   def __init__(self):
     self._openAIKey = None
+    self._data_service = None
   @contextmanager
   def build(self):
     llm_service = None
     if self._openAIKey is None:
       raise ValueError("OPEN AI key was not provided and there is no default value.")
+    if self._data_service is None:
+      raise ValueError("To get the patient documents, a data service must be provided before building the LLMService.")
     try:
+      llm_service = DefaultLLMService(self._openAIKey, self._data_service)
       yield llm_service
     finally:
       if llm_service is not None:
   def with_key(self, api_key: str):
     self._openAIKey = api_key
     return self
+  def with_data_service(self, data_service: DataService):
+    self._data_service = data_service
+    return self
 class DefaultLLMService(LLMService):
+  def __init__(self, api_key: str, data_service: DataService):
     self._api_key = api_key
+    self._data_service = data_service
     self._chatclient=OpenAI(api_key=self._api_key)
     #TODO decide on embedding model, using one provided in notebook
     self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
       persist_directory="./chroma_db"
     ))
     collection_name = "patient_data"
+    # TODO: If clear and collection exists
     #if clear:
     #  client.delete_collection(collection_name)
     return client.get_or_create_collection(name=collection_name)
+  # TODO: It probably makes no difference, but the reason I chose to use the @initialize decorator in data_service is that I learned somewhere that it is better to put as little logic in a constructor as possible because (at least in other languages) errors in constructors make everything complicated, and potentially slow initialization logic makes things difficult to ... I don't remember. Debug or parallelize or something. Maybe all of them. The trade-off is that if you forget to decorate a method with @initialize, bad things happen.
   def build_chromadb(self):
     #TODO replace with cleaned data url or move to service inputs
     collection = self.get_chromadb(clear=1)
+    texts = self._data_service.get_documents()
+    metadatas = self._data_service.get_document_metadatas()
+    # TODO: Looking at the docs, I'm wondering 1. whether this is the default embedding function anyway (I think it's good to make it explicit so that we can adjust it; I'm just pointing this out), and 2. whether it would be equivalent to provide self._embed_model.encode as the embedding function of the collection at configuration time.
     embeddings = self._embed_model.encode(texts).tolist()
     collection.add(
       documents=texts,
       embeddings=embeddings,
+      metadatas=metadatas, #store everything except clinical notes and ids as metadata
       ids=[str(i) for i in range(len(texts))]
     )
+  def query_chromadb(self, patient: str, query: str, result_template:str = "", top_n=3) -> str:
     if (patient=="") or (patient is None):
       return ""
     if (query=="") or (query is None):
     #     Has this patient ...?
     #     Does this patient have a history of ...?
     # TODO: Find in vector database the most related docs to both 1. patient & 2. query
+    rag=self.query_chromadb(patient,query)
     # TODO: Figure out how to utilize other columns.
     prompt_template="""
     You are an AI Assistant answering questions about a patient based on the relevant patient information provided.\n
     {RAG}
     Questions:\n
+    {Query}
     Answer:"""
+    filled_prompt=prompt_template.format(RAG=rag,Query=query)
     # Get model output
     response = self._chatclient.chat.completions.create(
         model="gpt-3.5-turbo",

test/test_app.py CHANGED Viewed

@@ -3,17 +3,19 @@ from unittest.mock import MagicMock, create_autospec
 from data_service import DataService
 from llm_service import LLMService
 from g34_final_project import App
 class TestApp(unittest.TestCase):
   def test_app(self):
-    mock_tmp = MagicMock()
-    mock_tmp.return_value.__enter__.return_value = create_autospec(DataService)
-    with mock_tmp() as data_service, LLMService().with_key("").build() as llm_service:
-      data_service.get_patients = MagicMock(return_value=["kareem", "abdul", "jabbar"])
-      llm_service.get_summary = MagicMock(return_value="This is a summary")
-      llm_service.answer_query = MagicMock(return_value="This is the query answer")
-      llm_service.close = MagicMock(return_value=None)
-      app = App(data_service, llm_service)
-      app.launch()
-      # TODO: This is probably not a good way to test. Maybe there isn't one for this. I just wanted to put a stub here. But it would be nice to be able to test things like "it accessed the data service get_patients once", etc.

 from data_service import DataService
 from llm_service import LLMService
 from g34_final_project import App
+import os
 class TestApp(unittest.TestCase):
   def test_app(self):
+    """
+    A simple test method to allow launching of the application for testing the UI.
+    The data service and LLM service are built normally; only get_summary is replaced with a mock.
+    """
+    mock_data_service = DataService().build
+    with mock_data_service() as data_service:
+      with LLMService().with_key(os.getenv("OPENAI_API_KEY")).with_data_service(data_service).build() as llm_service:
+        # Replacing the call to get_summary with a mock since this isn't implemented yet.
+        llm_service.get_summary = MagicMock(return_value="This is a summary")
+        app = App(data_service, llm_service)
+        app.launch()

test/test_data_service.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import unittest
 from data_service import DataService
@@ -8,5 +9,19 @@ class TestDataService(unittest.TestCase):
     #data_service = DataService().to("").withCreds("").build()
     with DataService().build() as data_service:
       patients = data_service.get_patients()
-      self.assertTrue(isinstance(patients, list))
-      self.assertTrue(len(patients) > 0)

 import unittest
+import pandas as pd
 from data_service import DataService
     #data_service = DataService().to("").withCreds("").build()
     with DataService().build() as data_service:
       patients = data_service.get_patients()
+      self.assertTrue(isinstance(patients, pd.DataFrame))
+      self.assertTrue("PATIENT_ID" in patients.columns.values)
+      self.assertTrue("FIRST" in patients.columns.values)
+      self.assertTrue(patients.size > 0)
+  def test_get_patient_documents(self):
+    with DataService().build() as data_service:
+      documents = data_service.get_documents()
+      self.assertTrue(isinstance(documents, list))
+      self.assertTrue(len(documents) > 0)
+  def test_get_patient_metadatas(self):
+    with DataService().build() as data_service:
+      metadatas = data_service.get_document_metadatas()
+      self.assertTrue(isinstance(metadatas, list))
+      self.assertTrue(len(metadatas) > 0)