File size: 3,273 Bytes
53a6315
5dc04ad
53a6315
 
 
 
 
 
 
 
 
 
 
 
 
 
f49d5d2
53a6315
7836133
 
 
 
 
 
53a6315
 
5dc04ad
 
7836133
5dc04ad
 
 
 
 
 
 
 
 
 
 
53a6315
5dc04ad
 
53a6315
5dc04ad
7836133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102159c
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from contextlib import contextmanager
from functools import wraps
from typing import Any

import pandas as pd

class DataService(object):
  """Abstract interface for retrieving patient data.

  Concrete implementations (e.g. ``DefaultDataService``) override the
  accessor methods below; the base-class bodies raise
  ``NotImplementedError`` so accidental use of the base class fails loudly.
  """

  @contextmanager
  def build(self):
    """Context manager that yields a ready-to-use ``DefaultDataService``.

    Yields:
      A ``DefaultDataService`` instance.

    ``close()`` is guaranteed to run on exit — including when the body of
    the ``with`` block raises — but only if construction succeeded.
    """
    data_service = None
    try:
      data_service = DefaultDataService()
      yield data_service
    finally:
      if data_service: data_service.close()

  def close(self):
    """Release any resources held by the service."""
    raise NotImplementedError("Should not use the base class")

  def get_patients(self) -> pd.DataFrame:
    # NOTE(review): annotation changed from list[str] to pd.DataFrame to
    # match what the concrete implementation actually returns.
    """Return one row per distinct patient (PATIENT_ID, FIRST columns)."""
    raise NotImplementedError("Should not use the base class")

  def get_documents(self) -> list[str]:
    """Return the clinical-note text of every row, as a list of strings."""
    raise NotImplementedError("Should not use the base class")

  def get_document_metadatas(self) -> list[dict[str, Any]]:
    # NOTE(review): was annotated dict[str, any]; lowercase `any` is the
    # builtin function, and the implementation returns a list of dicts.
    """Return per-document metadata dicts, ordered like get_documents()."""
    raise NotImplementedError("Should not use the base class")

class DefaultDataService(DataService):
  """Data service backed by a CSV dataset hosted on Hugging Face.

  The CSV is loaded lazily on first data access (see the ``initialize``
  decorator) rather than in ``__init__``, so constructing the service is
  cheap and no network I/O happens until data is actually requested.
  """

  def __init__(self):
    # Remote dataset location; pandas reads hf:// URLs directly.
    self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
    self._initialized = False

  def initialize(func):
    """Method decorator: load the CSV into ``self._df`` before the first call.

    Subsequent calls skip the load because ``self._initialized`` is set.
    ``@wraps`` preserves the decorated method's name and docstring.
    """
    @wraps(func)
    def wrapper(self):
      if not self._initialized:
        self._df = pd.read_csv(self._dataset_url)
        self._initialized = True
      return func(self)

    return wrapper

  def close(self):
    """Nothing to release: the DataFrame is plain in-memory data."""
    pass

  @initialize
  def get_patients(self) -> pd.DataFrame:
    """Return one row per distinct (PATIENT_ID, FIRST) pair.

    Deliberately NOT sorted here — presentation ordering is the UI's
    responsibility, and sorting would force materialization if the data
    were ever processed in a distributed fashion.
    """
    # TODO: consider renaming columns so callers don't hard-code
    # "PATIENT_ID"/"FIRST" — may be over-engineering for this small app.
    # groupby is used for de-duplication; size() is a throwaway column
    # immediately projected back out.
    patients = self._df.groupby(["PATIENT_ID", "FIRST"]).size().reset_index()[["PATIENT_ID", "FIRST"]]
    return patients

  # TODO: get_documents and get_document_metadatas must stay in the same
  # row order, because the ID columns are not preserved in either result.
  @initialize
  def get_documents(self) -> list[str]:
    """Return the CLINICAL_NOTES column as a list of strings, in row order."""
    texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
    return texts

  # TODO: column positions 3:11 may no longer be correct for the
  # cleaned/processed dataset — verify against the current schema.
  @initialize
  def get_document_metadatas(self) -> list[dict[str, Any]]:
    """Return one column-name→value dict per row (columns 3..10), in row order."""
    # Single vectorized call instead of a per-row Python loop; orient="records"
    # yields exactly one dict per row keyed by column name.
    metadatas = self._df.iloc[:, 3:11].to_dict(orient="records")
    return metadatas

  @initialize
  def get_data(self) -> pd.DataFrame:
    """Return the full underlying DataFrame (loaded on first access)."""
    return self._df