import functools
from contextlib import contextmanager
from typing import Any

import pandas as pd


class DataService(object):
    """Interface for the patient data source, plus a factory context manager.

    The accessor methods on this base class only raise; concrete behaviour
    lives in subclasses such as ``DefaultDataService``.
    """

    @contextmanager
    def build(self):
        """Yield a ``DefaultDataService`` and close it when the block exits.

        Yields:
            DefaultDataService: a freshly constructed service.
        """
        data_service = None
        try:
            data_service = DefaultDataService()
            yield data_service
        finally:
            # Explicit None check: only skip close() when construction failed.
            if data_service is not None:
                data_service.close()

    def close(self):
        """Release resources held by the service.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError("Should not use the base class")

    def get_patients(self) -> pd.DataFrame:
        """Return the distinct patients.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError("Should not use the base class")

    def get_documents(self) -> list[str]:
        """Return the document texts, one entry per dataset row.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError("Should not use the base class")

    def get_document_metadatas(self) -> list[dict[str, Any]]:
        """Return one metadata mapping per dataset row.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError("Should not use the base class")


class DefaultDataService(DataService):
    """Data service backed by a CSV file that is loaded lazily with pandas."""

    def __init__(self):
        self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
        self._initialized = False
        # Populated by the ``initialize`` decorator on first data access.
        self._df = None

    def initialize(func):
        """Decorate a method so the dataset is loaded before its first use.

        The CSV at ``self._dataset_url`` is read once; later calls reuse the
        cached frame in ``self._df``.
        """

        @functools.wraps(func)  # keep the wrapped method's name and docstring
        def wrapper(self, *args, **kwargs):
            if not self._initialized:
                self._df = pd.read_csv(self._dataset_url)
                self._initialized = True
            return func(self, *args, **kwargs)

        return wrapper

    def close(self):
        """Do nothing: this service holds no resources that need releasing."""
        pass

    @initialize
    def get_patients(self) -> pd.DataFrame:
        """Return one row per distinct (PATIENT_ID, FIRST) pair.

        Returns:
            pd.DataFrame: columns ``PATIENT_ID`` and ``FIRST`` with a fresh
            integer index.
        """
        # TODO: Now that the return type is a DataFrame, consider renaming the
        # columns so the rest of the application does not hard-code
        # "PATIENT_ID" and "FIRST" and those names stay inside the data
        # service. That may be over-thinking it for this small application.
        #
        # Design note: sorting patients alphabetically for display is left to
        # the UI, as part of the division of responsibilities between the data
        # service and the UI. A DataFrame is returned (rather than a list) so
        # that a backend that evaluates lazily or in a distributed way, as
        # Spark/Hadoop can, is not forced to materialise and sort everything
        # here.
        # NOTE(review): groupby() sorts by the group keys by default, so the
        # rows do come back ordered by (PATIENT_ID, FIRST), and rows with a
        # missing key are dropped - confirm that this is intended.
        #
        # TODO: size() is only used to collapse the groups; its count column is
        # discarded by the final column selection. A cheaper no-op aggregation
        # (or drop_duplicates) might express this better.
        patients = (
            self._df.groupby(["PATIENT_ID", "FIRST"])
            .size()
            .reset_index()[["PATIENT_ID", "FIRST"]]
        )
        return patients

    # TODO: Be careful with get_documents and get_document_metadatas. Their
    # results must stay in the same row order, because the ID columns are not
    # carried along with either result.
    @initialize
    def get_documents(self) -> list[str]:
        """Return the ``CLINICAL_NOTES`` column as strings, in dataset row order."""
        texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
        return texts

    # TODO: The positional column selection below is a code smell. It should be
    # possible to select the metadata columns (with the first column as the
    # id) and have get_documents return [index, CLINICAL_NOTES], so that the
    # chroma add call can match rows by id - verify against its documentation.
    # NOTE: DataFrame.iloc itself is not deprecated (the removed indexer was
    # ``.ix``); the concern is the hard-coded positions, not the accessor.
    # TODO: Columns 3..10 are probably no longer the correct ones for the
    # cleaned/processed dataset.
    @initialize
    def get_document_metadatas(self) -> list[dict[str, Any]]:
        """Return one dict per row built from the columns at positions 3 to 10.

        Returns:
            list[dict[str, Any]]: column name -> value mappings, in dataset
            row order (matching ``get_documents``).
        """
        metadatas = [
            self._df.iloc[i, 3:11].to_dict() for i in range(self._df.shape[0])
        ]
        return metadatas

    @initialize
    def get_data(self) -> pd.DataFrame:
        """Return the full underlying DataFrame (the cached object, not a copy)."""
        return self._df