Spaces:
Sleeping
Sleeping
| from contextlib import contextmanager | |
| import pandas as pd | |
class DataService(object):
    """Interface for the application's patient-data source.

    Concrete services override every accessor below; the base
    implementations only raise so that accidental use of the base class is
    caught immediately. ``build`` is the factory that hands out the default
    concrete service.
    """

    @contextmanager
    def build(self):
        """Yield a ``DefaultDataService`` and close it when the block exits.

        Intended use::

            with DataService().build() as data_service:
                ...

        The service is closed in ``finally`` so it is released even when the
        body of the ``with`` block raises.
        """
        data_service = None
        try:
            data_service = DefaultDataService()
            yield data_service
        finally:
            # Construction itself may have failed, in which case there is
            # nothing to close.
            if data_service is not None:
                data_service.close()

    def close(self):
        """Release any resources held by the service."""
        raise NotImplementedError("Should not use the base class")

    def get_patients(self) -> pd.DataFrame:
        """Return the patients known to the service."""
        raise NotImplementedError("Should not use the base class")

    def get_documents(self) -> list[str]:
        """Return the document texts, one entry per record."""
        raise NotImplementedError("Should not use the base class")

    def get_document_metadatas(self) -> list[dict[str, object]]:
        """Return one metadata mapping per document."""
        raise NotImplementedError("Should not use the base class")
class DefaultDataService(DataService):
    """Data service backed by a CSV file that is loaded lazily with pandas.

    The CSV at ``self._dataset_url`` is read on the first call to any
    accessor decorated with ``initialize`` and cached in ``self._df`` for
    the lifetime of the instance.
    """

    def __init__(self):
        self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
        self._initialized = False
        self._df = None

    def initialize(func):
        """Decorator: make sure ``self._df`` is loaded before ``func`` runs."""
        from functools import wraps

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            if not self._initialized:
                self._df = pd.read_csv(self._dataset_url)
                self._initialized = True
            return func(self, *args, **kwargs)

        return wrapper

    def close(self):
        """Nothing to release: the data lives in an in-memory DataFrame."""
        pass

    @initialize
    def get_patients(self) -> pd.DataFrame:
        """Return one row per distinct (PATIENT_ID, FIRST) pair."""
        # TODO: Now that the return type is a DataFrame, consider renaming the
        # columns so that the rest of the application does not hard-code
        # "PATIENT_ID" and "FIRST" and those names stay private to the data
        # service. That may be over-thinking it for this small application.
        #
        # Sorting for display is deliberately left to the UI: it is not the
        # data service's responsibility, and on a distributed backend
        # (Spark/Hadoop) an explicit sort could force the whole dataset to be
        # materialized locally. A DataFrame is returned for the same reason.
        # NOTE: groupby() sorts group keys by default, so with pandas the rows
        # do come back ordered by (PATIENT_ID, FIRST); callers should not rely
        # on that.
        #
        # TODO: size() is only used as a cheap aggregation to collapse each
        # group to a single row; the count column is discarded right after.
        patients = (
            self._df.groupby(["PATIENT_ID", "FIRST"])
            .size()
            .reset_index()[["PATIENT_ID", "FIRST"]]
        )
        return patients

    # TODO: get_documents and get_document_metadatas must stay in the same row
    # order, because no ID column is carried through to pair them up again.
    @initialize
    def get_documents(self) -> list[str]:
        """Return the CLINICAL_NOTES column as a list of strings, in row order."""
        texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
        return texts

    # TODO: Selecting columns by position is fragile. Consider selecting by
    # name and returning the row ID alongside both the documents and the
    # metadata so the vector store can match them by ID instead of by position.
    # TODO: Positions 3..10 are probably no longer the right columns for the
    # cleaned/processed dataset.
    @initialize
    def get_document_metadatas(self) -> list[dict[str, object]]:
        """Return one ``{column: value}`` dict per row for columns 3..10."""
        # A single records-oriented conversion replaces the per-row iloc loop;
        # each value keeps the type of its own column.
        metadatas = self._df.iloc[:, 3:11].to_dict(orient="records")
        return metadatas

    @initialize
    def get_data(self) -> pd.DataFrame:
        """Return the full underlying DataFrame (not a copy)."""
        return self._df