Spaces:
Sleeping
Sleeping
File size: 3,273 Bytes
from contextlib import contextmanager
from functools import wraps
from typing import Any

import pandas as pd
class DataService:
    """Abstract interface for the application's data access layer.

    Concrete implementations (see ``DefaultDataService``) supply the patient
    table, the clinical-note documents, and per-document metadata. The base
    class only defines the contract; every data method raises
    ``NotImplementedError`` until overridden.
    """

    @contextmanager
    def build(self):
        """Context manager that yields a ready ``DefaultDataService``.

        The service's ``close()`` is always called on exit, even if the
        ``with`` body raises.
        """
        data_service = None
        try:
            data_service = DefaultDataService()
            yield data_service
        finally:
            # Only close if construction succeeded.
            if data_service:
                data_service.close()

    def close(self):
        """Release any resources held by the service."""
        # NotImplementedError (a subclass of Exception) is the idiomatic way
        # to mark abstract stubs, and stays catchable by existing handlers.
        raise NotImplementedError("Should not use the base class")

    def get_patients(self) -> pd.DataFrame:
        """Return the unique patients as a (PATIENT_ID, FIRST) DataFrame."""
        raise NotImplementedError("Should not use the base class")

    def get_documents(self) -> list[str]:
        """Return every clinical-note document as a string, in row order."""
        raise NotImplementedError("Should not use the base class")

    def get_document_metadatas(self) -> list[dict[str, Any]]:
        """Return one metadata dict per document, in the same order as
        ``get_documents`` (callers pair the two positionally)."""
        raise NotImplementedError("Should not use the base class")
class DefaultDataService(DataService):
    """DataService backed by a CSV dataset on Hugging Face, loaded lazily via pandas."""

    def __init__(self):
        # Remote CSV holding the patient data; nothing is downloaded until the
        # first decorated method call triggers the lazy load below.
        self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
        self._initialized = False

    def initialize(func):
        """Decorator (used only inside this class body) that loads the CSV
        into ``self._df`` before *func* runs, exactly once per instance."""
        @wraps(func)  # preserve the wrapped method's name/docstring
        def wrapper(self, *args, **kwargs):
            if not self._initialized:
                self._df = pd.read_csv(self._dataset_url)
                self._initialized = True
            return func(self, *args, **kwargs)
        return wrapper

    def close(self):
        """Nothing to release: the DataFrame is plain in-memory data."""
        pass

    @initialize
    def get_patients(self) -> pd.DataFrame:
        """Return the unique (PATIENT_ID, FIRST) pairs as a two-column DataFrame.

        Output is sorted by (PATIENT_ID, FIRST) with a fresh 0..n-1 index —
        the same order the previous groupby-based implementation produced.
        """
        # drop_duplicates replaces the old groupby(...).size() trick (its own
        # TODO called the size() calculation silly).
        # NOTE(review): assumes PATIENT_ID/FIRST are never NaN — groupby
        # silently dropped NaN keys; drop_duplicates would keep them. Confirm
        # against the dataset.
        patients = (
            self._df[["PATIENT_ID", "FIRST"]]
            .drop_duplicates()
            .sort_values(["PATIENT_ID", "FIRST"])
            .reset_index(drop=True)
        )
        return patients

    # NOTE: get_documents and get_document_metadatas must stay in the same row
    # order — callers pair them positionally; no shared ID column is returned.
    @initialize
    def get_documents(self) -> list[str]:
        """Return every CLINICAL_NOTES value as a string, in row order."""
        return self._df["CLINICAL_NOTES"].astype(str).tolist()

    @initialize
    def get_document_metadatas(self) -> list[dict[str, Any]]:
        """Return one metadata dict per row, from columns 3..10 (positional).

        Return type fixed from ``dict[str, any]`` — the method has always
        produced a *list* of per-row dicts keyed by column name.
        """
        # One vectorized to_dict("records") call replaces the previous
        # per-row Python loop over iloc. (iloc itself is not deprecated.)
        # NOTE(review): column positions 3:11 are hard-coded; confirm they
        # still match the cleaned/processed dataset schema.
        return self._df.iloc[:, 3:11].to_dict(orient="records")

    @initialize
    def get_data(self) -> pd.DataFrame:
        """Return the full underlying DataFrame (loading it on first call)."""
        return self._df