Spaces:

iscg34
/

finalproject

Sleeping

App Files Files Community

finalproject / data_service.py

raagustin

Add summary prompts and only add new vector db entries

102159c 10 months ago

raw

history blame contribute delete

3.27 kB

	from contextlib import contextmanager
	import pandas as pd

	class DataService(object):
	    """Base interface for the application's data access layer.

	    The accessor methods here are placeholders that always raise; concrete
	    subclasses are expected to override them. ``build`` is the entry point
	    that hands out a ready-to-use concrete service.
	    """

	    @contextmanager
	    def build(self):
	        """Yield a ``DefaultDataService`` and close it when the block exits.

	        Intended for use in a ``with`` statement. ``close()`` is called on
	        the yielded service even if the body of the ``with`` raises.

	        Yields:
	            A newly constructed ``DefaultDataService``.
	        """
	        data_service = None
	        try:
	            data_service = DefaultDataService()
	            yield data_service
	        finally:
	            # data_service is still None when the constructor itself raised,
	            # in which case there is nothing to close.
	            if data_service is not None:
	                data_service.close()

	    def close(self):
	        """Release any resources held by the service.

	        Raises:
	            NotImplementedError: always; subclasses must override.
	        """
	        raise NotImplementedError("Should not use the base class")

	    def get_patients(self) -> pd.DataFrame:
	        """Return the patients known to the service.

	        Raises:
	            NotImplementedError: always; subclasses must override.
	        """
	        raise NotImplementedError("Should not use the base class")

	    def get_documents(self) -> list[str]:
	        """Return the document texts, one string per record.

	        Raises:
	            NotImplementedError: always; subclasses must override.
	        """
	        raise NotImplementedError("Should not use the base class")

	    def get_document_metadatas(self) -> list[dict[str, object]]:
	        """Return one metadata mapping per document.

	        Raises:
	            NotImplementedError: always; subclasses must override.
	        """
	        raise NotImplementedError("Should not use the base class")

	    def get_data(self) -> pd.DataFrame:
	        """Return the full underlying dataset.

	        Raises:
	            NotImplementedError: always; subclasses must override.
	        """
	        raise NotImplementedError("Should not use the base class")

	class DefaultDataService(DataService):
	    """Data service backed by the project's patient-information CSV.

	    Nothing is read at construction time. The CSV at ``self._dataset_url``
	    is loaded with ``pd.read_csv`` the first time any method decorated with
	    ``initialize`` is called, and the resulting frame is kept on
	    ``self._df`` for all later calls.
	    """

	    def __init__(self):
	        self._dataset_url = "hf://datasets/iscg34/finalproject/patient_information.csv"
	        self._initialized = False
	        # Filled in on first use by the ``initialize`` decorator.
	        self._df = None

	    def initialize(func):
	        """Decorator that loads the dataset once before running *func*.

	        Args:
	            func: the method to wrap; it is called with the instance and any
	                further arguments unchanged.

	        Returns:
	            A wrapper that reads the CSV into ``self._df`` on the first call
	            (when ``self._initialized`` is false) and then delegates.
	        """
	        # Function-scope import: keeps the class self-contained without
	        # touching the module's import block.
	        from functools import wraps

	        @wraps(func)  # preserve the wrapped method's name and docstring
	        def wrapper(self, *args, **kwargs):
	            if not self._initialized:
	                self._df = pd.read_csv(self._dataset_url)
	                self._initialized = True
	            return func(self, *args, **kwargs)

	        return wrapper

	    def close(self):
	        """Do nothing; this service holds no resources that need releasing."""
	        pass

	    @initialize
	    def get_patients(self) -> pd.DataFrame:
	        """Return one row per distinct (PATIENT_ID, FIRST) pair.

	        Returns:
	            A DataFrame with exactly the columns ``PATIENT_ID`` and ``FIRST``.
	        """
	        # TODO: Now that this returns a DataFrame, consider renaming the
	        # columns here so the rest of the application does not hard-code
	        # "PATIENT_ID" and "FIRST". That may be over-thinking it for an
	        # application this small.
	        # NOTE: No explicit alphabetical sort is applied here on purpose:
	        # presentation order was considered the UI's responsibility, not the
	        # data service's. Returning a DataFrame (rather than a list) also
	        # avoids forcing the data to be materialised and sorted at this
	        # layer, which could matter on a lazily evaluated / distributed
	        # backend.
	        # TODO: size() is only used as a throwaway aggregation to collapse
	        # the groups; its result column is discarded by the final selection.
	        key_columns = ["PATIENT_ID", "FIRST"]
	        patients = self._df.groupby(key_columns).size().reset_index()[key_columns]
	        return patients

	    # TODO: get_documents and get_document_metadatas must return their
	    # results in the same row order, because neither result carries an ID
	    # column that could be used to re-align them.
	    @initialize
	    def get_documents(self) -> list[str]:
	        """Return the ``CLINICAL_NOTES`` column as a list of strings.

	        Every value is converted with ``astype(str)``, so missing values
	        come back as their string form rather than as ``NaN``.
	        """
	        texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
	        return texts

	    # TODO: Selecting metadata by column position is fragile. (Note that
	    # ``.iloc`` itself is not deprecated.) A cleaner design would select the
	    # metadata columns by name and have get_documents return
	    # [index, CLINICAL_NOTES], so the vector store can match rows by ID.
	    # TODO: Positions 3-10 are probably no longer the right columns for the
	    # cleaned/processed dataset.
	    @initialize
	    def get_document_metadatas(self) -> list[dict[str, object]]:
	        """Return one metadata dict per row, in row order.

	        Each dict maps column name to value for the columns at positions
	        3 through 10 (the half-open slice ``3:11``) of the dataset.
	        """
	        # Each row is taken with iloc[i, 3:11] on purpose so the values keep
	        # exactly the types that per-row selection produces.
	        row_count = self._df.shape[0]
	        metadatas = [self._df.iloc[i, 3:11].to_dict() for i in range(row_count)]
	        return metadatas

	    @initialize
	    def get_data(self) -> pd.DataFrame:
	        """Return the full loaded dataset (the cached frame, not a copy)."""
	        return self._df