Spaces:
Sleeping
Sleeping
Add vector db creation and querying methods
Browse files- llm_service.py +70 -1
- requirements.txt +3 -1
llm_service.py
CHANGED
|
@@ -1,4 +1,8 @@
|
|
| 1 |
from contextlib import contextmanager
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
class LLMService(object):
|
| 4 |
def __init__(self):
|
|
@@ -31,12 +35,77 @@ class LLMService(object):
|
|
| 31 |
class DefaultLLMService(LLMService):
|
| 32 |
def __init__(self, api_key: str):
|
| 33 |
self._api_key = api_key
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def close(self):
|
| 36 |
raise Exception("Not implemented")
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
def get_summary(self, patient: str) -> str:
|
| 39 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
# TODO: Add the found data to the context and ask OpenAI to summarize the provided docs.
|
| 41 |
raise Exception("Not implemented")
|
| 42 |
|
|
|
|
| 1 |
from contextlib import contextmanager
|
| 2 |
+
import chromadb
|
| 3 |
+
from chromadb.config import Settings
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
import pandas as pd
|
| 6 |
|
| 7 |
class LLMService(object):
|
| 8 |
def __init__(self):
|
|
|
|
| 35 |
class DefaultLLMService(LLMService):
    """LLM service backed by a Chroma vector store of patient clinical notes.

    On construction it downloads the patient-encounters CSV, embeds the
    clinical notes with a SentenceTransformer model, and indexes them into a
    persistent Chroma collection so that `query_chromadb` can retrieve the
    most relevant notes for a patient.
    """

    # Name of the Chroma collection holding the embedded clinical notes.
    # Bug fix: the original code referenced an undefined `collection_name`,
    # which raised NameError on every call to get_chromadb().
    COLLECTION_NAME = "patient_notes"

    def __init__(self, api_key: str):
        self._api_key = api_key
        # TODO decide on embedding model; currently using the one provided in the notebook.
        self._embed_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.build_chromadb()

    def close(self):
        # NotImplementedError is the idiomatic marker for unfinished methods;
        # message kept identical for any caller matching on it.
        raise NotImplementedError("Not implemented")

    def get_chromadb(self, clear=0, collection_name=COLLECTION_NAME):
        """Open the persistent Chroma collection, optionally clearing it first.

        Args:
            clear: Truthy to drop any existing collection before reopening it
                (used when rebuilding the index from scratch).
            collection_name: Name of the collection to open or create.

        Returns:
            A chromadb Collection object.
        """
        client = chromadb.Client(Settings(
            persist_directory="./chroma_db"
        ))
        if clear:
            try:
                client.delete_collection(collection_name)
            except Exception:
                # Nothing to delete on the very first build — that's fine.
                pass
        return client.get_or_create_collection(name=collection_name)

    def build_chromadb(self):
        """Download the patient-notes CSV, embed the notes, and (re)build the index."""
        # TODO replace with cleaned data URL or move to service inputs.
        self._df = pd.read_csv(
            "https://huggingface.co/datasets/patjs/patient1/raw/main/patient_encounters1_notes.csv"
        )

        collection = self.get_chromadb(clear=1)
        texts = self._df["CLINICAL_NOTES"].astype(str).tolist()
        embeddings = self._embed_model.encode(texts).tolist()

        collection.add(
            documents=texts,
            embeddings=embeddings,
            # Bug fix: the original comprehension used the undefined name `df`
            # instead of `self._df`. Columns 3:11 are everything except the
            # clinical notes and ids, stored as per-document metadata.
            metadatas=[self._df.iloc[i, 3:11].to_dict() for i in range(len(texts))],
            ids=[str(i) for i in range(len(texts))],
        )

    def query_chromadb(self, patient: str, query: str, result_template: str, top_n=3) -> str:
        """Semantic-search one patient's notes and render the top matches as text.

        Args:
            patient: PATIENT_ID used to restrict the search; empty/None → "".
            query: Free-text search query; empty/None → "".
            result_template: str.format template with {rank}, {desc},
                {st_date}, {note} placeholders; falsy values use the default.
            top_n: Maximum number of results to include.

        Returns:
            The formatted results concatenated into one context string.
        """
        # Guard clauses also cover None now, not just the empty string.
        if not patient:
            return ""
        if not query:
            return ""

        collection = self.get_chromadb()
        query_embedding = self._embed_model.encode([query])[0].tolist()

        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=top_n,
            # include=["documents","metadatas","distances"] — these are the
            # default query outputs, so there is no need to specify them.
            where={"PATIENT_ID": patient},  # restrict to this patient's notes
        )

        # TODO refine template for what info to include.
        if not result_template:
            result_template = """
#{rank}: {desc} {st_date}\n
{note}\n
"""

        # Build with join instead of repeated string concatenation.
        parts = []
        for i in range(len(results["ids"][0])):
            parts.append(result_template.format(
                rank=i + 1,  # results are 0-indexed; ranks are 1-based
                desc=results["metadatas"][0][i]["DESCRIPTION"],
                st_date=results["metadatas"][0][i]["START"],
                note=results["documents"][0][i],
            ))
        return "".join(parts)

    def get_summary(self, patient: str) -> str:
        """Summarize a patient's notes (retrieval wired up; LLM call still TODO)."""
        # TODO get all visit notes, or query the vector database with a
        # specific prompt for the patient.
        # all_visits = self._df.loc[self._df["PATIENT_ID"] == patient]
        vector_query = ""
        vector_result_template = ""  # format for each result from the vector search
        summary_context = self.query_chromadb(patient, vector_query, vector_result_template)
        summary_query = ""
        # TODO: Add the found data to the context and ask OpenAI to summarize the provided docs.
        raise NotImplementedError("Not implemented")
|
| 111 |
|
requirements.txt
CHANGED
|
@@ -2,4 +2,6 @@ gradio==5.32.1
|
|
| 2 |
langchain
|
| 3 |
openai
|
| 4 |
python-dotenv
|
| 5 |
-
langchain_openai
|
|
|
|
|
|
|
|
|
| 2 |
langchain
|
| 3 |
openai
|
| 4 |
python-dotenv
|
| 5 |
+
langchain_openai
|
| 6 |
+
chromadb
|
| 7 |
+
sentence-transformers
|