Spaces:

klinic-hackupc
/

klinic

Sleeping

App Files Files Community

ACMCMC committed on May 5, 2024

Commit

1e2e3b8

1 Parent(s): dda0120

WIP

Browse files

Files changed (4) hide show

app.py +13 -10
clinical_trials_embeddings.ipynb +3 -3
llm_res.py +105 -78
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -14,6 +14,7 @@ from utils import (
     get_clinical_trials_related_to_diseases,
     get_clinical_records_by_ids
 )
 import json
 import numpy as np
 from sentence_transformers import SentenceTransformer
@@ -35,10 +36,6 @@ CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}
 engine = create_engine(CONNECTION_STRING)
-st.title("Klìnic")
-st.header("", divider='rainbow')
-st.text('') # dummy to add spacing
 with st.container(): # user input
     col1, col2 = st.columns((6, 1))
@@ -58,30 +55,36 @@ with st.container():
         with st.status("Analyzing...") as status:
             # 1. Embed the textual description that the user entered using the model
             # 2. Get 5 diseases with the highest cosine similarity from the DB
             encoder = SentenceTransformer("allenai-specter")
             diseases_related_to_the_user_text = get_diseases_related_to_a_textual_description(
                 description_input, encoder
             )
-            # for disease_label in diseases_related_to_the_user_text:
-            #    st.text(disease_label)
             # 3. Get the similarities of the embeddings of those diseases (cosine similarity of the embeddings of the nodes of such diseases)
             diseases_uris = [disease["uri"] for disease in diseases_related_to_the_user_text]
             get_similarities_among_diseases_uris(diseases_uris)
-            #print(diseases_related_to_the_user_text)
             # 4. Potentially filter out the diseases that are not similar enough (e.g. similarity < 0.8)
             # 5. Augment the set of diseases: add new diseases that are similar to the ones that are already in the set, until we get 10-15 diseases
             augmented_set_of_diseases = augment_the_set_of_diseaces(diseases_uris)
-            #print(augmented_set_of_diseases)
             # 6. Query the embeddings of the diseases related to each clinical trial (also in the DB), to get the most similar clinical trials to our set of diseases
             clinical_trials_related_to_the_diseases = get_clinical_trials_related_to_diseases(
                 augmented_set_of_diseases, encoder
             )
-            #print(f'clinical_trials_related_to_the_diseases: {clinical_trials_related_to_the_diseases}')
             json_of_clinical_trials = get_clinical_records_by_ids(
                 [trial["nct_id"] for trial in clinical_trials_related_to_the_diseases]
             )
-            #print(f'json_of_clinical_trials: {json_of_clinical_trials}')
             # 8. Use an LLM to extract numerical data from the clinical trials (e.g. number of patients, number of deaths, etc.). Get summary statistics out of that.
             # 9. Show the results to the user: graph of the diseases chosen, summary of the clinical trials, summary statistics of the clinical trials, and list of the details of the clinical trials considered
             status.update(label="Done!", state="complete")
             time.sleep(1)

     get_clinical_trials_related_to_diseases,
     get_clinical_records_by_ids
 )
+from llm_res import process_dictionaty_with_llm_to_generate_response
 import json
 import numpy as np
 from sentence_transformers import SentenceTransformer
 engine = create_engine(CONNECTION_STRING)
 with st.container(): # user input
     col1, col2 = st.columns((6, 1))
         with st.status("Analyzing...") as status:
             # 1. Embed the textual description that the user entered using the model
             # 2. Get 5 diseases with the highest cosine similarity from the DB
+            status.write("Analyzing the description that you wrote...")
             encoder = SentenceTransformer("allenai-specter")
             diseases_related_to_the_user_text = get_diseases_related_to_a_textual_description(
                 description_input, encoder
             )
             # 3. Get the similarities of the embeddings of those diseases (cosine similarity of the embeddings of the nodes of such diseases)
+            status.write("Getting the similarities among the diseases to filter out less promising ones...")
             diseases_uris = [disease["uri"] for disease in diseases_related_to_the_user_text]
             get_similarities_among_diseases_uris(diseases_uris)
             # 4. Potentially filter out the diseases that are not similar enough (e.g. similarity < 0.8)
             # 5. Augment the set of diseases: add new diseases that are similar to the ones that are already in the set, until we get 10-15 diseases
+            status.write("Augmenting the set of diseases by finding others with related embeddings...")
             augmented_set_of_diseases = augment_the_set_of_diseaces(diseases_uris)
+            # print(augmented_set_of_diseases)
             # 6. Query the embeddings of the diseases related to each clinical trial (also in the DB), to get the most similar clinical trials to our set of diseases
+            status.write("Getting the clinical trials related to the diseases found...")
             clinical_trials_related_to_the_diseases = get_clinical_trials_related_to_diseases(
                 augmented_set_of_diseases, encoder
             )
+            status.write("Getting the details of the clinical trials...")
             json_of_clinical_trials = get_clinical_records_by_ids(
                 [trial["nct_id"] for trial in clinical_trials_related_to_the_diseases]
             )
+            status.json(json_of_clinical_trials)
+            # 7. Use an LLM to get a summary of the clinical trials, in plain text format.
+            status.write("Getting a summary of the clinical trials...")
+            response = process_dictionaty_with_llm_to_generate_response(json_of_clinical_trials)
+            print(f'Response from LLM: {response}')
             # 8. Use an LLM to extract numerical data from the clinical trials (e.g. number of patients, number of deaths, etc.). Get summary statistics out of that.
+            status.write("Getting summary statistics of the clinical trials...")
             # 9. Show the results to the user: graph of the diseases chosen, summary of the clinical trials, summary statistics of the clinical trials, and list of the details of the clinical trials considered
             status.update(label="Done!", state="complete")
             time.sleep(1)

clinical_trials_embeddings.ipynb CHANGED Viewed

@@ -61,9 +61,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "os.environ[\"OPENAI_API_KEY\"] = (\n",
-    "    \"sk-proj-CG2E98bSWs53X2eWO0Z4T3BlbkFJLm7H1vfkbua0zP548CKQ\"\n",
-    ")"
    ]
   },
   {

    "metadata": {},
    "outputs": [],
    "source": [
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "load_dotenv()"
    ]
   },
   {

llm_res.py CHANGED Viewed

@@ -21,6 +21,10 @@ from langchain_core.pydantic_v1 import BaseModel, Field
 from langchain_openai import ChatOpenAI
 from typing import List, Dict, Any
 import requests
 # getting the json files
 def get_clinical_record_info(clinical_record_id: str) -> Dict[str, Any]:
@@ -31,6 +35,7 @@ def get_clinical_record_info(clinical_record_id: str) -> Dict[str, Any]:
     response = requests.get(request_url, headers={"accept": "application/json"})
     return response.json()
 def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str, Any]]:
     clinical_records = []
     for clinical_record_id in clinical_record_ids:
@@ -38,80 +43,99 @@ def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str
         clinical_records.append(clinical_record_info)
     return clinical_records
-def process_json(json_file):
-    # processing the files and getting the info needed
-    # Open the JSON file for reading
-    with open(json_file, 'r') as f:
-        data = json.load(f)  # Parse JSON data into a Python dictionary
     # Define the fields you want to keep
-    fields_to_keep = ['class_of_organization', 'title', 'overallStatus', 'descriptionModule', 'conditions', 'interventions', 'outcomesModule', 'eligibilityModule']
     # Iterate through the dictionary and keep only the desired fields
     filtered_data = []
     for item in data:
         try:
-            organization_name= item['protocolSection']['identificationModule']['organization']['fullName']
         except:
-            organization_name= ""
         try:
-            project_title= item['protocolSection']['identificationModule']['officialTitle']
         except:
-            project_title= ""
-        try:
-            status= item['protocolSection']['statusModule']['overallStatus']
         except:
-            status= ""
         try:
-            brief_description= item['protocolSection']['descriptionModule']['briefSummary']
         except:
-            brief_description= ""
         try:
-            detailed_description= item['protocolSection']['descriptionModule']['detailedDescription']
         except:
-            detailed_description= ""
         try:
-            conditions= item['protocolSection']['conditionsModule']['conditions']
         except:
-            conditions= []
         try:
-            keywords= item['protocolSection']['conditionsModule']['keywords']
         except:
-            keywords= []
         try:
-            interventions= item['protocolSection']['armsInterventionsModule']['interventions']
         except:
-            interventions= []
         try:
-            primary_outcomes= item['protocolSection']['outcomesModule']['primaryOutcomes']
         except:
-            primary_outcomes= []
         try:
-            secondary_outcomes= item['protocolSection']['outcomesModule']['secondaryOutcomes']
         except:
-            secondary_outcomes= []
         try:
-            eligibility= item['protocolSection']['eligibilityModule']
         except:
-            eligibility= {}
-        filtered_item = {"organization_name": organization_name,
-                        "project_title": project_title,
-                        "status": status,
-                        "brief_description": brief_description,
-                        "detailed_description": detailed_description,
-                        "keywords":keywords,
-                        "interventions": interventions,
-                        "primary_outcomes": primary_outcomes,
-                        "secondary_outcomes": secondary_outcomes,
-                        "eligibility": eligibility}
         filtered_data.append(filtered_item)
     # for ele in filtered_data:
     #     print(ele)
-    # Write the filtered data to a new JSON file
-    with open('output.json', 'w') as f:
-        json.dump(filtered_data, f, indent=4)
 def llm_config():
     tagging_prompt = ChatPromptTemplate.from_template(
@@ -127,20 +151,38 @@ def llm_config():
     )
     class Classification(BaseModel):
-        description: str = Field(description= "text description grouping all the clinical trials using brief_description and detailed_description keys")
-        project_title: list = Field(description="Extract the project title of all the clinical trials")
-        status: list= Field(description="Extract the status of all the clinical trials")
-        keywords: list= Field(description="Extract the most relevant keywords regrouping all the clinical trials")
-        interventions: list= Field(description="describe the interventions for each clinical trial using title, name and description")
-        primary_outcomes: list= Field(description= "get the primary outcomes of each clinical trial")
         # secondary_outcomes: list= Field(description= "get the secondary outcomes of each clinical trial")
-        eligibility: list= Field(description= "get the eligibilityCriteria grouping all the clinical trials")
         # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
-        minimum_age: list = Field(description="get the minimum age from each experiment")
-        maximum_age: list = Field(description="get the maximum age from each experiment")
         gender: list = Field(description="get the gender from each experiment")
-        def get_dict(self):
             return {
                 "summary": self.description,
                 "project_title": self.project_title,
@@ -153,45 +195,30 @@ def llm_config():
                 # "healthy_volunteers": self.healthy_volunteers,
                 "minimum_age": self.minimum_age,
                 "maximum_age": self.maximum_age,
-                "gender": self.gender
             }
     # LLM
     llm = ChatOpenAI(
-        temperature=0.6,
         model="gpt-4",
-        openai_api_key="sk-proj-CG2E98bSWs53X2eWO0Z4T3BlbkFJLm7H1vfkbua0zP548CKQ"
-    ).with_structured_output(
-        Classification
-    )
     tagging_chain = tagging_prompt | llm
     return tagging_chain
-def get_llm_results(results):
-    result_dict= results.get_dict()
-    return result_dict
-def save_llm_results(results_json):
-    with open('llm_results.json', 'w') as f:
-      json.dump(results_json, f, indent=4)
 # clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
 # print(clinical_record_info)
 # with open('data.json', 'w') as f:
 #     json.dump(clinical_record_info, f, indent=4)
-# change the json file here and run it to get the output
-json_file= "D:/HACKUPC/hupc/klinic/data.json"
-process_json(json_file)
-with open('output.json', 'r') as file:
-    data = json.load(file)
-tagging_chain= llm_config()
-res= tagging_chain.invoke({"input": data})
-result_json= get_llm_results(res)
-save_llm_results(result_json)
-print(result_json)

 from langchain_openai import ChatOpenAI
 from typing import List, Dict, Any
 import requests
+from dotenv import load_dotenv
+load_dotenv()
 # getting the json files
 def get_clinical_record_info(clinical_record_id: str) -> Dict[str, Any]:
     response = requests.get(request_url, headers={"accept": "application/json"})
     return response.json()
 def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str, Any]]:
     clinical_records = []
     for clinical_record_id in clinical_record_ids:
         clinical_records.append(clinical_record_info)
     return clinical_records
+def process_json_data_for_llm(data):
     # Define the fields you want to keep
+    fields_to_keep = [
+        "class_of_organization",
+        "title",
+        "overallStatus",
+        "descriptionModule",
+        "conditions",
+        "interventions",
+        "outcomesModule",
+        "eligibilityModule",
+    ]
     # Iterate through the dictionary and keep only the desired fields
     filtered_data = []
     for item in data:
         try:
+            organization_name = item["protocolSection"]["identificationModule"][
+                "organization"
+            ]["fullName"]
         except:
+            organization_name = ""
         try:
+            project_title = item["protocolSection"]["identificationModule"][
+                "officialTitle"
+            ]
         except:
+            project_title = ""
+        try:
+            status = item["protocolSection"]["statusModule"]["overallStatus"]
         except:
+            status = ""
         try:
+            brief_description = item["protocolSection"]["descriptionModule"][
+                "briefSummary"
+            ]
         except:
+            brief_description = ""
         try:
+            detailed_description = item["protocolSection"]["descriptionModule"][
+                "detailedDescription"
+            ]
         except:
+            detailed_description = ""
         try:
+            conditions = item["protocolSection"]["conditionsModule"]["conditions"]
         except:
+            conditions = []
         try:
+            keywords = item["protocolSection"]["conditionsModule"]["keywords"]
         except:
+            keywords = []
         try:
+            interventions = item["protocolSection"]["armsInterventionsModule"][
+                "interventions"
+            ]
         except:
+            interventions = []
         try:
+            primary_outcomes = item["protocolSection"]["outcomesModule"][
+                "primaryOutcomes"
+            ]
         except:
+            primary_outcomes = []
         try:
+            secondary_outcomes = item["protocolSection"]["outcomesModule"][
+                "secondaryOutcomes"
+            ]
         except:
+            secondary_outcomes = []
         try:
+            eligibility = item["protocolSection"]["eligibilityModule"]
         except:
+            eligibility = {}
+        filtered_item = {
+            "organization_name": organization_name,
+            "project_title": project_title,
+            "status": status,
+            "brief_description": brief_description,
+            "detailed_description": detailed_description,
+            "keywords": keywords,
+            "interventions": interventions,
+            "primary_outcomes": primary_outcomes,
+            "secondary_outcomes": secondary_outcomes,
+            "eligibility": eligibility,
+        }
         filtered_data.append(filtered_item)
     # for ele in filtered_data:
     #     print(ele)
 def llm_config():
     tagging_prompt = ChatPromptTemplate.from_template(
     )
     class Classification(BaseModel):
+        description: str = Field(
+            description="text description grouping all the clinical trials using brief_description and detailed_description keys"
+        )
+        project_title: list = Field(
+            description="Extract the project title of all the clinical trials"
+        )
+        status: list = Field(
+            description="Extract the status of all the clinical trials"
+        )
+        keywords: list = Field(
+            description="Extract the most relevant keywords regrouping all the clinical trials"
+        )
+        interventions: list = Field(
+            description="describe the interventions for each clinical trial using title, name and description"
+        )
+        primary_outcomes: list = Field(
+            description="get the primary outcomes of each clinical trial"
+        )
         # secondary_outcomes: list= Field(description= "get the secondary outcomes of each clinical trial")
+        eligibility: list = Field(
+            description="get the eligibilityCriteria grouping all the clinical trials"
+        )
         # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
+        minimum_age: list = Field(
+            description="get the minimum age from each experiment"
+        )
+        maximum_age: list = Field(
+            description="get the maximum age from each experiment"
+        )
         gender: list = Field(description="get the gender from each experiment")
+        def get_dict(self):
             return {
                 "summary": self.description,
                 "project_title": self.project_title,
                 # "healthy_volunteers": self.healthy_volunteers,
                 "minimum_age": self.minimum_age,
                 "maximum_age": self.maximum_age,
+                "gender": self.gender,
             }
     # LLM
     llm = ChatOpenAI(
+        temperature=0.6,
         model="gpt-4",
+        openai_api_key=os.environ["OPENAI_API_KEY"],
+    ).with_structured_output(Classification)
     tagging_chain = tagging_prompt | llm
     return tagging_chain
 # clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
 # print(clinical_record_info)
 # with open('data.json', 'w') as f:
 #     json.dump(clinical_record_info, f, indent=4)
+tagging_chain = llm_config()
+def process_dictionaty_with_llm_to_generate_response(json_contents):
+    processed_data = process_json_data_for_llm(json_contents)
+    res = tagging_chain.invoke({"input": processed_data})
+    return res

requirements.txt CHANGED Viewed

@@ -10,3 +10,4 @@ openai==1.25.1
 sentence_transformers==2.7.0
 streamlit-agraph
 streamlit==1.34.0

 sentence_transformers==2.7.0
 streamlit-agraph
 streamlit==1.34.0
+langchain-openai==0.1.6